summaryrefslogtreecommitdiffstats
authorDon Pellegrino <don@drexel.edu>2010-01-18 17:26:18 (GMT)
committer Don Pellegrino <don@drexel.edu>2010-01-18 17:26:18 (GMT)
commita7cc532248146968a9296be7db42830e35525afa (patch) (side-by-side diff)
tree93c44ad4f1ac27a1daed4ad5c7fdcb76971c0221
parent0d0e0886d17612fb7ebdb9110679d5b7bd5087be (diff)
downloadexp007-a7cc532248146968a9296be7db42830e35525afa.zip
exp007-a7cc532248146968a9296be7db42830e35525afa.tar.gz
exp007-a7cc532248146968a9296be7db42830e35525afa.tar.bz2
Implemented updator to calculate and assign protein type values to
records which do not have them. This is a re-entrant process that only updates missing records and skips quickly through existing records. In addition premature termination loses at most 999 records of work since every thousand records are written back to the file and flushed to disk. Still to do is to write and flush the last set of records in the final bin under 1000. Also refactored the sequence_data structure so that it can be shared between HDF5 reading and writing operations.
-rw-r--r--.gitignore5
-rw-r--r--src/Makefile.am10
-rw-r--r--src/assign_protein_type.c135
-rw-r--r--src/load_influenza_faa.c67
-rw-r--r--src/sequence_data.h16
-rw-r--r--src/sequence_data_init.c37
-rw-r--r--src/sequence_data_init.h14
7 files changed, 188 insertions, 96 deletions
diff --git a/.gitignore b/.gitignore
index b07ba15..4e963fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,8 @@ configure
depcomp
install-sh
missing
-stamp-h1
+src/.deps
src/aggregator
src/influenza.h5
-src/.deps
+src/updator
+stamp-h1
diff --git a/src/Makefile.am b/src/Makefile.am
index a7e6852..6dd8e16 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,14 +5,16 @@ aggregator_SOURCES = \
check_error.c \
check_h5_error.c \
load_influenza_aa_dat.c \
- load_influenza_faa.c
+ load_influenza_faa.c \
+ sequence_data_init.c
updator_SOURCES = \
updator.c \
check_error.c \
check_h5_error.c \
check_ncbi_error.c \
- assign_protein_type.c
+ assign_protein_type.c \
+ sequence_data_init.c
noinst_HEADERS = \
assign_protein_type.h \
@@ -20,6 +22,8 @@ noinst_HEADERS = \
check_h5_error.h \
check_ncbi_error.h \
load_influenza_aa_dat.h \
- load_influenza_faa.h
+ load_influenza_faa.h \
+ sequence_data.h \
+ sequence_data_init.h
AM_CFLAGS = -Wall -std=gnu99 -ggdb
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c
index 1b58f54..166e787 100644
--- a/src/assign_protein_type.c
+++ b/src/assign_protein_type.c
@@ -1,12 +1,15 @@
#include "assign_protein_type.h"
#include "check_ncbi_error.h"
#include "check_h5_error.h"
+#include "sequence_data.h"
+#include "sequence_data_init.h"
#include <ncbi.h>
#include <readdb.h>
#include <blast.h>
#include <salpacc.h>
#include <stdbool.h>
#include <hdf5_hl.h>
+#include <error.h>
/*
* BLAST database containing all of the influenza protein sequences.
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id)
* Get default BLAST options.
*/
BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
+
+ /*
+ * Set to single hit mode.
+ */
+ options->window_size = 0;
+ options->multiple_hits_only = false;
+
ValNodePtr error_returns = NULL;
/*
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id)
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
- /*
- * todo: Allocate memory of nrecords for dst_buf.
- *
- * todo: Refactor code to share structres in read and write HDF5
- * calls.
- */
+ sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);
+
+ size_t dst_size;
+ size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
+ size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
+ hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
+ sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
dst_sizes, dst_buf);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
- for (int i = 0; i < nrecords; i++)
- {
-
- }
-
/*
- * Read the sequence from the database by GI.
+ * Assign protein types to records for which the field is empty.
*/
- Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);
- BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
-
- SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
- "blastp",
- REFDB,
- options,
- NULL,
- &error_returns,
- NULL);
-
- if (error_returns != NULL)
- check_ncbi_error (error_returns, __FILE__, __LINE__);
-
- if (seqalign != NULL)
+ printf ("Records to process: %i\n", (int)nrecords);
+ for (int i = 0; i < nrecords; i++)
{
- Char target_id_buf[BUFFER_LEN + 1];
- SeqIdPtr target_id = SeqAlignId (seqalign, 1);
- SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
- printf ("%s\n", target_id_buf);
- }
-
- // Clean up memory for the next ieration.
- seqalign = SeqAlignSetFree (seqalign);
- bsp = BioseqFree (bsp);
+ if (dst_buf[i].protein_type[0] == '\0') {
+ /*
+ * Read the sequence from the database by GI.
+ */
+ Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
+ BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
+
+ SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
+ "blastp",
+ REFDB,
+ options,
+ NULL,
+ &error_returns,
+ NULL);
+
+ /*
+ * BLAST reported an error. Write it out and continue processing.
+ */
+ if (error_returns != NULL)
+ {
+ printf ("Warning: An error has been reported by the NCBI Toolkit API "
+ "for sequence gi|%i: %s",
+ dst_buf[i].gi, BlastErrorToString (error_returns));
+
+ }
+
+ /*
+ * A hit was found. Record the first hit as the protein type.
+ * Skip the first 6 characters and eat the "lcl|x_".
+ */
+ else if (seqalign != NULL)
+ {
+ Char target_id_buf[BUFFER_LEN + 1];
+ SeqIdPtr target_id = SeqAlignId (seqalign, 1);
+ SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
+ strncpy (dst_buf[i].protein_type, &target_id_buf[6],
+ sizeof (dst_buf[i].protein_type));
+ }
+
+ /*
+ * BLAST did not find any hits.
+ */
+ else
+ {
+ printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
+ dst_buf[i].gi);
+ }
+
+ /*
+ * Clean up memory for the next ieration.
+ */
+ seqalign = SeqAlignSetFree (seqalign);
+ bsp = BioseqFree (bsp);
+
+ /*
+ * Write the data out to the file.
+ */
+ if ( (i % 1000 == 0) && (i > 0) )
+ {
+ status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
+ dst_size, dst_offset, dst_sizes,
+ &dst_buf[i-1000]);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ printf ("Processed %i of %i records.\n", i, (int)nrecords);
+ }
+ }
+
+ }
+
+ free (dst_buf);
+
options = BLASTOptionDelete (options);
-
+
return;
}
diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c
index 749b7ad..fd35254 100644
--- a/src/load_influenza_faa.c
+++ b/src/load_influenza_faa.c
@@ -1,62 +1,22 @@
-#include "load_influenza_faa.h"
#include "check_error.h"
#include "check_h5_error.h"
-#include "hdf5_hl.h"
+#include "load_influenza_faa.h"
+#include "sequence_data.h"
+#include "sequence_data_init.h"
+#include <hdf5_hl.h>
#include <string.h>
#include <stdlib.h>
-#define SEQUENCE_DATA_FIELD_NUM 4
-
void
load_influenza_faa (hid_t file_id)
{
- typedef struct
- {
- int gi;
- char gb[9];
- char description[196];
- char protein_type[7];
- } sequence_data;
-
- size_t dst_size = sizeof (sequence_data);
- size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] =
- { HOFFSET (sequence_data, gi),
- HOFFSET (sequence_data, gb),
- HOFFSET (sequence_data, description),
- HOFFSET (sequence_data, protein_type)
- };
-
- sequence_data dst_buf[1];
-
- size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = {
- sizeof (dst_buf[0].gi),
- sizeof (dst_buf[0].gb),
- sizeof (dst_buf[0].description),
- sizeof (dst_buf[0].protein_type)
- };
-
+ size_t dst_size;
+ size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
+ size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
- field_type[0] = H5T_NATIVE_INT;
-
- hid_t gb_type = H5Tcopy (H5T_C_S1);
- H5Tset_size (gb_type, 9);
- field_type[1] = gb_type;
-
- hid_t description_type = H5Tcopy (H5T_C_S1);
- H5Tset_size (description_type, 196);
- field_type[2] = description_type;
-
- hid_t protein_type_type = H5Tcopy (H5T_C_S1);
- H5Tset_size (protein_type_type, 7);
- field_type[3] = protein_type_type;
-
- const char *field_names[SEQUENCE_DATA_FIELD_NUM] =
- { "GI",
- "GB",
- "Description",
- "Protein Type" };
-
+ sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
+
hsize_t chunk_size = 10;
int *fill_data = NULL;
int compress = 0;
@@ -99,12 +59,15 @@ load_influenza_faa (hid_t file_id)
strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));
+ const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =
+ SEQUENCE_DATA_FIELD_NAMES;
+
if (current_line == 1)
{
herr_t status = H5TBmake_table ("influenza.faa", file_id,
"influenza.faa",
SEQUENCE_DATA_FIELD_NUM, 1,
- dst_size, field_names,
+ dst_size, sequence_data_field_names,
dst_offset, field_type,
chunk_size, fill_data, compress,
&p_data);
@@ -132,9 +95,5 @@ load_influenza_faa (hid_t file_id)
fclose (dat);
- H5Tclose (gb_type);
- H5Tclose (description_type);
- H5Tclose (protein_type_type);
-
return;
}
diff --git a/src/sequence_data.h b/src/sequence_data.h
new file mode 100644
index 0000000..fb4ff78
--- a/dev/null
+++ b/src/sequence_data.h
@@ -0,0 +1,16 @@
+#ifndef SEQUENCE_DATA_H
+#define SEQUENCE_DATA_H
+
+#define SEQUENCE_DATA_FIELD_NUM 4
+
+#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" }
+
+typedef struct
+{
+ int gi;
+ char gb[9];
+ char description[196];
+ char protein_type[7];
+} sequence_data;
+
+#endif // SEQUENCE_DATA_H
diff --git a/src/sequence_data_init.c b/src/sequence_data_init.c
new file mode 100644
index 0000000..09ba189
--- a/dev/null
+++ b/src/sequence_data_init.c
@@ -0,0 +1,37 @@
+#include "sequence_data_init.h"
+#include "sequence_data.h"
+
+void
+sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+ hid_t *field_type)
+{
+ *dst_size = sizeof (sequence_data);
+
+ dst_offset[0] = HOFFSET (sequence_data, gi);
+ dst_offset[1] = HOFFSET (sequence_data, gb);
+ dst_offset[2] = HOFFSET (sequence_data, description);
+ dst_offset[3] = HOFFSET (sequence_data, protein_type);
+
+ sequence_data dst_buf[1];
+
+ dst_sizes[0] = sizeof (dst_buf[0].gi);
+ dst_sizes[1] = sizeof (dst_buf[0].gb);
+ dst_sizes[2] = sizeof (dst_buf[0].description);
+ dst_sizes[3] = sizeof (dst_buf[0].protein_type);
+
+ field_type[0] = H5T_NATIVE_INT;
+
+ hid_t gb_type = H5Tcopy (H5T_C_S1);
+ H5Tset_size (gb_type, 9);
+ field_type[1] = gb_type;
+
+ hid_t description_type = H5Tcopy (H5T_C_S1);
+ H5Tset_size (description_type, 196);
+ field_type[2] = description_type;
+
+ hid_t protein_type_type = H5Tcopy (H5T_C_S1);
+ H5Tset_size (protein_type_type, 7);
+ field_type[3] = protein_type_type;
+
+ return;
+}
diff --git a/src/sequence_data_init.h b/src/sequence_data_init.h
new file mode 100644
index 0000000..c87e7e6
--- a/dev/null
+++ b/src/sequence_data_init.h
@@ -0,0 +1,14 @@
+#ifndef SEQUENCE_DATA_INIT_H
+#define SEQUENCE_DATA_INIT_H
+
+#include <hdf5.h>
+
+/*
+ * Initialize the structures describing sequence_data. These
+ * descriptive structures are used by the HDF5 API.
+ */
+void
+sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+ hid_t *field_type);
+
+#endif // SEQUENCE_DATA_INIT_H

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.