7 files changed, 188 insertions, 96 deletions
diff --git a/.gitignore b/.gitignore
index b07ba15..4e963fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,8 @@ configure
 depcomp
 install-sh
 missing
-stamp-h1
+src/.deps
 src/aggregator
 src/influenza.h5
-src/.deps
+src/updator
+stamp-h1
diff --git a/src/Makefile.am b/src/Makefile.am
index a7e6852..6dd8e16 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,14 +5,16 @@ aggregator_SOURCES = \
 	check_error.c \
 	check_h5_error.c \
 	load_influenza_aa_dat.c \
-	load_influenza_faa.c
+	load_influenza_faa.c \
+	sequence_data_init.c
 
 updator_SOURCES = \
 	updator.c \
 	check_error.c \
 	check_h5_error.c \
 	check_ncbi_error.c \
-	assign_protein_type.c
+	assign_protein_type.c \
+	sequence_data_init.c
 
 noinst_HEADERS = \
 	assign_protein_type.h \
@@ -20,6 +22,8 @@ noinst_HEADERS = \
 	check_h5_error.h \
 	check_ncbi_error.h \
 	load_influenza_aa_dat.h \
-	load_influenza_faa.h
+	load_influenza_faa.h \
+	sequence_data.h \
+	sequence_data_init.h
 
 AM_CFLAGS = -Wall -std=gnu99 -ggdb
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c
index 1b58f54..166e787 100644
--- a/src/assign_protein_type.c
+++ b/src/assign_protein_type.c
@@ -1,12 +1,15 @@
 #include "assign_protein_type.h"
 #include "check_ncbi_error.h"
 #include "check_h5_error.h"
+#include "sequence_data.h"
+#include "sequence_data_init.h"
 #include <ncbi.h>
 #include <readdb.h>
 #include <blast.h>
 #include <salpacc.h>
 #include <stdbool.h>
 #include <hdf5_hl.h>
+#include <error.h>
 
 /*
  * BLAST database containing all of the influenza protein sequences.
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id)
    * Get default BLAST options.
    */
   BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
+
+  /*
+   * Set to single hit mode.
+   */
+  options->window_size = 0;
+  options->multiple_hits_only = false;
+
   ValNodePtr error_returns = NULL;
 
   /*
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id)
   if (status < 0)
     check_h5_error (status, __FILE__, __LINE__);
 
-  /*
-   * todo: Allocate memory of nrecords for dst_buf.
-   *
-   * todo: Refactor code to share structres in read and write HDF5
-   * calls.
-   */
+  sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);
+
+  size_t dst_size;
+  size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
+  size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
+  hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
+  sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);  
 
   status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
 			   dst_sizes, dst_buf);
   if (status < 0)
     check_h5_error (status, __FILE__, __LINE__);
 
-  for (int i = 0; i < nrecords; i++)
-    {
-      
-    }
-
   /*
-   * Read the sequence from the database by GI.
+   * Assign protein types to records for which the field is empty.
    */
-  Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);
-  BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
-
-  SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
-					    "blastp",
-					    REFDB,
-					    options,
-					    NULL,
-					    &error_returns,
-					    NULL);
-
-  if (error_returns != NULL)
-    check_ncbi_error (error_returns, __FILE__, __LINE__);
-
-  if (seqalign != NULL)
+  printf ("Records to process: %i\n", (int)nrecords);
+  for (int i = 0; i < nrecords; i++)
     {
-      Char target_id_buf[BUFFER_LEN + 1];
-      SeqIdPtr target_id = SeqAlignId (seqalign, 1);
-      SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
-      printf ("%s\n", target_id_buf);
-    }
-
-  // Clean up memory for the next ieration.
-  seqalign = SeqAlignSetFree (seqalign);
-  bsp = BioseqFree (bsp);
 
+      if (dst_buf[i].protein_type[0] == '\0') {
+	/*
+	 * Read the sequence from the database by GI.
+	 */
+	Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
+	BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
+	
+	SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
+						  "blastp",
+						  REFDB,
+						  options,
+						  NULL,
+						  &error_returns,
+						  NULL);
+
+	/*
+	 * BLAST reported an error.  Write it out and continue processing.
+	 */
+	if (error_returns != NULL)
+	  {
+	    printf ("Warning: An error has been reported by the NCBI Toolkit API "
+		    "for sequence gi|%i: %s",
+		    dst_buf[i].gi, BlastErrorToString (error_returns));
+
+	  }
+
+	/*
+	 * A hit was found.  Record the first hit as the protein type.
+	 * Skip the first 6 characters and eat the "lcl|x_".
+	 */
+	else if (seqalign != NULL)
+	  {
+	    Char target_id_buf[BUFFER_LEN + 1];
+	    SeqIdPtr target_id = SeqAlignId (seqalign, 1);
+	    SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
+	    strncpy (dst_buf[i].protein_type, &target_id_buf[6], 
+		     sizeof (dst_buf[i].protein_type));
+	  }
+
+	/*
+	 * BLAST did not find any hits.
+	 */
+	else
+	  {
+	    printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
+		    dst_buf[i].gi);
+	  }
+	
+	/*
+	 * Clean up memory for the next ieration.
+	 */
+	seqalign = SeqAlignSetFree (seqalign);
+	bsp = BioseqFree (bsp);
+
+	/*
+	 * Write the data out to the file.
+	 */
+	if ( (i % 1000 == 0) && (i > 0) )
+	  {
+	    status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
+					dst_size, dst_offset, dst_sizes, 
+					&dst_buf[i-1000]);
+	    if (status < 0)
+	      check_h5_error (status, __FILE__, __LINE__);
+
+	    status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
+	    if (status < 0)
+	      check_h5_error (status, __FILE__, __LINE__);
+
+	    printf ("Processed %i of %i records.\n", i, (int)nrecords);
+	  }
+      }
+      
+    }
+  
+  free (dst_buf);
+  
   options = BLASTOptionDelete (options);
-
+  
   return;
 }
diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c
index 749b7ad..fd35254 100644
--- a/src/load_influenza_faa.c
+++ b/src/load_influenza_faa.c
@@ -1,62 +1,22 @@
-#include "load_influenza_faa.h"
 #include "check_error.h"
 #include "check_h5_error.h"
-#include "hdf5_hl.h"
+#include "load_influenza_faa.h"
+#include "sequence_data.h"
+#include "sequence_data_init.h"
+#include <hdf5_hl.h>
 #include <string.h>
 #include <stdlib.h>
 
-#define SEQUENCE_DATA_FIELD_NUM 4
-
 void
 load_influenza_faa (hid_t file_id)
 {
-  typedef struct
-  {
-    int gi;
-    char gb[9];
-    char description[196];
-    char protein_type[7];
-  } sequence_data;
-  
-  size_t dst_size = sizeof (sequence_data);
-  size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] =
-    { HOFFSET (sequence_data, gi),
-      HOFFSET (sequence_data, gb),
-      HOFFSET (sequence_data, description),
-      HOFFSET (sequence_data, protein_type)
-    };
-
-  sequence_data dst_buf[1];
-
-  size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = {
-    sizeof (dst_buf[0].gi),
-    sizeof (dst_buf[0].gb),
-    sizeof (dst_buf[0].description),
-    sizeof (dst_buf[0].protein_type)
-  };
-
+  size_t dst_size;
+  size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
+  size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
   hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
 
-  field_type[0] = H5T_NATIVE_INT;
-
-  hid_t gb_type = H5Tcopy (H5T_C_S1);
-  H5Tset_size (gb_type, 9);
-  field_type[1] = gb_type;
-
-  hid_t description_type = H5Tcopy (H5T_C_S1);
-  H5Tset_size (description_type, 196);
-  field_type[2] = description_type;
-
-  hid_t protein_type_type = H5Tcopy (H5T_C_S1);
-  H5Tset_size (protein_type_type, 7);
-  field_type[3] = protein_type_type;
-
-  const char *field_names[SEQUENCE_DATA_FIELD_NUM] = 
-    { "GI",
-      "GB",
-      "Description",
-      "Protein Type" };
-
+  sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
+  
   hsize_t chunk_size = 10;
   int *fill_data = NULL;
   int compress = 0;
@@ -99,12 +59,15 @@ load_influenza_faa (hid_t file_id)
 
 	  strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));
 
+	  const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =
+	    SEQUENCE_DATA_FIELD_NAMES;
+
 	  if (current_line == 1)
 	    {
 	      herr_t status = H5TBmake_table ("influenza.faa", file_id,
 					      "influenza.faa",
 					      SEQUENCE_DATA_FIELD_NUM, 1,
-					      dst_size, field_names,
+					      dst_size, sequence_data_field_names,
 					      dst_offset, field_type,
 					      chunk_size, fill_data, compress,
 					      &p_data);
@@ -132,9 +95,5 @@ load_influenza_faa (hid_t file_id)
 
   fclose (dat);
 
-  H5Tclose (gb_type);
-  H5Tclose (description_type);
-  H5Tclose (protein_type_type);
-
   return;
 }
diff --git a/src/sequence_data.h b/src/sequence_data.h
new file mode 100644
index 0000000..fb4ff78
--- a/dev/null
+++ b/src/sequence_data.h
@@ -0,0 +1,16 @@
+#ifndef SEQUENCE_DATA_H
+#define SEQUENCE_DATA_H
+
+#define SEQUENCE_DATA_FIELD_NUM 4
+
+#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" }
+
+typedef struct
+{
+  int gi;
+  char gb[9];
+  char description[196];
+  char protein_type[7];
+} sequence_data;
+
+#endif // SEQUENCE_DATA_H
diff --git a/src/sequence_data_init.c b/src/sequence_data_init.c
new file mode 100644
index 0000000..09ba189
--- a/dev/null
+++ b/src/sequence_data_init.c
@@ -0,0 +1,37 @@
+#include "sequence_data_init.h"
+#include "sequence_data.h"
+
+void 
+sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+		    hid_t *field_type)
+{
+  *dst_size = sizeof (sequence_data);
+  
+  dst_offset[0] = HOFFSET (sequence_data, gi);
+  dst_offset[1] = HOFFSET (sequence_data, gb);
+  dst_offset[2] = HOFFSET (sequence_data, description);
+  dst_offset[3] = HOFFSET (sequence_data, protein_type);
+  
+  sequence_data dst_buf[1];
+  
+  dst_sizes[0] = sizeof (dst_buf[0].gi);
+  dst_sizes[1] = sizeof (dst_buf[0].gb);
+  dst_sizes[2] = sizeof (dst_buf[0].description);
+  dst_sizes[3] = sizeof (dst_buf[0].protein_type);
+  
+  field_type[0] = H5T_NATIVE_INT;
+  
+  hid_t gb_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (gb_type, 9);
+  field_type[1] = gb_type;
+  
+  hid_t description_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (description_type, 196);
+  field_type[2] = description_type;
+  
+  hid_t protein_type_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (protein_type_type, 7);
+  field_type[3] = protein_type_type;
+
+  return;
+}
diff --git a/src/sequence_data_init.h b/src/sequence_data_init.h
new file mode 100644
index 0000000..c87e7e6
--- a/dev/null
+++ b/src/sequence_data_init.h
@@ -0,0 +1,14 @@
+#ifndef SEQUENCE_DATA_INIT_H
+#define SEQUENCE_DATA_INIT_H
+
+#include <hdf5.h>
+
+/*
+ * Initialize the structures describing sequence_data.  These
+ * descriptive structures are used by the HDF5 API.
+ */
+void 
+sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+		    hid_t *field_type);
+
+#endif // SEQUENCE_DATA_INIT_H