Program now fully uses an existing file. The load functions detect

their existing target tables and purge them if they exist or create them if they don't. The assign function will reuse existing type values by gi and write a new table that is aligned by gi with influenza.faa.
author: Don Pellegrino <don@drexel.edu> 2010-01-19 20:27:10 (GMT)
committer: Don Pellegrino <don@drexel.edu> 2010-01-19 20:27:10 (GMT)
commit: 9642c682be8bb2f1dd0eb616488ccaf2c7bb1ad8 (patch) (side-by-side diff)
tree: 05ad69a1255a77ca0739420345a540a03cd304e1
parent: 9cbe92a3f1ea871cb23818e32a1b0f85734d9453 (diff)
download: exp007-9642c682be8bb2f1dd0eb616488ccaf2c7bb1ad8.zip
exp007-9642c682be8bb2f1dd0eb616488ccaf2c7bb1ad8.tar.gz
exp007-9642c682be8bb2f1dd0eb616488ccaf2c7bb1ad8.tar.bz2
7 files changed, 201 insertions, 86 deletions
diff --git a/src/aggregator.c b/src/aggregator.c
index c00d912..c9a03b5 100644
--- a/src/aggregator.c
+++ b/src/aggregator.c
@@ -6,28 +6,48 @@
 #include "error/check_h5_error.h"
 #include "load/load_influenza_aa_dat.h"
 #include "load/load_influenza_faa.h"
+#include <stdio.h>
 
-#define FILE "influenza.h5"
+#define H5FILE "influenza.h5"
+#define INFLUENZA_AA_DAT "/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat"
+#define INFLUENZA_FAA "/home/don/exp004/genomes/INFLUENZA/influenza.faa"
 
 int
 main ()
 {
   /*
-   * Create the HDF5 file.
+   * Create a new HDF5 file if it does not already exist.  If an
+   * existing file is found then open it.
    */
-  hid_t file_id = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
-  if (file_id < 0)
-    check_h5_error (file_id, __FILE__, __LINE__);
+  hid_t file_id = 0;
+  FILE *f = fopen (H5FILE, "r+");
+  if (f == NULL)
+    {
+      file_id = H5Fcreate (H5FILE, H5F_ACC_EXCL, H5P_DEFAULT, H5P_DEFAULT);
+      if (file_id < 0)
+	check_h5_error (file_id, __FILE__, __LINE__);
+    }
+  else
+    {
+      fclose (f);
+      file_id = H5Fopen (H5FILE, H5F_ACC_RDWR, H5P_DEFAULT);
+      if (file_id < 0)
+	check_h5_error (file_id, __FILE__, __LINE__);
+    }
 
   /*
    * Load the supplementary protein data file.
    */
-  load_influenza_aa_dat (file_id);
+  printf ("Loading \"influenza_aa.dat\" with contents of %s.\n",
+	  INFLUENZA_AA_DAT);
+  load_influenza_aa_dat (file_id, INFLUENZA_AA_DAT);
 
   /*
    * Load the FASTA protein sequence data file.
    */
-  load_influenza_faa (file_id);
+  printf ("Loading \"influenza.faa\" with contents of %s.\n",
+	  INFLUENZA_FAA);
+  load_influenza_faa (file_id, INFLUENZA_FAA);
 
   /*
    * Close the HDF5 file.
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c
index 73685bb..3947800 100644
--- a/src/assign/assign_protein_type.c
+++ b/src/assign/assign_protein_type.c
@@ -1,5 +1,6 @@
 #define _GNU_SOURCE
 #include "assign_protein_type.h"
+#include "error/check_error.h"
 #include "error/check_h5_error.h"
 #include "error/check_ncbi_error.h"
 #include "model/gi_type_data.h"
@@ -84,6 +85,13 @@ assign_protein_type (hid_t file_id)
     check_h5_error (status, __FILE__, __LINE__);
 
   /*
+   * Allocate memory for the new table.
+   */
+  gi_type_data* new_buf = malloc (sizeof (gi_type_data) * faa_nrecords);
+  if (new_buf == NULL)
+    check_error (__FILE__, __LINE__);
+
+  /*
    * Read the data from HDF5 gi_type_data.
    */
   hsize_t gi_nfields = 0;
@@ -94,8 +102,12 @@ assign_protein_type (hid_t file_id)
   hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
   gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
 
-  gi_type_data* gi_buf = NULL;
+  gi_type_data* old_buf = NULL;
 
+  /*
+   * If the table is already present read the values into memory and
+   * then clear the table.
+   */
   if (H5LTfind_dataset (file_id, "gi_type_data") == 1)
     {
 
@@ -105,22 +117,30 @@ assign_protein_type (hid_t file_id)
 				   &gi_nrecords);
       if (status < 0)
 	check_h5_error (status, __FILE__, __LINE__);
+
+      printf ("  Using gi_type_data cache of %i records.\n", (int)gi_nrecords);
   
-      gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
+      old_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
             
       status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
-			       gi_sizes, gi_buf);
+			       gi_sizes, old_buf);
+      if (status < 0)
+	check_h5_error (status, __FILE__, __LINE__);
+
+      status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords);
       if (status < 0)
 	check_h5_error (status, __FILE__, __LINE__);
       
     }
+
+  /*
+   * If the table is not already present create it.
+   */
   else
     {      
 
       printf ("Creating gi_type_data.\n");
 
-      gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords);
-
       const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] =
 	GI_TYPE_DATA_FIELD_NAMES;
 
@@ -130,7 +150,7 @@ assign_protein_type (hid_t file_id)
 
       status = H5TBmake_table ("gi_type_data", file_id,
 			       "gi_type_data",
-			       GI_TYPE_DATA_FIELD_NUM, faa_nrecords,
+			       GI_TYPE_DATA_FIELD_NUM, 0,
 			       gi_size, gi_type_data_field_names,
 			       gi_offset, gi_field_type,
 			       chunk_size, fill_data, compress,
@@ -140,17 +160,22 @@ assign_protein_type (hid_t file_id)
 
     }
 
+  /*
+   * Copy the contents of the old table into a hash.
+   */
   struct hsearch_data htab;
   bzero (&htab, sizeof (htab));
-  hcreate_r (gi_nrecords * 2, &htab);
+  if (hcreate_r (gi_nrecords * 2, &htab) == 0)
+    error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
+		   "Allocation of cache failed.");
   ENTRY e, *ep;
-  
-  for (int i = 0; i < gi_nrecords; i++)
+
+  for (int i = 0; i < (int)gi_nrecords; i++)
     {
       char gi_chr[25];
-      snprintf (gi_chr, 25, "%i", gi_buf[i].gi);
-      e.key = gi_chr;
-      e.data = &gi_buf[i];
+      snprintf (gi_chr, 25, "%i", old_buf[i].gi);
+      e.key = strdup (gi_chr);
+      e.data = &old_buf[i];
       if (hsearch_r (e, ENTER, &ep, &htab) == 0)
 	error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
 		       "Allocation failed.");
@@ -160,19 +185,23 @@ assign_protein_type (hid_t file_id)
    * Assign protein types to records for which the field is empty.
    */
   printf ("Records to process: %i\n", (int)faa_nrecords);
-  bool updates_pending = false;
-  for (int i = 0; i < faa_nrecords; i++)
+  int written = 0;
+  for (int i = 0; i < (int)faa_nrecords; i++)
     {
+      new_buf[i].gi = faa_buf[i].gi;
+      strncpy (new_buf[i].type, "", sizeof (new_buf[i].type));
+      strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein));
       
       char gi_chr[25];
       snprintf (gi_chr, 25, "%i", faa_buf[i].gi);
       e.key = gi_chr;
+      e.data = NULL;
+
+      /*
+       * A record was not found in the cache for this gi.
+       */
       if (hsearch_r (e, FIND, &ep, &htab) == 0) 
 	{
-
-	  gi_buf[i].gi = faa_buf[i].gi;
-	  gi_buf[i].type[0] = '\0';
-	  gi_buf[i].protein[0] = '\0';
 	  
 	  /*
 	   * Read the sequence from the database by GI.
@@ -202,7 +231,7 @@ assign_protein_type (hid_t file_id)
 	   */
 	  if (error_returns != NULL)
 	    {
-	      char *msg = BlastErrorToString (error_returns);
+	      CharPtr msg = BlastErrorToString (error_returns);
 	      printf ("Warning: An error has been reported by the NCBI Toolkit "
 		      "API for sequence gi|%i: %s",
 		      faa_buf[i].gi, msg);
@@ -221,14 +250,12 @@ assign_protein_type (hid_t file_id)
 			  BUFFER_LEN);
 
 	      // Species Type
-	      gi_buf[i].type[0] = target_id_buf[4];
-	      gi_buf[i].type[1] = '\0';
+	      new_buf[i].type[0] = target_id_buf[4];
+	      new_buf[i].type[1] = '\0';
 	      
 	      // Protein Type
-	      strncpy (gi_buf[i].protein, &target_id_buf[6], 
-		       sizeof (gi_buf[i].protein));
-	      
-	      updates_pending = true;
+	      strncpy (new_buf[i].protein, &target_id_buf[6], 
+		       sizeof (new_buf[i].protein));	     
 	    }
 	  
 	  /*
@@ -246,16 +273,27 @@ assign_protein_type (hid_t file_id)
 	  seqalign = SeqAlignSetFree (seqalign);
 	  bsp = BioseqFree (bsp);
 	  
+	} // End existing entry not found.
+
+      /*
+       * Hash table entry found.  Keep the old value.
+       */
+      else
+	{
+	  gi_type_data* old_value = (gi_type_data*)ep->data;
+	  new_buf[i].gi = old_value->gi;
+	  strncpy (new_buf[i].type, old_value->type, sizeof (new_buf[i].type));
+	  strncpy (new_buf[i].protein, old_value->protein, sizeof (new_buf[i].protein));
 	}
       
       /*
        * Write the data out to the file.
        */
-      if ( (i % 1000 == 0) && (i > 0) && updates_pending)
+      if ( (i % 1000 == 0) && (i > 0) )
 	{
-	  status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000,
+	  status = H5TBappend_records (file_id, "gi_type_data", 1000,
 				      gi_size, gi_offset, gi_sizes, 
-				      &gi_buf[i-1000]);
+				      &new_buf[i-1000]);
 	  if (status < 0)
 	    check_h5_error (status, __FILE__, __LINE__);
 	  
@@ -263,7 +301,7 @@ assign_protein_type (hid_t file_id)
 	  if (status < 0)
 	    check_h5_error (status, __FILE__, __LINE__);
 
-	  updates_pending = false;
+	  written = i;
 
 	  printf ("Processed %i of %i records.\n", i, (int)faa_nrecords);
 	}
@@ -274,37 +312,34 @@ assign_protein_type (hid_t file_id)
    * Write out records from the last bin if it was less than 1000
    * records in size.
    */
-  if (updates_pending)
+  if ((int)faa_nrecords < 1000) 
     {
-      /*
-      if ((int)faa_nrecords < 1000) 
-	{
-	  status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords,
-				      dst_size, dst_offset, dst_sizes,
-				      dst_buf);
-	}
-      else
-	{
-	  status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000,
-				      dst_size, dst_offset, dst_sizes, 
-				      &dst_buf[nrecords-1000]);
-	}
-      if (status < 0)
-	check_h5_error (status, __FILE__, __LINE__);
-      
-      status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
-      if (status < 0)
-	check_h5_error (status, __FILE__, __LINE__);
-      
-      updates_pending = false;
-      */
+      status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords,
+				  gi_size, gi_offset, gi_sizes,
+				  new_buf);
+    }
+
+  else
+    {
+      status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written,
+				  gi_size, gi_offset, gi_sizes, 
+				   &new_buf[written]);
     }
+
+  if (status < 0)
+    check_h5_error (status, __FILE__, __LINE__);
+  
+  status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
+  if (status < 0)
+    check_h5_error (status, __FILE__, __LINE__);
   
   free (faa_buf);
-  free (gi_buf);
+  free (old_buf);
+  free (new_buf);
   hdestroy_r (&htab);
 
   options = BLASTOptionDelete (options);
+  readdb_destruct (seqdb);
   
   return;
 }
diff --git a/src/load/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c
index 8bf47aa..3826349 100644
--- a/src/load/load_influenza_aa_dat.c
+++ b/src/load/load_influenza_aa_dat.c
@@ -13,10 +13,9 @@
 #include <stdlib.h>
 
 #define NFIELDS (hsize_t) 11
-#define TABLE_NAME "influenza_aa.dat"
 
 void
-load_influenza_aa_dat (hid_t file_id)
+load_influenza_aa_dat (hid_t file_id, const char* file_name)
 {
   /*
    * Model the data using native types.
@@ -145,8 +144,7 @@ load_influenza_aa_dat (hid_t file_id)
    * Insert the records.
    */
   supplementary_data p_data;
-  FILE *dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat",
-		     "r");
+  FILE *dat = fopen (file_name, "r");
   if (dat == NULL)
     check_error (__FILE__, __LINE__);
   char *line = NULL;
@@ -214,18 +212,49 @@ load_influenza_aa_dat (hid_t file_id)
 
       if (current_line == 1)
 	{
-	  herr_t status = H5TBmake_table ("influenza_aa.dat", file_id,
-					  TABLE_NAME, NFIELDS, 1, dst_size,
-					  field_names, dst_offset, field_type,
-					  chunk_size, fill_data, compress,
-					  &p_data);
-	  if (status < 0)
-	    check_h5_error (status, __FILE__, __LINE__);
+
+	  /*
+	   * Dataset already exists.  Purge it.
+	   */
+	  if (H5LTfind_dataset (file_id, "influenza_aa.dat") == 1)
+	    {
+	      hsize_t nfields = 0;
+	      hsize_t nrecords = 0;
+	      herr_t status = H5TBget_table_info (file_id, "influenza_aa.dat",
+						  &nfields, &nrecords);
+	      if (status < 0)
+		check_h5_error (status, __FILE__, __LINE__);
+
+	      status = H5TBdelete_record (file_id, "influenza_aa.dat", 0, nrecords);
+	      if (status < 0)
+		check_h5_error (status, __FILE__, __LINE__);
+
+	      status =
+		H5TBappend_records (file_id, "influenza_aa.dat", 1, dst_size,
+				    dst_offset, dst_sizes, &p_data);
+	      if (status < 0)
+		check_h5_error (status, __FILE__, __LINE__);		
+	    }
+
+	  /*
+	   * Dataset does not exist.  Create it.
+	   */
+	  else
+	    {
+	      herr_t status = H5TBmake_table ("influenza_aa.dat", file_id,
+					      "influenza_aa.dat", NFIELDS, 1, dst_size,
+					      field_names, dst_offset, field_type,
+					      chunk_size, fill_data, compress,
+					      &p_data);
+	      if (status < 0)
+		check_h5_error (status, __FILE__, __LINE__);
+	    }
 	}
+
       else
 	{
 	  herr_t status =
-	    H5TBappend_records (file_id, TABLE_NAME, 1, dst_size,
+	    H5TBappend_records (file_id, "influenza_aa.dat", 1, dst_size,
 				dst_offset, dst_sizes, &p_data);
 	  if (status < 0)
 	    check_h5_error (status, __FILE__, __LINE__);
diff --git a/src/load/load_influenza_aa_dat.h b/src/load/load_influenza_aa_dat.h
index f6c60be..97e36f8 100644
--- a/src/load/load_influenza_aa_dat.h
+++ b/src/load/load_influenza_aa_dat.h
@@ -7,6 +7,6 @@
  * Load the supplementary protein data from the NCBI influenza_aa.dat
  * file.
  */
-void load_influenza_aa_dat (hid_t file_id);
+void load_influenza_aa_dat (hid_t file_id, const char* file_name);
 
 #endif // LOAD_INFLUENZA_AA_DAT_H
diff --git a/src/load/load_influenza_faa.c b/src/load/load_influenza_faa.c
index a217989..04bf05b 100644
--- a/src/load/load_influenza_faa.c
+++ b/src/load/load_influenza_faa.c
@@ -8,7 +8,7 @@
 #include <stdlib.h>
 
 void
-load_influenza_faa (hid_t file_id)
+load_influenza_faa (hid_t file_id, const char* file_name)
 {
   size_t dst_size;
   size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
@@ -22,8 +22,7 @@ load_influenza_faa (hid_t file_id)
   int compress = 0;
 
   sequence_data p_data;
-  FILE *dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza.faa",
-		     "r");
+  FILE *dat = fopen (file_name, "r");
   if (dat == NULL)
     check_error (__FILE__, __LINE__);
   char *line = NULL;
@@ -62,16 +61,46 @@ load_influenza_faa (hid_t file_id)
 
 	  if (current_line == 1)
 	    {
-	      herr_t status = H5TBmake_table ("influenza.faa", file_id,
-					      "influenza.faa",
-					      SEQUENCE_DATA_FIELD_NUM, 1,
-					      dst_size, sequence_data_field_names,
-					      dst_offset, field_type,
-					      chunk_size, fill_data, compress,
-					      &p_data);
-	      if (status < 0)
-		check_h5_error (status, __FILE__, __LINE__);
+	      /*
+	       * Dataset already exists.  Purge it.
+	       */
+	      if (H5LTfind_dataset (file_id, "influenza.faa") == 1)
+		{
+		  hsize_t nfields = 0;
+		  hsize_t nrecords = 0;
+		  herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields,
+						      &nrecords);
+		  if (status < 0)
+		    check_h5_error (status, __FILE__, __LINE__);
+
+		  status = H5TBdelete_record (file_id, "influenza.faa", 0, nrecords);
+		  if (status < 0)
+		    check_h5_error (status, __FILE__, __LINE__);
+
+		  status =
+		    H5TBappend_records (file_id, "influenza.faa", 1, dst_size,
+					dst_offset, dst_sizes, &p_data);
+		  if (status < 0)
+		    check_h5_error (status, __FILE__, __LINE__);
+		}
+
+	      /*
+	       * Dataset does not exist.  Create it.
+	       */
+	      else
+		{
+		  herr_t status = H5TBmake_table ("influenza.faa", file_id,
+						  "influenza.faa",
+						  SEQUENCE_DATA_FIELD_NUM, 1,
+						  dst_size, sequence_data_field_names,
+						  dst_offset, field_type,
+						  chunk_size, fill_data, compress,
+						  &p_data);
+		  if (status < 0)
+		    check_h5_error (status, __FILE__, __LINE__);
+		}
 	    }
+
 	  else
 	    {
 	      herr_t status =
diff --git a/src/load/load_influenza_faa.h b/src/load/load_influenza_faa.h
index 569c411..1ad5797 100644
--- a/src/load/load_influenza_faa.h
+++ b/src/load/load_influenza_faa.h
@@ -6,6 +6,6 @@
 /*
  * Load the protein sequence data from the NCBI influenza.faa file.
  */
-void load_influenza_faa (hid_t file_id);
+void load_influenza_faa (hid_t file_id, const char* file_name);
 
 #endif // LOAD_INFLUENZA_FAA_H
diff --git a/src/updator.c b/src/updator.c
index 591d2f6..9a5ad18 100644
--- a/src/updator.c
+++ b/src/updator.c
@@ -4,6 +4,8 @@
 
 #include "assign/assign_protein_type.h"
 #include "error/check_h5_error.h"
+#include <stdio.h>
+#include <signal.h>
 
 #define FILE "influenza.h5"
author	Don Pellegrino <don@drexel.edu>	2010-01-19 20:27:10 (GMT)
committer	Don Pellegrino <don@drexel.edu>	2010-01-19 20:27:10 (GMT)
commit	9642c682be8bb2f1dd0eb616488ccaf2c7bb1ad8 (patch) (side-by-side diff)
tree	05ad69a1255a77ca0739420345a540a03cd304e1
parent	9cbe92a3f1ea871cb23818e32a1b0f85734d9453 (diff)
download	exp007-9642c682be8bb2f1dd0eb616488ccaf2c7bb1ad8.zip exp007-9642c682be8bb2f1dd0eb616488ccaf2c7bb1ad8.tar.gz exp007-9642c682be8bb2f1dd0eb616488ccaf2c7bb1ad8.tar.bz2