From d32c4e84c2969303b90474b19714efdd00ddb8f9 Mon Sep 17 00:00:00 2001
From: Don Pellegrino <don@drexel.edu>
Date: Mon, 18 Jan 2010 23:31:27 +0000
Subject: Organzied the functions into assign, error, load and model subdirectories.  Code does not compile as the transition to storing species and protein type data in a separate HDF5 table is incomplete.

---
diff --git a/analysis/year.R b/analysis/year.R
index b7da3c8..6d68925 100644
--- a/analysis/year.R
+++ b/analysis/year.R
@@ -5,21 +5,23 @@ require(hdf5);
 hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);
 
 A <- influenza.aa.dat;
-
 B <- influenza.faa;
 
+# Join the two tables by GB value.
+C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB");
+
 # All records for 1918.  Based on code from
 # http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
-C <- A[A$Year == 1918, ]
+D <- C[C$Year == 1918, ]
 
-# Countries represented in the 1918 dataset.
-C$Country;
+summary (D);
 
-# All record with a year value.
-D <- A[A$Year != 0, ];
+# Countries represented in the 1918 dataset.
+D$Country;
 
-hist(D$Year);
+D[D$"Protein Type" == "HA", ]
 
-B[B$GI == 305182, ]
+# All record with a year value.
+E <- A[A$Year != 0, ];
 
-B[B$"Protein Type" == "HA", ]
+hist(E$Year);
diff --git a/src/Makefile.am b/src/Makefile.am
index 6dd8e16..06a6f5c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -2,28 +2,31 @@ bin_PROGRAMS = aggregator updator
 
 aggregator_SOURCES = \
 	aggregator.c \
-	check_error.c \
-	check_h5_error.c \
-	load_influenza_aa_dat.c \
-	load_influenza_faa.c \
-	sequence_data_init.c
+	error/check_error.c \
+	error/check_h5_error.c \
+	load/load_influenza_aa_dat.c \
+	load/load_influenza_faa.c \
+	model/sequence_data_init.c
 
 updator_SOURCES = \
 	updator.c \
-	check_error.c \
-	check_h5_error.c \
-	check_ncbi_error.c \
-	assign_protein_type.c \
-	sequence_data_init.c
+	assign/assign_protein_type.c \
+	error/check_error.c \
+	error/check_h5_error.c \
+	error/check_ncbi_error.c \
+	model/gi_type_data_init.c \
+	model/sequence_data_init.c
 
 noinst_HEADERS = \
-	assign_protein_type.h \
-	check_error.h \
-	check_h5_error.h \
-	check_ncbi_error.h \
-	load_influenza_aa_dat.h \
-	load_influenza_faa.h \
-	sequence_data.h \
-	sequence_data_init.h
+	assign/assign_protein_type.h \
+	error/check_error.h \
+	error/check_h5_error.h \
+	error/check_ncbi_error.h \
+	load/load_influenza_aa_dat.h \
+	load/load_influenza_faa.h \
+	model/gi_type_data.h \
+	model/gi_type_data_init.h \
+	model/sequence_data.h \
+	model/sequence_data_init.h
 
 AM_CFLAGS = -Wall -std=gnu99 -ggdb
diff --git a/src/aggregator.c b/src/aggregator.c
index 07aba39..c00d912 100644
--- a/src/aggregator.c
+++ b/src/aggregator.c
@@ -3,9 +3,9 @@
  * container.
  */
 
-#include "check_h5_error.h"
-#include "load_influenza_aa_dat.h"
-#include "load_influenza_faa.h"
+#include "error/check_h5_error.h"
+#include "load/load_influenza_aa_dat.h"
+#include "load/load_influenza_faa.h"
 
 #define FILE "influenza.h5"
 
diff --git a/src/assign_protein_type.c b/src/assign/assign_protein_type.c
index ec3a959..0a11c23 100644
--- a/src/assign_protein_type.c
+++ b/src/assign/assign_protein_type.c
@@ -1,8 +1,10 @@
 #include "assign_protein_type.h"
-#include "check_ncbi_error.h"
-#include "check_h5_error.h"
-#include "sequence_data.h"
-#include "sequence_data_init.h"
+#include "error/check_ncbi_error.h"
+#include "error/check_h5_error.h"
+#include "model/gi_type_data.h"
+#include "model/gi_type_data_init.h"
+#include "model/sequence_data.h"
+#include "model/sequence_data_init.h"
 #include <ncbi.h>
 #include <readdb.h>
 #include <blast.h>
@@ -12,6 +14,16 @@
 #include <error.h>
 
 /*
+ * todo: Write results to an independent table by GI so that they can
+ * be reused when the input tables influenza.faa and influenza_aa.dat
+ * are replaced with new data.  Since GI values are unique and
+ * persistent the assignment of a protein type should stay valid
+ * unless the list of reference sequences is updated.
+ *
+ * todo: Save the species type as a new field { A, B, C }.
+ */
+
+/*
  * BLAST database containing all of the influenza protein sequences.
  */
 #define SEQDB "/home/don/exp004/influenzadb/influenzadb"
@@ -57,34 +69,57 @@ assign_protein_type (hid_t file_id)
   ValNodePtr error_returns = NULL;
 
   /*
-   * Read the data from HDF5 file.
+   * Read the data from HDF5 influenza.faa.
    */
-  hsize_t nfields;
-  hsize_t nrecords;
-  herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields,
-				      &nrecords);
+  hsize_t faa_nfields;
+  hsize_t faa_nrecords;
+  herr_t status = H5TBget_table_info (file_id, "influenza.faa", &faa_nfields,
+				      &faa_nrecords);
+  if (status < 0)
+    check_h5_error (status, __FILE__, __LINE__);
+
+  sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords);
+
+  size_t faa_size;
+  size_t faa_offset[SEQUENCE_DATA_FIELD_NUM];
+  size_t faa_sizes[SEQUENCE_DATA_FIELD_NUM];
+  hid_t faa_field_type[SEQUENCE_DATA_FIELD_NUM];
+  sequence_data_init (&faa_size, faa_offset, faa_sizes, faa_field_type);
+
+  status = H5TBread_table (file_id, "influenza.faa", faa_size, faa_offset,
+			   faa_sizes, faa_buf);
   if (status < 0)
     check_h5_error (status, __FILE__, __LINE__);
 
-  sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);
+  /*
+   * Read the data from HDF5 gi_type_data.
+   */
+  hsize_t gi_nfields;
+  hsize_t gi_nrecords;
+  status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields,
+			       &gi_nrecords);
+  if (status < 0)
+    check_h5_error (status, __FILE__, __LINE__);
+  
+  gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
 
-  size_t dst_size;
-  size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
-  size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
-  hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
-  sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);  
+  size_t gi_size;
+  size_t gi_offset[GI_TYPE_DATA_FIELD_NUM];
+  size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM];
+  hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
+  gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
 
-  status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
-			   dst_sizes, dst_buf);
+  status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
+			   gi_sizes, gi_buf);
   if (status < 0)
     check_h5_error (status, __FILE__, __LINE__);
 
   /*
    * Assign protein types to records for which the field is empty.
    */
-  printf ("Records to process: %i\n", (int)nrecords);
+  printf ("Records to process: %i\n", (int)faa_nrecords);
   bool updates_pending = false;
-  for (int i = 0; i < nrecords; i++)
+  for (int i = 0; i < faa_nrecords; i++)
     {
 
       if (dst_buf[i].protein_type[0] == '\0') {
@@ -130,8 +165,15 @@ assign_protein_type (hid_t file_id)
 	    Char target_id_buf[BUFFER_LEN + 1];
 	    SeqIdPtr target_id = SeqAlignId (seqalign, 1);
 	    SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
-	    strncpy (dst_buf[i].protein_type, &target_id_buf[6], 
-		     sizeof (dst_buf[i].protein_type));
+
+	    // Species Type
+	    dst_buf[i].type[0] = &target_id_buf[4];
+	    dst_buf[i].type[1] = '\0';
+
+	    // Protein Type
+	    strncpy (dst_buf[i].protein, &target_id_buf[6], 
+		     sizeof (dst_buf[i].protein));
+
 	    updates_pending = true;
 	  }
 
@@ -202,7 +244,8 @@ assign_protein_type (hid_t file_id)
       updates_pending = false;
     }
   
-  free (dst_buf);
+  free (faa_buf);
+  free (gi_buf);
   
   options = BLASTOptionDelete (options);
   
diff --git a/src/assign_protein_type.h b/src/assign/assign_protein_type.h
index 221154f..221154f 100644
--- a/src/assign_protein_type.h
+++ b/src/assign/assign_protein_type.h
diff --git a/src/check_error.c b/src/error/check_error.c
index 4630b50..4630b50 100644
--- a/src/check_error.c
+++ b/src/error/check_error.c
diff --git a/src/check_error.h b/src/error/check_error.h
index 2250c59..2250c59 100644
--- a/src/check_error.h
+++ b/src/error/check_error.h
diff --git a/src/check_h5_error.c b/src/error/check_h5_error.c
index d90b21f..d90b21f 100644
--- a/src/check_h5_error.c
+++ b/src/error/check_h5_error.c
diff --git a/src/check_h5_error.h b/src/error/check_h5_error.h
index e460e97..e460e97 100644
--- a/src/check_h5_error.h
+++ b/src/error/check_h5_error.h
diff --git a/src/check_ncbi_error.c b/src/error/check_ncbi_error.c
index 6071d1a..6071d1a 100644
--- a/src/check_ncbi_error.c
+++ b/src/error/check_ncbi_error.c
diff --git a/src/check_ncbi_error.h b/src/error/check_ncbi_error.h
index 45ac0ca..45ac0ca 100644
--- a/src/check_ncbi_error.h
+++ b/src/error/check_ncbi_error.h
diff --git a/src/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c
index aed33e8..8bf47aa 100644
--- a/src/load_influenza_aa_dat.c
+++ b/src/load/load_influenza_aa_dat.c
@@ -6,8 +6,8 @@
  */
 
 #include "load_influenza_aa_dat.h"
-#include "check_error.h"
-#include "check_h5_error.h"
+#include "error/check_error.h"
+#include "error/check_h5_error.h"
 #include <hdf5_hl.h>
 #include <string.h>
 #include <stdlib.h>
diff --git a/src/load_influenza_aa_dat.h b/src/load/load_influenza_aa_dat.h
index f6c60be..f6c60be 100644
--- a/src/load_influenza_aa_dat.h
+++ b/src/load/load_influenza_aa_dat.h
diff --git a/src/load_influenza_faa.c b/src/load/load_influenza_faa.c
index fd35254..a217989 100644
--- a/src/load_influenza_faa.c
+++ b/src/load/load_influenza_faa.c
@@ -1,8 +1,8 @@
-#include "check_error.h"
-#include "check_h5_error.h"
+#include "error/check_error.h"
+#include "error/check_h5_error.h"
 #include "load_influenza_faa.h"
-#include "sequence_data.h"
-#include "sequence_data_init.h"
+#include "model/sequence_data.h"
+#include "model/sequence_data_init.h"
 #include <hdf5_hl.h>
 #include <string.h>
 #include <stdlib.h>
@@ -57,8 +57,6 @@ load_influenza_faa (hid_t file_id)
 	  strncpy (p_data.description, strsep (&running, "|"),
 		   sizeof (p_data.description));
 
-	  strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));
-
 	  const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =
 	    SEQUENCE_DATA_FIELD_NAMES;
 
diff --git a/src/load_influenza_faa.h b/src/load/load_influenza_faa.h
index 569c411..569c411 100644
--- a/src/load_influenza_faa.h
+++ b/src/load/load_influenza_faa.h
diff --git a/src/model/gi_type_data.h b/src/model/gi_type_data.h
new file mode 100644
index 0000000..bf149c3
--- a/dev/null
+++ b/src/model/gi_type_data.h
@@ -0,0 +1,21 @@
+#ifndef GI_TYPE_DATA_H
+#define GI_TYPE_DATA_H
+
+#define GI_TYPE_DATA_FIELD_NUM 3
+
+/*
+ * Field names selected to match the names of fields on the "Influenza
+ * Virus Resource - Influenza Virus Sequence Database" search form
+ * online at
+ * http://www.ncbi.nlm.nih.gov/genomes/FLU/Database/select.cgi?go=database.
+ */
+#define GI_TYPE_DATA_FIELD_NAMES { "GI", "Type", "Protein" }
+
+typedef struct
+{
+  int gi;
+  char type[2];
+  char protein[7];
+} gi_type_data;
+
+#endif // GI_TYPE_DATA_H
diff --git a/src/model/gi_type_data_init.c b/src/model/gi_type_data_init.c
new file mode 100644
index 0000000..4a161c7
--- a/dev/null
+++ b/src/model/gi_type_data_init.c
@@ -0,0 +1,36 @@
+#include "gi_type_data_init.h"
+#include "gi_type_data.h"
+
+/*
+ * There should be some way to automate the generation of these values
+ * or at least this source file based on just the definition of the
+ * struct.  Perhaps an HDF5 precompiler could do such a thing.
+ */
+void
+gi_type_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+		   hid_t *field_type)
+{
+  *dst_size = sizeof (gi_type_data);
+
+  dst_offset[0] = HOFFSET (gi_type_data, gi);
+  dst_offset[1] = HOFFSET (gi_type_data, type);
+  dst_offset[2] = HOFFSET (gi_type_data, protein);
+
+  gi_type_data dst_buf[1];
+
+  dst_sizes[0] = sizeof (dst_buf[0].gi);
+  dst_sizes[1] = sizeof (dst_buf[0].type);
+  dst_sizes[2] = sizeof (dst_buf[0].protein);
+
+  field_type[0] = H5T_NATIVE_INT;
+
+  hid_t type_h5t = H5Tcopy (H5T_C_S1);
+  H5Tset_size (type_h5t, 2);
+  field_type[1] = type_h5t;
+
+  hid_t protein_h5t = H5Tcopy (H5T_C_S1);
+  H5Tset_size (protein_h5t, 7);
+  field_type[2] = protein_h5t;
+
+  return;
+}
diff --git a/src/model/gi_type_data_init.h b/src/model/gi_type_data_init.h
new file mode 100644
index 0000000..5c45cba
--- a/dev/null
+++ b/src/model/gi_type_data_init.h
@@ -0,0 +1,14 @@
+#ifndef GI_TYPE_DATA_INIT_H
+#define GI_TYPE_DATA_INIT_H
+
+#include <hdf5.h>
+
+/*
+ * Initialize the structures describing the struct.  These descriptive
+ * structures are used by the HDF5 API.
+ */
+void
+gi_type_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+		   hid_t *field_type);
+
+#endif // GI_TYPE_DATA_INIT_H
diff --git a/src/sequence_data.h b/src/model/sequence_data.h
index fb4ff78..9db91ac 100644
--- a/src/sequence_data.h
+++ b/src/model/sequence_data.h
@@ -1,16 +1,15 @@
 #ifndef SEQUENCE_DATA_H
 #define SEQUENCE_DATA_H
 
-#define SEQUENCE_DATA_FIELD_NUM 4
+#define SEQUENCE_DATA_FIELD_NUM 3
 
-#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" }
+#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description" }
 
 typedef struct
 {
   int gi;
   char gb[9];
   char description[196];
-  char protein_type[7];
 } sequence_data;
 
 #endif // SEQUENCE_DATA_H
diff --git a/src/sequence_data_init.c b/src/model/sequence_data_init.c
index 09ba189..f6b3b1f 100644
--- a/src/sequence_data_init.c
+++ b/src/model/sequence_data_init.c
@@ -10,14 +10,12 @@ sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
   dst_offset[0] = HOFFSET (sequence_data, gi);
   dst_offset[1] = HOFFSET (sequence_data, gb);
   dst_offset[2] = HOFFSET (sequence_data, description);
-  dst_offset[3] = HOFFSET (sequence_data, protein_type);
   
   sequence_data dst_buf[1];
   
   dst_sizes[0] = sizeof (dst_buf[0].gi);
   dst_sizes[1] = sizeof (dst_buf[0].gb);
   dst_sizes[2] = sizeof (dst_buf[0].description);
-  dst_sizes[3] = sizeof (dst_buf[0].protein_type);
   
   field_type[0] = H5T_NATIVE_INT;
   
@@ -28,10 +26,6 @@ sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
   hid_t description_type = H5Tcopy (H5T_C_S1);
   H5Tset_size (description_type, 196);
   field_type[2] = description_type;
-  
-  hid_t protein_type_type = H5Tcopy (H5T_C_S1);
-  H5Tset_size (protein_type_type, 7);
-  field_type[3] = protein_type_type;
 
   return;
 }
diff --git a/src/sequence_data_init.h b/src/model/sequence_data_init.h
index c87e7e6..c87e7e6 100644
--- a/src/sequence_data_init.h
+++ b/src/model/sequence_data_init.h
diff --git a/src/updator.c b/src/updator.c
index 8d556da..591d2f6 100644
--- a/src/updator.c
+++ b/src/updator.c
@@ -2,8 +2,8 @@
  * Update derived fields.
  */
 
-#include "assign_protein_type.h"
-#include "check_h5_error.h"
+#include "assign/assign_protein_type.h"
+#include "error/check_h5_error.h"
 
 #define FILE "influenza.h5"
 
--
cgit v0.8.3.1-22-g547a