4 files changed, 145 insertions, 41 deletions
diff --git a/README b/README
index 9caedb8..197d289 100644
--- a/README
+++ b/README
@@ -32,4 +32,32 @@ The "doc/Data Deployments.dia" diagram shows the source systems that
 expose the various records as well as the transform routines that are
 used for aggregation of the data on the local system.
- LocalWords:  NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia
+BUILDING
+An autogen.sh script is provided to initialize the project directory
+with the necessary GNU Autotools configuration.
+When building on a Debian system the mpi.h file is in a subdirectory
+of /usr/include and therefore not found within the default include
+path.  To account for this run the following before running
+./configure.
+  $ export CPPFLAGS=-I/usr/include/mpi
+TEST CASES
+The "load_influenza_aa_dat" function loads a single tab delimited text
+file into a table structure in the HDF5 file.  The HDFView GUI can be
+used to open the loaded table and then export it back out as a text
+file.  The text file can then be compared with the original input to
+verify that the load was completed without error.
+  $ diff --report-identical-files \
+    /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat \
+    Protein\ Sequences.txt 
+  Files /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat and
+  Protein Sequences.txt are identical
+ LocalWords:  NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia mpi
+ LocalWords:  autogen Autotools CPPFLAGS aa dat HDFView GUI diff txt
diff --git a/doc/Data Deployments.dia b/doc/Data Deployments.dia
index b8ad4af..277d53a 100644
--- a/doc/Data Deployments.dia
+++ b/doc/Data Deployments.dia
Binary files differ
diff --git a/src/aggregator.c b/src/aggregator.c
index ae5aa60..da6db08 100644
--- a/src/aggregator.c
+++ b/src/aggregator.c
@@ -24,7 +24,7 @@ main()
  /*
   * Close the HDF5 file.
   */
-  herr_t status = H5Fclose (file_id);
+  H5Fclose (file_id);
  return 0;
 }
diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c
index 72aacb5..5af8a72 100644
--- a/src/load_influenza_aa_dat.c
+++ b/src/load_influenza_aa_dat.c
@@ -1,10 +1,17 @@
+/*
+ * Load the influenza_aa.dat tab delimited text file into an HDF5
+ * binary table.
+ *
+ * todo: Handle NULL values occurring in numeric fields.
+ */
 #include "load_influenza_aa_dat.h"
 #include "hdf5_hl.h"
+#include <string.h>
+#include <stdlib.h>
 #define NFIELDS (hsize_t) 11
-//#define NRECORDS (hsize_t) 138052
+#define TABLE_NAME "Protein Sequences"
-#define NRECORDS (hsize_t) 1
-#define TABLE_NAME "influenza_aa.dat"
 void
 load_influenza_aa_dat (hid_t file_id)
@@ -12,14 +19,14 @@ load_influenza_aa_dat (hid_t file_id)
  /*
   * Model the data using native types.
   */
-  typedef struct supplementary_data
+  typedef struct
  {
    char genbank_accession_number[9];
    char host[15];
    int genome_segment_number;
    char subtype[7];
    char country[25];
-    int year;
+    char year[8];
    int sequence_length;
    char virus_name[196];
    char age[17];
@@ -49,11 +56,7 @@ load_influenza_aa_dat (hid_t file_id)
                                 HOFFSET ( supplementary_data, gender ),
                                 HOFFSET ( supplementary_data, full_length_indicator )};
-  /*
+  supplementary_data dst_buf[1];
-    Only needed for reading?
-  supplementary_data dst_buf[NRECORDS];
  size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ),
                                sizeof ( dst_buf[0].host ),
@@ -66,29 +69,9 @@ load_influenza_aa_dat (hid_t file_id)
                                sizeof ( dst_buf[0].age ),
                                sizeof ( dst_buf[0].gender ),
                                sizeof ( dst_buf[0].full_length_indicator)};
-  */
  /*
-   * "Define field information."
+   * Map the native types to HDF5 types for each field.
-   */
-  const char *field_names[NFIELDS] =
-    { "GenBank accession number",
-      "Host",
-      "Genome segment number",
-      "Subtype",
-      "Country",
-      "Year",
-      "Sequence length",
-      "Virus name",
-      "Age",
-      "Gender",
-      "Full-length Indicator" };
-  hsize_t chunk_size = 10;
-  int *fill_data = NULL;
-  int compress = 0;
-  
-  /*
-   * "Initialize field type."
   */
  hid_t field_type[NFIELDS];
@@ -110,7 +93,9 @@ load_influenza_aa_dat (hid_t file_id)
  H5Tset_size (country_type, 25 );
  field_type[4] = country_type;
-  field_type[5] = H5T_NATIVE_INT; 
+  hid_t year_type = H5Tcopy ( H5T_C_S1 );
+  H5Tset_size (year_type, 8);
+  field_type[5] = year_type;
  field_type[6] = H5T_NATIVE_INT;
@@ -130,19 +115,110 @@ load_influenza_aa_dat (hid_t file_id)
  H5Tset_size (full_length_indicator_type, 4);
  field_type[10] = full_length_indicator_type;
-  supplementary_data p_data[NRECORDS] = {
+  /*
-    {"BAC53999", "Human", 7, "", "Zambia", 1999, 109, "Influenza B virus (B/Lusaka/270/99)",
+   * Labels used for the fields in the table.
-     "", "", "yes"}
+   */
-  };
+  const char *field_names[NFIELDS] =
+    { "GenBank accession number",
+      "Host",
+      "Genome segment number",
+      "Subtype",
+      "Country",
+      "Year",
+      "Sequence length",
+      "Virus name",
+      "Age",
+      "Gender",
+      "Full-length Indicator" };
+  /*
+   * Table storage options.
+   */
+  hsize_t chunk_size = 10;
+  int *fill_data = NULL;
+  int compress = 0;
+  /*
+   * Insert the records.
+   */
+  supplementary_data p_data;
+  FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r");
+  char *line = NULL;
+  size_t len = 0;
+  int current_line = 0;
+  while (getline (&line, &len, dat) != -1) {
+    current_line++;
+    char *running = strdup (line);
+    char *token;
+    
+    /*
+     * Parse the line, handling the case of empty fields represented
+     * by sequential delimiters.
+     */
+    strncpy(p_data.genbank_accession_number, strsep (&running, "\t"),
+            sizeof(p_data.genbank_accession_number));
+    
+    strncpy(p_data.host, strsep (&running, "\t"),
+            sizeof(p_data.host));
+    
+    token = strsep (&running, "\t");
+    if (strcmp (token, "\0") == 0)
+      p_data.genome_segment_number = 0;
+    else
+      p_data.genome_segment_number = atoi(token);
+    
+    strncpy(p_data.subtype, strsep (&running, "\t"),
+            sizeof(p_data.subtype));
+    
+    strncpy(p_data.country, strsep (&running, "\t"),
+            sizeof(p_data.country));
+    
+    strncpy (p_data.year, strsep (&running, "\t"),
+             sizeof(p_data.year));
+    token = strsep (&running, "\t");
+    if (strcmp (token, "\0") == 0)
+      p_data.sequence_length = 0;
+    else
+      p_data.sequence_length = atoi(token);
+    
+    strncpy(p_data.virus_name, strsep (&running, "\t"),
+            sizeof(p_data.virus_name));
+    
+    strncpy(p_data.age, strsep (&running, "\t"),
+            sizeof(p_data.age));
+    
+    strncpy(p_data.gender, strsep (&running, "\t"),
+            sizeof(p_data.gender));
+    
+    strncpy(p_data.full_length_indicator, strsep (&running, "\t"),
+            sizeof(p_data.full_length_indicator));
+    if (current_line == 1)     
+      H5TBmake_table ("Protein Sequences", file_id, TABLE_NAME,NFIELDS,1,
+                      dst_size,field_names, dst_offset, field_type,
+                      chunk_size, fill_data, compress, &p_data);
+    else     
+      H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset,
+                         dst_sizes, &p_data);
+    if (running)
+      free (running);
+   
+  }
+  
+  if (line)
+    free (line);
-  herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,NRECORDS,
+  fclose (dat);
-                                  dst_size,field_names, dst_offset, field_type,
-                                  chunk_size, fill_data, compress, p_data);
  H5Tclose (genbank_accession_number_type);
  H5Tclose (host_type);
  H5Tclose (subtype_type);
  H5Tclose (country_type);
+  H5Tclose (year_type);
  H5Tclose (virus_name_type);
  H5Tclose (age_type);
  H5Tclose (gender_type);

diff --git a/README b/README index 9caedb8..197d289 100644 --- a/README +++ b/README
@@ -32,4 +32,32 @@ The "doc/Data Deployments.dia" diagram shows the source systems that
32	expose the various records as well as the transform routines that are	32	expose the various records as well as the transform routines that are
33	used for aggregation of the data on the local system.	33	used for aggregation of the data on the local system.
34		34
35	LocalWords: NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia	35	BUILDING
		36
		37	An autogen.sh script is provided to initialize the project directory
		38	with the necessary GNU Autotools configuration.
		39
		40	When building on a Debian system the mpi.h file is in a subdirectory
		41	of /usr/include and therefore not found within the default include
		42	path. To account for this run the following before running
		43	./configure.
		44
		45	$ export CPPFLAGS=-I/usr/include/mpi
		46
		47	TEST CASES
		48
		49	The "load_influenza_aa_dat" function loads a single tab delimited text
		50	file into a table structure in the HDF5 file. The HDFView GUI can be
		51	used to open the loaded table and then export it back out as a text
		52	file. The text file can then be compared with the original input to
		53	verify that the load was completed without error.
		54
		55	$ diff --report-identical-files \
		56	/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat \
		57	Protein\ Sequences.txt
		58
		59	Files /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat and
		60	Protein Sequences.txt are identical
		61
		62	LocalWords: NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia mpi
		63	LocalWords: autogen Autotools CPPFLAGS aa dat HDFView GUI diff txt


diff --git a/doc/Data Deployments.dia b/doc/Data Deployments.dia index b8ad4af..277d53a 100644 --- a/doc/Data Deployments.dia +++ b/doc/Data Deployments.dia
Binary files differ


diff --git a/src/aggregator.c b/src/aggregator.c index ae5aa60..da6db08 100644 --- a/src/aggregator.c +++ b/src/aggregator.c
@@ -24,7 +24,7 @@ main()
24	/*	24	/*
25	* Close the HDF5 file.	25	* Close the HDF5 file.
26	*/	26	*/
27	herr_t status = H5Fclose (file_id);	27	H5Fclose (file_id);
28		28
29	return 0;	29	return 0;
30	}	30	}


diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c index 72aacb5..5af8a72 100644 --- a/src/load_influenza_aa_dat.c +++ b/src/load_influenza_aa_dat.c
@@ -1,10 +1,17 @@
		1	/*
		2	* Load the influenza_aa.dat tab delimited text file into an HDF5
		3	* binary table.
		4	*
		5	* todo: Handle NULL values occurring in numeric fields.
		6	*/
		7
1	#include "load_influenza_aa_dat.h"	8	#include "load_influenza_aa_dat.h"
2	#include "hdf5_hl.h"	9	#include "hdf5_hl.h"
		10	#include <string.h>
		11	#include <stdlib.h>
3		12
4	#define NFIELDS (hsize_t) 11	13	#define NFIELDS (hsize_t) 11
5	//#define NRECORDS (hsize_t) 138052	14	#define TABLE_NAME "Protein Sequences"
6	#define NRECORDS (hsize_t) 1
7	#define TABLE_NAME "influenza_aa.dat"
8		15
9	void	16	void
10	load_influenza_aa_dat (hid_t file_id)	17	load_influenza_aa_dat (hid_t file_id)
@@ -12,14 +19,14 @@ load_influenza_aa_dat (hid_t file_id)
12	/*	19	/*
13	* Model the data using native types.	20	* Model the data using native types.
14	*/	21	*/
15	typedef struct supplementary_data	22	typedef struct
16	{	23	{
17	char genbank_accession_number[9];	24	char genbank_accession_number[9];
18	char host[15];	25	char host[15];
19	int genome_segment_number;	26	int genome_segment_number;
20	char subtype[7];	27	char subtype[7];
21	char country[25];	28	char country[25];
22	int year;	29	char year[8];
23	int sequence_length;	30	int sequence_length;
24	char virus_name[196];	31	char virus_name[196];
25	char age[17];	32	char age[17];
@@ -49,11 +56,7 @@ load_influenza_aa_dat (hid_t file_id)
49	HOFFSET ( supplementary_data, gender ),	56	HOFFSET ( supplementary_data, gender ),
50	HOFFSET ( supplementary_data, full_length_indicator )};	57	HOFFSET ( supplementary_data, full_length_indicator )};
51		58
52	/*	59	supplementary_data dst_buf[1];
53
54	Only needed for reading?
55
56	supplementary_data dst_buf[NRECORDS];
57		60
58	size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ),	61	size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ),
59	sizeof ( dst_buf[0].host ),	62	sizeof ( dst_buf[0].host ),
@@ -66,29 +69,9 @@ load_influenza_aa_dat (hid_t file_id)
66	sizeof ( dst_buf[0].age ),	69	sizeof ( dst_buf[0].age ),
67	sizeof ( dst_buf[0].gender ),	70	sizeof ( dst_buf[0].gender ),
68	sizeof ( dst_buf[0].full_length_indicator)};	71	sizeof ( dst_buf[0].full_length_indicator)};
69	*/
70		72
71	/*	73	/*
72	* "Define field information."	74	* Map the native types to HDF5 types for each field.
73	*/
74	const char *field_names[NFIELDS] =
75	{ "GenBank accession number",
76	"Host",
77	"Genome segment number",
78	"Subtype",
79	"Country",
80	"Year",
81	"Sequence length",
82	"Virus name",
83	"Age",
84	"Gender",
85	"Full-length Indicator" };
86	hsize_t chunk_size = 10;
87	int *fill_data = NULL;
88	int compress = 0;
89
90	/*
91	* "Initialize field type."
92	*/	75	*/
93	hid_t field_type[NFIELDS];	76	hid_t field_type[NFIELDS];
94		77
@@ -110,7 +93,9 @@ load_influenza_aa_dat (hid_t file_id)
110	H5Tset_size (country_type, 25 );	93	H5Tset_size (country_type, 25 );
111	field_type[4] = country_type;	94	field_type[4] = country_type;
112		95
113	field_type[5] = H5T_NATIVE_INT;	96	hid_t year_type = H5Tcopy ( H5T_C_S1 );
		97	H5Tset_size (year_type, 8);
		98	field_type[5] = year_type;
114		99
115	field_type[6] = H5T_NATIVE_INT;	100	field_type[6] = H5T_NATIVE_INT;
116		101
@@ -130,19 +115,110 @@ load_influenza_aa_dat (hid_t file_id)
130	H5Tset_size (full_length_indicator_type, 4);	115	H5Tset_size (full_length_indicator_type, 4);
131	field_type[10] = full_length_indicator_type;	116	field_type[10] = full_length_indicator_type;
132		117
133	supplementary_data p_data[NRECORDS] = {	118	/*
134	{"BAC53999", "Human", 7, "", "Zambia", 1999, 109, "Influenza B virus (B/Lusaka/270/99)",	119	* Labels used for the fields in the table.
135	"", "", "yes"}	120	*/
136	};	121	const char *field_names[NFIELDS] =
		122	{ "GenBank accession number",
		123	"Host",
		124	"Genome segment number",
		125	"Subtype",
		126	"Country",
		127	"Year",
		128	"Sequence length",
		129	"Virus name",
		130	"Age",
		131	"Gender",
		132	"Full-length Indicator" };
		133
		134	/*
		135	* Table storage options.
		136	*/
		137	hsize_t chunk_size = 10;
		138	int *fill_data = NULL;
		139	int compress = 0;
		140
		141	/*
		142	* Insert the records.
		143	*/
		144	supplementary_data p_data;
		145	FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r");
		146	char *line = NULL;
		147	size_t len = 0;
		148	int current_line = 0;
		149
		150	while (getline (&line, &len, dat) != -1) {
		151
		152	current_line++;
		153	char *running = strdup (line);
		154	char *token;
		155
		156	/*
		157	* Parse the line, handling the case of empty fields represented
		158	* by sequential delimiters.
		159	*/
		160	strncpy(p_data.genbank_accession_number, strsep (&running, "\t"),
		161	sizeof(p_data.genbank_accession_number));
		162
		163	strncpy(p_data.host, strsep (&running, "\t"),
		164	sizeof(p_data.host));
		165
		166	token = strsep (&running, "\t");
		167	if (strcmp (token, "\0") == 0)
		168	p_data.genome_segment_number = 0;
		169	else
		170	p_data.genome_segment_number = atoi(token);
		171
		172	strncpy(p_data.subtype, strsep (&running, "\t"),
		173	sizeof(p_data.subtype));
		174
		175	strncpy(p_data.country, strsep (&running, "\t"),
		176	sizeof(p_data.country));
		177
		178	strncpy (p_data.year, strsep (&running, "\t"),
		179	sizeof(p_data.year));
		180
		181	token = strsep (&running, "\t");
		182	if (strcmp (token, "\0") == 0)
		183	p_data.sequence_length = 0;
		184	else
		185	p_data.sequence_length = atoi(token);
		186
		187	strncpy(p_data.virus_name, strsep (&running, "\t"),
		188	sizeof(p_data.virus_name));
		189
		190	strncpy(p_data.age, strsep (&running, "\t"),
		191	sizeof(p_data.age));
		192
		193	strncpy(p_data.gender, strsep (&running, "\t"),
		194	sizeof(p_data.gender));
		195
		196	strncpy(p_data.full_length_indicator, strsep (&running, "\t"),
		197	sizeof(p_data.full_length_indicator));
		198
		199	if (current_line == 1)
		200	H5TBmake_table ("Protein Sequences", file_id, TABLE_NAME,NFIELDS,1,
		201	dst_size,field_names, dst_offset, field_type,
		202	chunk_size, fill_data, compress, &p_data);
		203	else
		204	H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset,
		205	dst_sizes, &p_data);
		206
		207	if (running)
		208	free (running);
		209
		210	}
		211
		212	if (line)
		213	free (line);
137		214
138	herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,NRECORDS,	215	fclose (dat);
139	dst_size,field_names, dst_offset, field_type,
140	chunk_size, fill_data, compress, p_data);
141		216
142	H5Tclose (genbank_accession_number_type);	217	H5Tclose (genbank_accession_number_type);
143	H5Tclose (host_type);	218	H5Tclose (host_type);
144	H5Tclose (subtype_type);	219	H5Tclose (subtype_type);
145	H5Tclose (country_type);	220	H5Tclose (country_type);
		221	H5Tclose (year_type);
146	H5Tclose (virus_name_type);	222	H5Tclose (virus_name_type);
147	H5Tclose (age_type);	223	H5Tclose (age_type);
148	H5Tclose (gender_type);	224	H5Tclose (gender_type);