summary refs log tree commit diff stats
author Don Pellegrino <don@drexel.edu> 2010-01-16 01:46:51 (GMT)
committer Don Pellegrino <don@drexel.edu> 2010-01-16 01:46:51 (GMT)
commit 4498865663dc42c25faf72e6ff72675538cdd697 (patch) (side-by-side diff)
tree ccbf407b218f2e1272070eb19f26d6d1b6d744e2
parent 85c4b93261bec06aeaa677caaa31ffdc5ae9a814 (diff)
download exp007-4498865663dc42c25faf72e6ff72675538cdd697.zip
exp007-4498865663dc42c25faf72e6ff72675538cdd697.tar.gz
exp007-4498865663dc42c25faf72e6ff72675538cdd697.tar.bz2
Implemented the loading of the influenza_aa.dat file. Tested by
comparing an export of the data from the HDF5 file and the original file.
-rw-r--r-- README 30
-rw-r--r-- doc/Data Deployments.dia bin 3566 -> 4057 bytes
-rw-r--r-- src/aggregator.c 2
-rw-r--r-- src/load_influenza_aa_dat.c 154
4 files changed, 145 insertions, 41 deletions
diff --git a/README b/README
index 9caedb8..197d289 100644
--- a/README
+++ b/README
@@ -32,4 +32,32 @@ The "doc/Data Deployments.dia" diagram shows the source systems that
expose the various records as well as the transform routines that are
used for aggregation of the data on the local system.
- LocalWords: NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia
+BUILDING
+
+An autogen.sh script is provided to initialize the project directory
+with the necessary GNU Autotools configuration.
+
+When building on a Debian system, the mpi.h file is in a subdirectory
+of /usr/include and is therefore not found within the default include
+path. To account for this, run the following before running
+./configure:
+
+ $ export CPPFLAGS=-I/usr/include/mpi
+
+TEST CASES
+
+The "load_influenza_aa_dat" function loads a single tab-delimited text
+file into a table structure in the HDF5 file. The HDFView GUI can be
+used to open the loaded table and then export it back out as a text
+file. The text file can then be compared with the original input to
+verify that the load was completed without error.
+
+ $ diff --report-identical-files \
+ /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat \
+ Protein\ Sequences.txt
+
+ Files /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat and
+ Protein Sequences.txt are identical
+
+ LocalWords: NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia mpi
+ LocalWords: autogen Autotools CPPFLAGS aa dat HDFView GUI diff txt
diff --git a/doc/Data Deployments.dia b/doc/Data Deployments.dia
index b8ad4af..277d53a 100644
--- a/doc/Data Deployments.dia
+++ b/doc/Data Deployments.dia
Binary files differ
diff --git a/src/aggregator.c b/src/aggregator.c
index ae5aa60..da6db08 100644
--- a/src/aggregator.c
+++ b/src/aggregator.c
@@ -24,7 +24,7 @@ main()
/*
* Close the HD5 file.
*/
- herr_t status = H5Fclose (file_id);
+ H5Fclose (file_id);
return 0;
}
diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c
index 72aacb5..5af8a72 100644
--- a/src/load_influenza_aa_dat.c
+++ b/src/load_influenza_aa_dat.c
@@ -1,10 +1,17 @@
+/*
+ * Load the influenza_aa.dat tab-delimited text file into an HDF5
+ * binary table.
+ *
+ * todo: Handle NULL values occurring in numeric fields.
+ */
+
#include "load_influenza_aa_dat.h"
#include "hdf5_hl.h"
+#include <string.h>
+#include <stdlib.h>
#define NFIELDS (hsize_t) 11
-//#define NRECORDS (hsize_t) 138052
-#define NRECORDS (hsize_t) 1
-#define TABLE_NAME "influenza_aa.dat"
+#define TABLE_NAME "Protein Sequences"
void
load_influenza_aa_dat (hid_t file_id)
@@ -12,14 +19,14 @@ load_influenza_aa_dat (hid_t file_id)
/*
* Model the data using native types.
*/
- typedef struct supplementary_data
+ typedef struct
{
char genbank_accession_number[9];
char host[15];
int genome_segment_number;
char subtype[7];
char country[25];
- int year;
+ char year[8];
int sequence_length;
char virus_name[196];
char age[17];
@@ -49,11 +56,7 @@ load_influenza_aa_dat (hid_t file_id)
HOFFSET ( supplementary_data, gender ),
HOFFSET ( supplementary_data, full_length_indicator )};
- /*
-
- Only needed for reading?
-
- supplementary_data dst_buf[NRECORDS];
+ supplementary_data dst_buf[1];
size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ),
sizeof ( dst_buf[0].host ),
@@ -66,29 +69,9 @@ load_influenza_aa_dat (hid_t file_id)
sizeof ( dst_buf[0].age ),
sizeof ( dst_buf[0].gender ),
sizeof ( dst_buf[0].full_length_indicator)};
- */
/*
- * "Define field information."
- */
- const char *field_names[NFIELDS] =
- { "GenBank accession number",
- "Host",
- "Genome segment number",
- "Subtype",
- "Country",
- "Year",
- "Sequence length",
- "Virus name",
- "Age",
- "Gender",
- "Full-length Indicator" };
- hsize_t chunk_size = 10;
- int *fill_data = NULL;
- int compress = 0;
-
- /*
- * "Initialize field type."
+ * Map the native types to HDF5 types for each field.
*/
hid_t field_type[NFIELDS];
@@ -110,7 +93,9 @@ load_influenza_aa_dat (hid_t file_id)
H5Tset_size (country_type, 25 );
field_type[4] = country_type;
- field_type[5] = H5T_NATIVE_INT;
+ hid_t year_type = H5Tcopy ( H5T_C_S1 );
+ H5Tset_size (year_type, 8);
+ field_type[5] = year_type;
field_type[6] = H5T_NATIVE_INT;
@@ -130,19 +115,110 @@ load_influenza_aa_dat (hid_t file_id)
H5Tset_size (full_length_indicator_type, 4);
field_type[10] = full_length_indicator_type;
- supplementary_data p_data[NRECORDS] = {
- {"BAC53999", "Human", 7, "", "Zambia", 1999, 109, "Influenza B virus (B/Lusaka/270/99)",
- "", "", "yes"}
- };
+ /*
+ * Labels used for the fields in the table.
+ */
+ const char *field_names[NFIELDS] =
+ { "GenBank accession number",
+ "Host",
+ "Genome segment number",
+ "Subtype",
+ "Country",
+ "Year",
+ "Sequence length",
+ "Virus name",
+ "Age",
+ "Gender",
+ "Full-length Indicator" };
+
+ /*
+ * Table storage options.
+ */
+ hsize_t chunk_size = 10;
+ int *fill_data = NULL;
+ int compress = 0;
+
+ /*
+ * Insert the records.
+ */
+ supplementary_data p_data;
+ FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r");
+ char *line = NULL;
+ size_t len = 0;
+ int current_line = 0;
+
+ while (getline (&line, &len, dat) != -1) {
+
+ current_line++;
+ char *running = strdup (line);
+ char *token;
+
+ /*
+ * Parse the line, handling the case of empty fields represented
+ * by sequential delimiters.
+ */
+ strncpy(p_data.genbank_accession_number, strsep (&running, "\t"),
+ sizeof(p_data.genbank_accession_number));
+
+ strncpy(p_data.host, strsep (&running, "\t"),
+ sizeof(p_data.host));
+
+ token = strsep (&running, "\t");
+ if (strcmp (token, "\0") == 0)
+ p_data.genome_segment_number = 0;
+ else
+ p_data.genome_segment_number = atoi(token);
+
+ strncpy(p_data.subtype, strsep (&running, "\t"),
+ sizeof(p_data.subtype));
+
+ strncpy(p_data.country, strsep (&running, "\t"),
+ sizeof(p_data.country));
+
+ strncpy (p_data.year, strsep (&running, "\t"),
+ sizeof(p_data.year));
+
+ token = strsep (&running, "\t");
+ if (strcmp (token, "\0") == 0)
+ p_data.sequence_length = 0;
+ else
+ p_data.sequence_length = atoi(token);
+
+ strncpy(p_data.virus_name, strsep (&running, "\t"),
+ sizeof(p_data.virus_name));
+
+ strncpy(p_data.age, strsep (&running, "\t"),
+ sizeof(p_data.age));
+
+ strncpy(p_data.gender, strsep (&running, "\t"),
+ sizeof(p_data.gender));
+
+ strncpy(p_data.full_length_indicator, strsep (&running, "\t"),
+ sizeof(p_data.full_length_indicator));
+
+ if (current_line == 1)
+ H5TBmake_table ("Protein Sequences", file_id, TABLE_NAME,NFIELDS,1,
+ dst_size,field_names, dst_offset, field_type,
+ chunk_size, fill_data, compress, &p_data);
+ else
+ H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset,
+ dst_sizes, &p_data);
+
+ if (running)
+ free (running);
+
+ }
+
+ if (line)
+ free (line);
- herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,NRECORDS,
- dst_size,field_names, dst_offset, field_type,
- chunk_size, fill_data, compress, p_data);
+ fclose (dat);
H5Tclose (genbank_accession_number_type);
H5Tclose (host_type);
H5Tclose (subtype_type);
H5Tclose (country_type);
+ H5Tclose (year_type);
H5Tclose (virus_name_type);
H5Tclose (age_type);
H5Tclose (gender_type);

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.