From 4498865663dc42c25faf72e6ff72675538cdd697 Mon Sep 17 00:00:00 2001 From: Don Pellegrino Date: Sat, 16 Jan 2010 01:46:51 +0000 Subject: Implemented the loading of the influenza_aa.dat file. Tested by comparing an export of the data from the HDF5 file and the original file. --- diff --git a/README b/README index 9caedb8..197d289 100644 --- a/README +++ b/README @@ -32,4 +32,32 @@ The "doc/Data Deployments.dia" diagram shows the source systems that expose the various records as well as the transform routines that are used for aggregation of the data on the local system. - LocalWords: NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia +BUILDING + +An autogen.sh script is provided to initialize the project directory +with the necessary GNU Autotools configuration. + +When building on a Debian system the mpi.h file is in a subdirectory +of /usr/include and therefore not found within the default include +path. To account for this run the following before running +./configure. + + $ export CPPFLAGS=-I/usr/include/mpi + +TEST CASES + +The "load_influenza_aa_dat" function loads a single tab delimited text +file into a table structure in the HDF5 file. The HDFView GUI can be +used to open the loaded table and then export it back out as a text +file. The text file can then be compared with the original input to +verify that the load was completed without error. 
+ + $ diff --report-identical-files \ + /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat \ + Protein\ Sequences.txt + + Files /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat and + Protein Sequences.txt are identical + + LocalWords: NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia mpi + LocalWords: autogen Autotools CPPFLAGS aa dat HDFView GUI diff txt diff --git a/doc/Data Deployments.dia b/doc/Data Deployments.dia index b8ad4af..277d53a 100644 --- a/doc/Data Deployments.dia +++ b/doc/Data Deployments.dia Binary files differ diff --git a/src/aggregator.c b/src/aggregator.c index ae5aa60..da6db08 100644 --- a/src/aggregator.c +++ b/src/aggregator.c @@ -24,7 +24,7 @@ main() /* * Close the HD5 file. */ - herr_t status = H5Fclose (file_id); + H5Fclose (file_id); return 0; } diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c index 72aacb5..5af8a72 100644 --- a/src/load_influenza_aa_dat.c +++ b/src/load_influenza_aa_dat.c @@ -1,10 +1,17 @@ +/* + * Load the influenza_aa.dat tab delimited text file into an HDF5 + * binary table. + * + * todo: Handle NULL values occurring in numeric fields. + */ + #include "load_influenza_aa_dat.h" #include "hdf5_hl.h" +#include +#include #define NFIELDS (hsize_t) 11 -//#define NRECORDS (hsize_t) 138052 -#define NRECORDS (hsize_t) 1 -#define TABLE_NAME "influenza_aa.dat" +#define TABLE_NAME "Protein Sequences" void load_influenza_aa_dat (hid_t file_id) @@ -12,14 +19,14 @@ load_influenza_aa_dat (hid_t file_id) /* * Model the data using native types. */ - typedef struct supplementary_data + typedef struct { char genbank_accession_number[9]; char host[15]; int genome_segment_number; char subtype[7]; char country[25]; - int year; + char year[8]; int sequence_length; char virus_name[196]; char age[17]; @@ -49,11 +56,7 @@ load_influenza_aa_dat (hid_t file_id) HOFFSET ( supplementary_data, gender ), HOFFSET ( supplementary_data, full_length_indicator )}; - /* - - Only needed for reading? 
- - supplementary_data dst_buf[NRECORDS]; + supplementary_data dst_buf[1]; size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ), sizeof ( dst_buf[0].host ), @@ -66,29 +69,9 @@ load_influenza_aa_dat (hid_t file_id) sizeof ( dst_buf[0].age ), sizeof ( dst_buf[0].gender ), sizeof ( dst_buf[0].full_length_indicator)}; - */ /* - * "Define field information." - */ - const char *field_names[NFIELDS] = - { "GenBank accession number", - "Host", - "Genome segment number", - "Subtype", - "Country", - "Year", - "Sequence length", - "Virus name", - "Age", - "Gender", - "Full-length Indicator" }; - hsize_t chunk_size = 10; - int *fill_data = NULL; - int compress = 0; - - /* - * "Initialize field type." + * Map the native types to HDF5 types for each field. */ hid_t field_type[NFIELDS]; @@ -110,7 +93,9 @@ load_influenza_aa_dat (hid_t file_id) H5Tset_size (country_type, 25 ); field_type[4] = country_type; - field_type[5] = H5T_NATIVE_INT; + hid_t year_type = H5Tcopy ( H5T_C_S1 ); + H5Tset_size (year_type, 8); + field_type[5] = year_type; field_type[6] = H5T_NATIVE_INT; @@ -130,19 +115,110 @@ load_influenza_aa_dat (hid_t file_id) H5Tset_size (full_length_indicator_type, 4); field_type[10] = full_length_indicator_type; - supplementary_data p_data[NRECORDS] = { - {"BAC53999", "Human", 7, "", "Zambia", 1999, 109, "Influenza B virus (B/Lusaka/270/99)", - "", "", "yes"} - }; + /* + * Labels used for the fields in the table. + */ + const char *field_names[NFIELDS] = + { "GenBank accession number", + "Host", + "Genome segment number", + "Subtype", + "Country", + "Year", + "Sequence length", + "Virus name", + "Age", + "Gender", + "Full-length Indicator" }; + + /* + * Table storage options. + */ + hsize_t chunk_size = 10; + int *fill_data = NULL; + int compress = 0; + + /* + * Insert the records. 
+ */ + supplementary_data p_data; + FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r"); + char *line = NULL; + size_t len = 0; + int current_line = 0; + + while (getline (&line, &len, dat) != -1) { + + current_line++; + char *running = strdup (line); + char *token; + + /* + * Parse the line, handling the case of empty fields represented + * by sequential delimiters. + */ + strncpy(p_data.genbank_accession_number, strsep (&running, "\t"), + sizeof(p_data.genbank_accession_number)); + + strncpy(p_data.host, strsep (&running, "\t"), + sizeof(p_data.host)); + + token = strsep (&running, "\t"); + if (strcmp (token, "\0") == 0) + p_data.genome_segment_number = 0; + else + p_data.genome_segment_number = atoi(token); + + strncpy(p_data.subtype, strsep (&running, "\t"), + sizeof(p_data.subtype)); + + strncpy(p_data.country, strsep (&running, "\t"), + sizeof(p_data.country)); + + strncpy (p_data.year, strsep (&running, "\t"), + sizeof(p_data.year)); + + token = strsep (&running, "\t"); + if (strcmp (token, "\0") == 0) + p_data.sequence_length = 0; + else + p_data.sequence_length = atoi(token); + + strncpy(p_data.virus_name, strsep (&running, "\t"), + sizeof(p_data.virus_name)); + + strncpy(p_data.age, strsep (&running, "\t"), + sizeof(p_data.age)); + + strncpy(p_data.gender, strsep (&running, "\t"), + sizeof(p_data.gender)); + + strncpy(p_data.full_length_indicator, strsep (&running, "\t"), + sizeof(p_data.full_length_indicator)); + + if (current_line == 1) + H5TBmake_table ("Protein Sequences", file_id, TABLE_NAME,NFIELDS,1, + dst_size,field_names, dst_offset, field_type, + chunk_size, fill_data, compress, &p_data); + else + H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset, + dst_sizes, &p_data); + + if (running) + free (running); + + } + + if (line) + free (line); - herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,NRECORDS, - dst_size,field_names, dst_offset, field_type, - chunk_size, fill_data, 
compress, p_data); + fclose (dat); H5Tclose (genbank_accession_number_type); H5Tclose (host_type); H5Tclose (subtype_type); H5Tclose (country_type); + H5Tclose (year_type); H5Tclose (virus_name_type); H5Tclose (age_type); H5Tclose (gender_type); -- cgit v0.8.3.1-22-g547a