summary refs log tree commit diff stats
author Don Pellegrino <don@drexel.edu> 2010-01-16 01:46:51 (GMT)
committer Don Pellegrino <don@drexel.edu> 2010-01-16 01:46:51 (GMT)
commit 4498865663dc42c25faf72e6ff72675538cdd697 (patch) (unidiff)
tree ccbf407b218f2e1272070eb19f26d6d1b6d744e2
parent 85c4b93261bec06aeaa677caaa31ffdc5ae9a814 (diff)
download exp007-4498865663dc42c25faf72e6ff72675538cdd697.zip
exp007-4498865663dc42c25faf72e6ff72675538cdd697.tar.gz
exp007-4498865663dc42c25faf72e6ff72675538cdd697.tar.bz2
Implemented the loading of the influenza_aa.dat file. Tested by
comparing an export of the data from the HDF5 file and the original file.
-rw-r--r-- README 30
-rw-r--r-- doc/Data Deployments.dia bin 3566 -> 4057 bytes
-rw-r--r-- src/aggregator.c 2
-rw-r--r-- src/load_influenza_aa_dat.c 154
4 files changed, 145 insertions, 41 deletions
diff --git a/README b/README
index 9caedb8..197d289 100644
--- a/README
+++ b/README
@@ -32,4 +32,32 @@ The "doc/Data Deployments.dia" diagram shows the source systems that
32expose the various records as well as the transform routines that are32expose the various records as well as the transform routines that are
33used for aggregation of the data on the local system.33used for aggregation of the data on the local system.
3434
35 LocalWords: NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia35BUILDING
36
37An autogen.sh script is provided to initialize the project directory
38with the necessary GNU Autotools configuration.
39
40When building on a Debian system the mpi.h file is in a subdirectory
41of /usr/include and therefore not found within the default include
42path. To account for this run the following before running
43./configure.
44
45 $ export CPPFLAGS=-I/usr/include/mpi
46
47TEST CASES
48
49The "load_influenza_aa_dat" function loads a single tab delimited text
50file into a table structure in the HDF5 file. The HDFView GUI can be
51used to open the loaded table and then export it back out as a text
52file. The text file can then be compared with the original input to
53verify that the load was completed without error.
54
55 $ diff --report-identical-files \
56 /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat \
57 Protein\ Sequences.txt
58
59 Files /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat and
60 Protein Sequences.txt are identical
61
62 LocalWords: NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia mpi
63 LocalWords: autogen Autotools CPPFLAGS aa dat HDFView GUI diff txt
diff --git a/doc/Data Deployments.dia b/doc/Data Deployments.dia
index b8ad4af..277d53a 100644
--- a/doc/Data Deployments.dia
+++ b/doc/Data Deployments.dia
Binary files differ
diff --git a/src/aggregator.c b/src/aggregator.c
index ae5aa60..da6db08 100644
--- a/src/aggregator.c
+++ b/src/aggregator.c
@@ -24,7 +24,7 @@ main()
24 /*24 /*
25 * Close the HDF5 file.25 * Close the HDF5 file.
26 */26 */
27 herr_t status = H5Fclose (file_id);27 H5Fclose (file_id);
2828
29 return 0;29 return 0;
30}30}
diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c
index 72aacb5..5af8a72 100644
--- a/src/load_influenza_aa_dat.c
+++ b/src/load_influenza_aa_dat.c
@@ -1,10 +1,17 @@
1/*
2 * Load the influenza_aa.dat tab delimited text file into an HDF5
3 * binary table.
4 *
5 * todo: Handle NULL values occurring in numeric fields.
6 */
7
1#include "load_influenza_aa_dat.h"8#include "load_influenza_aa_dat.h"
2#include "hdf5_hl.h"9#include "hdf5_hl.h"
10#include <string.h>
11#include <stdlib.h>
312
4#define NFIELDS (hsize_t) 1113#define NFIELDS (hsize_t) 11
5//#define NRECORDS (hsize_t) 13805214#define TABLE_NAME "Protein Sequences"
6#define NRECORDS (hsize_t) 1
7#define TABLE_NAME "influenza_aa.dat"
815
9void16void
10load_influenza_aa_dat (hid_t file_id)17load_influenza_aa_dat (hid_t file_id)
@@ -12,14 +19,14 @@ load_influenza_aa_dat (hid_t file_id)
12 /*19 /*
13 * Model the data using native types.20 * Model the data using native types.
14 */21 */
15 typedef struct supplementary_data22 typedef struct
16 {23 {
17 char genbank_accession_number[9];24 char genbank_accession_number[9];
18 char host[15];25 char host[15];
19 int genome_segment_number;26 int genome_segment_number;
20 char subtype[7];27 char subtype[7];
21 char country[25];28 char country[25];
22 int year;29 char year[8];
23 int sequence_length;30 int sequence_length;
24 char virus_name[196];31 char virus_name[196];
25 char age[17];32 char age[17];
@@ -49,11 +56,7 @@ load_influenza_aa_dat (hid_t file_id)
49 HOFFSET ( supplementary_data, gender ),56 HOFFSET ( supplementary_data, gender ),
50 HOFFSET ( supplementary_data, full_length_indicator )};57 HOFFSET ( supplementary_data, full_length_indicator )};
5158
52 /*59 supplementary_data dst_buf[1];
53
54 Only needed for reading?
55
56 supplementary_data dst_buf[NRECORDS];
5760
58 size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ),61 size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ),
59 sizeof ( dst_buf[0].host ),62 sizeof ( dst_buf[0].host ),
@@ -66,29 +69,9 @@ load_influenza_aa_dat (hid_t file_id)
66 sizeof ( dst_buf[0].age ),69 sizeof ( dst_buf[0].age ),
67 sizeof ( dst_buf[0].gender ),70 sizeof ( dst_buf[0].gender ),
68 sizeof ( dst_buf[0].full_length_indicator)};71 sizeof ( dst_buf[0].full_length_indicator)};
69 */
7072
71 /*73 /*
72 * "Define field information."74 * Map the native types to HDF5 types for each field.
73 */
74 const char *field_names[NFIELDS] =
75 { "GenBank accession number",
76 "Host",
77 "Genome segment number",
78 "Subtype",
79 "Country",
80 "Year",
81 "Sequence length",
82 "Virus name",
83 "Age",
84 "Gender",
85 "Full-length Indicator" };
86 hsize_t chunk_size = 10;
87 int *fill_data = NULL;
88 int compress = 0;
89
90 /*
91 * "Initialize field type."
92 */75 */
93 hid_t field_type[NFIELDS];76 hid_t field_type[NFIELDS];
9477
@@ -110,7 +93,9 @@ load_influenza_aa_dat (hid_t file_id)
110 H5Tset_size (country_type, 25 );93 H5Tset_size (country_type, 25 );
111 field_type[4] = country_type;94 field_type[4] = country_type;
11295
113 field_type[5] = H5T_NATIVE_INT; 96 hid_t year_type = H5Tcopy ( H5T_C_S1 );
97 H5Tset_size (year_type, 8);
98 field_type[5] = year_type;
11499
115 field_type[6] = H5T_NATIVE_INT;100 field_type[6] = H5T_NATIVE_INT;
116101
@@ -130,19 +115,110 @@ load_influenza_aa_dat (hid_t file_id)
130 H5Tset_size (full_length_indicator_type, 4);115 H5Tset_size (full_length_indicator_type, 4);
131 field_type[10] = full_length_indicator_type;116 field_type[10] = full_length_indicator_type;
132117
133 supplementary_data p_data[NRECORDS] = {118 /*
134 {"BAC53999", "Human", 7, "", "Zambia", 1999, 109, "Influenza B virus (B/Lusaka/270/99)",119 * Labels used for the fields in the table.
135 "", "", "yes"}120 */
136 };121 const char *field_names[NFIELDS] =
122 { "GenBank accession number",
123 "Host",
124 "Genome segment number",
125 "Subtype",
126 "Country",
127 "Year",
128 "Sequence length",
129 "Virus name",
130 "Age",
131 "Gender",
132 "Full-length Indicator" };
133
134 /*
135 * Table storage options.
136 */
137 hsize_t chunk_size = 10;
138 int *fill_data = NULL;
139 int compress = 0;
140
141 /*
142 * Insert the records.
143 */
144 supplementary_data p_data;
145 FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r");
146 char *line = NULL;
147 size_t len = 0;
148 int current_line = 0;
149
150 while (getline (&line, &len, dat) != -1) {
151
152 current_line++;
153 char *running = strdup (line);
154 char *token;
155
156 /*
157 * Parse the line, handling the case of empty fields represented
158 * by sequential delimiters.
159 */
160 strncpy(p_data.genbank_accession_number, strsep (&running, "\t"),
161 sizeof(p_data.genbank_accession_number));
162
163 strncpy(p_data.host, strsep (&running, "\t"),
164 sizeof(p_data.host));
165
166 token = strsep (&running, "\t");
167 if (strcmp (token, "\0") == 0)
168 p_data.genome_segment_number = 0;
169 else
170 p_data.genome_segment_number = atoi(token);
171
172 strncpy(p_data.subtype, strsep (&running, "\t"),
173 sizeof(p_data.subtype));
174
175 strncpy(p_data.country, strsep (&running, "\t"),
176 sizeof(p_data.country));
177
178 strncpy (p_data.year, strsep (&running, "\t"),
179 sizeof(p_data.year));
180
181 token = strsep (&running, "\t");
182 if (strcmp (token, "\0") == 0)
183 p_data.sequence_length = 0;
184 else
185 p_data.sequence_length = atoi(token);
186
187 strncpy(p_data.virus_name, strsep (&running, "\t"),
188 sizeof(p_data.virus_name));
189
190 strncpy(p_data.age, strsep (&running, "\t"),
191 sizeof(p_data.age));
192
193 strncpy(p_data.gender, strsep (&running, "\t"),
194 sizeof(p_data.gender));
195
196 strncpy(p_data.full_length_indicator, strsep (&running, "\t"),
197 sizeof(p_data.full_length_indicator));
198
199 if (current_line == 1)
200 H5TBmake_table ("Protein Sequences", file_id, TABLE_NAME,NFIELDS,1,
201 dst_size,field_names, dst_offset, field_type,
202 chunk_size, fill_data, compress, &p_data);
203 else
204 H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset,
205 dst_sizes, &p_data);
206
207 if (running)
208 free (running);
209
210 }
211
212 if (line)
213 free (line);
137214
138 herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,NRECORDS,215 fclose (dat);
139 dst_size,field_names, dst_offset, field_type,
140 chunk_size, fill_data, compress, p_data);
141216
142 H5Tclose (genbank_accession_number_type);217 H5Tclose (genbank_accession_number_type);
143 H5Tclose (host_type);218 H5Tclose (host_type);
144 H5Tclose (subtype_type);219 H5Tclose (subtype_type);
145 H5Tclose (country_type);220 H5Tclose (country_type);
221 H5Tclose (year_type);
146 H5Tclose (virus_name_type);222 H5Tclose (virus_name_type);
147 H5Tclose (age_type);223 H5Tclose (age_type);
148 H5Tclose (gender_type);224 H5Tclose (gender_type);

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.