-rw-r--r-- | README | 30 | ||||
-rw-r--r-- | doc/Data Deployments.dia | bin | 3566 -> 4057 bytes | |||
-rw-r--r-- | src/aggregator.c | 2 | ||||
-rw-r--r-- | src/load_influenza_aa_dat.c | 154 |
4 files changed, 145 insertions, 41 deletions
@@ -32,4 +32,32 @@ The "doc/Data Deployments.dia" diagram shows the source systems that | |||
32 | expose the various records as well as the transform routines that are | 32 | expose the various records as well as the transform routines that are |
33 | used for aggregation of the data on the local system. | 33 | used for aggregation of the data on the local system. |
34 | 34 | ||
35 | LocalWords: NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia | 35 | BUILDING |
36 | |||
37 | An autogen.sh script is provided to initialize the project directory | ||
38 | with the necessary GNU Autotools configuration. | ||
39 | |||
40 | When building on a Debian system, the mpi.h file is in a subdirectory | ||
41 | of /usr/include and is therefore not found within the default include | ||
42 | path. To account for this, run the following before running | ||
43 | ./configure. | ||
44 | |||
45 | $ export CPPFLAGS=-I/usr/include/mpi | ||
46 | |||
47 | TEST CASES | ||
48 | |||
49 | The "load_influenza_aa_dat" function loads a single tab-delimited text | ||
50 | file into a table structure in the HDF5 file. The HDFView GUI can be | ||
51 | used to open the loaded table and then export it back out as a text | ||
52 | file. The text file can then be compared with the original input to | ||
53 | verify that the load was completed without error. | ||
54 | |||
55 | $ diff --report-identical-files \ | ||
56 | /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat \ | ||
57 | Protein\ Sequences.txt | ||
58 | |||
59 | Files /home/don/exp004/genomes/INFLUENZA/influenza_aa.dat and | ||
60 | Protein Sequences.txt are identical | ||
61 | |||
62 | LocalWords: NCBI parallelization HDF SQL Pellegrino phylogenetic DBMS dia mpi | ||
63 | LocalWords: autogen Autotools CPPFLAGS aa dat HDFView GUI diff txt | ||
diff --git a/doc/Data Deployments.dia b/doc/Data Deployments.dia index b8ad4af..277d53a 100644 --- a/doc/Data Deployments.dia +++ b/doc/Data Deployments.dia | |||
Binary files differ | |||
diff --git a/src/aggregator.c b/src/aggregator.c index ae5aa60..da6db08 100644 --- a/src/aggregator.c +++ b/src/aggregator.c | |||
@@ -24,7 +24,7 @@ main() | |||
24 | /* | 24 | /* |
25 | * Close the HDF5 file. | 25 | * Close the HDF5 file. |
26 | */ | 26 | */ |
27 | herr_t status = H5Fclose (file_id); | 27 | H5Fclose (file_id); |
28 | 28 | ||
29 | return 0; | 29 | return 0; |
30 | } | 30 | } |
diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c index 72aacb5..5af8a72 100644 --- a/src/load_influenza_aa_dat.c +++ b/src/load_influenza_aa_dat.c | |||
@@ -1,10 +1,17 @@ | |||
1 | /* | ||
2 | * Load the influenza_aa.dat tab-delimited text file into an HDF5 | ||
3 | * binary table. | ||
4 | * | ||
5 | * todo: Handle NULL values occurring in numeric fields. | ||
6 | */ | ||
7 | |||
1 | #include "load_influenza_aa_dat.h" | 8 | #include "load_influenza_aa_dat.h" |
2 | #include "hdf5_hl.h" | 9 | #include "hdf5_hl.h" |
10 | #include <string.h> | ||
11 | #include <stdlib.h> | ||
3 | 12 | ||
4 | #define NFIELDS (hsize_t) 11 | 13 | #define NFIELDS (hsize_t) 11 |
5 | //#define NRECORDS (hsize_t) 138052 | 14 | #define TABLE_NAME "Protein Sequences" |
6 | #define NRECORDS (hsize_t) 1 | ||
7 | #define TABLE_NAME "influenza_aa.dat" | ||
8 | 15 | ||
9 | void | 16 | void |
10 | load_influenza_aa_dat (hid_t file_id) | 17 | load_influenza_aa_dat (hid_t file_id) |
@@ -12,14 +19,14 @@ load_influenza_aa_dat (hid_t file_id) | |||
12 | /* | 19 | /* |
13 | * Model the data using native types. | 20 | * Model the data using native types. |
14 | */ | 21 | */ |
15 | typedef struct supplementary_data | 22 | typedef struct |
16 | { | 23 | { |
17 | char genbank_accession_number[9]; | 24 | char genbank_accession_number[9]; |
18 | char host[15]; | 25 | char host[15]; |
19 | int genome_segment_number; | 26 | int genome_segment_number; |
20 | char subtype[7]; | 27 | char subtype[7]; |
21 | char country[25]; | 28 | char country[25]; |
22 | int year; | 29 | char year[8]; |
23 | int sequence_length; | 30 | int sequence_length; |
24 | char virus_name[196]; | 31 | char virus_name[196]; |
25 | char age[17]; | 32 | char age[17]; |
@@ -49,11 +56,7 @@ load_influenza_aa_dat (hid_t file_id) | |||
49 | HOFFSET ( supplementary_data, gender ), | 56 | HOFFSET ( supplementary_data, gender ), |
50 | HOFFSET ( supplementary_data, full_length_indicator )}; | 57 | HOFFSET ( supplementary_data, full_length_indicator )}; |
51 | 58 | ||
52 | /* | 59 | supplementary_data dst_buf[1]; |
53 | |||
54 | Only needed for reading? | ||
55 | |||
56 | supplementary_data dst_buf[NRECORDS]; | ||
57 | 60 | ||
58 | size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ), | 61 | size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ), |
59 | sizeof ( dst_buf[0].host ), | 62 | sizeof ( dst_buf[0].host ), |
@@ -66,29 +69,9 @@ load_influenza_aa_dat (hid_t file_id) | |||
66 | sizeof ( dst_buf[0].age ), | 69 | sizeof ( dst_buf[0].age ), |
67 | sizeof ( dst_buf[0].gender ), | 70 | sizeof ( dst_buf[0].gender ), |
68 | sizeof ( dst_buf[0].full_length_indicator)}; | 71 | sizeof ( dst_buf[0].full_length_indicator)}; |
69 | */ | ||
70 | 72 | ||
71 | /* | 73 | /* |
72 | * "Define field information." | 74 | * Map the native types to HDF5 types for each field. |
73 | */ | ||
74 | const char *field_names[NFIELDS] = | ||
75 | { "GenBank accession number", | ||
76 | "Host", | ||
77 | "Genome segment number", | ||
78 | "Subtype", | ||
79 | "Country", | ||
80 | "Year", | ||
81 | "Sequence length", | ||
82 | "Virus name", | ||
83 | "Age", | ||
84 | "Gender", | ||
85 | "Full-length Indicator" }; | ||
86 | hsize_t chunk_size = 10; | ||
87 | int *fill_data = NULL; | ||
88 | int compress = 0; | ||
89 | |||
90 | /* | ||
91 | * "Initialize field type." | ||
92 | */ | 75 | */ |
93 | hid_t field_type[NFIELDS]; | 76 | hid_t field_type[NFIELDS]; |
94 | 77 | ||
@@ -110,7 +93,9 @@ load_influenza_aa_dat (hid_t file_id) | |||
110 | H5Tset_size (country_type, 25 ); | 93 | H5Tset_size (country_type, 25 ); |
111 | field_type[4] = country_type; | 94 | field_type[4] = country_type; |
112 | 95 | ||
113 | field_type[5] = H5T_NATIVE_INT; | 96 | hid_t year_type = H5Tcopy ( H5T_C_S1 ); |
97 | H5Tset_size (year_type, 8); | ||
98 | field_type[5] = year_type; | ||
114 | 99 | ||
115 | field_type[6] = H5T_NATIVE_INT; | 100 | field_type[6] = H5T_NATIVE_INT; |
116 | 101 | ||
@@ -130,19 +115,110 @@ load_influenza_aa_dat (hid_t file_id) | |||
130 | H5Tset_size (full_length_indicator_type, 4); | 115 | H5Tset_size (full_length_indicator_type, 4); |
131 | field_type[10] = full_length_indicator_type; | 116 | field_type[10] = full_length_indicator_type; |
132 | 117 | ||
133 | supplementary_data p_data[NRECORDS] = { | 118 | /* |
134 | {"BAC53999", "Human", 7, "", "Zambia", 1999, 109, "Influenza B virus (B/Lusaka/270/99)", | 119 | * Labels used for the fields in the table. |
135 | "", "", "yes"} | 120 | */ |
136 | }; | 121 | const char *field_names[NFIELDS] = |
122 | { "GenBank accession number", | ||
123 | "Host", | ||
124 | "Genome segment number", | ||
125 | "Subtype", | ||
126 | "Country", | ||
127 | "Year", | ||
128 | "Sequence length", | ||
129 | "Virus name", | ||
130 | "Age", | ||
131 | "Gender", | ||
132 | "Full-length Indicator" }; | ||
133 | |||
134 | /* | ||
135 | * Table storage options. | ||
136 | */ | ||
137 | hsize_t chunk_size = 10; | ||
138 | int *fill_data = NULL; | ||
139 | int compress = 0; | ||
140 | |||
141 | /* | ||
142 | * Insert the records. | ||
143 | */ | ||
144 | supplementary_data p_data; | ||
145 | FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r"); | ||
146 | char *line = NULL; | ||
147 | size_t len = 0; | ||
148 | int current_line = 0; | ||
149 | |||
150 | while (getline (&line, &len, dat) != -1) { | ||
151 | |||
152 | current_line++; | ||
153 | char *running = strdup (line); | ||
154 | char *token; | ||
155 | |||
156 | /* | ||
157 | * Parse the line, handling the case of empty fields represented | ||
158 | * by sequential delimiters. | ||
159 | */ | ||
160 | strncpy(p_data.genbank_accession_number, strsep (&running, "\t"), | ||
161 | sizeof(p_data.genbank_accession_number)); | ||
162 | |||
163 | strncpy(p_data.host, strsep (&running, "\t"), | ||
164 | sizeof(p_data.host)); | ||
165 | |||
166 | token = strsep (&running, "\t"); | ||
167 | if (strcmp (token, "\0") == 0) | ||
168 | p_data.genome_segment_number = 0; | ||
169 | else | ||
170 | p_data.genome_segment_number = atoi(token); | ||
171 | |||
172 | strncpy(p_data.subtype, strsep (&running, "\t"), | ||
173 | sizeof(p_data.subtype)); | ||
174 | |||
175 | strncpy(p_data.country, strsep (&running, "\t"), | ||
176 | sizeof(p_data.country)); | ||
177 | |||
178 | strncpy (p_data.year, strsep (&running, "\t"), | ||
179 | sizeof(p_data.year)); | ||
180 | |||
181 | token = strsep (&running, "\t"); | ||
182 | if (strcmp (token, "\0") == 0) | ||
183 | p_data.sequence_length = 0; | ||
184 | else | ||
185 | p_data.sequence_length = atoi(token); | ||
186 | |||
187 | strncpy(p_data.virus_name, strsep (&running, "\t"), | ||
188 | sizeof(p_data.virus_name)); | ||
189 | |||
190 | strncpy(p_data.age, strsep (&running, "\t"), | ||
191 | sizeof(p_data.age)); | ||
192 | |||
193 | strncpy(p_data.gender, strsep (&running, "\t"), | ||
194 | sizeof(p_data.gender)); | ||
195 | |||
196 | strncpy(p_data.full_length_indicator, strsep (&running, "\t"), | ||
197 | sizeof(p_data.full_length_indicator)); | ||
198 | |||
199 | if (current_line == 1) | ||
200 | H5TBmake_table ("Protein Sequences", file_id, TABLE_NAME,NFIELDS,1, | ||
201 | dst_size,field_names, dst_offset, field_type, | ||
202 | chunk_size, fill_data, compress, &p_data); | ||
203 | else | ||
204 | H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset, | ||
205 | dst_sizes, &p_data); | ||
206 | |||
207 | if (running) | ||
208 | free (running); | ||
209 | |||
210 | } | ||
211 | |||
212 | if (line) | ||
213 | free (line); | ||
137 | 214 | ||
138 | herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,NRECORDS, | 215 | fclose (dat); |
139 | dst_size,field_names, dst_offset, field_type, | ||
140 | chunk_size, fill_data, compress, p_data); | ||
141 | 216 | ||
142 | H5Tclose (genbank_accession_number_type); | 217 | H5Tclose (genbank_accession_number_type); |
143 | H5Tclose (host_type); | 218 | H5Tclose (host_type); |
144 | H5Tclose (subtype_type); | 219 | H5Tclose (subtype_type); |
145 | H5Tclose (country_type); | 220 | H5Tclose (country_type); |
221 | H5Tclose (year_type); | ||
146 | H5Tclose (virus_name_type); | 222 | H5Tclose (virus_name_type); |
147 | H5Tclose (age_type); | 223 | H5Tclose (age_type); |
148 | H5Tclose (gender_type); | 224 | H5Tclose (gender_type); |