summaryrefslogtreecommitdiffstats
Unidiff
-rw-r--r--README30
-rw-r--r--doc/Data Deployments.diabin3566 -> 4057 bytes
-rw-r--r--src/aggregator.c2
-rw-r--r--src/load_influenza_aa_dat.c154
4 files changed, 145 insertions, 41 deletions
diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c
index 72aacb5..5af8a72 100644
--- a/src/load_influenza_aa_dat.c
+++ b/src/load_influenza_aa_dat.c
@@ -1,10 +1,17 @@
1/*
2 * Load the influenza_aa.dat tab-delimited text file into an HDF5
3 * binary table.
4 *
5 * todo: Handle NULL values occurring in numeric fields.
6 */
7
1#include "load_influenza_aa_dat.h"8#include "load_influenza_aa_dat.h"
2#include "hdf5_hl.h"9#include "hdf5_hl.h"
10#include <string.h>
11#include <stdlib.h>
312
4#define NFIELDS (hsize_t) 1113#define NFIELDS (hsize_t) 11
5//#define NRECORDS (hsize_t) 13805214#define TABLE_NAME "Protein Sequences"
6#define NRECORDS (hsize_t) 1
7#define TABLE_NAME "influenza_aa.dat"
815
9void16void
10load_influenza_aa_dat (hid_t file_id)17load_influenza_aa_dat (hid_t file_id)
@@ -12,14 +19,14 @@ load_influenza_aa_dat (hid_t file_id)
12 /*19 /*
13 * Model the data using native types.20 * Model the data using native types.
14 */21 */
15 typedef struct supplementary_data22 typedef struct
16 {23 {
17 char genbank_accession_number[9];24 char genbank_accession_number[9];
18 char host[15];25 char host[15];
19 int genome_segment_number;26 int genome_segment_number;
20 char subtype[7];27 char subtype[7];
21 char country[25];28 char country[25];
22 int year;29 char year[8];
23 int sequence_length;30 int sequence_length;
24 char virus_name[196];31 char virus_name[196];
25 char age[17];32 char age[17];
@@ -49,11 +56,7 @@ load_influenza_aa_dat (hid_t file_id)
49 HOFFSET ( supplementary_data, gender ),56 HOFFSET ( supplementary_data, gender ),
50 HOFFSET ( supplementary_data, full_length_indicator )};57 HOFFSET ( supplementary_data, full_length_indicator )};
5158
52 /*59 supplementary_data dst_buf[1];
53
54 Only needed for reading?
55
56 supplementary_data dst_buf[NRECORDS];
5760
58 size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ),61 size_t dst_sizes[NFIELDS] = { sizeof ( dst_buf[0].genbank_accession_number ),
59 sizeof ( dst_buf[0].host ),62 sizeof ( dst_buf[0].host ),
@@ -66,29 +69,9 @@ load_influenza_aa_dat (hid_t file_id)
66 sizeof ( dst_buf[0].age ),69 sizeof ( dst_buf[0].age ),
67 sizeof ( dst_buf[0].gender ),70 sizeof ( dst_buf[0].gender ),
68 sizeof ( dst_buf[0].full_length_indicator)};71 sizeof ( dst_buf[0].full_length_indicator)};
69 */
7072
71 /*73 /*
72 * "Define field information."74 * Map the native types to HDF5 types for each field.
73 */
74 const char *field_names[NFIELDS] =
75 { "GenBank accession number",
76 "Host",
77 "Genome segment number",
78 "Subtype",
79 "Country",
80 "Year",
81 "Sequence length",
82 "Virus name",
83 "Age",
84 "Gender",
85 "Full-length Indicator" };
86 hsize_t chunk_size = 10;
87 int *fill_data = NULL;
88 int compress = 0;
89
90 /*
91 * "Initialize field type."
92 */75 */
93 hid_t field_type[NFIELDS];76 hid_t field_type[NFIELDS];
9477
@@ -110,7 +93,9 @@ load_influenza_aa_dat (hid_t file_id)
110 H5Tset_size (country_type, 25 );93 H5Tset_size (country_type, 25 );
111 field_type[4] = country_type;94 field_type[4] = country_type;
11295
113 field_type[5] = H5T_NATIVE_INT; 96 hid_t year_type = H5Tcopy ( H5T_C_S1 );
97 H5Tset_size (year_type, 8);
98 field_type[5] = year_type;
11499
115 field_type[6] = H5T_NATIVE_INT;100 field_type[6] = H5T_NATIVE_INT;
116101
@@ -130,19 +115,110 @@ load_influenza_aa_dat (hid_t file_id)
130 H5Tset_size (full_length_indicator_type, 4);115 H5Tset_size (full_length_indicator_type, 4);
131 field_type[10] = full_length_indicator_type;116 field_type[10] = full_length_indicator_type;
132117
133 supplementary_data p_data[NRECORDS] = {118 /*
134 {"BAC53999", "Human", 7, "", "Zambia", 1999, 109, "Influenza B virus (B/Lusaka/270/99)",119 * Labels used for the fields in the table.
135 "", "", "yes"}120 */
136 };121 const char *field_names[NFIELDS] =
122 { "GenBank accession number",
123 "Host",
124 "Genome segment number",
125 "Subtype",
126 "Country",
127 "Year",
128 "Sequence length",
129 "Virus name",
130 "Age",
131 "Gender",
132 "Full-length Indicator" };
133
134 /*
135 * Table storage options.
136 */
137 hsize_t chunk_size = 10;
138 int *fill_data = NULL;
139 int compress = 0;
140
141 /*
142 * Insert the records.
143 */
144 supplementary_data p_data;
145 FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r");
146 char *line = NULL;
147 size_t len = 0;
148 int current_line = 0;
149
150 while (getline (&line, &len, dat) != -1) {
151
152 current_line++;
153 char *running = strdup (line);
154 char *token;
155
156 /*
157 * Parse the line, handling the case of empty fields represented
158 * by sequential delimiters.
159 */
160 strncpy(p_data.genbank_accession_number, strsep (&running, "\t"),
161 sizeof(p_data.genbank_accession_number));
162
163 strncpy(p_data.host, strsep (&running, "\t"),
164 sizeof(p_data.host));
165
166 token = strsep (&running, "\t");
167 if (strcmp (token, "\0") == 0)
168 p_data.genome_segment_number = 0;
169 else
170 p_data.genome_segment_number = atoi(token);
171
172 strncpy(p_data.subtype, strsep (&running, "\t"),
173 sizeof(p_data.subtype));
174
175 strncpy(p_data.country, strsep (&running, "\t"),
176 sizeof(p_data.country));
177
178 strncpy (p_data.year, strsep (&running, "\t"),
179 sizeof(p_data.year));
180
181 token = strsep (&running, "\t");
182 if (strcmp (token, "\0") == 0)
183 p_data.sequence_length = 0;
184 else
185 p_data.sequence_length = atoi(token);
186
187 strncpy(p_data.virus_name, strsep (&running, "\t"),
188 sizeof(p_data.virus_name));
189
190 strncpy(p_data.age, strsep (&running, "\t"),
191 sizeof(p_data.age));
192
193 strncpy(p_data.gender, strsep (&running, "\t"),
194 sizeof(p_data.gender));
195
196 strncpy(p_data.full_length_indicator, strsep (&running, "\t"),
197 sizeof(p_data.full_length_indicator));
198
199 if (current_line == 1)
200 H5TBmake_table ("Protein Sequences", file_id, TABLE_NAME,NFIELDS,1,
201 dst_size,field_names, dst_offset, field_type,
202 chunk_size, fill_data, compress, &p_data);
203 else
204 H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset,
205 dst_sizes, &p_data);
206
207 if (running)
208 free (running);
209
210 }
211
212 if (line)
213 free (line);
137214
138 herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,NRECORDS,215 fclose (dat);
139 dst_size,field_names, dst_offset, field_type,
140 chunk_size, fill_data, compress, p_data);
141216
142 H5Tclose (genbank_accession_number_type);217 H5Tclose (genbank_accession_number_type);
143 H5Tclose (host_type);218 H5Tclose (host_type);
144 H5Tclose (subtype_type);219 H5Tclose (subtype_type);
145 H5Tclose (country_type);220 H5Tclose (country_type);
221 H5Tclose (year_type);
146 H5Tclose (virus_name_type);222 H5Tclose (virus_name_type);
147 H5Tclose (age_type);223 H5Tclose (age_type);
148 H5Tclose (gender_type);224 H5Tclose (gender_type);

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.