summaryrefslogtreecommitdiffstats
Unidiff
-rw-r--r--analysis/year.R20
-rw-r--r--src/Makefile.am39
-rw-r--r--src/aggregator.c6
-rw-r--r--src/assign/assign_protein_type.c (renamed from src/assign_protein_type.c)87
-rw-r--r--src/assign/assign_protein_type.h (renamed from src/assign_protein_type.h)0
-rw-r--r--src/error/check_error.c (renamed from src/check_error.c)0
-rw-r--r--src/error/check_error.h (renamed from src/check_error.h)0
-rw-r--r--src/error/check_h5_error.c (renamed from src/check_h5_error.c)0
-rw-r--r--src/error/check_h5_error.h (renamed from src/check_h5_error.h)0
-rw-r--r--src/error/check_ncbi_error.c (renamed from src/check_ncbi_error.c)0
-rw-r--r--src/error/check_ncbi_error.h (renamed from src/check_ncbi_error.h)0
-rw-r--r--src/load/load_influenza_aa_dat.c (renamed from src/load_influenza_aa_dat.c)4
-rw-r--r--src/load/load_influenza_aa_dat.h (renamed from src/load_influenza_aa_dat.h)0
-rw-r--r--src/load/load_influenza_faa.c (renamed from src/load_influenza_faa.c)10
-rw-r--r--src/load/load_influenza_faa.h (renamed from src/load_influenza_faa.h)0
-rw-r--r--src/model/gi_type_data.h21
-rw-r--r--src/model/gi_type_data_init.c36
-rw-r--r--src/model/gi_type_data_init.h14
-rw-r--r--src/model/sequence_data.h (renamed from src/sequence_data.h)5
-rw-r--r--src/model/sequence_data_init.c (renamed from src/sequence_data_init.c)6
-rw-r--r--src/model/sequence_data_init.h (renamed from src/sequence_data_init.h)0
-rw-r--r--src/updator.c4
22 files changed, 181 insertions, 71 deletions
diff --git a/src/load/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c
new file mode 100644
index 0000000..8bf47aa
--- a/dev/null
+++ b/src/load/load_influenza_aa_dat.c
@@ -0,0 +1,254 @@
1/*
2 * Load the influenza_aa.dat tab-delimited text file into an HDF5
3 * binary table.
4 *
5 * todo: Handle NULL values occurring in numeric fields.
6 */
7
8#include "load_influenza_aa_dat.h"
9#include "error/check_error.h"
10#include "error/check_h5_error.h"
11#include <hdf5_hl.h>
12#include <string.h>
13#include <stdlib.h>
14
/*
 * Number of fields (columns) in each table record.  The expansion is
 * fully parenthesized so the cast binds as a single unit wherever the
 * macro is used (for example as the operand of sizeof).
 */
#define NFIELDS ((hsize_t) 11)

/* Name of the HDF5 dataset that holds the table. */
#define TABLE_NAME "influenza_aa.dat"
17
18void
19load_influenza_aa_dat (hid_t file_id)
20{
21 /*
22 * Model the data using native types.
23 */
24 typedef struct
25 {
26 char genbank_accession_number[9];
27 char host[15];
28 int genome_segment_number;
29 char subtype[7];
30 char country[25];
31 int year;
32 int sequence_length;
33 char virus_name[196];
34 char age[17];
35 char gender[6];
36 char full_length_indicator[4];
37 } supplementary_data;
38
39 /*
40 * Use an HDF5 Table for storage.
41 * http://www.hdfgroup.org/HDF5/Tutor/h5table.html
42 */
43
44 /*
45 * "Calculate the size and the offsets of our struct members in
46 * memory."
47 */
48 size_t dst_size = sizeof (supplementary_data);
49 size_t dst_offset[NFIELDS] =
50 { HOFFSET (supplementary_data, genbank_accession_number),
51 HOFFSET (supplementary_data, host),
52 HOFFSET (supplementary_data, genome_segment_number),
53 HOFFSET (supplementary_data, subtype),
54 HOFFSET (supplementary_data, country),
55 HOFFSET (supplementary_data, year),
56 HOFFSET (supplementary_data, sequence_length),
57 HOFFSET (supplementary_data, virus_name),
58 HOFFSET (supplementary_data, age),
59 HOFFSET (supplementary_data, gender),
60 HOFFSET (supplementary_data, full_length_indicator)
61 };
62
63 supplementary_data dst_buf[1];
64
65 size_t dst_sizes[NFIELDS] = { sizeof (dst_buf[0].genbank_accession_number),
66 sizeof (dst_buf[0].host),
67 sizeof (dst_buf[0].genome_segment_number),
68 sizeof (dst_buf[0].subtype),
69 sizeof (dst_buf[0].country),
70 sizeof (dst_buf[0].year),
71 sizeof (dst_buf[0].sequence_length),
72 sizeof (dst_buf[0].virus_name),
73 sizeof (dst_buf[0].age),
74 sizeof (dst_buf[0].gender),
75 sizeof (dst_buf[0].full_length_indicator)
76 };
77
78 /*
79 * Map the native types to HDF5 types for each field.
80 */
81 hid_t field_type[NFIELDS];
82
83 hid_t genbank_accession_number_type = H5Tcopy (H5T_C_S1);
84 H5Tset_size (genbank_accession_number_type, 9);
85 field_type[0] = genbank_accession_number_type;
86
87 hid_t host_type = H5Tcopy (H5T_C_S1);
88 H5Tset_size (host_type, 15);
89 field_type[1] = host_type;
90
91 field_type[2] = H5T_NATIVE_INT;
92
93 hid_t subtype_type = H5Tcopy (H5T_C_S1);
94 H5Tset_size (subtype_type, 7);
95 field_type[3] = subtype_type;
96
97 hid_t country_type = H5Tcopy (H5T_C_S1);
98 H5Tset_size (country_type, 25);
99 field_type[4] = country_type;
100
101 field_type[5] = H5T_NATIVE_INT;
102
103 field_type[6] = H5T_NATIVE_INT;
104
105 hid_t virus_name_type = H5Tcopy (H5T_C_S1);
106 H5Tset_size (virus_name_type, 196);
107 field_type[7] = virus_name_type;
108
109 hid_t age_type = H5Tcopy (H5T_C_S1);
110 H5Tset_size (age_type, 17);
111 field_type[8] = age_type;
112
113 hid_t gender_type = H5Tcopy (H5T_C_S1);
114 H5Tset_size (gender_type, 6);
115 field_type[9] = gender_type;
116
117 hid_t full_length_indicator_type = H5Tcopy (H5T_C_S1);
118 H5Tset_size (full_length_indicator_type, 4);
119 field_type[10] = full_length_indicator_type;
120
121 /*
122 * Labels used for the fields in the table.
123 */
124 const char *field_names[NFIELDS] = { "GenBank accession number",
125 "Host",
126 "Genome segment number",
127 "Subtype",
128 "Country",
129 "Year",
130 "Sequence length",
131 "Virus name",
132 "Age",
133 "Gender",
134 "Full-length Indicator"
135 };
136
137 /*
138 * Table storage options.
139 */
140 hsize_t chunk_size = 10;
141 int *fill_data = NULL;
142 int compress = 0;
143
144 /*
145 * Insert the records.
146 */
147 supplementary_data p_data;
148 FILE *dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat",
149 "r");
150 if (dat == NULL)
151 check_error (__FILE__, __LINE__);
152 char *line = NULL;
153 size_t len = 0;
154 int current_line = 0;
155
156 while (getline (&line, &len, dat) != -1)
157 {
158
159 current_line++;
160 char *running = strdup (line);
161 char *token = NULL;
162
163 /*
164 * Parse the line, handling the case of empty fields represented
165 * by sequential delimiters.
166 */
167 strncpy (p_data.genbank_accession_number, strsep (&running, "\t"),
168 sizeof (p_data.genbank_accession_number));
169
170 strncpy (p_data.host, strsep (&running, "\t"), sizeof (p_data.host));
171
172 token = strsep (&running, "\t");
173 if (strcmp (token, "\0") == 0)
174 p_data.genome_segment_number = 0;
175 else
176 p_data.genome_segment_number = atoi (token);
177
178 strncpy (p_data.subtype, strsep (&running, "\t"),
179 sizeof (p_data.subtype));
180
181 strncpy (p_data.country, strsep (&running, "\t"),
182 sizeof (p_data.country));
183
184 /*
185 * Convert the year field from text to numeric. Unknown and empty
186 * values are assigned a numeric value of zero.
187 */
188 token = strsep (&running, "\t");
189 if (strcmp (token, "\0") == 0)
190 p_data.year = 0;
191 else if (strcmp (token, "unknown") == 0)
192 p_data.year = 0;
193 else if (strcmp (token, "NON") == 0)
194 p_data.year = 0;
195 else
196 p_data.year = atoi (token);
197
198 token = strsep (&running, "\t");
199 if (strcmp (token, "\0") == 0)
200 p_data.sequence_length = 0;
201 else
202 p_data.sequence_length = atoi (token);
203
204 strncpy (p_data.virus_name, strsep (&running, "\t"),
205 sizeof (p_data.virus_name));
206
207 strncpy (p_data.age, strsep (&running, "\t"), sizeof (p_data.age));
208
209 strncpy (p_data.gender, strsep (&running, "\t"),
210 sizeof (p_data.gender));
211
212 strncpy (p_data.full_length_indicator, strsep (&running, "\t"),
213 sizeof (p_data.full_length_indicator));
214
215 if (current_line == 1)
216 {
217 herr_t status = H5TBmake_table ("influenza_aa.dat", file_id,
218 TABLE_NAME, NFIELDS, 1, dst_size,
219 field_names, dst_offset, field_type,
220 chunk_size, fill_data, compress,
221 &p_data);
222 if (status < 0)
223 check_h5_error (status, __FILE__, __LINE__);
224 }
225 else
226 {
227 herr_t status =
228 H5TBappend_records (file_id, TABLE_NAME, 1, dst_size,
229 dst_offset, dst_sizes, &p_data);
230 if (status < 0)
231 check_h5_error (status, __FILE__, __LINE__);
232 }
233
234 if (running)
235 free (running);
236
237 }
238
239 if (line)
240 free (line);
241
242 fclose (dat);
243
244 H5Tclose (genbank_accession_number_type);
245 H5Tclose (host_type);
246 H5Tclose (subtype_type);
247 H5Tclose (country_type);
248 H5Tclose (virus_name_type);
249 H5Tclose (age_type);
250 H5Tclose (gender_type);
251 H5Tclose (full_length_indicator_type);
252
253 return;
254}

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.