summaryrefslogtreecommitdiffstats
Unidiff
-rw-r--r--src/aggregator.c3
-rw-r--r--src/assign/assign_protein_type.c110
-rw-r--r--src/load/load_influenza_aa_dat.c13
-rw-r--r--src/load/load_influenza_aa_dat.h2
-rw-r--r--src/load/load_influenza_faa.c30
-rw-r--r--src/load/load_influenza_faa.h2
-rw-r--r--src/model/gi_type_data_init.c4
-rw-r--r--src/model/gi_type_data_init.h4
-rw-r--r--src/model/sequence_data_init.c18
-rw-r--r--src/model/sequence_data_init.h6
10 files changed, 99 insertions, 93 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c
index 9a0717b..1df4c8d 100644
--- a/src/assign/assign_protein_type.c
+++ b/src/assign/assign_protein_type.c
@@ -70,7 +70,7 @@ assign_protein_type (hid_t file_id)
70 if (status < 0)70 if (status < 0)
71 check_h5_error (status, __FILE__, __LINE__);71 check_h5_error (status, __FILE__, __LINE__);
7272
73 sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords);73 sequence_data *faa_buf = malloc (sizeof (sequence_data) * faa_nrecords);
7474
75 size_t faa_size;75 size_t faa_size;
76 size_t faa_offset[SEQUENCE_DATA_FIELD_NUM];76 size_t faa_offset[SEQUENCE_DATA_FIELD_NUM];
@@ -86,7 +86,7 @@ assign_protein_type (hid_t file_id)
86 /*86 /*
87 * Allocate memory for the new table.87 * Allocate memory for the new table.
88 */88 */
89 gi_type_data* new_buf = malloc (sizeof (gi_type_data) * faa_nrecords);89 gi_type_data *new_buf = malloc (sizeof (gi_type_data) * faa_nrecords);
90 if (new_buf == NULL)90 if (new_buf == NULL)
91 check_error (__FILE__, __LINE__);91 check_error (__FILE__, __LINE__);
9292
@@ -101,7 +101,7 @@ assign_protein_type (hid_t file_id)
101 hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];101 hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
102 gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);102 gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
103103
104 gi_type_data* old_buf = NULL;104 gi_type_data *old_buf = NULL;
105105
106 /*106 /*
107 * If the table is already present read the values into memory and107 * If the table is already present read the values into memory and
@@ -117,10 +117,11 @@ assign_protein_type (hid_t file_id)
117 if (status < 0)117 if (status < 0)
118 check_h5_error (status, __FILE__, __LINE__);118 check_h5_error (status, __FILE__, __LINE__);
119119
120 printf (" Using gi_type_data cache of %i records.\n", (int)gi_nrecords);120 printf (" Using gi_type_data cache of %i records.\n",
121 121 (int) gi_nrecords);
122 old_buf = malloc (sizeof(gi_type_data) * gi_nrecords);122
123 123 old_buf = malloc (sizeof (gi_type_data) * gi_nrecords);
124
124 status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,125 status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
125 gi_sizes, old_buf);126 gi_sizes, old_buf);
126 if (status < 0)127 if (status < 0)
@@ -129,18 +130,18 @@ assign_protein_type (hid_t file_id)
129 status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords);130 status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords);
130 if (status < 0)131 if (status < 0)
131 check_h5_error (status, __FILE__, __LINE__);132 check_h5_error (status, __FILE__, __LINE__);
132 133
133 }134 }
134135
135 /*136 /*
136 * If the table is not already present create it.137 * If the table is not already present create it.
137 */138 */
138 else139 else
139 { 140 {
140141
141 printf ("Creating gi_type_data.\n");142 printf ("Creating gi_type_data.\n");
142143
143 const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] =144 const char *gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] =
144 GI_TYPE_DATA_FIELD_NAMES;145 GI_TYPE_DATA_FIELD_NAMES;
145146
146 hsize_t chunk_size = 10;147 hsize_t chunk_size = 10;
@@ -152,8 +153,7 @@ assign_protein_type (hid_t file_id)
152 GI_TYPE_DATA_FIELD_NUM, 0,153 GI_TYPE_DATA_FIELD_NUM, 0,
153 gi_size, gi_type_data_field_names,154 gi_size, gi_type_data_field_names,
154 gi_offset, gi_field_type,155 gi_offset, gi_field_type,
155 chunk_size, fill_data, compress,156 chunk_size, fill_data, compress, NULL);
156 NULL);
157 if (status < 0)157 if (status < 0)
158 check_h5_error (status, __FILE__, __LINE__);158 check_h5_error (status, __FILE__, __LINE__);
159159
@@ -169,7 +169,7 @@ assign_protein_type (hid_t file_id)
169 "Allocation of cache failed.");169 "Allocation of cache failed.");
170 ENTRY e, *ep;170 ENTRY e, *ep;
171171
172 for (int i = 0; i < (int)gi_nrecords; i++)172 for (int i = 0; i < (int) gi_nrecords; i++)
173 {173 {
174 char gi_chr[25];174 char gi_chr[25];
175 snprintf (gi_chr, 25, "%i", old_buf[i].gi);175 snprintf (gi_chr, 25, "%i", old_buf[i].gi);
@@ -183,14 +183,14 @@ assign_protein_type (hid_t file_id)
183 /*183 /*
184 * Assign protein types to records for which the field is empty.184 * Assign protein types to records for which the field is empty.
185 */185 */
186 printf ("Records to process: %i\n", (int)faa_nrecords);186 printf ("Records to process: %i\n", (int) faa_nrecords);
187 int written = 0;187 int written = 0;
188 for (int i = 0; i < (int)faa_nrecords; i++)188 for (int i = 0; i < (int) faa_nrecords; i++)
189 {189 {
190 new_buf[i].gi = faa_buf[i].gi;190 new_buf[i].gi = faa_buf[i].gi;
191 strncpy (new_buf[i].type, "", sizeof (new_buf[i].type));191 strncpy (new_buf[i].type, "", sizeof (new_buf[i].type));
192 strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein));192 strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein));
193 193
194 char gi_chr[25];194 char gi_chr[25];
195 snprintf (gi_chr, 25, "%i", faa_buf[i].gi);195 snprintf (gi_chr, 25, "%i", faa_buf[i].gi);
196 e.key = gi_chr;196 e.key = gi_chr;
@@ -199,24 +199,23 @@ assign_protein_type (hid_t file_id)
199 /*199 /*
200 * A record was not found in the cache for this gi.200 * A record was not found in the cache for this gi.
201 */201 */
202 if (hsearch_r (e, FIND, &ep, &htab) == 0) 202 if (hsearch_r (e, FIND, &ep, &htab) == 0)
203 {203 {
204 204
205 /*205 /*
206 * Read the sequence from the database by GI.206 * Read the sequence from the database by GI.
207 */207 */
208 Int4 sequence_number = readdb_gi2seq (seqdb, faa_buf[i].gi, NULL);208 Int4 sequence_number = readdb_gi2seq (seqdb, faa_buf[i].gi, NULL);
209 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);209 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
210 if (bsp == NULL) 210 if (bsp == NULL)
211 {211 {
212 error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,212 error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
213 "Unable to find BLAST record for gi|%i. Ensure "213 "Unable to find BLAST record for gi|%i. Ensure "
214 "the BLAST database is up-to-date with the HDF5 "214 "the BLAST database is up-to-date with the HDF5 "
215 "record set. See the BLAST formatdb.log file "215 "record set. See the BLAST formatdb.log file "
216 "for details.\n",216 "for details.\n", faa_buf[i].gi);
217 faa_buf[i].gi);
218 }217 }
219 218
220 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,219 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
221 "blastp",220 "blastp",
222 REFDB,221 REFDB,
@@ -224,19 +223,19 @@ assign_protein_type (hid_t file_id)
224 NULL,223 NULL,
225 &error_returns,224 &error_returns,
226 NULL);225 NULL);
227 226
228 /*227 /*
229 * BLAST reported an error. Write it out and continue processing.228 * BLAST reported an error. Write it out and continue processing.
230 */229 */
231 if (error_returns != NULL)230 if (error_returns != NULL)
232 {231 {
233 CharPtr msg = BlastErrorToString (error_returns);232 CharPtr msg = BlastErrorToString (error_returns);
234 printf ("Warning: An error has been reported by the NCBI Toolkit "233 printf
235 "API for sequence gi|%i: %s",234 ("Warning: An error has been reported by the NCBI Toolkit "
236 faa_buf[i].gi, msg);235 "API for sequence gi|%i: %s", faa_buf[i].gi, msg);
237 free (msg); 236 free (msg);
238 }237 }
239 238
240 /*239 /*
241 * A hit was found. Record the first hit as the protein type.240 * A hit was found. Record the first hit as the protein type.
242 * Skip the first 4 characters and eat the "lcl|".241 * Skip the first 4 characters and eat the "lcl|".
@@ -245,18 +244,18 @@ assign_protein_type (hid_t file_id)
245 {244 {
246 Char target_id_buf[BUFFER_LEN + 1];245 Char target_id_buf[BUFFER_LEN + 1];
247 SeqIdPtr target_id = SeqAlignId (seqalign, 1);246 SeqIdPtr target_id = SeqAlignId (seqalign, 1);
248 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, 247 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT,
249 BUFFER_LEN);248 BUFFER_LEN);
250249
251 // Species Type250 // Species Type
252 new_buf[i].type[0] = target_id_buf[4];251 new_buf[i].type[0] = target_id_buf[4];
253 new_buf[i].type[1] = '\0';252 new_buf[i].type[1] = '\0';
254 253
255 // Protein Type (Skip the underscore in the string).254 // Protein Type (Skip the underscore in the string).
256 strncpy (new_buf[i].protein, &target_id_buf[6], 255 strncpy (new_buf[i].protein, &target_id_buf[6],
257 sizeof (new_buf[i].protein)); 256 sizeof (new_buf[i].protein));
258 }257 }
259 258
260 /*259 /*
261 * BLAST did not find any hits.260 * BLAST did not find any hits.
262 */261 */
@@ -265,73 +264,74 @@ assign_protein_type (hid_t file_id)
265 printf ("Warning: Unable to identify protein type for sequence "264 printf ("Warning: Unable to identify protein type for sequence "
266 "gi|%i\n", faa_buf[i].gi);265 "gi|%i\n", faa_buf[i].gi);
267 }266 }
268 267
269 /*268 /*
270 * Clean up memory for the next ieration.269 * Clean up memory for the next ieration.
271 */270 */
272 seqalign = SeqAlignSetFree (seqalign);271 seqalign = SeqAlignSetFree (seqalign);
273 bsp = BioseqFree (bsp);272 bsp = BioseqFree (bsp);
274 273
275 } // End existing entry not found.274 } // End existing entry not found.
276275
277 /*276 /*
278 * Hash table entry found. Keep the old value.277 * Hash table entry found. Keep the old value.
279 */278 */
280 else279 else
281 {280 {
282 gi_type_data* old_value = (gi_type_data*)ep->data;281 gi_type_data *old_value = (gi_type_data *) ep->data;
283 new_buf[i].gi = old_value->gi;282 new_buf[i].gi = old_value->gi;
284 strncpy (new_buf[i].type, old_value->type, sizeof (new_buf[i].type));283 strncpy (new_buf[i].type, old_value->type,
285 strncpy (new_buf[i].protein, old_value->protein, sizeof (new_buf[i].protein));284 sizeof (new_buf[i].type));
285 strncpy (new_buf[i].protein, old_value->protein,
286 sizeof (new_buf[i].protein));
286 }287 }
287 288
288 /*289 /*
289 * Write the data out to the file.290 * Write the data out to the file.
290 */291 */
291 if ( (i % 1000 == 0) && (i > 0) )292 if ((i % 1000 == 0) && (i > 0))
292 {293 {
293 status = H5TBappend_records (file_id, "gi_type_data", 1000,294 status = H5TBappend_records (file_id, "gi_type_data", 1000,
294 gi_size, gi_offset, gi_sizes, 295 gi_size, gi_offset, gi_sizes,
295 &new_buf[i-1000]);296 &new_buf[i - 1000]);
296 if (status < 0)297 if (status < 0)
297 check_h5_error (status, __FILE__, __LINE__);298 check_h5_error (status, __FILE__, __LINE__);
298 299
299 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);300 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
300 if (status < 0)301 if (status < 0)
301 check_h5_error (status, __FILE__, __LINE__);302 check_h5_error (status, __FILE__, __LINE__);
302303
303 written = i;304 written = i;
304305
305 printf ("Processed %i of %i records.\n", i, (int)faa_nrecords);306 printf ("Processed %i of %i records.\n", i, (int) faa_nrecords);
306 }307 }
307 308
308 }309 }
309310
310 /*311 /*
311 * Write out records from the last bin if it was less than 1000312 * Write out records from the last bin if it was less than 1000
312 * records in size.313 * records in size.
313 */314 */
314 if ((int)faa_nrecords < 1000) 315 if ((int) faa_nrecords < 1000)
315 {316 {
316 status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords,317 status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords,
317 gi_size, gi_offset, gi_sizes,318 gi_size, gi_offset, gi_sizes, new_buf);
318 new_buf);
319 }319 }
320320
321 else321 else
322 {322 {
323 status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written,323 status =
324 gi_size, gi_offset, gi_sizes, 324 H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written,
325 &new_buf[written]);325 gi_size, gi_offset, gi_sizes, &new_buf[written]);
326 }326 }
327327
328 if (status < 0)328 if (status < 0)
329 check_h5_error (status, __FILE__, __LINE__);329 check_h5_error (status, __FILE__, __LINE__);
330 330
331 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);331 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
332 if (status < 0)332 if (status < 0)
333 check_h5_error (status, __FILE__, __LINE__);333 check_h5_error (status, __FILE__, __LINE__);
334 334
335 free (faa_buf);335 free (faa_buf);
336 free (old_buf);336 free (old_buf);
337 free (new_buf);337 free (new_buf);
@@ -339,6 +339,6 @@ assign_protein_type (hid_t file_id)
339339
340 options = BLASTOptionDelete (options);340 options = BLASTOptionDelete (options);
341 readdb_destruct (seqdb);341 readdb_destruct (seqdb);
342 342
343 return;343 return;
344}344}

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.