summary | refs | log | tree | commit | diff | stats
Unidiff
-rw-r--r-- src/aggregator.c 34
-rw-r--r-- src/assign/assign_protein_type.c 141
-rw-r--r-- src/load/load_influenza_aa_dat.c 53
-rw-r--r-- src/load/load_influenza_aa_dat.h 2
-rw-r--r-- src/load/load_influenza_faa.c 53
-rw-r--r-- src/load/load_influenza_faa.h 2
-rw-r--r-- src/updator.c 2
7 files changed, 201 insertions, 86 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c
index 73685bb..3947800 100644
--- a/src/assign/assign_protein_type.c
+++ b/src/assign/assign_protein_type.c
@@ -1,5 +1,6 @@
1#define _GNU_SOURCE1#define _GNU_SOURCE
2#include "assign_protein_type.h"2#include "assign_protein_type.h"
3#include "error/check_error.h"
3#include "error/check_h5_error.h"4#include "error/check_h5_error.h"
4#include "error/check_ncbi_error.h"5#include "error/check_ncbi_error.h"
5#include "model/gi_type_data.h"6#include "model/gi_type_data.h"
@@ -84,6 +85,13 @@ assign_protein_type (hid_t file_id)
84 check_h5_error (status, __FILE__, __LINE__);85 check_h5_error (status, __FILE__, __LINE__);
8586
86 /*87 /*
88 * Allocate memory for the new table.
89 */
90 gi_type_data* new_buf = malloc (sizeof (gi_type_data) * faa_nrecords);
91 if (new_buf == NULL)
92 check_error (__FILE__, __LINE__);
93
94 /*
87 * Read the data from HDF5 gi_type_data.95 * Read the data from HDF5 gi_type_data.
88 */96 */
89 hsize_t gi_nfields = 0;97 hsize_t gi_nfields = 0;
@@ -94,8 +102,12 @@ assign_protein_type (hid_t file_id)
94 hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];102 hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
95 gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);103 gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
96104
97 gi_type_data* gi_buf = NULL;105 gi_type_data* old_buf = NULL;
98106
107 /*
108 * If the table is already present read the values into memory and
109 * then clear the table.
110 */
99 if (H5LTfind_dataset (file_id, "gi_type_data") == 1)111 if (H5LTfind_dataset (file_id, "gi_type_data") == 1)
100 {112 {
101113
@@ -105,22 +117,30 @@ assign_protein_type (hid_t file_id)
105 &gi_nrecords);117 &gi_nrecords);
106 if (status < 0)118 if (status < 0)
107 check_h5_error (status, __FILE__, __LINE__);119 check_h5_error (status, __FILE__, __LINE__);
120
121 printf (" Using gi_type_data cache of %i records.\n", (int)gi_nrecords);
108 122
109 gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);123 old_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
110 124
111 status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,125 status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
112 gi_sizes, gi_buf);126 gi_sizes, old_buf);
127 if (status < 0)
128 check_h5_error (status, __FILE__, __LINE__);
129
130 status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords);
113 if (status < 0)131 if (status < 0)
114 check_h5_error (status, __FILE__, __LINE__);132 check_h5_error (status, __FILE__, __LINE__);
115 133
116 }134 }
135
136 /*
137 * If the table is not already present create it.
138 */
117 else139 else
118 { 140 {
119141
120 printf ("Creating gi_type_data.\n");142 printf ("Creating gi_type_data.\n");
121143
122 gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords);
123
124 const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] =144 const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] =
125 GI_TYPE_DATA_FIELD_NAMES;145 GI_TYPE_DATA_FIELD_NAMES;
126146
@@ -130,7 +150,7 @@ assign_protein_type (hid_t file_id)
130150
131 status = H5TBmake_table ("gi_type_data", file_id,151 status = H5TBmake_table ("gi_type_data", file_id,
132 "gi_type_data",152 "gi_type_data",
133 GI_TYPE_DATA_FIELD_NUM, faa_nrecords,153 GI_TYPE_DATA_FIELD_NUM, 0,
134 gi_size, gi_type_data_field_names,154 gi_size, gi_type_data_field_names,
135 gi_offset, gi_field_type,155 gi_offset, gi_field_type,
136 chunk_size, fill_data, compress,156 chunk_size, fill_data, compress,
@@ -140,17 +160,22 @@ assign_protein_type (hid_t file_id)
140160
141 }161 }
142162
163 /*
164 * Copy the contents of the old table into a hash.
165 */
143 struct hsearch_data htab;166 struct hsearch_data htab;
144 bzero (&htab, sizeof (htab));167 bzero (&htab, sizeof (htab));
145 hcreate_r (gi_nrecords * 2, &htab);168 if (hcreate_r (gi_nrecords * 2, &htab) == 0)
169 error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
170 "Allocation of cache failed.");
146 ENTRY e, *ep;171 ENTRY e, *ep;
147 172
148 for (int i = 0; i < gi_nrecords; i++)173 for (int i = 0; i < (int)gi_nrecords; i++)
149 {174 {
150 char gi_chr[25];175 char gi_chr[25];
151 snprintf (gi_chr, 25, "%i", gi_buf[i].gi);176 snprintf (gi_chr, 25, "%i", old_buf[i].gi);
152 e.key = gi_chr;177 e.key = strdup (gi_chr);
153 e.data = &gi_buf[i];178 e.data = &old_buf[i];
154 if (hsearch_r (e, ENTER, &ep, &htab) == 0)179 if (hsearch_r (e, ENTER, &ep, &htab) == 0)
155 error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,180 error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
156 "Allocation failed.");181 "Allocation failed.");
@@ -160,19 +185,23 @@ assign_protein_type (hid_t file_id)
160 * Assign protein types to records for which the field is empty.185 * Assign protein types to records for which the field is empty.
161 */186 */
162 printf ("Records to process: %i\n", (int)faa_nrecords);187 printf ("Records to process: %i\n", (int)faa_nrecords);
163 bool updates_pending = false;188 int written = 0;
164 for (int i = 0; i < faa_nrecords; i++)189 for (int i = 0; i < (int)faa_nrecords; i++)
165 {190 {
191 new_buf[i].gi = faa_buf[i].gi;
192 strncpy (new_buf[i].type, "", sizeof (new_buf[i].type));
193 strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein));
166 194
167 char gi_chr[25];195 char gi_chr[25];
168 snprintf (gi_chr, 25, "%i", faa_buf[i].gi);196 snprintf (gi_chr, 25, "%i", faa_buf[i].gi);
169 e.key = gi_chr;197 e.key = gi_chr;
198 e.data = NULL;
199
200 /*
201 * A record was not found in the cache for this gi.
202 */
170 if (hsearch_r (e, FIND, &ep, &htab) == 0) 203 if (hsearch_r (e, FIND, &ep, &htab) == 0)
171 {204 {
172
173 gi_buf[i].gi = faa_buf[i].gi;
174 gi_buf[i].type[0] = '\0';
175 gi_buf[i].protein[0] = '\0';
176 205
177 /*206 /*
178 * Read the sequence from the database by GI.207 * Read the sequence from the database by GI.
@@ -202,7 +231,7 @@ assign_protein_type (hid_t file_id)
202 */231 */
203 if (error_returns != NULL)232 if (error_returns != NULL)
204 {233 {
205 char *msg = BlastErrorToString (error_returns);234 CharPtr msg = BlastErrorToString (error_returns);
206 printf ("Warning: An error has been reported by the NCBI Toolkit "235 printf ("Warning: An error has been reported by the NCBI Toolkit "
207 "API for sequence gi|%i: %s",236 "API for sequence gi|%i: %s",
208 faa_buf[i].gi, msg);237 faa_buf[i].gi, msg);
@@ -221,14 +250,12 @@ assign_protein_type (hid_t file_id)
221 BUFFER_LEN);250 BUFFER_LEN);
222251
223 // Species Type252 // Species Type
224 gi_buf[i].type[0] = target_id_buf[4];253 new_buf[i].type[0] = target_id_buf[4];
225 gi_buf[i].type[1] = '\0';254 new_buf[i].type[1] = '\0';
226 255
227 // Protein Type256 // Protein Type
228 strncpy (gi_buf[i].protein, &target_id_buf[6], 257 strncpy (new_buf[i].protein, &target_id_buf[6],
229 sizeof (gi_buf[i].protein));258 sizeof (new_buf[i].protein));
230
231 updates_pending = true;
232 }259 }
233 260
234 /*261 /*
@@ -246,16 +273,27 @@ assign_protein_type (hid_t file_id)
246 seqalign = SeqAlignSetFree (seqalign);273 seqalign = SeqAlignSetFree (seqalign);
247 bsp = BioseqFree (bsp);274 bsp = BioseqFree (bsp);
248 275
276 } // End existing entry not found.
277
278 /*
279 * Hash table entry found. Keep the old value.
280 */
281 else
282 {
283 gi_type_data* old_value = (gi_type_data*)ep->data;
284 new_buf[i].gi = old_value->gi;
285 strncpy (new_buf[i].type, old_value->type, sizeof (new_buf[i].type));
286 strncpy (new_buf[i].protein, old_value->protein, sizeof (new_buf[i].protein));
249 }287 }
250 288
251 /*289 /*
252 * Write the data out to the file.290 * Write the data out to the file.
253 */291 */
254 if ( (i % 1000 == 0) && (i > 0) && updates_pending)292 if ( (i % 1000 == 0) && (i > 0) )
255 {293 {
256 status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000,294 status = H5TBappend_records (file_id, "gi_type_data", 1000,
257 gi_size, gi_offset, gi_sizes, 295 gi_size, gi_offset, gi_sizes,
258 &gi_buf[i-1000]);296 &new_buf[i-1000]);
259 if (status < 0)297 if (status < 0)
260 check_h5_error (status, __FILE__, __LINE__);298 check_h5_error (status, __FILE__, __LINE__);
261 299
@@ -263,7 +301,7 @@ assign_protein_type (hid_t file_id)
263 if (status < 0)301 if (status < 0)
264 check_h5_error (status, __FILE__, __LINE__);302 check_h5_error (status, __FILE__, __LINE__);
265303
266 updates_pending = false;304 written = i;
267305
268 printf ("Processed %i of %i records.\n", i, (int)faa_nrecords);306 printf ("Processed %i of %i records.\n", i, (int)faa_nrecords);
269 }307 }
@@ -274,37 +312,34 @@ assign_protein_type (hid_t file_id)
274 * Write out records from the last bin if it was less than 1000312 * Write out records from the last bin if it was less than 1000
275 * records in size.313 * records in size.
276 */314 */
277 if (updates_pending)315 if ((int)faa_nrecords < 1000)
278 {316 {
279 /*317 status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords,
280 if ((int)faa_nrecords < 1000) 318 gi_size, gi_offset, gi_sizes,
281 {319 new_buf);
282 status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords,320 }
283 dst_size, dst_offset, dst_sizes,321
284 dst_buf);322 else
285 }323 {
286 else324 status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written,
287 {325 gi_size, gi_offset, gi_sizes,
288 status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000,326 &new_buf[written]);
289 dst_size, dst_offset, dst_sizes,
290 &dst_buf[nrecords-1000]);
291 }
292 if (status < 0)
293 check_h5_error (status, __FILE__, __LINE__);
294
295 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
296 if (status < 0)
297 check_h5_error (status, __FILE__, __LINE__);
298
299 updates_pending = false;
300 */
301 }327 }
328
329 if (status < 0)
330 check_h5_error (status, __FILE__, __LINE__);
331
332 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
333 if (status < 0)
334 check_h5_error (status, __FILE__, __LINE__);
302 335
303 free (faa_buf);336 free (faa_buf);
304 free (gi_buf);337 free (old_buf);
338 free (new_buf);
305 hdestroy_r (&htab);339 hdestroy_r (&htab);
306340
307 options = BLASTOptionDelete (options);341 options = BLASTOptionDelete (options);
342 readdb_destruct (seqdb);
308 343
309 return;344 return;
310}345}

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.