summaryrefslogtreecommitdiffstats
Unidiff
-rw-r--r--src/assign/assign_protein_type.c269
1 files changed, 163 insertions, 106 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c
index 0a11c23..73685bb 100644
--- a/src/assign/assign_protein_type.c
+++ b/src/assign/assign_protein_type.c
@@ -1,27 +1,19 @@
1#define _GNU_SOURCE
1#include "assign_protein_type.h"2#include "assign_protein_type.h"
2#include "error/check_ncbi_error.h"
3#include "error/check_h5_error.h"3#include "error/check_h5_error.h"
4#include "error/check_ncbi_error.h"
4#include "model/gi_type_data.h"5#include "model/gi_type_data.h"
5#include "model/gi_type_data_init.h"6#include "model/gi_type_data_init.h"
6#include "model/sequence_data.h"7#include "model/sequence_data.h"
7#include "model/sequence_data_init.h"8#include "model/sequence_data_init.h"
9#include <blast.h>
10#include <error.h>
11#include <hdf5_hl.h>
8#include <ncbi.h>12#include <ncbi.h>
9#include <readdb.h>13#include <readdb.h>
10#include <blast.h>
11#include <salpacc.h>14#include <salpacc.h>
15#include <search.h>
12#include <stdbool.h>16#include <stdbool.h>
13#include <hdf5_hl.h>
14#include <error.h>
15
16/*
17 * todo: Write results to an independent table by GI so that they can
18 * be reused when the input tables influenza.faa and influenza_aa.dat
19 * are replaced with new data. Since GI values are unique and
20 * persistent the assignment of a protein type should stay valid
21 * unless the list of reference sequences is updated.
22 *
23 * todo: Save the species type as a new field { A, B, C }.
24 */
2517
26/*18/*
27 * BLAST database containing all of the influenza protein sequences.19 * BLAST database containing all of the influenza protein sequences.
@@ -94,25 +86,75 @@ assign_protein_type (hid_t file_id)
94 /*86 /*
95 * Read the data from HDF5 gi_type_data.87 * Read the data from HDF5 gi_type_data.
96 */88 */
97 hsize_t gi_nfields;89 hsize_t gi_nfields = 0;
98 hsize_t gi_nrecords;90 hsize_t gi_nrecords = 0;
99 status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields,91 size_t gi_size = 0;
100 &gi_nrecords);
101 if (status < 0)
102 check_h5_error (status, __FILE__, __LINE__);
103
104 gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
105
106 size_t gi_size;
107 size_t gi_offset[GI_TYPE_DATA_FIELD_NUM];92 size_t gi_offset[GI_TYPE_DATA_FIELD_NUM];
108 size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM];93 size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM];
109 hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];94 hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
110 gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);95 gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
11196
112 status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,97 gi_type_data* gi_buf = NULL;
113 gi_sizes, gi_buf);98
114 if (status < 0)99 if (H5LTfind_dataset (file_id, "gi_type_data") == 1)
115 check_h5_error (status, __FILE__, __LINE__);100 {
101
102 printf ("Updating gi_type_data.\n");
103
104 status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields,
105 &gi_nrecords);
106 if (status < 0)
107 check_h5_error (status, __FILE__, __LINE__);
108
109 gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
110
111 status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
112 gi_sizes, gi_buf);
113 if (status < 0)
114 check_h5_error (status, __FILE__, __LINE__);
115
116 }
117 else
118 {
119
120 printf ("Creating gi_type_data.\n");
121
122 gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords);
123
124 const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] =
125 GI_TYPE_DATA_FIELD_NAMES;
126
127 hsize_t chunk_size = 10;
128 int *fill_data = NULL;
129 int compress = 0;
130
131 status = H5TBmake_table ("gi_type_data", file_id,
132 "gi_type_data",
133 GI_TYPE_DATA_FIELD_NUM, faa_nrecords,
134 gi_size, gi_type_data_field_names,
135 gi_offset, gi_field_type,
136 chunk_size, fill_data, compress,
137 NULL);
138 if (status < 0)
139 check_h5_error (status, __FILE__, __LINE__);
140
141 }
142
143 struct hsearch_data htab;
144 bzero (&htab, sizeof (htab));
145 hcreate_r (gi_nrecords * 2, &htab);
146 ENTRY e, *ep;
147
148 for (int i = 0; i < gi_nrecords; i++)
149 {
150 char gi_chr[25];
151 snprintf (gi_chr, 25, "%i", gi_buf[i].gi);
152 e.key = gi_chr;
153 e.data = &gi_buf[i];
154 if (hsearch_r (e, ENTER, &ep, &htab) == 0)
155 error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
156 "Allocation failed.");
157 }
116158
117 /*159 /*
118 * Assign protein types to records for which the field is empty.160 * Assign protein types to records for which the field is empty.
@@ -121,97 +163,109 @@ assign_protein_type (hid_t file_id)
121 bool updates_pending = false;163 bool updates_pending = false;
122 for (int i = 0; i < faa_nrecords; i++)164 for (int i = 0; i < faa_nrecords; i++)
123 {165 {
166
167 char gi_chr[25];
168 snprintf (gi_chr, 25, "%i", faa_buf[i].gi);
169 e.key = gi_chr;
170 if (hsearch_r (e, FIND, &ep, &htab) == 0)
171 {
124172
125 if (dst_buf[i].protein_type[0] == '\0') {173 gi_buf[i].gi = faa_buf[i].gi;
126 /*174 gi_buf[i].type[0] = '\0';
127 * Read the sequence from the database by GI.175 gi_buf[i].protein[0] = '\0';
128 */176
129 Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);177 /*
130 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);178 * Read the sequence from the database by GI.
131 if (bsp == NULL) 179 */
132 {180 Int4 sequence_number = readdb_gi2seq (seqdb, faa_buf[i].gi, NULL);
133 error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,181 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
134 "Unable to find BLAST record for gi|%i. Ensure the BLAST "182 if (bsp == NULL)
135 "database is up-to-date with the HDF5 record set. See the "183 {
136 "BLAST formatdb.log file for details.\n",184 error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
137 dst_buf[i].gi);185 "Unable to find BLAST record for gi|%i. Ensure "
138 }186 "the BLAST database is up-to-date with the HDF5 "
139 187 "record set. See the BLAST formatdb.log file "
140 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,188 "for details.\n",
141 "blastp",189 faa_buf[i].gi);
142 REFDB,190 }
143 options,191
144 NULL,192 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
145 &error_returns,193 "blastp",
146 NULL);194 REFDB,
147195 options,
148 /*196 NULL,
149 * BLAST reported an error. Write it out and continue processing.197 &error_returns,
150 */198 NULL);
151 if (error_returns != NULL)199
152 {200 /*
153 printf ("Warning: An error has been reported by the NCBI Toolkit API "201 * BLAST reported an error. Write it out and continue processing.
154 "for sequence gi|%i: %s",202 */
155 dst_buf[i].gi, BlastErrorToString (error_returns));203 if (error_returns != NULL)
156204 {
157 }205 char *msg = BlastErrorToString (error_returns);
158206 printf ("Warning: An error has been reported by the NCBI Toolkit "
159 /*207 "API for sequence gi|%i: %s",
160 * A hit was found. Record the first hit as the protein type.208 faa_buf[i].gi, msg);
161 * Skip the first 6 characters and eat the "lcl|x_".209 free (msg);
162 */210 }
163 else if (seqalign != NULL)211
164 {212 /*
165 Char target_id_buf[BUFFER_LEN + 1];213 * A hit was found. Record the first hit as the protein type.
166 SeqIdPtr target_id = SeqAlignId (seqalign, 1);214 * Skip the first 6 characters and eat the "lcl|x_".
167 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);215 */
168216 else if (seqalign != NULL)
169 // Species Type217 {
170 dst_buf[i].type[0] = &target_id_buf[4];218 Char target_id_buf[BUFFER_LEN + 1];
171 dst_buf[i].type[1] = '\0';219 SeqIdPtr target_id = SeqAlignId (seqalign, 1);
172220 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT,
173 // Protein Type221 BUFFER_LEN);
174 strncpy (dst_buf[i].protein, &target_id_buf[6],
175 sizeof (dst_buf[i].protein));
176
177 updates_pending = true;
178 }
179
180 /*
181 * BLAST did not find any hits.
182 */
183 else
184 {
185 printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
186 dst_buf[i].gi);
187 }
188
189 /*
190 * Clean up memory for the next ieration.
191 */
192 seqalign = SeqAlignSetFree (seqalign);
193 bsp = BioseqFree (bsp);
194
195 }
196222
223 // Species Type
224 gi_buf[i].type[0] = target_id_buf[4];
225 gi_buf[i].type[1] = '\0';
226
227 // Protein Type
228 strncpy (gi_buf[i].protein, &target_id_buf[6],
229 sizeof (gi_buf[i].protein));
230
231 updates_pending = true;
232 }
233
234 /*
235 * BLAST did not find any hits.
236 */
237 else
238 {
239 printf ("Warning: Unable to identify protein type for sequence "
240 "gi|%i\n", faa_buf[i].gi);
241 }
242
243 /*
244 * Clean up memory for the next ieration.
245 */
246 seqalign = SeqAlignSetFree (seqalign);
247 bsp = BioseqFree (bsp);
248
249 }
250
197 /*251 /*
198 * Write the data out to the file.252 * Write the data out to the file.
199 */253 */
200 if ( (i % 1000 == 0) && (i > 0) && updates_pending)254 if ( (i % 1000 == 0) && (i > 0) && updates_pending)
201 {255 {
202 status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,256 status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000,
203 dst_size, dst_offset, dst_sizes, 257 gi_size, gi_offset, gi_sizes,
204 &dst_buf[i-1000]);258 &gi_buf[i-1000]);
205 if (status < 0)259 if (status < 0)
206 check_h5_error (status, __FILE__, __LINE__);260 check_h5_error (status, __FILE__, __LINE__);
207 261
208 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);262 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
209 if (status < 0)263 if (status < 0)
210 check_h5_error (status, __FILE__, __LINE__);264 check_h5_error (status, __FILE__, __LINE__);
211 265
212 updates_pending = false;266 updates_pending = false;
213267
214 printf ("Processed %i of %i records.\n", i, (int)nrecords);268 printf ("Processed %i of %i records.\n", i, (int)faa_nrecords);
215 }269 }
216 270
217 }271 }
@@ -222,7 +276,8 @@ assign_protein_type (hid_t file_id)
222 */276 */
223 if (updates_pending)277 if (updates_pending)
224 {278 {
225 if ((int)nrecords < 1000) 279 /*
280 if ((int)faa_nrecords < 1000)
226 {281 {
227 status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords,282 status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords,
228 dst_size, dst_offset, dst_sizes,283 dst_size, dst_offset, dst_sizes,
@@ -242,11 +297,13 @@ assign_protein_type (hid_t file_id)
242 check_h5_error (status, __FILE__, __LINE__);297 check_h5_error (status, __FILE__, __LINE__);
243 298
244 updates_pending = false;299 updates_pending = false;
300 */
245 }301 }
246 302
247 free (faa_buf);303 free (faa_buf);
248 free (gi_buf);304 free (gi_buf);
249 305 hdestroy_r (&htab);
306
250 options = BLASTOptionDelete (options);307 options = BLASTOptionDelete (options);
251 308
252 return;309 return;

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.