-rw-r--r-- | src/assign/assign_protein_type.c | 269 |
1 files changed, 163 insertions, 106 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c index 0a11c23..73685bb 100644 --- a/src/assign/assign_protein_type.c +++ b/src/assign/assign_protein_type.c | |||
@@ -1,27 +1,19 @@ | |||
1 | #define _GNU_SOURCE | ||
1 | #include "assign_protein_type.h" | 2 | #include "assign_protein_type.h" |
2 | #include "error/check_ncbi_error.h" | ||
3 | #include "error/check_h5_error.h" | 3 | #include "error/check_h5_error.h" |
4 | #include "error/check_ncbi_error.h" | ||
4 | #include "model/gi_type_data.h" | 5 | #include "model/gi_type_data.h" |
5 | #include "model/gi_type_data_init.h" | 6 | #include "model/gi_type_data_init.h" |
6 | #include "model/sequence_data.h" | 7 | #include "model/sequence_data.h" |
7 | #include "model/sequence_data_init.h" | 8 | #include "model/sequence_data_init.h" |
9 | #include <blast.h> | ||
10 | #include <error.h> | ||
11 | #include <hdf5_hl.h> | ||
8 | #include <ncbi.h> | 12 | #include <ncbi.h> |
9 | #include <readdb.h> | 13 | #include <readdb.h> |
10 | #include <blast.h> | ||
11 | #include <salpacc.h> | 14 | #include <salpacc.h> |
15 | #include <search.h> | ||
12 | #include <stdbool.h> | 16 | #include <stdbool.h> |
13 | #include <hdf5_hl.h> | ||
14 | #include <error.h> | ||
15 | |||
16 | /* | ||
17 | * todo: Write results to an independent table by GI so that they can | ||
18 | * be reused when the input tables influenza.faa and influenza_aa.dat | ||
19 | * are replaced with new data. Since GI values are unique and | ||
20 | * persistent the assignment of a protein type should stay valid | ||
21 | * unless the list of reference sequences is updated. | ||
22 | * | ||
23 | * todo: Save the species type as a new field { A, B, C }. | ||
24 | */ | ||
25 | 17 | ||
26 | /* | 18 | /* |
27 | * BLAST database containing all of the influenza protein sequences. | 19 | * BLAST database containing all of the influenza protein sequences. |
@@ -94,25 +86,75 @@ assign_protein_type (hid_t file_id) | |||
94 | /* | 86 | /* |
95 | * Read the data from HDF5 gi_type_data. | 87 | * Read the data from HDF5 gi_type_data. |
96 | */ | 88 | */ |
97 | hsize_t gi_nfields; | 89 | hsize_t gi_nfields = 0; |
98 | hsize_t gi_nrecords; | 90 | hsize_t gi_nrecords = 0; |
99 | status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields, | 91 | size_t gi_size = 0; |
100 | &gi_nrecords); | ||
101 | if (status < 0) | ||
102 | check_h5_error (status, __FILE__, __LINE__); | ||
103 | |||
104 | gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); | ||
105 | |||
106 | size_t gi_size; | ||
107 | size_t gi_offset[GI_TYPE_DATA_FIELD_NUM]; | 92 | size_t gi_offset[GI_TYPE_DATA_FIELD_NUM]; |
108 | size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM]; | 93 | size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM]; |
109 | hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; | 94 | hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; |
110 | gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); | 95 | gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); |
111 | 96 | ||
112 | status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, | 97 | gi_type_data* gi_buf = NULL; |
113 | gi_sizes, gi_buf); | 98 | |
114 | if (status < 0) | 99 | if (H5LTfind_dataset (file_id, "gi_type_data") == 1) |
115 | check_h5_error (status, __FILE__, __LINE__); | 100 | { |
101 | |||
102 | printf ("Updating gi_type_data.\n"); | ||
103 | |||
104 | status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields, | ||
105 | &gi_nrecords); | ||
106 | if (status < 0) | ||
107 | check_h5_error (status, __FILE__, __LINE__); | ||
108 | |||
109 | gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); | ||
110 | |||
111 | status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, | ||
112 | gi_sizes, gi_buf); | ||
113 | if (status < 0) | ||
114 | check_h5_error (status, __FILE__, __LINE__); | ||
115 | |||
116 | } | ||
117 | else | ||
118 | { | ||
119 | |||
120 | printf ("Creating gi_type_data.\n"); | ||
121 | |||
122 | gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords); | ||
123 | |||
124 | const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] = | ||
125 | GI_TYPE_DATA_FIELD_NAMES; | ||
126 | |||
127 | hsize_t chunk_size = 10; | ||
128 | int *fill_data = NULL; | ||
129 | int compress = 0; | ||
130 | |||
131 | status = H5TBmake_table ("gi_type_data", file_id, | ||
132 | "gi_type_data", | ||
133 | GI_TYPE_DATA_FIELD_NUM, faa_nrecords, | ||
134 | gi_size, gi_type_data_field_names, | ||
135 | gi_offset, gi_field_type, | ||
136 | chunk_size, fill_data, compress, | ||
137 | NULL); | ||
138 | if (status < 0) | ||
139 | check_h5_error (status, __FILE__, __LINE__); | ||
140 | |||
141 | } | ||
142 | |||
143 | struct hsearch_data htab; | ||
144 | bzero (&htab, sizeof (htab)); | ||
145 | hcreate_r (gi_nrecords * 2, &htab); | ||
146 | ENTRY e, *ep; | ||
147 | |||
148 | for (int i = 0; i < gi_nrecords; i++) | ||
149 | { | ||
150 | char gi_chr[25]; | ||
151 | snprintf (gi_chr, 25, "%i", gi_buf[i].gi); | ||
152 | e.key = gi_chr; | ||
153 | e.data = &gi_buf[i]; | ||
154 | if (hsearch_r (e, ENTER, &ep, &htab) == 0) | ||
155 | error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, | ||
156 | "Allocation failed."); | ||
157 | } | ||
116 | 158 | ||
117 | /* | 159 | /* |
118 | * Assign protein types to records for which the field is empty. | 160 | * Assign protein types to records for which the field is empty. |
@@ -121,97 +163,109 @@ assign_protein_type (hid_t file_id) | |||
121 | bool updates_pending = false; | 163 | bool updates_pending = false; |
122 | for (int i = 0; i < faa_nrecords; i++) | 164 | for (int i = 0; i < faa_nrecords; i++) |
123 | { | 165 | { |
166 | |||
167 | char gi_chr[25]; | ||
168 | snprintf (gi_chr, 25, "%i", faa_buf[i].gi); | ||
169 | e.key = gi_chr; | ||
170 | if (hsearch_r (e, FIND, &ep, &htab) == 0) | ||
171 | { | ||
124 | 172 | ||
125 | if (dst_buf[i].protein_type[0] == '\0') { | 173 | gi_buf[i].gi = faa_buf[i].gi; |
126 | /* | 174 | gi_buf[i].type[0] = '\0'; |
127 | * Read the sequence from the database by GI. | 175 | gi_buf[i].protein[0] = '\0'; |
128 | */ | 176 | |
129 | Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); | 177 | /* |
130 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); | 178 | * Read the sequence from the database by GI. |
131 | if (bsp == NULL) | 179 | */ |
132 | { | 180 | Int4 sequence_number = readdb_gi2seq (seqdb, faa_buf[i].gi, NULL); |
133 | error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, | 181 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); |
134 | "Unable to find BLAST record for gi|%i. Ensure the BLAST " | 182 | if (bsp == NULL) |
135 | "database is up-to-date with the HDF5 record set. See the " | 183 | { |
136 | "BLAST formatdb.log file for details.\n", | 184 | error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, |
137 | dst_buf[i].gi); | 185 | "Unable to find BLAST record for gi|%i. Ensure " |
138 | } | 186 | "the BLAST database is up-to-date with the HDF5 " |
139 | 187 | "record set. See the BLAST formatdb.log file " | |
140 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, | 188 | "for details.\n", |
141 | "blastp", | 189 | faa_buf[i].gi); |
142 | REFDB, | 190 | } |
143 | options, | 191 | |
144 | NULL, | 192 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, |
145 | &error_returns, | 193 | "blastp", |
146 | NULL); | 194 | REFDB, |
147 | 195 | options, | |
148 | /* | 196 | NULL, |
149 | * BLAST reported an error. Write it out and continue processing. | 197 | &error_returns, |
150 | */ | 198 | NULL); |
151 | if (error_returns != NULL) | 199 | |
152 | { | 200 | /* |
153 | printf ("Warning: An error has been reported by the NCBI Toolkit API " | 201 | * BLAST reported an error. Write it out and continue processing. |
154 | "for sequence gi|%i: %s", | 202 | */ |
155 | dst_buf[i].gi, BlastErrorToString (error_returns)); | 203 | if (error_returns != NULL) |
156 | 204 | { | |
157 | } | 205 | char *msg = BlastErrorToString (error_returns); |
158 | 206 | printf ("Warning: An error has been reported by the NCBI Toolkit " | |
159 | /* | 207 | "API for sequence gi|%i: %s", |
160 | * A hit was found. Record the first hit as the protein type. | 208 | faa_buf[i].gi, msg); |
161 | * Skip the first 6 characters and eat the "lcl|x_". | 209 | free (msg); |
162 | */ | 210 | } |
163 | else if (seqalign != NULL) | 211 | |
164 | { | 212 | /* |
165 | Char target_id_buf[BUFFER_LEN + 1]; | 213 | * A hit was found. Record the first hit as the protein type. |
166 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); | 214 | * Skip the first 6 characters and eat the "lcl|x_". |
167 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); | 215 | */ |
168 | 216 | else if (seqalign != NULL) | |
169 | // Species Type | 217 | { |
170 | dst_buf[i].type[0] = &target_id_buf[4]; | 218 | Char target_id_buf[BUFFER_LEN + 1]; |
171 | dst_buf[i].type[1] = '\0'; | 219 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); |
172 | 220 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, | |
173 | // Protein Type | 221 | BUFFER_LEN); |
174 | strncpy (dst_buf[i].protein, &target_id_buf[6], | ||
175 | sizeof (dst_buf[i].protein)); | ||
176 | |||
177 | updates_pending = true; | ||
178 | } | ||
179 | |||
180 | /* | ||
181 | * BLAST did not find any hits. | ||
182 | */ | ||
183 | else | ||
184 | { | ||
185 | printf ("Warning: Unable to identify protein type for sequence gi|%i\n", | ||
186 | dst_buf[i].gi); | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * Clean up memory for the next ieration. | ||
191 | */ | ||
192 | seqalign = SeqAlignSetFree (seqalign); | ||
193 | bsp = BioseqFree (bsp); | ||
194 | |||
195 | } | ||
196 | 222 | ||
223 | // Species Type | ||
224 | gi_buf[i].type[0] = target_id_buf[4]; | ||
225 | gi_buf[i].type[1] = '\0'; | ||
226 | |||
227 | // Protein Type | ||
228 | strncpy (gi_buf[i].protein, &target_id_buf[6], | ||
229 | sizeof (gi_buf[i].protein)); | ||
230 | |||
231 | updates_pending = true; | ||
232 | } | ||
233 | |||
234 | /* | ||
235 | * BLAST did not find any hits. | ||
236 | */ | ||
237 | else | ||
238 | { | ||
239 | printf ("Warning: Unable to identify protein type for sequence " | ||
240 | "gi|%i\n", faa_buf[i].gi); | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Clean up memory for the next ieration. | ||
245 | */ | ||
246 | seqalign = SeqAlignSetFree (seqalign); | ||
247 | bsp = BioseqFree (bsp); | ||
248 | |||
249 | } | ||
250 | |||
197 | /* | 251 | /* |
198 | * Write the data out to the file. | 252 | * Write the data out to the file. |
199 | */ | 253 | */ |
200 | if ( (i % 1000 == 0) && (i > 0) && updates_pending) | 254 | if ( (i % 1000 == 0) && (i > 0) && updates_pending) |
201 | { | 255 | { |
202 | status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, | 256 | status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000, |
203 | dst_size, dst_offset, dst_sizes, | 257 | gi_size, gi_offset, gi_sizes, |
204 | &dst_buf[i-1000]); | 258 | &gi_buf[i-1000]); |
205 | if (status < 0) | 259 | if (status < 0) |
206 | check_h5_error (status, __FILE__, __LINE__); | 260 | check_h5_error (status, __FILE__, __LINE__); |
207 | 261 | ||
208 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); | 262 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); |
209 | if (status < 0) | 263 | if (status < 0) |
210 | check_h5_error (status, __FILE__, __LINE__); | 264 | check_h5_error (status, __FILE__, __LINE__); |
211 | 265 | ||
212 | updates_pending = false; | 266 | updates_pending = false; |
213 | 267 | ||
214 | printf ("Processed %i of %i records.\n", i, (int)nrecords); | 268 | printf ("Processed %i of %i records.\n", i, (int)faa_nrecords); |
215 | } | 269 | } |
216 | 270 | ||
217 | } | 271 | } |
@@ -222,7 +276,8 @@ assign_protein_type (hid_t file_id) | |||
222 | */ | 276 | */ |
223 | if (updates_pending) | 277 | if (updates_pending) |
224 | { | 278 | { |
225 | if ((int)nrecords < 1000) | 279 | /* |
280 | if ((int)faa_nrecords < 1000) | ||
226 | { | 281 | { |
227 | status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, | 282 | status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, |
228 | dst_size, dst_offset, dst_sizes, | 283 | dst_size, dst_offset, dst_sizes, |
@@ -242,11 +297,13 @@ assign_protein_type (hid_t file_id) | |||
242 | check_h5_error (status, __FILE__, __LINE__); | 297 | check_h5_error (status, __FILE__, __LINE__); |
243 | 298 | ||
244 | updates_pending = false; | 299 | updates_pending = false; |
300 | */ | ||
245 | } | 301 | } |
246 | 302 | ||
247 | free (faa_buf); | 303 | free (faa_buf); |
248 | free (gi_buf); | 304 | free (gi_buf); |
249 | 305 | hdestroy_r (&htab); | |
306 | |||
250 | options = BLASTOptionDelete (options); | 307 | options = BLASTOptionDelete (options); |
251 | 308 | ||
252 | return; | 309 | return; |