summaryrefslogtreecommitdiffstats
Unidiff
-rw-r--r--analysis/year.R20
-rw-r--r--src/Makefile.am39
-rw-r--r--src/aggregator.c6
-rw-r--r--src/assign/assign_protein_type.c (renamed from src/assign_protein_type.c)87
-rw-r--r--src/assign/assign_protein_type.h (renamed from src/assign_protein_type.h)0
-rw-r--r--src/error/check_error.c (renamed from src/check_error.c)0
-rw-r--r--src/error/check_error.h (renamed from src/check_error.h)0
-rw-r--r--src/error/check_h5_error.c (renamed from src/check_h5_error.c)0
-rw-r--r--src/error/check_h5_error.h (renamed from src/check_h5_error.h)0
-rw-r--r--src/error/check_ncbi_error.c (renamed from src/check_ncbi_error.c)0
-rw-r--r--src/error/check_ncbi_error.h (renamed from src/check_ncbi_error.h)0
-rw-r--r--src/load/load_influenza_aa_dat.c (renamed from src/load_influenza_aa_dat.c)4
-rw-r--r--src/load/load_influenza_aa_dat.h (renamed from src/load_influenza_aa_dat.h)0
-rw-r--r--src/load/load_influenza_faa.c (renamed from src/load_influenza_faa.c)10
-rw-r--r--src/load/load_influenza_faa.h (renamed from src/load_influenza_faa.h)0
-rw-r--r--src/model/gi_type_data.h21
-rw-r--r--src/model/gi_type_data_init.c36
-rw-r--r--src/model/gi_type_data_init.h14
-rw-r--r--src/model/sequence_data.h (renamed from src/sequence_data.h)5
-rw-r--r--src/model/sequence_data_init.c (renamed from src/sequence_data_init.c)6
-rw-r--r--src/model/sequence_data_init.h (renamed from src/sequence_data_init.h)0
-rw-r--r--src/updator.c4
22 files changed, 181 insertions, 71 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c
new file mode 100644
index 0000000..0a11c23
--- a/dev/null
+++ b/src/assign/assign_protein_type.c
@@ -0,0 +1,253 @@
1#include "assign_protein_type.h"
2#include "error/check_ncbi_error.h"
3#include "error/check_h5_error.h"
4#include "model/gi_type_data.h"
5#include "model/gi_type_data_init.h"
6#include "model/sequence_data.h"
7#include "model/sequence_data_init.h"
8#include <ncbi.h>
9#include <readdb.h>
10#include <blast.h>
11#include <salpacc.h>
12#include <stdbool.h>
13#include <hdf5_hl.h>
14#include <error.h>
15
16/*
17 * todo: Write results to an independent table by GI so that they can
18 * be reused when the input tables influenza.faa and influenza_aa.dat
19 * are replaced with new data. Since GI values are unique and
20 * persistent the assignment of a protein type should stay valid
21 * unless the list of reference sequences is updated.
22 *
23 * todo: Save the species type as a new field { A, B, C }.
24 */
25
26/*
27 * BLAST database containing all of the influenza protein sequences.
28 */
29#define SEQDB "/home/don/exp004/influenzadb/influenzadb"
30
31/*
32 * BLAST reference database of prototypical protein types.
33 */
34#define REFDB "/home/don/exp004/influenzadb/proteinnames"
35
36#define BUFFER_LEN 50
37
38void
39assign_protein_type (hid_t file_id)
40{
41 /*
42 * Iterate through the records for which no protein type has been
43 * assigned. Create a BioSeq Pointer to the data and then use this
44 * as input to the BLAST run against the reference database of
45 * prototypical sequence records.
46 */
47
48 /*
49 * Make BLAST messages reportable.
50 */
51 ErrSetMessageLevel (SEV_WARNING);
52
53 /*
54 * Open the BLAST sequence database.
55 */
56 ReadDBFILEPtr seqdb = readdb_new (SEQDB, true);
57
58 /*
59 * Get default BLAST options.
60 */
61 BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
62
63 /*
64 * Set to single hit mode.
65 */
66 options->window_size = 0;
67 options->multiple_hits_only = false;
68
69 ValNodePtr error_returns = NULL;
70
71 /*
72 * Read the data from HDF5 influenza.faa.
73 */
74 hsize_t faa_nfields;
75 hsize_t faa_nrecords;
76 herr_t status = H5TBget_table_info (file_id, "influenza.faa", &faa_nfields,
77 &faa_nrecords);
78 if (status < 0)
79 check_h5_error (status, __FILE__, __LINE__);
80
81 sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords);
82
83 size_t faa_size;
84 size_t faa_offset[SEQUENCE_DATA_FIELD_NUM];
85 size_t faa_sizes[SEQUENCE_DATA_FIELD_NUM];
86 hid_t faa_field_type[SEQUENCE_DATA_FIELD_NUM];
87 sequence_data_init (&faa_size, faa_offset, faa_sizes, faa_field_type);
88
89 status = H5TBread_table (file_id, "influenza.faa", faa_size, faa_offset,
90 faa_sizes, faa_buf);
91 if (status < 0)
92 check_h5_error (status, __FILE__, __LINE__);
93
94 /*
95 * Read the data from HDF5 gi_type_data.
96 */
97 hsize_t gi_nfields;
98 hsize_t gi_nrecords;
99 status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields,
100 &gi_nrecords);
101 if (status < 0)
102 check_h5_error (status, __FILE__, __LINE__);
103
104 gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
105
106 size_t gi_size;
107 size_t gi_offset[GI_TYPE_DATA_FIELD_NUM];
108 size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM];
109 hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
110 gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
111
112 status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
113 gi_sizes, gi_buf);
114 if (status < 0)
115 check_h5_error (status, __FILE__, __LINE__);
116
117 /*
118 * Assign protein types to records for which the field is empty.
119 */
120 printf ("Records to process: %i\n", (int)faa_nrecords);
121 bool updates_pending = false;
122 for (int i = 0; i < faa_nrecords; i++)
123 {
124
125 if (dst_buf[i].protein_type[0] == '\0') {
126 /*
127 * Read the sequence from the database by GI.
128 */
129 Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
130 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
131 if (bsp == NULL)
132 {
133 error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
134 "Unable to find BLAST record for gi|%i. Ensure the BLAST "
135 "database is up-to-date with the HDF5 record set. See the "
136 "BLAST formatdb.log file for details.\n",
137 dst_buf[i].gi);
138 }
139
140 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
141 "blastp",
142 REFDB,
143 options,
144 NULL,
145 &error_returns,
146 NULL);
147
148 /*
149 * BLAST reported an error. Write it out and continue processing.
150 */
151 if (error_returns != NULL)
152 {
153 printf ("Warning: An error has been reported by the NCBI Toolkit API "
154 "for sequence gi|%i: %s",
155 dst_buf[i].gi, BlastErrorToString (error_returns));
156
157 }
158
159 /*
160 * A hit was found. Record the first hit as the protein type.
161 * Skip the first 6 characters and eat the "lcl|x_".
162 */
163 else if (seqalign != NULL)
164 {
165 Char target_id_buf[BUFFER_LEN + 1];
166 SeqIdPtr target_id = SeqAlignId (seqalign, 1);
167 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
168
169 // Species Type
170 dst_buf[i].type[0] = &target_id_buf[4];
171 dst_buf[i].type[1] = '\0';
172
173 // Protein Type
174 strncpy (dst_buf[i].protein, &target_id_buf[6],
175 sizeof (dst_buf[i].protein));
176
177 updates_pending = true;
178 }
179
180 /*
181 * BLAST did not find any hits.
182 */
183 else
184 {
185 printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
186 dst_buf[i].gi);
187 }
188
189 /*
190 * Clean up memory for the next ieration.
191 */
192 seqalign = SeqAlignSetFree (seqalign);
193 bsp = BioseqFree (bsp);
194
195 }
196
197 /*
198 * Write the data out to the file.
199 */
200 if ( (i % 1000 == 0) && (i > 0) && updates_pending)
201 {
202 status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
203 dst_size, dst_offset, dst_sizes,
204 &dst_buf[i-1000]);
205 if (status < 0)
206 check_h5_error (status, __FILE__, __LINE__);
207
208 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
209 if (status < 0)
210 check_h5_error (status, __FILE__, __LINE__);
211
212 updates_pending = false;
213
214 printf ("Processed %i of %i records.\n", i, (int)nrecords);
215 }
216
217 }
218
219 /*
220 * Write out records from the last bin if it was less than 1000
221 * records in size.
222 */
223 if (updates_pending)
224 {
225 if ((int)nrecords < 1000)
226 {
227 status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords,
228 dst_size, dst_offset, dst_sizes,
229 dst_buf);
230 }
231 else
232 {
233 status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000,
234 dst_size, dst_offset, dst_sizes,
235 &dst_buf[nrecords-1000]);
236 }
237 if (status < 0)
238 check_h5_error (status, __FILE__, __LINE__);
239
240 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
241 if (status < 0)
242 check_h5_error (status, __FILE__, __LINE__);
243
244 updates_pending = false;
245 }
246
247 free (faa_buf);
248 free (gi_buf);
249
250 options = BLASTOptionDelete (options);
251
252 return;
253}

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.