summaryrefslogtreecommitdiffstats
Unidiff
-rw-r--r--.gitignore5
-rw-r--r--src/Makefile.am10
-rw-r--r--src/assign_protein_type.c135
-rw-r--r--src/load_influenza_faa.c67
-rw-r--r--src/sequence_data.h16
-rw-r--r--src/sequence_data_init.c37
-rw-r--r--src/sequence_data_init.h14
7 files changed, 188 insertions, 96 deletions
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c
index 1b58f54..166e787 100644
--- a/src/assign_protein_type.c
+++ b/src/assign_protein_type.c
@@ -1,12 +1,15 @@
1#include "assign_protein_type.h"1#include "assign_protein_type.h"
2#include "check_ncbi_error.h"2#include "check_ncbi_error.h"
3#include "check_h5_error.h"3#include "check_h5_error.h"
4#include "sequence_data.h"
5#include "sequence_data_init.h"
4#include <ncbi.h>6#include <ncbi.h>
5#include <readdb.h>7#include <readdb.h>
6#include <blast.h>8#include <blast.h>
7#include <salpacc.h>9#include <salpacc.h>
8#include <stdbool.h>10#include <stdbool.h>
9#include <hdf5_hl.h>11#include <hdf5_hl.h>
12#include <error.h>
1013
11/*14/*
12 * BLAST database containing all of the influenza protein sequences.15 * BLAST database containing all of the influenza protein sequences.
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id)
44 * Get default BLAST options.47 * Get default BLAST options.
45 */48 */
46 BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);49 BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
50
51 /*
52 * Set to single hit mode.
53 */
54 options->window_size = 0;
55 options->multiple_hits_only = false;
56
47 ValNodePtr error_returns = NULL;57 ValNodePtr error_returns = NULL;
4858
49 /*59 /*
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id)
56 if (status < 0)66 if (status < 0)
57 check_h5_error (status, __FILE__, __LINE__);67 check_h5_error (status, __FILE__, __LINE__);
5868
59 /*69 sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);
60 * todo: Allocate memory of nrecords for dst_buf.70
61 *71 size_t dst_size;
62 * todo: Refactor code to share structres in read and write HDF572 size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
63 * calls.73 size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
64 */74 hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
75 sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
6576
66 status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,77 status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
67 dst_sizes, dst_buf);78 dst_sizes, dst_buf);
68 if (status < 0)79 if (status < 0)
69 check_h5_error (status, __FILE__, __LINE__);80 check_h5_error (status, __FILE__, __LINE__);
7081
71 for (int i = 0; i < nrecords; i++)
72 {
73
74 }
75
76 /*82 /*
77 * Read the sequence from the database by GI.83 * Assign protein types to records for which the field is empty.
78 */84 */
79 Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);85 printf ("Records to process: %i\n", (int)nrecords);
80 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);86 for (int i = 0; i < nrecords; i++)
81
82 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
83 "blastp",
84 REFDB,
85 options,
86 NULL,
87 &error_returns,
88 NULL);
89
90 if (error_returns != NULL)
91 check_ncbi_error (error_returns, __FILE__, __LINE__);
92
93 if (seqalign != NULL)
94 {87 {
95 Char target_id_buf[BUFFER_LEN + 1];
96 SeqIdPtr target_id = SeqAlignId (seqalign, 1);
97 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
98 printf ("%s\n", target_id_buf);
99 }
100
101 // Clean up memory for the next ieration.
102 seqalign = SeqAlignSetFree (seqalign);
103 bsp = BioseqFree (bsp);
10488
89 if (dst_buf[i].protein_type[0] == '\0') {
90 /*
91 * Read the sequence from the database by GI.
92 */
93 Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
94 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
95
96 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
97 "blastp",
98 REFDB,
99 options,
100 NULL,
101 &error_returns,
102 NULL);
103
104 /*
105 * BLAST reported an error. Write it out and continue processing.
106 */
107 if (error_returns != NULL)
108 {
109 printf ("Warning: An error has been reported by the NCBI Toolkit API "
110 "for sequence gi|%i: %s",
111 dst_buf[i].gi, BlastErrorToString (error_returns));
112
113 }
114
115 /*
116 * A hit was found. Record the first hit as the protein type.
117 * Skip the first 6 characters and eat the "lcl|x_".
118 */
119 else if (seqalign != NULL)
120 {
121 Char target_id_buf[BUFFER_LEN + 1];
122 SeqIdPtr target_id = SeqAlignId (seqalign, 1);
123 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
124 strncpy (dst_buf[i].protein_type, &target_id_buf[6],
125 sizeof (dst_buf[i].protein_type));
126 }
127
128 /*
129 * BLAST did not find any hits.
130 */
131 else
132 {
133 printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
134 dst_buf[i].gi);
135 }
136
137 /*
138 * Clean up memory for the next ieration.
139 */
140 seqalign = SeqAlignSetFree (seqalign);
141 bsp = BioseqFree (bsp);
142
143 /*
144 * Write the data out to the file.
145 */
146 if ( (i % 1000 == 0) && (i > 0) )
147 {
148 status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
149 dst_size, dst_offset, dst_sizes,
150 &dst_buf[i-1000]);
151 if (status < 0)
152 check_h5_error (status, __FILE__, __LINE__);
153
154 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
155 if (status < 0)
156 check_h5_error (status, __FILE__, __LINE__);
157
158 printf ("Processed %i of %i records.\n", i, (int)nrecords);
159 }
160 }
161
162 }
163
164 free (dst_buf);
165
105 options = BLASTOptionDelete (options);166 options = BLASTOptionDelete (options);
106167
107 return;168 return;
108}169}

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.