summaryrefslogtreecommitdiffstats
Side-by-side diff
-rw-r--r--src/assign/assign_protein_type.c269
1 files changed, 163 insertions, 106 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c
index 0a11c23..73685bb 100644
--- a/src/assign/assign_protein_type.c
+++ b/src/assign/assign_protein_type.c
@@ -1,27 +1,19 @@
+#define _GNU_SOURCE
#include "assign_protein_type.h"
-#include "error/check_ncbi_error.h"
#include "error/check_h5_error.h"
+#include "error/check_ncbi_error.h"
#include "model/gi_type_data.h"
#include "model/gi_type_data_init.h"
#include "model/sequence_data.h"
#include "model/sequence_data_init.h"
+#include <blast.h>
+#include <error.h>
+#include <hdf5_hl.h>
#include <ncbi.h>
#include <readdb.h>
-#include <blast.h>
#include <salpacc.h>
+#include <search.h>
#include <stdbool.h>
-#include <hdf5_hl.h>
-#include <error.h>
-
-/*
- * todo: Write results to an independent table by GI so that they can
- * be reused when the input tables influenza.faa and influenza_aa.dat
- * are replaced with new data. Since GI values are unique and
- * persistent the assignment of a protein type should stay valid
- * unless the list of reference sequences is updated.
- *
- * todo: Save the species type as a new field { A, B, C }.
- */
/*
* BLAST database containing all of the influenza protein sequences.
@@ -94,25 +86,75 @@ assign_protein_type (hid_t file_id)
/*
* Read the data from HDF5 gi_type_data.
*/
- hsize_t gi_nfields;
- hsize_t gi_nrecords;
- status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields,
- &gi_nrecords);
- if (status < 0)
- check_h5_error (status, __FILE__, __LINE__);
-
- gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
-
- size_t gi_size;
+ hsize_t gi_nfields = 0;
+ hsize_t gi_nrecords = 0;
+ size_t gi_size = 0;
size_t gi_offset[GI_TYPE_DATA_FIELD_NUM];
size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM];
hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
- status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
- gi_sizes, gi_buf);
- if (status < 0)
- check_h5_error (status, __FILE__, __LINE__);
+ gi_type_data* gi_buf = NULL;
+
+ if (H5LTfind_dataset (file_id, "gi_type_data") == 1)
+ {
+
+ printf ("Updating gi_type_data.\n");
+
+ status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields,
+ &gi_nrecords);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
+
+ status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
+ gi_sizes, gi_buf);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ }
+ else
+ {
+
+ printf ("Creating gi_type_data.\n");
+
+ gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords);
+
+ const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] =
+ GI_TYPE_DATA_FIELD_NAMES;
+
+ hsize_t chunk_size = 10;
+ int *fill_data = NULL;
+ int compress = 0;
+
+ status = H5TBmake_table ("gi_type_data", file_id,
+ "gi_type_data",
+ GI_TYPE_DATA_FIELD_NUM, faa_nrecords,
+ gi_size, gi_type_data_field_names,
+ gi_offset, gi_field_type,
+ chunk_size, fill_data, compress,
+ NULL);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ }
+
+ struct hsearch_data htab;
+ bzero (&htab, sizeof (htab));
+ hcreate_r (gi_nrecords * 2, &htab);
+ ENTRY e, *ep;
+
+ for (int i = 0; i < gi_nrecords; i++)
+ {
+ char gi_chr[25];
+ snprintf (gi_chr, 25, "%i", gi_buf[i].gi);
+ e.key = gi_chr;
+ e.data = &gi_buf[i];
+ if (hsearch_r (e, ENTER, &ep, &htab) == 0)
+ error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
+ "Allocation failed.");
+ }
/*
* Assign protein types to records for which the field is empty.
@@ -121,97 +163,109 @@ assign_protein_type (hid_t file_id)
bool updates_pending = false;
for (int i = 0; i < faa_nrecords; i++)
{
+
+ char gi_chr[25];
+ snprintf (gi_chr, 25, "%i", faa_buf[i].gi);
+ e.key = gi_chr;
+ if (hsearch_r (e, FIND, &ep, &htab) == 0)
+ {
- if (dst_buf[i].protein_type[0] == '\0') {
- /*
- * Read the sequence from the database by GI.
- */
- Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
- BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
- if (bsp == NULL)
- {
- error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
- "Unable to find BLAST record for gi|%i. Ensure the BLAST "
- "database is up-to-date with the HDF5 record set. See the "
- "BLAST formatdb.log file for details.\n",
- dst_buf[i].gi);
- }
-
- SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
- "blastp",
- REFDB,
- options,
- NULL,
- &error_returns,
- NULL);
-
- /*
- * BLAST reported an error. Write it out and continue processing.
- */
- if (error_returns != NULL)
- {
- printf ("Warning: An error has been reported by the NCBI Toolkit API "
- "for sequence gi|%i: %s",
- dst_buf[i].gi, BlastErrorToString (error_returns));
-
- }
-
- /*
- * A hit was found. Record the first hit as the protein type.
- * Skip the first 6 characters and eat the "lcl|x_".
- */
- else if (seqalign != NULL)
- {
- Char target_id_buf[BUFFER_LEN + 1];
- SeqIdPtr target_id = SeqAlignId (seqalign, 1);
- SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
-
- // Species Type
- dst_buf[i].type[0] = &target_id_buf[4];
- dst_buf[i].type[1] = '\0';
-
- // Protein Type
- strncpy (dst_buf[i].protein, &target_id_buf[6],
- sizeof (dst_buf[i].protein));
-
- updates_pending = true;
- }
-
- /*
- * BLAST did not find any hits.
- */
- else
- {
- printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
- dst_buf[i].gi);
- }
-
- /*
- * Clean up memory for the next ieration.
- */
- seqalign = SeqAlignSetFree (seqalign);
- bsp = BioseqFree (bsp);
-
- }
+ gi_buf[i].gi = faa_buf[i].gi;
+ gi_buf[i].type[0] = '\0';
+ gi_buf[i].protein[0] = '\0';
+
+ /*
+ * Read the sequence from the database by GI.
+ */
+ Int4 sequence_number = readdb_gi2seq (seqdb, faa_buf[i].gi, NULL);
+ BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
+ if (bsp == NULL)
+ {
+ error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
+ "Unable to find BLAST record for gi|%i. Ensure "
+ "the BLAST database is up-to-date with the HDF5 "
+ "record set. See the BLAST formatdb.log file "
+ "for details.\n",
+ faa_buf[i].gi);
+ }
+
+ SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
+ "blastp",
+ REFDB,
+ options,
+ NULL,
+ &error_returns,
+ NULL);
+
+ /*
+ * BLAST reported an error. Write it out and continue processing.
+ */
+ if (error_returns != NULL)
+ {
+ char *msg = BlastErrorToString (error_returns);
+ printf ("Warning: An error has been reported by the NCBI Toolkit "
+ "API for sequence gi|%i: %s",
+ faa_buf[i].gi, msg);
+ free (msg);
+ }
+
+ /*
+ * A hit was found. Record the first hit as the protein type.
+ * Skip the first 6 characters and eat the "lcl|x_".
+ */
+ else if (seqalign != NULL)
+ {
+ Char target_id_buf[BUFFER_LEN + 1];
+ SeqIdPtr target_id = SeqAlignId (seqalign, 1);
+ SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT,
+ BUFFER_LEN);
+ // Species Type
+ gi_buf[i].type[0] = target_id_buf[4];
+ gi_buf[i].type[1] = '\0';
+
+ // Protein Type
+ strncpy (gi_buf[i].protein, &target_id_buf[6],
+ sizeof (gi_buf[i].protein));
+
+ updates_pending = true;
+ }
+
+ /*
+ * BLAST did not find any hits.
+ */
+ else
+ {
+ printf ("Warning: Unable to identify protein type for sequence "
+ "gi|%i\n", faa_buf[i].gi);
+ }
+
+ /*
+ * Clean up memory for the next ieration.
+ */
+ seqalign = SeqAlignSetFree (seqalign);
+ bsp = BioseqFree (bsp);
+
+ }
+
/*
* Write the data out to the file.
*/
if ( (i % 1000 == 0) && (i > 0) && updates_pending)
{
- status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
- dst_size, dst_offset, dst_sizes,
- &dst_buf[i-1000]);
+ status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000,
+ gi_size, gi_offset, gi_sizes,
+ &gi_buf[i-1000]);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
-
+
updates_pending = false;
- printf ("Processed %i of %i records.\n", i, (int)nrecords);
+ printf ("Processed %i of %i records.\n", i, (int)faa_nrecords);
}
}
@@ -222,7 +276,8 @@ assign_protein_type (hid_t file_id)
*/
if (updates_pending)
{
- if ((int)nrecords < 1000)
+ /*
+ if ((int)faa_nrecords < 1000)
{
status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords,
dst_size, dst_offset, dst_sizes,
@@ -242,11 +297,13 @@ assign_protein_type (hid_t file_id)
check_h5_error (status, __FILE__, __LINE__);
updates_pending = false;
+ */
}
free (faa_buf);
free (gi_buf);
-
+ hdestroy_r (&htab);
+
options = BLASTOptionDelete (options);
return;

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.