summaryrefslogtreecommitdiffstats
Side-by-side diff
-rw-r--r--analysis/year.R20
-rw-r--r--src/Makefile.am39
-rw-r--r--src/aggregator.c6
-rw-r--r--src/assign/assign_protein_type.c (renamed from src/assign_protein_type.c)87
-rw-r--r--src/assign/assign_protein_type.h (renamed from src/assign_protein_type.h)0
-rw-r--r--src/error/check_error.c (renamed from src/check_error.c)0
-rw-r--r--src/error/check_error.h (renamed from src/check_error.h)0
-rw-r--r--src/error/check_h5_error.c (renamed from src/check_h5_error.c)0
-rw-r--r--src/error/check_h5_error.h (renamed from src/check_h5_error.h)0
-rw-r--r--src/error/check_ncbi_error.c (renamed from src/check_ncbi_error.c)0
-rw-r--r--src/error/check_ncbi_error.h (renamed from src/check_ncbi_error.h)0
-rw-r--r--src/load/load_influenza_aa_dat.c (renamed from src/load_influenza_aa_dat.c)4
-rw-r--r--src/load/load_influenza_aa_dat.h (renamed from src/load_influenza_aa_dat.h)0
-rw-r--r--src/load/load_influenza_faa.c (renamed from src/load_influenza_faa.c)10
-rw-r--r--src/load/load_influenza_faa.h (renamed from src/load_influenza_faa.h)0
-rw-r--r--src/model/gi_type_data.h21
-rw-r--r--src/model/gi_type_data_init.c36
-rw-r--r--src/model/gi_type_data_init.h14
-rw-r--r--src/model/sequence_data.h (renamed from src/sequence_data.h)5
-rw-r--r--src/model/sequence_data_init.c (renamed from src/sequence_data_init.c)6
-rw-r--r--src/model/sequence_data_init.h (renamed from src/sequence_data_init.h)0
-rw-r--r--src/updator.c4
22 files changed, 181 insertions, 71 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c
new file mode 100644
index 0000000..0a11c23
--- a/dev/null
+++ b/src/assign/assign_protein_type.c
@@ -0,0 +1,253 @@
+#include "assign_protein_type.h"
+#include "error/check_ncbi_error.h"
+#include "error/check_h5_error.h"
+#include "model/gi_type_data.h"
+#include "model/gi_type_data_init.h"
+#include "model/sequence_data.h"
+#include "model/sequence_data_init.h"
+#include <ncbi.h>
+#include <readdb.h>
+#include <blast.h>
+#include <salpacc.h>
+#include <stdbool.h>
+#include <hdf5_hl.h>
+#include <error.h>
+
+/*
+ * todo: Write results to an independent table by GI so that they can
+ * be reused when the input tables influenza.faa and influenza_aa.dat
+ * are replaced with new data. Since GI values are unique and
+ * persistent the assignment of a protein type should stay valid
+ * unless the list of reference sequences is updated.
+ *
+ * todo: Save the species type as a new field { A, B, C }.
+ */
+
+/*
+ * BLAST database containing all of the influenza protein sequences.
+ */
+#define SEQDB "/home/don/exp004/influenzadb/influenzadb"
+
+/*
+ * BLAST reference database of prototypical protein types.
+ */
+#define REFDB "/home/don/exp004/influenzadb/proteinnames"
+
+#define BUFFER_LEN 50
+
+void
+assign_protein_type (hid_t file_id)
+{
+ /*
+ * Iterate through the records for which no protein type has been
+ * assigned. Create a BioSeq Pointer to the data and then use this
+ * as input to the BLAST run against the reference database of
+ * prototypical sequence records.
+ */
+
+ /*
+ * Make BLAST messages reportable.
+ */
+ ErrSetMessageLevel (SEV_WARNING);
+
+ /*
+ * Open the BLAST sequence database.
+ */
+ ReadDBFILEPtr seqdb = readdb_new (SEQDB, true);
+
+ /*
+ * Get default BLAST options.
+ */
+ BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
+
+ /*
+ * Set to single hit mode.
+ */
+ options->window_size = 0;
+ options->multiple_hits_only = false;
+
+ ValNodePtr error_returns = NULL;
+
+ /*
+ * Read the data from HDF5 influenza.faa.
+ */
+ hsize_t faa_nfields;
+ hsize_t faa_nrecords;
+ herr_t status = H5TBget_table_info (file_id, "influenza.faa", &faa_nfields,
+ &faa_nrecords);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords);
+
+ size_t faa_size;
+ size_t faa_offset[SEQUENCE_DATA_FIELD_NUM];
+ size_t faa_sizes[SEQUENCE_DATA_FIELD_NUM];
+ hid_t faa_field_type[SEQUENCE_DATA_FIELD_NUM];
+ sequence_data_init (&faa_size, faa_offset, faa_sizes, faa_field_type);
+
+ status = H5TBread_table (file_id, "influenza.faa", faa_size, faa_offset,
+ faa_sizes, faa_buf);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ /*
+ * Read the data from HDF5 gi_type_data.
+ */
+ hsize_t gi_nfields;
+ hsize_t gi_nrecords;
+ status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields,
+ &gi_nrecords);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
+
+ size_t gi_size;
+ size_t gi_offset[GI_TYPE_DATA_FIELD_NUM];
+ size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM];
+ hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
+ gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
+
+ status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
+ gi_sizes, gi_buf);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ /*
+ * Assign protein types to records for which the field is empty.
+ */
+ printf ("Records to process: %i\n", (int)faa_nrecords);
+ bool updates_pending = false;
+ for (int i = 0; i < faa_nrecords; i++)
+ {
+
+ if (dst_buf[i].protein_type[0] == '\0') {
+ /*
+ * Read the sequence from the database by GI.
+ */
+ Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
+ BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
+ if (bsp == NULL)
+ {
+ error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
+ "Unable to find BLAST record for gi|%i. Ensure the BLAST "
+ "database is up-to-date with the HDF5 record set. See the "
+ "BLAST formatdb.log file for details.\n",
+ dst_buf[i].gi);
+ }
+
+ SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
+ "blastp",
+ REFDB,
+ options,
+ NULL,
+ &error_returns,
+ NULL);
+
+ /*
+ * BLAST reported an error. Write it out and continue processing.
+ */
+ if (error_returns != NULL)
+ {
+ printf ("Warning: An error has been reported by the NCBI Toolkit API "
+ "for sequence gi|%i: %s",
+ dst_buf[i].gi, BlastErrorToString (error_returns));
+
+ }
+
+ /*
+ * A hit was found. Record the first hit as the protein type.
+ * Skip the first 6 characters and eat the "lcl|x_".
+ */
+ else if (seqalign != NULL)
+ {
+ Char target_id_buf[BUFFER_LEN + 1];
+ SeqIdPtr target_id = SeqAlignId (seqalign, 1);
+ SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
+
+ // Species Type
+ dst_buf[i].type[0] = &target_id_buf[4];
+ dst_buf[i].type[1] = '\0';
+
+ // Protein Type
+ strncpy (dst_buf[i].protein, &target_id_buf[6],
+ sizeof (dst_buf[i].protein));
+
+ updates_pending = true;
+ }
+
+ /*
+ * BLAST did not find any hits.
+ */
+ else
+ {
+ printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
+ dst_buf[i].gi);
+ }
+
+ /*
+ * Clean up memory for the next ieration.
+ */
+ seqalign = SeqAlignSetFree (seqalign);
+ bsp = BioseqFree (bsp);
+
+ }
+
+ /*
+ * Write the data out to the file.
+ */
+ if ( (i % 1000 == 0) && (i > 0) && updates_pending)
+ {
+ status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
+ dst_size, dst_offset, dst_sizes,
+ &dst_buf[i-1000]);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ updates_pending = false;
+
+ printf ("Processed %i of %i records.\n", i, (int)nrecords);
+ }
+
+ }
+
+ /*
+ * Write out records from the last bin if it was less than 1000
+ * records in size.
+ */
+ if (updates_pending)
+ {
+ if ((int)nrecords < 1000)
+ {
+ status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords,
+ dst_size, dst_offset, dst_sizes,
+ dst_buf);
+ }
+ else
+ {
+ status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000,
+ dst_size, dst_offset, dst_sizes,
+ &dst_buf[nrecords-1000]);
+ }
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ updates_pending = false;
+ }
+
+ free (faa_buf);
+ free (gi_buf);
+
+ options = BLASTOptionDelete (options);
+
+ return;
+}

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.