-rw-r--r-- | src/assign/assign_protein_type.c | 269 |
1 files changed, 163 insertions, 106 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c index 0a11c23..73685bb 100644 --- a/src/assign/assign_protein_type.c +++ b/src/assign/assign_protein_type.c @@ -1,27 +1,19 @@ +#define _GNU_SOURCE #include "assign_protein_type.h" -#include "error/check_ncbi_error.h" #include "error/check_h5_error.h" +#include "error/check_ncbi_error.h" #include "model/gi_type_data.h" #include "model/gi_type_data_init.h" #include "model/sequence_data.h" #include "model/sequence_data_init.h" +#include <blast.h> +#include <error.h> +#include <hdf5_hl.h> #include <ncbi.h> #include <readdb.h> -#include <blast.h> #include <salpacc.h> +#include <search.h> #include <stdbool.h> -#include <hdf5_hl.h> -#include <error.h> - -/* - * todo: Write results to an independent table by GI so that they can - * be reused when the input tables influenza.faa and influenza_aa.dat - * are replaced with new data. Since GI values are unique and - * persistent the assignment of a protein type should stay valid - * unless the list of reference sequences is updated. - * - * todo: Save the species type as a new field { A, B, C }. - */ /* * BLAST database containing all of the influenza protein sequences. @@ -94,25 +86,75 @@ assign_protein_type (hid_t file_id) /* * Read the data from HDF5 gi_type_data. */ - hsize_t gi_nfields; - hsize_t gi_nrecords; - status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields, - &gi_nrecords); - if (status < 0) - check_h5_error (status, __FILE__, __LINE__); - - gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); - - size_t gi_size; + hsize_t gi_nfields = 0; + hsize_t gi_nrecords = 0; + size_t gi_size = 0; size_t gi_offset[GI_TYPE_DATA_FIELD_NUM]; size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM]; hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); - status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, - gi_sizes, gi_buf); - if (status < 0) - check_h5_error (status, __FILE__, __LINE__); + gi_type_data* gi_buf = NULL; + + if (H5LTfind_dataset (file_id, "gi_type_data") == 1) + { + + printf ("Updating gi_type_data.\n"); + + status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields, + &gi_nrecords); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); + + status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, + gi_sizes, gi_buf); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + } + else + { + + printf ("Creating gi_type_data.\n"); + + gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords); + + const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] = + GI_TYPE_DATA_FIELD_NAMES; + + hsize_t chunk_size = 10; + int *fill_data = NULL; + int compress = 0; + + status = H5TBmake_table ("gi_type_data", file_id, + "gi_type_data", + GI_TYPE_DATA_FIELD_NUM, faa_nrecords, + gi_size, gi_type_data_field_names, + gi_offset, gi_field_type, + chunk_size, fill_data, compress, + NULL); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + } + + struct hsearch_data htab; + bzero (&htab, sizeof (htab)); + hcreate_r (gi_nrecords * 2, &htab); + ENTRY e, *ep; + + for (int i = 0; i < gi_nrecords; i++) + { + char gi_chr[25]; + snprintf (gi_chr, 25, "%i", gi_buf[i].gi); + e.key = gi_chr; + e.data = &gi_buf[i]; + if (hsearch_r (e, ENTER, &ep, &htab) == 0) + error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, + "Allocation failed."); + } /* * Assign protein types to records for which the field is empty. @@ -121,97 +163,109 @@ assign_protein_type (hid_t file_id) bool updates_pending = false; for (int i = 0; i < faa_nrecords; i++) { + + char gi_chr[25]; + snprintf (gi_chr, 25, "%i", faa_buf[i].gi); + e.key = gi_chr; + if (hsearch_r (e, FIND, &ep, &htab) == 0) + { - if (dst_buf[i].protein_type[0] == '\0') { - /* - * Read the sequence from the database by GI. - */ - Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); - BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); - if (bsp == NULL) - { - error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, - "Unable to find BLAST record for gi|%i. Ensure the BLAST " - "database is up-to-date with the HDF5 record set. See the " - "BLAST formatdb.log file for details.\n", - dst_buf[i].gi); - } - - SeqAlignPtr seqalign = BioseqBlastEngine (bsp, - "blastp", - REFDB, - options, - NULL, - &error_returns, - NULL); - - /* - * BLAST reported an error. Write it out and continue processing. - */ - if (error_returns != NULL) - { - printf ("Warning: An error has been reported by the NCBI Toolkit API " - "for sequence gi|%i: %s", - dst_buf[i].gi, BlastErrorToString (error_returns)); - - } - - /* - * A hit was found. Record the first hit as the protein type. - * Skip the first 6 characters and eat the "lcl|x_". - */ - else if (seqalign != NULL) - { - Char target_id_buf[BUFFER_LEN + 1]; - SeqIdPtr target_id = SeqAlignId (seqalign, 1); - SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); - - // Species Type - dst_buf[i].type[0] = &target_id_buf[4]; - dst_buf[i].type[1] = '\0'; - - // Protein Type - strncpy (dst_buf[i].protein, &target_id_buf[6], - sizeof (dst_buf[i].protein)); - - updates_pending = true; - } - - /* - * BLAST did not find any hits. - */ - else - { - printf ("Warning: Unable to identify protein type for sequence gi|%i\n", - dst_buf[i].gi); - } - - /* - * Clean up memory for the next ieration. - */ - seqalign = SeqAlignSetFree (seqalign); - bsp = BioseqFree (bsp); - - } + gi_buf[i].gi = faa_buf[i].gi; + gi_buf[i].type[0] = '\0'; + gi_buf[i].protein[0] = '\0'; + + /* + * Read the sequence from the database by GI. + */ + Int4 sequence_number = readdb_gi2seq (seqdb, faa_buf[i].gi, NULL); + BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); + if (bsp == NULL) + { + error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, + "Unable to find BLAST record for gi|%i. Ensure " + "the BLAST database is up-to-date with the HDF5 " + "record set. See the BLAST formatdb.log file " + "for details.\n", + faa_buf[i].gi); + } + + SeqAlignPtr seqalign = BioseqBlastEngine (bsp, + "blastp", + REFDB, + options, + NULL, + &error_returns, + NULL); + + /* + * BLAST reported an error. Write it out and continue processing. + */ + if (error_returns != NULL) + { + char *msg = BlastErrorToString (error_returns); + printf ("Warning: An error has been reported by the NCBI Toolkit " + "API for sequence gi|%i: %s", + faa_buf[i].gi, msg); + free (msg); + } + + /* + * A hit was found. Record the first hit as the protein type. + * Skip the first 6 characters and eat the "lcl|x_". + */ + else if (seqalign != NULL) + { + Char target_id_buf[BUFFER_LEN + 1]; + SeqIdPtr target_id = SeqAlignId (seqalign, 1); + SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, + BUFFER_LEN); + // Species Type + gi_buf[i].type[0] = target_id_buf[4]; + gi_buf[i].type[1] = '\0'; + + // Protein Type + strncpy (gi_buf[i].protein, &target_id_buf[6], + sizeof (gi_buf[i].protein)); + + updates_pending = true; + } + + /* + * BLAST did not find any hits. + */ + else + { + printf ("Warning: Unable to identify protein type for sequence " + "gi|%i\n", faa_buf[i].gi); + } + + /* + * Clean up memory for the next ieration. + */ + seqalign = SeqAlignSetFree (seqalign); + bsp = BioseqFree (bsp); + + } + /* * Write the data out to the file. */ if ( (i % 1000 == 0) && (i > 0) && updates_pending) { - status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, - dst_size, dst_offset, dst_sizes, - &dst_buf[i-1000]); + status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000, + gi_size, gi_offset, gi_sizes, + &gi_buf[i-1000]); if (status < 0) check_h5_error (status, __FILE__, __LINE__); status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); if (status < 0) check_h5_error (status, __FILE__, __LINE__); - + updates_pending = false; - printf ("Processed %i of %i records.\n", i, (int)nrecords); + printf ("Processed %i of %i records.\n", i, (int)faa_nrecords); } } @@ -222,7 +276,8 @@ assign_protein_type (hid_t file_id) */ if (updates_pending) { - if ((int)nrecords < 1000) + /* + if ((int)faa_nrecords < 1000) { status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, dst_size, dst_offset, dst_sizes, @@ -242,11 +297,13 @@ assign_protein_type (hid_t file_id) check_h5_error (status, __FILE__, __LINE__); updates_pending = false; + */ } free (faa_buf); free (gi_buf); - + hdestroy_r (&htab); + options = BLASTOptionDelete (options); return; |