#include "assign_protein_type.h" #include "check_ncbi_error.h" #include "check_h5_error.h" #include "sequence_data.h" #include "sequence_data_init.h" #include #include #include #include #include #include #include /* * BLAST database containing all of the influenza protein sequences. */ #define SEQDB "/home/don/exp004/influenzadb/influenzadb" /* * BLAST reference database of prototypical protein types. */ #define REFDB "/home/don/exp004/influenzadb/proteinnames" #define BUFFER_LEN 50 void assign_protein_type (hid_t file_id) { /* * Iterate through the records for which no protein type has been * assigned. Create a BioSeq Pointer to the data and then use this * as input to the BLAST run against the reference database of * prototypical sequence records. */ /* * Make BLAST messages reportable. */ ErrSetMessageLevel (SEV_WARNING); /* * Open the BLAST sequence database. */ ReadDBFILEPtr seqdb = readdb_new (SEQDB, true); /* * Get default BLAST options. */ BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); /* * Set to single hit mode. */ options->window_size = 0; options->multiple_hits_only = false; ValNodePtr error_returns = NULL; /* * Read the data from HDF5 file. */ hsize_t nfields; hsize_t nrecords; herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields, &nrecords); if (status < 0) check_h5_error (status, __FILE__, __LINE__); sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords); size_t dst_size; size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM]; hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, dst_sizes, dst_buf); if (status < 0) check_h5_error (status, __FILE__, __LINE__); /* * Assign protein types to records for which the field is empty. */ printf ("Records to process: %i\n", (int)nrecords); bool updates_pending = false; for (int i = 0; i < nrecords; i++) { if (dst_buf[i].protein_type[0] == '\0') { /* * Read the sequence from the database by GI. */ Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); if (bsp == NULL) { error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, "Unable to find BLAST record for gi|%i. Ensure the BLAST " "database is up-to-date with the HDF5 record set. See the " "BLAST formatdb.log file for details.\n", dst_buf[i].gi); } SeqAlignPtr seqalign = BioseqBlastEngine (bsp, "blastp", REFDB, options, NULL, &error_returns, NULL); /* * BLAST reported an error. Write it out and continue processing. */ if (error_returns != NULL) { printf ("Warning: An error has been reported by the NCBI Toolkit API " "for sequence gi|%i: %s", dst_buf[i].gi, BlastErrorToString (error_returns)); } /* * A hit was found. Record the first hit as the protein type. * Skip the first 6 characters and eat the "lcl|x_". */ else if (seqalign != NULL) { Char target_id_buf[BUFFER_LEN + 1]; SeqIdPtr target_id = SeqAlignId (seqalign, 1); SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); strncpy (dst_buf[i].protein_type, &target_id_buf[6], sizeof (dst_buf[i].protein_type)); updates_pending = true; } /* * BLAST did not find any hits. */ else { printf ("Warning: Unable to identify protein type for sequence gi|%i\n", dst_buf[i].gi); } /* * Clean up memory for the next ieration. */ seqalign = SeqAlignSetFree (seqalign); bsp = BioseqFree (bsp); } /* * Write the data out to the file. */ if ( (i % 1000 == 0) && (i > 0) && updates_pending) { status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, dst_size, dst_offset, dst_sizes, &dst_buf[i-1000]); if (status < 0) check_h5_error (status, __FILE__, __LINE__); status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); if (status < 0) check_h5_error (status, __FILE__, __LINE__); updates_pending = false; printf ("Processed %i of %i records.\n", i, (int)nrecords); } } /* * Write out records from the last bin if it was less than 1000 * records in size. */ if (updates_pending) { if ((int)nrecords < 1000) { status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, dst_size, dst_offset, dst_sizes, dst_buf); } else { status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000, dst_size, dst_offset, dst_sizes, &dst_buf[nrecords-1000]); } if (status < 0) check_h5_error (status, __FILE__, __LINE__); status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); if (status < 0) check_h5_error (status, __FILE__, __LINE__); updates_pending = false; } free (dst_buf); options = BLASTOptionDelete (options); return; }