#define _GNU_SOURCE #include "assign_protein_type.h" #include "error/check_h5_error.h" #include "error/check_ncbi_error.h" #include "model/gi_type_data.h" #include "model/gi_type_data_init.h" #include "model/sequence_data.h" #include "model/sequence_data_init.h" #include #include #include #include #include #include #include #include /* * BLAST database containing all of the influenza protein sequences. */ #define SEQDB "/home/don/exp004/influenzadb/influenzadb" /* * BLAST reference database of prototypical protein types. */ #define REFDB "/home/don/exp004/influenzadb/proteinnames" #define BUFFER_LEN 50 void assign_protein_type (hid_t file_id) { /* * Iterate through the records for which no protein type has been * assigned. Create a BioSeq Pointer to the data and then use this * as input to the BLAST run against the reference database of * prototypical sequence records. */ /* * Make BLAST messages reportable. */ ErrSetMessageLevel (SEV_WARNING); /* * Open the BLAST sequence database. */ ReadDBFILEPtr seqdb = readdb_new (SEQDB, true); /* * Get default BLAST options. */ BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); /* * Set to single hit mode. */ options->window_size = 0; options->multiple_hits_only = false; ValNodePtr error_returns = NULL; /* * Read the data from HDF5 influenza.faa. */ hsize_t faa_nfields; hsize_t faa_nrecords; herr_t status = H5TBget_table_info (file_id, "influenza.faa", &faa_nfields, &faa_nrecords); if (status < 0) check_h5_error (status, __FILE__, __LINE__); sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords); size_t faa_size; size_t faa_offset[SEQUENCE_DATA_FIELD_NUM]; size_t faa_sizes[SEQUENCE_DATA_FIELD_NUM]; hid_t faa_field_type[SEQUENCE_DATA_FIELD_NUM]; sequence_data_init (&faa_size, faa_offset, faa_sizes, faa_field_type); status = H5TBread_table (file_id, "influenza.faa", faa_size, faa_offset, faa_sizes, faa_buf); if (status < 0) check_h5_error (status, __FILE__, __LINE__); /* * Read the data from HDF5 gi_type_data. */ hsize_t gi_nfields = 0; hsize_t gi_nrecords = 0; size_t gi_size = 0; size_t gi_offset[GI_TYPE_DATA_FIELD_NUM]; size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM]; hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); gi_type_data* gi_buf = NULL; if (H5LTfind_dataset (file_id, "gi_type_data") == 1) { printf ("Updating gi_type_data.\n"); status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields, &gi_nrecords); if (status < 0) check_h5_error (status, __FILE__, __LINE__); gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, gi_sizes, gi_buf); if (status < 0) check_h5_error (status, __FILE__, __LINE__); } else { printf ("Creating gi_type_data.\n"); gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords); const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] = GI_TYPE_DATA_FIELD_NAMES; hsize_t chunk_size = 10; int *fill_data = NULL; int compress = 0; status = H5TBmake_table ("gi_type_data", file_id, "gi_type_data", GI_TYPE_DATA_FIELD_NUM, faa_nrecords, gi_size, gi_type_data_field_names, gi_offset, gi_field_type, chunk_size, fill_data, compress, NULL); if (status < 0) check_h5_error (status, __FILE__, __LINE__); } struct hsearch_data htab; bzero (&htab, sizeof (htab)); hcreate_r (gi_nrecords * 2, &htab); ENTRY e, *ep; for (int i = 0; i < gi_nrecords; i++) { char gi_chr[25]; snprintf (gi_chr, 25, "%i", gi_buf[i].gi); e.key = gi_chr; e.data = &gi_buf[i]; if (hsearch_r (e, ENTER, &ep, &htab) == 0) error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, "Allocation failed."); } /* * Assign protein types to records for which the field is empty. */ printf ("Records to process: %i\n", (int)faa_nrecords); bool updates_pending = false; for (int i = 0; i < faa_nrecords; i++) { char gi_chr[25]; snprintf (gi_chr, 25, "%i", faa_buf[i].gi); e.key = gi_chr; if (hsearch_r (e, FIND, &ep, &htab) == 0) { gi_buf[i].gi = faa_buf[i].gi; gi_buf[i].type[0] = '\0'; gi_buf[i].protein[0] = '\0'; /* * Read the sequence from the database by GI. */ Int4 sequence_number = readdb_gi2seq (seqdb, faa_buf[i].gi, NULL); BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); if (bsp == NULL) { error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, "Unable to find BLAST record for gi|%i. Ensure " "the BLAST database is up-to-date with the HDF5 " "record set. See the BLAST formatdb.log file " "for details.\n", faa_buf[i].gi); } SeqAlignPtr seqalign = BioseqBlastEngine (bsp, "blastp", REFDB, options, NULL, &error_returns, NULL); /* * BLAST reported an error. Write it out and continue processing. */ if (error_returns != NULL) { char *msg = BlastErrorToString (error_returns); printf ("Warning: An error has been reported by the NCBI Toolkit " "API for sequence gi|%i: %s", faa_buf[i].gi, msg); free (msg); } /* * A hit was found. Record the first hit as the protein type. * Skip the first 6 characters and eat the "lcl|x_". */ else if (seqalign != NULL) { Char target_id_buf[BUFFER_LEN + 1]; SeqIdPtr target_id = SeqAlignId (seqalign, 1); SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); // Species Type gi_buf[i].type[0] = target_id_buf[4]; gi_buf[i].type[1] = '\0'; // Protein Type strncpy (gi_buf[i].protein, &target_id_buf[6], sizeof (gi_buf[i].protein)); updates_pending = true; } /* * BLAST did not find any hits. */ else { printf ("Warning: Unable to identify protein type for sequence " "gi|%i\n", faa_buf[i].gi); } /* * Clean up memory for the next ieration. */ seqalign = SeqAlignSetFree (seqalign); bsp = BioseqFree (bsp); } /* * Write the data out to the file. */ if ( (i % 1000 == 0) && (i > 0) && updates_pending) { status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000, gi_size, gi_offset, gi_sizes, &gi_buf[i-1000]); if (status < 0) check_h5_error (status, __FILE__, __LINE__); status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); if (status < 0) check_h5_error (status, __FILE__, __LINE__); updates_pending = false; printf ("Processed %i of %i records.\n", i, (int)faa_nrecords); } } /* * Write out records from the last bin if it was less than 1000 * records in size. */ if (updates_pending) { /* if ((int)faa_nrecords < 1000) { status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, dst_size, dst_offset, dst_sizes, dst_buf); } else { status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000, dst_size, dst_offset, dst_sizes, &dst_buf[nrecords-1000]); } if (status < 0) check_h5_error (status, __FILE__, __LINE__); status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); if (status < 0) check_h5_error (status, __FILE__, __LINE__); updates_pending = false; */ } free (faa_buf); free (gi_buf); hdestroy_r (&htab); options = BLASTOptionDelete (options); return; }