From 0871f6cf645c20673e45946c3ba3ddaa2ffb47aa Mon Sep 17 00:00:00 2001 From: Don Pellegrino Date: Mon, 18 Jan 2010 18:46:10 +0000 Subject: Added error handling for the case where a GI record is not found in the BLAST database. Added writing of the last bin of record updates to the file. --- diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c index 166e787..ec3a959 100644 --- a/src/assign_protein_type.c +++ b/src/assign_protein_type.c @@ -83,6 +83,7 @@ assign_protein_type (hid_t file_id) * Assign protein types to records for which the field is empty. */ printf ("Records to process: %i\n", (int)nrecords); + bool updates_pending = false; for (int i = 0; i < nrecords; i++) { @@ -92,6 +93,14 @@ assign_protein_type (hid_t file_id) */ Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); + if (bsp == NULL) + { + error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, + "Unable to find BLAST record for gi|%i. Ensure the BLAST " + "database is up-to-date with the HDF5 record set. See the " + "BLAST formatdb.log file for details.\n", + dst_buf[i].gi); + } SeqAlignPtr seqalign = BioseqBlastEngine (bsp, "blastp", @@ -123,6 +132,7 @@ assign_protein_type (hid_t file_id) SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); strncpy (dst_buf[i].protein_type, &target_id_buf[6], sizeof (dst_buf[i].protein_type)); + updates_pending = true; } /* @@ -140,25 +150,56 @@ assign_protein_type (hid_t file_id) seqalign = SeqAlignSetFree (seqalign); bsp = BioseqFree (bsp); - /* - * Write the data out to the file. - */ - if ( (i % 1000 == 0) && (i > 0) ) - { - status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, - dst_size, dst_offset, dst_sizes, - &dst_buf[i-1000]); - if (status < 0) - check_h5_error (status, __FILE__, __LINE__); + } - status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); - if (status < 0) - check_h5_error (status, __FILE__, __LINE__); + /* + * Write the data out to the file. + */ + if ( (i % 1000 == 0) && (i > 0) && updates_pending) + { + status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, + dst_size, dst_offset, dst_sizes, + &dst_buf[i-1000]); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + updates_pending = false; + + printf ("Processed %i of %i records.\n", i, (int)nrecords); + } + + } - printf ("Processed %i of %i records.\n", i, (int)nrecords); - } - } + /* + * Write out records from the last bin if it was less than 1000 + * records in size. + */ + if (updates_pending) + { + if ((int)nrecords < 1000) + { + status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, + dst_size, dst_offset, dst_sizes, + dst_buf); + } + else + { + status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000, + dst_size, dst_offset, dst_sizes, + &dst_buf[nrecords-1000]); + } + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + updates_pending = false; } free (dst_buf); -- cgit v0.8.3.1-22-g547a