From b2e4ace6b0265f4575733bfaa38519167074c477 Mon Sep 17 00:00:00 2001 From: Don Pellegrino Date: Sun, 17 Jan 2010 23:44:57 +0000 Subject: Added error checking routines. Added implementation of assign_protein_type tested against a single hard-coded sequence input. The next step is to iterate over all records in the HDF5 collection that don't have a type assigned and to assign a type value. --- diff --git a/configure.ac b/configure.ac index 3104df8..f45958c 100644 --- a/configure.ac +++ b/configure.ac @@ -24,4 +24,24 @@ AC_CHECK_HEADERS([mpi.h],[], AC_CHECK_HEADERS([hdf5.h],[], [AC_MSG_ERROR(The HDF5 headers are needed to build the system.)]) +######################## +# MODULE: NCBI Toolkit # +######################## + +# Check for the NCBI ToolBox libraries. +AC_SEARCH_LIBS([log10],[m],[], +[AC_MSG_ERROR(The C Math Library is needed to build the system.)]) + +AC_SEARCH_LIBS([NlmThreadsAvailable],[ncbi],[], +[AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)]) + +AC_SEARCH_LIBS([SeqAlignNew],[ncbiobj],[], +[AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)]) + +AC_SEARCH_LIBS([Blast_RedoOneMatch],[blastcompadj],[], +[AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)]) + +AC_SEARCH_LIBS([BioseqBlastEngine],[ncbitool],[], +[AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)]) + AC_OUTPUT diff --git a/src/Makefile.am b/src/Makefile.am index 8098f33..fcbcdd5 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -3,6 +3,9 @@ bin_PROGRAMS = aggregator aggregator_SOURCES = \ aggregator.c \ assign_protein_type.c \ + check_error.c \ + check_h5_error.c \ + check_ncbi_error.c \ load_influenza_aa_dat.c \ load_influenza_faa.c @@ -10,6 +13,9 @@ aggregator_LDADD = -lhdf5 noinst_HEADERS = \ assign_protein_type.h \ + check_error.h \ + check_h5_error.h \ + check_ncbi_error.h \ load_influenza_aa_dat.h \ load_influenza_faa.h diff --git a/src/aggregator.c b/src/aggregator.c index 228e5c6..20da6df 100644 --- a/src/aggregator.c +++ b/src/aggregator.c @@ -3,6 +3,8 @@ * container. */ +#include "assign_protein_type.h" +#include "check_h5_error.h" #include "load_influenza_aa_dat.h" #include "load_influenza_faa.h" @@ -14,22 +16,26 @@ main() /* * Create the HDF5 file. */ - hid_t file_id = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); + // hid_t file_id = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); /* * Load the supplementary protein data file. */ - load_influenza_aa_dat (file_id); + // load_influenza_aa_dat (file_id); /* * Load the FASTA protein sequence data file. */ - load_influenza_faa (file_id); + // load_influenza_faa (file_id); /* * Close the HD5 file. */ - H5Fclose (file_id); + // herr_t status = H5Fclose (file_id); + // if (status < 0) + // check_h5_error (status, __FILE__, __LINE__); + + assign_protein_type (0); return 0; } diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c index 2ba59a7..643ea3f 100644 --- a/src/assign_protein_type.c +++ b/src/assign_protein_type.c @@ -1,8 +1,80 @@ #include "assign_protein_type.h" +#include "check_ncbi_error.h" +#include +#include +#include +#include +#include + +/* + * BLAST database containing all of the influenza protein sequences. + */ +#define SEQDB "/home/don/exp004/influenzadb/influenzadb" + +/* + * BLAST reference database of prototypical protein types. + */ +#define REFDB "/home/don/exp004/influenzadb/proteinnames" + +#define BUFFER_LEN 50 void assign_protein_type (hid_t file_id) { + /* + * Iterate through the records for which no protein type has been + * assigned. Create a BioSeq Pointer to the data and then use this + * as input to the BLAST run against the reference database of + * prototypical sequence records. + */ + + /* + * Make BLAST messages reportable. + */ + ErrSetMessageLevel (SEV_WARNING); + + /* + * Open the BLAST sequence database. + */ + ReadDBFILEPtr seqdb = readdb_new (SEQDB, true); + + /* + * Get default BLAST options. + */ + BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); + ValNodePtr error_returns = NULL; + + /* + * Read the sequence from the database by GI. + */ + Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL); + BioseqPtr bsp = readdb_get_bioseq(seqdb, sequence_number); + + SeqAlignPtr seqalign = BioseqBlastEngine (bsp, + "blastp", + REFDB, + options, + NULL, + &error_returns, + NULL); + + if (error_returns != NULL) + check_ncbi_error (error_returns, __FILE__, __LINE__); + + if (seqalign != NULL) + { + Char target_id_buf[BUFFER_LEN + 1]; + SeqIdPtr target_id = SeqAlignId (seqalign, 1); + SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); + printf ("%s\n", + target_id_buf); + } + + // Clean up memory for the next ieration. + seqalign = SeqAlignSetFree (seqalign); + bsp = BioseqFree (bsp); + + options = BLASTOptionDelete (options); return; } diff --git a/src/assign_protein_type.h b/src/assign_protein_type.h index 312b774..1dfb8e6 100644 --- a/src/assign_protein_type.h +++ b/src/assign_protein_type.h @@ -4,7 +4,11 @@ #include /* - * Determine the protein type for each protein sequence record. + * Determine the protein type for each protein sequence record. The + * technique used by NCBI is used here. A BLAST database of + * prototypical protein sequences serves as the reference. Each input + * sequence is BLASTed against this database. The first hit is used + * to assign a protein type to sequence. */ void assign_protein_type (hid_t file_id); diff --git a/src/check_error.c b/src/check_error.c new file mode 100644 index 0000000..70c62c4 --- a/dev/null +++ b/src/check_error.c @@ -0,0 +1,14 @@ +#include "check_error.h" +#include +#include +#include + +void +check_error (const char *filename, const unsigned int linenum) +{ + if (errno) + error_at_line (EXIT_FAILURE, errno, filename, linenum, + "An error has been detected within the application."); + + return; +} diff --git a/src/check_error.h b/src/check_error.h new file mode 100644 index 0000000..33acc63 --- a/dev/null +++ b/src/check_error.h @@ -0,0 +1,11 @@ +#ifndef CHECK_ERROR_H +#define CHECK_ERROR_H + +/* + * Check the error state. Reports and error message and exits if an + * error has occured. + */ +void +check_error (const char *filename, unsigned int linenum); + +#endif // CHECK_ERROR_H diff --git a/src/check_h5_error.c b/src/check_h5_error.c new file mode 100644 index 0000000..30fc87c --- a/dev/null +++ b/src/check_h5_error.c @@ -0,0 +1,12 @@ +#include "check_h5_error.h" +#include +#include + +void +check_h5_error (herr_t status, const char* filename, unsigned int linenum) +{ + error_at_line (EXIT_FAILURE, 0, filename, linenum, + "An error has been reported by the HDF5 API."); + + return; +} diff --git a/src/check_h5_error.h b/src/check_h5_error.h new file mode 100644 index 0000000..74730cd --- a/dev/null +++ b/src/check_h5_error.h @@ -0,0 +1,12 @@ +#ifndef CHECK_H5_ERROR_H +#define CHECK_H5_ERROR_H + +#include + +/* + * Handle errors from the HDF5 API. + */ +void +check_h5_error (herr_t status, const char* filename, unsigned int linenum); + +#endif // CHECK_H5_ERROR_H diff --git a/src/check_ncbi_error.c b/src/check_ncbi_error.c new file mode 100644 index 0000000..3caa7a9 --- a/dev/null +++ b/src/check_ncbi_error.c @@ -0,0 +1,13 @@ +#include "check_ncbi_error.h" + +void +check_ncbi_error (ValNodePtr error_returns, + const char* filename, + unsigned int linenum) +{ + error_at_line (EXIT_FAILURE, 0, filename, linenum, + "An error has been reported by the NCBI Toolkit API: %s", + BlastErrorToString (error_returns)); + + return; +} diff --git a/src/check_ncbi_error.h b/src/check_ncbi_error.h new file mode 100644 index 0000000..c27c56d --- a/dev/null +++ b/src/check_ncbi_error.h @@ -0,0 +1,13 @@ +#ifndef CHECK_NCBI_ERROR_H +#define CHECK_NCBI_ERROR_H + +#include + +/* + * Handle errors from the NCBI Toolkit API. + */ +void +check_ncbi_error (ValNodePtr error_returns, + const char* filename, unsigned int linenum); + +#endif // CHECK_NCBI_ERROR_H diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c index 493c7db..91ef415 100644 --- a/src/load_influenza_aa_dat.c +++ b/src/load_influenza_aa_dat.c @@ -6,6 +6,8 @@ */ #include "load_influenza_aa_dat.h" +#include "check_error.h" +#include "check_h5_error.h" #include "hdf5_hl.h" #include #include @@ -140,7 +142,10 @@ load_influenza_aa_dat (hid_t file_id) * Insert the records. */ supplementary_data p_data; - FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r"); + FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", + "r"); + if (dat == NULL) + check_error (__FILE__, __LINE__); char *line = NULL; size_t len = 0; int current_line = 0; @@ -205,13 +210,23 @@ load_influenza_aa_dat (hid_t file_id) strncpy(p_data.full_length_indicator, strsep (&running, "\t"), sizeof(p_data.full_length_indicator)); - if (current_line == 1) - H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,1, - dst_size,field_names, dst_offset, field_type, - chunk_size, fill_data, compress, &p_data); - else - H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset, - dst_sizes, &p_data); + if (current_line == 1) + { + herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, + TABLE_NAME, NFIELDS, 1, dst_size, + field_names, dst_offset, field_type, + chunk_size, fill_data, compress, + &p_data); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + } + else + { + herr_t status = H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, + dst_offset, dst_sizes, &p_data); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + } if (running) free (running); -- cgit v0.8.3.1-22-g547a