summaryrefslogtreecommitdiffstats
authorDon Pellegrino <don@drexel.edu>2010-01-17 23:44:57 (GMT)
committer Don Pellegrino <don@drexel.edu>2010-01-17 23:44:57 (GMT)
commitb2e4ace6b0265f4575733bfaa38519167074c477 (patch) (side-by-side diff)
tree5d7dc32c19e6a33ac45dc1fc9b702919a4c7721b
parent22bc29abeb6c0c3354bbd857cac53af14153bb2f (diff)
downloadexp007-b2e4ace6b0265f4575733bfaa38519167074c477.zip
exp007-b2e4ace6b0265f4575733bfaa38519167074c477.tar.gz
exp007-b2e4ace6b0265f4575733bfaa38519167074c477.tar.bz2
Added error checking routines. Added implementation of
assign_protein_type tested against a single hard-coded sequence input. The next step is to iterate over all records in the HDF5 collection that don't have a type assigned and to assign a type value.
-rw-r--r--configure.ac20
-rw-r--r--src/Makefile.am6
-rw-r--r--src/aggregator.c14
-rw-r--r--src/assign_protein_type.c72
-rw-r--r--src/assign_protein_type.h6
-rw-r--r--src/check_error.c14
-rw-r--r--src/check_error.h11
-rw-r--r--src/check_h5_error.c12
-rw-r--r--src/check_h5_error.h12
-rw-r--r--src/check_ncbi_error.c13
-rw-r--r--src/check_ncbi_error.h13
-rw-r--r--src/load_influenza_aa_dat.c31
12 files changed, 211 insertions, 13 deletions
diff --git a/configure.ac b/configure.ac
index 3104df8..f45958c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -24,4 +24,24 @@ AC_CHECK_HEADERS([mpi.h],[],
AC_CHECK_HEADERS([hdf5.h],[],
[AC_MSG_ERROR(The HDF5 headers are needed to build the system.)])
+########################
+# MODULE: NCBI Toolkit #
+########################
+
+# Check for the NCBI ToolBox libraries.
+AC_SEARCH_LIBS([log10],[m],[],
+[AC_MSG_ERROR(The C Math Library is needed to build the system.)])
+
+AC_SEARCH_LIBS([NlmThreadsAvailable],[ncbi],[],
+[AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)])
+
+AC_SEARCH_LIBS([SeqAlignNew],[ncbiobj],[],
+[AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)])
+
+AC_SEARCH_LIBS([Blast_RedoOneMatch],[blastcompadj],[],
+[AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)])
+
+AC_SEARCH_LIBS([BioseqBlastEngine],[ncbitool],[],
+[AC_MSG_ERROR(The NCBI ToolBox is needed to build the system. Information on this API can be found on-line at http://www.ncbi.nlm.nih.gov/IEB/ToolBox/index.cgi. Debian users can add the package libncbi6-dev to fulfill this dependency.)])
+
AC_OUTPUT
diff --git a/src/Makefile.am b/src/Makefile.am
index 8098f33..fcbcdd5 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -3,6 +3,9 @@ bin_PROGRAMS = aggregator
aggregator_SOURCES = \
aggregator.c \
assign_protein_type.c \
+ check_error.c \
+ check_h5_error.c \
+ check_ncbi_error.c \
load_influenza_aa_dat.c \
load_influenza_faa.c
@@ -10,6 +13,9 @@ aggregator_LDADD = -lhdf5
noinst_HEADERS = \
assign_protein_type.h \
+ check_error.h \
+ check_h5_error.h \
+ check_ncbi_error.h \
load_influenza_aa_dat.h \
load_influenza_faa.h
diff --git a/src/aggregator.c b/src/aggregator.c
index 228e5c6..20da6df 100644
--- a/src/aggregator.c
+++ b/src/aggregator.c
@@ -3,6 +3,8 @@
* container.
*/
+#include "assign_protein_type.h"
+#include "check_h5_error.h"
#include "load_influenza_aa_dat.h"
#include "load_influenza_faa.h"
@@ -14,22 +16,26 @@ main()
/*
* Create the HDF5 file.
*/
- hid_t file_id = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
+ // hid_t file_id = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
/*
* Load the supplementary protein data file.
*/
- load_influenza_aa_dat (file_id);
+ // load_influenza_aa_dat (file_id);
/*
* Load the FASTA protein sequence data file.
*/
- load_influenza_faa (file_id);
+ // load_influenza_faa (file_id);
/*
* Close the HD5 file.
*/
- H5Fclose (file_id);
+ // herr_t status = H5Fclose (file_id);
+ // if (status < 0)
+ // check_h5_error (status, __FILE__, __LINE__);
+
+ assign_protein_type (0);
return 0;
}
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c
index 2ba59a7..643ea3f 100644
--- a/src/assign_protein_type.c
+++ b/src/assign_protein_type.c
@@ -1,8 +1,80 @@
#include "assign_protein_type.h"
+#include "check_ncbi_error.h"
+#include <ncbi.h>
+#include <readdb.h>
+#include <blast.h>
+#include <salpacc.h>
+#include <stdbool.h>
+
+/*
+ * BLAST database containing all of the influenza protein sequences.
+ */
+#define SEQDB "/home/don/exp004/influenzadb/influenzadb"
+
+/*
+ * BLAST reference database of prototypical protein types.
+ */
+#define REFDB "/home/don/exp004/influenzadb/proteinnames"
+
+#define BUFFER_LEN 50
void
assign_protein_type (hid_t file_id)
{
+ /*
+ * Iterate through the records for which no protein type has been
+ * assigned. Create a BioSeq Pointer to the data and then use this
+ * as input to the BLAST run against the reference database of
+ * prototypical sequence records.
+ */
+
+ /*
+ * Make BLAST messages reportable.
+ */
+ ErrSetMessageLevel (SEV_WARNING);
+
+ /*
+ * Open the BLAST sequence database.
+ */
+ ReadDBFILEPtr seqdb = readdb_new (SEQDB, true);
+
+ /*
+ * Get default BLAST options.
+ */
+ BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
+ ValNodePtr error_returns = NULL;
+
+ /*
+ * Read the sequence from the database by GI.
+ */
+ Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);
+ BioseqPtr bsp = readdb_get_bioseq(seqdb, sequence_number);
+
+ SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
+ "blastp",
+ REFDB,
+ options,
+ NULL,
+ &error_returns,
+ NULL);
+
+ if (error_returns != NULL)
+ check_ncbi_error (error_returns, __FILE__, __LINE__);
+
+ if (seqalign != NULL)
+ {
+ Char target_id_buf[BUFFER_LEN + 1];
+ SeqIdPtr target_id = SeqAlignId (seqalign, 1);
+ SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
+ printf ("%s\n",
+ target_id_buf);
+ }
+
+ // Clean up memory for the next ieration.
+ seqalign = SeqAlignSetFree (seqalign);
+ bsp = BioseqFree (bsp);
+
+ options = BLASTOptionDelete (options);
return;
}
diff --git a/src/assign_protein_type.h b/src/assign_protein_type.h
index 312b774..1dfb8e6 100644
--- a/src/assign_protein_type.h
+++ b/src/assign_protein_type.h
@@ -4,7 +4,11 @@
#include <hdf5.h>
/*
- * Determine the protein type for each protein sequence record.
+ * Determine the protein type for each protein sequence record. The
+ * technique used by NCBI is used here. A BLAST database of
+ * prototypical protein sequences serves as the reference. Each input
+ * sequence is BLASTed against this database. The first hit is used
+ * to assign a protein type to sequence.
*/
void
assign_protein_type (hid_t file_id);
diff --git a/src/check_error.c b/src/check_error.c
new file mode 100644
index 0000000..70c62c4
--- a/dev/null
+++ b/src/check_error.c
@@ -0,0 +1,14 @@
+#include "check_error.h"
+#include <error.h>
+#include <errno.h>
+#include <stdlib.h>
+
+void
+check_error (const char *filename, const unsigned int linenum)
+{
+ if (errno)
+ error_at_line (EXIT_FAILURE, errno, filename, linenum,
+ "An error has been detected within the application.");
+
+ return;
+}
diff --git a/src/check_error.h b/src/check_error.h
new file mode 100644
index 0000000..33acc63
--- a/dev/null
+++ b/src/check_error.h
@@ -0,0 +1,11 @@
+#ifndef CHECK_ERROR_H
+#define CHECK_ERROR_H
+
+/*
+ * Check the error state. Reports and error message and exits if an
+ * error has occured.
+ */
+void
+check_error (const char *filename, unsigned int linenum);
+
+#endif // CHECK_ERROR_H
diff --git a/src/check_h5_error.c b/src/check_h5_error.c
new file mode 100644
index 0000000..30fc87c
--- a/dev/null
+++ b/src/check_h5_error.c
@@ -0,0 +1,12 @@
+#include "check_h5_error.h"
+#include <error.h>
+#include <stdlib.h>
+
+void
+check_h5_error (herr_t status, const char* filename, unsigned int linenum)
+{
+ error_at_line (EXIT_FAILURE, 0, filename, linenum,
+ "An error has been reported by the HDF5 API.");
+
+ return;
+}
diff --git a/src/check_h5_error.h b/src/check_h5_error.h
new file mode 100644
index 0000000..74730cd
--- a/dev/null
+++ b/src/check_h5_error.h
@@ -0,0 +1,12 @@
+#ifndef CHECK_H5_ERROR_H
+#define CHECK_H5_ERROR_H
+
+#include <hdf5.h>
+
+/*
+ * Handle errors from the HDF5 API.
+ */
+void
+check_h5_error (herr_t status, const char* filename, unsigned int linenum);
+
+#endif // CHECK_H5_ERROR_H
diff --git a/src/check_ncbi_error.c b/src/check_ncbi_error.c
new file mode 100644
index 0000000..3caa7a9
--- a/dev/null
+++ b/src/check_ncbi_error.c
@@ -0,0 +1,13 @@
+#include "check_ncbi_error.h"
+
+void
+check_ncbi_error (ValNodePtr error_returns,
+ const char* filename,
+ unsigned int linenum)
+{
+ error_at_line (EXIT_FAILURE, 0, filename, linenum,
+ "An error has been reported by the NCBI Toolkit API: %s",
+ BlastErrorToString (error_returns));
+
+ return;
+}
diff --git a/src/check_ncbi_error.h b/src/check_ncbi_error.h
new file mode 100644
index 0000000..c27c56d
--- a/dev/null
+++ b/src/check_ncbi_error.h
@@ -0,0 +1,13 @@
+#ifndef CHECK_NCBI_ERROR_H
+#define CHECK_NCBI_ERROR_H
+
+#include <ncbi.h>
+
+/*
+ * Handle errors from the NCBI Toolkit API.
+ */
+void
+check_ncbi_error (ValNodePtr error_returns,
+ const char* filename, unsigned int linenum);
+
+#endif // CHECK_NCBI_ERROR_H
diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c
index 493c7db..91ef415 100644
--- a/src/load_influenza_aa_dat.c
+++ b/src/load_influenza_aa_dat.c
@@ -6,6 +6,8 @@
*/
#include "load_influenza_aa_dat.h"
+#include "check_error.h"
+#include "check_h5_error.h"
#include "hdf5_hl.h"
#include <string.h>
#include <stdlib.h>
@@ -140,7 +142,10 @@ load_influenza_aa_dat (hid_t file_id)
* Insert the records.
*/
supplementary_data p_data;
- FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r");
+ FILE* dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat",
+ "r");
+ if (dat == NULL)
+ check_error (__FILE__, __LINE__);
char *line = NULL;
size_t len = 0;
int current_line = 0;
@@ -205,13 +210,23 @@ load_influenza_aa_dat (hid_t file_id)
strncpy(p_data.full_length_indicator, strsep (&running, "\t"),
sizeof(p_data.full_length_indicator));
- if (current_line == 1)
- H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,NFIELDS,1,
- dst_size,field_names, dst_offset, field_type,
- chunk_size, fill_data, compress, &p_data);
- else
- H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, dst_offset,
- dst_sizes, &p_data);
+ if (current_line == 1)
+ {
+ herr_t status = H5TBmake_table ("influenza_aa.dat", file_id,
+ TABLE_NAME, NFIELDS, 1, dst_size,
+ field_names, dst_offset, field_type,
+ chunk_size, fill_data, compress,
+ &p_data);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+ }
+ else
+ {
+ herr_t status = H5TBappend_records (file_id, TABLE_NAME, 1, dst_size,
+ dst_offset, dst_sizes, &p_data);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+ }
if (running)
free (running);

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.