-rw-r--r-- | .gitignore | 5 | ||||
-rw-r--r-- | src/Makefile.am | 10 | ||||
-rw-r--r-- | src/assign_protein_type.c | 135 | ||||
-rw-r--r-- | src/load_influenza_faa.c | 67 | ||||
-rw-r--r-- | src/sequence_data.h | 16 | ||||
-rw-r--r-- | src/sequence_data_init.c | 37 | ||||
-rw-r--r-- | src/sequence_data_init.h | 14 |
7 files changed, 188 insertions, 96 deletions
@@ -14,7 +14,8 @@ configure depcomp install-sh missing -stamp-h1 +src/.deps src/aggregator src/influenza.h5 -src/.deps +src/updator +stamp-h1 diff --git a/src/Makefile.am b/src/Makefile.am index a7e6852..6dd8e16 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -5,14 +5,16 @@ aggregator_SOURCES = \ check_error.c \ check_h5_error.c \ load_influenza_aa_dat.c \ - load_influenza_faa.c + load_influenza_faa.c \ + sequence_data_init.c updator_SOURCES = \ updator.c \ check_error.c \ check_h5_error.c \ check_ncbi_error.c \ - assign_protein_type.c + assign_protein_type.c \ + sequence_data_init.c noinst_HEADERS = \ assign_protein_type.h \ @@ -20,6 +22,8 @@ noinst_HEADERS = \ check_h5_error.h \ check_ncbi_error.h \ load_influenza_aa_dat.h \ - load_influenza_faa.h + load_influenza_faa.h \ + sequence_data.h \ + sequence_data_init.h AM_CFLAGS = -Wall -std=gnu99 -ggdb diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c index 1b58f54..166e787 100644 --- a/src/assign_protein_type.c +++ b/src/assign_protein_type.c @@ -1,12 +1,15 @@ #include "assign_protein_type.h" #include "check_ncbi_error.h" #include "check_h5_error.h" +#include "sequence_data.h" +#include "sequence_data_init.h" #include <ncbi.h> #include <readdb.h> #include <blast.h> #include <salpacc.h> #include <stdbool.h> #include <hdf5_hl.h> +#include <error.h> /* * BLAST database containing all of the influenza protein sequences. @@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id) * Get default BLAST options. */ BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); + + /* + * Set to single hit mode. + */ + options->window_size = 0; + options->multiple_hits_only = false; + ValNodePtr error_returns = NULL; /* @@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id) if (status < 0) check_h5_error (status, __FILE__, __LINE__); - /* - * todo: Allocate memory of nrecords for dst_buf. - * - * todo: Refactor code to share structres in read and write HDF5 - * calls. - */ + sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords); + + size_t dst_size; + size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; + size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM]; + hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; + sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, dst_sizes, dst_buf); if (status < 0) check_h5_error (status, __FILE__, __LINE__); - for (int i = 0; i < nrecords; i++) - { - - } - /* - * Read the sequence from the database by GI. + * Assign protein types to records for which the field is empty. */ - Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL); - BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); - - SeqAlignPtr seqalign = BioseqBlastEngine (bsp, - "blastp", - REFDB, - options, - NULL, - &error_returns, - NULL); - - if (error_returns != NULL) - check_ncbi_error (error_returns, __FILE__, __LINE__); - - if (seqalign != NULL) + printf ("Records to process: %i\n", (int)nrecords); + for (int i = 0; i < nrecords; i++) { - Char target_id_buf[BUFFER_LEN + 1]; - SeqIdPtr target_id = SeqAlignId (seqalign, 1); - SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); - printf ("%s\n", target_id_buf); - } - - // Clean up memory for the next ieration. - seqalign = SeqAlignSetFree (seqalign); - bsp = BioseqFree (bsp); + if (dst_buf[i].protein_type[0] == '\0') { + /* + * Read the sequence from the database by GI. + */ + Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); + BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); + + SeqAlignPtr seqalign = BioseqBlastEngine (bsp, + "blastp", + REFDB, + options, + NULL, + &error_returns, + NULL); + + /* + * BLAST reported an error. Write it out and continue processing. + */ + if (error_returns != NULL) + { + printf ("Warning: An error has been reported by the NCBI Toolkit API " + "for sequence gi|%i: %s", + dst_buf[i].gi, BlastErrorToString (error_returns)); + + } + + /* + * A hit was found. Record the first hit as the protein type. + * Skip the first 6 characters and eat the "lcl|x_". + */ + else if (seqalign != NULL) + { + Char target_id_buf[BUFFER_LEN + 1]; + SeqIdPtr target_id = SeqAlignId (seqalign, 1); + SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); + strncpy (dst_buf[i].protein_type, &target_id_buf[6], + sizeof (dst_buf[i].protein_type)); + } + + /* + * BLAST did not find any hits. + */ + else + { + printf ("Warning: Unable to identify protein type for sequence gi|%i\n", + dst_buf[i].gi); + } + + /* + * Clean up memory for the next ieration. + */ + seqalign = SeqAlignSetFree (seqalign); + bsp = BioseqFree (bsp); + + /* + * Write the data out to the file. + */ + if ( (i % 1000 == 0) && (i > 0) ) + { + status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, + dst_size, dst_offset, dst_sizes, + &dst_buf[i-1000]); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + printf ("Processed %i of %i records.\n", i, (int)nrecords); + } + } + + } + + free (dst_buf); + options = BLASTOptionDelete (options); - + return; } diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c index 749b7ad..fd35254 100644 --- a/src/load_influenza_faa.c +++ b/src/load_influenza_faa.c @@ -1,62 +1,22 @@ -#include "load_influenza_faa.h" #include "check_error.h" #include "check_h5_error.h" -#include "hdf5_hl.h" +#include "load_influenza_faa.h" +#include "sequence_data.h" +#include "sequence_data_init.h" +#include <hdf5_hl.h> #include <string.h> #include <stdlib.h> -#define SEQUENCE_DATA_FIELD_NUM 4 - void load_influenza_faa (hid_t file_id) { - typedef struct - { - int gi; - char gb[9]; - char description[196]; - char protein_type[7]; - } sequence_data; - - size_t dst_size = sizeof (sequence_data); - size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] = - { HOFFSET (sequence_data, gi), - HOFFSET (sequence_data, gb), - HOFFSET (sequence_data, description), - HOFFSET (sequence_data, protein_type) - }; - - sequence_data dst_buf[1]; - - size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = { - sizeof (dst_buf[0].gi), - sizeof (dst_buf[0].gb), - sizeof (dst_buf[0].description), - sizeof (dst_buf[0].protein_type) - }; - + size_t dst_size; + size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; + size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM]; hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; - field_type[0] = H5T_NATIVE_INT; - - hid_t gb_type = H5Tcopy (H5T_C_S1); - H5Tset_size (gb_type, 9); - field_type[1] = gb_type; - - hid_t description_type = H5Tcopy (H5T_C_S1); - H5Tset_size (description_type, 196); - field_type[2] = description_type; - - hid_t protein_type_type = H5Tcopy (H5T_C_S1); - H5Tset_size (protein_type_type, 7); - field_type[3] = protein_type_type; - - const char *field_names[SEQUENCE_DATA_FIELD_NUM] = - { "GI", - "GB", - "Description", - "Protein Type" }; - + sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); + hsize_t chunk_size = 10; int *fill_data = NULL; int compress = 0; @@ -99,12 +59,15 @@ load_influenza_faa (hid_t file_id) strncpy (p_data.protein_type, "", sizeof (p_data.protein_type)); + const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] = + SEQUENCE_DATA_FIELD_NAMES; + if (current_line == 1) { herr_t status = H5TBmake_table ("influenza.faa", file_id, "influenza.faa", SEQUENCE_DATA_FIELD_NUM, 1, - dst_size, field_names, + dst_size, sequence_data_field_names, dst_offset, field_type, chunk_size, fill_data, compress, &p_data); @@ -132,9 +95,5 @@ load_influenza_faa (hid_t file_id) fclose (dat); - H5Tclose (gb_type); - H5Tclose (description_type); - H5Tclose (protein_type_type); - return; } diff --git a/src/sequence_data.h b/src/sequence_data.h new file mode 100644 index 0000000..fb4ff78 --- a/dev/null +++ b/src/sequence_data.h @@ -0,0 +1,16 @@ +#ifndef SEQUENCE_DATA_H +#define SEQUENCE_DATA_H + +#define SEQUENCE_DATA_FIELD_NUM 4 + +#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" } + +typedef struct +{ + int gi; + char gb[9]; + char description[196]; + char protein_type[7]; +} sequence_data; + +#endif // SEQUENCE_DATA_H diff --git a/src/sequence_data_init.c b/src/sequence_data_init.c new file mode 100644 index 0000000..09ba189 --- a/dev/null +++ b/src/sequence_data_init.c @@ -0,0 +1,37 @@ +#include "sequence_data_init.h" +#include "sequence_data.h" + +void +sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes, + hid_t *field_type) +{ + *dst_size = sizeof (sequence_data); + + dst_offset[0] = HOFFSET (sequence_data, gi); + dst_offset[1] = HOFFSET (sequence_data, gb); + dst_offset[2] = HOFFSET (sequence_data, description); + dst_offset[3] = HOFFSET (sequence_data, protein_type); + + sequence_data dst_buf[1]; + + dst_sizes[0] = sizeof (dst_buf[0].gi); + dst_sizes[1] = sizeof (dst_buf[0].gb); + dst_sizes[2] = sizeof (dst_buf[0].description); + dst_sizes[3] = sizeof (dst_buf[0].protein_type); + + field_type[0] = H5T_NATIVE_INT; + + hid_t gb_type = H5Tcopy (H5T_C_S1); + H5Tset_size (gb_type, 9); + field_type[1] = gb_type; + + hid_t description_type = H5Tcopy (H5T_C_S1); + H5Tset_size (description_type, 196); + field_type[2] = description_type; + + hid_t protein_type_type = H5Tcopy (H5T_C_S1); + H5Tset_size (protein_type_type, 7); + field_type[3] = protein_type_type; + + return; +} diff --git a/src/sequence_data_init.h b/src/sequence_data_init.h new file mode 100644 index 0000000..c87e7e6 --- a/dev/null +++ b/src/sequence_data_init.h @@ -0,0 +1,14 @@ +#ifndef SEQUENCE_DATA_INIT_H +#define SEQUENCE_DATA_INIT_H + +#include <hdf5.h> + +/* + * Initialize the structures describing sequence_data. These + * descriptive structures are used by the HDF5 API. + */ +void +sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes, + hid_t *field_type); + +#endif // SEQUENCE_DATA_INIT_H |