summaryrefslogtreecommitdiffstats
Side-by-side diff
-rw-r--r--.gitignore5
-rw-r--r--src/Makefile.am10
-rw-r--r--src/assign_protein_type.c135
-rw-r--r--src/load_influenza_faa.c67
-rw-r--r--src/sequence_data.h16
-rw-r--r--src/sequence_data_init.c37
-rw-r--r--src/sequence_data_init.h14
7 files changed, 188 insertions, 96 deletions
diff --git a/.gitignore b/.gitignore
index b07ba15..4e963fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,8 @@ configure
depcomp
install-sh
missing
-stamp-h1
+src/.deps
src/aggregator
src/influenza.h5
-src/.deps
+src/updator
+stamp-h1
diff --git a/src/Makefile.am b/src/Makefile.am
index a7e6852..6dd8e16 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,14 +5,16 @@ aggregator_SOURCES = \
check_error.c \
check_h5_error.c \
load_influenza_aa_dat.c \
- load_influenza_faa.c
+ load_influenza_faa.c \
+ sequence_data_init.c
updator_SOURCES = \
updator.c \
check_error.c \
check_h5_error.c \
check_ncbi_error.c \
- assign_protein_type.c
+ assign_protein_type.c \
+ sequence_data_init.c
noinst_HEADERS = \
assign_protein_type.h \
@@ -20,6 +22,8 @@ noinst_HEADERS = \
check_h5_error.h \
check_ncbi_error.h \
load_influenza_aa_dat.h \
- load_influenza_faa.h
+ load_influenza_faa.h \
+ sequence_data.h \
+ sequence_data_init.h
AM_CFLAGS = -Wall -std=gnu99 -ggdb
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c
index 1b58f54..166e787 100644
--- a/src/assign_protein_type.c
+++ b/src/assign_protein_type.c
@@ -1,12 +1,15 @@
#include "assign_protein_type.h"
#include "check_ncbi_error.h"
#include "check_h5_error.h"
+#include "sequence_data.h"
+#include "sequence_data_init.h"
#include <ncbi.h>
#include <readdb.h>
#include <blast.h>
#include <salpacc.h>
#include <stdbool.h>
#include <hdf5_hl.h>
+#include <error.h>
/*
* BLAST database containing all of the influenza protein sequences.
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id)
* Get default BLAST options.
*/
BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
+
+ /*
+ * Set to single hit mode.
+ */
+ options->window_size = 0;
+ options->multiple_hits_only = false;
+
ValNodePtr error_returns = NULL;
/*
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id)
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
- /*
- * todo: Allocate memory of nrecords for dst_buf.
- *
- * todo: Refactor code to share structres in read and write HDF5
- * calls.
- */
+ sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);
+
+ size_t dst_size;
+ size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
+ size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
+ hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
+ sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
dst_sizes, dst_buf);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
- for (int i = 0; i < nrecords; i++)
- {
-
- }
-
/*
- * Read the sequence from the database by GI.
+ * Assign protein types to records for which the field is empty.
*/
- Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);
- BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
-
- SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
- "blastp",
- REFDB,
- options,
- NULL,
- &error_returns,
- NULL);
-
- if (error_returns != NULL)
- check_ncbi_error (error_returns, __FILE__, __LINE__);
-
- if (seqalign != NULL)
+ printf ("Records to process: %i\n", (int)nrecords);
+ for (int i = 0; i < nrecords; i++)
{
- Char target_id_buf[BUFFER_LEN + 1];
- SeqIdPtr target_id = SeqAlignId (seqalign, 1);
- SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
- printf ("%s\n", target_id_buf);
- }
-
- // Clean up memory for the next ieration.
- seqalign = SeqAlignSetFree (seqalign);
- bsp = BioseqFree (bsp);
+ if (dst_buf[i].protein_type[0] == '\0') {
+ /*
+ * Read the sequence from the database by GI.
+ */
+ Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
+ BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
+
+ SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
+ "blastp",
+ REFDB,
+ options,
+ NULL,
+ &error_returns,
+ NULL);
+
+ /*
+ * BLAST reported an error. Write it out and continue processing.
+ */
+ if (error_returns != NULL)
+ {
+ printf ("Warning: An error has been reported by the NCBI Toolkit API "
+ "for sequence gi|%i: %s",
+ dst_buf[i].gi, BlastErrorToString (error_returns));
+
+ }
+
+ /*
+ * A hit was found. Record the first hit as the protein type.
+ * Skip the first 6 characters and eat the "lcl|x_".
+ */
+ else if (seqalign != NULL)
+ {
+ Char target_id_buf[BUFFER_LEN + 1];
+ SeqIdPtr target_id = SeqAlignId (seqalign, 1);
+ SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
+ strncpy (dst_buf[i].protein_type, &target_id_buf[6],
+ sizeof (dst_buf[i].protein_type));
+ }
+
+ /*
+ * BLAST did not find any hits.
+ */
+ else
+ {
+ printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
+ dst_buf[i].gi);
+ }
+
+ /*
+ * Clean up memory for the next ieration.
+ */
+ seqalign = SeqAlignSetFree (seqalign);
+ bsp = BioseqFree (bsp);
+
+ /*
+ * Write the data out to the file.
+ */
+ if ( (i % 1000 == 0) && (i > 0) )
+ {
+ status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
+ dst_size, dst_offset, dst_sizes,
+ &dst_buf[i-1000]);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ printf ("Processed %i of %i records.\n", i, (int)nrecords);
+ }
+ }
+
+ }
+
+ free (dst_buf);
+
options = BLASTOptionDelete (options);
-
+
return;
}
diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c
index 749b7ad..fd35254 100644
--- a/src/load_influenza_faa.c
+++ b/src/load_influenza_faa.c
@@ -1,62 +1,22 @@
-#include "load_influenza_faa.h"
#include "check_error.h"
#include "check_h5_error.h"
-#include "hdf5_hl.h"
+#include "load_influenza_faa.h"
+#include "sequence_data.h"
+#include "sequence_data_init.h"
+#include <hdf5_hl.h>
#include <string.h>
#include <stdlib.h>
-#define SEQUENCE_DATA_FIELD_NUM 4
-
void
load_influenza_faa (hid_t file_id)
{
- typedef struct
- {
- int gi;
- char gb[9];
- char description[196];
- char protein_type[7];
- } sequence_data;
-
- size_t dst_size = sizeof (sequence_data);
- size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] =
- { HOFFSET (sequence_data, gi),
- HOFFSET (sequence_data, gb),
- HOFFSET (sequence_data, description),
- HOFFSET (sequence_data, protein_type)
- };
-
- sequence_data dst_buf[1];
-
- size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = {
- sizeof (dst_buf[0].gi),
- sizeof (dst_buf[0].gb),
- sizeof (dst_buf[0].description),
- sizeof (dst_buf[0].protein_type)
- };
-
+ size_t dst_size;
+ size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
+ size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
- field_type[0] = H5T_NATIVE_INT;
-
- hid_t gb_type = H5Tcopy (H5T_C_S1);
- H5Tset_size (gb_type, 9);
- field_type[1] = gb_type;
-
- hid_t description_type = H5Tcopy (H5T_C_S1);
- H5Tset_size (description_type, 196);
- field_type[2] = description_type;
-
- hid_t protein_type_type = H5Tcopy (H5T_C_S1);
- H5Tset_size (protein_type_type, 7);
- field_type[3] = protein_type_type;
-
- const char *field_names[SEQUENCE_DATA_FIELD_NUM] =
- { "GI",
- "GB",
- "Description",
- "Protein Type" };
-
+ sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
+
hsize_t chunk_size = 10;
int *fill_data = NULL;
int compress = 0;
@@ -99,12 +59,15 @@ load_influenza_faa (hid_t file_id)
strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));
+ const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =
+ SEQUENCE_DATA_FIELD_NAMES;
+
if (current_line == 1)
{
herr_t status = H5TBmake_table ("influenza.faa", file_id,
"influenza.faa",
SEQUENCE_DATA_FIELD_NUM, 1,
- dst_size, field_names,
+ dst_size, sequence_data_field_names,
dst_offset, field_type,
chunk_size, fill_data, compress,
&p_data);
@@ -132,9 +95,5 @@ load_influenza_faa (hid_t file_id)
fclose (dat);
- H5Tclose (gb_type);
- H5Tclose (description_type);
- H5Tclose (protein_type_type);
-
return;
}
diff --git a/src/sequence_data.h b/src/sequence_data.h
new file mode 100644
index 0000000..fb4ff78
--- a/dev/null
+++ b/src/sequence_data.h
@@ -0,0 +1,16 @@
+#ifndef SEQUENCE_DATA_H
+#define SEQUENCE_DATA_H
+
+#define SEQUENCE_DATA_FIELD_NUM 4
+
+#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" }
+
+typedef struct
+{
+ int gi;
+ char gb[9];
+ char description[196];
+ char protein_type[7];
+} sequence_data;
+
+#endif // SEQUENCE_DATA_H
diff --git a/src/sequence_data_init.c b/src/sequence_data_init.c
new file mode 100644
index 0000000..09ba189
--- a/dev/null
+++ b/src/sequence_data_init.c
@@ -0,0 +1,37 @@
+#include "sequence_data_init.h"
+#include "sequence_data.h"
+
+void
+sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+ hid_t *field_type)
+{
+ *dst_size = sizeof (sequence_data);
+
+ dst_offset[0] = HOFFSET (sequence_data, gi);
+ dst_offset[1] = HOFFSET (sequence_data, gb);
+ dst_offset[2] = HOFFSET (sequence_data, description);
+ dst_offset[3] = HOFFSET (sequence_data, protein_type);
+
+ sequence_data dst_buf[1];
+
+ dst_sizes[0] = sizeof (dst_buf[0].gi);
+ dst_sizes[1] = sizeof (dst_buf[0].gb);
+ dst_sizes[2] = sizeof (dst_buf[0].description);
+ dst_sizes[3] = sizeof (dst_buf[0].protein_type);
+
+ field_type[0] = H5T_NATIVE_INT;
+
+ hid_t gb_type = H5Tcopy (H5T_C_S1);
+ H5Tset_size (gb_type, 9);
+ field_type[1] = gb_type;
+
+ hid_t description_type = H5Tcopy (H5T_C_S1);
+ H5Tset_size (description_type, 196);
+ field_type[2] = description_type;
+
+ hid_t protein_type_type = H5Tcopy (H5T_C_S1);
+ H5Tset_size (protein_type_type, 7);
+ field_type[3] = protein_type_type;
+
+ return;
+}
diff --git a/src/sequence_data_init.h b/src/sequence_data_init.h
new file mode 100644
index 0000000..c87e7e6
--- a/dev/null
+++ b/src/sequence_data_init.h
@@ -0,0 +1,14 @@
+#ifndef SEQUENCE_DATA_INIT_H
+#define SEQUENCE_DATA_INIT_H
+
+#include <hdf5.h>
+
+/*
+ * Initialize the structures describing sequence_data. These
+ * descriptive structures are used by the HDF5 API.
+ */
+void
+sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+ hid_t *field_type);
+
+#endif // SEQUENCE_DATA_INIT_H

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.