summaryrefslogtreecommitdiffstats
authorDon Pellegrino <don@drexel.edu>2010-01-18 17:26:18 (GMT)
committer Don Pellegrino <don@drexel.edu>2010-01-18 17:26:18 (GMT)
commita7cc532248146968a9296be7db42830e35525afa (patch) (unidiff)
tree93c44ad4f1ac27a1daed4ad5c7fdcb76971c0221
parent0d0e0886d17612fb7ebdb9110679d5b7bd5087be (diff)
downloadexp007-a7cc532248146968a9296be7db42830e35525afa.zip
exp007-a7cc532248146968a9296be7db42830e35525afa.tar.gz
exp007-a7cc532248146968a9296be7db42830e35525afa.tar.bz2
Implemented updator to calculate and assign protein type values to
records which do not have them. This is a re-entrant process that only updates missing records and skips quickly through existing records. In addition premature termination loses at most 999 records of work since every thousand records are written back to the file and flushed to disk. Still to do is to write and flush the last set of records in the final bin under 1000. Also refactored the sequence_data structure so that it can be shared between HDF5 reading and writing operations.
-rw-r--r--.gitignore5
-rw-r--r--src/Makefile.am10
-rw-r--r--src/assign_protein_type.c135
-rw-r--r--src/load_influenza_faa.c67
-rw-r--r--src/sequence_data.h16
-rw-r--r--src/sequence_data_init.c37
-rw-r--r--src/sequence_data_init.h14
7 files changed, 188 insertions, 96 deletions
diff --git a/.gitignore b/.gitignore
index b07ba15..4e963fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,8 @@ configure
14depcomp14depcomp
15install-sh15install-sh
16missing16missing
17stamp-h117src/.deps
18src/aggregator18src/aggregator
19src/influenza.h519src/influenza.h5
20src/.deps20src/updator
21stamp-h1
diff --git a/src/Makefile.am b/src/Makefile.am
index a7e6852..6dd8e16 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,14 +5,16 @@ aggregator_SOURCES = \
5 check_error.c \5 check_error.c \
6 check_h5_error.c \6 check_h5_error.c \
7 load_influenza_aa_dat.c \7 load_influenza_aa_dat.c \
8 load_influenza_faa.c8 load_influenza_faa.c \
9 sequence_data_init.c
910
10updator_SOURCES = \11updator_SOURCES = \
11 updator.c \12 updator.c \
12 check_error.c \13 check_error.c \
13 check_h5_error.c \14 check_h5_error.c \
14 check_ncbi_error.c \15 check_ncbi_error.c \
15 assign_protein_type.c16 assign_protein_type.c \
17 sequence_data_init.c
1618
17noinst_HEADERS = \19noinst_HEADERS = \
18 assign_protein_type.h \20 assign_protein_type.h \
@@ -20,6 +22,8 @@ noinst_HEADERS = \
20 check_h5_error.h \22 check_h5_error.h \
21 check_ncbi_error.h \23 check_ncbi_error.h \
22 load_influenza_aa_dat.h \24 load_influenza_aa_dat.h \
23 load_influenza_faa.h25 load_influenza_faa.h \
26 sequence_data.h \
27 sequence_data_init.h
2428
25AM_CFLAGS = -Wall -std=gnu99 -ggdb29AM_CFLAGS = -Wall -std=gnu99 -ggdb
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c
index 1b58f54..166e787 100644
--- a/src/assign_protein_type.c
+++ b/src/assign_protein_type.c
@@ -1,12 +1,15 @@
1#include "assign_protein_type.h"1#include "assign_protein_type.h"
2#include "check_ncbi_error.h"2#include "check_ncbi_error.h"
3#include "check_h5_error.h"3#include "check_h5_error.h"
4#include "sequence_data.h"
5#include "sequence_data_init.h"
4#include <ncbi.h>6#include <ncbi.h>
5#include <readdb.h>7#include <readdb.h>
6#include <blast.h>8#include <blast.h>
7#include <salpacc.h>9#include <salpacc.h>
8#include <stdbool.h>10#include <stdbool.h>
9#include <hdf5_hl.h>11#include <hdf5_hl.h>
12#include <error.h>
1013
11/*14/*
12 * BLAST database containing all of the influenza protein sequences.15 * BLAST database containing all of the influenza protein sequences.
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id)
44 * Get default BLAST options.47 * Get default BLAST options.
45 */48 */
46 BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);49 BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
50
51 /*
52 * Set to single hit mode.
53 */
54 options->window_size = 0;
55 options->multiple_hits_only = false;
56
47 ValNodePtr error_returns = NULL;57 ValNodePtr error_returns = NULL;
4858
49 /*59 /*
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id)
56 if (status < 0)66 if (status < 0)
57 check_h5_error (status, __FILE__, __LINE__);67 check_h5_error (status, __FILE__, __LINE__);
5868
59 /*69 sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);
60 * todo: Allocate memory of nrecords for dst_buf.70
61 *71 size_t dst_size;
62 * todo: Refactor code to share structres in read and write HDF572 size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
63 * calls.73 size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
64 */74 hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
75 sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
6576
66 status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,77 status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
67 dst_sizes, dst_buf);78 dst_sizes, dst_buf);
68 if (status < 0)79 if (status < 0)
69 check_h5_error (status, __FILE__, __LINE__);80 check_h5_error (status, __FILE__, __LINE__);
7081
71 for (int i = 0; i < nrecords; i++)
72 {
73
74 }
75
76 /*82 /*
77 * Read the sequence from the database by GI.83 * Assign protein types to records for which the field is empty.
78 */84 */
79 Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);85 printf ("Records to process: %i\n", (int)nrecords);
80 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);86 for (int i = 0; i < nrecords; i++)
81
82 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
83 "blastp",
84 REFDB,
85 options,
86 NULL,
87 &error_returns,
88 NULL);
89
90 if (error_returns != NULL)
91 check_ncbi_error (error_returns, __FILE__, __LINE__);
92
93 if (seqalign != NULL)
94 {87 {
95 Char target_id_buf[BUFFER_LEN + 1];
96 SeqIdPtr target_id = SeqAlignId (seqalign, 1);
97 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
98 printf ("%s\n", target_id_buf);
99 }
100
101 // Clean up memory for the next ieration.
102 seqalign = SeqAlignSetFree (seqalign);
103 bsp = BioseqFree (bsp);
10488
89 if (dst_buf[i].protein_type[0] == '\0') {
90 /*
91 * Read the sequence from the database by GI.
92 */
93 Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
94 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
95
96 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
97 "blastp",
98 REFDB,
99 options,
100 NULL,
101 &error_returns,
102 NULL);
103
104 /*
105 * BLAST reported an error. Write it out and continue processing.
106 */
107 if (error_returns != NULL)
108 {
109 printf ("Warning: An error has been reported by the NCBI Toolkit API "
110 "for sequence gi|%i: %s",
111 dst_buf[i].gi, BlastErrorToString (error_returns));
112
113 }
114
115 /*
116 * A hit was found. Record the first hit as the protein type.
117 * Skip the first 6 characters and eat the "lcl|x_".
118 */
119 else if (seqalign != NULL)
120 {
121 Char target_id_buf[BUFFER_LEN + 1];
122 SeqIdPtr target_id = SeqAlignId (seqalign, 1);
123 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
124 strncpy (dst_buf[i].protein_type, &target_id_buf[6],
125 sizeof (dst_buf[i].protein_type));
126 }
127
128 /*
129 * BLAST did not find any hits.
130 */
131 else
132 {
133 printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
134 dst_buf[i].gi);
135 }
136
137 /*
138 * Clean up memory for the next ieration.
139 */
140 seqalign = SeqAlignSetFree (seqalign);
141 bsp = BioseqFree (bsp);
142
143 /*
144 * Write the data out to the file.
145 */
146 if ( (i % 1000 == 0) && (i > 0) )
147 {
148 status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
149 dst_size, dst_offset, dst_sizes,
150 &dst_buf[i-1000]);
151 if (status < 0)
152 check_h5_error (status, __FILE__, __LINE__);
153
154 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
155 if (status < 0)
156 check_h5_error (status, __FILE__, __LINE__);
157
158 printf ("Processed %i of %i records.\n", i, (int)nrecords);
159 }
160 }
161
162 }
163
164 free (dst_buf);
165
105 options = BLASTOptionDelete (options);166 options = BLASTOptionDelete (options);
106167
107 return;168 return;
108}169}
diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c
index 749b7ad..fd35254 100644
--- a/src/load_influenza_faa.c
+++ b/src/load_influenza_faa.c
@@ -1,62 +1,22 @@
1#include "load_influenza_faa.h"
2#include "check_error.h"1#include "check_error.h"
3#include "check_h5_error.h"2#include "check_h5_error.h"
4#include "hdf5_hl.h"3#include "load_influenza_faa.h"
4#include "sequence_data.h"
5#include "sequence_data_init.h"
6#include <hdf5_hl.h>
5#include <string.h>7#include <string.h>
6#include <stdlib.h>8#include <stdlib.h>
79
8#define SEQUENCE_DATA_FIELD_NUM 4
9
10void10void
11load_influenza_faa (hid_t file_id)11load_influenza_faa (hid_t file_id)
12{12{
13 typedef struct13 size_t dst_size;
14 {14 size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
15 int gi;15 size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
16 char gb[9];
17 char description[196];
18 char protein_type[7];
19 } sequence_data;
20
21 size_t dst_size = sizeof (sequence_data);
22 size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] =
23 { HOFFSET (sequence_data, gi),
24 HOFFSET (sequence_data, gb),
25 HOFFSET (sequence_data, description),
26 HOFFSET (sequence_data, protein_type)
27 };
28
29 sequence_data dst_buf[1];
30
31 size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = {
32 sizeof (dst_buf[0].gi),
33 sizeof (dst_buf[0].gb),
34 sizeof (dst_buf[0].description),
35 sizeof (dst_buf[0].protein_type)
36 };
37
38 hid_t field_type[SEQUENCE_DATA_FIELD_NUM];16 hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
3917
40 field_type[0] = H5T_NATIVE_INT;18 sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
4119
42 hid_t gb_type = H5Tcopy (H5T_C_S1);
43 H5Tset_size (gb_type, 9);
44 field_type[1] = gb_type;
45
46 hid_t description_type = H5Tcopy (H5T_C_S1);
47 H5Tset_size (description_type, 196);
48 field_type[2] = description_type;
49
50 hid_t protein_type_type = H5Tcopy (H5T_C_S1);
51 H5Tset_size (protein_type_type, 7);
52 field_type[3] = protein_type_type;
53
54 const char *field_names[SEQUENCE_DATA_FIELD_NUM] =
55 { "GI",
56 "GB",
57 "Description",
58 "Protein Type" };
59
60 hsize_t chunk_size = 10;20 hsize_t chunk_size = 10;
61 int *fill_data = NULL;21 int *fill_data = NULL;
62 int compress = 0;22 int compress = 0;
@@ -99,12 +59,15 @@ load_influenza_faa (hid_t file_id)
9959
100 strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));60 strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));
10161
62 const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =
63 SEQUENCE_DATA_FIELD_NAMES;
64
102 if (current_line == 1)65 if (current_line == 1)
103 {66 {
104 herr_t status = H5TBmake_table ("influenza.faa", file_id,67 herr_t status = H5TBmake_table ("influenza.faa", file_id,
105 "influenza.faa",68 "influenza.faa",
106 SEQUENCE_DATA_FIELD_NUM, 1,69 SEQUENCE_DATA_FIELD_NUM, 1,
107 dst_size, field_names,70 dst_size, sequence_data_field_names,
108 dst_offset, field_type,71 dst_offset, field_type,
109 chunk_size, fill_data, compress,72 chunk_size, fill_data, compress,
110 &p_data);73 &p_data);
@@ -132,9 +95,5 @@ load_influenza_faa (hid_t file_id)
13295
133 fclose (dat);96 fclose (dat);
13497
135 H5Tclose (gb_type);
136 H5Tclose (description_type);
137 H5Tclose (protein_type_type);
138
139 return;98 return;
140}99}
diff --git a/src/sequence_data.h b/src/sequence_data.h
new file mode 100644
index 0000000..fb4ff78
--- a/dev/null
+++ b/src/sequence_data.h
@@ -0,0 +1,16 @@
1#ifndef SEQUENCE_DATA_H
2#define SEQUENCE_DATA_H
3
4#define SEQUENCE_DATA_FIELD_NUM 4
5
6#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" }
7
8typedef struct
9{
10 int gi;
11 char gb[9];
12 char description[196];
13 char protein_type[7];
14} sequence_data;
15
16#endif // SEQUENCE_DATA_H
diff --git a/src/sequence_data_init.c b/src/sequence_data_init.c
new file mode 100644
index 0000000..09ba189
--- a/dev/null
+++ b/src/sequence_data_init.c
@@ -0,0 +1,37 @@
1#include "sequence_data_init.h"
2#include "sequence_data.h"
3
4void
5sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
6 hid_t *field_type)
7{
8 *dst_size = sizeof (sequence_data);
9
10 dst_offset[0] = HOFFSET (sequence_data, gi);
11 dst_offset[1] = HOFFSET (sequence_data, gb);
12 dst_offset[2] = HOFFSET (sequence_data, description);
13 dst_offset[3] = HOFFSET (sequence_data, protein_type);
14
15 sequence_data dst_buf[1];
16
17 dst_sizes[0] = sizeof (dst_buf[0].gi);
18 dst_sizes[1] = sizeof (dst_buf[0].gb);
19 dst_sizes[2] = sizeof (dst_buf[0].description);
20 dst_sizes[3] = sizeof (dst_buf[0].protein_type);
21
22 field_type[0] = H5T_NATIVE_INT;
23
24 hid_t gb_type = H5Tcopy (H5T_C_S1);
25 H5Tset_size (gb_type, 9);
26 field_type[1] = gb_type;
27
28 hid_t description_type = H5Tcopy (H5T_C_S1);
29 H5Tset_size (description_type, 196);
30 field_type[2] = description_type;
31
32 hid_t protein_type_type = H5Tcopy (H5T_C_S1);
33 H5Tset_size (protein_type_type, 7);
34 field_type[3] = protein_type_type;
35
36 return;
37}
diff --git a/src/sequence_data_init.h b/src/sequence_data_init.h
new file mode 100644
index 0000000..c87e7e6
--- a/dev/null
+++ b/src/sequence_data_init.h
@@ -0,0 +1,14 @@
1#ifndef SEQUENCE_DATA_INIT_H
2#define SEQUENCE_DATA_INIT_H
3
4#include <hdf5.h>
5
6/*
7 * Initialize the structures describing sequence_data. These
8 * descriptive structures are used by the HDF5 API.
9 */
10void
11sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
12 hid_t *field_type);
13
14#endif // SEQUENCE_DATA_INIT_H

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.