author | Don Pellegrino <don@drexel.edu> | 2010-01-18 17:26:18 (GMT) |
---|---|---|
committer | Don Pellegrino <don@drexel.edu> | 2010-01-18 17:26:18 (GMT) |
commit | a7cc532248146968a9296be7db42830e35525afa (patch) (unidiff) | |
tree | 93c44ad4f1ac27a1daed4ad5c7fdcb76971c0221 | |
parent | 0d0e0886d17612fb7ebdb9110679d5b7bd5087be (diff) | |
download | exp007-a7cc532248146968a9296be7db42830e35525afa.zip exp007-a7cc532248146968a9296be7db42830e35525afa.tar.gz exp007-a7cc532248146968a9296be7db42830e35525afa.tar.bz2 |
Implemented updator to calculate and assign protein type values to
records which do not have them. This is a re-entrant process that
only updates missing records and skips quickly through existing
records. In addition premature termination loses at most 999 records
of work since every thousand records are written back to the file and
flushed to disk. Still to do is to write and flush the last set of
records in the final bin under 1000. Also refactored the
sequence_data structure so that it can be shared between HDF5 reading
and writing operations.
-rw-r--r-- | .gitignore | 5 | ||||
-rw-r--r-- | src/Makefile.am | 10 | ||||
-rw-r--r-- | src/assign_protein_type.c | 135 | ||||
-rw-r--r-- | src/load_influenza_faa.c | 67 | ||||
-rw-r--r-- | src/sequence_data.h | 16 | ||||
-rw-r--r-- | src/sequence_data_init.c | 37 | ||||
-rw-r--r-- | src/sequence_data_init.h | 14 |
7 files changed, 188 insertions, 96 deletions
@@ -14,7 +14,8 @@ configure | |||
14 | depcomp | 14 | depcomp |
15 | install-sh | 15 | install-sh |
16 | missing | 16 | missing |
17 | stamp-h1 | 17 | src/.deps |
18 | src/aggregator | 18 | src/aggregator |
19 | src/influenza.h5 | 19 | src/influenza.h5 |
20 | src/.deps | 20 | src/updator |
21 | stamp-h1 | ||
diff --git a/src/Makefile.am b/src/Makefile.am index a7e6852..6dd8e16 100644 --- a/src/Makefile.am +++ b/src/Makefile.am | |||
@@ -5,14 +5,16 @@ aggregator_SOURCES = \ | |||
5 | check_error.c \ | 5 | check_error.c \ |
6 | check_h5_error.c \ | 6 | check_h5_error.c \ |
7 | load_influenza_aa_dat.c \ | 7 | load_influenza_aa_dat.c \ |
8 | load_influenza_faa.c | 8 | load_influenza_faa.c \ |
9 | sequence_data_init.c | ||
9 | 10 | ||
10 | updator_SOURCES = \ | 11 | updator_SOURCES = \ |
11 | updator.c \ | 12 | updator.c \ |
12 | check_error.c \ | 13 | check_error.c \ |
13 | check_h5_error.c \ | 14 | check_h5_error.c \ |
14 | check_ncbi_error.c \ | 15 | check_ncbi_error.c \ |
15 | assign_protein_type.c | 16 | assign_protein_type.c \ |
17 | sequence_data_init.c | ||
16 | 18 | ||
17 | noinst_HEADERS = \ | 19 | noinst_HEADERS = \ |
18 | assign_protein_type.h \ | 20 | assign_protein_type.h \ |
@@ -20,6 +22,8 @@ noinst_HEADERS = \ | |||
20 | check_h5_error.h \ | 22 | check_h5_error.h \ |
21 | check_ncbi_error.h \ | 23 | check_ncbi_error.h \ |
22 | load_influenza_aa_dat.h \ | 24 | load_influenza_aa_dat.h \ |
23 | load_influenza_faa.h | 25 | load_influenza_faa.h \ |
26 | sequence_data.h \ | ||
27 | sequence_data_init.h | ||
24 | 28 | ||
25 | AM_CFLAGS = -Wall -std=gnu99 -ggdb | 29 | AM_CFLAGS = -Wall -std=gnu99 -ggdb |
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c index 1b58f54..166e787 100644 --- a/src/assign_protein_type.c +++ b/src/assign_protein_type.c | |||
@@ -1,12 +1,15 @@ | |||
1 | #include "assign_protein_type.h" | 1 | #include "assign_protein_type.h" |
2 | #include "check_ncbi_error.h" | 2 | #include "check_ncbi_error.h" |
3 | #include "check_h5_error.h" | 3 | #include "check_h5_error.h" |
4 | #include "sequence_data.h" | ||
5 | #include "sequence_data_init.h" | ||
4 | #include <ncbi.h> | 6 | #include <ncbi.h> |
5 | #include <readdb.h> | 7 | #include <readdb.h> |
6 | #include <blast.h> | 8 | #include <blast.h> |
7 | #include <salpacc.h> | 9 | #include <salpacc.h> |
8 | #include <stdbool.h> | 10 | #include <stdbool.h> |
9 | #include <hdf5_hl.h> | 11 | #include <hdf5_hl.h> |
12 | #include <error.h> | ||
10 | 13 | ||
11 | /* | 14 | /* |
12 | * BLAST database containing all of the influenza protein sequences. | 15 | * BLAST database containing all of the influenza protein sequences. |
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id) | |||
44 | * Get default BLAST options. | 47 | * Get default BLAST options. |
45 | */ | 48 | */ |
46 | BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); | 49 | BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true); |
50 | |||
51 | /* | ||
52 | * Set to single hit mode. | ||
53 | */ | ||
54 | options->window_size = 0; | ||
55 | options->multiple_hits_only = false; | ||
56 | |||
47 | ValNodePtr error_returns = NULL; | 57 | ValNodePtr error_returns = NULL; |
48 | 58 | ||
49 | /* | 59 | /* |
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id) | |||
56 | if (status < 0) | 66 | if (status < 0) |
57 | check_h5_error (status, __FILE__, __LINE__); | 67 | check_h5_error (status, __FILE__, __LINE__); |
58 | 68 | ||
59 | /* | 69 | sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords); |
60 | * todo: Allocate memory of nrecords for dst_buf. | 70 | |
61 | * | 71 | size_t dst_size; |
62 | * todo: Refactor code to share structres in read and write HDF5 | 72 | size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; |
63 | * calls. | 73 | size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM]; |
64 | */ | 74 | hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; |
75 | sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); | ||
65 | 76 | ||
66 | status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, | 77 | status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, |
67 | dst_sizes, dst_buf); | 78 | dst_sizes, dst_buf); |
68 | if (status < 0) | 79 | if (status < 0) |
69 | check_h5_error (status, __FILE__, __LINE__); | 80 | check_h5_error (status, __FILE__, __LINE__); |
70 | 81 | ||
71 | for (int i = 0; i < nrecords; i++) | ||
72 | { | ||
73 | |||
74 | } | ||
75 | |||
76 | /* | 82 | /* |
77 | * Read the sequence from the database by GI. | 83 | * Assign protein types to records for which the field is empty. |
78 | */ | 84 | */ |
79 | Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL); | 85 | printf ("Records to process: %i\n", (int)nrecords); |
80 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); | 86 | for (int i = 0; i < nrecords; i++) |
81 | |||
82 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, | ||
83 | "blastp", | ||
84 | REFDB, | ||
85 | options, | ||
86 | NULL, | ||
87 | &error_returns, | ||
88 | NULL); | ||
89 | |||
90 | if (error_returns != NULL) | ||
91 | check_ncbi_error (error_returns, __FILE__, __LINE__); | ||
92 | |||
93 | if (seqalign != NULL) | ||
94 | { | 87 | { |
95 | Char target_id_buf[BUFFER_LEN + 1]; | ||
96 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); | ||
97 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); | ||
98 | printf ("%s\n", target_id_buf); | ||
99 | } | ||
100 | |||
101 | // Clean up memory for the next ieration. | ||
102 | seqalign = SeqAlignSetFree (seqalign); | ||
103 | bsp = BioseqFree (bsp); | ||
104 | 88 | ||
89 | if (dst_buf[i].protein_type[0] == '\0') { | ||
90 | /* | ||
91 | * Read the sequence from the database by GI. | ||
92 | */ | ||
93 | Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL); | ||
94 | BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number); | ||
95 | |||
96 | SeqAlignPtr seqalign = BioseqBlastEngine (bsp, | ||
97 | "blastp", | ||
98 | REFDB, | ||
99 | options, | ||
100 | NULL, | ||
101 | &error_returns, | ||
102 | NULL); | ||
103 | |||
104 | /* | ||
105 | * BLAST reported an error. Write it out and continue processing. | ||
106 | */ | ||
107 | if (error_returns != NULL) | ||
108 | { | ||
109 | printf ("Warning: An error has been reported by the NCBI Toolkit API " | ||
110 | "for sequence gi|%i: %s", | ||
111 | dst_buf[i].gi, BlastErrorToString (error_returns)); | ||
112 | |||
113 | } | ||
114 | |||
115 | /* | ||
116 | * A hit was found. Record the first hit as the protein type. | ||
117 | * Skip the first 6 characters and eat the "lcl|x_". | ||
118 | */ | ||
119 | else if (seqalign != NULL) | ||
120 | { | ||
121 | Char target_id_buf[BUFFER_LEN + 1]; | ||
122 | SeqIdPtr target_id = SeqAlignId (seqalign, 1); | ||
123 | SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); | ||
124 | strncpy (dst_buf[i].protein_type, &target_id_buf[6], | ||
125 | sizeof (dst_buf[i].protein_type)); | ||
126 | } | ||
127 | |||
128 | /* | ||
129 | * BLAST did not find any hits. | ||
130 | */ | ||
131 | else | ||
132 | { | ||
133 | printf ("Warning: Unable to identify protein type for sequence gi|%i\n", | ||
134 | dst_buf[i].gi); | ||
135 | } | ||
136 | |||
137 | /* | ||
138 | * Clean up memory for the next ieration. | ||
139 | */ | ||
140 | seqalign = SeqAlignSetFree (seqalign); | ||
141 | bsp = BioseqFree (bsp); | ||
142 | |||
143 | /* | ||
144 | * Write the data out to the file. | ||
145 | */ | ||
146 | if ( (i % 1000 == 0) && (i > 0) ) | ||
147 | { | ||
148 | status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000, | ||
149 | dst_size, dst_offset, dst_sizes, | ||
150 | &dst_buf[i-1000]); | ||
151 | if (status < 0) | ||
152 | check_h5_error (status, __FILE__, __LINE__); | ||
153 | |||
154 | status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); | ||
155 | if (status < 0) | ||
156 | check_h5_error (status, __FILE__, __LINE__); | ||
157 | |||
158 | printf ("Processed %i of %i records.\n", i, (int)nrecords); | ||
159 | } | ||
160 | } | ||
161 | |||
162 | } | ||
163 | |||
164 | free (dst_buf); | ||
165 | |||
105 | options = BLASTOptionDelete (options); | 166 | options = BLASTOptionDelete (options); |
106 | 167 | ||
107 | return; | 168 | return; |
108 | } | 169 | } |
diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c index 749b7ad..fd35254 100644 --- a/src/load_influenza_faa.c +++ b/src/load_influenza_faa.c | |||
@@ -1,62 +1,22 @@ | |||
1 | #include "load_influenza_faa.h" | ||
2 | #include "check_error.h" | 1 | #include "check_error.h" |
3 | #include "check_h5_error.h" | 2 | #include "check_h5_error.h" |
4 | #include "hdf5_hl.h" | 3 | #include "load_influenza_faa.h" |
4 | #include "sequence_data.h" | ||
5 | #include "sequence_data_init.h" | ||
6 | #include <hdf5_hl.h> | ||
5 | #include <string.h> | 7 | #include <string.h> |
6 | #include <stdlib.h> | 8 | #include <stdlib.h> |
7 | 9 | ||
8 | #define SEQUENCE_DATA_FIELD_NUM 4 | ||
9 | |||
10 | void | 10 | void |
11 | load_influenza_faa (hid_t file_id) | 11 | load_influenza_faa (hid_t file_id) |
12 | { | 12 | { |
13 | typedef struct | 13 | size_t dst_size; |
14 | { | 14 | size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; |
15 | int gi; | 15 | size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM]; |
16 | char gb[9]; | ||
17 | char description[196]; | ||
18 | char protein_type[7]; | ||
19 | } sequence_data; | ||
20 | |||
21 | size_t dst_size = sizeof (sequence_data); | ||
22 | size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] = | ||
23 | { HOFFSET (sequence_data, gi), | ||
24 | HOFFSET (sequence_data, gb), | ||
25 | HOFFSET (sequence_data, description), | ||
26 | HOFFSET (sequence_data, protein_type) | ||
27 | }; | ||
28 | |||
29 | sequence_data dst_buf[1]; | ||
30 | |||
31 | size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = { | ||
32 | sizeof (dst_buf[0].gi), | ||
33 | sizeof (dst_buf[0].gb), | ||
34 | sizeof (dst_buf[0].description), | ||
35 | sizeof (dst_buf[0].protein_type) | ||
36 | }; | ||
37 | |||
38 | hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; | 16 | hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; |
39 | 17 | ||
40 | field_type[0] = H5T_NATIVE_INT; | 18 | sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); |
41 | 19 | ||
42 | hid_t gb_type = H5Tcopy (H5T_C_S1); | ||
43 | H5Tset_size (gb_type, 9); | ||
44 | field_type[1] = gb_type; | ||
45 | |||
46 | hid_t description_type = H5Tcopy (H5T_C_S1); | ||
47 | H5Tset_size (description_type, 196); | ||
48 | field_type[2] = description_type; | ||
49 | |||
50 | hid_t protein_type_type = H5Tcopy (H5T_C_S1); | ||
51 | H5Tset_size (protein_type_type, 7); | ||
52 | field_type[3] = protein_type_type; | ||
53 | |||
54 | const char *field_names[SEQUENCE_DATA_FIELD_NUM] = | ||
55 | { "GI", | ||
56 | "GB", | ||
57 | "Description", | ||
58 | "Protein Type" }; | ||
59 | |||
60 | hsize_t chunk_size = 10; | 20 | hsize_t chunk_size = 10; |
61 | int *fill_data = NULL; | 21 | int *fill_data = NULL; |
62 | int compress = 0; | 22 | int compress = 0; |
@@ -99,12 +59,15 @@ load_influenza_faa (hid_t file_id) | |||
99 | 59 | ||
100 | strncpy (p_data.protein_type, "", sizeof (p_data.protein_type)); | 60 | strncpy (p_data.protein_type, "", sizeof (p_data.protein_type)); |
101 | 61 | ||
62 | const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] = | ||
63 | SEQUENCE_DATA_FIELD_NAMES; | ||
64 | |||
102 | if (current_line == 1) | 65 | if (current_line == 1) |
103 | { | 66 | { |
104 | herr_t status = H5TBmake_table ("influenza.faa", file_id, | 67 | herr_t status = H5TBmake_table ("influenza.faa", file_id, |
105 | "influenza.faa", | 68 | "influenza.faa", |
106 | SEQUENCE_DATA_FIELD_NUM, 1, | 69 | SEQUENCE_DATA_FIELD_NUM, 1, |
107 | dst_size, field_names, | 70 | dst_size, sequence_data_field_names, |
108 | dst_offset, field_type, | 71 | dst_offset, field_type, |
109 | chunk_size, fill_data, compress, | 72 | chunk_size, fill_data, compress, |
110 | &p_data); | 73 | &p_data); |
@@ -132,9 +95,5 @@ load_influenza_faa (hid_t file_id) | |||
132 | 95 | ||
133 | fclose (dat); | 96 | fclose (dat); |
134 | 97 | ||
135 | H5Tclose (gb_type); | ||
136 | H5Tclose (description_type); | ||
137 | H5Tclose (protein_type_type); | ||
138 | |||
139 | return; | 98 | return; |
140 | } | 99 | } |
diff --git a/src/sequence_data.h b/src/sequence_data.h new file mode 100644 index 0000000..fb4ff78 --- a/dev/null +++ b/src/sequence_data.h | |||
@@ -0,0 +1,16 @@ | |||
1 | #ifndef SEQUENCE_DATA_H | ||
2 | #define SEQUENCE_DATA_H | ||
3 | |||
4 | #define SEQUENCE_DATA_FIELD_NUM 4 | ||
5 | |||
6 | #define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" } | ||
7 | |||
8 | typedef struct | ||
9 | { | ||
10 | int gi; | ||
11 | char gb[9]; | ||
12 | char description[196]; | ||
13 | char protein_type[7]; | ||
14 | } sequence_data; | ||
15 | |||
16 | #endif // SEQUENCE_DATA_H | ||
diff --git a/src/sequence_data_init.c b/src/sequence_data_init.c new file mode 100644 index 0000000..09ba189 --- a/dev/null +++ b/src/sequence_data_init.c | |||
@@ -0,0 +1,37 @@ | |||
1 | #include "sequence_data_init.h" | ||
2 | #include "sequence_data.h" | ||
3 | |||
4 | void | ||
5 | sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes, | ||
6 | hid_t *field_type) | ||
7 | { | ||
8 | *dst_size = sizeof (sequence_data); | ||
9 | |||
10 | dst_offset[0] = HOFFSET (sequence_data, gi); | ||
11 | dst_offset[1] = HOFFSET (sequence_data, gb); | ||
12 | dst_offset[2] = HOFFSET (sequence_data, description); | ||
13 | dst_offset[3] = HOFFSET (sequence_data, protein_type); | ||
14 | |||
15 | sequence_data dst_buf[1]; | ||
16 | |||
17 | dst_sizes[0] = sizeof (dst_buf[0].gi); | ||
18 | dst_sizes[1] = sizeof (dst_buf[0].gb); | ||
19 | dst_sizes[2] = sizeof (dst_buf[0].description); | ||
20 | dst_sizes[3] = sizeof (dst_buf[0].protein_type); | ||
21 | |||
22 | field_type[0] = H5T_NATIVE_INT; | ||
23 | |||
24 | hid_t gb_type = H5Tcopy (H5T_C_S1); | ||
25 | H5Tset_size (gb_type, 9); | ||
26 | field_type[1] = gb_type; | ||
27 | |||
28 | hid_t description_type = H5Tcopy (H5T_C_S1); | ||
29 | H5Tset_size (description_type, 196); | ||
30 | field_type[2] = description_type; | ||
31 | |||
32 | hid_t protein_type_type = H5Tcopy (H5T_C_S1); | ||
33 | H5Tset_size (protein_type_type, 7); | ||
34 | field_type[3] = protein_type_type; | ||
35 | |||
36 | return; | ||
37 | } | ||
diff --git a/src/sequence_data_init.h b/src/sequence_data_init.h new file mode 100644 index 0000000..c87e7e6 --- a/dev/null +++ b/src/sequence_data_init.h | |||
@@ -0,0 +1,14 @@ | |||
1 | #ifndef SEQUENCE_DATA_INIT_H | ||
2 | #define SEQUENCE_DATA_INIT_H | ||
3 | |||
4 | #include <hdf5.h> | ||
5 | |||
6 | /* | ||
7 | * Initialize the structures describing sequence_data. These | ||
8 | * descriptive structures are used by the HDF5 API. | ||
9 | */ | ||
10 | void | ||
11 | sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes, | ||
12 | hid_t *field_type); | ||
13 | |||
14 | #endif // SEQUENCE_DATA_INIT_H | ||