summaryrefslogtreecommitdiffstats
Unidiff
-rw-r--r--.gitignore5
-rw-r--r--src/Makefile.am10
-rw-r--r--src/assign_protein_type.c135
-rw-r--r--src/load_influenza_faa.c67
-rw-r--r--src/sequence_data.h16
-rw-r--r--src/sequence_data_init.c37
-rw-r--r--src/sequence_data_init.h14
7 files changed, 188 insertions, 96 deletions
diff --git a/.gitignore b/.gitignore
index b07ba15..4e963fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,8 @@ configure
14depcomp14depcomp
15install-sh15install-sh
16missing16missing
17stamp-h117src/.deps
18src/aggregator18src/aggregator
19src/influenza.h519src/influenza.h5
20src/.deps20src/updator
21stamp-h1
diff --git a/src/Makefile.am b/src/Makefile.am
index a7e6852..6dd8e16 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,14 +5,16 @@ aggregator_SOURCES = \
5 check_error.c \5 check_error.c \
6 check_h5_error.c \6 check_h5_error.c \
7 load_influenza_aa_dat.c \7 load_influenza_aa_dat.c \
8 load_influenza_faa.c8 load_influenza_faa.c \
9 sequence_data_init.c
910
10updator_SOURCES = \11updator_SOURCES = \
11 updator.c \12 updator.c \
12 check_error.c \13 check_error.c \
13 check_h5_error.c \14 check_h5_error.c \
14 check_ncbi_error.c \15 check_ncbi_error.c \
15 assign_protein_type.c16 assign_protein_type.c \
17 sequence_data_init.c
1618
17noinst_HEADERS = \19noinst_HEADERS = \
18 assign_protein_type.h \20 assign_protein_type.h \
@@ -20,6 +22,8 @@ noinst_HEADERS = \
20 check_h5_error.h \22 check_h5_error.h \
21 check_ncbi_error.h \23 check_ncbi_error.h \
22 load_influenza_aa_dat.h \24 load_influenza_aa_dat.h \
23 load_influenza_faa.h25 load_influenza_faa.h \
26 sequence_data.h \
27 sequence_data_init.h
2428
25AM_CFLAGS = -Wall -std=gnu99 -ggdb29AM_CFLAGS = -Wall -std=gnu99 -ggdb
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c
index 1b58f54..166e787 100644
--- a/src/assign_protein_type.c
+++ b/src/assign_protein_type.c
@@ -1,12 +1,15 @@
1#include "assign_protein_type.h"1#include "assign_protein_type.h"
2#include "check_ncbi_error.h"2#include "check_ncbi_error.h"
3#include "check_h5_error.h"3#include "check_h5_error.h"
4#include "sequence_data.h"
5#include "sequence_data_init.h"
4#include <ncbi.h>6#include <ncbi.h>
5#include <readdb.h>7#include <readdb.h>
6#include <blast.h>8#include <blast.h>
7#include <salpacc.h>9#include <salpacc.h>
8#include <stdbool.h>10#include <stdbool.h>
9#include <hdf5_hl.h>11#include <hdf5_hl.h>
12#include <error.h>
1013
11/*14/*
12 * BLAST database containing all of the influenza protein sequences.15 * BLAST database containing all of the influenza protein sequences.
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id)
44 * Get default BLAST options.47 * Get default BLAST options.
45 */48 */
46 BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);49 BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
50
51 /*
52 * Set to single hit mode.
53 */
54 options->window_size = 0;
55 options->multiple_hits_only = false;
56
47 ValNodePtr error_returns = NULL;57 ValNodePtr error_returns = NULL;
4858
49 /*59 /*
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id)
56 if (status < 0)66 if (status < 0)
57 check_h5_error (status, __FILE__, __LINE__);67 check_h5_error (status, __FILE__, __LINE__);
5868
59 /*69 sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);
60 * todo: Allocate memory of nrecords for dst_buf.70
61 *71 size_t dst_size;
62 * todo: Refactor code to share structres in read and write HDF572 size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
63 * calls.73 size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
64 */74 hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
75 sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
6576
66 status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,77 status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
67 dst_sizes, dst_buf);78 dst_sizes, dst_buf);
68 if (status < 0)79 if (status < 0)
69 check_h5_error (status, __FILE__, __LINE__);80 check_h5_error (status, __FILE__, __LINE__);
7081
71 for (int i = 0; i < nrecords; i++)
72 {
73
74 }
75
76 /*82 /*
77 * Read the sequence from the database by GI.83 * Assign protein types to records for which the field is empty.
78 */84 */
79 Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);85 printf ("Records to process: %i\n", (int)nrecords);
80 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);86 for (int i = 0; i < nrecords; i++)
81
82 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
83 "blastp",
84 REFDB,
85 options,
86 NULL,
87 &error_returns,
88 NULL);
89
90 if (error_returns != NULL)
91 check_ncbi_error (error_returns, __FILE__, __LINE__);
92
93 if (seqalign != NULL)
94 {87 {
95 Char target_id_buf[BUFFER_LEN + 1];
96 SeqIdPtr target_id = SeqAlignId (seqalign, 1);
97 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
98 printf ("%s\n", target_id_buf);
99 }
100
101 // Clean up memory for the next ieration.
102 seqalign = SeqAlignSetFree (seqalign);
103 bsp = BioseqFree (bsp);
10488
89 if (dst_buf[i].protein_type[0] == '\0') {
90 /*
91 * Read the sequence from the database by GI.
92 */
93 Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
94 BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
95
96 SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
97 "blastp",
98 REFDB,
99 options,
100 NULL,
101 &error_returns,
102 NULL);
103
104 /*
105 * BLAST reported an error. Write it out and continue processing.
106 */
107 if (error_returns != NULL)
108 {
109 printf ("Warning: An error has been reported by the NCBI Toolkit API "
110 "for sequence gi|%i: %s",
111 dst_buf[i].gi, BlastErrorToString (error_returns));
112
113 }
114
115 /*
116 * A hit was found. Record the first hit as the protein type.
117 * Skip the first 6 characters and eat the "lcl|x_".
118 */
119 else if (seqalign != NULL)
120 {
121 Char target_id_buf[BUFFER_LEN + 1];
122 SeqIdPtr target_id = SeqAlignId (seqalign, 1);
123 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
124 strncpy (dst_buf[i].protein_type, &target_id_buf[6],
125 sizeof (dst_buf[i].protein_type));
126 }
127
128 /*
129 * BLAST did not find any hits.
130 */
131 else
132 {
133 printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
134 dst_buf[i].gi);
135 }
136
137 /*
138 * Clean up memory for the next ieration.
139 */
140 seqalign = SeqAlignSetFree (seqalign);
141 bsp = BioseqFree (bsp);
142
143 /*
144 * Write the data out to the file.
145 */
146 if ( (i % 1000 == 0) && (i > 0) )
147 {
148 status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
149 dst_size, dst_offset, dst_sizes,
150 &dst_buf[i-1000]);
151 if (status < 0)
152 check_h5_error (status, __FILE__, __LINE__);
153
154 status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
155 if (status < 0)
156 check_h5_error (status, __FILE__, __LINE__);
157
158 printf ("Processed %i of %i records.\n", i, (int)nrecords);
159 }
160 }
161
162 }
163
164 free (dst_buf);
165
105 options = BLASTOptionDelete (options);166 options = BLASTOptionDelete (options);
106167
107 return;168 return;
108}169}
diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c
index 749b7ad..fd35254 100644
--- a/src/load_influenza_faa.c
+++ b/src/load_influenza_faa.c
@@ -1,62 +1,22 @@
1#include "load_influenza_faa.h"
2#include "check_error.h"1#include "check_error.h"
3#include "check_h5_error.h"2#include "check_h5_error.h"
4#include "hdf5_hl.h"3#include "load_influenza_faa.h"
4#include "sequence_data.h"
5#include "sequence_data_init.h"
6#include <hdf5_hl.h>
5#include <string.h>7#include <string.h>
6#include <stdlib.h>8#include <stdlib.h>
79
8#define SEQUENCE_DATA_FIELD_NUM 4
9
10void10void
11load_influenza_faa (hid_t file_id)11load_influenza_faa (hid_t file_id)
12{12{
13 typedef struct13 size_t dst_size;
14 {14 size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
15 int gi;15 size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
16 char gb[9];
17 char description[196];
18 char protein_type[7];
19 } sequence_data;
20
21 size_t dst_size = sizeof (sequence_data);
22 size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] =
23 { HOFFSET (sequence_data, gi),
24 HOFFSET (sequence_data, gb),
25 HOFFSET (sequence_data, description),
26 HOFFSET (sequence_data, protein_type)
27 };
28
29 sequence_data dst_buf[1];
30
31 size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = {
32 sizeof (dst_buf[0].gi),
33 sizeof (dst_buf[0].gb),
34 sizeof (dst_buf[0].description),
35 sizeof (dst_buf[0].protein_type)
36 };
37
38 hid_t field_type[SEQUENCE_DATA_FIELD_NUM];16 hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
3917
40 field_type[0] = H5T_NATIVE_INT;18 sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
4119
42 hid_t gb_type = H5Tcopy (H5T_C_S1);
43 H5Tset_size (gb_type, 9);
44 field_type[1] = gb_type;
45
46 hid_t description_type = H5Tcopy (H5T_C_S1);
47 H5Tset_size (description_type, 196);
48 field_type[2] = description_type;
49
50 hid_t protein_type_type = H5Tcopy (H5T_C_S1);
51 H5Tset_size (protein_type_type, 7);
52 field_type[3] = protein_type_type;
53
54 const char *field_names[SEQUENCE_DATA_FIELD_NUM] =
55 { "GI",
56 "GB",
57 "Description",
58 "Protein Type" };
59
60 hsize_t chunk_size = 10;20 hsize_t chunk_size = 10;
61 int *fill_data = NULL;21 int *fill_data = NULL;
62 int compress = 0;22 int compress = 0;
@@ -99,12 +59,15 @@ load_influenza_faa (hid_t file_id)
9959
100 strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));60 strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));
10161
62 const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =
63 SEQUENCE_DATA_FIELD_NAMES;
64
102 if (current_line == 1)65 if (current_line == 1)
103 {66 {
104 herr_t status = H5TBmake_table ("influenza.faa", file_id,67 herr_t status = H5TBmake_table ("influenza.faa", file_id,
105 "influenza.faa",68 "influenza.faa",
106 SEQUENCE_DATA_FIELD_NUM, 1,69 SEQUENCE_DATA_FIELD_NUM, 1,
107 dst_size, field_names,70 dst_size, sequence_data_field_names,
108 dst_offset, field_type,71 dst_offset, field_type,
109 chunk_size, fill_data, compress,72 chunk_size, fill_data, compress,
110 &p_data);73 &p_data);
@@ -132,9 +95,5 @@ load_influenza_faa (hid_t file_id)
13295
133 fclose (dat);96 fclose (dat);
13497
135 H5Tclose (gb_type);
136 H5Tclose (description_type);
137 H5Tclose (protein_type_type);
138
139 return;98 return;
140}99}
diff --git a/src/sequence_data.h b/src/sequence_data.h
new file mode 100644
index 0000000..fb4ff78
--- a/dev/null
+++ b/src/sequence_data.h
@@ -0,0 +1,16 @@
1#ifndef SEQUENCE_DATA_H
2#define SEQUENCE_DATA_H
3
4#define SEQUENCE_DATA_FIELD_NUM 4
5
6#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" }
7
8typedef struct
9{
10 int gi;
11 char gb[9];
12 char description[196];
13 char protein_type[7];
14} sequence_data;
15
16#endif // SEQUENCE_DATA_H
diff --git a/src/sequence_data_init.c b/src/sequence_data_init.c
new file mode 100644
index 0000000..09ba189
--- a/dev/null
+++ b/src/sequence_data_init.c
@@ -0,0 +1,37 @@
1#include "sequence_data_init.h"
2#include "sequence_data.h"
3
4void
5sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
6 hid_t *field_type)
7{
8 *dst_size = sizeof (sequence_data);
9
10 dst_offset[0] = HOFFSET (sequence_data, gi);
11 dst_offset[1] = HOFFSET (sequence_data, gb);
12 dst_offset[2] = HOFFSET (sequence_data, description);
13 dst_offset[3] = HOFFSET (sequence_data, protein_type);
14
15 sequence_data dst_buf[1];
16
17 dst_sizes[0] = sizeof (dst_buf[0].gi);
18 dst_sizes[1] = sizeof (dst_buf[0].gb);
19 dst_sizes[2] = sizeof (dst_buf[0].description);
20 dst_sizes[3] = sizeof (dst_buf[0].protein_type);
21
22 field_type[0] = H5T_NATIVE_INT;
23
24 hid_t gb_type = H5Tcopy (H5T_C_S1);
25 H5Tset_size (gb_type, 9);
26 field_type[1] = gb_type;
27
28 hid_t description_type = H5Tcopy (H5T_C_S1);
29 H5Tset_size (description_type, 196);
30 field_type[2] = description_type;
31
32 hid_t protein_type_type = H5Tcopy (H5T_C_S1);
33 H5Tset_size (protein_type_type, 7);
34 field_type[3] = protein_type_type;
35
36 return;
37}
diff --git a/src/sequence_data_init.h b/src/sequence_data_init.h
new file mode 100644
index 0000000..c87e7e6
--- a/dev/null
+++ b/src/sequence_data_init.h
@@ -0,0 +1,14 @@
1#ifndef SEQUENCE_DATA_INIT_H
2#define SEQUENCE_DATA_INIT_H
3
4#include <hdf5.h>
5
6/*
7 * Initialize the structures describing sequence_data. These
8 * descriptive structures are used by the HDF5 API.
9 */
10void
11sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
12 hid_t *field_type);
13
14#endif // SEQUENCE_DATA_INIT_H

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.