-rw-r--r-- | src/assign_protein_type.c | 29 | ||||
-rw-r--r-- | src/load_influenza_aa_dat.c | 2 | ||||
-rw-r--r-- | src/load_influenza_faa.c | 42 |
3 files changed, 59 insertions, 14 deletions
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c index 54db84e..1b58f54 100644 --- a/src/assign_protein_type.c +++ b/src/assign_protein_type.c | |||
@@ -1,10 +1,12 @@ | |||
1 | #include "assign_protein_type.h" | 1 | #include "assign_protein_type.h" |
2 | #include "check_ncbi_error.h" | 2 | #include "check_ncbi_error.h" |
3 | #include "check_h5_error.h" | ||
3 | #include <ncbi.h> | 4 | #include <ncbi.h> |
4 | #include <readdb.h> | 5 | #include <readdb.h> |
5 | #include <blast.h> | 6 | #include <blast.h> |
6 | #include <salpacc.h> | 7 | #include <salpacc.h> |
7 | #include <stdbool.h> | 8 | #include <stdbool.h> |
9 | #include <hdf5_hl.h> | ||
8 | 10 | ||
9 | /* | 11 | /* |
10 | * BLAST database containing all of the influenza protein sequences. | 12 | * BLAST database containing all of the influenza protein sequences. |
@@ -45,6 +47,33 @@ assign_protein_type (hid_t file_id) | |||
45 | ValNodePtr error_returns = NULL; | 47 | ValNodePtr error_returns = NULL; |
46 | 48 | ||
47 | /* | 49 | /* |
50 | * Read the data from HDF5 file. | ||
51 | */ | ||
52 | hsize_t nfields; | ||
53 | hsize_t nrecords; | ||
54 | herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields, | ||
55 | &nrecords); | ||
56 | if (status < 0) | ||
57 | check_h5_error (status, __FILE__, __LINE__); | ||
58 | |||
59 | /* | ||
60 | * todo: Allocate memory of nrecords for dst_buf. | ||
61 | * | ||
62 | * todo: Refactor code to share structures in read and write HDF5 | ||
63 | * calls. | ||
64 | */ | ||
65 | |||
66 | status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, | ||
67 | dst_sizes, dst_buf); | ||
68 | if (status < 0) | ||
69 | check_h5_error (status, __FILE__, __LINE__); | ||
70 | |||
71 | for (int i = 0; i < nrecords; i++) | ||
72 | { | ||
73 | |||
74 | } | ||
75 | |||
76 | /* | ||
48 | * Read the sequence from the database by GI. | 77 | * Read the sequence from the database by GI. |
49 | */ | 78 | */ |
50 | Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL); | 79 | Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL); |
diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c index f0d9ee5..aed33e8 100644 --- a/src/load_influenza_aa_dat.c +++ b/src/load_influenza_aa_dat.c | |||
@@ -8,7 +8,7 @@ | |||
8 | #include "load_influenza_aa_dat.h" | 8 | #include "load_influenza_aa_dat.h" |
9 | #include "check_error.h" | 9 | #include "check_error.h" |
10 | #include "check_h5_error.h" | 10 | #include "check_h5_error.h" |
11 | #include "hdf5_hl.h" | 11 | #include <hdf5_hl.h> |
12 | #include <string.h> | 12 | #include <string.h> |
13 | #include <stdlib.h> | 13 | #include <stdlib.h> |
14 | 14 | ||
diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c index 61bb99d..749b7ad 100644 --- a/src/load_influenza_faa.c +++ b/src/load_influenza_faa.c | |||
@@ -5,6 +5,8 @@ | |||
5 | #include <string.h> | 5 | #include <string.h> |
6 | #include <stdlib.h> | 6 | #include <stdlib.h> |
7 | 7 | ||
8 | #define SEQUENCE_DATA_FIELD_NUM 4 | ||
9 | |||
8 | void | 10 | void |
9 | load_influenza_faa (hid_t file_id) | 11 | load_influenza_faa (hid_t file_id) |
10 | { | 12 | { |
@@ -13,24 +15,27 @@ load_influenza_faa (hid_t file_id) | |||
13 | int gi; | 15 | int gi; |
14 | char gb[9]; | 16 | char gb[9]; |
15 | char description[196]; | 17 | char description[196]; |
18 | char protein_type[7]; | ||
16 | } sequence_data; | 19 | } sequence_data; |
17 | 20 | ||
18 | size_t dst_size = sizeof (sequence_data); | 21 | size_t dst_size = sizeof (sequence_data); |
19 | size_t dst_offset[3] = | 22 | size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] = |
20 | { HOFFSET (sequence_data, gi), | 23 | { HOFFSET (sequence_data, gi), |
21 | HOFFSET (sequence_data, gb), | 24 | HOFFSET (sequence_data, gb), |
22 | HOFFSET (sequence_data, description) | 25 | HOFFSET (sequence_data, description), |
26 | HOFFSET (sequence_data, protein_type) | ||
23 | }; | 27 | }; |
24 | 28 | ||
25 | sequence_data dst_buf[1]; | 29 | sequence_data dst_buf[1]; |
26 | 30 | ||
27 | size_t dst_sizes[3] = { | 31 | size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = { |
28 | sizeof (dst_buf[0].gi), | 32 | sizeof (dst_buf[0].gi), |
29 | sizeof (dst_buf[0].gb), | 33 | sizeof (dst_buf[0].gb), |
30 | sizeof (dst_buf[0].description) | 34 | sizeof (dst_buf[0].description), |
35 | sizeof (dst_buf[0].protein_type) | ||
31 | }; | 36 | }; |
32 | 37 | ||
33 | hid_t field_type[3]; | 38 | hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; |
34 | 39 | ||
35 | field_type[0] = H5T_NATIVE_INT; | 40 | field_type[0] = H5T_NATIVE_INT; |
36 | 41 | ||
@@ -42,9 +47,15 @@ load_influenza_faa (hid_t file_id) | |||
42 | H5Tset_size (description_type, 196); | 47 | H5Tset_size (description_type, 196); |
43 | field_type[2] = description_type; | 48 | field_type[2] = description_type; |
44 | 49 | ||
45 | const char *field_names[3] = { "GI", | 50 | hid_t protein_type_type = H5Tcopy (H5T_C_S1); |
46 | "GB", | 51 | H5Tset_size (protein_type_type, 7); |
47 | "Description" }; | 52 | field_type[3] = protein_type_type; |
53 | |||
54 | const char *field_names[SEQUENCE_DATA_FIELD_NUM] = | ||
55 | { "GI", | ||
56 | "GB", | ||
57 | "Description", | ||
58 | "Protein Type" }; | ||
48 | 59 | ||
49 | hsize_t chunk_size = 10; | 60 | hsize_t chunk_size = 10; |
50 | int *fill_data = NULL; | 61 | int *fill_data = NULL; |
@@ -86,13 +97,17 @@ load_influenza_faa (hid_t file_id) | |||
86 | strncpy (p_data.description, strsep (&running, "|"), | 97 | strncpy (p_data.description, strsep (&running, "|"), |
87 | sizeof (p_data.description)); | 98 | sizeof (p_data.description)); |
88 | 99 | ||
100 | strncpy (p_data.protein_type, "", sizeof (p_data.protein_type)); | ||
101 | |||
89 | if (current_line == 1) | 102 | if (current_line == 1) |
90 | { | 103 | { |
91 | herr_t status = H5TBmake_table ("influenza.faa", file_id, | 104 | herr_t status = H5TBmake_table ("influenza.faa", file_id, |
92 | "influenza.faa", 3, 1, dst_size, | 105 | "influenza.faa", |
93 | field_names, dst_offset, | 106 | SEQUENCE_DATA_FIELD_NUM, 1, |
94 | field_type, chunk_size, | 107 | dst_size, field_names, |
95 | fill_data, compress, &p_data); | 108 | dst_offset, field_type, |
109 | chunk_size, fill_data, compress, | ||
110 | &p_data); | ||
96 | if (status < 0) | 111 | if (status < 0) |
97 | check_h5_error (status, __FILE__, __LINE__); | 112 | check_h5_error (status, __FILE__, __LINE__); |
98 | } | 113 | } |
@@ -119,6 +134,7 @@ load_influenza_faa (hid_t file_id) | |||
119 | 134 | ||
120 | H5Tclose (gb_type); | 135 | H5Tclose (gb_type); |
121 | H5Tclose (description_type); | 136 | H5Tclose (description_type); |
137 | H5Tclose (protein_type_type); | ||
122 | 138 | ||
123 | return; | 139 | return; |
124 | } | 140 | } |