summaryrefslogtreecommitdiffstats
Unidiff
-rw-r--r--analysis/year.R20
-rw-r--r--src/Makefile.am39
-rw-r--r--src/aggregator.c6
-rw-r--r--src/assign/assign_protein_type.c (renamed from src/assign_protein_type.c)87
-rw-r--r--src/assign/assign_protein_type.h (renamed from src/assign_protein_type.h)0
-rw-r--r--src/error/check_error.c (renamed from src/check_error.c)0
-rw-r--r--src/error/check_error.h (renamed from src/check_error.h)0
-rw-r--r--src/error/check_h5_error.c (renamed from src/check_h5_error.c)0
-rw-r--r--src/error/check_h5_error.h (renamed from src/check_h5_error.h)0
-rw-r--r--src/error/check_ncbi_error.c (renamed from src/check_ncbi_error.c)0
-rw-r--r--src/error/check_ncbi_error.h (renamed from src/check_ncbi_error.h)0
-rw-r--r--src/load/load_influenza_aa_dat.c (renamed from src/load_influenza_aa_dat.c)4
-rw-r--r--src/load/load_influenza_aa_dat.h (renamed from src/load_influenza_aa_dat.h)0
-rw-r--r--src/load/load_influenza_faa.c (renamed from src/load_influenza_faa.c)10
-rw-r--r--src/load/load_influenza_faa.h (renamed from src/load_influenza_faa.h)0
-rw-r--r--src/model/gi_type_data.h21
-rw-r--r--src/model/gi_type_data_init.c36
-rw-r--r--src/model/gi_type_data_init.h14
-rw-r--r--src/model/sequence_data.h (renamed from src/sequence_data.h)5
-rw-r--r--src/model/sequence_data_init.c (renamed from src/sequence_data_init.c)6
-rw-r--r--src/model/sequence_data_init.h (renamed from src/sequence_data_init.h)0
-rw-r--r--src/updator.c4
22 files changed, 181 insertions, 71 deletions
diff --git a/analysis/year.R b/analysis/year.R
index b7da3c8..6d68925 100644
--- a/analysis/year.R
+++ b/analysis/year.R
@@ -5,21 +5,23 @@ require(hdf5);
5hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);5hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);
66
7A <- influenza.aa.dat;7A <- influenza.aa.dat;
8
9B <- influenza.faa;8B <- influenza.faa;
109
10# Join the two tables by GB value.
11C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB");
12
11# All records for 1918. Based on code from13# All records for 1918. Based on code from
12# http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations14# http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
13C <- A[A$Year == 1918, ]15D <- C[C$Year == 1918, ]
1416
15# Countries represented in the 1918 dataset.17summary (D);
16C$Country;
1718
18# All record with a year value.19# Countries represented in the 1918 dataset.
19D <- A[A$Year != 0, ];20D$Country;
2021
21hist(D$Year);22D[D$"Protein Type" == "HA", ]
2223
23B[B$GI == 305182, ]24# All record with a year value.
25E <- A[A$Year != 0, ];
2426
25B[B$"Protein Type" == "HA", ]27hist(E$Year);
diff --git a/src/Makefile.am b/src/Makefile.am
index 6dd8e16..06a6f5c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -2,28 +2,31 @@ bin_PROGRAMS = aggregator updator
22
3aggregator_SOURCES = \3aggregator_SOURCES = \
4 aggregator.c \4 aggregator.c \
5 check_error.c \5 error/check_error.c \
6 check_h5_error.c \6 error/check_h5_error.c \
7 load_influenza_aa_dat.c \7 load/load_influenza_aa_dat.c \
8 load_influenza_faa.c \8 load/load_influenza_faa.c \
9 sequence_data_init.c9 model/sequence_data_init.c
1010
11updator_SOURCES = \11updator_SOURCES = \
12 updator.c \12 updator.c \
13 check_error.c \13 assign/assign_protein_type.c \
14 check_h5_error.c \14 error/check_error.c \
15 check_ncbi_error.c \15 error/check_h5_error.c \
16 assign_protein_type.c \16 error/check_ncbi_error.c \
17 sequence_data_init.c17 model/gi_type_data_init.c \
18 model/sequence_data_init.c
1819
19noinst_HEADERS = \20noinst_HEADERS = \
20 assign_protein_type.h \21 assign/assign_protein_type.h \
21 check_error.h \22 error/check_error.h \
22 check_h5_error.h \23 error/check_h5_error.h \
23 check_ncbi_error.h \24 error/check_ncbi_error.h \
24 load_influenza_aa_dat.h \25 load/load_influenza_aa_dat.h \
25 load_influenza_faa.h \26 load/load_influenza_faa.h \
26 sequence_data.h \27 model/gi_type_data.h \
27 sequence_data_init.h28 model/gi_type_data_init.h \
29 model/sequence_data.h \
30 model/sequence_data_init.h
2831
29AM_CFLAGS = -Wall -std=gnu99 -ggdb32AM_CFLAGS = -Wall -std=gnu99 -ggdb
diff --git a/src/aggregator.c b/src/aggregator.c
index 07aba39..c00d912 100644
--- a/src/aggregator.c
+++ b/src/aggregator.c
@@ -3,9 +3,9 @@
3 * container.3 * container.
4 */4 */
55
6#include "check_h5_error.h"6#include "error/check_h5_error.h"
7#include "load_influenza_aa_dat.h"7#include "load/load_influenza_aa_dat.h"
8#include "load_influenza_faa.h"8#include "load/load_influenza_faa.h"
99
10#define FILE "influenza.h5"10#define FILE "influenza.h5"
1111
diff --git a/src/assign_protein_type.c b/src/assign/assign_protein_type.c
index ec3a959..0a11c23 100644
--- a/src/assign_protein_type.c
+++ b/src/assign/assign_protein_type.c
@@ -1,8 +1,10 @@
1#include "assign_protein_type.h"1#include "assign_protein_type.h"
2#include "check_ncbi_error.h"2#include "error/check_ncbi_error.h"
3#include "check_h5_error.h"3#include "error/check_h5_error.h"
4#include "sequence_data.h"4#include "model/gi_type_data.h"
5#include "sequence_data_init.h"5#include "model/gi_type_data_init.h"
6#include "model/sequence_data.h"
7#include "model/sequence_data_init.h"
6#include <ncbi.h>8#include <ncbi.h>
7#include <readdb.h>9#include <readdb.h>
8#include <blast.h>10#include <blast.h>
@@ -12,6 +14,16 @@
12#include <error.h>14#include <error.h>
1315
14/*16/*
17 * todo: Write results to an independent table by GI so that they can
18 * be reused when the input tables influenza.faa and influenza_aa.dat
19 * are replaced with new data. Since GI values are unique and
20 * persistent the assignment of a protein type should stay valid
21 * unless the list of reference sequences is updated.
22 *
23 * todo: Save the species type as a new field { A, B, C }.
24 */
25
26/*
15 * BLAST database containing all of the influenza protein sequences.27 * BLAST database containing all of the influenza protein sequences.
16 */28 */
17#define SEQDB "/home/don/exp004/influenzadb/influenzadb"29#define SEQDB "/home/don/exp004/influenzadb/influenzadb"
@@ -57,34 +69,57 @@ assign_protein_type (hid_t file_id)
57 ValNodePtr error_returns = NULL;69 ValNodePtr error_returns = NULL;
5870
59 /*71 /*
60 * Read the data from HDF5 file.72 * Read the data from HDF5 influenza.faa.
61 */73 */
62 hsize_t nfields;74 hsize_t faa_nfields;
63 hsize_t nrecords;75 hsize_t faa_nrecords;
64 herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields,76 herr_t status = H5TBget_table_info (file_id, "influenza.faa", &faa_nfields,
65 &nrecords);77 &faa_nrecords);
78 if (status < 0)
79 check_h5_error (status, __FILE__, __LINE__);
80
81 sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords);
82
83 size_t faa_size;
84 size_t faa_offset[SEQUENCE_DATA_FIELD_NUM];
85 size_t faa_sizes[SEQUENCE_DATA_FIELD_NUM];
86 hid_t faa_field_type[SEQUENCE_DATA_FIELD_NUM];
87 sequence_data_init (&faa_size, faa_offset, faa_sizes, faa_field_type);
88
89 status = H5TBread_table (file_id, "influenza.faa", faa_size, faa_offset,
90 faa_sizes, faa_buf);
66 if (status < 0)91 if (status < 0)
67 check_h5_error (status, __FILE__, __LINE__);92 check_h5_error (status, __FILE__, __LINE__);
6893
69 sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);94 /*
95 * Read the data from HDF5 gi_type_data.
96 */
97 hsize_t gi_nfields;
98 hsize_t gi_nrecords;
99 status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields,
100 &gi_nrecords);
101 if (status < 0)
102 check_h5_error (status, __FILE__, __LINE__);
103
104 gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
70105
71 size_t dst_size;106 size_t gi_size;
72 size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];107 size_t gi_offset[GI_TYPE_DATA_FIELD_NUM];
73 size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];108 size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM];
74 hid_t field_type[SEQUENCE_DATA_FIELD_NUM];109 hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
75 sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); 110 gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
76111
77 status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,112 status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
78 dst_sizes, dst_buf);113 gi_sizes, gi_buf);
79 if (status < 0)114 if (status < 0)
80 check_h5_error (status, __FILE__, __LINE__);115 check_h5_error (status, __FILE__, __LINE__);
81116
82 /*117 /*
83 * Assign protein types to records for which the field is empty.118 * Assign protein types to records for which the field is empty.
84 */119 */
85 printf ("Records to process: %i\n", (int)nrecords);120 printf ("Records to process: %i\n", (int)faa_nrecords);
86 bool updates_pending = false;121 bool updates_pending = false;
87 for (int i = 0; i < nrecords; i++)122 for (int i = 0; i < faa_nrecords; i++)
88 {123 {
89124
90 if (dst_buf[i].protein_type[0] == '\0') {125 if (dst_buf[i].protein_type[0] == '\0') {
@@ -130,8 +165,15 @@ assign_protein_type (hid_t file_id)
130 Char target_id_buf[BUFFER_LEN + 1];165 Char target_id_buf[BUFFER_LEN + 1];
131 SeqIdPtr target_id = SeqAlignId (seqalign, 1);166 SeqIdPtr target_id = SeqAlignId (seqalign, 1);
132 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);167 SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
133 strncpy (dst_buf[i].protein_type, &target_id_buf[6], 168
134 sizeof (dst_buf[i].protein_type));169 // Species Type
170 dst_buf[i].type[0] = &target_id_buf[4];
171 dst_buf[i].type[1] = '\0';
172
173 // Protein Type
174 strncpy (dst_buf[i].protein, &target_id_buf[6],
175 sizeof (dst_buf[i].protein));
176
135 updates_pending = true;177 updates_pending = true;
136 }178 }
137179
@@ -202,7 +244,8 @@ assign_protein_type (hid_t file_id)
202 updates_pending = false;244 updates_pending = false;
203 }245 }
204 246
205 free (dst_buf);247 free (faa_buf);
248 free (gi_buf);
206 249
207 options = BLASTOptionDelete (options);250 options = BLASTOptionDelete (options);
208 251
diff --git a/src/assign_protein_type.h b/src/assign/assign_protein_type.h
index 221154f..221154f 100644
--- a/src/assign_protein_type.h
+++ b/src/assign/assign_protein_type.h
diff --git a/src/check_error.c b/src/error/check_error.c
index 4630b50..4630b50 100644
--- a/src/check_error.c
+++ b/src/error/check_error.c
diff --git a/src/check_error.h b/src/error/check_error.h
index 2250c59..2250c59 100644
--- a/src/check_error.h
+++ b/src/error/check_error.h
diff --git a/src/check_h5_error.c b/src/error/check_h5_error.c
index d90b21f..d90b21f 100644
--- a/src/check_h5_error.c
+++ b/src/error/check_h5_error.c
diff --git a/src/check_h5_error.h b/src/error/check_h5_error.h
index e460e97..e460e97 100644
--- a/src/check_h5_error.h
+++ b/src/error/check_h5_error.h
diff --git a/src/check_ncbi_error.c b/src/error/check_ncbi_error.c
index 6071d1a..6071d1a 100644
--- a/src/check_ncbi_error.c
+++ b/src/error/check_ncbi_error.c
diff --git a/src/check_ncbi_error.h b/src/error/check_ncbi_error.h
index 45ac0ca..45ac0ca 100644
--- a/src/check_ncbi_error.h
+++ b/src/error/check_ncbi_error.h
diff --git a/src/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c
index aed33e8..8bf47aa 100644
--- a/src/load_influenza_aa_dat.c
+++ b/src/load/load_influenza_aa_dat.c
@@ -6,8 +6,8 @@
6 */6 */
77
8#include "load_influenza_aa_dat.h"8#include "load_influenza_aa_dat.h"
9#include "check_error.h"9#include "error/check_error.h"
10#include "check_h5_error.h"10#include "error/check_h5_error.h"
11#include <hdf5_hl.h>11#include <hdf5_hl.h>
12#include <string.h>12#include <string.h>
13#include <stdlib.h>13#include <stdlib.h>
diff --git a/src/load_influenza_aa_dat.h b/src/load/load_influenza_aa_dat.h
index f6c60be..f6c60be 100644
--- a/src/load_influenza_aa_dat.h
+++ b/src/load/load_influenza_aa_dat.h
diff --git a/src/load_influenza_faa.c b/src/load/load_influenza_faa.c
index fd35254..a217989 100644
--- a/src/load_influenza_faa.c
+++ b/src/load/load_influenza_faa.c
@@ -1,8 +1,8 @@
1#include "check_error.h"1#include "error/check_error.h"
2#include "check_h5_error.h"2#include "error/check_h5_error.h"
3#include "load_influenza_faa.h"3#include "load_influenza_faa.h"
4#include "sequence_data.h"4#include "model/sequence_data.h"
5#include "sequence_data_init.h"5#include "model/sequence_data_init.h"
6#include <hdf5_hl.h>6#include <hdf5_hl.h>
7#include <string.h>7#include <string.h>
8#include <stdlib.h>8#include <stdlib.h>
@@ -57,8 +57,6 @@ load_influenza_faa (hid_t file_id)
57 strncpy (p_data.description, strsep (&running, "|"),57 strncpy (p_data.description, strsep (&running, "|"),
58 sizeof (p_data.description));58 sizeof (p_data.description));
5959
60 strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));
61
62 const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =60 const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =
63 SEQUENCE_DATA_FIELD_NAMES;61 SEQUENCE_DATA_FIELD_NAMES;
6462
diff --git a/src/load_influenza_faa.h b/src/load/load_influenza_faa.h
index 569c411..569c411 100644
--- a/src/load_influenza_faa.h
+++ b/src/load/load_influenza_faa.h
diff --git a/src/model/gi_type_data.h b/src/model/gi_type_data.h
new file mode 100644
index 0000000..bf149c3
--- a/dev/null
+++ b/src/model/gi_type_data.h
@@ -0,0 +1,21 @@
1#ifndef GI_TYPE_DATA_H
2#define GI_TYPE_DATA_H
3
4#define GI_TYPE_DATA_FIELD_NUM 3
5
6/*
7 * Field names selected to match the names of fields on the "Influenza
8 * Virus Resource - Influenza Virus Sequence Database" search form
9 * online at
10 * http://www.ncbi.nlm.nih.gov/genomes/FLU/Database/select.cgi?go=database.
11 */
12#define GI_TYPE_DATA_FIELD_NAMES { "GI", "Type", "Protein" }
13
14typedef struct
15{
16 int gi;
17 char type[2];
18 char protein[7];
19} gi_type_data;
20
21#endif // GI_TYPE_DATA_H
diff --git a/src/model/gi_type_data_init.c b/src/model/gi_type_data_init.c
new file mode 100644
index 0000000..4a161c7
--- a/dev/null
+++ b/src/model/gi_type_data_init.c
@@ -0,0 +1,36 @@
1#include "gi_type_data_init.h"
2#include "gi_type_data.h"
3
4/*
5 * There should be some way to automate the generation of these values
6 * or at least this source file based on just the definition of the
7 * struct. Perhaps an HDF5 precompiler could do such a thing.
8 */
9void
10gi_type_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
11 hid_t *field_type)
12{
13 *dst_size = sizeof (gi_type_data);
14
15 dst_offset[0] = HOFFSET (gi_type_data, gi);
16 dst_offset[1] = HOFFSET (gi_type_data, type);
17 dst_offset[2] = HOFFSET (gi_type_data, protein);
18
19 gi_type_data dst_buf[1];
20
21 dst_sizes[0] = sizeof (dst_buf[0].gi);
22 dst_sizes[1] = sizeof (dst_buf[0].type);
23 dst_sizes[2] = sizeof (dst_buf[0].protein);
24
25 field_type[0] = H5T_NATIVE_INT;
26
27 hid_t type_h5t = H5Tcopy (H5T_C_S1);
28 H5Tset_size (type_h5t, 2);
29 field_type[1] = type_h5t;
30
31 hid_t protein_h5t = H5Tcopy (H5T_C_S1);
32 H5Tset_size (protein_h5t, 7);
33 field_type[2] = protein_h5t;
34
35 return;
36}
diff --git a/src/model/gi_type_data_init.h b/src/model/gi_type_data_init.h
new file mode 100644
index 0000000..5c45cba
--- a/dev/null
+++ b/src/model/gi_type_data_init.h
@@ -0,0 +1,14 @@
1#ifndef GI_TYPE_DATA_INIT_H
2#define GI_TYPE_DATA_INIT_H
3
4#include <hdf5.h>
5
6/*
7 * Initialize the structures describing the struct. These descriptive
8 * structures are used by the HDF5 API.
9 */
10void
11gi_type_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
12 hid_t *field_type);
13
14#endif // GI_TYPE_DATA_INIT_H
diff --git a/src/sequence_data.h b/src/model/sequence_data.h
index fb4ff78..9db91ac 100644
--- a/src/sequence_data.h
+++ b/src/model/sequence_data.h
@@ -1,16 +1,15 @@
1#ifndef SEQUENCE_DATA_H1#ifndef SEQUENCE_DATA_H
2#define SEQUENCE_DATA_H2#define SEQUENCE_DATA_H
33
4#define SEQUENCE_DATA_FIELD_NUM 44#define SEQUENCE_DATA_FIELD_NUM 3
55
6#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" }6#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description" }
77
8typedef struct8typedef struct
9{9{
10 int gi;10 int gi;
11 char gb[9];11 char gb[9];
12 char description[196];12 char description[196];
13 char protein_type[7];
14} sequence_data;13} sequence_data;
1514
16#endif // SEQUENCE_DATA_H15#endif // SEQUENCE_DATA_H
diff --git a/src/sequence_data_init.c b/src/model/sequence_data_init.c
index 09ba189..f6b3b1f 100644
--- a/src/sequence_data_init.c
+++ b/src/model/sequence_data_init.c
@@ -10,14 +10,12 @@ sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
10 dst_offset[0] = HOFFSET (sequence_data, gi);10 dst_offset[0] = HOFFSET (sequence_data, gi);
11 dst_offset[1] = HOFFSET (sequence_data, gb);11 dst_offset[1] = HOFFSET (sequence_data, gb);
12 dst_offset[2] = HOFFSET (sequence_data, description);12 dst_offset[2] = HOFFSET (sequence_data, description);
13 dst_offset[3] = HOFFSET (sequence_data, protein_type);
14 13
15 sequence_data dst_buf[1];14 sequence_data dst_buf[1];
16 15
17 dst_sizes[0] = sizeof (dst_buf[0].gi);16 dst_sizes[0] = sizeof (dst_buf[0].gi);
18 dst_sizes[1] = sizeof (dst_buf[0].gb);17 dst_sizes[1] = sizeof (dst_buf[0].gb);
19 dst_sizes[2] = sizeof (dst_buf[0].description);18 dst_sizes[2] = sizeof (dst_buf[0].description);
20 dst_sizes[3] = sizeof (dst_buf[0].protein_type);
21 19
22 field_type[0] = H5T_NATIVE_INT;20 field_type[0] = H5T_NATIVE_INT;
23 21
@@ -28,10 +26,6 @@ sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
28 hid_t description_type = H5Tcopy (H5T_C_S1);26 hid_t description_type = H5Tcopy (H5T_C_S1);
29 H5Tset_size (description_type, 196);27 H5Tset_size (description_type, 196);
30 field_type[2] = description_type;28 field_type[2] = description_type;
31
32 hid_t protein_type_type = H5Tcopy (H5T_C_S1);
33 H5Tset_size (protein_type_type, 7);
34 field_type[3] = protein_type_type;
3529
36 return;30 return;
37}31}
diff --git a/src/sequence_data_init.h b/src/model/sequence_data_init.h
index c87e7e6..c87e7e6 100644
--- a/src/sequence_data_init.h
+++ b/src/model/sequence_data_init.h
diff --git a/src/updator.c b/src/updator.c
index 8d556da..591d2f6 100644
--- a/src/updator.c
+++ b/src/updator.c
@@ -2,8 +2,8 @@
2 * Update derived fields.2 * Update derived fields.
3 */3 */
44
5#include "assign_protein_type.h"5#include "assign/assign_protein_type.h"
6#include "check_h5_error.h"6#include "error/check_h5_error.h"
77
8#define FILE "influenza.h5"8#define FILE "influenza.h5"
99

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.