summaryrefslogtreecommitdiffstats
authorDon Pellegrino <don@drexel.edu>2010-01-18 23:31:27 (GMT)
committer Don Pellegrino <don@drexel.edu>2010-01-18 23:31:27 (GMT)
commitd32c4e84c2969303b90474b19714efdd00ddb8f9 (patch) (side-by-side diff)
treea3c2c79fb3411199b0b9595a2b17ec26fd551392
parent2c848c05f549d4ef40e51e3b55b786706500eac4 (diff)
downloadexp007-d32c4e84c2969303b90474b19714efdd00ddb8f9.zip
exp007-d32c4e84c2969303b90474b19714efdd00ddb8f9.tar.gz
exp007-d32c4e84c2969303b90474b19714efdd00ddb8f9.tar.bz2
Organzied the functions into assign, error, load and model subdirectories. Code does not compile as the transition to storing species and protein type data in a separate HDF5 table is incomplete.
-rw-r--r--analysis/year.R20
-rw-r--r--src/Makefile.am39
-rw-r--r--src/aggregator.c6
-rw-r--r--src/assign/assign_protein_type.c (renamed from src/assign_protein_type.c)87
-rw-r--r--src/assign/assign_protein_type.h (renamed from src/assign_protein_type.h)0
-rw-r--r--src/error/check_error.c (renamed from src/check_error.c)0
-rw-r--r--src/error/check_error.h (renamed from src/check_error.h)0
-rw-r--r--src/error/check_h5_error.c (renamed from src/check_h5_error.c)0
-rw-r--r--src/error/check_h5_error.h (renamed from src/check_h5_error.h)0
-rw-r--r--src/error/check_ncbi_error.c (renamed from src/check_ncbi_error.c)0
-rw-r--r--src/error/check_ncbi_error.h (renamed from src/check_ncbi_error.h)0
-rw-r--r--src/load/load_influenza_aa_dat.c (renamed from src/load_influenza_aa_dat.c)4
-rw-r--r--src/load/load_influenza_aa_dat.h (renamed from src/load_influenza_aa_dat.h)0
-rw-r--r--src/load/load_influenza_faa.c (renamed from src/load_influenza_faa.c)10
-rw-r--r--src/load/load_influenza_faa.h (renamed from src/load_influenza_faa.h)0
-rw-r--r--src/model/gi_type_data.h21
-rw-r--r--src/model/gi_type_data_init.c36
-rw-r--r--src/model/gi_type_data_init.h14
-rw-r--r--src/model/sequence_data.h (renamed from src/sequence_data.h)5
-rw-r--r--src/model/sequence_data_init.c (renamed from src/sequence_data_init.c)6
-rw-r--r--src/model/sequence_data_init.h (renamed from src/sequence_data_init.h)0
-rw-r--r--src/updator.c4
22 files changed, 181 insertions, 71 deletions
diff --git a/analysis/year.R b/analysis/year.R
index b7da3c8..6d68925 100644
--- a/analysis/year.R
+++ b/analysis/year.R
@@ -5,21 +5,23 @@ require(hdf5);
hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);
A <- influenza.aa.dat;
-
B <- influenza.faa;
+# Join the two tables by GB value.
+C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB");
+
# All records for 1918. Based on code from
# http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
-C <- A[A$Year == 1918, ]
+D <- C[C$Year == 1918, ]
-# Countries represented in the 1918 dataset.
-C$Country;
+summary (D);
-# All record with a year value.
-D <- A[A$Year != 0, ];
+# Countries represented in the 1918 dataset.
+D$Country;
-hist(D$Year);
+D[D$"Protein Type" == "HA", ]
-B[B$GI == 305182, ]
+# All record with a year value.
+E <- A[A$Year != 0, ];
-B[B$"Protein Type" == "HA", ]
+hist(E$Year);
diff --git a/src/Makefile.am b/src/Makefile.am
index 6dd8e16..06a6f5c 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -2,28 +2,31 @@ bin_PROGRAMS = aggregator updator
aggregator_SOURCES = \
aggregator.c \
- check_error.c \
- check_h5_error.c \
- load_influenza_aa_dat.c \
- load_influenza_faa.c \
- sequence_data_init.c
+ error/check_error.c \
+ error/check_h5_error.c \
+ load/load_influenza_aa_dat.c \
+ load/load_influenza_faa.c \
+ model/sequence_data_init.c
updator_SOURCES = \
updator.c \
- check_error.c \
- check_h5_error.c \
- check_ncbi_error.c \
- assign_protein_type.c \
- sequence_data_init.c
+ assign/assign_protein_type.c \
+ error/check_error.c \
+ error/check_h5_error.c \
+ error/check_ncbi_error.c \
+ model/gi_type_data_init.c \
+ model/sequence_data_init.c
noinst_HEADERS = \
- assign_protein_type.h \
- check_error.h \
- check_h5_error.h \
- check_ncbi_error.h \
- load_influenza_aa_dat.h \
- load_influenza_faa.h \
- sequence_data.h \
- sequence_data_init.h
+ assign/assign_protein_type.h \
+ error/check_error.h \
+ error/check_h5_error.h \
+ error/check_ncbi_error.h \
+ load/load_influenza_aa_dat.h \
+ load/load_influenza_faa.h \
+ model/gi_type_data.h \
+ model/gi_type_data_init.h \
+ model/sequence_data.h \
+ model/sequence_data_init.h
AM_CFLAGS = -Wall -std=gnu99 -ggdb
diff --git a/src/aggregator.c b/src/aggregator.c
index 07aba39..c00d912 100644
--- a/src/aggregator.c
+++ b/src/aggregator.c
@@ -3,9 +3,9 @@
* container.
*/
-#include "check_h5_error.h"
-#include "load_influenza_aa_dat.h"
-#include "load_influenza_faa.h"
+#include "error/check_h5_error.h"
+#include "load/load_influenza_aa_dat.h"
+#include "load/load_influenza_faa.h"
#define FILE "influenza.h5"
diff --git a/src/assign_protein_type.c b/src/assign/assign_protein_type.c
index ec3a959..0a11c23 100644
--- a/src/assign_protein_type.c
+++ b/src/assign/assign_protein_type.c
@@ -1,8 +1,10 @@
#include "assign_protein_type.h"
-#include "check_ncbi_error.h"
-#include "check_h5_error.h"
-#include "sequence_data.h"
-#include "sequence_data_init.h"
+#include "error/check_ncbi_error.h"
+#include "error/check_h5_error.h"
+#include "model/gi_type_data.h"
+#include "model/gi_type_data_init.h"
+#include "model/sequence_data.h"
+#include "model/sequence_data_init.h"
#include <ncbi.h>
#include <readdb.h>
#include <blast.h>
@@ -12,6 +14,16 @@
#include <error.h>
/*
+ * todo: Write results to an independent table by GI so that they can
+ * be reused when the input tables influenza.faa and influenza_aa.dat
+ * are replaced with new data. Since GI values are unique and
+ * persistent the assignment of a protein type should stay valid
+ * unless the list of reference sequences is updated.
+ *
+ * todo: Save the species type as a new field { A, B, C }.
+ */
+
+/*
* BLAST database containing all of the influenza protein sequences.
*/
#define SEQDB "/home/don/exp004/influenzadb/influenzadb"
@@ -57,34 +69,57 @@ assign_protein_type (hid_t file_id)
ValNodePtr error_returns = NULL;
/*
- * Read the data from HDF5 file.
+ * Read the data from HDF5 influenza.faa.
*/
- hsize_t nfields;
- hsize_t nrecords;
- herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields,
- &nrecords);
+ hsize_t faa_nfields;
+ hsize_t faa_nrecords;
+ herr_t status = H5TBget_table_info (file_id, "influenza.faa", &faa_nfields,
+ &faa_nrecords);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords);
+
+ size_t faa_size;
+ size_t faa_offset[SEQUENCE_DATA_FIELD_NUM];
+ size_t faa_sizes[SEQUENCE_DATA_FIELD_NUM];
+ hid_t faa_field_type[SEQUENCE_DATA_FIELD_NUM];
+ sequence_data_init (&faa_size, faa_offset, faa_sizes, faa_field_type);
+
+ status = H5TBread_table (file_id, "influenza.faa", faa_size, faa_offset,
+ faa_sizes, faa_buf);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
- sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);
+ /*
+ * Read the data from HDF5 gi_type_data.
+ */
+ hsize_t gi_nfields;
+ hsize_t gi_nrecords;
+ status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields,
+ &gi_nrecords);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
- size_t dst_size;
- size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
- size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
- hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
- sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
+ size_t gi_size;
+ size_t gi_offset[GI_TYPE_DATA_FIELD_NUM];
+ size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM];
+ hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
+ gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
- status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
- dst_sizes, dst_buf);
+ status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
+ gi_sizes, gi_buf);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
/*
* Assign protein types to records for which the field is empty.
*/
- printf ("Records to process: %i\n", (int)nrecords);
+ printf ("Records to process: %i\n", (int)faa_nrecords);
bool updates_pending = false;
- for (int i = 0; i < nrecords; i++)
+ for (int i = 0; i < faa_nrecords; i++)
{
if (dst_buf[i].protein_type[0] == '\0') {
@@ -130,8 +165,15 @@ assign_protein_type (hid_t file_id)
Char target_id_buf[BUFFER_LEN + 1];
SeqIdPtr target_id = SeqAlignId (seqalign, 1);
SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
- strncpy (dst_buf[i].protein_type, &target_id_buf[6],
- sizeof (dst_buf[i].protein_type));
+
+ // Species Type
+ dst_buf[i].type[0] = &target_id_buf[4];
+ dst_buf[i].type[1] = '\0';
+
+ // Protein Type
+ strncpy (dst_buf[i].protein, &target_id_buf[6],
+ sizeof (dst_buf[i].protein));
+
updates_pending = true;
}
@@ -202,7 +244,8 @@ assign_protein_type (hid_t file_id)
updates_pending = false;
}
- free (dst_buf);
+ free (faa_buf);
+ free (gi_buf);
options = BLASTOptionDelete (options);
diff --git a/src/assign_protein_type.h b/src/assign/assign_protein_type.h
index 221154f..221154f 100644
--- a/src/assign_protein_type.h
+++ b/src/assign/assign_protein_type.h
diff --git a/src/check_error.c b/src/error/check_error.c
index 4630b50..4630b50 100644
--- a/src/check_error.c
+++ b/src/error/check_error.c
diff --git a/src/check_error.h b/src/error/check_error.h
index 2250c59..2250c59 100644
--- a/src/check_error.h
+++ b/src/error/check_error.h
diff --git a/src/check_h5_error.c b/src/error/check_h5_error.c
index d90b21f..d90b21f 100644
--- a/src/check_h5_error.c
+++ b/src/error/check_h5_error.c
diff --git a/src/check_h5_error.h b/src/error/check_h5_error.h
index e460e97..e460e97 100644
--- a/src/check_h5_error.h
+++ b/src/error/check_h5_error.h
diff --git a/src/check_ncbi_error.c b/src/error/check_ncbi_error.c
index 6071d1a..6071d1a 100644
--- a/src/check_ncbi_error.c
+++ b/src/error/check_ncbi_error.c
diff --git a/src/check_ncbi_error.h b/src/error/check_ncbi_error.h
index 45ac0ca..45ac0ca 100644
--- a/src/check_ncbi_error.h
+++ b/src/error/check_ncbi_error.h
diff --git a/src/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c
index aed33e8..8bf47aa 100644
--- a/src/load_influenza_aa_dat.c
+++ b/src/load/load_influenza_aa_dat.c
@@ -6,8 +6,8 @@
*/
#include "load_influenza_aa_dat.h"
-#include "check_error.h"
-#include "check_h5_error.h"
+#include "error/check_error.h"
+#include "error/check_h5_error.h"
#include <hdf5_hl.h>
#include <string.h>
#include <stdlib.h>
diff --git a/src/load_influenza_aa_dat.h b/src/load/load_influenza_aa_dat.h
index f6c60be..f6c60be 100644
--- a/src/load_influenza_aa_dat.h
+++ b/src/load/load_influenza_aa_dat.h
diff --git a/src/load_influenza_faa.c b/src/load/load_influenza_faa.c
index fd35254..a217989 100644
--- a/src/load_influenza_faa.c
+++ b/src/load/load_influenza_faa.c
@@ -1,8 +1,8 @@
-#include "check_error.h"
-#include "check_h5_error.h"
+#include "error/check_error.h"
+#include "error/check_h5_error.h"
#include "load_influenza_faa.h"
-#include "sequence_data.h"
-#include "sequence_data_init.h"
+#include "model/sequence_data.h"
+#include "model/sequence_data_init.h"
#include <hdf5_hl.h>
#include <string.h>
#include <stdlib.h>
@@ -57,8 +57,6 @@ load_influenza_faa (hid_t file_id)
strncpy (p_data.description, strsep (&running, "|"),
sizeof (p_data.description));
- strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));
-
const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =
SEQUENCE_DATA_FIELD_NAMES;
diff --git a/src/load_influenza_faa.h b/src/load/load_influenza_faa.h
index 569c411..569c411 100644
--- a/src/load_influenza_faa.h
+++ b/src/load/load_influenza_faa.h
diff --git a/src/model/gi_type_data.h b/src/model/gi_type_data.h
new file mode 100644
index 0000000..bf149c3
--- a/dev/null
+++ b/src/model/gi_type_data.h
@@ -0,0 +1,21 @@
+#ifndef GI_TYPE_DATA_H
+#define GI_TYPE_DATA_H
+
+#define GI_TYPE_DATA_FIELD_NUM 3
+
+/*
+ * Field names selected to match the names of fields on the "Influenza
+ * Virus Resource - Influenza Virus Sequence Database" search form
+ * online at
+ * http://www.ncbi.nlm.nih.gov/genomes/FLU/Database/select.cgi?go=database.
+ */
+#define GI_TYPE_DATA_FIELD_NAMES { "GI", "Type", "Protein" }
+
+typedef struct
+{
+ int gi;
+ char type[2];
+ char protein[7];
+} gi_type_data;
+
+#endif // GI_TYPE_DATA_H
diff --git a/src/model/gi_type_data_init.c b/src/model/gi_type_data_init.c
new file mode 100644
index 0000000..4a161c7
--- a/dev/null
+++ b/src/model/gi_type_data_init.c
@@ -0,0 +1,36 @@
+#include "gi_type_data_init.h"
+#include "gi_type_data.h"
+
+/*
+ * There should be some way to automate the generation of these values
+ * or at least this source file based on just the definition of the
+ * struct. Perhaps an HDF5 precompiler could do such a thing.
+ */
+void
+gi_type_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+ hid_t *field_type)
+{
+ *dst_size = sizeof (gi_type_data);
+
+ dst_offset[0] = HOFFSET (gi_type_data, gi);
+ dst_offset[1] = HOFFSET (gi_type_data, type);
+ dst_offset[2] = HOFFSET (gi_type_data, protein);
+
+ gi_type_data dst_buf[1];
+
+ dst_sizes[0] = sizeof (dst_buf[0].gi);
+ dst_sizes[1] = sizeof (dst_buf[0].type);
+ dst_sizes[2] = sizeof (dst_buf[0].protein);
+
+ field_type[0] = H5T_NATIVE_INT;
+
+ hid_t type_h5t = H5Tcopy (H5T_C_S1);
+ H5Tset_size (type_h5t, 2);
+ field_type[1] = type_h5t;
+
+ hid_t protein_h5t = H5Tcopy (H5T_C_S1);
+ H5Tset_size (protein_h5t, 7);
+ field_type[2] = protein_h5t;
+
+ return;
+}
diff --git a/src/model/gi_type_data_init.h b/src/model/gi_type_data_init.h
new file mode 100644
index 0000000..5c45cba
--- a/dev/null
+++ b/src/model/gi_type_data_init.h
@@ -0,0 +1,14 @@
+#ifndef GI_TYPE_DATA_INIT_H
+#define GI_TYPE_DATA_INIT_H
+
+#include <hdf5.h>
+
+/*
+ * Initialize the structures describing the struct. These descriptive
+ * structures are used by the HDF5 API.
+ */
+void
+gi_type_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+ hid_t *field_type);
+
+#endif // GI_TYPE_DATA_INIT_H
diff --git a/src/sequence_data.h b/src/model/sequence_data.h
index fb4ff78..9db91ac 100644
--- a/src/sequence_data.h
+++ b/src/model/sequence_data.h
@@ -1,16 +1,15 @@
#ifndef SEQUENCE_DATA_H
#define SEQUENCE_DATA_H
-#define SEQUENCE_DATA_FIELD_NUM 4
+#define SEQUENCE_DATA_FIELD_NUM 3
-#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" }
+#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description" }
typedef struct
{
int gi;
char gb[9];
char description[196];
- char protein_type[7];
} sequence_data;
#endif // SEQUENCE_DATA_H
diff --git a/src/sequence_data_init.c b/src/model/sequence_data_init.c
index 09ba189..f6b3b1f 100644
--- a/src/sequence_data_init.c
+++ b/src/model/sequence_data_init.c
@@ -10,14 +10,12 @@ sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
dst_offset[0] = HOFFSET (sequence_data, gi);
dst_offset[1] = HOFFSET (sequence_data, gb);
dst_offset[2] = HOFFSET (sequence_data, description);
- dst_offset[3] = HOFFSET (sequence_data, protein_type);
sequence_data dst_buf[1];
dst_sizes[0] = sizeof (dst_buf[0].gi);
dst_sizes[1] = sizeof (dst_buf[0].gb);
dst_sizes[2] = sizeof (dst_buf[0].description);
- dst_sizes[3] = sizeof (dst_buf[0].protein_type);
field_type[0] = H5T_NATIVE_INT;
@@ -28,10 +26,6 @@ sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
hid_t description_type = H5Tcopy (H5T_C_S1);
H5Tset_size (description_type, 196);
field_type[2] = description_type;
-
- hid_t protein_type_type = H5Tcopy (H5T_C_S1);
- H5Tset_size (protein_type_type, 7);
- field_type[3] = protein_type_type;
return;
}
diff --git a/src/sequence_data_init.h b/src/model/sequence_data_init.h
index c87e7e6..c87e7e6 100644
--- a/src/sequence_data_init.h
+++ b/src/model/sequence_data_init.h
diff --git a/src/updator.c b/src/updator.c
index 8d556da..591d2f6 100644
--- a/src/updator.c
+++ b/src/updator.c
@@ -2,8 +2,8 @@
* Update derived fields.
*/
-#include "assign_protein_type.h"
-#include "check_h5_error.h"
+#include "assign/assign_protein_type.h"
+#include "error/check_h5_error.h"
#define FILE "influenza.h5"

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.