From d32c4e84c2969303b90474b19714efdd00ddb8f9 Mon Sep 17 00:00:00 2001 From: Don Pellegrino Date: Mon, 18 Jan 2010 23:31:27 +0000 Subject: Organzied the functions into assign, error, load and model subdirectories. Code does not compile as the transition to storing species and protein type data in a separate HDF5 table is incomplete. --- diff --git a/analysis/year.R b/analysis/year.R index b7da3c8..6d68925 100644 --- a/analysis/year.R +++ b/analysis/year.R @@ -5,21 +5,23 @@ require(hdf5); hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE); A <- influenza.aa.dat; - B <- influenza.faa; +# Join the two tables by GB value. +C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB"); + # All records for 1918. Based on code from # http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations -C <- A[A$Year == 1918, ] +D <- C[C$Year == 1918, ] -# Countries represented in the 1918 dataset. -C$Country; +summary (D); -# All record with a year value. -D <- A[A$Year != 0, ]; +# Countries represented in the 1918 dataset. +D$Country; -hist(D$Year); +D[D$"Protein Type" == "HA", ] -B[B$GI == 305182, ] +# All record with a year value. +E <- A[A$Year != 0, ]; -B[B$"Protein Type" == "HA", ] +hist(E$Year); diff --git a/src/Makefile.am b/src/Makefile.am index 6dd8e16..06a6f5c 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -2,28 +2,31 @@ bin_PROGRAMS = aggregator updator aggregator_SOURCES = \ aggregator.c \ - check_error.c \ - check_h5_error.c \ - load_influenza_aa_dat.c \ - load_influenza_faa.c \ - sequence_data_init.c + error/check_error.c \ + error/check_h5_error.c \ + load/load_influenza_aa_dat.c \ + load/load_influenza_faa.c \ + model/sequence_data_init.c updator_SOURCES = \ updator.c \ - check_error.c \ - check_h5_error.c \ - check_ncbi_error.c \ - assign_protein_type.c \ - sequence_data_init.c + assign/assign_protein_type.c \ + error/check_error.c \ + error/check_h5_error.c \ + error/check_ncbi_error.c \ + model/gi_type_data_init.c \ + model/sequence_data_init.c noinst_HEADERS = \ - assign_protein_type.h \ - check_error.h \ - check_h5_error.h \ - check_ncbi_error.h \ - load_influenza_aa_dat.h \ - load_influenza_faa.h \ - sequence_data.h \ - sequence_data_init.h + assign/assign_protein_type.h \ + error/check_error.h \ + error/check_h5_error.h \ + error/check_ncbi_error.h \ + load/load_influenza_aa_dat.h \ + load/load_influenza_faa.h \ + model/gi_type_data.h \ + model/gi_type_data_init.h \ + model/sequence_data.h \ + model/sequence_data_init.h AM_CFLAGS = -Wall -std=gnu99 -ggdb diff --git a/src/aggregator.c b/src/aggregator.c index 07aba39..c00d912 100644 --- a/src/aggregator.c +++ b/src/aggregator.c @@ -3,9 +3,9 @@ * container. */ -#include "check_h5_error.h" -#include "load_influenza_aa_dat.h" -#include "load_influenza_faa.h" +#include "error/check_h5_error.h" +#include "load/load_influenza_aa_dat.h" +#include "load/load_influenza_faa.h" #define FILE "influenza.h5" diff --git a/src/assign_protein_type.c b/src/assign/assign_protein_type.c index ec3a959..0a11c23 100644 --- a/src/assign_protein_type.c +++ b/src/assign/assign_protein_type.c @@ -1,8 +1,10 @@ #include "assign_protein_type.h" -#include "check_ncbi_error.h" -#include "check_h5_error.h" -#include "sequence_data.h" -#include "sequence_data_init.h" +#include "error/check_ncbi_error.h" +#include "error/check_h5_error.h" +#include "model/gi_type_data.h" +#include "model/gi_type_data_init.h" +#include "model/sequence_data.h" +#include "model/sequence_data_init.h" #include #include #include @@ -12,6 +14,16 @@ #include /* + * todo: Write results to an independent table by GI so that they can + * be reused when the input tables influenza.faa and influenza_aa.dat + * are replaced with new data. Since GI values are unique and + * persistent the assignment of a protein type should stay valid + * unless the list of reference sequences is updated. + * + * todo: Save the species type as a new field { A, B, C }. + */ + +/* * BLAST database containing all of the influenza protein sequences. */ #define SEQDB "/home/don/exp004/influenzadb/influenzadb" @@ -57,34 +69,57 @@ assign_protein_type (hid_t file_id) ValNodePtr error_returns = NULL; /* - * Read the data from HDF5 file. + * Read the data from HDF5 influenza.faa. */ - hsize_t nfields; - hsize_t nrecords; - herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields, - &nrecords); + hsize_t faa_nfields; + hsize_t faa_nrecords; + herr_t status = H5TBget_table_info (file_id, "influenza.faa", &faa_nfields, + &faa_nrecords); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + sequence_data* faa_buf = malloc (sizeof(sequence_data) * faa_nrecords); + + size_t faa_size; + size_t faa_offset[SEQUENCE_DATA_FIELD_NUM]; + size_t faa_sizes[SEQUENCE_DATA_FIELD_NUM]; + hid_t faa_field_type[SEQUENCE_DATA_FIELD_NUM]; + sequence_data_init (&faa_size, faa_offset, faa_sizes, faa_field_type); + + status = H5TBread_table (file_id, "influenza.faa", faa_size, faa_offset, + faa_sizes, faa_buf); if (status < 0) check_h5_error (status, __FILE__, __LINE__); - sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords); + /* + * Read the data from HDF5 gi_type_data. + */ + hsize_t gi_nfields; + hsize_t gi_nrecords; + status = H5TBget_table_info (file_id, "gi_type_data", &gi_nfields, + &gi_nrecords); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + gi_type_data* gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); - size_t dst_size; - size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; - size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM]; - hid_t field_type[SEQUENCE_DATA_FIELD_NUM]; - sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type); + size_t gi_size; + size_t gi_offset[GI_TYPE_DATA_FIELD_NUM]; + size_t gi_sizes[GI_TYPE_DATA_FIELD_NUM]; + hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; + gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); - status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset, - dst_sizes, dst_buf); + status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, + gi_sizes, gi_buf); if (status < 0) check_h5_error (status, __FILE__, __LINE__); /* * Assign protein types to records for which the field is empty. */ - printf ("Records to process: %i\n", (int)nrecords); + printf ("Records to process: %i\n", (int)faa_nrecords); bool updates_pending = false; - for (int i = 0; i < nrecords; i++) + for (int i = 0; i < faa_nrecords; i++) { if (dst_buf[i].protein_type[0] == '\0') { @@ -130,8 +165,15 @@ assign_protein_type (hid_t file_id) Char target_id_buf[BUFFER_LEN + 1]; SeqIdPtr target_id = SeqAlignId (seqalign, 1); SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN); - strncpy (dst_buf[i].protein_type, &target_id_buf[6], - sizeof (dst_buf[i].protein_type)); + + // Species Type + dst_buf[i].type[0] = &target_id_buf[4]; + dst_buf[i].type[1] = '\0'; + + // Protein Type + strncpy (dst_buf[i].protein, &target_id_buf[6], + sizeof (dst_buf[i].protein)); + updates_pending = true; } @@ -202,7 +244,8 @@ assign_protein_type (hid_t file_id) updates_pending = false; } - free (dst_buf); + free (faa_buf); + free (gi_buf); options = BLASTOptionDelete (options); diff --git a/src/assign_protein_type.h b/src/assign/assign_protein_type.h index 221154f..221154f 100644 --- a/src/assign_protein_type.h +++ b/src/assign/assign_protein_type.h diff --git a/src/check_error.c b/src/error/check_error.c index 4630b50..4630b50 100644 --- a/src/check_error.c +++ b/src/error/check_error.c diff --git a/src/check_error.h b/src/error/check_error.h index 2250c59..2250c59 100644 --- a/src/check_error.h +++ b/src/error/check_error.h diff --git a/src/check_h5_error.c b/src/error/check_h5_error.c index d90b21f..d90b21f 100644 --- a/src/check_h5_error.c +++ b/src/error/check_h5_error.c diff --git a/src/check_h5_error.h b/src/error/check_h5_error.h index e460e97..e460e97 100644 --- a/src/check_h5_error.h +++ b/src/error/check_h5_error.h diff --git a/src/check_ncbi_error.c b/src/error/check_ncbi_error.c index 6071d1a..6071d1a 100644 --- a/src/check_ncbi_error.c +++ b/src/error/check_ncbi_error.c diff --git a/src/check_ncbi_error.h b/src/error/check_ncbi_error.h index 45ac0ca..45ac0ca 100644 --- a/src/check_ncbi_error.h +++ b/src/error/check_ncbi_error.h diff --git a/src/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c index aed33e8..8bf47aa 100644 --- a/src/load_influenza_aa_dat.c +++ b/src/load/load_influenza_aa_dat.c @@ -6,8 +6,8 @@ */ #include "load_influenza_aa_dat.h" -#include "check_error.h" -#include "check_h5_error.h" +#include "error/check_error.h" +#include "error/check_h5_error.h" #include #include #include diff --git a/src/load_influenza_aa_dat.h b/src/load/load_influenza_aa_dat.h index f6c60be..f6c60be 100644 --- a/src/load_influenza_aa_dat.h +++ b/src/load/load_influenza_aa_dat.h diff --git a/src/load_influenza_faa.c b/src/load/load_influenza_faa.c index fd35254..a217989 100644 --- a/src/load_influenza_faa.c +++ b/src/load/load_influenza_faa.c @@ -1,8 +1,8 @@ -#include "check_error.h" -#include "check_h5_error.h" +#include "error/check_error.h" +#include "error/check_h5_error.h" #include "load_influenza_faa.h" -#include "sequence_data.h" -#include "sequence_data_init.h" +#include "model/sequence_data.h" +#include "model/sequence_data_init.h" #include #include #include @@ -57,8 +57,6 @@ load_influenza_faa (hid_t file_id) strncpy (p_data.description, strsep (&running, "|"), sizeof (p_data.description)); - strncpy (p_data.protein_type, "", sizeof (p_data.protein_type)); - const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] = SEQUENCE_DATA_FIELD_NAMES; diff --git a/src/load_influenza_faa.h b/src/load/load_influenza_faa.h index 569c411..569c411 100644 --- a/src/load_influenza_faa.h +++ b/src/load/load_influenza_faa.h diff --git a/src/model/gi_type_data.h b/src/model/gi_type_data.h new file mode 100644 index 0000000..bf149c3 --- a/dev/null +++ b/src/model/gi_type_data.h @@ -0,0 +1,21 @@ +#ifndef GI_TYPE_DATA_H +#define GI_TYPE_DATA_H + +#define GI_TYPE_DATA_FIELD_NUM 3 + +/* + * Field names selected to match the names of fields on the "Influenza + * Virus Resource - Influenza Virus Sequence Database" search form + * online at + * http://www.ncbi.nlm.nih.gov/genomes/FLU/Database/select.cgi?go=database. + */ +#define GI_TYPE_DATA_FIELD_NAMES { "GI", "Type", "Protein" } + +typedef struct +{ + int gi; + char type[2]; + char protein[7]; +} gi_type_data; + +#endif // GI_TYPE_DATA_H diff --git a/src/model/gi_type_data_init.c b/src/model/gi_type_data_init.c new file mode 100644 index 0000000..4a161c7 --- a/dev/null +++ b/src/model/gi_type_data_init.c @@ -0,0 +1,36 @@ +#include "gi_type_data_init.h" +#include "gi_type_data.h" + +/* + * There should be some way to automate the generation of these values + * or at least this source file based on just the definition of the + * struct. Perhaps an HDF5 precompiler could do such a thing. + */ +void +gi_type_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes, + hid_t *field_type) +{ + *dst_size = sizeof (gi_type_data); + + dst_offset[0] = HOFFSET (gi_type_data, gi); + dst_offset[1] = HOFFSET (gi_type_data, type); + dst_offset[2] = HOFFSET (gi_type_data, protein); + + gi_type_data dst_buf[1]; + + dst_sizes[0] = sizeof (dst_buf[0].gi); + dst_sizes[1] = sizeof (dst_buf[0].type); + dst_sizes[2] = sizeof (dst_buf[0].protein); + + field_type[0] = H5T_NATIVE_INT; + + hid_t type_h5t = H5Tcopy (H5T_C_S1); + H5Tset_size (type_h5t, 2); + field_type[1] = type_h5t; + + hid_t protein_h5t = H5Tcopy (H5T_C_S1); + H5Tset_size (protein_h5t, 7); + field_type[2] = protein_h5t; + + return; +} diff --git a/src/model/gi_type_data_init.h b/src/model/gi_type_data_init.h new file mode 100644 index 0000000..5c45cba --- a/dev/null +++ b/src/model/gi_type_data_init.h @@ -0,0 +1,14 @@ +#ifndef GI_TYPE_DATA_INIT_H +#define GI_TYPE_DATA_INIT_H + +#include + +/* + * Initialize the structures describing the struct. These descriptive + * structures are used by the HDF5 API. + */ +void +gi_type_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes, + hid_t *field_type); + +#endif // GI_TYPE_DATA_INIT_H diff --git a/src/sequence_data.h b/src/model/sequence_data.h index fb4ff78..9db91ac 100644 --- a/src/sequence_data.h +++ b/src/model/sequence_data.h @@ -1,16 +1,15 @@ #ifndef SEQUENCE_DATA_H #define SEQUENCE_DATA_H -#define SEQUENCE_DATA_FIELD_NUM 4 +#define SEQUENCE_DATA_FIELD_NUM 3 -#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" } +#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description" } typedef struct { int gi; char gb[9]; char description[196]; - char protein_type[7]; } sequence_data; #endif // SEQUENCE_DATA_H diff --git a/src/sequence_data_init.c b/src/model/sequence_data_init.c index 09ba189..f6b3b1f 100644 --- a/src/sequence_data_init.c +++ b/src/model/sequence_data_init.c @@ -10,14 +10,12 @@ sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes, dst_offset[0] = HOFFSET (sequence_data, gi); dst_offset[1] = HOFFSET (sequence_data, gb); dst_offset[2] = HOFFSET (sequence_data, description); - dst_offset[3] = HOFFSET (sequence_data, protein_type); sequence_data dst_buf[1]; dst_sizes[0] = sizeof (dst_buf[0].gi); dst_sizes[1] = sizeof (dst_buf[0].gb); dst_sizes[2] = sizeof (dst_buf[0].description); - dst_sizes[3] = sizeof (dst_buf[0].protein_type); field_type[0] = H5T_NATIVE_INT; @@ -28,10 +26,6 @@ sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes, hid_t description_type = H5Tcopy (H5T_C_S1); H5Tset_size (description_type, 196); field_type[2] = description_type; - - hid_t protein_type_type = H5Tcopy (H5T_C_S1); - H5Tset_size (protein_type_type, 7); - field_type[3] = protein_type_type; return; } diff --git a/src/sequence_data_init.h b/src/model/sequence_data_init.h index c87e7e6..c87e7e6 100644 --- a/src/sequence_data_init.h +++ b/src/model/sequence_data_init.h diff --git a/src/updator.c b/src/updator.c index 8d556da..591d2f6 100644 --- a/src/updator.c +++ b/src/updator.c @@ -2,8 +2,8 @@ * Update derived fields. */ -#include "assign_protein_type.h" -#include "check_h5_error.h" +#include "assign/assign_protein_type.h" +#include "error/check_h5_error.h" #define FILE "influenza.h5" -- cgit v0.8.3.1-22-g547a