summaryrefslogtreecommitdiffstats
Side-by-side diff
-rw-r--r--analysis/year.R20
-rw-r--r--src/Makefile.am39
-rw-r--r--src/aggregator.c6
-rw-r--r--src/assign/assign_protein_type.c (renamed from src/assign_protein_type.c)87
-rw-r--r--src/assign/assign_protein_type.h (renamed from src/assign_protein_type.h)0
-rw-r--r--src/error/check_error.c (renamed from src/check_error.c)0
-rw-r--r--src/error/check_error.h (renamed from src/check_error.h)0
-rw-r--r--src/error/check_h5_error.c (renamed from src/check_h5_error.c)0
-rw-r--r--src/error/check_h5_error.h (renamed from src/check_h5_error.h)0
-rw-r--r--src/error/check_ncbi_error.c (renamed from src/check_ncbi_error.c)0
-rw-r--r--src/error/check_ncbi_error.h (renamed from src/check_ncbi_error.h)0
-rw-r--r--src/load/load_influenza_aa_dat.c (renamed from src/load_influenza_aa_dat.c)4
-rw-r--r--src/load/load_influenza_aa_dat.h (renamed from src/load_influenza_aa_dat.h)0
-rw-r--r--src/load/load_influenza_faa.c (renamed from src/load_influenza_faa.c)10
-rw-r--r--src/load/load_influenza_faa.h (renamed from src/load_influenza_faa.h)0
-rw-r--r--src/model/gi_type_data.h21
-rw-r--r--src/model/gi_type_data_init.c36
-rw-r--r--src/model/gi_type_data_init.h14
-rw-r--r--src/model/sequence_data.h (renamed from src/sequence_data.h)5
-rw-r--r--src/model/sequence_data_init.c (renamed from src/sequence_data_init.c)6
-rw-r--r--src/model/sequence_data_init.h (renamed from src/sequence_data_init.h)0
-rw-r--r--src/updator.c4
22 files changed, 181 insertions, 71 deletions
diff --git a/src/load/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c
new file mode 100644
index 0000000..8bf47aa
--- a/dev/null
+++ b/src/load/load_influenza_aa_dat.c
@@ -0,0 +1,254 @@
+/*
+ * Load the influenza_aa.dat tab delimited text file into an HDF5
+ * binary table.
+ *
+ * todo: Handle NULL values occurring in numeric fields.
+ */
+
+#include "load_influenza_aa_dat.h"
+#include "error/check_error.h"
+#include "error/check_h5_error.h"
+#include <hdf5_hl.h>
+#include <string.h>
+#include <stdlib.h>
+
+#define NFIELDS (hsize_t) 11
+#define TABLE_NAME "influenza_aa.dat"
+
+/*
+ * Copy the next tab-delimited field of *cursor into dst.
+ *
+ * dst       Destination buffer; it is zero-filled first so the bytes
+ *           handed to HDF5 are deterministic.
+ * dst_size  Capacity of dst in bytes, including the terminating NUL.
+ * cursor    Parse position; advanced past the field by strsep.
+ *
+ * The text is truncated to dst_size - 1 bytes so the result is always
+ * NUL-terminated.  A missing field (the line has too few columns, so
+ * strsep returns NULL) is stored as an empty string.
+ */
+static void
+copy_text_field (char *dst, size_t dst_size, char **cursor)
+{
+  const char *token = strsep (cursor, "\t");
+
+  memset (dst, 0, dst_size);
+  if (token != NULL)
+    strncpy (dst, token, dst_size - 1);
+}
+
+/*
+ * Convert the next tab-delimited field of *cursor to an int.
+ *
+ * Returns 0 when the field is missing, empty, or does not start with a
+ * number (for example the "unknown" and "NON" markers seen in the year
+ * column).  Leading digits followed by other text are converted, as
+ * atoi would do.
+ */
+static int
+parse_int_field (char **cursor)
+{
+  const char *token = strsep (cursor, "\t");
+  char *end = NULL;
+  long value;
+
+  if (token == NULL || token[0] == '\0')
+    return 0;
+
+  value = strtol (token, &end, 10);
+  if (end == token)
+    return 0;
+
+  return (int) value;
+}
+
+/*
+ * Read the tab delimited influenza_aa.dat text file and store every
+ * line as one record of an HDF5 table named TABLE_NAME.
+ *
+ * file_id  HDF5 location identifier under which the table is created.
+ *
+ * The table is created from the first line of the input and every
+ * following line is appended to it.  Failures are reported through
+ * check_error / check_h5_error.
+ */
+void
+load_influenza_aa_dat (hid_t file_id)
+{
+  /*
+   * Model the data using native types.
+   */
+  typedef struct
+  {
+    char genbank_accession_number[9];
+    char host[15];
+    int genome_segment_number;
+    char subtype[7];
+    char country[25];
+    int year;
+    int sequence_length;
+    char virus_name[196];
+    char age[17];
+    char gender[6];
+    char full_length_indicator[4];
+  } supplementary_data;
+
+  /*
+   * Use an HDF5 Table for storage.
+   * http://www.hdfgroup.org/HDF5/Tutor/h5table.html
+   */
+
+  /* The single record buffer that is filled from each input line. */
+  supplementary_data p_data;
+
+  /*
+   * "Calculate the size and the offsets of our struct members in
+   * memory."
+   */
+  size_t dst_size = sizeof (supplementary_data);
+  size_t dst_offset[NFIELDS] =
+    { HOFFSET (supplementary_data, genbank_accession_number),
+      HOFFSET (supplementary_data, host),
+      HOFFSET (supplementary_data, genome_segment_number),
+      HOFFSET (supplementary_data, subtype),
+      HOFFSET (supplementary_data, country),
+      HOFFSET (supplementary_data, year),
+      HOFFSET (supplementary_data, sequence_length),
+      HOFFSET (supplementary_data, virus_name),
+      HOFFSET (supplementary_data, age),
+      HOFFSET (supplementary_data, gender),
+      HOFFSET (supplementary_data, full_length_indicator)
+    };
+
+  size_t dst_sizes[NFIELDS] =
+    { sizeof (p_data.genbank_accession_number),
+      sizeof (p_data.host),
+      sizeof (p_data.genome_segment_number),
+      sizeof (p_data.subtype),
+      sizeof (p_data.country),
+      sizeof (p_data.year),
+      sizeof (p_data.sequence_length),
+      sizeof (p_data.virus_name),
+      sizeof (p_data.age),
+      sizeof (p_data.gender),
+      sizeof (p_data.full_length_indicator)
+    };
+
+  /*
+   * Map the native types to HDF5 types for each field.  The string
+   * type sizes are taken from the struct so the two cannot drift apart.
+   */
+  hid_t field_type[NFIELDS];
+
+  hid_t genbank_accession_number_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (genbank_accession_number_type,
+               sizeof (p_data.genbank_accession_number));
+  field_type[0] = genbank_accession_number_type;
+
+  hid_t host_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (host_type, sizeof (p_data.host));
+  field_type[1] = host_type;
+
+  field_type[2] = H5T_NATIVE_INT;
+
+  hid_t subtype_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (subtype_type, sizeof (p_data.subtype));
+  field_type[3] = subtype_type;
+
+  hid_t country_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (country_type, sizeof (p_data.country));
+  field_type[4] = country_type;
+
+  field_type[5] = H5T_NATIVE_INT;
+
+  field_type[6] = H5T_NATIVE_INT;
+
+  hid_t virus_name_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (virus_name_type, sizeof (p_data.virus_name));
+  field_type[7] = virus_name_type;
+
+  hid_t age_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (age_type, sizeof (p_data.age));
+  field_type[8] = age_type;
+
+  hid_t gender_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (gender_type, sizeof (p_data.gender));
+  field_type[9] = gender_type;
+
+  hid_t full_length_indicator_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (full_length_indicator_type,
+               sizeof (p_data.full_length_indicator));
+  field_type[10] = full_length_indicator_type;
+
+  /*
+   * Labels used for the fields in the table.
+   */
+  const char *field_names[NFIELDS] = { "GenBank accession number",
+    "Host",
+    "Genome segment number",
+    "Subtype",
+    "Country",
+    "Year",
+    "Sequence length",
+    "Virus name",
+    "Age",
+    "Gender",
+    "Full-length Indicator"
+  };
+
+  /*
+   * Table storage options.
+   */
+  hsize_t chunk_size = 10;
+  int *fill_data = NULL;
+  int compress = 0;
+
+  /*
+   * Input state.  Everything released in the cleanup section is
+   * declared and initialised before the first jump to it.
+   */
+  FILE *dat = NULL;
+  char *line = NULL;
+  size_t len = 0;
+  int current_line = 0;
+
+  memset (&p_data, 0, sizeof (p_data));
+
+  /*
+   * Insert the records.
+   */
+  dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", "r");
+  if (dat == NULL)
+    {
+      check_error (__FILE__, __LINE__);
+      /* In case check_error returns, never read from a NULL stream. */
+      goto cleanup;
+    }
+
+  while (getline (&line, &len, dat) != -1)
+    {
+      /*
+       * strsep moves its cursor forward and finally sets it to NULL,
+       * so the pointer returned by strdup is kept separately in order
+       * to free it afterwards.
+       */
+      char *record = strdup (line);
+      char *cursor = record;
+      herr_t status;
+
+      if (record == NULL)
+        {
+          check_error (__FILE__, __LINE__);
+          break;
+        }
+
+      current_line++;
+
+      /*
+       * Parse the line, handling the case of empty fields represented
+       * by sequential delimiters.
+       *
+       * NOTE(review): getline keeps the trailing newline, so it ends
+       * up in the last field of the line.  This is left unchanged to
+       * keep the stored values as before -- confirm whether it should
+       * be stripped.
+       */
+      copy_text_field (p_data.genbank_accession_number,
+                       sizeof (p_data.genbank_accession_number), &cursor);
+      copy_text_field (p_data.host, sizeof (p_data.host), &cursor);
+      p_data.genome_segment_number = parse_int_field (&cursor);
+      copy_text_field (p_data.subtype, sizeof (p_data.subtype), &cursor);
+      copy_text_field (p_data.country, sizeof (p_data.country), &cursor);
+
+      /*
+       * Convert the year field from text to numeric.  Unknown and empty
+       * values are assigned a numeric value of zero.
+       */
+      p_data.year = parse_int_field (&cursor);
+
+      p_data.sequence_length = parse_int_field (&cursor);
+      copy_text_field (p_data.virus_name, sizeof (p_data.virus_name),
+                       &cursor);
+      copy_text_field (p_data.age, sizeof (p_data.age), &cursor);
+      copy_text_field (p_data.gender, sizeof (p_data.gender), &cursor);
+      copy_text_field (p_data.full_length_indicator,
+                       sizeof (p_data.full_length_indicator), &cursor);
+
+      /* The first record creates the table, later ones are appended. */
+      if (current_line == 1)
+        status = H5TBmake_table ("influenza_aa.dat", file_id,
+                                 TABLE_NAME, NFIELDS, 1, dst_size,
+                                 field_names, dst_offset, field_type,
+                                 chunk_size, fill_data, compress,
+                                 &p_data);
+      else
+        status = H5TBappend_records (file_id, TABLE_NAME, 1, dst_size,
+                                     dst_offset, dst_sizes, &p_data);
+
+      if (status < 0)
+        check_h5_error (status, __FILE__, __LINE__);
+
+      free (record);
+    }
+
+cleanup:
+  /* free (NULL) is a no-op, so no guard is needed. */
+  free (line);
+
+  if (dat != NULL)
+    fclose (dat);
+
+  H5Tclose (genbank_accession_number_type);
+  H5Tclose (host_type);
+  H5Tclose (subtype_type);
+  H5Tclose (country_type);
+  H5Tclose (virus_name_type);
+  H5Tclose (age_type);
+  H5Tclose (gender_type);
+  H5Tclose (full_length_indicator_type);
+
+  return;
+}

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.