-rw-r--r-- | Makefile.am | 1 | ||||
-rw-r--r-- | src/Makefile.am | 14 | ||||
-rw-r--r-- | src/aggregator.c | 62 | ||||
-rw-r--r-- | src/load_influenza_aa_dat.c | 169 | ||||
-rw-r--r-- | src/load_influenza_aa_dat.h | 17 |
5 files changed, 263 insertions, 0 deletions
diff --git a/Makefile.am b/Makefile.am
new file mode 100644
index 0000000..af437a6
--- /dev/null
+++ b/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = src
diff --git a/src/Makefile.am b/src/Makefile.am
new file mode 100644
--- /dev/null
+++ b/src/Makefile.am
@@ -0,0 +1,14 @@
+bin_PROGRAMS = aggregator
+
+aggregator_SOURCES = \
+	aggregator.c \
+	load_influenza_aa_dat.c
+
+# H5TBmake_table lives in the high-level library, which in turn needs
+# the core library, so -lhdf5_hl has to come first.
+aggregator_LDADD = -lhdf5_hl -lhdf5
+
+noinst_HEADERS = \
+	load_influenza_aa_dat.h
+
+AM_CFLAGS = -Wall -std=gnu99 -ggdb
diff --git a/src/aggregator.c b/src/aggregator.c
new file mode 100644
--- /dev/null
+++ b/src/aggregator.c
@@ -0,0 +1,62 @@
+/*
+ * Aggregate the collected influenza data into a single HDF5
+ * container.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <hdf5.h>
+#include "load_influenza_aa_dat.h"
+
+/*
+ * Name of the HDF5 container to create.  Not called FILE, because
+ * that would hide the <stdio.h> stream type of the same name.
+ */
+#define OUTPUT_FILE "influenza.h5"
+
+/*
+ * Create the HDF5 container (replacing any existing one), store the
+ * supplementary protein table in it and close it again.
+ *
+ * Returns EXIT_SUCCESS when every step succeeded and EXIT_FAILURE
+ * otherwise.
+ */
+int
+main (void)
+{
+  int exit_status = EXIT_SUCCESS;
+
+  /*
+   * Create the HDF5 file.
+   */
+  hid_t file_id = H5Fcreate (OUTPUT_FILE, H5F_ACC_TRUNC, H5P_DEFAULT,
+                             H5P_DEFAULT);
+  if (file_id < 0)
+    {
+      fprintf (stderr, "aggregator: cannot create %s\n", OUTPUT_FILE);
+      return EXIT_FAILURE;
+    }
+
+  /*
+   * Load the supplementary protein data file.
+   */
+  if (load_influenza_aa_dat (file_id) < 0)
+    {
+      fprintf (stderr, "aggregator: cannot store the table in %s\n",
+               OUTPUT_FILE);
+      exit_status = EXIT_FAILURE;
+    }
+
+  /*
+   * Close the HDF5 file, even after a failed load, so that the
+   * identifier is always released.
+   */
+  if (H5Fclose (file_id) < 0)
+    {
+      fprintf (stderr, "aggregator: cannot close %s\n", OUTPUT_FILE);
+      exit_status = EXIT_FAILURE;
+    }
+
+  return exit_status;
+}
diff --git a/src/load_influenza_aa_dat.c b/src/load_influenza_aa_dat.c
new file mode 100644
--- /dev/null
+++ b/src/load_influenza_aa_dat.c
@@ -0,0 +1,169 @@
+/*
+ * Store the supplementary protein data of the NCBI influenza_aa.dat
+ * file as an HDF5 table.
+ *
+ * Uses the HDF5 Table API:
+ * http://www.hdfgroup.org/HDF5/Tutor/h5table.html
+ */
+
+#include "load_influenza_aa_dat.h"
+
+#include <stddef.h>
+
+#include <hdf5_hl.h>
+
+#define TABLE_TITLE "influenza_aa.dat"
+#define TABLE_NAME "influenza_aa.dat"
+
+enum
+{
+  NFIELDS = 11,
+  /*
+   * Only one sample record is stored so far.  An earlier, disabled
+   * definition used 138052, presumably the size of the complete file.
+   */
+  NRECORDS = 1,
+  CHUNK_SIZE = 10,
+  COMPRESS = 0
+};
+
+/*
+ * Model the data using native types.
+ */
+typedef struct supplementary_data
+{
+  char genbank_accession_number[9];
+  char host[15];
+  int genome_segment_number;
+  char subtype[7];
+  char country[25];
+  int year;
+  int sequence_length;
+  char virus_name[196];
+  char age[17];
+  char gender[6];
+  char full_length_indicator[4];
+} supplementary_data;
+
+/*
+ * Everything HDF5 has to know about one column of the table.
+ */
+typedef struct field_description
+{
+  const char *name;     /* Column name stored in the table. */
+  size_t offset;        /* Offset within supplementary_data. */
+  size_t string_size;   /* Size of a char[] member, 0 for an int. */
+} field_description;
+
+#define STRING_FIELD(label, member) \
+  { (label), HOFFSET (supplementary_data, member), \
+    sizeof (((supplementary_data *) 0)->member) }
+
+#define INT_FIELD(label, member) \
+  { (label), HOFFSET (supplementary_data, member), 0 }
+
+/*
+ * The columns, in the order of the members of supplementary_data.
+ * Offsets and string sizes are derived from the struct itself so
+ * that they cannot drift apart from it.
+ */
+static const field_description fields[NFIELDS] = {
+  STRING_FIELD ("GenBank accession number", genbank_accession_number),
+  STRING_FIELD ("Host", host),
+  INT_FIELD ("Genome segment number", genome_segment_number),
+  STRING_FIELD ("Subtype", subtype),
+  STRING_FIELD ("Country", country),
+  INT_FIELD ("Year", year),
+  INT_FIELD ("Sequence length", sequence_length),
+  STRING_FIELD ("Virus name", virus_name),
+  STRING_FIELD ("Age", age),
+  STRING_FIELD ("Gender", gender),
+  STRING_FIELD ("Full-length Indicator", full_length_indicator)
+};
+
+/*
+ * Create a fixed-length C string type of SIZE bytes.
+ *
+ * Returns the new type identifier, which the caller has to release
+ * with H5Tclose, or a negative value on failure.
+ */
+static hid_t
+fixed_string_type (size_t size)
+{
+  hid_t type = H5Tcopy (H5T_C_S1);
+
+  if (type >= 0 && H5Tset_size (type, size) < 0)
+    {
+      H5Tclose (type);
+      type = -1;
+    }
+
+  return type;
+}
+
+/*
+ * Create the table TABLE_NAME in the HDF5 file or group FILE_ID and
+ * store the built-in sample record in it.
+ *
+ * Returns a non-negative value on success and a negative value when
+ * a field type or the table could not be created.
+ */
+herr_t
+load_influenza_aa_dat (hid_t file_id)
+{
+  static const supplementary_data records[NRECORDS] = {
+    { "BAC53999", "Human", 7, "", "Zambia", 1999, 109,
+      "Influenza B virus (B/Lusaka/270/99)", "", "", "yes" }
+  };
+
+  const char *field_names[NFIELDS];
+  size_t field_offsets[NFIELDS];
+  hid_t field_types[NFIELDS];
+  herr_t status = 0;
+
+  /*
+   * Hand the column descriptions to HDF5 as the parallel arrays that
+   * H5TBmake_table expects.
+   */
+  for (size_t i = 0; i < NFIELDS; i++)
+    {
+      field_names[i] = fields[i].name;
+      field_offsets[i] = fields[i].offset;
+
+      if (fields[i].string_size == 0)
+        {
+          field_types[i] = H5T_NATIVE_INT;
+        }
+      else
+        {
+          field_types[i] = fixed_string_type (fields[i].string_size);
+          if (field_types[i] < 0)
+            {
+              status = -1;
+            }
+        }
+    }
+
+  if (status >= 0)
+    {
+      status = H5TBmake_table (TABLE_TITLE, file_id, TABLE_NAME, NFIELDS,
+                               NRECORDS, sizeof (supplementary_data),
+                               field_names, field_offsets, field_types,
+                               CHUNK_SIZE, NULL, COMPRESS, records);
+    }
+
+  /*
+   * Release the string types created above.  H5T_NATIVE_INT is a
+   * predefined type and is not closed.
+   */
+  for (size_t i = 0; i < NFIELDS; i++)
+    {
+      if (fields[i].string_size != 0 && field_types[i] >= 0
+          && H5Tclose (field_types[i]) < 0)
+        {
+          status = -1;
+        }
+    }
+
+  return status;
+}
diff --git a/src/load_influenza_aa_dat.h b/src/load_influenza_aa_dat.h
new file mode 100644
--- /dev/null
+++ b/src/load_influenza_aa_dat.h
@@ -0,0 +1,17 @@
+#ifndef LOAD_INFLUENZA_AA_DAT_H
+#define LOAD_INFLUENZA_AA_DAT_H
+
+#include <hdf5.h>
+
+/*
+ * Store the supplementary protein data of the NCBI influenza_aa.dat
+ * file as the table "influenza_aa.dat" in the HDF5 file or group
+ * FILE_ID.  For now the data is a single built-in sample record.
+ *
+ * Returns a non-negative value on success and a negative value on
+ * failure.
+ */
+herr_t
+load_influenza_aa_dat (hid_t file_id);
+
+#endif /* LOAD_INFLUENZA_AA_DAT_H */