/*
 * Load the influenza_aa.dat tab-delimited text file into an HDF5
 * binary table.
 *
 * todo: Handle NULL values occurring in numeric fields.
 */

#include "load_influenza_aa_dat.h"
#include "check_error.h"
#include "check_h5_error.h"
#include "hdf5_hl.h"
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NFIELDS ((hsize_t) 11)
#define TABLE_NAME "influenza_aa.dat"
#define DAT_PATH "/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat"

/*
 * Return the next tab-delimited field of the line being parsed and
 * advance *cursor past it.  Consecutive tabs yield empty fields.  When
 * the line has no fields left, strsep returns NULL; that is mapped to
 * an empty string so callers never see a NULL pointer.
 */
static const char *
next_field (char **cursor)
{
  const char *field = strsep (cursor, "\t");

  return field != NULL ? field : "";
}

/*
 * Copy a text field into a fixed-width record member.
 *
 * At most dst_size bytes of src are copied and the remainder of dst is
 * zero-filled, so every byte of the member is defined.  A value that is
 * dst_size characters or longer fills the member completely and is
 * stored WITHOUT a terminating NUL (the same result strncpy gives), so
 * members must not be treated as C strings without bounding the length.
 */
static void
copy_field (char *dst, size_t dst_size, const char *src)
{
  size_t length = strlen (src);

  if (length > dst_size)
    length = dst_size;

  memset (dst, 0, dst_size);
  memcpy (dst, src, length);
}

/*
 * Convert a numeric text field to int.  Empty or non-numeric text gives
 * zero; trailing non-digit characters are ignored.  Out-of-range values
 * are clamped to the int range instead of invoking undefined behaviour.
 */
static int
parse_int_field (const char *field)
{
  long value;

  if (field[0] == '\0')
    return 0;

  value = strtol (field, NULL, 10);
  if (value > INT_MAX)
    return INT_MAX;
  if (value < INT_MIN)
    return INT_MIN;

  return (int) value;
}

/*
 * Create an HDF5 string type of the given size in bytes, derived from
 * H5T_C_S1.  Failures are reported through check_h5_error.  The caller
 * owns the returned identifier and must release it with H5Tclose.
 */
static hid_t
make_string_type (size_t size)
{
  hid_t type = H5Tcopy (H5T_C_S1);
  herr_t status = (type < 0) ? -1 : H5Tset_size (type, size);

  if (status < 0)
    check_h5_error (status, __FILE__, __LINE__);

  return type;
}

/*
 * Read DAT_PATH line by line and store each line as one record of the
 * HDF5 table TABLE_NAME under file_id.
 *
 * The table is created with the first record and every later record is
 * appended to it.  Each line is split on tabs into eleven fields; text
 * fields are copied into fixed-width members and numeric fields are
 * converted with parse_int_field.  Missing trailing fields are treated
 * as empty, and lines that are empty once the line terminator has been
 * removed are skipped.
 *
 * file_id: identifier passed to H5TBmake_table / H5TBappend_records as
 *          the location of the table.
 *
 * Errors from fopen are reported with check_error and errors from HDF5
 * calls with check_h5_error.
 *
 * Note: getline and strsep are POSIX/BSD extensions, not ISO C.
 */
void
load_influenza_aa_dat (hid_t file_id)
{
  /*
   * Model the data using native types.
   */
  typedef struct
  {
    char genbank_accession_number[9];
    char host[15];
    int genome_segment_number;
    char subtype[7];
    char country[25];
    int year;
    int sequence_length;
    char virus_name[196];
    char age[17];
    char gender[6];
    char full_length_indicator[4];
  } supplementary_data;

  supplementary_data record;

  /*
   * Use an HDF5 Table for storage.
   * http://www.hdfgroup.org/HDF5/Tutor/h5table.html
   *
   * Size of the record and offset / size of each member in memory.
   */
  const size_t dst_size = sizeof (supplementary_data);

  const size_t dst_offset[NFIELDS] = {
    HOFFSET (supplementary_data, genbank_accession_number),
    HOFFSET (supplementary_data, host),
    HOFFSET (supplementary_data, genome_segment_number),
    HOFFSET (supplementary_data, subtype),
    HOFFSET (supplementary_data, country),
    HOFFSET (supplementary_data, year),
    HOFFSET (supplementary_data, sequence_length),
    HOFFSET (supplementary_data, virus_name),
    HOFFSET (supplementary_data, age),
    HOFFSET (supplementary_data, gender),
    HOFFSET (supplementary_data, full_length_indicator)
  };

  const size_t dst_sizes[NFIELDS] = {
    sizeof record.genbank_accession_number,
    sizeof record.host,
    sizeof record.genome_segment_number,
    sizeof record.subtype,
    sizeof record.country,
    sizeof record.year,
    sizeof record.sequence_length,
    sizeof record.virus_name,
    sizeof record.age,
    sizeof record.gender,
    sizeof record.full_length_indicator
  };

  /*
   * Map the native types to HDF5 types for each field.  String widths
   * are taken from the struct members so the two cannot drift apart.
   */
  hid_t field_type[NFIELDS] = {
    make_string_type (sizeof record.genbank_accession_number),
    make_string_type (sizeof record.host),
    H5T_NATIVE_INT,
    make_string_type (sizeof record.subtype),
    make_string_type (sizeof record.country),
    H5T_NATIVE_INT,
    H5T_NATIVE_INT,
    make_string_type (sizeof record.virus_name),
    make_string_type (sizeof record.age),
    make_string_type (sizeof record.gender),
    make_string_type (sizeof record.full_length_indicator)
  };

  /*
   * Labels used for the fields in the table.
   */
  const char *field_names[NFIELDS] = {
    "GenBank accession number",
    "Host",
    "Genome segment number",
    "Subtype",
    "Country",
    "Year",
    "Sequence length",
    "Virus name",
    "Age",
    "Gender",
    "Full-length Indicator"
  };

  /*
   * Table storage options.
   */
  hsize_t chunk_size = 10;
  int *fill_data = NULL;
  int compress = 0;

  char *line = NULL;            /* buffer owned by getline, freed below */
  size_t line_capacity = 0;
  int table_created = 0;
  hsize_t i;

  FILE *dat = fopen (DAT_PATH, "r");
  if (dat == NULL)
    {
      check_error (__FILE__, __LINE__);
      /* Nothing to read; still release the string types made above. */
      goto close_types;
    }

  /*
   * Insert the records.
   */
  while (getline (&line, &line_capacity, dat) != -1)
    {
      char *cursor;
      const char *year_text;
      herr_t status;

      /* Drop the line terminator so it is not stored in the last field. */
      line[strcspn (line, "\r\n")] = '\0';
      if (line[0] == '\0')
        continue;

      /*
       * Tokenise the getline buffer in place.  strsep moves `cursor`,
       * never `line`, so the buffer can still be reused and freed.
       */
      cursor = line;

      /* Clear the whole record, padding included, before filling it. */
      memset (&record, 0, sizeof record);

      /*
       * Parse the line, handling the case of empty fields represented
       * by sequential delimiters.
       */
      copy_field (record.genbank_accession_number,
                  sizeof record.genbank_accession_number,
                  next_field (&cursor));
      copy_field (record.host, sizeof record.host, next_field (&cursor));
      record.genome_segment_number = parse_int_field (next_field (&cursor));
      copy_field (record.subtype, sizeof record.subtype,
                  next_field (&cursor));
      copy_field (record.country, sizeof record.country,
                  next_field (&cursor));

      /*
       * Convert the year field from text to numeric.  Unknown and empty
       * values are assigned a numeric value of zero.
       */
      year_text = next_field (&cursor);
      if (strcmp (year_text, "unknown") == 0
          || strcmp (year_text, "NON") == 0)
        record.year = 0;
      else
        record.year = parse_int_field (year_text);

      record.sequence_length = parse_int_field (next_field (&cursor));
      copy_field (record.virus_name, sizeof record.virus_name,
                  next_field (&cursor));
      copy_field (record.age, sizeof record.age, next_field (&cursor));
      copy_field (record.gender, sizeof record.gender,
                  next_field (&cursor));
      copy_field (record.full_length_indicator,
                  sizeof record.full_length_indicator,
                  next_field (&cursor));

      /* The first record creates the table; later ones are appended. */
      if (!table_created)
        {
          status = H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,
                                   NFIELDS, 1, dst_size, field_names,
                                   dst_offset, field_type, chunk_size,
                                   fill_data, compress, &record);
          table_created = (status >= 0);
        }
      else
        {
          status = H5TBappend_records (file_id, TABLE_NAME, 1, dst_size,
                                       dst_offset, dst_sizes, &record);
        }

      if (status < 0)
        check_h5_error (status, __FILE__, __LINE__);
    }

  free (line);
  fclose (dat);

close_types:
  /* Only the string types were created here; H5T_NATIVE_INT is not ours. */
  for (i = 0; i < NFIELDS; i++)
    {
      if (field_type[i] >= 0 && field_type[i] != H5T_NATIVE_INT)
        H5Tclose (field_type[i]);
    }
}