/*
 * Load the influenza_aa.dat tab delimited text file into an HDF5
 * binary table.
 *
 * todo: Handle NULL values occurring in numeric fields (empty,
 * non-numeric and out-of-range values are currently stored as zero).
 */

#include "load_influenza_aa_dat.h"
#include "error/check_error.h"
#include "error/check_h5_error.h"
/*
 * NOTE(review): the HDF5 declarations (hid_t, H5T*, H5TB*) are assumed
 * to arrive through load_influenza_aa_dat.h -- confirm.
 */
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NFIELDS ((hsize_t) 11)
#define TABLE_NAME "influenza_aa.dat"
#define INPUT_PATH "/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat"

/*
 * One record of the input file, modelled with native types.  The
 * character arrays are fixed-size and always NUL terminated by
 * copy_field ().
 */
typedef struct
{
  char genbank_accession_number[9];
  char host[15];
  int genome_segment_number;
  char subtype[7];
  char country[25];
  int year;
  int sequence_length;
  char virus_name[196];
  char age[17];
  char gender[6];
  char full_length_indicator[4];
} supplementary_data;

/*
 * Return the next tab delimited field of the line at *cursor, or an
 * empty string once the line has run out of fields.  Sequential
 * delimiters yield empty fields.
 */
static const char *
next_field (char **cursor)
{
  const char *token = strsep (cursor, "\t");

  return token != NULL ? token : "";
}

/*
 * Copy src into the fixed-size buffer dst, truncating if necessary.
 * The whole buffer is zeroed first so that the result is always NUL
 * terminated and no bytes of a previous record are left behind.
 */
static void
copy_field (char *dst, size_t dst_size, const char *src)
{
  size_t length = strlen (src);

  if (length >= dst_size)
    length = dst_size - 1;

  memset (dst, 0, dst_size);
  memcpy (dst, src, length);
}

/*
 * Convert a text field to an int.  Leading digits are accepted the
 * way atoi () accepts them; empty, non-numeric (e.g. "unknown",
 * "NON") and out-of-range values give zero.
 */
static int
parse_int_field (const char *token)
{
  char *end = NULL;

  errno = 0;
  long value = strtol (token, &end, 10);

  if (end == token || errno == ERANGE || value < INT_MIN || value > INT_MAX)
    return 0;

  return (int) value;
}

/*
 * Split one input line (line ending already removed) into record.
 * The line buffer is modified in place by the tokeniser.
 */
static void
parse_record (char *line, supplementary_data *record)
{
  char *cursor = line;

  copy_field (record->genbank_accession_number,
              sizeof record->genbank_accession_number, next_field (&cursor));
  copy_field (record->host, sizeof record->host, next_field (&cursor));
  record->genome_segment_number = parse_int_field (next_field (&cursor));
  copy_field (record->subtype, sizeof record->subtype, next_field (&cursor));
  copy_field (record->country, sizeof record->country, next_field (&cursor));

  /* Unknown and empty years are assigned a numeric value of zero. */
  record->year = parse_int_field (next_field (&cursor));
  record->sequence_length = parse_int_field (next_field (&cursor));

  copy_field (record->virus_name, sizeof record->virus_name,
              next_field (&cursor));
  copy_field (record->age, sizeof record->age, next_field (&cursor));
  copy_field (record->gender, sizeof record->gender, next_field (&cursor));
  copy_field (record->full_length_indicator,
              sizeof record->full_length_indicator, next_field (&cursor));
}

/*
 * Create a fixed-size HDF5 C string type of the given size.  Failures
 * are reported through check_h5_error ().
 */
static hid_t
make_string_type (size_t size)
{
  hid_t type = H5Tcopy (H5T_C_S1);

  if (type < 0)
    {
      check_h5_error (-1, __FILE__, __LINE__);
      return type;
    }

  herr_t status = H5Tset_size (type, size);
  if (status < 0)
    check_h5_error (status, __FILE__, __LINE__);

  return type;
}

/*
 * Read INPUT_PATH line by line and store every record in the HDF5
 * table TABLE_NAME under file_id.  The table is created from the first
 * record and the remaining records are appended one at a time.  Blank
 * lines are skipped.
 *
 * file_id: HDF5 location identifier the table is created in.
 *
 * Errors are reported through check_error () / check_h5_error ().
 */
void
load_influenza_aa_dat (hid_t file_id)
{
  /*
   * Use an HDF5 Table for storage.
   * http://www.hdfgroup.org/HDF5/Tutor/h5table.html
   */
  supplementary_data p_data;

  /*
   * "Calculate the size and the offsets of our struct members in
   * memory."
   */
  size_t dst_size = sizeof (supplementary_data);
  size_t dst_offset[NFIELDS] = {
    HOFFSET (supplementary_data, genbank_accession_number),
    HOFFSET (supplementary_data, host),
    HOFFSET (supplementary_data, genome_segment_number),
    HOFFSET (supplementary_data, subtype),
    HOFFSET (supplementary_data, country),
    HOFFSET (supplementary_data, year),
    HOFFSET (supplementary_data, sequence_length),
    HOFFSET (supplementary_data, virus_name),
    HOFFSET (supplementary_data, age),
    HOFFSET (supplementary_data, gender),
    HOFFSET (supplementary_data, full_length_indicator)
  };
  size_t dst_sizes[NFIELDS] = {
    sizeof p_data.genbank_accession_number,
    sizeof p_data.host,
    sizeof p_data.genome_segment_number,
    sizeof p_data.subtype,
    sizeof p_data.country,
    sizeof p_data.year,
    sizeof p_data.sequence_length,
    sizeof p_data.virus_name,
    sizeof p_data.age,
    sizeof p_data.gender,
    sizeof p_data.full_length_indicator
  };

  /*
   * Map the native types to HDF5 types for each field.  String sizes
   * are taken from the struct so the two cannot drift apart.
   */
  hid_t genbank_accession_number_type =
    make_string_type (sizeof p_data.genbank_accession_number);
  hid_t host_type = make_string_type (sizeof p_data.host);
  hid_t subtype_type = make_string_type (sizeof p_data.subtype);
  hid_t country_type = make_string_type (sizeof p_data.country);
  hid_t virus_name_type = make_string_type (sizeof p_data.virus_name);
  hid_t age_type = make_string_type (sizeof p_data.age);
  hid_t gender_type = make_string_type (sizeof p_data.gender);
  hid_t full_length_indicator_type =
    make_string_type (sizeof p_data.full_length_indicator);

  hid_t field_type[NFIELDS] = {
    genbank_accession_number_type,
    host_type,
    H5T_NATIVE_INT,
    subtype_type,
    country_type,
    H5T_NATIVE_INT,
    H5T_NATIVE_INT,
    virus_name_type,
    age_type,
    gender_type,
    full_length_indicator_type
  };

  /*
   * Labels used for the fields in the table.
   */
  const char *field_names[NFIELDS] = {
    "GenBank accession number",
    "Host",
    "Genome segment number",
    "Subtype",
    "Country",
    "Year",
    "Sequence length",
    "Virus name",
    "Age",
    "Gender",
    "Full-length Indicator"
  };

  /*
   * Table storage options.
   */
  hsize_t chunk_size = 10;
  int *fill_data = NULL;
  int compress = 0;

  /*
   * Insert the records.
   */
  FILE *dat = fopen (INPUT_PATH, "r");
  if (dat == NULL)
    {
      /* NOTE(review): if check_error () returns, loading is skipped. */
      check_error (__FILE__, __LINE__);
    }
  else
    {
      char *line = NULL;
      size_t len = 0;
      int table_created = 0;

      while (getline (&line, &len, dat) != -1)
        {
          /* Drop the line ending so it is not stored in the last field. */
          line[strcspn (line, "\r\n")] = '\0';
          if (line[0] == '\0')
            continue;

          parse_record (line, &p_data);

          if (!table_created)
            {
              herr_t status =
                H5TBmake_table ("influenza_aa.dat", file_id, TABLE_NAME,
                                NFIELDS, 1, dst_size, field_names,
                                dst_offset, field_type, chunk_size,
                                fill_data, compress, &p_data);
              if (status < 0)
                check_h5_error (status, __FILE__, __LINE__);
              table_created = 1;
            }
          else
            {
              herr_t status =
                H5TBappend_records (file_id, TABLE_NAME, 1, dst_size,
                                    dst_offset, dst_sizes, &p_data);
              if (status < 0)
                check_h5_error (status, __FILE__, __LINE__);
            }
        }

      free (line);
      fclose (dat);
    }

  H5Tclose (genbank_accession_number_type);
  H5Tclose (host_type);
  H5Tclose (subtype_type);
  H5Tclose (country_type);
  H5Tclose (virus_name_type);
  H5Tclose (age_type);
  H5Tclose (gender_type);
  H5Tclose (full_length_indicator_type);
}