-rw-r--r-- | src/aggregator.c | 34 | ||||
-rw-r--r-- | src/assign/assign_protein_type.c | 141 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.c | 53 | ||||
-rw-r--r-- | src/load/load_influenza_aa_dat.h | 2 | ||||
-rw-r--r-- | src/load/load_influenza_faa.c | 53 | ||||
-rw-r--r-- | src/load/load_influenza_faa.h | 2 | ||||
-rw-r--r-- | src/updator.c | 2 |
7 files changed, 201 insertions, 86 deletions
diff --git a/src/aggregator.c b/src/aggregator.c index c00d912..c9a03b5 100644 --- a/src/aggregator.c +++ b/src/aggregator.c @@ -6,28 +6,48 @@ #include "error/check_h5_error.h" #include "load/load_influenza_aa_dat.h" #include "load/load_influenza_faa.h" +#include <stdio.h> -#define FILE "influenza.h5" +#define H5FILE "influenza.h5" +#define INFLUENZA_AA_DAT "/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat" +#define INFLUENZA_FAA "/home/don/exp004/genomes/INFLUENZA/influenza.faa" int main () { /* - * Create the HDF5 file. + * Create a new HDF5 file if it does not already exist. If an + * existing file is found then open it. */ - hid_t file_id = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT); - if (file_id < 0) - check_h5_error (file_id, __FILE__, __LINE__); + hid_t file_id = 0; + FILE *f = fopen (H5FILE, "r+"); + if (f == NULL) + { + file_id = H5Fcreate (H5FILE, H5F_ACC_EXCL, H5P_DEFAULT, H5P_DEFAULT); + if (file_id < 0) + check_h5_error (file_id, __FILE__, __LINE__); + } + else + { + fclose (f); + file_id = H5Fopen (H5FILE, H5F_ACC_RDWR, H5P_DEFAULT); + if (file_id < 0) + check_h5_error (file_id, __FILE__, __LINE__); + } /* * Load the supplementary protein data file. */ - load_influenza_aa_dat (file_id); + printf ("Loading \"influenza_aa.dat\" with contents of %s.\n", + INFLUENZA_AA_DAT); + load_influenza_aa_dat (file_id, INFLUENZA_AA_DAT); /* * Load the FASTA protein sequence data file. */ - load_influenza_faa (file_id); + printf ("Loading \"influenza.faa\" with contents of %s.\n", + INFLUENZA_FAA); + load_influenza_faa (file_id, INFLUENZA_FAA); /* * Close the HDF5 file. diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c index 73685bb..3947800 100644 --- a/src/assign/assign_protein_type.c +++ b/src/assign/assign_protein_type.c @@ -1,5 +1,6 @@ #define _GNU_SOURCE #include "assign_protein_type.h" +#include "error/check_error.h" #include "error/check_h5_error.h" #include "error/check_ncbi_error.h" #include "model/gi_type_data.h" @@ -84,6 +85,13 @@ assign_protein_type (hid_t file_id) check_h5_error (status, __FILE__, __LINE__); /* + * Allocate memory for the new table. + */ + gi_type_data* new_buf = malloc (sizeof (gi_type_data) * faa_nrecords); + if (new_buf == NULL) + check_error (__FILE__, __LINE__); + + /* * Read the data from HDF5 gi_type_data. */ hsize_t gi_nfields = 0; @@ -94,8 +102,12 @@ assign_protein_type (hid_t file_id) hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM]; gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type); - gi_type_data* gi_buf = NULL; + gi_type_data* old_buf = NULL; + /* + * If the table is already present read the values into memory and + * then clear the table. + */ if (H5LTfind_dataset (file_id, "gi_type_data") == 1) { @@ -105,22 +117,30 @@ assign_protein_type (hid_t file_id) &gi_nrecords); if (status < 0) check_h5_error (status, __FILE__, __LINE__); + + printf (" Using gi_type_data cache of %i records.\n", (int)gi_nrecords); - gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords); + old_buf = malloc (sizeof(gi_type_data) * gi_nrecords); status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset, - gi_sizes, gi_buf); + gi_sizes, old_buf); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords); if (status < 0) check_h5_error (status, __FILE__, __LINE__); } + + /* + * If the table is not already present create it. + */ else { printf ("Creating gi_type_data.\n"); - gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords); - const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] = GI_TYPE_DATA_FIELD_NAMES; @@ -130,7 +150,7 @@ assign_protein_type (hid_t file_id) status = H5TBmake_table ("gi_type_data", file_id, "gi_type_data", - GI_TYPE_DATA_FIELD_NUM, faa_nrecords, + GI_TYPE_DATA_FIELD_NUM, 0, gi_size, gi_type_data_field_names, gi_offset, gi_field_type, chunk_size, fill_data, compress, @@ -140,17 +160,22 @@ assign_protein_type (hid_t file_id) } + /* + * Copy the contents of the old table into a hash. + */ struct hsearch_data htab; bzero (&htab, sizeof (htab)); - hcreate_r (gi_nrecords * 2, &htab); + if (hcreate_r (gi_nrecords * 2, &htab) == 0) + error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, + "Allocation of cache failed."); ENTRY e, *ep; - - for (int i = 0; i < gi_nrecords; i++) + + for (int i = 0; i < (int)gi_nrecords; i++) { char gi_chr[25]; - snprintf (gi_chr, 25, "%i", gi_buf[i].gi); - e.key = gi_chr; - e.data = &gi_buf[i]; + snprintf (gi_chr, 25, "%i", old_buf[i].gi); + e.key = strdup (gi_chr); + e.data = &old_buf[i]; if (hsearch_r (e, ENTER, &ep, &htab) == 0) error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__, "Allocation failed."); @@ -160,19 +185,23 @@ assign_protein_type (hid_t file_id) * Assign protein types to records for which the field is empty. */ printf ("Records to process: %i\n", (int)faa_nrecords); - bool updates_pending = false; - for (int i = 0; i < faa_nrecords; i++) + int written = 0; + for (int i = 0; i < (int)faa_nrecords; i++) { + new_buf[i].gi = faa_buf[i].gi; + strncpy (new_buf[i].type, "", sizeof (new_buf[i].type)); + strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein)); char gi_chr[25]; snprintf (gi_chr, 25, "%i", faa_buf[i].gi); e.key = gi_chr; + e.data = NULL; + + /* + * A record was not found in the cache for this gi. + */ if (hsearch_r (e, FIND, &ep, &htab) == 0) { - - gi_buf[i].gi = faa_buf[i].gi; - gi_buf[i].type[0] = '\0'; - gi_buf[i].protein[0] = '\0'; /* * Read the sequence from the database by GI. @@ -202,7 +231,7 @@ assign_protein_type (hid_t file_id) */ if (error_returns != NULL) { - char *msg = BlastErrorToString (error_returns); + CharPtr msg = BlastErrorToString (error_returns); printf ("Warning: An error has been reported by the NCBI Toolkit " "API for sequence gi|%i: %s", faa_buf[i].gi, msg); @@ -221,14 +250,12 @@ assign_protein_type (hid_t file_id) BUFFER_LEN); // Species Type - gi_buf[i].type[0] = target_id_buf[4]; - gi_buf[i].type[1] = '\0'; + new_buf[i].type[0] = target_id_buf[4]; + new_buf[i].type[1] = '\0'; // Protein Type - strncpy (gi_buf[i].protein, &target_id_buf[6], - sizeof (gi_buf[i].protein)); - - updates_pending = true; + strncpy (new_buf[i].protein, &target_id_buf[6], + sizeof (new_buf[i].protein)); } /* @@ -246,16 +273,27 @@ assign_protein_type (hid_t file_id) seqalign = SeqAlignSetFree (seqalign); bsp = BioseqFree (bsp); + } // End existing entry not found. + + /* + * Hash table entry found. Keep the old value. + */ + else + { + gi_type_data* old_value = (gi_type_data*)ep->data; + new_buf[i].gi = old_value->gi; + strncpy (new_buf[i].type, old_value->type, sizeof (new_buf[i].type)); + strncpy (new_buf[i].protein, old_value->protein, sizeof (new_buf[i].protein)); } /* * Write the data out to the file. */ - if ( (i % 1000 == 0) && (i > 0) && updates_pending) + if ( (i % 1000 == 0) && (i > 0) ) { - status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000, + status = H5TBappend_records (file_id, "gi_type_data", 1000, gi_size, gi_offset, gi_sizes, - &gi_buf[i-1000]); + &new_buf[i-1000]); if (status < 0) check_h5_error (status, __FILE__, __LINE__); @@ -263,7 +301,7 @@ assign_protein_type (hid_t file_id) if (status < 0) check_h5_error (status, __FILE__, __LINE__); - updates_pending = false; + written = i; printf ("Processed %i of %i records.\n", i, (int)faa_nrecords); } @@ -274,37 +312,34 @@ assign_protein_type (hid_t file_id) * Write out records from the last bin if it was less than 1000 * records in size. */ - if (updates_pending) + if ((int)faa_nrecords < 1000) { - /* - if ((int)faa_nrecords < 1000) - { - status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords, - dst_size, dst_offset, dst_sizes, - dst_buf); - } - else - { - status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000, - dst_size, dst_offset, dst_sizes, - &dst_buf[nrecords-1000]); - } - if (status < 0) - check_h5_error (status, __FILE__, __LINE__); - - status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); - if (status < 0) - check_h5_error (status, __FILE__, __LINE__); - - updates_pending = false; - */ + status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords, + gi_size, gi_offset, gi_sizes, + new_buf); + } + + else + { + status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written, + gi_size, gi_offset, gi_sizes, + &new_buf[written]); } + + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); free (faa_buf); - free (gi_buf); + free (old_buf); + free (new_buf); hdestroy_r (&htab); options = BLASTOptionDelete (options); + readdb_destruct (seqdb); return; } diff --git a/src/load/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c index 8bf47aa..3826349 100644 --- a/src/load/load_influenza_aa_dat.c +++ b/src/load/load_influenza_aa_dat.c @@ -13,10 +13,9 @@ #include <stdlib.h> #define NFIELDS (hsize_t) 11 -#define TABLE_NAME "influenza_aa.dat" void -load_influenza_aa_dat (hid_t file_id) +load_influenza_aa_dat (hid_t file_id, const char* file_name) { /* * Model the data using native types. @@ -145,8 +144,7 @@ load_influenza_aa_dat (hid_t file_id) * Insert the records. */ supplementary_data p_data; - FILE *dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat", - "r"); + FILE *dat = fopen (file_name, "r"); if (dat == NULL) check_error (__FILE__, __LINE__); char *line = NULL; @@ -214,18 +212,49 @@ load_influenza_aa_dat (hid_t file_id) if (current_line == 1) { - herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, - TABLE_NAME, NFIELDS, 1, dst_size, - field_names, dst_offset, field_type, - chunk_size, fill_data, compress, - &p_data); - if (status < 0) - check_h5_error (status, __FILE__, __LINE__); + + /* + * Dataset already exists. Purge it. + */ + if (H5LTfind_dataset (file_id, "influenza_aa.dat") == 1) + { + hsize_t nfields = 0; + hsize_t nrecords = 0; + herr_t status = H5TBget_table_info (file_id, "influenza_aa.dat", + &nfields, &nrecords); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5TBdelete_record (file_id, "influenza_aa.dat", 0, nrecords); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = + H5TBappend_records (file_id, "influenza_aa.dat", 1, dst_size, + dst_offset, dst_sizes, &p_data); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + } + + /* + * Dataset does not exist. Create it. + */ + else + { + herr_t status = H5TBmake_table ("influenza_aa.dat", file_id, + "influenza_aa.dat", NFIELDS, 1, dst_size, + field_names, dst_offset, field_type, + chunk_size, fill_data, compress, + &p_data); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + } } + else { herr_t status = - H5TBappend_records (file_id, TABLE_NAME, 1, dst_size, + H5TBappend_records (file_id, "influenza_aa.dat", 1, dst_size, dst_offset, dst_sizes, &p_data); if (status < 0) check_h5_error (status, __FILE__, __LINE__); diff --git a/src/load/load_influenza_aa_dat.h b/src/load/load_influenza_aa_dat.h index f6c60be..97e36f8 100644 --- a/src/load/load_influenza_aa_dat.h +++ b/src/load/load_influenza_aa_dat.h @@ -7,6 +7,6 @@ * Load the supplementary protein data from the NCBI influenza_aa.dat * file. */ -void load_influenza_aa_dat (hid_t file_id); +void load_influenza_aa_dat (hid_t file_id, const char* file_name); #endif // LOAD_INFLUENZA_AA_DAT_H diff --git a/src/load/load_influenza_faa.c b/src/load/load_influenza_faa.c index a217989..04bf05b 100644 --- a/src/load/load_influenza_faa.c +++ b/src/load/load_influenza_faa.c @@ -8,7 +8,7 @@ #include <stdlib.h> void -load_influenza_faa (hid_t file_id) +load_influenza_faa (hid_t file_id, const char* file_name) { size_t dst_size; size_t dst_offset[SEQUENCE_DATA_FIELD_NUM]; @@ -22,8 +22,7 @@ load_influenza_faa (hid_t file_id) int compress = 0; sequence_data p_data; - FILE *dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza.faa", - "r"); + FILE *dat = fopen (file_name, "r"); if (dat == NULL) check_error (__FILE__, __LINE__); char *line = NULL; @@ -62,16 +61,46 @@ load_influenza_faa (hid_t file_id) if (current_line == 1) { - herr_t status = H5TBmake_table ("influenza.faa", file_id, - "influenza.faa", - SEQUENCE_DATA_FIELD_NUM, 1, - dst_size, sequence_data_field_names, - dst_offset, field_type, - chunk_size, fill_data, compress, - &p_data); - if (status < 0) - check_h5_error (status, __FILE__, __LINE__); + /* + * Dataset already exists. Purge it. + */ + if (H5LTfind_dataset (file_id, "influenza.faa") == 1) + { + hsize_t nfields = 0; + hsize_t nrecords = 0; + herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields, + &nrecords); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = H5TBdelete_record (file_id, "influenza.faa", 0, nrecords); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + + status = + H5TBappend_records (file_id, "influenza.faa", 1, dst_size, + dst_offset, dst_sizes, &p_data); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + } + + /* + * Dataset does not exist. Create it. + */ + else + { + herr_t status = H5TBmake_table ("influenza.faa", file_id, + "influenza.faa", + SEQUENCE_DATA_FIELD_NUM, 1, + dst_size, sequence_data_field_names, + dst_offset, field_type, + chunk_size, fill_data, compress, + &p_data); + if (status < 0) + check_h5_error (status, __FILE__, __LINE__); + } } + else { herr_t status = diff --git a/src/load/load_influenza_faa.h b/src/load/load_influenza_faa.h index 569c411..1ad5797 100644 --- a/src/load/load_influenza_faa.h +++ b/src/load/load_influenza_faa.h @@ -6,6 +6,6 @@ /* * Load the protein sequence data from the NCBI influenza.faa file. */ -void load_influenza_faa (hid_t file_id); +void load_influenza_faa (hid_t file_id, const char* file_name); #endif // LOAD_INFLUENZA_FAA_H diff --git a/src/updator.c b/src/updator.c index 591d2f6..9a5ad18 100644 --- a/src/updator.c +++ b/src/updator.c @@ -4,6 +4,8 @@ #include "assign/assign_protein_type.h" #include "error/check_h5_error.h" +#include <stdio.h> +#include <signal.h> #define FILE "influenza.h5" |