summaryrefslogtreecommitdiffstats
Side-by-side diff
-rw-r--r--src/aggregator.c34
-rw-r--r--src/assign/assign_protein_type.c141
-rw-r--r--src/load/load_influenza_aa_dat.c53
-rw-r--r--src/load/load_influenza_aa_dat.h2
-rw-r--r--src/load/load_influenza_faa.c53
-rw-r--r--src/load/load_influenza_faa.h2
-rw-r--r--src/updator.c2
7 files changed, 201 insertions, 86 deletions
diff --git a/src/aggregator.c b/src/aggregator.c
index c00d912..c9a03b5 100644
--- a/src/aggregator.c
+++ b/src/aggregator.c
@@ -6,28 +6,48 @@
#include "error/check_h5_error.h"
#include "load/load_influenza_aa_dat.h"
#include "load/load_influenza_faa.h"
+#include <stdio.h>
-#define FILE "influenza.h5"
+#define H5FILE "influenza.h5"
+#define INFLUENZA_AA_DAT "/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat"
+#define INFLUENZA_FAA "/home/don/exp004/genomes/INFLUENZA/influenza.faa"
int
main ()
{
/*
- * Create the HDF5 file.
+ * Create a new HDF5 file if it does not already exist. If an
+ * existing file is found then open it.
*/
- hid_t file_id = H5Fcreate (FILE, H5F_ACC_TRUNC, H5P_DEFAULT, H5P_DEFAULT);
- if (file_id < 0)
- check_h5_error (file_id, __FILE__, __LINE__);
+ hid_t file_id = 0;
+ FILE *f = fopen (H5FILE, "r+");
+ if (f == NULL)
+ {
+ file_id = H5Fcreate (H5FILE, H5F_ACC_EXCL, H5P_DEFAULT, H5P_DEFAULT);
+ if (file_id < 0)
+ check_h5_error (file_id, __FILE__, __LINE__);
+ }
+ else
+ {
+ fclose (f);
+ file_id = H5Fopen (H5FILE, H5F_ACC_RDWR, H5P_DEFAULT);
+ if (file_id < 0)
+ check_h5_error (file_id, __FILE__, __LINE__);
+ }
/*
* Load the supplementary protein data file.
*/
- load_influenza_aa_dat (file_id);
+ printf ("Loading \"influenza_aa.dat\" with contents of %s.\n",
+ INFLUENZA_AA_DAT);
+ load_influenza_aa_dat (file_id, INFLUENZA_AA_DAT);
/*
* Load the FASTA protein sequence data file.
*/
- load_influenza_faa (file_id);
+ printf ("Loading \"influenza.faa\" with contents of %s.\n",
+ INFLUENZA_FAA);
+ load_influenza_faa (file_id, INFLUENZA_FAA);
/*
* Close the HDF5 file.
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c
index 73685bb..3947800 100644
--- a/src/assign/assign_protein_type.c
+++ b/src/assign/assign_protein_type.c
@@ -1,5 +1,6 @@
#define _GNU_SOURCE
#include "assign_protein_type.h"
+#include "error/check_error.h"
#include "error/check_h5_error.h"
#include "error/check_ncbi_error.h"
#include "model/gi_type_data.h"
@@ -84,6 +85,13 @@ assign_protein_type (hid_t file_id)
check_h5_error (status, __FILE__, __LINE__);
/*
+ * Allocate memory for the new table.
+ */
+ gi_type_data* new_buf = malloc (sizeof (gi_type_data) * faa_nrecords);
+ if (new_buf == NULL)
+ check_error (__FILE__, __LINE__);
+
+ /*
* Read the data from HDF5 gi_type_data.
*/
hsize_t gi_nfields = 0;
@@ -94,8 +102,12 @@ assign_protein_type (hid_t file_id)
hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
- gi_type_data* gi_buf = NULL;
+ gi_type_data* old_buf = NULL;
+ /*
+ * If the table is already present read the values into memory and
+ * then clear the table.
+ */
if (H5LTfind_dataset (file_id, "gi_type_data") == 1)
{
@@ -105,22 +117,30 @@ assign_protein_type (hid_t file_id)
&gi_nrecords);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
+
+ printf (" Using gi_type_data cache of %i records.\n", (int)gi_nrecords);
- gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
+ old_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
- gi_sizes, gi_buf);
+ gi_sizes, old_buf);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
}
+
+ /*
+ * If the table is not already present create it.
+ */
else
{
printf ("Creating gi_type_data.\n");
- gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords);
-
const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] =
GI_TYPE_DATA_FIELD_NAMES;
@@ -130,7 +150,7 @@ assign_protein_type (hid_t file_id)
status = H5TBmake_table ("gi_type_data", file_id,
"gi_type_data",
- GI_TYPE_DATA_FIELD_NUM, faa_nrecords,
+ GI_TYPE_DATA_FIELD_NUM, 0,
gi_size, gi_type_data_field_names,
gi_offset, gi_field_type,
chunk_size, fill_data, compress,
@@ -140,17 +160,22 @@ assign_protein_type (hid_t file_id)
}
+ /*
+ * Copy the contents of the old table into a hash.
+ */
struct hsearch_data htab;
bzero (&htab, sizeof (htab));
- hcreate_r (gi_nrecords * 2, &htab);
+ if (hcreate_r (gi_nrecords * 2, &htab) == 0)
+ error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
+ "Allocation of cache failed.");
ENTRY e, *ep;
-
- for (int i = 0; i < gi_nrecords; i++)
+
+ for (int i = 0; i < (int)gi_nrecords; i++)
{
char gi_chr[25];
- snprintf (gi_chr, 25, "%i", gi_buf[i].gi);
- e.key = gi_chr;
- e.data = &gi_buf[i];
+ snprintf (gi_chr, 25, "%i", old_buf[i].gi);
+ e.key = strdup (gi_chr);
+ e.data = &old_buf[i];
if (hsearch_r (e, ENTER, &ep, &htab) == 0)
error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
"Allocation failed.");
@@ -160,19 +185,23 @@ assign_protein_type (hid_t file_id)
* Assign protein types to records for which the field is empty.
*/
printf ("Records to process: %i\n", (int)faa_nrecords);
- bool updates_pending = false;
- for (int i = 0; i < faa_nrecords; i++)
+ int written = 0;
+ for (int i = 0; i < (int)faa_nrecords; i++)
{
+ new_buf[i].gi = faa_buf[i].gi;
+ strncpy (new_buf[i].type, "", sizeof (new_buf[i].type));
+ strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein));
char gi_chr[25];
snprintf (gi_chr, 25, "%i", faa_buf[i].gi);
e.key = gi_chr;
+ e.data = NULL;
+
+ /*
+ * A record was not found in the cache for this gi.
+ */
if (hsearch_r (e, FIND, &ep, &htab) == 0)
{
-
- gi_buf[i].gi = faa_buf[i].gi;
- gi_buf[i].type[0] = '\0';
- gi_buf[i].protein[0] = '\0';
/*
* Read the sequence from the database by GI.
@@ -202,7 +231,7 @@ assign_protein_type (hid_t file_id)
*/
if (error_returns != NULL)
{
- char *msg = BlastErrorToString (error_returns);
+ CharPtr msg = BlastErrorToString (error_returns);
printf ("Warning: An error has been reported by the NCBI Toolkit "
"API for sequence gi|%i: %s",
faa_buf[i].gi, msg);
@@ -221,14 +250,12 @@ assign_protein_type (hid_t file_id)
BUFFER_LEN);
// Species Type
- gi_buf[i].type[0] = target_id_buf[4];
- gi_buf[i].type[1] = '\0';
+ new_buf[i].type[0] = target_id_buf[4];
+ new_buf[i].type[1] = '\0';
// Protein Type
- strncpy (gi_buf[i].protein, &target_id_buf[6],
- sizeof (gi_buf[i].protein));
-
- updates_pending = true;
+ strncpy (new_buf[i].protein, &target_id_buf[6],
+ sizeof (new_buf[i].protein));
}
/*
@@ -246,16 +273,27 @@ assign_protein_type (hid_t file_id)
seqalign = SeqAlignSetFree (seqalign);
bsp = BioseqFree (bsp);
+ } // End existing entry not found.
+
+ /*
+ * Hash table entry found. Keep the old value.
+ */
+ else
+ {
+ gi_type_data* old_value = (gi_type_data*)ep->data;
+ new_buf[i].gi = old_value->gi;
+ strncpy (new_buf[i].type, old_value->type, sizeof (new_buf[i].type));
+ strncpy (new_buf[i].protein, old_value->protein, sizeof (new_buf[i].protein));
}
/*
* Write the data out to the file.
*/
- if ( (i % 1000 == 0) && (i > 0) && updates_pending)
+ if ( (i % 1000 == 0) && (i > 0) )
{
- status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000,
+ status = H5TBappend_records (file_id, "gi_type_data", 1000,
gi_size, gi_offset, gi_sizes,
- &gi_buf[i-1000]);
+ &new_buf[i-1000]);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
@@ -263,7 +301,7 @@ assign_protein_type (hid_t file_id)
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
- updates_pending = false;
+ written = i;
printf ("Processed %i of %i records.\n", i, (int)faa_nrecords);
}
@@ -274,37 +312,34 @@ assign_protein_type (hid_t file_id)
* Write out records from the last bin if it was less than 1000
* records in size.
*/
- if (updates_pending)
+ if ((int)faa_nrecords < 1000)
{
- /*
- if ((int)faa_nrecords < 1000)
- {
- status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords,
- dst_size, dst_offset, dst_sizes,
- dst_buf);
- }
- else
- {
- status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000,
- dst_size, dst_offset, dst_sizes,
- &dst_buf[nrecords-1000]);
- }
- if (status < 0)
- check_h5_error (status, __FILE__, __LINE__);
-
- status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
- if (status < 0)
- check_h5_error (status, __FILE__, __LINE__);
-
- updates_pending = false;
- */
+ status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords,
+ gi_size, gi_offset, gi_sizes,
+ new_buf);
+ }
+
+ else
+ {
+ status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written,
+ gi_size, gi_offset, gi_sizes,
+ &new_buf[written]);
}
+
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
free (faa_buf);
- free (gi_buf);
+ free (old_buf);
+ free (new_buf);
hdestroy_r (&htab);
options = BLASTOptionDelete (options);
+ readdb_destruct (seqdb);
return;
}
diff --git a/src/load/load_influenza_aa_dat.c b/src/load/load_influenza_aa_dat.c
index 8bf47aa..3826349 100644
--- a/src/load/load_influenza_aa_dat.c
+++ b/src/load/load_influenza_aa_dat.c
@@ -13,10 +13,9 @@
#include <stdlib.h>
#define NFIELDS (hsize_t) 11
-#define TABLE_NAME "influenza_aa.dat"
void
-load_influenza_aa_dat (hid_t file_id)
+load_influenza_aa_dat (hid_t file_id, const char* file_name)
{
/*
* Model the data using native types.
@@ -145,8 +144,7 @@ load_influenza_aa_dat (hid_t file_id)
* Insert the records.
*/
supplementary_data p_data;
- FILE *dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza_aa.dat",
- "r");
+ FILE *dat = fopen (file_name, "r");
if (dat == NULL)
check_error (__FILE__, __LINE__);
char *line = NULL;
@@ -214,18 +212,49 @@ load_influenza_aa_dat (hid_t file_id)
if (current_line == 1)
{
- herr_t status = H5TBmake_table ("influenza_aa.dat", file_id,
- TABLE_NAME, NFIELDS, 1, dst_size,
- field_names, dst_offset, field_type,
- chunk_size, fill_data, compress,
- &p_data);
- if (status < 0)
- check_h5_error (status, __FILE__, __LINE__);
+
+ /*
+ * Dataset already exists. Purge it.
+ */
+ if (H5LTfind_dataset (file_id, "influenza_aa.dat") == 1)
+ {
+ hsize_t nfields = 0;
+ hsize_t nrecords = 0;
+ herr_t status = H5TBget_table_info (file_id, "influenza_aa.dat",
+ &nfields, &nrecords);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status = H5TBdelete_record (file_id, "influenza_aa.dat", 0, nrecords);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status =
+ H5TBappend_records (file_id, "influenza_aa.dat", 1, dst_size,
+ dst_offset, dst_sizes, &p_data);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+ }
+
+ /*
+ * Dataset does not exist. Create it.
+ */
+ else
+ {
+ herr_t status = H5TBmake_table ("influenza_aa.dat", file_id,
+ "influenza_aa.dat", NFIELDS, 1, dst_size,
+ field_names, dst_offset, field_type,
+ chunk_size, fill_data, compress,
+ &p_data);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+ }
}
+
else
{
herr_t status =
- H5TBappend_records (file_id, TABLE_NAME, 1, dst_size,
+ H5TBappend_records (file_id, "influenza_aa.dat", 1, dst_size,
dst_offset, dst_sizes, &p_data);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
diff --git a/src/load/load_influenza_aa_dat.h b/src/load/load_influenza_aa_dat.h
index f6c60be..97e36f8 100644
--- a/src/load/load_influenza_aa_dat.h
+++ b/src/load/load_influenza_aa_dat.h
@@ -7,6 +7,6 @@
* Load the supplementary protein data from the NCBI influenza_aa.dat
* file.
*/
-void load_influenza_aa_dat (hid_t file_id);
+void load_influenza_aa_dat (hid_t file_id, const char* file_name);
#endif // LOAD_INFLUENZA_AA_DAT_H
diff --git a/src/load/load_influenza_faa.c b/src/load/load_influenza_faa.c
index a217989..04bf05b 100644
--- a/src/load/load_influenza_faa.c
+++ b/src/load/load_influenza_faa.c
@@ -8,7 +8,7 @@
#include <stdlib.h>
void
-load_influenza_faa (hid_t file_id)
+load_influenza_faa (hid_t file_id, const char* file_name)
{
size_t dst_size;
size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
@@ -22,8 +22,7 @@ load_influenza_faa (hid_t file_id)
int compress = 0;
sequence_data p_data;
- FILE *dat = fopen ("/home/don/exp004/genomes/INFLUENZA/influenza.faa",
- "r");
+ FILE *dat = fopen (file_name, "r");
if (dat == NULL)
check_error (__FILE__, __LINE__);
char *line = NULL;
@@ -62,16 +61,46 @@ load_influenza_faa (hid_t file_id)
if (current_line == 1)
{
- herr_t status = H5TBmake_table ("influenza.faa", file_id,
- "influenza.faa",
- SEQUENCE_DATA_FIELD_NUM, 1,
- dst_size, sequence_data_field_names,
- dst_offset, field_type,
- chunk_size, fill_data, compress,
- &p_data);
- if (status < 0)
- check_h5_error (status, __FILE__, __LINE__);
+ /*
+ * Dataset already exists. Purge it.
+ */
+ if (H5LTfind_dataset (file_id, "influenza.faa") == 1)
+ {
+ hsize_t nfields = 0;
+ hsize_t nrecords = 0;
+ herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields,
+ &nrecords);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status = H5TBdelete_record (file_id, "influenza.faa", 0, nrecords);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status =
+ H5TBappend_records (file_id, "influenza.faa", 1, dst_size,
+ dst_offset, dst_sizes, &p_data);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+ }
+
+ /*
+ * Dataset does not exist. Create it.
+ */
+ else
+ {
+ herr_t status = H5TBmake_table ("influenza.faa", file_id,
+ "influenza.faa",
+ SEQUENCE_DATA_FIELD_NUM, 1,
+ dst_size, sequence_data_field_names,
+ dst_offset, field_type,
+ chunk_size, fill_data, compress,
+ &p_data);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+ }
}
+
else
{
herr_t status =
diff --git a/src/load/load_influenza_faa.h b/src/load/load_influenza_faa.h
index 569c411..1ad5797 100644
--- a/src/load/load_influenza_faa.h
+++ b/src/load/load_influenza_faa.h
@@ -6,6 +6,6 @@
/*
* Load the protein sequence data from the NCBI influenza.faa file.
*/
-void load_influenza_faa (hid_t file_id);
+void load_influenza_faa (hid_t file_id, const char* file_name);
#endif // LOAD_INFLUENZA_FAA_H
diff --git a/src/updator.c b/src/updator.c
index 591d2f6..9a5ad18 100644
--- a/src/updator.c
+++ b/src/updator.c
@@ -4,6 +4,8 @@
#include "assign/assign_protein_type.h"
#include "error/check_h5_error.h"
+#include <stdio.h>
+#include <signal.h>
#define FILE "influenza.h5"

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.