summaryrefslogtreecommitdiffstats
Side-by-side diff
-rw-r--r--src/aggregator.c34
-rw-r--r--src/assign/assign_protein_type.c141
-rw-r--r--src/load/load_influenza_aa_dat.c53
-rw-r--r--src/load/load_influenza_aa_dat.h2
-rw-r--r--src/load/load_influenza_faa.c53
-rw-r--r--src/load/load_influenza_faa.h2
-rw-r--r--src/updator.c2
7 files changed, 201 insertions, 86 deletions
diff --git a/src/assign/assign_protein_type.c b/src/assign/assign_protein_type.c
index 73685bb..3947800 100644
--- a/src/assign/assign_protein_type.c
+++ b/src/assign/assign_protein_type.c
@@ -1,5 +1,6 @@
#define _GNU_SOURCE
#include "assign_protein_type.h"
+#include "error/check_error.h"
#include "error/check_h5_error.h"
#include "error/check_ncbi_error.h"
#include "model/gi_type_data.h"
@@ -84,6 +85,13 @@ assign_protein_type (hid_t file_id)
check_h5_error (status, __FILE__, __LINE__);
/*
+ * Allocate memory for the new table.
+ */
+ gi_type_data* new_buf = malloc (sizeof (gi_type_data) * faa_nrecords);
+ if (new_buf == NULL)
+ check_error (__FILE__, __LINE__);
+
+ /*
* Read the data from HDF5 gi_type_data.
*/
hsize_t gi_nfields = 0;
@@ -94,8 +102,12 @@ assign_protein_type (hid_t file_id)
hid_t gi_field_type[GI_TYPE_DATA_FIELD_NUM];
gi_type_data_init (&gi_size, gi_offset, gi_sizes, gi_field_type);
- gi_type_data* gi_buf = NULL;
+ gi_type_data* old_buf = NULL;
+ /*
+ * If the table is already present read the values into memory and
+ * then clear the table.
+ */
if (H5LTfind_dataset (file_id, "gi_type_data") == 1)
{
@@ -105,22 +117,30 @@ assign_protein_type (hid_t file_id)
&gi_nrecords);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
+
+ printf (" Using gi_type_data cache of %i records.\n", (int)gi_nrecords);
- gi_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
+ old_buf = malloc (sizeof(gi_type_data) * gi_nrecords);
status = H5TBread_table (file_id, "gi_type_data", gi_size, gi_offset,
- gi_sizes, gi_buf);
+ gi_sizes, old_buf);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status = H5TBdelete_record (file_id, "gi_type_data", 0, gi_nrecords);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
}
+
+ /*
+ * If the table is not already present create it.
+ */
else
{
printf ("Creating gi_type_data.\n");
- gi_buf = malloc (sizeof(gi_type_data) * faa_nrecords);
-
const char* gi_type_data_field_names[GI_TYPE_DATA_FIELD_NUM] =
GI_TYPE_DATA_FIELD_NAMES;
@@ -130,7 +150,7 @@ assign_protein_type (hid_t file_id)
status = H5TBmake_table ("gi_type_data", file_id,
"gi_type_data",
- GI_TYPE_DATA_FIELD_NUM, faa_nrecords,
+ GI_TYPE_DATA_FIELD_NUM, 0,
gi_size, gi_type_data_field_names,
gi_offset, gi_field_type,
chunk_size, fill_data, compress,
@@ -140,17 +160,22 @@ assign_protein_type (hid_t file_id)
}
+ /*
+ * Copy the contents of the old table into a hash.
+ */
struct hsearch_data htab;
bzero (&htab, sizeof (htab));
- hcreate_r (gi_nrecords * 2, &htab);
+ if (hcreate_r (gi_nrecords * 2, &htab) == 0)
+ error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
+ "Allocation of cache failed.");
ENTRY e, *ep;
-
- for (int i = 0; i < gi_nrecords; i++)
+
+ for (int i = 0; i < (int)gi_nrecords; i++)
{
char gi_chr[25];
- snprintf (gi_chr, 25, "%i", gi_buf[i].gi);
- e.key = gi_chr;
- e.data = &gi_buf[i];
+ snprintf (gi_chr, 25, "%i", old_buf[i].gi);
+ e.key = strdup (gi_chr);
+ e.data = &old_buf[i];
if (hsearch_r (e, ENTER, &ep, &htab) == 0)
error_at_line (EXIT_FAILURE, 0, __FILE__, __LINE__,
"Allocation failed.");
@@ -160,19 +185,23 @@ assign_protein_type (hid_t file_id)
* Assign protein types to records for which the field is empty.
*/
printf ("Records to process: %i\n", (int)faa_nrecords);
- bool updates_pending = false;
- for (int i = 0; i < faa_nrecords; i++)
+ int written = 0;
+ for (int i = 0; i < (int)faa_nrecords; i++)
{
+ new_buf[i].gi = faa_buf[i].gi;
+ strncpy (new_buf[i].type, "", sizeof (new_buf[i].type));
+ strncpy (new_buf[i].protein, "", sizeof (new_buf[i].protein));
char gi_chr[25];
snprintf (gi_chr, 25, "%i", faa_buf[i].gi);
e.key = gi_chr;
+ e.data = NULL;
+
+ /*
+ * A record was not found in the cache for this gi.
+ */
if (hsearch_r (e, FIND, &ep, &htab) == 0)
{
-
- gi_buf[i].gi = faa_buf[i].gi;
- gi_buf[i].type[0] = '\0';
- gi_buf[i].protein[0] = '\0';
/*
* Read the sequence from the database by GI.
@@ -202,7 +231,7 @@ assign_protein_type (hid_t file_id)
*/
if (error_returns != NULL)
{
- char *msg = BlastErrorToString (error_returns);
+ CharPtr msg = BlastErrorToString (error_returns);
printf ("Warning: An error has been reported by the NCBI Toolkit "
"API for sequence gi|%i: %s",
faa_buf[i].gi, msg);
@@ -221,14 +250,12 @@ assign_protein_type (hid_t file_id)
BUFFER_LEN);
// Species Type
- gi_buf[i].type[0] = target_id_buf[4];
- gi_buf[i].type[1] = '\0';
+ new_buf[i].type[0] = target_id_buf[4];
+ new_buf[i].type[1] = '\0';
// Protein Type
- strncpy (gi_buf[i].protein, &target_id_buf[6],
- sizeof (gi_buf[i].protein));
-
- updates_pending = true;
+ strncpy (new_buf[i].protein, &target_id_buf[6],
+ sizeof (new_buf[i].protein));
}
/*
@@ -246,16 +273,27 @@ assign_protein_type (hid_t file_id)
seqalign = SeqAlignSetFree (seqalign);
bsp = BioseqFree (bsp);
+ } // End existing entry not found.
+
+ /*
+ * Hash table entry found. Keep the old value.
+ */
+ else
+ {
+ gi_type_data* old_value = (gi_type_data*)ep->data;
+ new_buf[i].gi = old_value->gi;
+ strncpy (new_buf[i].type, old_value->type, sizeof (new_buf[i].type));
+ strncpy (new_buf[i].protein, old_value->protein, sizeof (new_buf[i].protein));
}
/*
* Write the data out to the file.
*/
- if ( (i % 1000 == 0) && (i > 0) && updates_pending)
+ if ( (i % 1000 == 0) && (i > 0) )
{
- status = H5TBwrite_records (file_id, "gi_type_data", i - 1000, 1000,
+ status = H5TBappend_records (file_id, "gi_type_data", 1000,
gi_size, gi_offset, gi_sizes,
- &gi_buf[i-1000]);
+ &new_buf[i-1000]);
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
@@ -263,7 +301,7 @@ assign_protein_type (hid_t file_id)
if (status < 0)
check_h5_error (status, __FILE__, __LINE__);
- updates_pending = false;
+ written = i;
printf ("Processed %i of %i records.\n", i, (int)faa_nrecords);
}
@@ -274,37 +312,34 @@ assign_protein_type (hid_t file_id)
* Write out records from the last bin if it was less than 1000
* records in size.
*/
- if (updates_pending)
+ if ((int)faa_nrecords < 1000)
{
- /*
- if ((int)faa_nrecords < 1000)
- {
- status = H5TBwrite_records (file_id, "influenza.faa", 0, nrecords,
- dst_size, dst_offset, dst_sizes,
- dst_buf);
- }
- else
- {
- status = H5TBwrite_records (file_id, "influenza.faa", nrecords - 1000, 1000,
- dst_size, dst_offset, dst_sizes,
- &dst_buf[nrecords-1000]);
- }
- if (status < 0)
- check_h5_error (status, __FILE__, __LINE__);
-
- status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
- if (status < 0)
- check_h5_error (status, __FILE__, __LINE__);
-
- updates_pending = false;
- */
+ status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords,
+ gi_size, gi_offset, gi_sizes,
+ new_buf);
+ }
+
+ else
+ {
+ status = H5TBappend_records (file_id, "gi_type_data", faa_nrecords - written,
+ gi_size, gi_offset, gi_sizes,
+ &new_buf[written]);
}
+
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
+
+ status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
+ if (status < 0)
+ check_h5_error (status, __FILE__, __LINE__);
free (faa_buf);
- free (gi_buf);
+ free (old_buf);
+ free (new_buf);
hdestroy_r (&htab);
options = BLASTOptionDelete (options);
+ readdb_destruct (seqdb);
return;
}

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.