From 6f7b615d1e0a2aac33712792d6146b42f2623e8f Mon Sep 17 00:00:00 2001 From: Don Pellegrino Date: Sun, 24 Jan 2010 04:18:36 +0000 Subject: Modified to load the blast scores from a text file into the HDF5 file. --- diff --git a/src/load/load_blast_scores.c b/src/load/load_blast_scores.c index 42e6bd9..e6a6fd3 100644 --- a/src/load/load_blast_scores.c +++ b/src/load/load_blast_scores.c @@ -1,3 +1,4 @@ +#define _GNU_SOURCE #include "error/check_error.h" #include "error/check_h5_error.h" #include "model/blast_scores_data.h" @@ -6,7 +7,19 @@ #include #include #include - +#include + +/* + * A simple sanity check can be performed by comparing the HDF5 + * content with the content of the input text file. Data block output + * from the first command should be equal to the first 5 lines of the + * input file while data block output from the second command should + * be equal to the last 5 lines of the input file. + * + * h5dump --dataset=blast --start "0" --count "5" influenza.h5 + * + * h5dump --dataset=blast --start "5749892" --count "5" influenza.h5 + */ void load_blast_scores (hid_t file_id, const char *file_name) { @@ -21,14 +34,15 @@ load_blast_scores (hid_t file_id, const char *file_name) int *fill_data = NULL; int compress = 0; - blast_scores_data p_data[1000]; + blast_scores_data p_data[10000]; FILE *dat = fopen (file_name, "r"); if (dat == NULL) check_error (__FILE__, __LINE__); char *line = NULL; size_t len = 0; - int current_line = 0; int i = -1; + int written = 0; + int current_line = 0; while (getline (&line, &len, dat) != -1) { @@ -65,8 +79,10 @@ load_blast_scores (hid_t file_id, const char *file_name) token = strsep (&running, ","); p_data[i].evalue = strtod (token, NULL); - if (current_line == 1) + if (written == 0) { + + printf ("Creating blast table.\n"); const char *blast_scores_data_field_names[BLAST_SCORES_DATA_FIELD_NUM] = BLAST_SCORES_DATA_FIELD_NAMES; @@ -84,13 +100,17 @@ load_blast_scores (hid_t file_id, const char *file_name) if (status < 0) check_h5_error (__FILE__, __LINE__); + written = 1; + i = -1; + current_line = 0; + } - if ((i % 1000 == 0) && (i > 0)) + if ((current_line % 10000 == 0) && (current_line > 0)) { - + herr_t status = - H5TBappend_records (file_id, "blast", 1000, + H5TBappend_records (file_id, "blast", 10000, dst_size, dst_offset, dst_sizes, &p_data[0]); if(status < 0) @@ -100,9 +120,11 @@ load_blast_scores (hid_t file_id, const char *file_name) if (status < 0) check_h5_error (__FILE__, __LINE__); - printf ("Processed %i of records.\n", current_line); + printf ("Processed %i records.\n", current_line); + written += 10000; i = -1; + } if (running) @@ -110,6 +132,21 @@ load_blast_scores (hid_t file_id, const char *file_name) } // End for each line of the input file. + if (i >= 0) + { + herr_t status = + H5TBappend_records (file_id, "blast", i+1, + dst_size, dst_offset, dst_sizes, &p_data[0]); + if (status < 0) + check_h5_error (__FILE__, __LINE__); + + printf ("Wrote %i records.\n", written + i + 1); + } + + herr_t status = H5Fflush (file_id, H5F_SCOPE_GLOBAL); + if (status < 0) + check_h5_error (__FILE__, __LINE__); + if (line) free (line); -- cgit v0.8.3.1-22-g547a