Implemented updator to calculate and assign protein type values to

records which do not have them. This is a re-entrant process that only updates missing records and skips quickly through existing records. In addition premature termination loses at most 999 records of work since every thousand records are written back to the file and flushed to disk. Still to do is to write and flush the last set of records in the final bin under 1000. Also refactored the sequence_data structure so that it can be shared between HDF5 reading and writing operations.
author: Don Pellegrino <don@drexel.edu> 2010-01-18 17:26:18 (GMT)
committer: Don Pellegrino <don@drexel.edu> 2010-01-18 17:26:18 (GMT)
commit: a7cc532248146968a9296be7db42830e35525afa (patch) (unidiff)
tree: 93c44ad4f1ac27a1daed4ad5c7fdcb76971c0221
parent: 0d0e0886d17612fb7ebdb9110679d5b7bd5087be (diff)
download: exp007-a7cc532248146968a9296be7db42830e35525afa.zip
exp007-a7cc532248146968a9296be7db42830e35525afa.tar.gz
exp007-a7cc532248146968a9296be7db42830e35525afa.tar.bz2
7 files changed, 188 insertions, 96 deletions
diff --git a/.gitignore b/.gitignore
index b07ba15..4e963fd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,8 @@ configure
 depcomp
 install-sh
 missing
-stamp-h1
+src/.deps
 src/aggregator
 src/influenza.h5
-src/.deps
+src/updator
+stamp-h1
diff --git a/src/Makefile.am b/src/Makefile.am
index a7e6852..6dd8e16 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -5,14 +5,16 @@ aggregator_SOURCES = \
        check_error.c \
        check_h5_error.c \
        load_influenza_aa_dat.c \
-        load_influenza_faa.c
+        load_influenza_faa.c \
+        sequence_data_init.c
 updator_SOURCES = \
        updator.c \
        check_error.c \
        check_h5_error.c \
        check_ncbi_error.c \
-        assign_protein_type.c
+        assign_protein_type.c \
+        sequence_data_init.c
 noinst_HEADERS = \
        assign_protein_type.h \
@@ -20,6 +22,8 @@ noinst_HEADERS = \
        check_h5_error.h \
        check_ncbi_error.h \
        load_influenza_aa_dat.h \
-        load_influenza_faa.h
+        load_influenza_faa.h \
+        sequence_data.h \
+        sequence_data_init.h
 AM_CFLAGS = -Wall -std=gnu99 -ggdb
diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c
index 1b58f54..166e787 100644
--- a/src/assign_protein_type.c
+++ b/src/assign_protein_type.c
@@ -1,12 +1,15 @@
 #include "assign_protein_type.h"
 #include "check_ncbi_error.h"
 #include "check_h5_error.h"
+#include "sequence_data.h"
+#include "sequence_data_init.h"
 #include <ncbi.h>
 #include <readdb.h>
 #include <blast.h>
 #include <salpacc.h>
 #include <stdbool.h>
 #include <hdf5_hl.h>
+#include <error.h>
 /*
 * BLAST database containing all of the influenza protein sequences.
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id)
   * Get default BLAST options.
   */
  BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
+  /*
+   * Set to single hit mode.
+   */
+  options->window_size = 0;
+  options->multiple_hits_only = false;
  ValNodePtr error_returns = NULL;
  /*
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id)
  if (status < 0)
    check_h5_error (status, __FILE__, __LINE__);
-  /*
+  sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);
-   * todo: Allocate memory of nrecords for dst_buf.
-   *
+  size_t dst_size;
-   * todo: Refactor code to share structres in read and write HDF5
+  size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
-   * calls.
+  size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
-   */
+  hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
+  sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);  
  status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
                           dst_sizes, dst_buf);
  if (status < 0)
    check_h5_error (status, __FILE__, __LINE__);
-  for (int i = 0; i < nrecords; i++)
-    {
-      
-    }
  /*
-   * Read the sequence from the database by GI.
+   * Assign protein types to records for which the field is empty.
   */
-  Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);
+  printf ("Records to process: %i\n", (int)nrecords);
-  BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
+  for (int i = 0; i < nrecords; i++)
-  SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
-                                            "blastp",
-                                            REFDB,
-                                            options,
-                                            NULL,
-                                            &error_returns,
-                                            NULL);
-  if (error_returns != NULL)
-    check_ncbi_error (error_returns, __FILE__, __LINE__);
-  if (seqalign != NULL)
    {
-      Char target_id_buf[BUFFER_LEN + 1];
-      SeqIdPtr target_id = SeqAlignId (seqalign, 1);
-      SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
-      printf ("%s\n", target_id_buf);
-    }
-  // Clean up memory for the next ieration.
-  seqalign = SeqAlignSetFree (seqalign);
-  bsp = BioseqFree (bsp);
+      if (dst_buf[i].protein_type[0] == '\0') {
+        /*
+         * Read the sequence from the database by GI.
+         */
+        Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
+        BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
+        
+        SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
+                                                  "blastp",
+                                                  REFDB,
+                                                  options,
+                                                  NULL,
+                                                  &error_returns,
+                                                  NULL);
+        /*
+         * BLAST reported an error.  Write it out and continue processing.
+         */
+        if (error_returns != NULL)
+          {
+            printf ("Warning: An error has been reported by the NCBI Toolkit API "
+                    "for sequence gi|%i: %s",
+                    dst_buf[i].gi, BlastErrorToString (error_returns));
+          }
+        /*
+         * A hit was found.  Record the first hit as the protein type.
+         * Skip the first 6 characters and eat the "lcl|x_".
+         */
+        else if (seqalign != NULL)
+          {
+            Char target_id_buf[BUFFER_LEN + 1];
+            SeqIdPtr target_id = SeqAlignId (seqalign, 1);
+            SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
+            strncpy (dst_buf[i].protein_type, &target_id_buf[6], 
+                     sizeof (dst_buf[i].protein_type));
+          }
+        /*
+         * BLAST did not find any hits.
+         */
+        else
+          {
+            printf ("Warning: Unable to identify protein type for sequence gi|%i\n",
+                    dst_buf[i].gi);
+          }
+        
+        /*
+         * Clean up memory for the next ieration.
+         */
+        seqalign = SeqAlignSetFree (seqalign);
+        bsp = BioseqFree (bsp);
+        /*
+         * Write the data out to the file.
+         */
+        if ( (i % 1000 == 0) && (i > 0) )
+          {
+            status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
+                                        dst_size, dst_offset, dst_sizes, 
+                                        &dst_buf[i-1000]);
+            if (status < 0)
+              check_h5_error (status, __FILE__, __LINE__);
+            status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
+            if (status < 0)
+              check_h5_error (status, __FILE__, __LINE__);
+            printf ("Processed %i of %i records.\n", i, (int)nrecords);
+          }
+      }
+      
+    }
+  
+  free (dst_buf);
+  
  options = BLASTOptionDelete (options);
+  
  return;
 }
diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c
index 749b7ad..fd35254 100644
--- a/src/load_influenza_faa.c
+++ b/src/load_influenza_faa.c
@@ -1,62 +1,22 @@
-#include "load_influenza_faa.h"
 #include "check_error.h"
 #include "check_h5_error.h"
-#include "hdf5_hl.h"
+#include "load_influenza_faa.h"
+#include "sequence_data.h"
+#include "sequence_data_init.h"
+#include <hdf5_hl.h>
 #include <string.h>
 #include <stdlib.h>
-#define SEQUENCE_DATA_FIELD_NUM 4
 void
 load_influenza_faa (hid_t file_id)
 {
-  typedef struct
+  size_t dst_size;
-  {
+  size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
-    int gi;
+  size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
-    char gb[9];
-    char description[196];
-    char protein_type[7];
-  } sequence_data;
-  
-  size_t dst_size = sizeof (sequence_data);
-  size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] =
-    { HOFFSET (sequence_data, gi),
-      HOFFSET (sequence_data, gb),
-      HOFFSET (sequence_data, description),
-      HOFFSET (sequence_data, protein_type)
-    };
-  sequence_data dst_buf[1];
-  size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = {
-    sizeof (dst_buf[0].gi),
-    sizeof (dst_buf[0].gb),
-    sizeof (dst_buf[0].description),
-    sizeof (dst_buf[0].protein_type)
-  };
  hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
-  field_type[0] = H5T_NATIVE_INT;
+  sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
+  
-  hid_t gb_type = H5Tcopy (H5T_C_S1);
-  H5Tset_size (gb_type, 9);
-  field_type[1] = gb_type;
-  hid_t description_type = H5Tcopy (H5T_C_S1);
-  H5Tset_size (description_type, 196);
-  field_type[2] = description_type;
-  hid_t protein_type_type = H5Tcopy (H5T_C_S1);
-  H5Tset_size (protein_type_type, 7);
-  field_type[3] = protein_type_type;
-  const char *field_names[SEQUENCE_DATA_FIELD_NUM] = 
-    { "GI",
-      "GB",
-      "Description",
-      "Protein Type" };
  hsize_t chunk_size = 10;
  int *fill_data = NULL;
  int compress = 0;
@@ -99,12 +59,15 @@ load_influenza_faa (hid_t file_id)
          strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));
+          const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =
+            SEQUENCE_DATA_FIELD_NAMES;
          if (current_line == 1)
            {
              herr_t status = H5TBmake_table ("influenza.faa", file_id,
                                              "influenza.faa",
                                              SEQUENCE_DATA_FIELD_NUM, 1,
-                                              dst_size, field_names,
+                                              dst_size, sequence_data_field_names,
                                              dst_offset, field_type,
                                              chunk_size, fill_data, compress,
                                              &p_data);
@@ -132,9 +95,5 @@ load_influenza_faa (hid_t file_id)
  fclose (dat);
-  H5Tclose (gb_type);
-  H5Tclose (description_type);
-  H5Tclose (protein_type_type);
  return;
 }
diff --git a/src/sequence_data.h b/src/sequence_data.h
new file mode 100644
index 0000000..fb4ff78
--- a/dev/null
+++ b/src/sequence_data.h
@@ -0,0 +1,16 @@
+#ifndef SEQUENCE_DATA_H
+#define SEQUENCE_DATA_H
+#define SEQUENCE_DATA_FIELD_NUM 4
+#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" }
+typedef struct
+{
+  int gi;
+  char gb[9];
+  char description[196];
+  char protein_type[7];
+} sequence_data;
+#endif // SEQUENCE_DATA_H
diff --git a/src/sequence_data_init.c b/src/sequence_data_init.c
new file mode 100644
index 0000000..09ba189
--- a/dev/null
+++ b/src/sequence_data_init.c
@@ -0,0 +1,37 @@
+#include "sequence_data_init.h"
+#include "sequence_data.h"
+void 
+sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+                    hid_t *field_type)
+{
+  *dst_size = sizeof (sequence_data);
+  
+  dst_offset[0] = HOFFSET (sequence_data, gi);
+  dst_offset[1] = HOFFSET (sequence_data, gb);
+  dst_offset[2] = HOFFSET (sequence_data, description);
+  dst_offset[3] = HOFFSET (sequence_data, protein_type);
+  
+  sequence_data dst_buf[1];
+  
+  dst_sizes[0] = sizeof (dst_buf[0].gi);
+  dst_sizes[1] = sizeof (dst_buf[0].gb);
+  dst_sizes[2] = sizeof (dst_buf[0].description);
+  dst_sizes[3] = sizeof (dst_buf[0].protein_type);
+  
+  field_type[0] = H5T_NATIVE_INT;
+  
+  hid_t gb_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (gb_type, 9);
+  field_type[1] = gb_type;
+  
+  hid_t description_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (description_type, 196);
+  field_type[2] = description_type;
+  
+  hid_t protein_type_type = H5Tcopy (H5T_C_S1);
+  H5Tset_size (protein_type_type, 7);
+  field_type[3] = protein_type_type;
+  return;
+}
diff --git a/src/sequence_data_init.h b/src/sequence_data_init.h
new file mode 100644
index 0000000..c87e7e6
--- a/dev/null
+++ b/src/sequence_data_init.h
@@ -0,0 +1,14 @@
+#ifndef SEQUENCE_DATA_INIT_H
+#define SEQUENCE_DATA_INIT_H
+#include <hdf5.h>
+/*
+ * Initialize the structures describing sequence_data.  These
+ * descriptive structures are used by the HDF5 API.
+ */
+void 
+sequence_data_init (size_t *dst_size, size_t *dst_offset, size_t *dst_sizes,
+                    hid_t *field_type);
+#endif // SEQUENCE_DATA_INIT_H
author	Don Pellegrino <don@drexel.edu>	2010-01-18 17:26:18 (GMT)
committer	Don Pellegrino <don@drexel.edu>	2010-01-18 17:26:18 (GMT)
commit	a7cc532248146968a9296be7db42830e35525afa (patch) (unidiff)
tree	93c44ad4f1ac27a1daed4ad5c7fdcb76971c0221
parent	0d0e0886d17612fb7ebdb9110679d5b7bd5087be (diff)
download	exp007-a7cc532248146968a9296be7db42830e35525afa.zip exp007-a7cc532248146968a9296be7db42830e35525afa.tar.gz exp007-a7cc532248146968a9296be7db42830e35525afa.tar.bz2

diff --git a/.gitignore b/.gitignore index b07ba15..4e963fd 100644 --- a/.gitignore +++ b/.gitignore
@@ -14,7 +14,8 @@ configure
14	depcomp	14	depcomp
15	install-sh	15	install-sh
16	missing	16	missing
17	stamp-h1	17	src/.deps
18	src/aggregator	18	src/aggregator
19	src/influenza.h5	19	src/influenza.h5
20	src/.deps	20	src/updator
		21	stamp-h1


diff --git a/src/Makefile.am b/src/Makefile.am index a7e6852..6dd8e16 100644 --- a/src/Makefile.am +++ b/src/Makefile.am
@@ -5,14 +5,16 @@ aggregator_SOURCES = \
5	check_error.c \	5	check_error.c \
6	check_h5_error.c \	6	check_h5_error.c \
7	load_influenza_aa_dat.c \	7	load_influenza_aa_dat.c \
8	load_influenza_faa.c	8	load_influenza_faa.c \
		9	sequence_data_init.c
9		10
10	updator_SOURCES = \	11	updator_SOURCES = \
11	updator.c \	12	updator.c \
12	check_error.c \	13	check_error.c \
13	check_h5_error.c \	14	check_h5_error.c \
14	check_ncbi_error.c \	15	check_ncbi_error.c \
15	assign_protein_type.c	16	assign_protein_type.c \
		17	sequence_data_init.c
16		18
17	noinst_HEADERS = \	19	noinst_HEADERS = \
18	assign_protein_type.h \	20	assign_protein_type.h \
@@ -20,6 +22,8 @@ noinst_HEADERS = \
20	check_h5_error.h \	22	check_h5_error.h \
21	check_ncbi_error.h \	23	check_ncbi_error.h \
22	load_influenza_aa_dat.h \	24	load_influenza_aa_dat.h \
23	load_influenza_faa.h	25	load_influenza_faa.h \
		26	sequence_data.h \
		27	sequence_data_init.h
24		28
25	AM_CFLAGS = -Wall -std=gnu99 -ggdb	29	AM_CFLAGS = -Wall -std=gnu99 -ggdb


diff --git a/src/assign_protein_type.c b/src/assign_protein_type.c index 1b58f54..166e787 100644 --- a/src/assign_protein_type.c +++ b/src/assign_protein_type.c
@@ -1,12 +1,15 @@
1	#include "assign_protein_type.h"	1	#include "assign_protein_type.h"
2	#include "check_ncbi_error.h"	2	#include "check_ncbi_error.h"
3	#include "check_h5_error.h"	3	#include "check_h5_error.h"
		4	#include "sequence_data.h"
		5	#include "sequence_data_init.h"
4	#include <ncbi.h>	6	#include <ncbi.h>
5	#include <readdb.h>	7	#include <readdb.h>
6	#include <blast.h>	8	#include <blast.h>
7	#include <salpacc.h>	9	#include <salpacc.h>
8	#include <stdbool.h>	10	#include <stdbool.h>
9	#include <hdf5_hl.h>	11	#include <hdf5_hl.h>
		12	#include <error.h>
10		13
11	/*	14	/*
12	* BLAST database containing all of the influenza protein sequences.	15	* BLAST database containing all of the influenza protein sequences.
@@ -44,6 +47,13 @@ assign_protein_type (hid_t file_id)
44	* Get default BLAST options.	47	* Get default BLAST options.
45	*/	48	*/
46	BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);	49	BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
		50
		51	/*
		52	* Set to single hit mode.
		53	*/
		54	options->window_size = 0;
		55	options->multiple_hits_only = false;
		56
47	ValNodePtr error_returns = NULL;	57	ValNodePtr error_returns = NULL;
48		58
49	/*	59	/*
@@ -56,53 +66,104 @@ assign_protein_type (hid_t file_id)
56	if (status < 0)	66	if (status < 0)
57	check_h5_error (status, __FILE__, __LINE__);	67	check_h5_error (status, __FILE__, __LINE__);
58		68
59	/*	69	sequence_data* dst_buf = malloc (sizeof(sequence_data) * nrecords);
60	* todo: Allocate memory of nrecords for dst_buf.	70
61	*	71	size_t dst_size;
62	* todo: Refactor code to share structres in read and write HDF5	72	size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
63	* calls.	73	size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
64	*/	74	hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
		75	sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
65		76
66	status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,	77	status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
67	dst_sizes, dst_buf);	78	dst_sizes, dst_buf);
68	if (status < 0)	79	if (status < 0)
69	check_h5_error (status, __FILE__, __LINE__);	80	check_h5_error (status, __FILE__, __LINE__);
70		81
71	for (int i = 0; i < nrecords; i++)
72	{
73
74	}
75
76	/*	82	/*
77	* Read the sequence from the database by GI.	83	* Assign protein types to records for which the field is empty.
78	*/	84	*/
79	Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);	85	printf ("Records to process: %i\n", (int)nrecords);
80	BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);	86	for (int i = 0; i < nrecords; i++)
81
82	SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
83	"blastp",
84	REFDB,
85	options,
86	NULL,
87	&error_returns,
88	NULL);
89
90	if (error_returns != NULL)
91	check_ncbi_error (error_returns, __FILE__, __LINE__);
92
93	if (seqalign != NULL)
94	{	87	{
95	Char target_id_buf[BUFFER_LEN + 1];
96	SeqIdPtr target_id = SeqAlignId (seqalign, 1);
97	SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
98	printf ("%s\n", target_id_buf);
99	}
100
101	// Clean up memory for the next ieration.
102	seqalign = SeqAlignSetFree (seqalign);
103	bsp = BioseqFree (bsp);
104		88
		89	if (dst_buf[i].protein_type[0] == '\0') {
		90	/*
		91	* Read the sequence from the database by GI.
		92	*/
		93	Int4 sequence_number = readdb_gi2seq (seqdb, dst_buf[i].gi, NULL);
		94	BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);
		95
		96	SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
		97	"blastp",
		98	REFDB,
		99	options,
		100	NULL,
		101	&error_returns,
		102	NULL);
		103
		104	/*
		105	* BLAST reported an error. Write it out and continue processing.
		106	*/
		107	if (error_returns != NULL)
		108	{
		109	printf ("Warning: An error has been reported by the NCBI Toolkit API "
		110	"for sequence gi\|%i: %s",
		111	dst_buf[i].gi, BlastErrorToString (error_returns));
		112
		113	}
		114
		115	/*
		116	* A hit was found. Record the first hit as the protein type.
		117	* Skip the first 6 characters and eat the "lcl\|x_".
		118	*/
		119	else if (seqalign != NULL)
		120	{
		121	Char target_id_buf[BUFFER_LEN + 1];
		122	SeqIdPtr target_id = SeqAlignId (seqalign, 1);
		123	SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
		124	strncpy (dst_buf[i].protein_type, &target_id_buf[6],
		125	sizeof (dst_buf[i].protein_type));
		126	}
		127
		128	/*
		129	* BLAST did not find any hits.
		130	*/
		131	else
		132	{
		133	printf ("Warning: Unable to identify protein type for sequence gi\|%i\n",
		134	dst_buf[i].gi);
		135	}
		136
		137	/*
		138	* Clean up memory for the next ieration.
		139	*/
		140	seqalign = SeqAlignSetFree (seqalign);
		141	bsp = BioseqFree (bsp);
		142
		143	/*
		144	* Write the data out to the file.
		145	*/
		146	if ( (i % 1000 == 0) && (i > 0) )
		147	{
		148	status = H5TBwrite_records (file_id, "influenza.faa", i - 1000, 1000,
		149	dst_size, dst_offset, dst_sizes,
		150	&dst_buf[i-1000]);
		151	if (status < 0)
		152	check_h5_error (status, __FILE__, __LINE__);
		153
		154	status = H5Fflush (file_id, H5F_SCOPE_GLOBAL);
		155	if (status < 0)
		156	check_h5_error (status, __FILE__, __LINE__);
		157
		158	printf ("Processed %i of %i records.\n", i, (int)nrecords);
		159	}
		160	}
		161
		162	}
		163
		164	free (dst_buf);
		165
105	options = BLASTOptionDelete (options);	166	options = BLASTOptionDelete (options);
106		167
107	return;	168	return;
108	}	169	}


diff --git a/src/load_influenza_faa.c b/src/load_influenza_faa.c index 749b7ad..fd35254 100644 --- a/src/load_influenza_faa.c +++ b/src/load_influenza_faa.c
@@ -1,62 +1,22 @@
1	#include "load_influenza_faa.h"
2	#include "check_error.h"	1	#include "check_error.h"
3	#include "check_h5_error.h"	2	#include "check_h5_error.h"
4	#include "hdf5_hl.h"	3	#include "load_influenza_faa.h"
		4	#include "sequence_data.h"
		5	#include "sequence_data_init.h"
		6	#include <hdf5_hl.h>
5	#include <string.h>	7	#include <string.h>
6	#include <stdlib.h>	8	#include <stdlib.h>
7		9
8	#define SEQUENCE_DATA_FIELD_NUM 4
9
10	void	10	void
11	load_influenza_faa (hid_t file_id)	11	load_influenza_faa (hid_t file_id)
12	{	12	{
13	typedef struct	13	size_t dst_size;
14	{	14	size_t dst_offset[SEQUENCE_DATA_FIELD_NUM];
15	int gi;	15	size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM];
16	char gb[9];
17	char description[196];
18	char protein_type[7];
19	} sequence_data;
20
21	size_t dst_size = sizeof (sequence_data);
22	size_t dst_offset[SEQUENCE_DATA_FIELD_NUM] =
23	{ HOFFSET (sequence_data, gi),
24	HOFFSET (sequence_data, gb),
25	HOFFSET (sequence_data, description),
26	HOFFSET (sequence_data, protein_type)
27	};
28
29	sequence_data dst_buf[1];
30
31	size_t dst_sizes[SEQUENCE_DATA_FIELD_NUM] = {
32	sizeof (dst_buf[0].gi),
33	sizeof (dst_buf[0].gb),
34	sizeof (dst_buf[0].description),
35	sizeof (dst_buf[0].protein_type)
36	};
37
38	hid_t field_type[SEQUENCE_DATA_FIELD_NUM];	16	hid_t field_type[SEQUENCE_DATA_FIELD_NUM];
39		17
40	field_type[0] = H5T_NATIVE_INT;	18	sequence_data_init (&dst_size, dst_offset, dst_sizes, field_type);
41		19
42	hid_t gb_type = H5Tcopy (H5T_C_S1);
43	H5Tset_size (gb_type, 9);
44	field_type[1] = gb_type;
45
46	hid_t description_type = H5Tcopy (H5T_C_S1);
47	H5Tset_size (description_type, 196);
48	field_type[2] = description_type;
49
50	hid_t protein_type_type = H5Tcopy (H5T_C_S1);
51	H5Tset_size (protein_type_type, 7);
52	field_type[3] = protein_type_type;
53
54	const char *field_names[SEQUENCE_DATA_FIELD_NUM] =
55	{ "GI",
56	"GB",
57	"Description",
58	"Protein Type" };
59
60	hsize_t chunk_size = 10;	20	hsize_t chunk_size = 10;
61	int *fill_data = NULL;	21	int *fill_data = NULL;
62	int compress = 0;	22	int compress = 0;
@@ -99,12 +59,15 @@ load_influenza_faa (hid_t file_id)
99		59
100	strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));	60	strncpy (p_data.protein_type, "", sizeof (p_data.protein_type));
101		61
		62	const char* sequence_data_field_names[SEQUENCE_DATA_FIELD_NUM] =
		63	SEQUENCE_DATA_FIELD_NAMES;
		64
102	if (current_line == 1)	65	if (current_line == 1)
103	{	66	{
104	herr_t status = H5TBmake_table ("influenza.faa", file_id,	67	herr_t status = H5TBmake_table ("influenza.faa", file_id,
105	"influenza.faa",	68	"influenza.faa",
106	SEQUENCE_DATA_FIELD_NUM, 1,	69	SEQUENCE_DATA_FIELD_NUM, 1,
107	dst_size, field_names,	70	dst_size, sequence_data_field_names,
108	dst_offset, field_type,	71	dst_offset, field_type,
109	chunk_size, fill_data, compress,	72	chunk_size, fill_data, compress,
110	&p_data);	73	&p_data);
@@ -132,9 +95,5 @@ load_influenza_faa (hid_t file_id)
132		95
133	fclose (dat);	96	fclose (dat);
134		97
135	H5Tclose (gb_type);
136	H5Tclose (description_type);
137	H5Tclose (protein_type_type);
138
139	return;	98	return;
140	}	99	}


diff --git a/src/sequence_data.h b/src/sequence_data.h new file mode 100644 index 0000000..fb4ff78 --- a/dev/null +++ b/src/sequence_data.h
@@ -0,0 +1,16 @@
		1	#ifndef SEQUENCE_DATA_H
		2	#define SEQUENCE_DATA_H
		3
		4	#define SEQUENCE_DATA_FIELD_NUM 4
		5
		6	#define SEQUENCE_DATA_FIELD_NAMES { "GI", "GB", "Description", "Protein Type" }
		7
		8	typedef struct
		9	{
		10	int gi;
		11	char gb[9];
		12	char description[196];
		13	char protein_type[7];
		14	} sequence_data;
		15
		16	#endif // SEQUENCE_DATA_H


diff --git a/src/sequence_data_init.c b/src/sequence_data_init.c new file mode 100644 index 0000000..09ba189 --- a/dev/null +++ b/src/sequence_data_init.c
@@ -0,0 +1,37 @@
		1	#include "sequence_data_init.h"
		2	#include "sequence_data.h"
		3
		4	void
		5	sequence_data_init (size_t dst_size, size_t dst_offset, size_t *dst_sizes,
		6	hid_t *field_type)
		7	{
		8	*dst_size = sizeof (sequence_data);
		9
		10	dst_offset[0] = HOFFSET (sequence_data, gi);
		11	dst_offset[1] = HOFFSET (sequence_data, gb);
		12	dst_offset[2] = HOFFSET (sequence_data, description);
		13	dst_offset[3] = HOFFSET (sequence_data, protein_type);
		14
		15	sequence_data dst_buf[1];
		16
		17	dst_sizes[0] = sizeof (dst_buf[0].gi);
		18	dst_sizes[1] = sizeof (dst_buf[0].gb);
		19	dst_sizes[2] = sizeof (dst_buf[0].description);
		20	dst_sizes[3] = sizeof (dst_buf[0].protein_type);
		21
		22	field_type[0] = H5T_NATIVE_INT;
		23
		24	hid_t gb_type = H5Tcopy (H5T_C_S1);
		25	H5Tset_size (gb_type, 9);
		26	field_type[1] = gb_type;
		27
		28	hid_t description_type = H5Tcopy (H5T_C_S1);
		29	H5Tset_size (description_type, 196);
		30	field_type[2] = description_type;
		31
		32	hid_t protein_type_type = H5Tcopy (H5T_C_S1);
		33	H5Tset_size (protein_type_type, 7);
		34	field_type[3] = protein_type_type;
		35
		36	return;
		37	}


diff --git a/src/sequence_data_init.h b/src/sequence_data_init.h new file mode 100644 index 0000000..c87e7e6 --- a/dev/null +++ b/src/sequence_data_init.h
@@ -0,0 +1,14 @@
		1	#ifndef SEQUENCE_DATA_INIT_H
		2	#define SEQUENCE_DATA_INIT_H
		3
		4	#include <hdf5.h>
		5
		6	/*
		7	* Initialize the structures describing sequence_data. These
		8	* descriptive structures are used by the HDF5 API.
		9	*/
		10	void
		11	sequence_data_init (size_t dst_size, size_t dst_offset, size_t *dst_sizes,
		12	hid_t *field_type);
		13
		14	#endif // SEQUENCE_DATA_INIT_H