summaryrefslogtreecommitdiffstats
authorDon Pellegrino <don@drexel.edu>2010-03-28 08:13:39 (GMT)
committer Don Pellegrino <don@drexel.edu>2010-03-28 08:13:39 (GMT)
commit75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e (patch) (unidiff)
tree35c5bdd4f7c44946192059c5d3c78980ba9c4b07
parent1d29fba5de1dd0731564829dbf5aec572d161bd5 (diff)
downloadexp007-75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e.zip
exp007-75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e.tar.gz
exp007-75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e.tar.bz2
Testing options for loading meta-data/features/Entrez into HDF5.
-rw-r--r--src/load/load_asn.c173
-rw-r--r--src/load/load_asn.h24
-rw-r--r--src/load/load_features.c167
-rw-r--r--src/load/load_features.h12
4 files changed, 376 insertions, 0 deletions
diff --git a/src/load/load_asn.c b/src/load/load_asn.c
new file mode 100644
index 0000000..fc27d84
--- a/dev/null
+++ b/src/load/load_asn.c
@@ -0,0 +1,173 @@
1#define _GNU_SOURCE
2#include "load_asn.h"
3#include <string.h>
4#include <asn.h>
5#include <objgbseq.h>
6#include <objsset.h>
7#include <sqnutils.h>
8
9void
10print_asn (ObjectIdPtr oid, SeqIdPtr id, ValNodePtr descr, SeqAnnotPtr annot)
11{
12 /*
13 * Print the record identifiers.
14 */
15 printf (" IDENTIFIERS\n");
16 printf (" -----------\n");
17 while (oid != NULL)
18 {
19 printf("%i, %s\n", oid->id, oid->str);
20 }
21 while (id != NULL)
22 {
23 // printf ("ID: %i\n", id->choice);
24
25 char idval[256];
26 SeqIdPrint (id, idval, PRINTID_FASTA_SHORT);
27 printf (" %s\n", idval);
28
29 // if (id->choice == SEQID_GI)
30 //printf ("GI: %i\n", id->data.intvalue);
31
32 id = id->next;
33 }
34
35 /*
36 * Print descriptions.
37 * [http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/BIOSEQ.HTML#_Seq-descr:_Describing_the]
38 */
39 printf ("\n DESCRIPTIONS\n");
40 printf (" ------------\n");
41 while (descr != NULL)
42 {
43 switch (descr->choice)
44 {
45 case Seq_descr_title:
46 printf (" TITLE: %s\n", (char*)descr->data.ptrvalue);
47 break;
48 case Seq_descr_genbank:
49 printf (" GENBANK\n");
50 break;
51 case Seq_descr_pub:
52 printf (" PUB\n");
53 break;
54 case Seq_descr_create_date:
55 printf (" CREATE DATE\n");
56 break;
57 case Seq_descr_update_date:
58 printf (" UPDATE DATE\n");
59 break;
60 case Seq_descr_source:
61 printf (" BIOSOURCE\n");
62 break;
63 case Seq_descr_molinfo:
64 printf (" MOLINFO\n");
65 break;
66 default:
67 printf (" DESCRIPTION CHOICE=%i\n", descr->choice);
68 break;
69 }
70
71 descr = descr->next;
72 }
73
74 /*
75 * Print annotations.
76 */
77 printf ("\n ANNOTATIONS\n");
78 printf (" -----------\n");
79 while (annot != NULL)
80 {
81 printf (" ANNOTATION: %s, ", annot->name);
82 if (annot->desc != NULL) {
83 switch (annot->desc->choice)
84 {
85 case Annot_descr_name:
86 printf (" NAME: %s\n", (char*)annot->desc->data.ptrvalue);
87 break;
88 default:
89 printf (" CHOICE=%i\n", annot->desc->choice);
90 break;
91 }
92 }
93 else
94 printf (" NONE\n");
95
96 annot = annot->next;
97 }
98
99}
100
101/*
102 * Based on example at
103 * [http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/SEQUTIL.HTML].
104 */
105void
106load_asn (hid_t file_id, const char* file_name)
107{
108 char* asn_file = strdup(file_name);
109 AsnIoPtr aip = AsnIoOpen (asn_file, "r");
110 SeqEntryPtr sep = SeqEntryAsnRead (aip, NULL);
111 BioseqSetPtr bsetp = 0;
112 ValNodePtr descr = 0;
113 SeqAnnotPtr annot = 0;
114 SeqIdPtr id = 0;
115 ObjectIdPtr oid = 0;
116
117 /*
118 * Data file statistics.
119 */
120 printf ("NODES: %i\tBIOSEQS: %i\n", ValNodeLen (sep), BioseqCount (sep));
121 printf ("\n");
122
123 /*
124 * This loop needs to be corrected to handle nesting of sets.
125 */
126
127 while (sep != NULL)
128 {
129 bsetp = (BioseqSetPtr) sep->data.ptrvalue;
130 if (bsetp != NULL)
131 {
132 oid = bsetp->id;
133 id = NULL;
134 descr = bsetp->descr;
135 annot = bsetp->annot;
136 }
137
138 printf ("BIOSEQSET\n");
139 printf ("\n");
140 print_asn (oid, id, descr, annot);
141 printf ("\n");
142
143 /*
144 * Process Bioseqs in the set.
145 */
146 SeqEntryPtr sep2 = bsetp->seq_set;
147 while (sep2 != NULL)
148 {
149 BioseqPtr bsp = sep2->data.ptrvalue;
150 if (bsp != NULL)
151 {
152 oid = NULL;
153 id = bsp->id;
154 descr = bsp->descr;
155 annot = bsp->annot;
156
157 printf ("BIOSEQ\n");
158 printf ("\n");
159 print_asn (oid, id, descr, annot);
160 printf ("\n");
161 }
162
163 sep2 = sep2->next;
164 }
165
166 sep = sep->next;
167 }
168
169 AsnIoClose (aip);
170 free (asn_file);
171
172 return;
173}
diff --git a/src/load/load_asn.h b/src/load/load_asn.h
new file mode 100644
index 0000000..a7d54db
--- a/dev/null
+++ b/src/load/load_asn.h
@@ -0,0 +1,24 @@
#ifndef LOAD_ASN_H
#define LOAD_ASN_H

#include <hdf5.h>

/*
 * Load the features and other meta-data pulled from Entrez via eFetch
 * as ASN.1.
 *
 * The current implementation reads the ASN.1 file named by file_name
 * and prints the identifiers, descriptions and annotations it finds
 * to stdout.  The file_id argument is accepted but not used yet.
 *
 * Test: gi|453644
 *
 * Retrieving the ASN.1 file via eFetch for gi|453644 worked smoothly.
 * However, the hierarchy of the ASN.1 is difficult to align with
 * other data by GI: the Bioseq-set that is returned lacks
 * identifiers, and gi|453644 appears as a Bioseq member of that set,
 * on the same hierarchical level as gi|453643.  The containing set
 * includes the PUB records.  By comparison, the XML files returned
 * via the same process list gi|453644 at the top of the hierarchy,
 * above the PUB records.  That output appears to be more consistent
 * with the perspective requested in the input.
 */
void load_asn (hid_t file_id, const char* file_name);

#endif // LOAD_ASN_H
diff --git a/src/load/load_features.c b/src/load/load_features.c
new file mode 100644
index 0000000..b18031a
--- a/dev/null
+++ b/src/load/load_features.c
@@ -0,0 +1,167 @@
1#include "load_features.h"
2#include <libxml/parser.h>
3#include <stdbool.h>
4#include <asn.h>
5#include <objgbseq.h>
6
/*
 * An NCBI GBSeq structure to hold the data for the current record.
 * It is allocated with GBSeqNew() at the start of load_features() and
 * released with GBSeqFree() before that function returns; the SAX
 * callbacks in this file do not read or write it yet.
 */
GBSeqPtr g;

/*
 * Set to true by lf_startElement() and to false by lf_endElement().
 * lf_characters() ignores character data while it is false.
 */
bool in_element;
13
14static void
15lf_startDocument (void *ctx ATTRIBUTE_UNUSED)
16{
17 printf ("SAX.startDocument()\n");
18
19 return;
20}
21
22static void
23lf_endDocument (void *ctx ATTRIBUTE_UNUSED)
24{
25 printf ("SAX.endDocument()\n");
26
27 return;
28}
29
30static xmlEntityPtr
31lf_getEntity (void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
32{
33 printf("SAX.getEntity(%s)\n", name);
34
35 return (NULL);
36}
37
38static void
39lf_startElement(void *ctx ATTRIBUTE_UNUSED,
40 const xmlChar *name, const xmlChar **atts)
41{
42 int i;
43
44 fprintf(stdout, "SAX.startElement(%s", (char *) name);
45 if (atts != NULL) {
46 for (i = 0;(atts[i] != NULL);i++) {
47 fprintf(stdout, ", %s='", atts[i++]);
48 if (atts[i] != NULL)
49 fprintf(stdout, "%s'", atts[i]);
50 }
51 }
52 fprintf(stdout, ")\n");
53
54 in_element = true;
55
56 return;
57}
58
59static void
60lf_endElement(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
61{
62 printf("SAX.endElement(%s)\n", (char *) name);
63 in_element = false;
64
65 return;
66}
67
68static void
69lf_characters(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
70{
71 if (!in_element)
72 return;
73
74 char output[40];
75 int i;
76
77 for (i = 0;(i<len) && (i < 30);i++)
78 output[i] = ch[i];
79 output[i] = 0;
80
81 printf("SAX.characters(%s, %d)\n", output, len);
82}
83
84/*
85 * 1. Parse and load the XML file into memory.
86 * 2. Insert the XML into HDF5.
87 * For an example of parsing XML with libxml2 and SAX see:
88 * [http://git.gnome.org/browse/libxml2/tree/testSAX.c].
89 */
90void
91load_features (hid_t file_id, const char* file_name)
92{
93 g = GBSeqNew ();
94
95 LIBXML_TEST_VERSION;
96
97 in_element = false;
98
99 static xmlSAXHandler emptySAXHandlerStruct = {
100 NULL, /* internalSubset */
101 NULL, /* isStandalone */
102 NULL, /* hasInternalSubset */
103 NULL, /* hasExternalSubset */
104 NULL, /* resolveEntity */
105 lf_getEntity, /* getEntity */
106 NULL, /* entityDecl */
107 NULL, /* notationDecl */
108 NULL, /* attributeDecl */
109 NULL, /* elementDecl */
110 NULL, /* unparsedEntityDecl */
111 NULL, /* setDocumentLocator */
112 lf_startDocument, /* startDocument */
113 lf_endDocument, /* endDocument */
114 lf_startElement, /* startElement */
115 lf_endElement, /* endElement */
116 NULL, /* reference */
117 lf_characters, /* characters */
118 NULL, /* ignorableWhitespace */
119 NULL, /* processingInstruction */
120 NULL, /* comment */
121 NULL, /* xmlParserWarning */
122 NULL, /* xmlParserError */
123 NULL, /* xmlParserError */
124 NULL, /* getParameterEntity */
125 NULL, /* cdataBlock; */
126 NULL, /* externalSubset; */
127 1,
128 NULL,
129 NULL, /* startElementNs */
130 NULL, /* endElementNs */
131 NULL /* xmlStructuredErrorFunc */
132 };
133
134 static xmlSAXHandlerPtr emptySAXHandler = &emptySAXHandlerStruct;
135
136 FILE *f = fopen (file_name, "r");
137
138 if (f != NULL)
139 {
140 int ret;
141 char chars[10];
142 xmlParserCtxtPtr ctxt;
143
144 ret = fread (chars, 1, 4, f);
145 if (ret > 0)
146 {
147 ctxt = xmlCreatePushParserCtxt (emptySAXHandler, NULL,
148 chars, ret, file_name);
149 while ((ret = fread(chars, 1, 3, f)) > 0)
150 {
151 xmlParseChunk (ctxt, chars, ret, 0);
152 }
153 xmlParseChunk (ctxt, chars, 0, 1);
154 xmlFreeParserCtxt(ctxt);
155 }
156 fclose (f);
157 }
158 else
159 {
160 xmlGenericError (xmlGenericErrorContext,
161 "Cannot read file.");
162 }
163
164 GBSeqFree (g);
165
166 return;
167}
diff --git a/src/load/load_features.h b/src/load/load_features.h
new file mode 100644
index 0000000..932883f
--- a/dev/null
+++ b/src/load/load_features.h
@@ -0,0 +1,12 @@
#ifndef LOAD_FEATURES_H
#define LOAD_FEATURES_H

#include <hdf5.h>

/*
 * Load the features and other meta-data pulled from Entrez via eFetch
 * as XML.
 *
 * The current implementation parses the XML file named by file_name
 * with libxml2's SAX interface and prints the SAX events to stdout.
 * The file_id argument is accepted but not used yet.
 */
void load_features (hid_t file_id, const char* file_name);

#endif // LOAD_FEATURES_H

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.