-rw-r--r-- | src/load/load_asn.c | 173 | ||||
-rw-r--r-- | src/load/load_asn.h | 24 | ||||
-rw-r--r-- | src/load/load_features.c | 167 | ||||
-rw-r--r-- | src/load/load_features.h | 12 |
4 files changed, 376 insertions, 0 deletions
diff --git a/src/load/load_asn.c b/src/load/load_asn.c new file mode 100644 index 0000000..fc27d84 --- a/dev/null +++ b/src/load/load_asn.c | |||
@@ -0,0 +1,173 @@ | |||
1 | #define _GNU_SOURCE | ||
2 | #include "load_asn.h" | ||
3 | #include <string.h> | ||
4 | #include <asn.h> | ||
5 | #include <objgbseq.h> | ||
6 | #include <objsset.h> | ||
7 | #include <sqnutils.h> | ||
8 | |||
9 | void | ||
10 | print_asn (ObjectIdPtr oid, SeqIdPtr id, ValNodePtr descr, SeqAnnotPtr annot) | ||
11 | { | ||
12 | /* | ||
13 | * Print the record identifiers. | ||
14 | */ | ||
15 | printf (" IDENTIFIERS\n"); | ||
16 | printf (" -----------\n"); | ||
17 | while (oid != NULL) | ||
18 | { | ||
19 | printf("%i, %s\n", oid->id, oid->str); | ||
20 | } | ||
21 | while (id != NULL) | ||
22 | { | ||
23 | // printf ("ID: %i\n", id->choice); | ||
24 | |||
25 | char idval[256]; | ||
26 | SeqIdPrint (id, idval, PRINTID_FASTA_SHORT); | ||
27 | printf (" %s\n", idval); | ||
28 | |||
29 | // if (id->choice == SEQID_GI) | ||
30 | //printf ("GI: %i\n", id->data.intvalue); | ||
31 | |||
32 | id = id->next; | ||
33 | } | ||
34 | |||
35 | /* | ||
36 | * Print descriptions. | ||
37 | * [http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/BIOSEQ.HTML#_Seq-descr:_Describing_the] | ||
38 | */ | ||
39 | printf ("\n DESCRIPTIONS\n"); | ||
40 | printf (" ------------\n"); | ||
41 | while (descr != NULL) | ||
42 | { | ||
43 | switch (descr->choice) | ||
44 | { | ||
45 | case Seq_descr_title: | ||
46 | printf (" TITLE: %s\n", (char*)descr->data.ptrvalue); | ||
47 | break; | ||
48 | case Seq_descr_genbank: | ||
49 | printf (" GENBANK\n"); | ||
50 | break; | ||
51 | case Seq_descr_pub: | ||
52 | printf (" PUB\n"); | ||
53 | break; | ||
54 | case Seq_descr_create_date: | ||
55 | printf (" CREATE DATE\n"); | ||
56 | break; | ||
57 | case Seq_descr_update_date: | ||
58 | printf (" UPDATE DATE\n"); | ||
59 | break; | ||
60 | case Seq_descr_source: | ||
61 | printf (" BIOSOURCE\n"); | ||
62 | break; | ||
63 | case Seq_descr_molinfo: | ||
64 | printf (" MOLINFO\n"); | ||
65 | break; | ||
66 | default: | ||
67 | printf (" DESCRIPTION CHOICE=%i\n", descr->choice); | ||
68 | break; | ||
69 | } | ||
70 | |||
71 | descr = descr->next; | ||
72 | } | ||
73 | |||
74 | /* | ||
75 | * Print annotations. | ||
76 | */ | ||
77 | printf ("\n ANNOTATIONS\n"); | ||
78 | printf (" -----------\n"); | ||
79 | while (annot != NULL) | ||
80 | { | ||
81 | printf (" ANNOTATION: %s, ", annot->name); | ||
82 | if (annot->desc != NULL) { | ||
83 | switch (annot->desc->choice) | ||
84 | { | ||
85 | case Annot_descr_name: | ||
86 | printf (" NAME: %s\n", (char*)annot->desc->data.ptrvalue); | ||
87 | break; | ||
88 | default: | ||
89 | printf (" CHOICE=%i\n", annot->desc->choice); | ||
90 | break; | ||
91 | } | ||
92 | } | ||
93 | else | ||
94 | printf (" NONE\n"); | ||
95 | |||
96 | annot = annot->next; | ||
97 | } | ||
98 | |||
99 | } | ||
100 | |||
101 | /* | ||
102 | * Based on example at | ||
103 | * [http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/SEQUTIL.HTML]. | ||
104 | */ | ||
105 | void | ||
106 | load_asn (hid_t file_id, const char* file_name) | ||
107 | { | ||
108 | char* asn_file = strdup(file_name); | ||
109 | AsnIoPtr aip = AsnIoOpen (asn_file, "r"); | ||
110 | SeqEntryPtr sep = SeqEntryAsnRead (aip, NULL); | ||
111 | BioseqSetPtr bsetp = 0; | ||
112 | ValNodePtr descr = 0; | ||
113 | SeqAnnotPtr annot = 0; | ||
114 | SeqIdPtr id = 0; | ||
115 | ObjectIdPtr oid = 0; | ||
116 | |||
117 | /* | ||
118 | * Data file statistics. | ||
119 | */ | ||
120 | printf ("NODES: %i\tBIOSEQS: %i\n", ValNodeLen (sep), BioseqCount (sep)); | ||
121 | printf ("\n"); | ||
122 | |||
123 | /* | ||
124 | * This loop needs to be corrected to handle nesting of sets. | ||
125 | */ | ||
126 | |||
127 | while (sep != NULL) | ||
128 | { | ||
129 | bsetp = (BioseqSetPtr) sep->data.ptrvalue; | ||
130 | if (bsetp != NULL) | ||
131 | { | ||
132 | oid = bsetp->id; | ||
133 | id = NULL; | ||
134 | descr = bsetp->descr; | ||
135 | annot = bsetp->annot; | ||
136 | } | ||
137 | |||
138 | printf ("BIOSEQSET\n"); | ||
139 | printf ("\n"); | ||
140 | print_asn (oid, id, descr, annot); | ||
141 | printf ("\n"); | ||
142 | |||
143 | /* | ||
144 | * Process Bioseqs in the set. | ||
145 | */ | ||
146 | SeqEntryPtr sep2 = bsetp->seq_set; | ||
147 | while (sep2 != NULL) | ||
148 | { | ||
149 | BioseqPtr bsp = sep2->data.ptrvalue; | ||
150 | if (bsp != NULL) | ||
151 | { | ||
152 | oid = NULL; | ||
153 | id = bsp->id; | ||
154 | descr = bsp->descr; | ||
155 | annot = bsp->annot; | ||
156 | |||
157 | printf ("BIOSEQ\n"); | ||
158 | printf ("\n"); | ||
159 | print_asn (oid, id, descr, annot); | ||
160 | printf ("\n"); | ||
161 | } | ||
162 | |||
163 | sep2 = sep2->next; | ||
164 | } | ||
165 | |||
166 | sep = sep->next; | ||
167 | } | ||
168 | |||
169 | AsnIoClose (aip); | ||
170 | free (asn_file); | ||
171 | |||
172 | return; | ||
173 | } | ||
diff --git a/src/load/load_asn.h b/src/load/load_asn.h new file mode 100644 index 0000000..a7d54db --- a/dev/null +++ b/src/load/load_asn.h | |||
@@ -0,0 +1,24 @@ | |||
1 | #ifndef LOAD_ASN_H | ||
2 | #define LOAD_ASN_H | ||
3 | |||
4 | #include <hdf5.h> | ||
5 | |||
6 | /* | ||
7 | * Load the features and other meta-data pulled from Entrez via eFetch | ||
8 | * as ASN.1. file_name is the path of the ASN.1 file to read; file_id | ||
9 | * is an HDF5 identifier that load_asn.c currently leaves unused. | ||
10 | * | ||
11 | * Test record: gi|453644 | ||
12 | * | ||
13 | * Retrieving the ASN.1 file via eFetch for gi|453644 worked smoothly; | ||
14 | * however, the hierarchy of the ASN.1 is difficult to align with other | ||
15 | * data by GI: the returned Bioseq-set lacks identifiers, and gi|453644 | ||
16 | * appears as a Bioseq member of the set, on the same level as | ||
17 | * gi|453643. The containing set holds the PUB records. By contrast, | ||
18 | * the XML returned by the same process lists gi|453644 at the top of | ||
19 | * the hierarchy, above the PUB records, which appears more consistent | ||
20 | * with the perspective requested in the input. | ||
21 | */ | ||
22 | void load_asn (hid_t file_id, const char* file_name); | ||
23 | |||
24 | #endif // LOAD_ASN_H | ||
diff --git a/src/load/load_features.c b/src/load/load_features.c new file mode 100644 index 0000000..b18031a --- a/dev/null +++ b/src/load/load_features.c | |||
@@ -0,0 +1,167 @@ | |||
1 | #include "load_features.h" | ||
2 | #include <libxml/parser.h> | ||
3 | #include <stdbool.h> | ||
4 | #include <asn.h> | ||
5 | #include <objgbseq.h> | ||
6 | |||
7 | /* | ||
8 | * g is an NCBI GBSeq structure to hold the data for the current record; in_element is set by lf_startElement, cleared by lf_endElement, and tested by lf_characters before it prints. | ||
9 | */ | ||
10 | GBSeqPtr g; | ||
11 | |||
12 | bool in_element; | ||
13 | |||
14 | static void | ||
15 | lf_startDocument (void *ctx ATTRIBUTE_UNUSED) | ||
16 | { | ||
17 | printf ("SAX.startDocument()\n"); | ||
18 | |||
19 | return; | ||
20 | } | ||
21 | |||
22 | static void | ||
23 | lf_endDocument (void *ctx ATTRIBUTE_UNUSED) | ||
24 | { | ||
25 | printf ("SAX.endDocument()\n"); | ||
26 | |||
27 | return; | ||
28 | } | ||
29 | |||
30 | static xmlEntityPtr | ||
31 | lf_getEntity (void *ctx ATTRIBUTE_UNUSED, const xmlChar *name) | ||
32 | { | ||
33 | printf("SAX.getEntity(%s)\n", name); | ||
34 | |||
35 | return (NULL); | ||
36 | } | ||
37 | |||
38 | static void | ||
39 | lf_startElement(void *ctx ATTRIBUTE_UNUSED, | ||
40 | const xmlChar *name, const xmlChar **atts) | ||
41 | { | ||
42 | int i; | ||
43 | |||
44 | fprintf(stdout, "SAX.startElement(%s", (char *) name); | ||
45 | if (atts != NULL) { | ||
46 | for (i = 0;(atts[i] != NULL);i++) { | ||
47 | fprintf(stdout, ", %s='", atts[i++]); | ||
48 | if (atts[i] != NULL) | ||
49 | fprintf(stdout, "%s'", atts[i]); | ||
50 | } | ||
51 | } | ||
52 | fprintf(stdout, ")\n"); | ||
53 | |||
54 | in_element = true; | ||
55 | |||
56 | return; | ||
57 | } | ||
58 | |||
59 | static void | ||
60 | lf_endElement(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name) | ||
61 | { | ||
62 | printf("SAX.endElement(%s)\n", (char *) name); | ||
63 | in_element = false; | ||
64 | |||
65 | return; | ||
66 | } | ||
67 | |||
68 | static void | ||
69 | lf_characters(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len) | ||
70 | { | ||
71 | if (!in_element) | ||
72 | return; | ||
73 | |||
74 | char output[40]; | ||
75 | int i; | ||
76 | |||
77 | for (i = 0;(i<len) && (i < 30);i++) | ||
78 | output[i] = ch[i]; | ||
79 | output[i] = 0; | ||
80 | |||
81 | printf("SAX.characters(%s, %d)\n", output, len); | ||
82 | } | ||
83 | |||
84 | /* | ||
85 | * 1. Parse and load the XML file into memory. | ||
86 | * 2. Insert the XML into HDF5. | ||
87 | * For an example of parsing XML with libxml2 and SAX see: | ||
88 | * [http://git.gnome.org/browse/libxml2/tree/testSAX.c]. | ||
89 | */ | ||
90 | void | ||
91 | load_features (hid_t file_id, const char* file_name) | ||
92 | { | ||
93 | g = GBSeqNew (); | ||
94 | |||
95 | LIBXML_TEST_VERSION; | ||
96 | |||
97 | in_element = false; | ||
98 | |||
99 | static xmlSAXHandler emptySAXHandlerStruct = { | ||
100 | NULL, /* internalSubset */ | ||
101 | NULL, /* isStandalone */ | ||
102 | NULL, /* hasInternalSubset */ | ||
103 | NULL, /* hasExternalSubset */ | ||
104 | NULL, /* resolveEntity */ | ||
105 | lf_getEntity, /* getEntity */ | ||
106 | NULL, /* entityDecl */ | ||
107 | NULL, /* notationDecl */ | ||
108 | NULL, /* attributeDecl */ | ||
109 | NULL, /* elementDecl */ | ||
110 | NULL, /* unparsedEntityDecl */ | ||
111 | NULL, /* setDocumentLocator */ | ||
112 | lf_startDocument, /* startDocument */ | ||
113 | lf_endDocument, /* endDocument */ | ||
114 | lf_startElement, /* startElement */ | ||
115 | lf_endElement, /* endElement */ | ||
116 | NULL, /* reference */ | ||
117 | lf_characters, /* characters */ | ||
118 | NULL, /* ignorableWhitespace */ | ||
119 | NULL, /* processingInstruction */ | ||
120 | NULL, /* comment */ | ||
121 | NULL, /* xmlParserWarning */ | ||
122 | NULL, /* xmlParserError */ | ||
123 | NULL, /* xmlParserError */ | ||
124 | NULL, /* getParameterEntity */ | ||
125 | NULL, /* cdataBlock; */ | ||
126 | NULL, /* externalSubset; */ | ||
127 | 1, | ||
128 | NULL, | ||
129 | NULL, /* startElementNs */ | ||
130 | NULL, /* endElementNs */ | ||
131 | NULL /* xmlStructuredErrorFunc */ | ||
132 | }; | ||
133 | |||
134 | static xmlSAXHandlerPtr emptySAXHandler = &emptySAXHandlerStruct; | ||
135 | |||
136 | FILE *f = fopen (file_name, "r"); | ||
137 | |||
138 | if (f != NULL) | ||
139 | { | ||
140 | int ret; | ||
141 | char chars[10]; | ||
142 | xmlParserCtxtPtr ctxt; | ||
143 | |||
144 | ret = fread (chars, 1, 4, f); | ||
145 | if (ret > 0) | ||
146 | { | ||
147 | ctxt = xmlCreatePushParserCtxt (emptySAXHandler, NULL, | ||
148 | chars, ret, file_name); | ||
149 | while ((ret = fread(chars, 1, 3, f)) > 0) | ||
150 | { | ||
151 | xmlParseChunk (ctxt, chars, ret, 0); | ||
152 | } | ||
153 | xmlParseChunk (ctxt, chars, 0, 1); | ||
154 | xmlFreeParserCtxt(ctxt); | ||
155 | } | ||
156 | fclose (f); | ||
157 | } | ||
158 | else | ||
159 | { | ||
160 | xmlGenericError (xmlGenericErrorContext, | ||
161 | "Cannot read file."); | ||
162 | } | ||
163 | |||
164 | GBSeqFree (g); | ||
165 | |||
166 | return; | ||
167 | } | ||
diff --git a/src/load/load_features.h b/src/load/load_features.h new file mode 100644 index 0000000..932883f --- a/dev/null +++ b/src/load/load_features.h | |||
@@ -0,0 +1,12 @@ | |||
1 | #ifndef LOAD_FEATURES_H | ||
2 | #define LOAD_FEATURES_H | ||
3 | |||
4 | #include <hdf5.h> | ||
5 | |||
6 | /* | ||
7 | * Load the features and other meta-data pulled from Entrez via eFetch | ||
8 | * as XML from file_name; load_features.c does not yet use file_id. | ||
9 | */ | ||
10 | void load_features (hid_t file_id, const char* file_name); | ||
11 | |||
12 | #endif // LOAD_FEATURES_H | ||