summaryrefslogtreecommitdiffstats
authorDon Pellegrino <don@drexel.edu>2010-03-28 08:13:39 (GMT)
committer Don Pellegrino <don@drexel.edu>2010-03-28 08:13:39 (GMT)
commit75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e (patch) (side-by-side diff)
tree35c5bdd4f7c44946192059c5d3c78980ba9c4b07
parent1d29fba5de1dd0731564829dbf5aec572d161bd5 (diff)
downloadexp007-75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e.zip
exp007-75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e.tar.gz
exp007-75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e.tar.bz2
Testing options for loading meta-data/features/Entrez into HDF5.
-rw-r--r--src/load/load_asn.c173
-rw-r--r--src/load/load_asn.h24
-rw-r--r--src/load/load_features.c167
-rw-r--r--src/load/load_features.h12
4 files changed, 376 insertions, 0 deletions
diff --git a/src/load/load_asn.c b/src/load/load_asn.c
new file mode 100644
index 0000000..fc27d84
--- a/dev/null
+++ b/src/load/load_asn.c
@@ -0,0 +1,173 @@
+#define _GNU_SOURCE
+#include "load_asn.h"
+#include <string.h>
+#include <asn.h>
+#include <objgbseq.h>
+#include <objsset.h>
+#include <sqnutils.h>
+
+/*
+ * Substitute a printable placeholder for a NULL string so that it is
+ * never handed to a "%s" conversion.
+ */
+static const char *
+asn_str (const char *s)
+{
+  return (s != NULL) ? s : "(null)";
+}
+
+/*
+ * Print the identifiers, descriptors and annotations of one record to
+ * stdout.
+ *
+ *   oid   - object identifier to print, or NULL for none.
+ *   id    - head of a chain of sequence identifiers, or NULL.
+ *   descr - head of a chain of descriptors, or NULL.
+ *   annot - head of a chain of annotations, or NULL.
+ */
+void
+print_asn (ObjectIdPtr oid, SeqIdPtr id, ValNodePtr descr, SeqAnnotPtr annot)
+{
+  /*
+   * Print the record identifiers.
+   */
+  printf (" IDENTIFIERS\n");
+  printf (" -----------\n");
+
+  /*
+   * The object id pointer is never advanced, so looping on it would
+   * never terminate; print it once.
+   */
+  if (oid != NULL)
+    printf ("%i, %s\n", (int) oid->id, asn_str (oid->str));
+
+  for (; id != NULL; id = id->next)
+    {
+      char idval[256];
+
+      SeqIdPrint (id, idval, PRINTID_FASTA_SHORT);
+      printf (" %s\n", idval);
+    }
+
+  /*
+   * Print descriptions.
+   * [http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/BIOSEQ.HTML#_Seq-descr:_Describing_the]
+   */
+  printf ("\n DESCRIPTIONS\n");
+  printf (" ------------\n");
+  for (; descr != NULL; descr = descr->next)
+    {
+      switch (descr->choice)
+        {
+        case Seq_descr_title:
+          printf (" TITLE: %s\n", asn_str ((char *) descr->data.ptrvalue));
+          break;
+        case Seq_descr_genbank:
+          printf (" GENBANK\n");
+          break;
+        case Seq_descr_pub:
+          printf (" PUB\n");
+          break;
+        case Seq_descr_create_date:
+          printf (" CREATE DATE\n");
+          break;
+        case Seq_descr_update_date:
+          printf (" UPDATE DATE\n");
+          break;
+        case Seq_descr_source:
+          printf (" BIOSOURCE\n");
+          break;
+        case Seq_descr_molinfo:
+          printf (" MOLINFO\n");
+          break;
+        default:
+          printf (" DESCRIPTION CHOICE=%i\n", descr->choice);
+          break;
+        }
+    }
+
+  /*
+   * Print annotations.
+   */
+  printf ("\n ANNOTATIONS\n");
+  printf (" -----------\n");
+  for (; annot != NULL; annot = annot->next)
+    {
+      printf (" ANNOTATION: %s, ", asn_str (annot->name));
+
+      if (annot->desc == NULL)
+        {
+          printf (" NONE\n");
+          continue;
+        }
+
+      switch (annot->desc->choice)
+        {
+        case Annot_descr_name:
+          printf (" NAME: %s\n",
+                  asn_str ((char *) annot->desc->data.ptrvalue));
+          break;
+        default:
+          printf (" CHOICE=%i\n", annot->desc->choice);
+          break;
+        }
+    }
+}
+
+/*
+ * Print one Seq-entry and, when it is a Bioseq-set, every entry nested
+ * inside it.  The entry's choice decides whether its payload is read
+ * as a Bioseq or as a Bioseq-set, so sets nested at any depth are
+ * handled.  Entries with no payload are skipped.
+ */
+static void
+print_seq_entry (SeqEntryPtr sep)
+{
+  if (sep == NULL || sep->data.ptrvalue == NULL)
+    return;
+
+  if (IS_Bioseq (sep))
+    {
+      BioseqPtr bsp = (BioseqPtr) sep->data.ptrvalue;
+
+      printf ("BIOSEQ\n");
+      printf ("\n");
+      print_asn (NULL, bsp->id, bsp->descr, bsp->annot);
+      printf ("\n");
+    }
+  else if (IS_Bioseq_set (sep))
+    {
+      BioseqSetPtr bsetp = (BioseqSetPtr) sep->data.ptrvalue;
+      SeqEntryPtr member;
+
+      printf ("BIOSEQSET\n");
+      printf ("\n");
+      print_asn (bsetp->id, NULL, bsetp->descr, bsetp->annot);
+      printf ("\n");
+
+      /*
+       * Process the members of the set; each may itself be a set.
+       */
+      for (member = bsetp->seq_set; member != NULL; member = member->next)
+        print_seq_entry (member);
+    }
+}
+
+/*
+ * Read a Seq-entry from the ASN.1 file file_name and print its
+ * contents to stdout.  Failures to open or read the file are reported
+ * on stderr and the function returns without printing any records.
+ *
+ *   file_id   - HDF5 file handle; not used by this function.
+ *   file_name - path of the ASN.1 file to read.
+ *
+ * Based on example at
+ * [http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/SEQUTIL.HTML].
+ */
+void
+load_asn (hid_t file_id, const char* file_name)
+{
+  char *asn_file;
+  AsnIoPtr aip;
+  SeqEntryPtr sep;
+  SeqEntryPtr cur;
+
+  (void) file_id;
+
+  /*
+   * AsnIoOpen is handed a private, writable copy of the name
+   * (presumably because it takes a non-const pointer).
+   */
+  asn_file = strdup (file_name);
+  if (asn_file == NULL)
+    {
+      fprintf (stderr, "load_asn: out of memory\n");
+      return;
+    }
+
+  aip = AsnIoOpen (asn_file, "r");
+  if (aip == NULL)
+    {
+      fprintf (stderr, "load_asn: cannot open %s\n", file_name);
+      free (asn_file);
+      return;
+    }
+
+  sep = SeqEntryAsnRead (aip, NULL);
+  if (sep == NULL)
+    {
+      fprintf (stderr, "load_asn: cannot read a Seq-entry from %s\n",
+               file_name);
+    }
+  else
+    {
+      /*
+       * Data file statistics.
+       */
+      printf ("NODES: %i\tBIOSEQS: %i\n",
+              (int) ValNodeLen (sep), (int) BioseqCount (sep));
+      printf ("\n");
+
+      for (cur = sep; cur != NULL; cur = cur->next)
+        print_seq_entry (cur);
+
+      /* Release the object tree built by SeqEntryAsnRead. */
+      SeqEntryFree (sep);
+    }
+
+  AsnIoClose (aip);
+  free (asn_file);
+}
diff --git a/src/load/load_asn.h b/src/load/load_asn.h
new file mode 100644
index 0000000..a7d54db
--- a/dev/null
+++ b/src/load/load_asn.h
@@ -0,0 +1,24 @@
+#ifndef LOAD_ASN_H
+#define LOAD_ASN_H
+
+#include <hdf5.h>
+
+/*
+ * Load the features and other meta-data pulled from Entrez via eFetch
+ * as ASN.1.
+ *
+ *   file_id   - handle of the HDF5 file (hid_t from <hdf5.h>).
+ *   file_name - path of the ASN.1 file to read.
+ *
+ * Test: gi|453644
+ *
+ * Retrieving the ASN.1 file via eFetch for gi|453644 worked smoothly;
+ * however, the hierarchy of the ASN.1 is difficult to align with other
+ * data by GI.  This is because the Bioseq-set that is returned lacks
+ * identifiers, and gi|453644 appears as a Bioseq member of the set,
+ * positioned on the same hierarchical level as gi|453643.  The
+ * containing set includes the PUB records.  By comparison, the XML
+ * files returned via the same process list gi|453644 at the top of the
+ * hierarchy and above the PUB records.  That output appears to be more
+ * consistent with the perspective requested in the input.
+ */
+void load_asn (hid_t file_id, const char* file_name);
+
+#endif // LOAD_ASN_H
diff --git a/src/load/load_features.c b/src/load/load_features.c
new file mode 100644
index 0000000..b18031a
--- a/dev/null
+++ b/src/load/load_features.c
@@ -0,0 +1,167 @@
+#include "load_features.h"
+#include <libxml/parser.h>
+#include <stdbool.h>
+#include <asn.h>
+#include <objgbseq.h>
+
+/*
+ * An NCBI GBSeq structure to hold the data for the current record.
+ * It is allocated with GBSeqNew() and released with GBSeqFree() inside
+ * load_features().
+ */
+GBSeqPtr g;
+
+/*
+ * Set to true by lf_startElement() and to false by lf_endElement();
+ * lf_characters() ignores character data while it is false.
+ */
+bool in_element;
+
+/*
+ * SAX callback: trace the start of the document on stdout.
+ */
+static void
+lf_startDocument (void *ctx ATTRIBUTE_UNUSED)
+{
+  fputs ("SAX.startDocument()\n", stdout);
+}
+
+/*
+ * SAX callback: trace the end of the document on stdout.
+ */
+static void
+lf_endDocument (void *ctx ATTRIBUTE_UNUSED)
+{
+  fputs ("SAX.endDocument()\n", stdout);
+}
+
+/*
+ * SAX callback: trace the requested entity name on stdout.  No entity
+ * is ever supplied; the callback always returns NULL.
+ */
+static xmlEntityPtr
+lf_getEntity (void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
+{
+  xmlEntityPtr no_entity = NULL;
+
+  printf ("SAX.getEntity(%s)\n", name);
+
+  return no_entity;
+}
+
+/*
+ * SAX callback: trace the element name and its attributes on stdout
+ * and record that the parser is now inside an element.
+ *
+ *   name - name of the element being opened.
+ *   atts - NULL, or a NULL-terminated array of alternating attribute
+ *          names and values.
+ */
+static void
+lf_startElement(void *ctx ATTRIBUTE_UNUSED,
+                const xmlChar *name, const xmlChar **atts)
+{
+  int i;
+
+  fprintf (stdout, "SAX.startElement(%s", (const char *) name);
+  if (atts != NULL)
+    {
+      /*
+       * Advance one name/value pair at a time.  If a name is followed
+       * directly by the terminator, stop there instead of stepping
+       * over the terminator and reading past the end of the array.
+       */
+      for (i = 0; atts[i] != NULL; i += 2)
+        {
+          fprintf (stdout, ", %s='", (const char *) atts[i]);
+          if (atts[i + 1] == NULL)
+            break;
+          fprintf (stdout, "%s'", (const char *) atts[i + 1]);
+        }
+    }
+  fprintf (stdout, ")\n");
+
+  in_element = true;
+}
+
+/*
+ * SAX callback: record that the parser has left an element and trace
+ * the element name on stdout.
+ */
+static void
+lf_endElement(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
+{
+  const char *element = (const char *) name;
+
+  in_element = false;
+  printf ("SAX.endElement(%s)\n", element);
+}
+
+/*
+ * SAX callback: while inside an element, trace at most the first 30
+ * bytes of the character data, followed by the full length reported by
+ * the parser.  Character data seen outside an element is ignored.
+ */
+static void
+lf_characters(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
+{
+  if (in_element)
+    {
+      char preview[40];
+      int shown = 0;
+
+      /* Copy a bounded prefix of the data and terminate it. */
+      while (shown < len && shown < 30)
+        {
+          preview[shown] = ch[shown];
+          shown++;
+        }
+      preview[shown] = 0;
+
+      printf ("SAX.characters(%s, %d)\n", preview, len);
+    }
+}
+
+/*
+ * 1. Parse and load the XML file into memory.
+ * 2. Insert the XML into HDF5.
+ * For an example of parsing XML with libxml2 and SAX see:
+ * [http://git.gnome.org/browse/libxml2/tree/testSAX.c].
+ *
+ * The file is fed to a libxml2 push parser in small chunks and the
+ * lf_* SAX callbacks trace the events on stdout.  A file that cannot
+ * be opened, or a parser context that cannot be created, is reported
+ * through xmlGenericError.
+ *
+ *   file_id   - HDF5 file handle; not used by this function.
+ *   file_name - path of the XML file to parse.
+ */
+void
+load_features (hid_t file_id, const char* file_name)
+{
+  /*
+   * Only the callbacks that are traced are installed; every other
+   * member of the handler is left zero/NULL.
+   */
+  static xmlSAXHandler lf_sax_handler = {
+    .getEntity = lf_getEntity,
+    .startDocument = lf_startDocument,
+    .endDocument = lf_endDocument,
+    .startElement = lf_startElement,
+    .endElement = lf_endElement,
+    .characters = lf_characters,
+    .initialized = 1
+  };
+
+  FILE *f;
+
+  (void) file_id;
+
+  g = GBSeqNew ();
+
+  LIBXML_TEST_VERSION;
+
+  in_element = false;
+
+  f = fopen (file_name, "r");
+
+  if (f != NULL)
+    {
+      char chars[10];
+      size_t got;
+
+      /* The first four bytes seed the push parser context. */
+      got = fread (chars, 1, 4, f);
+      if (got > 0)
+        {
+          xmlParserCtxtPtr ctxt =
+            xmlCreatePushParserCtxt (&lf_sax_handler, NULL,
+                                     chars, (int) got, file_name);
+
+          if (ctxt == NULL)
+            {
+              xmlGenericError (xmlGenericErrorContext,
+                               "Cannot create parser context for %s.\n",
+                               file_name);
+            }
+          else
+            {
+              while ((got = fread (chars, 1, 3, f)) > 0)
+                {
+                  xmlParseChunk (ctxt, chars, (int) got, 0);
+                }
+              /* A zero-length chunk with terminate=1 ends the parse. */
+              xmlParseChunk (ctxt, chars, 0, 1);
+              xmlFreeParserCtxt (ctxt);
+            }
+        }
+      fclose (f);
+    }
+  else
+    {
+      xmlGenericError (xmlGenericErrorContext,
+                       "Cannot read file %s.\n", file_name);
+    }
+
+  GBSeqFree (g);
+  g = NULL;   /* do not leave the global pointing at freed memory */
+}
diff --git a/src/load/load_features.h b/src/load/load_features.h
new file mode 100644
index 0000000..932883f
--- a/dev/null
+++ b/src/load/load_features.h
@@ -0,0 +1,12 @@
+#ifndef LOAD_FEATURES_H
+#define LOAD_FEATURES_H
+
+#include <hdf5.h>
+
+/*
+ * Load the features and other meta-data pulled from Entrez via eFetch
+ * as XML.
+ *
+ *   file_id   - handle of the HDF5 file (hid_t from <hdf5.h>).
+ *   file_name - path of the XML file to read.
+ */
+void load_features (hid_t file_id, const char* file_name);
+
+#endif // LOAD_FEATURES_H

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.