From 75c5ad4cdc439bfe28cc2bfea78839e4fb22e73e Mon Sep 17 00:00:00 2001
From: Don Pellegrino
Date: Sun, 28 Mar 2010 08:13:39 +0000
Subject: Testing options for loading meta-data/features/Entrez into HDF5.

---
diff --git a/src/load/load_asn.c b/src/load/load_asn.c
new file mode 100644
index 0000000..fc27d84
--- /dev/null
+++ b/src/load/load_asn.c
@@ -0,0 +1,221 @@
+#define _GNU_SOURCE
+#include "load_asn.h"
+/*
+ * NOTE(review): the header names on the five #include lines below were
+ * lost when this patch was extracted; only the bare directives remain.
+ * Restore them from the repository before building.
+ */
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Print the identifiers, descriptors and annotations of one record to
+ * stdout.  Each argument may be NULL, in which case only the heading
+ * of the corresponding section is printed.
+ *
+ *   oid    object identifier of the record (a single value)
+ *   id     head of the record's Seq-id chain
+ *   descr  head of the record's descriptor chain
+ *   annot  head of the record's annotation chain
+ */
+void
+print_asn (ObjectIdPtr oid, SeqIdPtr id, ValNodePtr descr, SeqAnnotPtr annot)
+{
+  /*
+   * Print the record identifiers.
+   */
+  printf (" IDENTIFIERS\n");
+  printf (" -----------\n");
+
+  /*
+   * An ObjectId is a single value, not a list: there is nothing to
+   * advance to, so it is tested once.  Looping on it never terminates.
+   */
+  if (oid != NULL)
+    {
+      printf ("%i, %s\n", (int) oid->id,
+              oid->str != NULL ? oid->str : "(null)");
+    }
+  while (id != NULL)
+    {
+      /* SeqIdWrite is given the buffer length so it cannot overrun. */
+      char idval[256] = "";
+
+      SeqIdWrite (id, idval, PRINTID_FASTA_SHORT, sizeof idval - 1);
+      printf (" %s\n", idval);
+
+      id = id->next;
+    }
+
+  /*
+   * Print descriptions.
+   * [http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/BIOSEQ.HTML#_Seq-descr:_Describing_the]
+   */
+  printf ("\n DESCRIPTIONS\n");
+  printf (" ------------\n");
+  while (descr != NULL)
+    {
+      switch (descr->choice)
+        {
+        case Seq_descr_title:
+          printf (" TITLE: %s\n", (char*)descr->data.ptrvalue);
+          break;
+        case Seq_descr_genbank:
+          printf (" GENBANK\n");
+          break;
+        case Seq_descr_pub:
+          printf (" PUB\n");
+          break;
+        case Seq_descr_create_date:
+          printf (" CREATE DATE\n");
+          break;
+        case Seq_descr_update_date:
+          printf (" UPDATE DATE\n");
+          break;
+        case Seq_descr_source:
+          printf (" BIOSOURCE\n");
+          break;
+        case Seq_descr_molinfo:
+          printf (" MOLINFO\n");
+          break;
+        default:
+          printf (" DESCRIPTION CHOICE=%i\n", descr->choice);
+          break;
+        }
+
+      descr = descr->next;
+    }
+
+  /*
+   * Print annotations.
+   */
+  printf ("\n ANNOTATIONS\n");
+  printf (" -----------\n");
+  while (annot != NULL)
+    {
+      printf (" ANNOTATION: %s, ",
+              annot->name != NULL ? annot->name : "(null)");
+      if (annot->desc != NULL)
+        {
+          switch (annot->desc->choice)
+            {
+            case Annot_descr_name:
+              printf (" NAME: %s\n", (char*)annot->desc->data.ptrvalue);
+              break;
+            default:
+              printf (" CHOICE=%i\n", annot->desc->choice);
+              break;
+            }
+        }
+      else
+        printf (" NONE\n");
+
+      annot = annot->next;
+    }
+}
+
+/*
+ * Print one Seq-entry and, when it is a Bioseq-set, every entry nested
+ * inside it.  Entries without data are skipped.
+ */
+static void
+print_seq_entry (SeqEntryPtr sep)
+{
+  if (sep == NULL || sep->data.ptrvalue == NULL)
+    return;
+
+  if (IS_Bioseq_set (sep))
+    {
+      BioseqSetPtr bsetp = (BioseqSetPtr) sep->data.ptrvalue;
+      SeqEntryPtr member;
+
+      printf ("BIOSEQSET\n");
+      printf ("\n");
+      print_asn (bsetp->id, NULL, bsetp->descr, bsetp->annot);
+      printf ("\n");
+
+      /*
+       * Process the members of the set; a member may itself be a set.
+       */
+      for (member = bsetp->seq_set; member != NULL; member = member->next)
+        print_seq_entry (member);
+    }
+  else if (IS_Bioseq (sep))
+    {
+      BioseqPtr bsp = (BioseqPtr) sep->data.ptrvalue;
+
+      printf ("BIOSEQ\n");
+      printf ("\n");
+      print_asn (NULL, bsp->id, bsp->descr, bsp->annot);
+      printf ("\n");
+    }
+}
+
+/*
+ * Read the ASN.1 Seq-entry stored in file_name and print its
+ * Bioseq-sets and Bioseqs to stdout.  Failures to open or parse the
+ * file are reported on stderr.  file_id is not used yet.
+ *
+ * Based on example at
+ * [http://www.ncbi.nlm.nih.gov/IEB/ToolBox/SDKDOCS/SEQUTIL.HTML].
+ */
+void
+load_asn (hid_t file_id, const char* file_name)
+{
+  char* asn_file = NULL;
+  AsnIoPtr aip = NULL;
+  SeqEntryPtr sep = NULL;
+
+  (void) file_id;               /* nothing is written to HDF5 yet */
+
+  if (file_name == NULL)
+    {
+      fprintf (stderr, "load_asn: no file name given.\n");
+      return;
+    }
+
+  /* AsnIoOpen is handed a private, writable copy of the name. */
+  asn_file = strdup (file_name);
+  if (asn_file == NULL)
+    {
+      fprintf (stderr, "load_asn: out of memory.\n");
+      return;
+    }
+
+  aip = AsnIoOpen (asn_file, "r");
+  if (aip == NULL)
+    {
+      fprintf (stderr, "load_asn: cannot open %s.\n", file_name);
+      free (asn_file);
+      return;
+    }
+
+  sep = SeqEntryAsnRead (aip, NULL);
+  if (sep == NULL)
+    fprintf (stderr, "load_asn: no Seq-entry read from %s.\n", file_name);
+  else
+    {
+      /*
+       * Data file statistics.
+       */
+      printf ("NODES: %i\tBIOSEQS: %i\n",
+              (int) ValNodeLen (sep), (int) BioseqCount (sep));
+      printf ("\n");
+
+      while (sep != NULL)
+        {
+          /* Save the link first: the entry is released once printed. */
+          SeqEntryPtr next = sep->next;
+
+          print_seq_entry (sep);
+          SeqEntryFree (sep);
+          sep = next;
+        }
+    }
+
+  AsnIoClose (aip);
+  free (asn_file);
+}
diff --git a/src/load/load_asn.h b/src/load/load_asn.h
new file mode 100644
index 0000000..a7d54db
--- /dev/null
+++ b/src/load/load_asn.h
@@ -0,0 +1,25 @@
+#ifndef LOAD_ASN_H
+#define LOAD_ASN_H
+
+/* NOTE(review): header name lost in extraction; hid_t implies HDF5 -- confirm. */
+#include <hdf5.h>
+
+/*
+ * Load the features and other meta-data pulled from Entrez via eFetch
+ * as ASN.1.
+ *
+ * Test: gi|453644
+ *
+ * Retrieving the ASN.1 file via eFetch for gi|453644 worked smoothly;
+ * however, the hierarchy of the ASN.1 is difficult to align with other
+ * data by GI.  This is due to the Bioseqset returned lacking
+ * identifiers and the gi|453644 appearing as a Bioseq member of the
+ * set.  It is positioned on the same hierarchical level as gi|453643.
+ * The containing set includes the PUB records.  Comparatively, the XML
+ * files returned via the same process list the gi|453644 at the top
+ * of the hierarchy and above the PUB records.  This output appears to
+ * be more consistent with the perspective requested in the input.
+ */
+void load_asn (hid_t file_id, const char* file_name);
+
+#endif // LOAD_ASN_H
diff --git a/src/load/load_features.c b/src/load/load_features.c
new file mode 100644
index 0000000..b18031a
--- /dev/null
+++ b/src/load/load_features.c
@@ -0,0 +1,123 @@
+#include "load_features.h"
+/*
+ * NOTE(review): the header names on the four #include lines below were
+ * lost when this patch was extracted; only the bare directives remain.
+ */
+#include
+#include
+#include
+#include
+
+/*
+ * An NCBI GBSeq structure to hold the data for the current record.
+ */
+GBSeqPtr g;
+
+/*
+ * Set by lf_startElement and cleared by lf_endElement; lf_characters
+ * returns early while it is false.
+ */
+bool in_element;
+
+/* SAX callback: trace the start of the document. */
+static void
+lf_startDocument (void *ctx ATTRIBUTE_UNUSED)
+{
+  printf ("SAX.startDocument()\n");
+
+  return;
+}
+
+/* SAX callback: trace the end of the document. */
+static void
+lf_endDocument (void *ctx ATTRIBUTE_UNUSED)
+{
+  printf ("SAX.endDocument()\n");
+
+  return;
+}
+
+/* SAX callback: trace the entity lookup; no entity is ever supplied. */
+static xmlEntityPtr
+lf_getEntity (void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
+{
+  printf("SAX.getEntity(%s)\n", name);
+
+  return (NULL);
+}
+
+/*
+ * SAX callback: trace an opening tag with its attributes.  atts holds
+ * name/value pairs and ends with a NULL entry.
+ */
+static void
+lf_startElement(void *ctx ATTRIBUTE_UNUSED,
+                const xmlChar *name, const xmlChar **atts)
+{
+  int i;
+
+  fprintf(stdout, "SAX.startElement(%s", (char *) name);
+  if (atts != NULL) {
+    for (i = 0;(atts[i] != NULL);i++) {
+      /* The name is printed and i stepped on to the value. */
+      fprintf(stdout, ", %s='", atts[i++]);
+      if (atts[i] != NULL)
+        fprintf(stdout, "%s'", atts[i]);
+    }
+  }
+  fprintf(stdout, ")\n");
+
+  in_element = true;
+
+  return;
+}
+
+/* SAX callback: trace a closing tag. */
+static void
+lf_endElement(void *ctx ATTRIBUTE_UNUSED, const xmlChar *name)
+{
+  printf("SAX.endElement(%s)\n", (char *) name);
+  in_element = false;
+
+  return;
+}
+
+/*
+ * NOTE(review): everything between "(i" and " 0)" in the loop header
+ * below was lost when this patch was extracted -- the rest of
+ * lf_characters and whatever preceded the surviving tail that follows.
+ * The surviving text is kept exactly as found; restore the missing
+ * part from the repository.
+ */
+static void
+lf_characters(void *ctx ATTRIBUTE_UNUSED, const xmlChar *ch, int len)
+{
+  if (!in_element)
+    return;
+
+  char output[40];
+  int i;
+
+  for (i = 0;(i 0)
+    {
+      ctxt = xmlCreatePushParserCtxt (emptySAXHandler, NULL,
+                                      chars, ret, file_name);
+      while ((ret = fread(chars, 1, 3, f)) > 0)
+        {
+          xmlParseChunk (ctxt, chars, ret, 0);
+        }
+      xmlParseChunk (ctxt, chars, 0, 1);
+      xmlFreeParserCtxt(ctxt);
+    }
+    fclose (f);
+  }
+  else
+    {
+      xmlGenericError (xmlGenericErrorContext,
+                       "Cannot read file.");
+    }
+
+  GBSeqFree (g);
+
+  return;
+}
diff --git a/src/load/load_features.h b/src/load/load_features.h
new file mode 100644
index 0000000..932883f
--- /dev/null
+++ b/src/load/load_features.h
@@ -0,0 +1,13 @@
+#ifndef LOAD_FEATURES_H
+#define LOAD_FEATURES_H
+
+/* NOTE(review): header name lost in extraction; hid_t implies HDF5 -- confirm. */
+#include <hdf5.h>
+
+/*
+ * Load the features and other meta-data pulled from Entrez via eFetch
+ * as XML.
+ */
+void load_features (hid_t file_id, const char* file_name);
+
+#endif // LOAD_FEATURES_H
-- 
cgit v0.8.3.1-22-g547a