summaryrefslogtreecommitdiffstats
path: root/src/assign_protein_type.c (plain)
blob: 1b58f5474e7ec8e1e1a931d2643bb290d8d03943
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#include "assign_protein_type.h"
#include "check_ncbi_error.h"
#include "check_h5_error.h"
#include <ncbi.h>
#include <readdb.h>
#include <blast.h>
#include <salpacc.h>
#include <stdbool.h>
#include <hdf5_hl.h>

/*
 * BLAST database containing all of the influenza protein sequences.
 */
#define SEQDB "/home/don/exp004/influenzadb/influenzadb"

/*
 * BLAST reference database of prototypical protein types.
 */
#define REFDB "/home/don/exp004/influenzadb/proteinnames"

#define BUFFER_LEN 50

void
assign_protein_type (hid_t file_id)
{
  /*
   * Iterate through the records for which no protein type has been
   * assigned.  Create a BioSeq Pointer to the data and then use this
   * as input to the BLAST run against the reference database of
   * prototypical sequence records.
   */

  /*
   * Make BLAST messages reportable.
   */
  ErrSetMessageLevel (SEV_WARNING);

  /*
   * Open the BLAST sequence database.
   */
  ReadDBFILEPtr seqdb = readdb_new (SEQDB, true);

  /*
   * Get default BLAST options.
   */
  BLAST_OptionsBlkPtr options = BLASTOptionNew ("blastp", true);
  ValNodePtr error_returns = NULL;

  /*
   * Read the data from HDF5 file.
   */
  hsize_t nfields;
  hsize_t nrecords;
  herr_t status = H5TBget_table_info (file_id, "influenza.faa", &nfields,
				      &nrecords);
  if (status < 0)
    check_h5_error (status, __FILE__, __LINE__);

  /*
   * todo: Allocate memory of nrecords for dst_buf.
   *
   * todo: Refactor code to share structres in read and write HDF5
   * calls.
   */

  status = H5TBread_table (file_id, "influenza.faa", dst_size, dst_offset,
			   dst_sizes, dst_buf);
  if (status < 0)
    check_h5_error (status, __FILE__, __LINE__);

  for (int i = 0; i < nrecords; i++)
    {

    }

  /*
   * Read the sequence from the database by GI.
   */
  Int4 sequence_number = readdb_gi2seq (seqdb, 453644, NULL);
  BioseqPtr bsp = readdb_get_bioseq (seqdb, sequence_number);

  SeqAlignPtr seqalign = BioseqBlastEngine (bsp,
					    "blastp",
					    REFDB,
					    options,
					    NULL,
					    &error_returns,
					    NULL);

  if (error_returns != NULL)
    check_ncbi_error (error_returns, __FILE__, __LINE__);

  if (seqalign != NULL)
    {
      Char target_id_buf[BUFFER_LEN + 1];
      SeqIdPtr target_id = SeqAlignId (seqalign, 1);
      SeqIdWrite (target_id, target_id_buf, PRINTID_FASTA_SHORT, BUFFER_LEN);
      printf ("%s\n", target_id_buf);
    }

  // Clean up memory for the next ieration.
  seqalign = SeqAlignSetFree (seqalign);
  bsp = BioseqFree (bsp);

  options = BLASTOptionDelete (options);

  return;
}

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.