summary refs log tree commit diff stats
path: root/analysis/year.R (plain)
blob: 37310d5cc21b46f9e93eb12d05eeab3d4fe64f7e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
# Explore the qualities of the year feature.

# library() stops immediately if the package is missing; require() would
# only warn and return FALSE, deferring the failure to the hdf5load() call.
library(hdf5)

# Load the contents of the HDF5 file.  The data sets used below
# (influenza.aa.dat, influenza.faa, gi.type.data) are not defined anywhere
# in this script, so they are presumably created in the workspace by this
# call -- TODO confirm against the contents of influenza.h5.
hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE)

# Join the two influenza tables on the GenBank accession number.
A <- merge(influenza.aa.dat, influenza.faa,
           by.x = "GenBank accession number",
           by.y = "GB")

# Attach the type data to the joined records via the GI column.
B <- merge(A, gi.type.data,
           by.x = "GI",
           by.y = "GI")

# Compare the local copy with a query performed on the NCBI database.
# A quick check of the number of records returned and the first and
# last set of GB values in sorted order should not show any
# inconsistencies.
#
# The subset is not stored in a variable named `T`, because that would mask
# the built-in shorthand for TRUE for the rest of the session.
# which() keeps only the rows where the condition is TRUE; plain logical
# indexing would insert an all-NA row for every NA in the condition and
# inflate the record count below.
ha_1978 <- B[which(B$Year == 1978 & B$Type == "A" & B$Protein == "HA"), ]
nrow(ha_1978)
U <- ha_1978[["GenBank accession number"]]
sort(U)

# All records for 1918.  Based on code from
# http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
C <- B[which(B$Year == 1918), ]

summary(C)

# Countries represented in the 1918 dataset.
C$Country

# The HA records among the 1918 records.
C[which(C$Protein == "HA"), ]

# All records with a year value.  A Year of 0 is treated as "no year"
# here; rows whose Year is NA are dropped as well.
E <- A[which(A$Year != 0), ]

hist(E$Year)

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.