-rw-r--r-- | analysis/year.R | 29 |
1 files changed, 20 insertions, 9 deletions
diff --git a/analysis/year.R b/analysis/year.R index 6d68925..37310d5 100644 --- a/analysis/year.R +++ b/analysis/year.R | |||
@@ -4,22 +4,33 @@ require(hdf5); | |||
4 | 4 | ||
5 | hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE); | 5 | hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE); |
6 | 6 | ||
7 | A <- influenza.aa.dat; | 7 | A <- merge (influenza.aa.dat, influenza.faa, |
8 | B <- influenza.faa; | 8 | by.x = "GenBank accession number", |
9 | 9 | by.y = "GB"); | |
10 | # Join the two tables by GB value. | 10 | |
11 | C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB"); | 11 | B <- merge (A, gi.type.data, |
12 | by.x = "GI", | ||
13 | by.y = "GI"); | ||
14 | |||
15 | # Compare the local copy with a query performed on the NCBI database. | ||
16 | # A quick check of the number of records returned and the first and | ||
17 | # last set of GB values in sorted order should not show any | ||
18 | # inconsistencies. | ||
19 | T <- B[B$Year == 1978 & B$Type == "A" & B$Protein == "HA", ]; | ||
20 | nrow (T); | ||
21 | U <- T$"GenBank accession number"; | ||
22 | sort (U); | ||
12 | 23 | ||
13 | # All records for 1918. Based on code from | 24 | # All records for 1918. Based on code from |
14 | # http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations | 25 | # http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations |
15 | D <- C[C$Year == 1918, ] | 26 | C <- B[B$Year == 1918, ] |
16 | 27 | ||
17 | summary (D); | 28 | summary (C); |
18 | 29 | ||
19 | # Countries represented in the 1918 dataset. | 30 | # Countries represented in the 1918 dataset. |
20 | D$Country; | 31 | C$Country; |
21 | 32 | ||
22 | D[D$"Protein Type" == "HA", ] | 33 | C[C$Protein == "HA", ] |
23 | 34 | ||
24 | # All records with a year value. | 35 | # All records with a year value. |
25 | E <- A[A$Year != 0, ]; | 36 | E <- A[A$Year != 0, ]; |