summary | refs | log | tree | commit | diff | stats
author Don Pellegrino <don@drexel.edu> 2010-01-19 20:43:55 (GMT)
committer Don Pellegrino <don@drexel.edu> 2010-01-19 20:43:55 (GMT)
commit aae357374282d545f0a036c1c80b007247ff6067 (patch) (side-by-side diff)
tree 458204c6968acf89a55fa080be0f535cef3fc9c6
parent 9642c682be8bb2f1dd0eb616488ccaf2c7bb1ad8 (diff)
download exp007-aae357374282d545f0a036c1c80b007247ff6067.zip
exp007-aae357374282d545f0a036c1c80b007247ff6067.tar.gz
exp007-aae357374282d545f0a036c1c80b007247ff6067.tar.bz2
Updated the merge command to reflect the new table structure in the HDF5 file.
-rw-r--r-- analysis/year.R 29
1 files changed, 20 insertions, 9 deletions
diff --git a/analysis/year.R b/analysis/year.R
index 6d68925..37310d5 100644
--- a/analysis/year.R
+++ b/analysis/year.R
@@ -4,22 +4,33 @@ require(hdf5);
hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);
-A <- influenza.aa.dat;
-B <- influenza.faa;
-
-# Join the two tables by GB value.
-C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB");
+A <- merge (influenza.aa.dat, influenza.faa,
+ by.x = "GenBank accession number",
+ by.y = "GB");
+
+B <- merge (A, gi.type.data,
+ by.x = "GI",
+ by.y = "GI");
+
+# Compare the local copy with a query performed on the NCBI database.
+# A quick check of the number of records returned and the first and
+# last set of GB values in sorted order should not show any
+# inconsistencies.
+T <- B[B$Year == 1978 & B$Type == "A" & B$Protein == "HA", ];
+nrow (T);
+U <- T$"GenBank accession number";
+sort (U);
# All records for 1918. Based on code from
# http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
-D <- C[C$Year == 1918, ]
+C <- B[B$Year == 1918, ]
-summary (D);
+summary (C);
# Countries represented in the 1918 dataset.
-D$Country;
+C$Country;
-D[D$"Protein Type" == "HA", ]
+C[C$Protein == "HA", ]
# All record with a year value.
E <- A[A$Year != 0, ];

Valid XHTML 1.0 Strict

Copyright © 2009 Don Pellegrino All Rights Reserved.