author | Don Pellegrino <don@drexel.edu> | 2010-01-19 20:43:55 (GMT) |
---|---|---|
committer | Don Pellegrino <don@drexel.edu> | 2010-01-19 20:43:55 (GMT) |
commit | aae357374282d545f0a036c1c80b007247ff6067 (patch) (side-by-side diff) | |
tree | 458204c6968acf89a55fa080be0f535cef3fc9c6 | |
parent | 9642c682be8bb2f1dd0eb616488ccaf2c7bb1ad8 (diff) | |
download | exp007-aae357374282d545f0a036c1c80b007247ff6067.zip exp007-aae357374282d545f0a036c1c80b007247ff6067.tar.gz exp007-aae357374282d545f0a036c1c80b007247ff6067.tar.bz2 |
Updated the merge command to reflect the new table structure in the HDF5 file.
-rw-r--r-- | analysis/year.R | 29 |
1 files changed, 20 insertions, 9 deletions
```diff
diff --git a/analysis/year.R b/analysis/year.R
index 6d68925..37310d5 100644
--- a/analysis/year.R
+++ b/analysis/year.R
@@ -4,22 +4,33 @@
 require(hdf5);
 hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);
 
-A <- influenza.aa.dat;
-B <- influenza.faa;
-
-# Join the two tables by GB value.
-C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB");
+A <- merge (influenza.aa.dat, influenza.faa,
+            by.x = "GenBank accession number",
+            by.y = "GB");
+
+B <- merge (A, gi.type.data,
+            by.x = "GI",
+            by.y = "GI");
+
+# Compare the local copy with a query performed on the NCBI database.
+# A quick check of the number of records returned and the first and
+# last set of GB values in sorted order should not show any
+# inconsistencies.
+T <- B[B$Year == 1978 & B$Type == "A" & B$Protein == "HA", ];
+nrow (T);
+U <- T$"GenBank accession number";
+sort (U);
 
 # All records for 1918. Based on code from
 # http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
-D <- C[C$Year == 1918, ]
+C <- B[B$Year == 1918, ]
 
-summary (D);
+summary (C);
 
 # Countries represented in the 1918 dataset.
-D$Country;
+C$Country;
 
-D[D$"Protein Type" == "HA", ]
+C[C$Protein == "HA", ]
 
 # All record with a year value.
 E <- A[A$Year != 0, ];
```