From aae357374282d545f0a036c1c80b007247ff6067 Mon Sep 17 00:00:00 2001
From: Don Pellegrino
Date: Tue, 19 Jan 2010 20:43:55 +0000
Subject: Updated the merge command to reflect the new table structure in the HDF5 file.

---
Review notes (free text in this section is not part of the diff):

 * The patch had been collapsed onto a single line; its line structure is
   restored here.  Spacing inside context and removed lines is
   reconstructed from the collapsed text, so apply with whitespace
   tolerance if it does not match year.R exactly.
 * The scratch variables of the 1978 consistency check are named
   check_1978 / check_gb instead of T / U.  Assigning to T masks R's
   built-in T (shorthand for TRUE) in the global environment.
 * Both merge() calls now carry a comment; the hunk therefore adds 35
   lines instead of 33.  The index line still carries the blob ids of
   the original commit.
 * NOTE(review): E is still derived from A, which after this change is
   the merged table rather than influenza.aa.dat.  Confirm this is
   intended.

diff --git a/analysis/year.R b/analysis/year.R
index 6d68925..37310d5 100644
--- a/analysis/year.R
+++ b/analysis/year.R
@@ -4,22 +4,35 @@
 require(hdf5);
 hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);
 
-A <- influenza.aa.dat;
-B <- influenza.faa;
-
-# Join the two tables by GB value.
-C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB");
+# Join the two tables by GB value (merge keeps only matching rows).
+A <- merge (influenza.aa.dat, influenza.faa,
+            by.x = "GenBank accession number",
+            by.y = "GB");
+
+# Add the columns of gi.type.data, matching rows on GI.
+B <- merge (A, gi.type.data,
+            by.x = "GI",
+            by.y = "GI");
+
+# Compare the local copy with a query performed on the NCBI database.
+# A quick check of the number of records returned and the first and
+# last set of GB values in sorted order should not show any
+# inconsistencies.
+check_1978 <- B[B$Year == 1978 & B$Type == "A" & B$Protein == "HA", ];
+nrow (check_1978);
+check_gb <- check_1978$"GenBank accession number";
+sort (check_gb);
 
 # All records for 1918. Based on code from
 # http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
-D <- C[C$Year == 1918, ]
+C <- B[B$Year == 1918, ]
 
-summary (D);
+summary (C);
 
 # Countries represented in the 1918 dataset.
-D$Country;
+C$Country;
 
-D[D$"Protein Type" == "HA", ]
+C[C$Protein == "HA", ]
 
 # All record with a year value.
 E <- A[A$Year != 0, ];
-- 
cgit v0.8.3.1-22-g547a