# Explore the qualities of the year feature.
#
# Loads the local influenza HDF5 dump, joins the tables it provides into one
# data frame, and then inspects individual years (1978, 1918) plus the
# overall distribution of the Year column.

# library() stops with an error when the package is missing; require() would
# only return FALSE and let the script fail later in hdf5load().
library(hdf5)

# hdf5load() is expected to create influenza.aa.dat, influenza.faa and
# gi.type.data in the workspace -- NOTE(review): confirm against the contents
# of the .h5 file, they are not defined anywhere in this script.
hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE)

# Join on the GenBank accession number, then attach the type data via GI.
A <- merge(
  influenza.aa.dat,
  influenza.faa,
  by.x = "GenBank accession number",
  by.y = "GB"
)
B <- merge(A, gi.type.data, by = "GI")

# Compare the local copy with a query performed on the NCBI database.
# A quick check of the number of records returned and the first and
# last set of GB values in sorted order should not show any
# inconsistencies.
#
# The subset is deliberately not called `T`: that name would mask R's
# built-in alias for TRUE for the rest of the session.
# NOTE(review): if Year, Type or Protein can contain NA, logical indexing
# returns all-NA rows for them -- confirm that missing values are encoded
# otherwise (e.g. Year == 0, as the final filter below suggests).
ha_a_1978 <- B[B$Year == 1978 & B$Type == "A" & B$Protein == "HA", ]
nrow(ha_a_1978)
U <- ha_a_1978[["GenBank accession number"]]
sort(U)

# All records for 1918. Based on code from
# http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
C <- B[B$Year == 1918, ]
summary(C)

# Countries represented in the 1918 dataset.
C$Country

# HA records among the 1918 data.
C[C$Protein == "HA", ]

# All records with a year value (rows whose Year is 0 are excluded).
E <- A[A$Year != 0, ]
hist(E$Year)