# Explore the qualities of the year feature.

# Attach the hdf5 package.  library() stops with an error when the package
# is missing, whereas require() only warns and returns FALSE, which would
# let the script run on to a less obvious failure at hdf5load().
library(hdf5)

# Load the contents of the HDF5 file.  The rest of this script refers to
# influenza.aa.dat, influenza.faa and gi.type.data without defining them,
# so they are presumably created in the workspace by this call.
hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE)

# Join influenza.aa.dat to influenza.faa on the accession, which is called
# "GenBank accession number" on the left-hand side and "GB" on the right.
# merge() keeps only rows that have a match on both sides by default.
A <- merge(
  x = influenza.aa.dat,
  y = influenza.faa,
  by.x = "GenBank accession number",
  by.y = "GB"
)

# Join the result to gi.type.data; both sides name the key column "GI".
B <- merge(x = A, y = gi.type.data, by = "GI")

# Compare the local copy with a query performed on the NCBI database.
# A quick check of the number of records returned and the first and
# last set of GB values in sorted order should not show any
# inconsistencies.
#
# The subset deliberately avoids the variable name T: that name is R's
# shorthand for TRUE, and rebinding it silently breaks any later code that
# writes T where it means TRUE.
#
# which() turns the logical mask into row positions and drops NA entries.
# Indexing with the raw mask would add an all-NA row for every record whose
# comparison evaluates to NA, inflating the record count checked below.
is_ha_a_1978 <- B$Year == 1978 & B$Type == "A" & B$Protein == "HA"
ha_a_1978 <- B[which(is_ha_a_1978), ]

# print() keeps the output visible when the script is run through source(),
# where top-level values are not auto-printed.
print(nrow(ha_a_1978))
print(sort(ha_a_1978[["GenBank accession number"]]))

# All records for 1918.  Based on code from
# http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
#
# which() drops rows whose Year comparison is NA; indexing with the raw
# logical mask would return an all-NA row for each of them.
C <- B[which(B$Year == 1918), ]

# print() is explicit so the output also appears when the script is run
# through source(), where top-level values are not auto-printed.
print(summary(C))

# Countries represented in the 1918 dataset.
print(C$Country)

# The 1918 records whose Protein is "HA".
print(C[which(C$Protein == "HA"), ])

# All records with a year value; a Year of 0 is excluded as "no year".
# which() also drops rows whose Year is NA, which the raw logical mask
# would otherwise turn into all-NA rows in E.
E <- A[which(A$Year != 0), ]

# Distribution of the remaining years.
hist(E$Year)