1 files changed, 20 insertions, 9 deletions
diff --git a/analysis/year.R b/analysis/year.R
index 6d68925..37310d5 100644
--- a/analysis/year.R
+++ b/analysis/year.R
@@ -4,22 +4,33 @@ require(hdf5);
 hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);
-A <- influenza.aa.dat;
+A <- merge (influenza.aa.dat, influenza.faa,
-B <- influenza.faa;
+            by.x = "GenBank accession number",
+            by.y = "GB");
-# Join the two tables by GB value.
-C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB");
+B <- merge (A, gi.type.data,
+            by.x = "GI",
+            by.y = "GI");
+# Compare the local copy with a query performed on the NCBI database.
+# A quick check of the number of records returned and the first and
+# last set of GB values in sorted order should not show any
+# inconsistencies.
+T <- B[B$Year == 1978 & B$Type == "A" & B$Protein == "HA", ];
+nrow (T);
+U <- T$"GenBank accession number";
+sort (U);
 # All records for 1918.  Based on code from
 # http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
-D <- C[C$Year == 1918, ]
+C <- B[B$Year == 1918, ]
-summary (D);
+summary (C);
 # Countries represented in the 1918 dataset.
-D$Country;
+C$Country;
-D[D$"Protein Type" == "HA", ]
+C[C$Protein == "HA", ]
 # All records with a year value.
 E <- A[A$Year != 0, ];

diff --git a/analysis/year.R b/analysis/year.R index 6d68925..37310d5 100644 --- a/analysis/year.R +++ b/analysis/year.R
@@ -4,22 +4,33 @@ require(hdf5);
4		4
5	hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);	5	hdf5load("/home/don/exp007/src/influenza.h5", tidy = TRUE);
6		6
7	A <- influenza.aa.dat;	7	A <- merge (influenza.aa.dat, influenza.faa,
8	B <- influenza.faa;	8	by.x = "GenBank accession number",
9		9	by.y = "GB");
10	# Join the two tables by GB value.	10
11	C <- merge (A, B, by.x = "GenBank accession number", by.y = "GB");	11	B <- merge (A, gi.type.data,
		12	by.x = "GI",
		13	by.y = "GI");
		14
		15	# Compare the local copy with a query performed on the NCBI database.
		16	# A quick check of the number of records returned and the first and
		17	# last set of GB values in sorted order should not show any
		18	# inconsistencies.
		19	T <- B[B$Year == 1978 & B$Type == "A" & B$Protein == "HA", ];
		20	nrow (T);
		21	U <- T$"GenBank accession number";
		22	sort (U);
12		23
13	# All records for 1918. Based on code from	24	# All records for 1918. Based on code from
14	# http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations	25	# http://wiki.r-project.org/rwiki/doku.php?id=tips:data-frames:select_observations
15	D <- C[C$Year == 1918, ]	26	C <- B[B$Year == 1918, ]
16		27
17	summary (D);	28	summary (C);
18		29
19	# Countries represented in the 1918 dataset.	30	# Countries represented in the 1918 dataset.
20	D$Country;	31	C$Country;
21		32
22	D[D$"Protein Type" == "HA", ]	33	C[C$Protein == "HA", ]
23		34
24	# All records with a year value.	35	# All records with a year value.
25	E <- A[A$Year != 0, ];	36	E <- A[A$Year != 0, ];