make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / R / hapMap2numeric.R
blobad5feb9336aa23dd1fdc6174ec538ddff4310324
1 hapMap2numeric <- function(file){\r
2   hapmap <- as.matrix(read.table(file, header=TRUE, row.names=1, sep="\t",\r
3                                  stringsAsFactors=FALSE)[,-(2:10)])\r
4   \r
5   samples <- dimnames(hapmap)[[2]][-1]\r
6   loci <- dimnames(hapmap)[[1]]\r
7   \r
8   #make sure sites are biallelic\r
9   siteCheck <- names(table(hapmap[ ,1]))\r
10   if(sum(grepl("/", siteCheck)) != length(siteCheck)){\r
11     stop("There appear to be monomorphic sites in your file. Please remove them.")\r
12   }\r
13   \r
14   siteCheck2 <- strsplit(siteCheck, "/")\r
15   siteCheck2 <- unlist(lapply(siteCheck2, length))\r
16   siteCheck2 <- siteCheck2 == 2\r
17   if(sum(siteCheck2) != length(siteCheck2)){\r
18     stop("There appear to be sites with more than two allele states in your file.\r
19          Please allow a maximum of two alleles per site in your file.")\r
20   }\r
21   \r
22   \r
23   #check whether the file uses IUPAC coding or double letters\r
24   nFor <- nchar(hapmap[1, 2])\r
25   \r
26   \r
27   #use the appropriate conversions\r
28   if(nFor == 2){\r
29     # set up conversion table\r
30     s <- as.integer(c(0,1,1,2,NA))\r
31     ac <- s\r
32     ag <- s\r
33     at <- s\r
34     cg <- s\r
35     ct <- s\r
36     gt <- s\r
37     cx <- s\r
38     gx <- s\r
39     tx <- s\r
40     ax <- s\r
41     names(ac) <- c("AA","AC","CA","CC","NN")\r
42     names(ag) <- c("AA","AG","GA","GG","NN")\r
43     names(at) <- c("AA","AT","TA","TT","NN")\r
44     names(cg) <- c("CC","CG","GC","GG","NN")\r
45     names(ct) <- c("CC","CT","TC","TT","NN")\r
46     names(gt) <- c("GG","GT","TG","TT","NN")\r
47     names(cx) <- c("CC","C-","-C","--","NN")\r
48     names(gx) <- c("GG","G-","-G","--","NN")\r
49     names(tx) <- c("TT","T-","-T","--","NN")\r
50     names(ax) <- c("AA","A-","-A","--","NN")\r
51     conv <- list(ac,ac,ag,ag,at,at,cg,cg,ct,ct,gt,gt,cx,cx,gx,gx,tx,tx,ax,ax)\r
52     names(conv) <- c("A/C","C/A","A/G","G/A","A/T","T/A","C/G","G/C",\r
53                      "C/T","T/C","G/T","T/G","C/-","-/C","G/-","-/G","T/-","-/T","A/-","-/A")\r
54   }\r
55   \r
56   if(nFor == 1){\r
57   # set up conversion table\r
58   s <- as.integer(c(0,1,2,NA))\r
59   ac <- s\r
60   ag <- s\r
61   at <- s\r
62   cg <- s\r
63   ct <- s\r
64   gt <- s\r
65   cx <- s\r
66   gx <- s\r
67   tx <- s\r
68   ax <- s\r
69   names(ac) <- c("A","M","C","N")\r
70   names(ag) <- c("A","R","G","N")\r
71   names(at) <- c("A","W","T","N")\r
72   names(cg) <- c("C","S","G","N")\r
73   names(ct) <- c("C","Y","T","N")\r
74   names(gt) <- c("G","K","T","N")\r
75   names(cx) <- c("C","0","-","N")\r
76   names(gx) <- c("G","0","-","N")\r
77   names(tx) <- c("T","0","-","N")\r
78   names(ax) <- c("A","0","-","N")\r
79   conv <- list(ac,ac,ag,ag,at,at,cg,cg,ct,ct,gt,gt,cx,cx,gx,gx,tx,tx,ax,ax)\r
80   names(conv) <- c("A/C","C/A","A/G","G/A","A/T","T/A","C/G","G/C",\r
81                    "C/T","T/C","G/T","T/G","C/-","-/C","G/-","-/G","T/-","-/T","A/-","-/A")\r
82   }\r
83   \r
84   # matrix to hold output\r
85   x <- matrix(NA, nrow=length(samples), ncol=length(loci),\r
86               dimnames=list(samples, loci))\r
87   # convert genotypes\r
88   for(L in 1:length(loci)){\r
89     thisconv <- conv[[hapmap[L, 1]]]\r
90     x[,L] <- thisconv[hapmap[L, -1]]\r
91   }\r
92   \r
93   return(x)\r
94 }\r