modified: makefile
[GalaxyCodeBases.git] / tools / code.py
blobcfdbcaf805876c661a45908f514220b4f7bb02e3
1 #!/usr/bin/python
2 import sys
3 import re
4 # Format
5 # @I321_1_FC30BBUAAXX:2:1:4:1436/1
# Version number; scode() packs it above the low three bits of the
# second encoded part.
ver = 0

# Base-62 alphabet: the value of a digit is its index in this list
# ('0'-'9' -> 0-9, 'A'-'Z' -> 10-35, 'a'-'z' -> 36-61).
rev_code = list(
    "0123456789"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
)

# Inverse lookup: character -> digit value.
code = dict((symbol, value) for value, symbol in enumerate(rev_code))

# Dump the digit table to stdout, one "value symbol" pair per line.
for value, symbol in enumerate(rev_code):
    sys.stdout.write("%d %s\n" % (value, symbol))
def int2bin(integer):
    """Return the low 64 bits of *integer* as a string of '0' and '1'.

    The result is always 64 characters long, most significant bit first.
    Bits above bit 63 are ignored.
    """
    return "".join(
        str((integer >> shift) & 1) for shift in range(63, -1, -1)
    )
# Header layout with a leading machine field, e.g.
#   @I321_1_FC30BBUAAXX:2:1:4:1436/1
# Groups: machine, fc, lane, tile, x, y, pe (pe may be empty).
regexpOld = re.compile(
    r"@(\w+)_FC(\w{1,5})AAXX"   # machine, fc
    r":(\d):(\d+)"              # lane, tile
    r":(-?\d+):(-?\d+)"         # x, y (may be negative)
    r"/?([12]?)"                # optional "/1" or "/2"
)

# Header layout without the machine field and with a "#<word>" field
# before the optional pe suffix (an earlier revision of this pattern
# accepted only "#0" or "#1" there).
# Groups: fc, lane, tile, x, y, pe (pe may be empty).
regexpNew = re.compile(
    r"@FC(\w{1,5})AAXX"         # fc
    r":(\d):(\d+)"              # lane, tile
    r":(-?\d+):(-?\d+)"         # x, y (may be negative)
    r"#\w+/?([12]?)"            # "#<word>", optional "/1" or "/2"
)
def _reject_header(line, reason="The FASTQ header cannot be recognized"):
    """Report a bad header on stderr and terminate with exit status 255."""
    sys.stderr.write("%s\n%s\n" % (reason, line.rstrip()))
    sys.exit(255)


def _base62_lsd_first(number):
    """Return *number* in base 62, least significant digit first.

    Zero yields an empty string, matching the digit loop this replaces.
    """
    digits = []
    while number != 0:
        digits.append(rev_code[number % 62])
        number //= 62  # explicit floor division (same on Python 2 and 3)
    return "".join(digits)


def scode(line):
    """Encode a FASTQ header line as a compact base-62 read name.

    The header is matched against ``regexpNew`` first and ``regexpOld``
    second. The machine field of the old layout is not encoded.

    Packing:
      fc_num = the fc characters, 6 bits each (via the ``code`` table)
      info   = (lane-1)<<34 | (tile-1)<<25 | x<<13 | y<<1 | 1
      part1  = fc_num<<34 | bits 3..36 of info
      part2  = ver<<3 | low 3 bits of info
    The lowest bit of ``info`` is always set; the pe value is carried
    textually in the "/pe" suffix instead.

    Negative x or y values are shifted up by 4096 before packing.

    Returns:
        "@" + part2 digits + "_" + part1 digits + "/" + pe, with both
        digit runs in base 62, least significant digit first. ``pe``
        defaults to '1' when the header has no "/1" or "/2" suffix.

    Exits (status 255, message on stderr) when the header matches
    neither pattern, when the fc field contains a character missing
    from ``code``, or when a field does not fit its bit width.
    """
    match = re.match(regexpNew, line)
    if match is not None:
        fc, lane, tile, x, y, pe = match.groups()
    else:
        # re.match returns None on failure, so the fallback pattern must
        # be tried explicitly rather than relying on an exception.
        match = re.match(regexpOld, line)
        if match is None:
            _reject_header(line)
        fc, lane, tile, x, y, pe = match.groups()[1:]

    # The optional pe group matches the empty string when the suffix is
    # absent; fall back to the default instead of emitting a bare "/".
    if not pe:
        pe = '1'

    fc_num = 0
    for char in fc:
        # \w also admits characters (e.g. "_") that have no base-62 value.
        if char not in code:
            _reject_header(line)
        fc_num = (fc_num << 6) | code[char]

    lane, tile, x, y = int(lane), int(tile), int(x), int(y)
    if x < 0:
        x += 4096
    if y < 0:
        y += 4096
    lane -= 1
    tile -= 1

    # Explicit range check (not an assert, which -O would strip): each
    # field must fit its slot or it would overwrite neighbouring bits.
    if not (0 <= lane < 8 and 0 <= tile < 512
            and 0 <= x < 4096 and 0 <= y < 4096):
        _reject_header(line, "The FASTQ header fields are out of range")

    info = (lane << 34) | (tile << 25) | (x << 13) | (y << 1) | 1
    part1 = (fc_num << 34) | ((info >> 3) & ((1 << 34) - 1))
    part2 = (ver << 3) | (info & 7)

    return "@%s_%s/%s" % (
        _base62_lsd_first(part2),
        _base62_lsd_first(part1),
        pe,
    )
def decode(code_string):
    """Unpack a base-62 encoded read name into its fields.

    *code_string* is read from its last character to its first. Digits
    seen before the first "_" accumulate into ``part1``; digits after it
    accumulate into ``part2``. Every character other than "_" is looked
    up in the module-level ``code`` table, so a character missing from
    that table raises ``KeyError``.
    NOTE(review): scode() output also carries a leading "@" and a
    trailing "/pe"; those presumably have to be stripped by the caller
    before decoding -- confirm the intended usage.

    Returns:
        Tuple ``(fc, lane, tile, x, y, pe)`` where ``fc`` is a string and
        the remaining items are ints. ``lane`` and ``tile`` are 1-based.
        ``pe`` is derived from the lowest bit of ``part2`` (bit + 1).
        Leading fc characters whose digit value is 0 are not restored,
        because rebuilding stops once the remaining number reaches zero.
    """
    part1 = part2 = 0
    in_part1 = True
    for char in reversed(code_string):
        if char == "_":
            in_part1 = False
        elif in_part1:
            part1 = part1 * 62 + code[char]
        else:
            part2 = part2 * 62 + code[char]

    fc_num = (part1 >> 34) & ((1 << 30) - 1)
    lane = ((part1 >> 31) & 7) + 1
    tile = ((part1 >> 22) & 511) + 1
    x = (part1 >> 10) & 4095
    # y is split: its upper 10 bits sit in part1, its low 2 bits in part2.
    y = ((part1 & 1023) << 2) | ((part2 >> 1) & 3)
    pe = (part2 & 1) + 1

    fc = ""
    while fc_num != 0:
        fc = rev_code[fc_num & 63] + fc
        fc_num >>= 6

    # Previously the decoded fields were computed and then discarded.
    return (fc, lane, tile, x, y, pe)
# Command-line driver: python <script> FASTQ OUTPUT
#
# Rewrites FASTQ records four lines at a time: line 1 of each record is
# replaced by scode(line), line 3 by a bare "+", and lines 2 and 4 are
# copied with trailing whitespace stripped.

def _open_or_exit(path, mode):
    """Open *path*, or report the failure on stderr and exit with status 1."""
    try:
        return open(path, mode)
    except IOError as err:
        # Name the file that actually failed together with the system's
        # reason (a missing input still reads
        # "No such file or directory: <path>").
        sys.stderr.write("%s: %s\n" % (err.strerror, path))
        sys.exit(1)


try:
    fastq_path = sys.argv[1]
    output_path = sys.argv[2]
except IndexError:
    sys.stderr.write("python %s FASTQ OUTPUT\n" % sys.argv[0])
    sys.exit(1)

fastq = _open_or_exit(fastq_path, "r")
try:
    output = _open_or_exit(output_path, "w")
    try:
        line_count = 0
        for line in fastq:
            line_count += 1
            position = line_count % 4
            if position == 1:
                output.write(scode(line) + "\n")
            elif position == 3:
                output.write("+\n")
            else:
                output.write(line.rstrip() + "\n")
    finally:
        # Close (and flush) the output even when scode() exits early.
        output.close()
finally:
    fastq.close()