modified: myjupyterlab.sh
[GalaxyCodeBases.git] / python / lc / lc.py
blob48cdf62c4bea2f34e0c6a0ee38bed1b7400a368d
1 #!/usr/bin/env python3
2 import site,os; site.addsitedir('/'.join((os.path.expanduser("~"),'git/toGit/python/Gsite')))
3 #import sys; print('\n'.join(sys.path))
4 import Galaxy.asizeof
6 import csv,sys,math
7 from collections import OrderedDict
8 from collections import defaultdict
10 naStr = 'NA'
11 ZoneNum = 50.0
13 import pprint
14 from xopen import xopen
16 class Vividict(dict):
17 def __missing__(self, key):
18 value = self[key] = type(self)()
19 return value
21 print(sys.argv)
23 MethRate = Vividict()
24 MethRateCnt = 0
25 with xopen('meth.cg.rate.bz2','rt') as f:
26 #with open('meth.cg.rate','rt') as f:
27 tsvin = csv.reader(f, delimiter='\t')
28 for row in tsvin:
29 if row[0].startswith('#'):
30 continue
31 #print(row)
32 #Rate = (int(row[3])+int(row[6]))/(int(row[4])+int(row[7]))
33 MethRate[row[0]][int(row[1])] = float(row[9])
34 MethRateCnt += 1
35 #pprint.pprint(MethRate)
36 print(Galaxy.asizeof.asizeof(MethRate))
38 #CDSdat = OrderedDict()
39 CDSdatCnt = 0
40 CDSdat = defaultdict(set)
42 with xopen('CDS.bed.gz','rt') as f:
43 #with open('CDS.bed','rt') as f:
44 tsvin = csv.reader(f, delimiter='\t')
45 for row in tsvin:
46 ZoneLength = 1 + int(row[2]) - int(row[1])
47 #print((ZoneLength,row))
48 theKey = '\t'.join(row[0:3])
49 #CDSdat[theKey]=','.join(filter(None,(CDSdat.setdefault(theKey),row[3])))
50 #CDSdat[theKey] |= {row[3]}
51 CDSdat[theKey].add(row[3])
52 CDSdatCnt += 1
53 #pprint.pprint(CDSdat)
54 print(Galaxy.asizeof.asizeof(CDSdat))
55 print((MethRateCnt,CDSdatCnt,len(MethRate),len(CDSdat)))
57 with open('meth.zones','wt') as outf:
58 for k, v in CDSdat.items():
59 v = ','.join(sorted(v))
60 (Chrid,pLeft,pRight) = k.split('\t')
61 pLeft = int(pLeft)
62 pRight = int(pRight)
63 #print((k,v,CDSdat[k],Chrid,pLeft,pRight))
64 ZoneLen = 1 + pRight - pLeft
65 ZoneStep = math.ceil(float(ZoneLen)/ZoneNum)
66 ZoneCnt = math.ceil(ZoneLen/ZoneStep)
67 ZoneValues = []
68 ZoneValueCntV = ZoneCnt
69 startPos = pLeft
70 while startPos <= pRight:
71 endPos = startPos + ZoneStep
72 if endPos > pRight: endPos = pRight
73 ZoneSum = 0
74 ZoneHits = 0
75 for p in range(startPos,endPos):
76 if p in MethRate[Chrid]:
77 ZoneSum += MethRate[Chrid][p]
78 ZoneHits += 1
79 if ZoneHits == 0:
80 ZoneValues.append(naStr)
81 ZoneValueCntV -= 1
82 else:
83 ZoneValue1 = float(ZoneSum/ZoneHits)
84 ZoneValues.append("%.8f" % ZoneValue1)
85 startPos += ZoneStep
86 ZoneValues = [v,k,','.join(map(str,(ZoneStep,ZoneCnt,ZoneValueCntV)))] + ZoneValues
87 if ZoneValueCntV > 0:
88 print('\t'.join(ZoneValues),file=outf,flush=True)
90 #for theKey in CDSdat.keys():
91 # print(theKey,CDSdat[theKey])
92 '''
93 scp t.py blc4:/ldfssz1/FGI/users/huxuesong/t/lc/
95 f=magic.from_file('CDS.bed.gz')
96 print(f)
97 f=magic.from_file('meth.cg.rate.bz2')
98 print(f)
99 f=magic.from_file('adapter.fa')
100 print(f)
102 15615200
103 81835576
104 (164859, 202393, 25, 198788)