2 """A command line tools for CMDB variants browser
16 from datetime
import datetime
17 from urllib
import urlencode
18 from urllib2
import Request
, urlopen
, HTTPError
20 if sys
.version_info
.major
!= 2:
21 raise Exception('This tool supports only python2')
23 argparser
= argparse
.ArgumentParser(description
='Manage authentication for CMDB API and do querying from command line.')
24 commands
= argparser
.add_subparsers(dest
='command', title
='Commands')
26 login_command
= commands
.add_parser('login', help='Authorize access to CMDB API.')
27 logout_command
= commands
.add_parser('logout', help='Logout CMDB.')
28 token_command
= commands
.add_parser('print-access-token', help='Display access token for CMDB API.')
30 login_command
.add_argument('-k', '--token', type=str, required
=True, dest
='token',
31 help='CMDB API access key(Token).')
33 annotate_command
= commands
.add_parser('annotate', help='Annotate input VCF.',
34 description
='Input VCF file. Multi-allelic variant records in input VCF must be '
35 'split into multiple bi-allelic variant records.')
36 annotate_command
.add_argument('-i', '--vcffile', metavar
='VCF_FILE', type=str, required
=True, dest
='in_vcffile',
37 help='input VCF file.')
39 query_rs_command
= commands
.add_parser('query-rs',
40 help='Query variant by variant identifier or by dbSNP rsID.',
42 query_rs_command
.add_argument('-r', '--rsid', metavar
='rsid', type=str, dest
='rsid',
43 help='dbSNP rsID.', default
=None)
44 query_rs_command
.add_argument('-l', '--list', metavar
='File-contain-a-list-of-dbSNP-rsIDs',
45 type=str, dest
='list',
46 help='dbSNP rsIDs list in a file. One for each line.'
50 query_variant_command
= commands
.add_parser('query-variant',
51 help='Query variant by variant identifier or by chromosome name and '
52 'chromosomal position.',
53 description
='Query variant by identifier chromosome name and chromosomal '
55 query_variant_command
.add_argument('-c', '--chromosome', metavar
='name', type=str, dest
='chromosome',
56 help='Chromosome name.', default
=None)
57 query_variant_command
.add_argument('-p', '--position', metavar
='genome-position', type=int, dest
='position',
58 help='Genome position.', default
=None)
59 query_variant_command
.add_argument('-l', '--positions', metavar
='File-contain-a-list-of-genome-positions',
60 type=str, dest
='positions',
61 help='Genome positions list in a file. One for each line. You can input single '
62 'position by -c and -p or using -l for multiple poisitions in a single file, '
65 # query_variant_command.add_argument('-v', '--variant', metavar='chrom-pos-ref-alt/rs#', type=str, dest='variant_id',
66 # help='Variant identifier CHROM-POS-REF-ALT or rs#.')
67 # query_variant_command.add_argument('-o', '--output', required=False, choices=['json', 'vcf'], default='json',
68 # dest='format', help='Output format.')
70 USER_HOME
= os
.path
.expanduser("~")
72 CMDB_TOKENSTORE
= 'authaccess.yaml'
74 CMDB_DATASET_VERSION
= 'CMDB_hg19_v1.0'
75 CMDB_API_VERSION
= 'v1.0'
76 CMDB_API_MAIN_URL
= 'https://db.cngb.org/cmdb/api/{}'.format(CMDB_API_VERSION
)
79 '##fileformat=VCFv4.2',
80 '##FILTER=<ID=LowQual,Description="Low quality">',
81 '##INFO=<ID=CMDB_AN,Number=1,Type=Integer,Description="Number of Alleles in Samples with Coverage from {}">'.format(
82 CMDB_DATASET_VERSION
),
83 '##INFO=<ID=CMDB_AC,Number=A,Type=Integer,Description="Alternate Allele Counts in Samples with Coverage from {}">'.format(
84 CMDB_DATASET_VERSION
),
85 '##INFO=<ID=CMDB_AF,Number=A,Type=Float,Description="Alternate Allele Frequencies from {}">'.format(CMDB_DATASET_VERSION
),
86 '##INFO=<ID=CMDB_FILTER,Number=A,Type=Float,Description="Filter from {}">'.format(CMDB_DATASET_VERSION
),
87 '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO'
91 class CMDBException(Exception):
92 def __init__(self
, message
):
93 self
.message
= message
100 if not authaccess_exists():
101 print "No access tokens found. Please login first.\n"
104 tokenstore
= read_tokenstore()
105 return tokenstore
["version"]
108 class Requests(object):
109 # this implements the parts we need of the real `Requests` module
111 def get(url
, headers
={}, params
=None):
113 url
+= '?' + urlencode(params
)
115 r
= Request(url
=url
, headers
=headers
)
117 response
= urlopen(r
)
121 return _RequestsResponse(response
)
124 def post(url
, headers
={}, data
=None):
125 if data
is not None and isinstance(data
, dict):
126 data
= urlencode(data
)
128 r
= Request(url
, headers
=headers
, data
=data
)
129 return _RequestsResponse(urlopen(r
))
132 class _RequestsResponse(object):
133 def __init__(self
, response
):
136 self
.status_code
= response
.getcode()
137 self
._json
= json
.load(response
.fp
)
139 self
.status_code
= 404
146 class _RequestsExceptions(object):
150 Requests
.exceptions
= _RequestsExceptions
151 Requests
.exceptions
.RequestException
= HTTPError
154 def authaccess_exists():
155 return os
.path
.isfile(os
.path
.join(USER_HOME
, CMDB_DIR
, CMDB_TOKENSTORE
))
158 def create_tokenstore():
159 p
= os
.path
.join(USER_HOME
, CMDB_DIR
)
160 if not os
.path
.isdir(p
):
163 p
= os
.path
.join(p
, CMDB_TOKENSTORE
)
164 if not os
.path
.isfile(p
):
170 def read_tokenstore():
171 token_path
= os
.path
.join(USER_HOME
, CMDB_DIR
, CMDB_TOKENSTORE
)
172 with
open(token_path
, 'r') as I
:
173 tokenstore
= yaml
.load(I
)
175 access_token
= tokenstore
.get('access_token', None)
176 if access_token
is None or not isinstance(access_token
, basestring
):
177 raise CMDBException('Invalid or outdated access token. You may need to run login.')
182 def write_tokenstore(token
):
183 file_path
= os
.path
.join(USER_HOME
, CMDB_DIR
, CMDB_TOKENSTORE
)
184 with
open(file_path
, 'w') as tokenstore
:
186 "access_token": token
,
187 "version": CMDB_DATASET_VERSION
189 yaml
.dump(token_obj
, tokenstore
)
191 os
.chmod(file_path
, 0600)
195 # Test the token is available or not
196 test_url
= "https://db.cngb.org/cmdb/api/v1.0/variant?token={}&type=position&query=chr17-41234470".format(token
)
197 cmdb_response
= Requests
.get(test_url
)
199 if cmdb_response
.status_code
!= 201:
200 raise CMDBException('Error while obtaining your token with CMDB API authentication server.'
201 'You may do not have the API access or the token is wrong.\n')
203 if not authaccess_exists():
206 write_tokenstore(token
)
207 sys
.stdout
.write("Done.\nYou are signed in now.\n")
213 # logout by delete the tokenstore file
214 if not authaccess_exists():
215 sys
.stderr
.write("Don't find any your access token, no need to logout.\n")
218 file_path
= os
.path
.join(USER_HOME
, CMDB_DIR
, CMDB_TOKENSTORE
)
220 sys
.stdout
.write("Done.\nLogout successful.\n")
225 def print_access_token():
227 if not authaccess_exists():
228 sys
.stderr
.write('No access tokens found. Print nothing.\n')
231 tokenstore
= read_tokenstore()
232 print (tokenstore
['access_token'])
235 def _query_paged(headers
, url
):
238 cmdb_response
= Requests
.get(url
, headers
=headers
)
239 if cmdb_response
.status_code
!= 200:
240 if cmdb_response
.status_code
== 400:
241 raise CMDBException(cmdb_response
.json().get('error', 'Failed to query data.'))
243 cmdb_response
.raise_for_status()
245 cmdb_response_data
= cmdb_response
.json()
246 if cmdb_response_data
['format'] == 'vcf' and page_no
== 1:
247 for line
in cmdb_response_data
['meta']:
250 yield cmdb_response_data
['header']
252 for item
in cmdb_response_data
['data']:
255 url
= cmdb_response_data
['next']
259 def _query_nonpaged(token
, url
):
260 cmdb_response
= Requests
.get("{}&token={}".format(url
, token
))
261 if cmdb_response
.status_code
!= 201:
262 if cmdb_response
.status_code
== 403:
263 raise CMDBException(cmdb_response
.json().get('error', 'Failed to query data.'))
265 elif cmdb_response
.status_code
== 404:
268 cmdb_response
.raise_for_status()
270 return cmdb_response
.json()
274 if not authaccess_exists():
275 sys
.stderr
.write('No access tokens found. Please login first.\n')
278 tokenstore
= read_tokenstore()
280 raise CMDBException('Provide "-r rsid".')
282 query_url
= '{}/variant?&type=rs&query={}'.format(CMDB_API_MAIN_URL
, rsid
)
283 return _query_nonpaged(tokenstore
["access_token"], query_url
)
286 def query_variant(chromosome
, position
):
287 if not authaccess_exists():
288 sys
.stderr
.write('No access tokens found. Please login first.\n')
291 tokenstore
= read_tokenstore()
292 if chromosome
is None or position
is None:
293 raise CMDBException('Provide both "-c,--chromosome" and "-p,--position".')
295 query_url
= '{}/variant?&type=position&query={}-{}'.format(CMDB_API_MAIN_URL
, chromosome
, position
)
296 return _query_nonpaged(tokenstore
["access_token"], query_url
)
299 def annotate(infile
, filter=None):
300 if not authaccess_exists():
301 raise CMDBException('[ERROR] No access tokens found. Please login first.\n')
303 data_version
= load_version()
304 with gzip
.open(infile
) if infile
.endswith('.gz') else open(infile
) as I
:
308 if in_line
.startswith('#'):
309 if in_line
.startswith('##'):
310 sys
.stdout
.write('{}\n'.format(in_line
.rstrip()))
312 elif in_line
.startswith('#CHROM'):
315 '##INFO=<ID=CMDB_AN,Number=1,Type=Integer,Description="Number of Alleles in Samples with Coverage from {}">\n'.format(
318 '##INFO=<ID=CMDB_AC,Number=A,Type=Integer,Description="Alternate Allele Counts in Samples with Coverage from {}">\n'.format(
321 '##INFO=<ID=CMDB_AF,Number=A,Type=Float,Description="Alternate Allele Frequencies from {}">\n'.format(
324 '##INFO=<ID=CMDB_FILTER,Number=A,Type=Float,Description="Filter from {}">\n'.format(
326 sys
.stdout
.write('{}\n'.format(in_line
.rstrip()))
330 in_fields
= in_line
.rstrip().split()
331 if 'chr' not in in_fields
[0].lower():
332 in_fields
[0] = 'chr' + in_fields
[0].lower()
334 chromosome
= in_fields
[0]
335 position
= int(in_fields
[1])
337 alt
= in_fields
[4] # assume bi-allelic
339 cmdb_variant
= query_variant(chromosome
, position
)
340 # Must just be one element in the `list`
342 cmdb_variant
= cmdb_variant
[0]
344 # ignore `chr` in `variant` which could be compared with variant-id in CMDB
345 variants
= ['-'.join([chromosome
.split('chr')[-1], str(position
), ref
, a
]).upper()
346 for a
in alt
.split(',')]
348 if cmdb_variant
is None or (cmdb_variant
and (cmdb_variant
['variant_id'] not in variants
)):
349 sys
.stdout
.write('{}\n'.format('\t'.join(in_fields
)))
353 'CMDB_AN': 'CMDB_AN={}'.format(cmdb_variant
['allele_num']),
354 'CMDB_AC': 'CMDB_AC={}'.format(cmdb_variant
['allele_count']),
355 'CMDB_AF': 'CMDB_AF={}'.format(cmdb_variant
['allele_freq']),
356 'CMDB_FILTER': 'CMDB_FILTER={}'.format(cmdb_variant
['filter_status'])
361 for c
in info
.split(';'):
363 if k
not in new_info
:
366 info
= ';'.join([new_info
[k
] for k
in sorted(new_info
.keys())])
367 if len(in_fields
) > 8:
369 sys
.stdout
.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
370 chromosome
, position
, in_fields
[2], ref
, alt
, in_fields
[5], in_fields
[6], info
,
371 '\t'.join(in_fields
[8:]))
374 sys
.stdout
.write('{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n'.format(
375 chromosome
, position
, in_fields
[2], ref
, alt
, in_fields
[5], in_fields
[6], info
)
380 def run_rs_variant(rsids
):
381 if not authaccess_exists():
382 raise CMDBException('[ERROR] No access tokens found. Please login first.\n')
383 sys
.stdout
.write('%s\n' % '\n'.join(CMDB_VCF_HEADER
))
385 variants
= query_rs(rsid
)
388 for cmdb_variant
in variants
:
390 cmdb_variant
['chrom'],
392 cmdb_variant
['rsid'],
395 cmdb_variant
['site_quality'],
396 cmdb_variant
['filter_status'],
397 'CMDB_AF={},CMDB_AC={},CMDB_AN={}'.format(
398 cmdb_variant
['allele_freq'],
399 cmdb_variant
['allele_count'],
400 cmdb_variant
['allele_num']
403 sys
.stdout
.write('%s\n' % '\t'.join(map(str, vcf_line
)))
406 def run_query_variant(positions
):
408 if not authaccess_exists():
409 raise CMDBException('[ERROR] No access tokens found. Please login first.\n')
411 sys
.stdout
.write('%s\n' % '\n'.join(CMDB_VCF_HEADER
))
412 for chromosome
, position
in positions
:
414 variants
= query_variant(chromosome
, position
)
418 for cmdb_variant
in variants
:
420 cmdb_variant
['chrom'],
422 cmdb_variant
['rsid'],
425 cmdb_variant
['site_quality'],
426 cmdb_variant
['filter_status'],
427 'CMDB_AF={},CMDB_AC={},CMDB_AN={}'.format(
428 cmdb_variant
['allele_freq'],
429 cmdb_variant
['allele_count'],
430 cmdb_variant
['allele_num']
433 sys
.stdout
.write('%s\n' % '\t'.join(map(str, vcf_line
)))
439 START_TIME
= datetime
.now()
441 args
= argparser
.parse_args()
443 if args
.command
== 'login':
446 elif args
.command
== 'logout':
449 elif args
.command
== 'print-access-token':
452 elif args
.command
== 'query-rs':
453 if not (args
.rsid
or args
.list):
454 sys
.stderr
.write("[Error] Couldn't find any rsIDs. You must input single rsID by '-r' "
455 "or multiple rsIDs in one single file by '-l'.\n")
460 rsIDs
.append(args
.rsid
.lower())
462 # Fetching positions from a single file
463 with gzip
.open(args
.list) if args
.list.endswith('.gz') else open(args
.list) as P
:
465 # skip header information
466 if line
.startswith("#"):
469 rsIDs
.append(col
.lower())
470 run_rs_variant(rsIDs
)
472 elif args
.command
== 'query-variant':
474 if not ((args
.chromosome
and args
.position
) or args
.positions
):
475 sys
.stderr
.write("[Error] Couldn't find any positions. You must input single position by '-c and -p' "
476 "or multiple positions in one single file by '-l'.\n")
480 if args
.chromosome
and args
.position
:
481 if 'chr' not in args
.chromosome
.lower():
482 args
.chromosome
= 'chr' + args
.chromosome
.lower()
484 positions
.append([args
.chromosome
.lower(), args
.position
])
487 # Fetching positions from a single file
488 with gzip
.open(args
.positions
) if args
.positions
.endswith('.gz') else open(args
.positions
) as P
:
491 # skip header information
492 if line
.startswith("#"):
495 col
= line
.strip().split()
497 if 'chr' not in col
[0].lower():
498 col
[0] = 'chr' + col
[0].lower()
501 positions
.append([col
[0], int(col
[1])])
504 start
, end
= map(int, col
[1:])
505 for position
in range(start
, end
+1):
506 positions
.append([col
[0], position
])
508 sys
.stderr
.write("[Error] Unexpected format hit %s in %s.\n" % (line
, args
.positions
))
510 # sorted and query positions
511 positions
.sort(key
=lambda A
:(A
[0], A
[1]))
512 run_query_variant(positions
)
514 elif args
.command
== 'query-variants':
517 elif args
.command
== 'annotate':
518 annotate(args
.in_vcffile
, filter=None)
520 except CMDBException
as e
:
524 elasped_time
= datetime
.now() -START_TIME
525 sys
.stderr
.write("** Query CMDB done, %d seconds elapsed **\n" % (elasped_time
.seconds
))
528 if __name__
== '__main__':