Bio/Restriction/*: move to another repo with same name.
[bioperl-live.git] / scripts / DB-HIV / bp_hivq.pl
blob8b05badba1781b93a99f4efb0c878a0f1e60bc18
1 #!/usr/bin/perl
2 # command-line script for HIV sequence queries
3 # using HIV.pm and HIVQuery.pm
5 =head1 NAME
7 bp_hivq.PL - an interactive command-line interface to L<Bio::DB::HIV> and L<Bio::DB::Query::HIVQuery>
9 =head1 SYNOPSIS
11 $ perl bp_hivq.PL
12 hivq> query C[subtype] SI[phenotype]
13 hivq> prerun
14 80 sequences returned
15 Query: C[subtype] SI[phenotype]
16 hivq> outfile csi.fas
17 hivq> run
18 Download complete.
19 hivq> outfile dsi.fas
20 hivq> run D[subtype] SI[phenotype]
21 Download complete.
22 hivq> count
23 25 sequences returned
24 Query: D[subtype] SI[phenotype]
25 hivq> exit
28 =head1 DESCRIPTION
30 The BioPerl modules L<Bio::DB::HIV> and L<Bio::DB::Query::HIVQuery>
31 together allow batch queries against the Los Alamos National
32 Laboratories' HIV Sequence Database using a simple query
33 language. C<bp_hivq.PL> provides both an example of the use of these
34 modules, and a standalone interactive command-line interface to the
35 LANL HIV DB. Simple commands allow the user to retrieve HIV sequences
36 and annotations using the query language implemented in
37 L<Bio::DB::Query::HIVQuery>. Visit the man pages for those modules for
38 more details.
40 =head1 USAGE
42 Run the script using C<perl bp_hivq.PL> or, in Unix, C<./bp_hivq.PL>. You will see
43 the
45 hivq>
47 prompt. Type commands with queries to retrieve sequence and annotation
48 data. See the SYNOPSIS for a sample session. Available commands
49 are described below.
51 =head2 TIPS
53 The LANL database is pretty complex and extensive. Use the C<find> facility to
54 explore the available database tables and fields. To identify aliases for a particular field, use C<find alias [fieldname]>. For example, to find a short alias to the
55 weirdly named field C<seq_sample.ssam_second_receptor>, do
57 hivq> find alias seq_sample.ssam_second_receptor
59 which returns
61 coreceptor second_receptor
63 Now, instead of the following query
65 hivq> run C[subtype] CCR5[seq_sample.ssam_second_receptor]
67 you know you can do
69 hivq> run C[subtype] CCR5[coreceptor]
71 Use the C<outfile> command to set the file that receives the retrieved
72 sequences. You can change the current output file simply by issuing a
73 new C<outfile> command during the session. The output file defaults to
74 standard output.
76 Use the C<query> command to validate a query without hitting the
77 database. Use the C<prerun> or C<count> commands to get a count of
78 sequence hits for a query without retrieving the data. Use C<run> or
79 C<do> to perform a complete query, retrieving sequence data into the
80 currently set output files.
82 To process C<bp_hivq.PL> commands in batch, create a text file
83 (C<bp_hivq.cmd>, for example) containing desired commands one per
84 line. Then execute the following from the shell:
86 $ cat bp_hivq.cmd | perl bp_hivq.PL
88 =head1 COMMANDS
90 Here is a complete list of commands. Options in single brackets (C<[req_option]>) are
91 required; options in double brackets (C<[[opt_option]]>) are optional.
93 confirm : Toggle interactive confirmation before
94 executing queries
95 exit : Quit script
96 find : Explore database schema
97 find tables Display all database tables
98 find fields Display all database fields (columns)
99 find fields [table] Display all fields in [table]
100 find alias [field] Display valid aliases for [field]
101 help [[command]] : Show command help
102 if [[command]] not specified, list all
103 available commands
104 id : Display current session id
105 outfile [filename] : Set file for collecting retrieved data
106 ping : Check if LANL DB is available
107 prerun [[query]] : Execute query but retrieve hit count only
108 if [[query]] not specified, use current query
109 query [query] : Validate and set current query
110 run [[query]] : Execute query and retrieve data
111 if [[query]] not specified, use current query
112 state : Display current state of the script
114 bye : Alias for 'exit'
115 config : Alias for 'state'
116 count : Alias for 'prerun'
117 do : Alias for 'run'
118 out : Alias for 'outfile'
119 quit : Alias for 'exit'
121 =head1 OPTIONS
123 -v : verbose; turns on the internal debug() function
125 =head1 FEEDBACK
127 =head2 Mailing Lists
129 User feedback is an integral part of the evolution of this and other
130 Bioperl modules. Send your comments and suggestions preferably to
131 the Bioperl mailing list. Your participation is much appreciated.
133 bioperl-l@bioperl.org - General discussion
134 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
136 =head2 Reporting Bugs
138 Report bugs to the Bioperl bug tracking system to help us keep track
139 of the bugs and their resolution. Bug reports can be submitted via the
140 web:
142 https://github.com/bioperl/bioperl-live/issues
144 =head1 AUTHOR - Mark A. Jensen
146 Mark A. Jensen E<lt>maj@fortinbras.usE<gt>
148 =cut
150 $|=1;
151 use strict;
152 use warnings;
153 use Term::ReadLine;
154 use Bio::DB::HIV;
155 use Bio::DB::Query::HIVQuery;
156 use Bio::SeqIO;
157 use Error qw(:try);
159 use Getopt::Std;
160 our ($opt_v);
161 getopts('v');
162 my $db = new Bio::DB::HIV;
164 my $q = new Bio::DB::Query::HIVQuery();
165 my $schema = $q->_schema;
166 my $seqio;
167 my @cmds = qw(
169 config
170 confirm
171 count
173 exit
174 find
175 help
178 outfile
179 ping
180 prerun
181 query
182 quit
184 state
187 my %state = (
188 'confirm' => 0,
189 'outfile' => {'text'=>'[stdout]','handle'=>\*STDOUT},
190 'query' => '',
191 'session_id' => '',
192 'timeout' => '',
193 'curct' => 0
196 my %help = (
197 'bye' => 'bye: End script',
198 'config' => 'config: Print current configuration state of hivq',
199 'confirm' => 'confirm: Toggle user confirmation of query execution',
200 'count' => 'count [[query string]]: Get number of sequences available for query',
201 'do' => 'do [[query string]]: Run query to completion',
202 'exit' => 'exit: End script',
203 'find' => join("\n","find tables: Print all LANL table names",
204 "find fields: Print all field names",
205 "find fields [tablename]: Print all fields in specified table",
206 "find aliases [fieldname]: Print all valid aliases for specified field",
207 "find options [fieldname/alias]: Print valid match options for specified field"),
208 'help' => 'help [[command]]: Get description of command',
209 'id' => 'id: Print current session id',
210 'out' => 'out [output filename]: Set filename to hold returned sequences',
211 'outfile' => 'outfile [output filename]: Set filename to hold returned sequences',
212 'ping' => 'ping: Ping LANL database',
213 'prerun' => 'prerun [[query string]]: Get number of sequences available',
214 'query' => 'query [[query string]]: Validate and set current query string',
215 'quit' => 'quit: End script',
216 'run' => 'run [[query string]]: Run query to completion',
217 'state' => 'state: Print current configuration state of hivq',
219 my $done = 0;
220 debug("We have arrived.");
221 #### main loop
222 my ($cmd, @tok);
223 my $prompt = "hivq> ";
224 my $term = new Term::ReadLine 'hivq';
225 CMD:
226 while ( !$done ) {
227 @tok=();$cmd='';
228 for ( $cmd = $term->readline($prompt) ) {
229 chomp;
230 s/^\s+//; #trim whsp
231 $term->addhistory($_);
232 @tok = split(/\s+/);
233 $tok[0] = lc $tok[0]; # normalize
234 for ($tok[0]) {
235 (!$_ || /^\#/) && do {
236 next CMD;
238 !(grep(/^$tok[0]$/, @cmds)) && do {
239 error("Unrecognized command \'$tok[0]\'");
240 next CMD;
242 (m/^quit$/ || m/^bye$/ || m/^exit$/) && do {
243 my $fh = $state{outfile}->{handle};
244 unless ($fh == \*STDOUT) {
245 close $fh;
247 $done=1;
248 next CMD;
250 m/^confirm$/ && do {
251 $state{confirm} = !$state{confirm};
252 print "User confirmation before running query: ". ($state{confirm} ? "yes" : "no")."\n";
253 next CMD;
255 (m/^outfile$/ || m/^out$/) && do {
256 unless ($tok[1]) {
257 error('Output filename not specified');
258 next CMD;
260 my $fh;
261 eval {
262 open $fh, ">$tok[1]" or die $!;
264 if ($@) {
265 error($@);
267 else {
268 my $oldfh = $state{outfile}->{handle};
269 if ($state{outfile}->{text} ne '[stdout]') {
270 close($oldfh);
272 $state{outfile}->{handle} = $fh;
273 $state{outfile}->{text} = $tok[1];
275 next CMD;
277 m/^query$/ && do {
278 my $query = join(' ', @tok[1..$#tok]);
279 if ($query) {
280 $q->_reset;
281 $q->query($query);
283 else {
284 if ($state{query}) {
285 print "Current query: $state{query}\n";
287 else {
288 print "No query set\n";
290 next CMD;
292 try { #catch errors
293 $q->_do_query(0);
295 catch Error with {
296 my $E = shift;
297 error($E->text);
298 next CMD;
300 $state{session_id} = $q->_session_id;
301 $state{query} = $query;
302 next CMD;
304 (m/^prerun$/ || m/^count$/) && do {
305 my $query = join(' ', @tok[1..$#tok]);
306 if (!$query) {
307 if (!$q->query()) {
308 error('No query currently set');
309 next CMD;
311 elsif ($q->{'_RUN_LEVEL'} >= 1 &&
312 ($q->query() eq $state{query})) {
313 outputPrerun();
314 next CMD;
316 # else, fallthrough to prerun current query
318 else { # new query
319 $q->_reset;
320 $q->query($query);
321 try {
322 $q->_do_query(0);
324 catch Error with {
325 my $E = shift;
326 error($E->text);
327 next CMD;
330 # on success:
331 $state{confirm} && do { next CMD unless getConfirm(); };
332 $state{session_id} = $q->_session_id;
333 $state{query} = $query;
334 try {
335 $q->_do_query(1);
337 catch Error with {
338 my $E = shift;
339 error($E->text);
340 next CMD;
342 # on success:
343 $state{curct} = $q->count;
344 $state{query} = $q->query;
345 outputPrerun();
346 next CMD;
349 (m/^run$/ || m/^do$/) && do {
350 my $query = join(' ', @tok[1..$#tok]);
351 if (!$query) {
352 unless (defined $q->{'_RUN_LEVEL'} && $q->{'_RUN_LEVEL'} < 2) {
353 error('No query currently set');
354 next CMD;
357 else { # new query
358 $q->_reset;
359 $q->query($query);
360 try {
361 $q->_do_query(0);
363 catch Error with {
364 my $E = shift;
365 error($E->text);
366 next CMD;
368 $state{query} = $query;
369 $state{curct} = 0;
370 $state{session_id} = $q->_session_id;
372 # on success:
373 $state{confirm} && do { next CMD unless getConfirm(); };
374 try {
375 $q->_do_query(2);
376 $seqio = $db->get_Stream_by_query($q);
378 catch Error with {
379 my $E = shift;
380 error($E->text);
381 next CMD;
383 # on success:
384 $state{curct} = $q->count;
385 # output the seqs
386 if ($q->count) {
387 outputSeqs($seqio);
389 else {
390 error('No sequences returned');
392 next CMD;
394 m/^find$/ && do {
395 outputFind(@tok);
396 next CMD;
398 m/^help$/ && do {
399 unless ($tok[1]) {
400 outputUsage();
401 next CMD;
403 if (grep(/$tok[1]/, @cmds)) {
404 print $help{$tok[1]}, "\n";
406 else {
407 error("Command \'$tok[1]\' unrecognized\n");
409 next CMD;
411 (m/^state$/ || m/^config$/) && do {
412 foreach my $k (sort keys %state) {
413 if (ref($state{$k}) eq 'HASH') {
414 print "$k: ".$state{$k}->{text}."\n";
416 else {
417 print "$k: ";
418 if ($k eq 'confirm') {
419 print $state{$k} ? "yes" : "no";
420 print "\n";
422 else {
423 print "$state{$k}\n";
427 next CMD;
429 do {
430 error("Command currently unimplemented\n");
431 next CMD;
437 ### end main
439 sub error {
440 my $msg = shift;
441 if ($msg =~ /MSG:/) {
442 ($msg) = grep (/^MSG:/, split(/\n|\r/,$msg));
443 $msg =~ s/^MSG: *//;
444 $msg =~ s/\sat\s.*$//;
446 print STDERR "hivq: $msg\n";
447 return 1;
450 sub outputPrerun {
451 print (($state{curct} ? $state{curct} : "No")
452 . " sequence"
453 . ($state{curct}>1 ? "s" : "")
454 . " returned\n");
455 print "Query: ".$state{query}."\n";
456 return 1;
459 sub outputSeqs {
460 my ($seqio) = @_;
461 my @ret;
462 while (my $seq = $seqio->next_seq) {
463 my $nameline = ">";
464 $nameline .= $seq->annotation->get_value('Special', 'accession').
465 '('.$seq->id.') ';
466 # loop through categories:
467 foreach my $cat ($seq->annotation->get_keys()) {
468 foreach my $an ($seq->annotation->get_keys($cat)) {
469 next if ($an eq 'accession');
470 my $value = $seq->annotation->get_value($cat, $an);
471 # next line: kludge to skip if there's an annotation
472 # object instead of a value (I believe this is a bug)
473 next if ref($value);
474 $nameline .= "\t".join('=', "'$an'", "'".$value."'");
477 push @ret, $nameline."\n";
478 push @ret, $seq->seq()."\n";
480 doOutputSeqs(@ret);
483 sub doOutputSeqs {
484 my @lines = @_;
485 if (!@lines) {
486 error("No sequences to output");
488 my $fh = $state{outfile}->{handle};
489 print $fh @lines;
490 print "Download complete\n";
491 return 1;
494 sub getConfirm {
495 print "Run query? [y/n]";
496 my $ans = <STDIN>;
497 ($ans =~ /^[yY]/) && do {return 1;};
498 return 0;
501 sub outputUsage {
502 print "Available commands:\n";
503 outputInColumns(\@cmds);
504 return 1;
507 sub outputFind {
508 my @tok = @_;
509 for my $arg ($tok[1]) {
510 !$arg && do {
511 print $help{find},"\n";
512 return;
514 ($arg =~ m/^tables$/) && do {
515 outputInColumns([$schema->tables]);
516 return;
518 ($arg =~ m/^fields$/) && do {
519 my $tbl = $tok[2];
520 if (!$tbl) {
521 outputInColumns([$schema->fields]);
523 else {
524 unless (grep /^$tbl$/, $schema->tables) {
525 error("Table \'$tbl\' not valid");
526 return;
528 outputInColumns([grep /^$tbl/, $schema->fields()]);
530 return;
532 ($arg =~ /^options$/) && do {
533 my $fld = $tok[2];
534 my @aliases = $schema->aliases($fld);
535 if (!@aliases) {
536 unless (grep /^$fld$/, $schema->aliases) {
537 error("Field \'$tok[2]\' not valid");
538 return;
540 foreach ($schema->fields) {
541 if ( grep /$fld/, $schema->aliases($_) ) {
542 $fld = $_;
543 last;
546 # on success: $fld is set to valid field
548 my @options = sort {$a cmp $b} $schema->options($fld);
549 @options = (@options ? @options : ('Free text'));
550 outputInColumns(\@options);
551 return;
553 do {
554 ($arg =~ /^alias/) && ($arg = $tok[2]);
555 if (grep /$arg/, $schema->fields) {
556 my @aliases = sort $schema->aliases($arg);
557 if (@aliases) {
558 outputInColumns(\@aliases);
560 else {
561 error("No aliases to field \'$arg\'");
564 else {
565 error("Field \'$arg\' not valid");
567 return;
575 sub outputInColumns {
576 my ($items, $n, $w) = @_;
577 my $i = 0;
578 my @items = @$items;
579 my $maxl = length([sort {length($b)<=>length($a)} @items]->[0]);
580 $n ||= int(60/($maxl+3)) || 1;
581 $w ||= $maxl+3;
582 my $coll = int(@items/$n);
583 $coll == @items/$n ? $coll : ++$coll;
584 my @t;
585 for my $j (0..$n-1) {
586 if ($j<$n-1) {
587 $t[$j] = [@items[$j*$coll..$j*$coll+$coll-1]];
589 else {
590 $t[$j] = [@items[$j*$coll..$#items]];
593 @items = map { my $j = $_; map { $t[$_]->[$j] || () } (0..$#t) } (0..scalar(@{$t[0]})-1);
594 foreach (@items) {
595 $_ .= (' ' x ($w-length($_)));
597 while ($i < @items) {
598 print join('', map { $_ || () } @items[$i..$i+$n-1] ),"\n";
599 $i+=$n;
601 return 1;
604 sub debug {
605 print STDERR shift()."\n" if $opt_v;
606 return;