Bio::DB::Universal: move into its own distribution
[bioperl-live.git] / examples / db / dbfetch
blobb0dc7458aee924353ad4e64c796d5292bc181b53
1 #!/usr/local/bin/perl -- # -*-Perl-*-
3 =head1 NAME
5 dbfetch - generic CGI program to retrieve biological database entries
6 in various formats and styles (using SRS)
8 =head1 SYNOPSIS
10 # URL examples:
12 # prints the interactive page with the HTML form
13 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch
15 # for backward compatibility, implements <ISINDEX>
16 # single entry queries defaulting to EMBL sequence database
17 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?J00231
19 # retrieves one or more entries in default format
20 # and default style (html)
21 # returns nothing for IDs which are not valid
22 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?id=J00231.1,hsfos,bum
24 # retrieve entries in fasta format without html tags
25 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?format=fasta&style=raw&id=J00231,hsfos,bum
27 # retrieve a raw Ensembl entry
28 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ensembl&style=raw&id=AL122059
31 =head1 DESCRIPTION
33 This program generates a page allowing a web user to retrieve database
34 entries from a local SRS in two styles: html and raw. Other
35 database engines can be used to implement te same interfase.
37 At this stage, on unique identifier queries are supported. Free text
38 searches returning more than one entry per query term are not in these
39 specs.
41 In its default setup, type one or more EMBL accession numbers
42 (e.g. J00231), entry name (e.g. HSFOS) or sequence version into the
43 seach dialog to retieve hypertext linked enties.
45 Note that for practical reasons only the first 50 identifiers
46 submitted are processed.
48 Additional input is needed to change the sequence format or suppress
49 the HTML tags. The styles are html and raw. In future there might be
50 additional styles (e.g. xml). Currently XML is a 'raw' format used by
51 Medline. Each style is implemented as a separate subroutine.
53 =head1 MAINTANENCE
55 A new database can be added simply by adding a new entry in the global hash
56 %IDS. Additionally, if the database defines new formats add an entry for
57 each of them into the hash %IDMATCH. After modifying the hash, run this
58 script from command line for some sanity checks with parameter debug set to
59 true (e.g. dbfetch debug=1 ).
61 Finally, the user interface needs to be updated in the L<print_prompt>
62 subroutine.
64 =head1 VERSIONS
66 Version 3 uses EBI SRS server 6.1.3. That server is able to merge release
67 and update libraries automatically which makes this script simpler. The
68 other significant change is the way sequence versions are indexed. They
69 used to be indexed together with the string accession
70 (e.g. 'J00231.1'). Now they are indexed as integers (e.g. '1').
72 Version 3.1 changes the command line interface. To get the debug
73 information use attribute 'debug' set to true. Also, it uses File::Temp
74 module to create temporary files securely.
76 Version 3.2 fixes fasta format parsing to get the entry id.
78 Version 3.3. Adds RefSeq to the database list.
80 Version 3.4. Make this compliant to BioFetch specs.
82 =head1 AUTHOR - Heikki Lehvaslaiho
84 Email: heikki-at-bioperl-dot-org
86 =cut
88 # Let the code begin...
90 $VERSION = '3.4';
91 $DATE = '28 Jan 2002';
93 use CGI "standard";
94 #use POSIX;
95 use CGI::Carp qw/ fatalsToBrowser /;
96 use File::Temp qw/ tempfile tempdir /;
97 use strict;
98 no strict "refs";
100 use constant MAXIDS => 50;
101 use constant TMPDIR => '/usr/tmp';
103 use vars qw( $VERSION $DATE %DBS %STYLES $RWGETZ $RGETZ %IDMATCH %IDLIST $XEMBL $FH );
105 BEGIN {
107 # paths to SRS binaries
108 $RWGETZ = '/ebi/srs/srs/bin/osf_5/wgetz -e';
109 $RGETZ = '/ebi/srs/srs/bin/osf_5/getz -e';
110 $XEMBL = "cd /ebi/www/pages/cgi-bin/xembl/; ./XEMBL.pl";
111 #$EMBOSSDIR = '/ebi/services/pkgs/emboss/bin';
113 # RE matching the unique ID in the db entry
114 # - key is the
115 # - put the id string in parenthesis
116 %IDMATCH = ( # 123
117 embl => 'ID (\w+)',
118 fasta => '>\w+.(\w+)',
119 medlinefull => '[\n><]MedlineID. ?(\w+)',
120 swissprot => 'ID (\w+)',
121 pdb => '.{62}(\w+)',
122 bsml => 'DUMMY',
123 agave => 'DUMMY',
124 refseq => 'LOCUS ([\w_]+)'
126 %DBS = (
127 embl => {
128 fields => ['id', 'acc'],
129 version => 'sv', # name of the SRS field
130 format => {
131 default => 'embl',
132 embl => 1,
133 fasta => 'FastaSeqs',
134 bsml => 1,
135 agave => 1
138 medline => {
139 fields => ['id'],
140 format => {
141 default => 'medlinefull',
142 # medlineref => 'MedlineRef',
143 medlinefull => 'MedlineFull'
146 ensembl => {
147 fields => ['id'],
148 format => {
149 default => 'embl',
150 embl => 1,
151 fasta => 'FastaSeqs'
154 swall => {
155 fields => ['id', 'acc'],
156 format => {
157 default => 'swissprot',
158 swissprot => 1,
159 fasta => 'FastaSeqs'
162 pdb => {
163 fields => ['id'],
164 format => {
165 default => 'pdb',
166 pdb => '1'
169 refseq => {
170 fields => ['id', 'acc'],
171 format => {
172 default => 'refseq',
173 refseq => 1,
174 fasta => 'FastaSeqs'
177 #add more databases here...
180 %STYLES = (
181 html => 1,
182 raw => 1
185 %IDLIST = (); #redundancy check list built during the execution
188 my $q = new CGI;
190 # sanity checks if the script is running from command line
191 # and debug parameter is set.
192 my $debug = protect($q->param('debug')) if $q->param('debug');
193 &debugging if not $q->user_agent and $debug;
195 if ( $q->param('id') or $q->param('keywords') ) {
197 # pacify input strings
198 my $value;
199 $value = protect($q->param('id')) if $q->param('id');
200 $value = protect($q->param('keywords')) if $q->param('keywords');
201 my $db = lc protect($q->param('db')); # let's keep the case lower
202 my $format = lc protect($q->param('format'));
203 my $style = lc protect($q->param('style'));
205 # check input and set defaults
206 $style ||= 'html'; # default style
207 input_error($q, $style, "2 Unknown style [$style].") unless $STYLES{$style};
209 $db ||= 'embl'; # default db
210 input_error($q, $style, "1 Unknown database [$db].") unless $DBS{$db};
212 $format ||= $DBS{$db}{format}{default}; # default format
213 input_error($q, $style, "3 Format [$format] not known for database [$db]")
214 unless $DBS{$db}{format}{$format};
215 $format = $DBS{$db}{format}{default} if $format eq 'default';
218 # If people choose Bsml or AGAVE, DB can only be 'embl'
219 input_error($q, $style, "1 Unknown database [$db].")
220 if ($format eq 'bsml' or $format eq 'agave') and $db ne 'embl';
222 # If people choose Bsml or AGAVE, internal style has to be xml . Make it so.
223 $style = ($format =~ /(bsml|agave)/i) ? 'xml' : $style;
225 if ($style eq 'html') {
226 print $q->header(-type => 'text/html', -charset => 'UTF-8');
228 elsif ($style eq 'raw') {
229 print "Content-Type: text/plain; charset=UTF-8\n\n";
231 $FH = tempfile('dbfetchXXXXXX', DIR => TMPDIR, UNLINK => 1 ); #automatic unlinking
233 # Check the number of IDs
234 my @ids = split (/ /, $value);
235 input_error($q, $style, "6 Too many IDs [". scalar @ids. "]. Max [". MAXIDS. "] allowed.")
236 if scalar @ids > MAXIDS;
238 # XEMBL cannot 'glue' single entries due to XML setup
239 #- we need to send things in one go.
240 if ($style eq 'xml') {
241 &xml($format, @ids);
242 } else {
243 my $counter;
244 foreach my $id (@ids) {
245 &$style($db, $id, $format);
247 no_entries($q, $value) if $style eq 'html' and tell($FH) == 0;
249 seek $FH, 0, 0;
250 print '<pre>' if $style eq 'html';
251 print $_ while <$FH>;
252 } else {
253 print_prompt($q);
257 =head2 print_prompt
259 Title : print_prompt
260 Usage :
261 Function: Prints the default page with the query form
262 to STDOUT (Web page)
263 Args :
264 Returns :
266 =cut
268 sub print_prompt {
269 print $q->header(),
270 $q->start_html(-title => 'DB Entry Retrieval',
271 -bgcolor => 'white',
272 -author => 'heikki-at-bioperl-dot-org'
274 '<IMG align=middle SRC="/icons/ebibanner.gif">',
275 $q->h1('Generic DB Entry Retrieval'),
276 $q->p("This page allows you to retrieve up to ". MAXIDS .
277 " entries at the time from various up-to-date biological databases."),
278 $q->p("For EMBL, enter an accession number (e.g. J00231) or entry name (e.g.
279 HSFOS) or a sequence version (e.g. J00231.1), or any combination of them
280 separated by a non-word character into your browser's search dialog.
281 SWALL examples are: fos_human, p53_human.
282 For short Ensembl entries, try : AL122059, AL031002, AL031030 .
283 'Random' Medline entry examples are: 20063307, 98276153.
284 PDB entry examples are: 100D, 1FOS. Try NM_006732 for RefSeq.
285 Only one copy of the latest version of the entry is returned."),
286 $q->hr,
287 $q->startform,
288 $q->popup_menu(-name => 'db',
289 -values => ['EMBL',
290 'SWALL',
291 'PDB',
292 'Medline',
293 'Ensembl',
294 'RefSeq'
296 $q->textfield(-name => 'id',
297 -size => 40,
298 -maxlength => 1000),
299 $q->popup_menu(-name => 'format',
300 -values => ['default','Fasta','bsml','agave']),
301 $q->popup_menu(-name => 'style',
302 -values => ['html','raw']),
303 $q->submit('Retrieve'),
304 $q->endform,
305 $q->hr,
306 $q->h2('Direct access'),
307 $q->p('For backward compatibility, the script defaults to EMBL:'),
308 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?J00231">
309 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?J00231</a>'),
310 $q->p('but the preferred way of calling it is:'),
311 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?id=J00231.1,hsfos,bum">
312 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?id=J00231.1,hsfos,bum</a>'),
313 $q->p('which can be extended to retrieve entries in alternative sequence formats
314 and other databases:'),
315 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=swall&format=fasta&id=fos_human">
316 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=swall&format=fasta&id=fos_human</a>'),
317 $q->p('Set style to <code>raw</code> to retrieve plain text entries for computational purposes
318 and saving to disk:'),
319 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=medline&style=raw&id=21131735">
320 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=medline&style=raw&id=21131735</a>'),
321 $q->p('There is now the possibility to retrieve EMBL sequences formatterd into two XML standards:
322 Bsml (Bioinformatic Sequence Markup Language - from
323 Labbook, Inc.) or as AGAVE (Architecture for Genomic Annotation,
324 Visualisation, and Exchange - from Labbook, Inc.). To do this, use the
325 formats \'bsml\' or \'agave\', as follows:'),
326 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?format=bsml&id=J00231">
327 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?format=bsml&id=J00231</a><br>'),
328 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?format=agave&id=J00231">
329 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?format=agave&id=J00231</a>'),
330 $q->p("Version numbers are not supported with the XML retrieval."),
331 $q->hr,
332 $q->address("Version $VERSION, $DATE, <a href=\"mailto:support\@ebi.ac.uk\">support\@ebi.ac.uk</a>"),
333 $q->end_html, "\n" ;
336 =head2 protect
338 Title : protect
339 Usage : $value = protect($q->param('id'));
340 Function:
342 Removes potentially dangerous characters from the input
343 string. At the same time, converts word separators into a
344 single space character.
346 Args : scalar, string with one or more IDs or accession numbers
347 Returns : scalar
349 =cut
351 sub protect {
352 my ($s) = @_;
353 $s =~ s![^\w\.\_]+! !g; # allow version numbers with '.' & RefSeq IDs with '_'
354 $s =~ s|^\W+||;
355 $s =~ s|\W+$||;
356 return $s;
359 =head2 input_error
361 Title : input_error
362 Usage : input_error($q, 'html', "Error message");
363 Function: Standard error message behaviour
364 Args : reference to the CGI object
365 scalar, string to display on input error.
366 Returns : scalar
368 =cut
370 sub input_error {
371 my ($q, $style, $s) = @_;
373 if ($style eq 'html' ) {
374 print $q->header,
375 $q->start_html(-title => 'DB Entry Retrieval: Input error',
376 -bgcolor => 'white'
378 "<h2>ERROR in input:<h2>$s\n",
379 $q->end_html, "\n";
380 } else {
381 print "Content-type: text/plain\n\n", "ERROR $s\n";
383 exit 0;
386 =head2 no_entries
388 Title : no_entries
389 Usage : no_entries($q, "Message");
390 Function: Standard behaviour when no entries found
391 Args : reference to the CGI object
392 scalar, string to display on input error.
393 Returns : scalar
395 =cut
397 sub no_entries {
398 my ($q, $value) = @_;
400 print $q->start_html(-title => 'DB Entry Retrieval: Input warning',
401 -bgcolor => 'white'
403 "<h2>Sorry, your query retrieved no entries.</h2>",
404 "Entries with [$value] where not found.",
405 "Please go back or press <a href=\"dbfetch\"><b>here</b></a> to try again",
406 $q->end_html, "\n";
407 exit 0;
411 =head2 raw
413 Title : raw
414 Usage :
415 Function: Retrieves a single database entry in plain text
416 Args : scalar, an ID
417 scaler, format
418 Returns : scalar
420 =cut
422 sub raw {
423 my ($db, $value, $format) = @_;
424 my ($srsq, $qdb, $entry, $id);
425 my ($seqformat) = '';
426 $seqformat = '-view '. $DBS{$db}{format}{$format}
427 if $format ne $DBS{$db}{format}{default};
429 my $version = '';
430 $value =~ /(.+)\.(.+)/;
431 $version = $2 if $2;
432 $value = $1 if $1;
434 # main db
435 $qdb = $db;
436 $srsq = '';
437 foreach my $field (@{$DBS{$db}{fields}}) {
438 $srsq .= " [$qdb-$field:$value] |";
440 chop $srsq;
442 # if database supports versions (EMBL, GenBank, RefSeq...)
443 if ($version) {
444 my $vfname = $DBS{$db}{version};
445 $srsq = "[$qdb-$vfname:$version] & (". $srsq. ")"
448 # print "rsh srs $RGETZ $seqformat $srsq\n";
449 $entry = `rsh srs "$RGETZ $seqformat '$srsq'"`;
451 $entry =~ s|EMBL[^\n]+\n||;
452 $entry =~ s|^\s+||g;
453 $entry =~ s|\s+$|\n|g;
455 my $idmatch = $IDMATCH{$format};
456 ($id) = $entry =~ /$idmatch/;
457 # die if ID not found
458 input_error(' ', 'raw', "5 ID [$value] not found in database [$db].")
459 unless $id;
461 # my $tmp = substr($entry, 0, 20);
462 # print "Entry:$tmp\n";
463 # print "-----id=$id---\$1=$1----idmatch=$idmatch=format=$format=\n";
465 print $FH $entry unless $IDLIST{$id};
466 $IDLIST{$id} = 1;
469 =head2 html
471 Title : html
472 Usage :
473 Function: Retrieves a single database entry with HTML
474 hypertext links in place. Limits retieved enties to
475 ones with correct version if the string has '.' in it.
476 Args : scalar, a UID
477 scalar, format
478 Returns : scalar
480 =cut
482 sub html {
483 my ($db, $value, $format) = @_;
484 my ($srsq, $qdb, $entry, $id, $idmatch);
485 my ($seqformat) = '';
486 $seqformat = '-view '. $DBS{$db}{format}{$format}
487 if $format ne $DBS{$db}{format}{default};
489 my $version = '';
490 $value =~ /(.+)\.(.+)/;
491 $version = $2 if $2;
492 $value = $1 if $1;
494 # SWALL plain format at EBI
495 $seqformat .= ' -vn 2 ' if $db eq 'swall' or $db eq 'refseq';
497 $qdb = $db;
498 $srsq = '';
499 foreach my $field (@{$DBS{$db}{fields}}) {
500 $srsq .= " [$qdb-$field:$value] |";
502 chop $srsq;
504 # if database supports versions (EMBL...)
505 if ($version) {
506 my $vfname = $DBS{$db}{version};
507 $srsq = "[$qdb-$vfname:$version] & (". $srsq. ")"
510 # print "rsh srs $RWGETZ $seqformat $srsq\n";
511 ### '-id EBISRS' is (hopefully) a temporary addtion until SRS HTML output is fixed
512 $entry = `rsh srs "$RWGETZ $seqformat '$srsq'"`;
514 return if $entry =~ /SRS error/;
516 $entry =~ s|^Content-type:[^\n]+\n||;
517 $entry =~ s|\n<A HREF[^\n]+\n||;
518 $entry =~ s|<A +HREF=\"?wgetz|<A HREF=http://srs6.ebi.ac.uk/srs6bin/cgi-bin/wgetz|g; #"\
519 $entry =~ s/\+-e\"/\+-e/g; #"
520 $entry =~ s|<BR>||g;
521 $entry =~ s|</?pre>||g;
522 $entry =~ s|\n+|\n|g;
523 $entry =~ s|^\n+||g;
525 $idmatch = $IDMATCH{$format};
526 ($id) = $entry =~ /$idmatch/;
528 # my $tmp = substr($entry, 0, 20);
529 # print "Entry:$tmp\n";
530 # print "-----id=$id---\$1=$1----idmatch=$idmatch=format=$format=\n";
531 print $FH $entry unless $IDLIST{$id};
532 $IDLIST{$id} = 1;
535 =head2 xml
537 Title : xml
538 Usage :
539 Function: Retrieves an entry formatted as XML
540 Args : array, UID
541 scalar, format
542 Returns : scalar
544 =cut
546 sub xml {
547 my ($format, @ids) = @_;
548 my ($entry, $id, $content, $counter, $reg);
550 $content = ($ENV{'HTTP_USER_AGENT'} =~ /MSIE/) ? "Content-type: text/xml\n\n" :
551 "Content-type: text/plain\n\n";
553 $entry = "--format ".(($format eq "bsml") ? "Bsml" : "sciobj") .
554 " " . join (" ", @ids);
556 $entry = `rsh mercury "$XEMBL $entry"`;
558 $reg = (($format eq "bsml") ? '<Sequence id=' : '<contig length-');
559 $counter++ while $entry =~ /($reg)/g;
561 foreach my $idl (@ids) {
562 input_error($q, " ", "5 ID [$idl] not found in database [embl].")
563 if ($format eq "bsml" && $entry =~ "NOT EXIST: $idl") ||
564 ($format eq "agave" && $entry =~ "NOT FOUND: $idl")
567 print $FH ($content . $entry);
570 =head2 debugging
572 Title : debugging
573 Usage : 'perl dbfetch'
574 Function:
576 Performs sanity checks on global hash %IDS when this script
577 is run from command line. %IDS holds the description of
578 formats and other crusial info for each database accessible
579 through the program.
581 Note that hash key 'version' is not tested as it should
582 only be in sequence databases.
584 Args : none
585 Returns : error messages to STDOUT
587 =cut
589 sub debugging {
591 foreach my $db (keys %DBS) {
592 my $status = 1;
594 # field
595 print "ERROR: [$db]: no SRS fields defined.".
596 " Give an array of field names?\n" and $status = 0
597 unless $DBS{$db}{fields};
598 print "ERROR: [$db]: SRS fields are not defined as an array.\n" and $status = 0
599 unless ref $DBS{$db}{fields} eq 'ARRAY';
601 # format
602 print "ERROR: [$db]: no formats defined.\n" and $status = 0
603 unless $DBS{$db}{format};
604 print "ERROR: [$db]: no default format defined.\n" and $status = 0
605 unless $DBS{$db}{format}{default};
606 my $format = $DBS{$db}{format}{default};
607 print "ERROR: [$db]: no format [$format] defined.".
608 " You declared it as a default and only.\n" and $status = 0
609 unless $DBS{$db}{format}{$format};
610 foreach my $dbformat (keys %{$DBS{$db}{format}}) {
611 print "ERROR: [$db]: format [$format] not defined in %IDMATCH.\n"
612 and $status = 0
613 unless $IDMATCH{$dbformat} or $dbformat eq 'default';
615 printf "%-12s%s", "[$db]", "OK\n" if $status;
617 exit;