1 #!/usr/local/bin/perl -- # -*-Perl-*-
5 dbfetch - generic CGI program to retrieve biological database entries
6 in various formats and styles (using SRS)
12 # prints the interactive page with the HTML form
13 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch
15 # for backward compatibility, implements <ISINDEX>
16 # single entry queries defaulting to EMBL sequence database
17 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?J00231
19 # retrieves one or more entries in default format
20 # and default style (html)
21 # returns nothing for IDs which are not valid
22 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?id=J00231.1,hsfos,bum
24 # retrieve entries in fasta format without html tags
25 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?format=fasta&style=raw&id=J00231,hsfos,bum
27 # retrieve a raw Ensembl entry
28 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=ensembl&style=raw&id=AL122059
33 This program generates a page allowing a web user to retrieve database
34 entries from a local SRS in two styles: html and raw. Other
35 database engines can be used to implement te same interfase.
37 At this stage, on unique identifier queries are supported. Free text
38 searches returning more than one entry per query term are not in these
41 In its default setup, type one or more EMBL accession numbers
42 (e.g. J00231), entry name (e.g. HSFOS) or sequence version into the
43 seach dialog to retieve hypertext linked enties.
45 Note that for practical reasons only the first 50 identifiers
46 submitted are processed.
48 Additional input is needed to change the sequence format or suppress
49 the HTML tags. The styles are html and raw. In future there might be
50 additional styles (e.g. xml). Currently XML is a 'raw' format used by
51 Medline. Each style is implemented as a separate subroutine.
55 A new database can be added simply by adding a new entry in the global hash
56 %IDS. Additionally, if the database defines new formats add an entry for
57 each of them into the hash %IDMATCH. After modifying the hash, run this
58 script from command line for some sanity checks with parameter debug set to
59 true (e.g. dbfetch debug=1 ).
61 Finally, the user interface needs to be updated in the L<print_prompt>
66 Version 3 uses EBI SRS server 6.1.3. That server is able to merge release
67 and update libraries automatically which makes this script simpler. The
68 other significant change is the way sequence versions are indexed. They
69 used to be indexed together with the string accession
70 (e.g. 'J00231.1'). Now they are indexed as integers (e.g. '1').
72 Version 3.1 changes the command line interface. To get the debug
73 information use attribute 'debug' set to true. Also, it uses File::Temp
74 module to create temporary files securely.
76 Version 3.2 fixes fasta format parsing to get the entry id.
78 Version 3.3. Adds RefSeq to the database list.
80 Version 3.4. Make this compliant to BioFetch specs.
82 =head1 AUTHOR - Heikki Lehvaslaiho
84 Email: heikki-at-bioperl-dot-org
88 # Let the code begin...
91 $DATE = '28 Jan 2002';
95 use CGI
::Carp qw
/ fatalsToBrowser /;
96 use File
::Temp qw
/ tempfile tempdir /;
100 use constant MAXIDS
=> 50;
101 use constant TMPDIR
=> '/usr/tmp';
103 use vars
qw( $VERSION $DATE %DBS %STYLES $RWGETZ $RGETZ %IDMATCH %IDLIST $XEMBL $FH );
107 # paths to SRS binaries
108 $RWGETZ = '/ebi/srs/srs/bin/osf_5/wgetz -e';
109 $RGETZ = '/ebi/srs/srs/bin/osf_5/getz -e';
110 $XEMBL = "cd /ebi/www/pages/cgi-bin/xembl/; ./XEMBL.pl";
111 #$EMBOSSDIR = '/ebi/services/pkgs/emboss/bin';
113 # RE matching the unique ID in the db entry
115 # - put the id string in parenthesis
118 fasta => '>\w+.(\w+)',
119 medlinefull => '[\n><]MedlineID. ?(\w+)',
120 swissprot => 'ID (\w+)',
124 refseq => 'LOCUS ([\w_]+)'
128 fields => ['id', 'acc'],
129 version => 'sv', # name of the SRS field
133 fasta => 'FastaSeqs',
141 default => 'medlinefull',
142 # medlineref => 'MedlineRef',
143 medlinefull => 'MedlineFull'
155 fields => ['id', 'acc'],
157 default => 'swissprot',
170 fields => ['id', 'acc'],
177 #add more databases here...
185 %IDLIST = (); #redundancy check list built during the execution
190 # sanity checks if the script is running from command line
191 # and debug parameter is set.
192 my $debug = protect($q->param('debug')) if $q->param('debug');
193 &debugging if not $q->user_agent and $debug;
195 if ( $q->param('id') or $q->param('keywords') ) {
197 # pacify input strings
199 $value = protect($q->param('id')) if $q->param('id');
200 $value = protect($q->param('keywords')) if $q->param('keywords');
201 my $db = lc protect($q->param('db')); # let's keep the case lower
202 my $format = lc protect($q->param('format'));
203 my $style = lc protect($q->param('style'));
205 # check input and set defaults
206 $style ||= 'html'; # default style
207 input_error($q, $style, "2 Unknown style [$style].") unless $STYLES{$style};
209 $db ||= 'embl'; # default db
210 input_error($q, $style, "1 Unknown database [$db].") unless $DBS{$db};
212 $format ||= $DBS{$db}{format}{default}; # default format
213 input_error($q, $style, "3 Format [$format] not known for database [$db]")
214 unless $DBS{$db}{format}{$format};
215 $format = $DBS{$db}{format}{default} if $format eq 'default';
218 # If people choose Bsml or AGAVE, DB can only be 'embl'
219 input_error($q, $style, "1 Unknown database [$db].")
220 if ($format eq 'bsml' or $format eq 'agave') and $db ne 'embl';
222 # If people choose Bsml or AGAVE, internal style has to be xml . Make it so.
223 $style = ($format =~ /(bsml|agave)/i) ? 'xml' : $style;
225 if ($style eq 'html') {
226 print $q->header(-type => 'text/html', -charset => 'UTF-8');
228 elsif ($style eq 'raw') {
229 print "Content-Type: text/plain; charset=UTF-8\n\n";
231 $FH = tempfile('dbfetchXXXXXX', DIR => TMPDIR, UNLINK => 1 ); #automatic unlinking
233 # Check the number of IDs
234 my @ids = split (/ /, $value);
235 input_error($q, $style, "6 Too many IDs [". scalar @ids. "]. Max [". MAXIDS. "] allowed.")
236 if scalar @ids > MAXIDS;
238 # XEMBL cannot 'glue' single entries due to XML setup
239 #- we need to send things in one go.
240 if ($style eq 'xml') {
244 foreach my $id (@ids) {
245 &$style($db, $id, $format);
247 no_entries($q, $value) if $style eq 'html' and tell($FH) == 0;
250 print '<pre>' if $style eq 'html';
251 print $_ while <$FH>;
261 Function: Prints the default page with the query form
270 $q->start_html(-title => 'DB Entry Retrieval',
272 -author => 'heikki-at-bioperl-dot-org'
274 '<IMG align=middle SRC="/icons/ebibanner.gif">',
275 $q->h1('Generic DB Entry Retrieval'),
276 $q->p("This page allows you to retrieve up to ". MAXIDS .
277 " entries at the time from various up-to-date biological databases."),
278 $q->p("For EMBL, enter an accession number (e.g. J00231) or entry name (e.g.
279 HSFOS) or a sequence version (e.g. J00231.1), or any combination of them
280 separated by a non-word character into your browser's search dialog.
281 SWALL examples are: fos_human, p53_human.
282 For short Ensembl entries, try : AL122059, AL031002, AL031030 .
283 'Random' Medline entry examples are: 20063307, 98276153.
284 PDB entry examples are: 100D, 1FOS. Try NM_006732 for RefSeq.
285 Only one copy of the latest version of the entry is returned."),
288 $q->popup_menu(-name => 'db',
296 $q->textfield(-name => 'id',
299 $q->popup_menu(-name => 'format',
300 -values => ['default','Fasta','bsml','agave']),
301 $q->popup_menu(-name => 'style',
302 -values => ['html','raw']),
303 $q->submit('Retrieve'),
306 $q->h2('Direct access'),
307 $q->p('For backward compatibility, the script defaults to EMBL:'),
308 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?J00231">
309 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?J00231</a>'),
310 $q->p('but the preferred way of calling it is:'),
311 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?id=J00231.1,hsfos,bum">
312 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?id=J00231.1,hsfos,bum</a>'),
313 $q->p('which can be extended to retrieve entries in alternative sequence formats
314 and other databases:'),
315 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=swall&format=fasta&id=fos_human">
316 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=swall&format=fasta&id=fos_human</a>'),
317 $q->p('Set style to <code>raw</code> to retrieve plain text entries for computational purposes
318 and saving to disk:'),
319 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=medline&style=raw&id=21131735">
320 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?db=medline&style=raw&id=21131735</a>'),
321 $q->p('There is now the possibility to retrieve EMBL sequences formatterd into two XML standards:
322 Bsml (Bioinformatic Sequence Markup Language - from
323 Labbook, Inc.) or as AGAVE (Architecture for Genomic Annotation,
324 Visualisation, and Exchange - from Labbook, Inc.). To do this, use the
325 formats \'bsml\' or \'agave\', as follows:'),
326 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?format=bsml&id=J00231">
327 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?format=bsml&id=J00231</a><br>'),
328 $q->code('<A href="http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?format=agave&id=J00231">
329 http://www.ebi.ac.uk/Tools/dbfetch/dbfetch?format=agave&id=J00231</a>'),
330 $q->p("Version numbers are not supported with the XML retrieval."),
332 $q->address("Version $VERSION, $DATE, <a href=\"mailto:support\@ebi.ac.uk\">support\@ebi.ac.uk</a>"),
339 Usage : $value = protect($q->param('id'));
342 Removes potentially dangerous characters from the input
343 string. At the same time, converts word separators into a
344 single space character.
346 Args : scalar, string with one or more IDs or accession numbers
353 $s =~ s![^\w\.\_]+! !g; # allow version numbers with '.' & RefSeq IDs with '_'
362 Usage : input_error($q, 'html', "Error message");
363 Function: Standard error message behaviour
364 Args : reference to the CGI object
365 scalar, string to display on input error.
371 my ($q, $style, $s) = @_;
373 if ($style eq 'html' ) {
375 $q->start_html(-title => 'DB Entry Retrieval: Input error',
378 "<h2>ERROR in input:<h2>$s\n",
381 print "Content-type: text/plain\n\n", "ERROR $s\n";
389 Usage : no_entries($q, "Message");
390 Function: Standard behaviour when no entries found
391 Args : reference to the CGI object
392 scalar, string to display on input error.
398 my ($q, $value) = @_;
400 print $q->start_html(-title => 'DB Entry Retrieval: Input warning',
403 "<h2>Sorry, your query retrieved no entries.</h2>",
404 "Entries with [$value] where not found.",
405 "Please go back or press <a href=\"dbfetch\"><b>here</b></a> to try again",
415 Function: Retrieves a single database entry in plain text
423 my ($db, $value, $format) = @_;
424 my ($srsq, $qdb, $entry, $id);
425 my ($seqformat) = '';
426 $seqformat = '-view '. $DBS{$db}{format}{$format}
427 if $format ne $DBS{$db}{format}{default};
430 $value =~ /(.+)\.(.+)/;
437 foreach my $field (@{$DBS{$db}{fields}}) {
438 $srsq .= " [$qdb-$field:$value] |";
442 # if database supports versions (EMBL, GenBank, RefSeq...)
444 my $vfname = $DBS{$db}{version};
445 $srsq = "[$qdb-$vfname:$version] & (". $srsq. ")"
448 # print "rsh srs $RGETZ $seqformat $srsq\n";
449 $entry = `rsh srs "$RGETZ $seqformat '$srsq'"`;
451 $entry =~ s|EMBL[^\n]+\n||;
453 $entry =~ s|\s+$|\n|g;
455 my $idmatch = $IDMATCH{$format};
456 ($id) = $entry =~ /$idmatch/;
457 # die if ID not found
458 input_error(' ', 'raw', "5 ID [$value] not found in database [$db].")
461 # my $tmp = substr($entry, 0, 20);
462 # print "Entry:$tmp\n";
463 # print "-----id=$id---\$1=$1----idmatch=$idmatch=format=$format=\n";
465 print $FH $entry unless $IDLIST{$id};
473 Function: Retrieves a single database entry with HTML
474 hypertext links in place. Limits retieved enties to
475 ones with correct version if the string has '.' in it.
483 my ($db, $value, $format) = @_;
484 my ($srsq, $qdb, $entry, $id, $idmatch);
485 my ($seqformat) = '';
486 $seqformat = '-view '. $DBS{$db}{format}{$format}
487 if $format ne $DBS{$db}{format}{default};
490 $value =~ /(.+)\.(.+)/;
494 # SWALL plain format at EBI
495 $seqformat .= ' -vn 2 ' if $db eq 'swall' or $db eq 'refseq';
499 foreach my $field (@{$DBS{$db}{fields}}) {
500 $srsq .= " [$qdb-$field:$value] |";
504 # if database supports versions (EMBL...)
506 my $vfname = $DBS{$db}{version};
507 $srsq = "[$qdb-$vfname:$version] & (". $srsq. ")"
510 # print "rsh srs $RWGETZ $seqformat $srsq\n";
511 ### '-id EBISRS' is (hopefully) a temporary addtion until SRS HTML output is fixed
512 $entry = `rsh srs "$RWGETZ $seqformat '$srsq'"`;
514 return if $entry =~ /SRS error/;
516 $entry =~ s|^Content-type:[^\n]+\n||;
517 $entry =~ s|\n<A HREF[^\n]+\n||;
518 $entry =~ s|<A +HREF=\"?wgetz|<A HREF=http://srs6.ebi.ac.uk/srs6bin/cgi-bin/wgetz|g; #"\
519 $entry =~ s/\+-e\"/\+-e/g; #"
521 $entry =~ s|</?pre>||g;
522 $entry =~ s|\n+|\n|g;
525 $idmatch = $IDMATCH{$format};
526 ($id) = $entry =~ /$idmatch/;
528 # my $tmp = substr($entry, 0, 20);
529 # print "Entry:$tmp\n";
530 # print "-----id=$id---\$1=$1----idmatch=$idmatch=format=$format=\n";
531 print $FH $entry unless $IDLIST{$id};
539 Function: Retrieves an entry formatted as XML
547 my ($format, @ids) = @_;
548 my ($entry, $id, $content, $counter, $reg);
550 $content = ($ENV{'HTTP_USER_AGENT'} =~ /MSIE/) ? "Content-type: text/xml\n\n" :
551 "Content-type: text/plain\n\n";
553 $entry = "--format ".(($format eq "bsml") ? "Bsml" : "sciobj") .
554 " " . join (" ", @ids);
556 $entry = `rsh mercury "$XEMBL $entry"`;
558 $reg = (($format eq "bsml") ? '<Sequence id=' : '<contig length-');
559 $counter++ while $entry =~ /($reg)/g;
561 foreach my $idl (@ids) {
562 input_error($q, " ", "5 ID [$idl] not found in database [embl].")
563 if ($format eq "bsml" && $entry =~ "NOT EXIST: $idl") ||
564 ($format eq "agave" && $entry =~ "NOT FOUND: $idl")
567 print $FH ($content . $entry);
573 Usage : 'perl dbfetch'
576 Performs sanity checks on global hash %IDS when this script
577 is run from command line. %IDS holds the description of
578 formats and other crusial info for each database accessible
581 Note that hash key 'version' is not tested as it should
582 only be in sequence databases.
585 Returns : error messages to STDOUT
591 foreach my $db (keys %DBS) {
595 print "ERROR: [$db]: no SRS fields defined.".
596 " Give an array of field names?\n" and $status = 0
597 unless $DBS{$db}{fields};
598 print "ERROR: [$db]: SRS fields are not defined as an array.\n" and $status = 0
599 unless ref $DBS{$db}{fields} eq 'ARRAY';
602 print "ERROR: [$db]: no formats defined.\n" and $status = 0
603 unless $DBS{$db}{format};
604 print "ERROR: [$db]: no default format defined.\n" and $status = 0
605 unless $DBS{$db}{format}{default};
606 my $format = $DBS{$db}{format}{default};
607 print "ERROR: [$db]: no format [$format] defined.".
608 " You declared it as a default and only.\n" and $status = 0
609 unless $DBS{$db}{format}{$format};
610 foreach my $dbformat (keys %{$DBS{$db}{format}}) {
611 print "ERROR: [$db]: format [$format] not defined in %IDMATCH.\n"
613 unless $IDMATCH{$dbformat} or $dbformat eq 'default';
615 printf "%-12s%s", "[$db]", "OK\n" if $status;