From a5ac9e202d11b49638699d6d2a2a58ad2bedac12 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Carn=C3=AB=20Draug?= <carandraug+dev@gmail.com>
Date: Mon, 1 Oct 2018 01:51:30 +0100
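Subject: [PATCH] bin/bp_biofetch_genbank_proxy: move to Bio-DB-NCBIHelper

This patch covers only the removal side of the move: it deletes
bin/bp_biofetch_genbank_proxy from this repository.  Changes is updated
to list the removed program and to add CGI and Cache::FileCache (both
imported by the script) to its module list.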
---
 Changes                       |   3 +
 bin/bp_biofetch_genbank_proxy | 312 ------------------------------------------
 2 files changed, 3 insertions(+), 312 deletions(-)
 delete mode 100644 bin/bp_biofetch_genbank_proxy

diff --git a/Changes b/Changes
index 336e89cb2..8b52520ae 100644
--- a/Changes
+++ b/Changes
@@ -108,6 +108,7 @@ be removed.
 
     * The following programs have been removed:
 
+          bp_biofetch_genbank_proxy
           bp_bulk_load_gff
           bp_das_server
           bp_download_query_genbank
@@ -128,6 +129,8 @@ be removed.
 
           Bio::SeqIO::staden::read
           Bio::Tools::Run::Ensembl
+          CGI
+          Cache::FileCache
           Convert::Binary::C
           DBD::Pg
           DBD::SQLite
diff --git a/bin/bp_biofetch_genbank_proxy b/bin/bp_biofetch_genbank_proxy
deleted file mode 100644
index 6d1181e21..000000000
--- a/bin/bp_biofetch_genbank_proxy
+++ /dev/null
@@ -1,312 +0,0 @@
-#!/usr/bin/perl
-
-# dbfetch style caching proxy for GenBank
-use strict;
-use warnings;
-use CGI qw(:standard);
-use HTTP::Request::Common;
-use LWP::UserAgent;
-use Cache::FileCache;
-
-use vars qw(%GOT $BUFFER %MAPPING $CACHE);
-
-use constant CACHE_LOCATION => '/usr/tmp/dbfetch_cache';
-use constant MAX_SIZE   => 100_000_000;  # 100 megs, roughly
-use constant CACHE_DEPTH => 4;
-use constant EXPIRATION => "1 week";
-use constant PURGE      => "1 hour";
-
-%MAPPING = (genbank => {db=>'nucleotide',
-			rettype => 'gb'},
-	    genpep  => {db=>'protein',
-			rettype => 'gp'});
-# we're doing everything in callbacks, so initialize globals.
-$BUFFER = '';
-%GOT    = ();
-
-print header('text/plain');
-
-param() or print_usage();
-
-my $db     = param('db');
-my $style  = param('style');
-my $format = param('format');
-my $id     = param('id');
-my @ids    = split /\s+/,$id;
-
-$format = 'genbank' if $format eq 'default';  #h'mmmph
-
-$MAPPING{$db}        or error(1=>"Unknown database [$db]");
-$style  eq 'raw'     or error(2=>"Unknown style [$style]");
-$format eq 'genbank' or error(3=>"Format [$format] not known for database [$db]");
-
-$CACHE = Cache::FileCache->new({cache_root          => CACHE_LOCATION,
-				default_expires_in  => EXPIRATION,
-				cache_DEPTH         => CACHE_DEPTH,
-				namespace           => 'dbfetch',
-				auto_purge_interval => PURGE});
-
-# handle cached entries
-foreach (@ids) {
-  if (my $obj = $CACHE->get($_)) {
-    $GOT{$_}++;
-    print $obj,"//\n";
-  }
-}
-
-# handle the remainder
-@ids = grep {!$GOT{$_}} @ids;
-if (@ids) {
-  my $request = POST('https://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi',
-		     [rettype    => $MAPPING{$db}{rettype},
-		      db         => $MAPPING{$db}{db},
-		      tool       => 'bioperl',
-		      retmode    => 'text',
-		      usehistory => 'n',
-		      id         => join(',',@ids),
-		     ]
-		    );
-
-  my $ua = LWP::UserAgent->new;
-  my $response = $ua->request($request,\&callback);
-
-  if ($response->is_error) {
-    my $status = $response->status_line;
-    error(6 => "HTTP error from GenBank [$status]");
-  }
-}
-
-my @missing_ids = grep {!$GOT{$_}} @ids;
-foreach (@missing_ids) {
-  error(4=>"ID [$_] not found in database [$db]",1);
-}
-
-# my $response = $response->content;
-
-sub process_record {
-  my $record = shift;
-  print "$record//\n";
-  my ($locus)       = $record =~ /^LOCUS\s+(\S+)/m;
-  my ($accession)   = $record =~ /^ACCESSION\s+(\S+)/m;
-  my ($version,$gi) = $record =~ /^VERSION\s+(\S+)\s+GI:(\d+)/m;
-  foreach ($locus,$accession,$version,$gi) {
-    $GOT{$_}++;
-    $CACHE->set($_,$record);
-  }
-}
-
-sub callback {
-  my $data = shift;
-  $BUFFER .= $data;
-  my $index = 0;
-  while (($index = index($BUFFER,"//\n\n",$index))>=0) {
-    my $record = substr($BUFFER,0,$index);
-    $index += length("//\n\n");
-    substr($BUFFER,0,$index) = '';
-    process_record($record);
-  }
-}
-
-
-
-sub print_usage {
-  print <<'END';
-This script is intended to be used non-interactively.
-
-Brief summary of arguments:
-URL
-
-This interface does not specify what happens when biofetch is called
-in interactive context. The implementations can return the entries
-decorated with HTML tags and hypertext links.
-
-A URL for biofetch consists of four sections:
-
-			e.g.
-1. protocol		http://
-2. host			www.ebi.ac.uk
-3. path to program	/Tools/dbfetch/dbfetch
-4. query string		?style=raw;format=embl;db=embl;id=J00231
-
-
-QUERY STRING
-
-The query string options are separated from the base URL (protocol +
-host + path) by a question mark (?) and from each other by a semicolon
-';' (or by ampersand '&'). See CGI GET documents at
-http://www.w3.org/CGI/). The order of options is not critical. It is
-recommended to leave the ID to be the last item.
-
-Input for options should be case insensitive.
-
-
-option: db
-
-  Option  : db
-  Descr   : database name
-  Type    : required
-  Usage   : db=genpep | db=genbank
-  Arg     : string 
-
-Currently this server accepts "genbank" and "genpep"
-
-option: style
-
-  Option  : style
-  Descr   : +/- HTML tags
-  Type    : required
-  Usage   : style=raw | db=html
-  Arg     : enum (raw|html)
-
-In non-interactive context, always give "style=raw". This uses
-"Content-Type: text/plain". If other content types are needed (XML),
-this part of the spesifications can be extended to accommodate them.
-
-This server only accepts "raw".
-
-
-option: format
-
-  Option  : format
-  Descr   : format of the database entries returned
-  Type    : optional
-  Usage   : format=genbank
-  Arg     : enum
-
-Format defaults to the distribution format of the database (embl for
-EMBL database). If some other supported format is needed this option
-is needed (E.g. formats for EMBL: fasta, bsml, agave).
-
-This server only accepts "genbank" format.
-
-option: id
-
-  Option  : id
-  Descr   : unique database identifier(s)
-  Type    : required
-  Usage   : db=J00231 | id=J00231+HSFOS
-  Arg     : string 
-
-The ID option should be able to process all UIDS in a database. It
-should not be necessary to know if the UID is an ID, accession number
-or accession.version.
-
-The number of entry UIDs allowed is implementation specific. If the
-limit is exceeded, the the program reports an error. The UIDs should
-be separated by spaces (use '+' in a GET method string).
-
-
-ERROR MESSAGES
-
-The following standardized one line messages should be printed out in
-case of an error.
-
-ERROR 1 Unknown database [$db].
-ERROR 2 Unknown style [$style].
-ERROR 3 Format [$format] not known for database [$db].
-ERROR 4 ID [$id] not found in database [$db].
-ERROR 5 Too many IDs [$count]. Max [$MAXIDS] allowed.
-
-END
-;
-
-exit 0;
-}
-
-sub error {
-  my ($code,$message,$noexit) = @_;
-  print "ERROR $code $message\n";
-  exit 0 unless $noexit;
-}
-
-__END__
-
-=head1 NAME
-
-bp_biofetch_genbank_proxy.pl - Caching BioFetch-compatible web proxy for GenBank
-
-=head1 SYNOPSIS
-
-  Install in cgi-bin directory of a Web server.  Stand back.
-
-=head1 DESCRIPTION
-
-This CGI script acts as the server side of the BioFetch protocol as
-described in http://obda.open-bio.org/Specs/.  It provides two
-database access services, one for data source "genbank" (nucleotide
-entries) and the other for data source "genpep" (protein entries).
-
-This script works by forwarding its requests to NCBI's eutils script,
-which lives at https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi.
-It then reformats the output according to the BioFetch format so the
-sequences can be processed and returned by the Bio::DB::BioFetch
-module.  Returned entries are temporarily cached on the Web server's
-file system, allowing frequently-accessed entries to be retrieved
-without another round trip to NCBI.
-
-=head2 INSTALLATION
-
-You must have the following installed in order to run this script:
-
-   1) perl
-   2) the perl modules LWP and Cache::FileCache
-   3) a web server (Apache recommended)
-
-To install this script, copy it into the web server's cgi-bin
-directory.  You might want to shorten its name; "dbfetch" is
-recommended.
-
-There are several constants located at the top of the script that you
-may want to adjust.  These are:
-
-CACHE_LOCATION
-
-This is the location on the filesystem where the cached files will be
-located.  The default is /usr/tmp/dbfetch_cache.
-
-MAX_SIZE
-
-This is the maximum size that the cache can grow to.  When the cache
-exceeds this size older entries will be deleted automatically.  The
-default setting is 100,000,000 bytes (100 MB).
-
-EXPIRATION
-
-Entries that haven't been accessed in this length of time will be
-removed from the cache.  The default is 1 week.
-
-PURGE
-
-This constant specifies how often the cache will be purged for older
-entries.  The default is 1 hour.
-
-=head1 TESTING
-
-To see if this script is performing as expected, you may test it with
-this script:
-
- use Bio::DB::BioFetch;
- my $db = Bio::DB::BioFetch->new(-baseaddress=>'http://localhost/cgi-bin/dbfetch',
-	 			 -format     =>'genbank',
-				 -db         =>'genbank');
- my $seq = $db->get_Seq_by_id('DDU63596');
- print $seq->seq,"\n";
-
-This should print out a DNA sequence.
-
-=head1 SEE ALSO
-
-L<Bio::DB::BioFetch>, L<Bio::DB::Registry>
-
-=head1 AUTHOR
-
-Lincoln Stein, E<lt>lstein-at-cshl.orgE<gt>
-
-Copyright (c) 2003 Cold Spring Harbor Laboratory
-
-This library is free software; you can redistribute it and/or modify
-it under the same terms as Perl itself.  See DISCLAIMER.txt for
-disclaimers of warranty.
-
-=cut
-
-- 
2.11.4.GIT