maint: restructure to use Dist::Zilla
[bioperl-live.git] / lib / Bio / DB / SeqVersion / gi.pm
blob93741e5a52a7333ac06b60dea825938364d42350
2 # BioPerl module for Bio::DB::SeqVersion::gi
4 # Please direct questions and support issues to <bioperl-l@bioperl.org>
6 # Cared for by Brian Osborne
8 # Copyright Brian Osborne 2006
10 # You may distribute this module under the same terms as Perl itself
12 # POD documentation - main docs before the code
14 =head1 NAME
16 Bio::DB::SeqVersion::gi - interface to NCBI Sequence Revision History page
18 =head1 SYNOPSIS
20 Do not use this module directly, use Bio::DB::SeqVersion.
22 use Bio::DB::SeqVersion;
24 my $query = Bio::DB::SeqVersion->new(-type => 'gi');
26 # all GIs, which will include the GI used to query
27 my @all_gis = $query->get_all(2);
29 # the most recent GI, which may or may not be the GI used to query
30 my $live_gi = $query->get_recent(2);
32 # get all the visible data on the Sequence Revision page
33 my $array_ref = $query->get_history(11111111);
35 These methods can also take accession numbers as arguments, just like
36 the Sequence Revision page itself.
38 =head1 DESCRIPTION
40 All sequence entries at GenBank are identified by a pair of
41 identifiers, an accession and a numeric identifier, and this number is
42 frequently called a GI number (B<G>enInfo B<I>dentifier). The accession
43 is stable, but each new version of the sequence entry for the accession
44 receives a new GI number (see L<https://www.ncbi.nlm.nih.gov/genbank/sequenceids/#historical_note>
45 for more information on GenBank identifiers). One accession
46 can have one or more GI numbers and the highest of these is the most recent,
47 or "live", GI.
49 Information on an accession and its associated GI numbers is available at
50 the Sequence Revision History page at NCBI,
51 L<https://www.ncbi.nlm.nih.gov/genbank/sequencerevisionhistory/>; this information is
52 not available in file format. This module queries the Web page and retrieves GI
53 numbers and related data given an accession (e.g. NP_111111, A11111, P12345) or
54 a GI number (e.g. 2, 11111111) as query.
56 =head1 FEEDBACK
58 =head2 Mailing Lists
60 User feedback is an integral part of the evolution of this and other
61 Bioperl modules. Send your comments and suggestions preferably to one
62 of the Bioperl mailing lists. Your participation is much appreciated.
64 bioperl-l@bioperl.org - General discussion
65 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
67 =head2 Support
69 Please direct usage questions or support issues to the mailing list:
71 I<bioperl-l@bioperl.org>
73 rather than to the module maintainer directly. Many experienced and
74 responsive experts will be able to look at the problem and quickly
75 address it. Please include a thorough description of the problem
76 with code and data examples if at all possible.
78 =head2 Reporting Bugs
80 Report bugs to the Bioperl bug tracking system to help us keep track
81 of the bugs and their resolution. Bug reports can be submitted via the
82 web:
84 https://github.com/bioperl/bioperl-live/issues
86 =head1 AUTHOR - Brian Osborne
88 Email E<lt> osborne at optonline dot net E<gt>
90 =head1 CONTRIBUTORS
92 Torsten Seemann - torsten.seemann AT infotech.monash.edu.au
94 =head1 APPENDIX
96 The rest of the documentation details each of the object
97 methods. Internal methods are usually preceded with a _
99 =cut
101 # Let the code begin...
103 package Bio::DB::SeqVersion::gi;
104 use strict;
105 use Encode;
106 use HTML::TableExtract;
107 use base qw(Bio::DB::SeqVersion);
# Private class variables

# sprintf() template for the NCBI revision-history report; the single %s
# placeholder is filled with the identifier being queried.
# TODO: this may be an unstable setting (text is actually minimal XHTML)
my $URL =
    'https://www.ncbi.nlm.nih.gov/nuccore/%s?report=girevhist&format=text';
114 =head2 new
116 Title : new
117 Usage : $gb = Bio::DB::SeqVersion::gi->new
118 Function: Creates a new query object
119 Returns : New query object
121 =cut
# Constructor: delegates object creation to the parent class, runs this
# object's _initialize() hook, and hands back the new query object.
sub new {
    my $class = shift;

    # Remaining arguments are forwarded untouched to the parent constructor.
    my $self = $class->SUPER::new(@_);
    $self->_initialize();

    return $self;
}
130 =head2 get_all
132 Title : get_all
133 Usage : my @gis = $q->get_all(2)
134 Function: Get all GI numbers given a GI number
135 Returns : An array of GI numbers in the order given by get_history, so the most recent GI number is the 0 element
136 Args : A single GI number (string)
138 =cut
# Return the GI number (column 0) of every row in the revision history
# for $id, in the same order as the rows returned by get_history().
#
# Args    : $id - a single identifier (string)
# Returns : list of GI numbers (the element count in scalar context)
#
# The history stored by the previous get_history() call is reused when it
# was fetched for the same identifier; otherwise a new query is made.
sub get_all {
    my ( $self, $id ) = @_;

    # NOTE: this used to be written as
    #   COND ? $ref = CACHED : $ref = FETCH;
    # which Perl parses as (COND ? ($ref = CACHED) : $ref) = FETCH, so the
    # fetch was always executed and the cache was never used. Decide
    # first, then assign exactly once.
    my $is_cached = defined $id
        && defined $self->{_last_id}
        && $id eq $self->{_last_id};

    my $history = $is_cached
        ? $self->{_last_result}
        : $self->get_history($id);

    my @gis = map { $_->[0] } @{ $history || [] };
    return @gis;
}
152 =head2 get_recent
154 Title : get_recent
155 Usage : my $newest_gi = $q->get_recent(2)
156 Function: Get most recent GI given a single GI
157 Returns : String
158 Args : A single GI number (string)
160 =cut
# Return the GI number held in the first row of the revision history for
# $id (get_history() documents row 0 as the most recent version).
#
# Args    : $id - a single identifier (string)
# Returns : GI number (string), or undef when the history has no rows
#
# The history stored by the previous get_history() call is reused when it
# was fetched for the same identifier; otherwise a new query is made.
sub get_recent {
    my ( $self, $id ) = @_;

    # NOTE: this used to be written as
    #   COND ? $ref = CACHED : $ref = FETCH;
    # which Perl parses as (COND ? ($ref = CACHED) : $ref) = FETCH, so the
    # fetch was always executed and the cache was never used. Decide
    # first, then assign exactly once.
    my $is_cached = defined $id
        && defined $self->{_last_id}
        && $id eq $self->{_last_id};

    my $history = $is_cached
        ? $self->{_last_result}
        : $self->get_history($id);

    return $history->[0]->[0];
}
171 =head2 get_status
173 Title : get_status
174 Usage : my $status = $q->get_status(2)
175 Function: Get the current status of the sequence record for a single GI
176 Returns : String
177 Args : A single GI number (string)
179 =cut
# Return the status string recorded for $id.
#
# Args    : $id - a single identifier (string); required
# Returns : the status stored by the most recent get_history() call
# Throws  : if no identifier is passed
sub get_status {
    my ( $self, $id ) = @_;

    defined $id or $self->throw("Must pass an ID");

    # Query again only when the stored result belongs to another identifier.
    $self->get_history($id) unless $id eq $self->{_last_id};

    return $self->{_last_status};
}
190 =head2 get_history
192 Title : get_history
193 Usage : my $ref = $query_obj->get_history($id)
194 Function: Queries the NCBI Revision page, gets the data from the HTML table
195 Returns : Reference to an array of arrays where element 0 refers to the most
196 recent version and the last element refers to the oldest version.
197 In the second dimension the elements are:
199 0 GI number
200 1 Version
201 2 Update Date
203 For example, to get the GI number of the first version:
205 $ref->[$#{$ref}]->[0]
207 To get the Update Date of the latest version:
209 $ref->[0]->[2]
211 Args : One identifier (string)
212 Note : Status of the GI was returned here previously as the last element in
213 the row of elements above; however the status is currently only
214 returned for the GI requested (e.g. a single value). One can get
215 the status for this using the get_status() method above
217 =cut
# Fetch the revision page for $id, parse it, and remember the outcome.
#
# Args    : $id - a single identifier (string)
# Returns : the array reference of rows produced by _process_data()
sub get_history {
    my ( $self, $id ) = @_;

    my $page = $self->_get_request($id);
    my ( $rows, $status ) = $self->_process_data($page);

    # Keep the latest rows, identifier and status on the object so that
    # follow-up calls made with the same identifier can consult them.
    @{$self}{qw(_last_result _last_id _last_status)} = ( $rows, $id, $status );

    return $rows;
}
232 =head2 _get_request
234 Title : _get_request
235 Usage : my $html = $self->_get_request($id)
236 Function: GET using NCBI Revision page URL, uses Root::HTTPget
237 Returns : HTML
238 Args : One identifier (string)
240 =cut
# Build the query URL for $id from the $URL template, issue a GET through
# $self->get, and return the content of the response.
#
# Args    : $id - a single identifier (plain scalar, not a reference)
# Returns : content of the response
# Throws  : if $id is missing or is a reference, or if the response does
#           not report success
sub _get_request {
    my ( $self, $id ) = @_;

    unless ( defined $id && !ref $id ) {
        $self->throw("Must specify a single id to query");
    }

    my $url      = sprintf $URL, $id;
    my $response = $self->get($url);

    unless ( $response->is_success ) {
        my $reason = $response->status_line;
        $self->throw( "Can't query $url: " . $reason . "\n" . "ID likely does not exist" );
    }

    return $response->content;
}
257 =head2 _process_data
259 Title : _process_data
260 Usage : $self->_process_data($html)
261 Function: extract data from HTML
262 Args : HTML from Revision History page
263 Returns : reference to an array of arrays
265 =cut
# Pull the current status and the revision table out of the page text.
#
# Args    : $html - text of the revision history page
# Returns : ( array reference of table rows, status string )
# Throws  : if no table with the expected headers is found
sub _process_data {
    my ( $self, $html ) = @_;

    # The page carries a single status (not one per revision), so it is
    # captured once here; a missing status becomes 'unknown' with a warning.
    my ($status) = $html =~ /<div class="status">Current status:\s+(\S+)<\/div>/;
    unless ( defined $status ) {
        $self->warn("No current status found, setting to 'unknown'");
        $status = 'unknown';
    }

    # Select the top-level table whose headers match the three columns.
    my $extractor = HTML::TableExtract->new(
        headers => [ 'Gi', 'Version', 'Update Date' ],
        depth   => 0,
    );
    $extractor->parse( decode_utf8($html) );

    my $table = $extractor->first_table_found;
    defined $table or $self->throw("No table found");

    return ( [ $table->rows ], $status );
}
291 __END__