bp_process_wormbase: move program to new Bio-DB-Ace distribution
[bioperl-live.git] / Bio / ClusterIO / unigene.pm
blob7df3b4a9fc4ec04097bc05d5f5821b89c1677fa9
1 # BioPerl module for Bio::ClusterIO::unigene
3 # Please direct questions and support issues to <bioperl-l@bioperl.org>
5 # Cared for by Andrew Macgregor <andrew at cbbc.murdoch.edu.au>
7 # Copyright Andrew Macgregor, Jo-Ann Stanton, David Green
8 # Molecular Embryology Group, Anatomy & Structural Biology, University of Otago
9 # http://meg.otago.ac.nz
11 # You may distribute this module under the same terms as perl itself
13 # _history
14 # April 17, 2002 - Initial implementation by Andrew Macgregor
16 # POD documentation - main docs before the code
18 =head1 NAME
20 Bio::ClusterIO::unigene - UniGene input stream
22 =head1 SYNOPSIS
24 Do not use this module directly. Use it via the Bio::ClusterIO class.
26 =head1 DESCRIPTION
28 This object reads from Unigene *.data files downloaded from
29 ftp://ftp.ncbi.nih.gov/repository/UniGene/. It does not download and
30 decompress the file, you have to do that yourself.
32 =head1 FEEDBACK
34 =head2 Mailing Lists
36 User feedback is an integral part of the evolution of this and other
37 Bioperl modules. Send your comments and suggestions preferably to one
38 of the Bioperl mailing lists. Your participation is much appreciated.
40 bioperl-l@bioperl.org - General discussion
41 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
43 =head2 Support
45 Please direct usage questions or support issues to the mailing list:
47 I<bioperl-l@bioperl.org>
49 rather than to the module maintainer directly. Many experienced and
50 reponsive experts will be able look at the problem and quickly
51 address it. Please include a thorough description of the problem
52 with code and data examples if at all possible.
54 =head2 Reporting Bugs
56 Report bugs to the Bioperl bug tracking system to help us keep track
57 the bugs and their resolution. Bug reports can be submitted via the
58 web:
60 https://github.com/bioperl/bioperl-live/issues
62 =head1 AUTHORS - Andrew Macgregor
64 Email: andrew at cbbc.murdoch.edu.au
67 =head1 APPENDIX
69 The rest of the documentation details each of the object
70 methods. Internal methods are usually preceded with a _
72 =cut
75 # Let the code begin...
77 package Bio::ClusterIO::unigene;
78 use strict;
80 use Bio::Cluster::UniGene;
81 use Bio::Cluster::ClusterFactory;
83 use base qw(Bio::ClusterIO);
85 my %line_is = (
86 ID => q/ID\s+(\w{2,3}\.\d+)/,
87 TITLE => q/TITLE\s+(\S.*)/,
88 GENE => q/GENE\s+(\S.*)/,
89 CYTOBAND => q/CYTOBAND\s+(\S.*)/,
90 MGI => q/MGI\s+(\S.*)/,
91 LOCUSLINK => q/LOCUSLINK\s+(\S.*)/,
92 HOMOL => q/HOMOL\s+(\S.*)/,
93 EXPRESS => q/EXPRESS\s+(\S.*)/,
94 RESTR_EXPR => q/RESTR_EXPR\s+(\S.*)/,
95 GNM_TERMINUS => q/GNM_TERMINUS\s+(\S.*)/,
96 CHROMOSOME => q/CHROMOSOME\s+(\S.*)/,
97 STS => q/STS\s+(\S.*)/,
98 TXMAP => q/TXMAP\s+(\S.*)/,
99 PROTSIM => q/PROTSIM\s+(\S.*)/,
100 SCOUNT => q/SCOUNT\s+(\S.*)/,
101 SEQUENCE => q/SEQUENCE\s+(\S.*)/,
102 ACC => q/ACC=(\w+)(\.\d+)?/,
103 NID => q/NID=\s*(\S.*)/,
104 PID => q/PID=\s*(\S.*)/,
105 CLONE => q/CLONE=\s*(\S.*)/,
106 END => q/END=\s*(\S.*)/,
107 LID => q/LID=\s*(\S.*)/,
108 MGC => q/MGC=\s*(\S.*)/,
109 SEQTYPE => q/SEQTYPE=\s*(\S.*)/,
110 TRACE => q/TRACE=\s*(\S.*)/,
111 PERIPHERAL => q/PERIPHERAL=\s*(\S.*)/,
112 DELIMITER => q{^//},
115 # we set the right factory here
116 sub _initialize {
117 my($self, @args) = @_;
119 $self->SUPER::_initialize(@args);
120 if(! $self->cluster_factory()) {
121 $self->cluster_factory(Bio::Cluster::ClusterFactory->new(
122 -type => 'Bio::Cluster::UniGene'));
126 =head2 next_cluster
128 Title : next_cluster
129 Usage : $unigene = $stream->next_cluster()
130 Function: returns the next unigene in the stream
131 Returns : Bio::Cluster::UniGene object
132 Args : NONE
134 =cut
136 sub next_cluster {
137 my( $self) = @_;
138 local $/ = "\n//";
139 return unless my $entry = $self->_readline;
141 # set up the variables we'll need
142 my (%unigene,@express,@locuslink,@chromosome,
143 @sts,@txmap,@protsim,@sequence);
144 my $UGobj;
146 # set up the regexes
148 # add whitespace parsing and precompile regexes
149 #foreach (values %line_is) {
150 # $_ =~ s/\s+/\\s+/g;
151 # print STDERR "Regex is $_\n";
152 # #$_ = qr/$_/x;
155 #$line_is{'TITLE'} = qq/TITLE\\s+(\\S.+)/;
157 # run each line in an entry against the regexes
158 foreach my $line (split /\n/, $entry) {
159 #print STDERR "Wanting to match $line\n";
160 if ($line =~ /$line_is{ID}/gcx) {
161 $unigene{ID} = $1;
163 elsif ($line =~ /$line_is{TITLE}/gcx ) {
164 #print STDERR "MATCHED with [$1]\n";
165 $unigene{TITLE} = $1;
167 elsif ($line =~ /$line_is{GENE}/gcx) {
168 $unigene{GENE} = $1;
170 elsif ($line =~ /$line_is{CYTOBAND}/gcx) {
171 $unigene{CYTOBAND} = $1;
173 elsif ($line =~ /$line_is{MGI}/gcx) {
174 $unigene{MGI} = $1;
176 elsif ($line =~ /$line_is{LOCUSLINK}/gcx) {
177 @locuslink = split /;/, $1;
179 elsif ($line =~ /$line_is{HOMOL}/gcx) {
180 $unigene{HOMOL} = $1;
182 elsif ($line =~ /$line_is{EXPRESS}/gcx) {
183 my $express = $1;
184 # remove initial semicolon if present
185 $express =~ s/^;//;
186 @express = split /\s*;/, $express;
188 elsif ($line =~ /$line_is{RESTR_EXPR}/gcx) {
189 $unigene{RESTR_EXPR} = $1;
191 elsif ($line =~ /$line_is{GNM_TERMINUS}/gcx) {
192 $unigene{GNM_TERMINUS} = $1;
194 elsif ($line =~ /$line_is{CHROMOSOME}/gcx) {
195 push @chromosome, $1;
197 elsif ($line =~ /$line_is{TXMAP}/gcx) {
198 push @txmap, $1;
200 elsif ($line =~ /$line_is{STS}/gcx) {
201 push @sts, $1;
203 elsif ($line =~ /$line_is{PROTSIM}/gcx) {
204 push @protsim, $1;
206 elsif ($line =~ /$line_is{SCOUNT}/gcx) {
207 $unigene{SCOUNT} = $1;
209 elsif ($line =~ /$line_is{SEQUENCE}/gcx) {
210 # parse into each sequence line
211 my $seq = {};
212 # add unigene id to each seq
213 #$seq->{unigene_id} = $unigene{ID};
214 my @items = split(/;/, $1);
215 foreach (@items) {
216 if (/$line_is{ACC}/gcx) {
217 $seq->{acc} = $1;
218 # remove leading dot if version pattern matched
219 $seq->{version} = substr($2,1) if defined $2;
221 elsif (/$line_is{NID}/gcx) {
222 $seq->{nid} = $1;
224 elsif (/$line_is{PID}/gcx) {
225 $seq->{pid} = $1;
227 elsif (/$line_is{CLONE}/gcx) {
228 $seq->{clone} = $1;
230 elsif (/$line_is{END}/gcx) {
231 $seq->{end} = $1;
233 elsif (/$line_is{LID}/gcx) {
234 $seq->{lid} = $1;
236 elsif (/$line_is{MGC}/gcx) {
237 $seq->{mgc} = $1;
239 elsif (/$line_is{SEQTYPE}/gcx) {
240 $seq->{seqtype} = $1;
242 elsif (/$line_is{TRACE}/gcx) {
243 $seq->{trace} = $1;
245 elsif (/$line_is{PERIPHERAL}/gcx) {
246 $seq->{peripheral} = $1;
249 push @sequence, $seq;
251 elsif ($line =~ /$line_is{DELIMITER}/gcx) {
252 # at the end of the record, add data to the object
253 $UGobj = $self->cluster_factory->create_object(
254 -display_id => $unigene{ID},
255 -description => $unigene{TITLE},
256 -size => $unigene{SCOUNT},
257 -members => \@sequence);
258 $UGobj->gene($unigene{GENE}) if defined ($unigene{GENE});
259 $UGobj->cytoband($unigene{CYTOBAND}) if defined($unigene{CYTOBAND});
260 $UGobj->mgi($unigene{MGI}) if defined ($unigene{MGI});
261 $UGobj->locuslink(\@locuslink);
262 $UGobj->homol($unigene{HOMOL}) if defined ($unigene{HOMOL});
263 $UGobj->express(\@express);
264 $UGobj->restr_expr($unigene{RESTR_EXPR}) if defined ($unigene{RESTR_EXPR});
265 $UGobj->gnm_terminus($unigene{GNM_TERMINUS}) if defined ($unigene{GNM_TERMINUS});
266 $UGobj->chromosome(\@chromosome);
267 $UGobj->sts(\@sts);
268 $UGobj->txmap(\@txmap);
269 $UGobj->protsim(\@protsim);
272 return $UGobj;