t/AlignIO/AlignIO.t: fix number of tests in plan (fixup c523e6bed866)
[bioperl-live.git] / Bio / Matrix / PSM / IO / mast.pm
blobe82af516eb16e37d49940c2aa9e6734b20ae0119
2 =head1 NAME
4 Bio::Matrix::PSM::IO::mast - PSM mast parser implementation
6 =head1 SYNOPSIS
8 See Bio::Matrix::PSM::IO for detailed documentation on how to
9 use PSM parsers
11 =head1 DESCRIPTION
13 Parser for mast. This driver unlike meme or transfac for example is
14 dedicated more to PSM sequence matches, than to PSM themselves.
16 =head1 TO DO
18 Section III should be parsed too, otherwise no real sequence is
19 available, so we supply 'NNNNN....' as a seq which is not right.
21 =head1 FEEDBACK
23 =head2 Mailing Lists
25 User feedback is an integral part of the evolution of this
26 and other Bioperl modules. Send your comments and suggestions preferably
27 to one of the Bioperl mailing lists. Your participation is much appreciated.
29 bioperl-l@bioperl.org - General discussion
30 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
32 =head2 Support
34 Please direct usage questions or support issues to the mailing list:
36 I<bioperl-l@bioperl.org>
38 rather than to the module maintainer directly. Many experienced and
39 reponsive experts will be able look at the problem and quickly
40 address it. Please include a thorough description of the problem
41 with code and data examples if at all possible.
43 =head2 Reporting Bugs
45 Report bugs to the Bioperl bug tracking system to help us keep track
46 the bugs and their resolution. Bug reports can be submitted via the
47 web:
49 https://github.com/bioperl/bioperl-live/issues
51 =head1 AUTHOR - Stefan Kirov
53 Email skirov@utk.edu
55 =head1 APPENDIX
57 The rest of the documentation details each of the object
58 methods. Internal methods are usually preceded with a _
60 =cut
62 # Let the code begin...
63 package Bio::Matrix::PSM::IO::mast;
64 use Bio::Matrix::PSM::InstanceSite;
65 use Bio::Matrix::PSM::Psm;
66 use Bio::Root::Root;
67 use strict;
69 use base qw(Bio::Matrix::PSM::PsmHeader Bio::Matrix::PSM::IO);
71 =head2 new
73 Title : new
74 Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=>'mast',
75 -file=>$file);
76 Function: Associates a file with the appropriate parser
77 Throws : Throws if the file passed is in HTML format or if
78 some criteria for the file
79 format are not met.
80 Example :
81 Returns : psm object, associated with a file with matrix file
82 Args : hash
83 return : "Bio::Matrix::PSM::$format"->new(@args);
85 =cut
88 sub new {
89 my($class, @args)=@_;
90 my $self = $class->SUPER::new(@args);
91 my (%instances,@header,$n);
92 my ($file)=$self->_rearrange(['FILE'], @args);
93 $self->{file} = $file;
94 $self->{_factor}=1;
95 $self->_initialize_io(@args) || warn "Did you intend to use STDIN?"; #Read only for now
96 $self->{_end}=0;
97 undef $self->{hid};
98 return $self if ($file=~/^>/);#Just writing
99 my $buf=$self->_readline;
100 $self->throw('Cannot parse HTML format yet') if ($buf =~/^<HTML>/);
101 # this should probably be moved to its own function
102 while ( defined($buf=$self->_readline)) {
103 chomp($buf);
104 if ($buf=~/DATABASE AND MOTIFS/) {
105 while ($buf=$self->_readline) {
106 if ($buf=~/DATABASE/) {
107 $buf=~s/^[\s\t]+//;
108 chomp $buf;
109 ($n,$self->{_dbname},$self->{_dbtype})=split(/\s/,$buf);
110 $self->{_dbtype}=~s/[\(\)]//g;
112 if ($buf=~/MOTIFS/) {
113 $buf=~s/^[\s\t]+//;
114 chomp $buf;
115 ($n,$self->{_mrsc},$self->{_msrctype})=split(/\s/,$buf);
116 $self->{_msrctype}=~s/[\(\)]//g;
117 last;
120 if ($self->{_msrctype} ne $self->{_dbtype}) {#Assume we have protein motifs, nuc DB (not handling opp.)
121 $self->{_factor}=3;
122 $self->{_mixquery}=1;
125 if ($buf=~m/MOTIF WIDTH BEST POSSIBLE MATCH/) {
126 $self->_readline;
127 while (defined($buf=$self->_readline)) {
128 last if ($buf!~/\w/);
129 $buf=~s/\t+//g;
130 $buf=~s/^\s+//g;
131 my ($id,$width,$seq)=split(/\s+/,$buf);
132 push @{$self->{hid}},$id;
133 $self->{length}->{$id}=$width;
134 $self->{seq}->{$id}=$seq;
136 next;
138 if ($buf=~m/section i:/i) {
139 $self->_readline;
140 $self->_readline;
141 $self->_readline;
142 %instances=_get_genes($self);
143 $self->{instances}=\%instances;
144 if (!(%instances)) {
145 $self->warn ("Your MAST analysis did not find any matches satisfying the current threshold.\nSee MAST documentation for more information.\n");
146 return $self; #The header might be useful so we return the object, not undef
148 next;
150 if ($buf=~m/section ii:/i) {
151 $self->_readline;
152 $self->_readline;
153 $self->_readline;
154 last;
156 $buf=~s/[\t+\s+]/ /g;
157 push @header,$buf unless (($buf=~/\*{10,}/)||($buf!~/\w/));
159 $self->throw('Could not read Section I, probably wrong format, make sure it is not HTML, giving up...') if !(%instances);
160 $self->warn( "This file might be an unreadable version, proceed with caution!\n") if (!grep(/\s+MAST\s+version\s+3/,@header));
162 $self->{unstructured} = \@header;
163 $self->_initialize;
164 return $self;
168 # Get the file header and put store it as a hash, which later we'll use to create
169 # the header for each Psm. See Bio::Matrix::PSM::PsmI for header function.
170 sub _get_genes {
171 my $self=shift;
172 my %llid;
173 my $ok=0;
174 my $i=0;
175 my %instances;
176 while (my $line=$self->_readline) {
177 last if ($line=~/^[\s\t*]/); # Well, ids can be nearly anything...???
178 chomp($line);
179 $i++;
180 next if ($line eq '');
181 $line=~s/\s+/,/g;
182 my ($id,$key,$eval,$len)=split(/,/,$line);
183 unless ($len) {
184 warn "Malformed data found: $line\n";
185 next;
187 $instances{$id}=Bio::Matrix::PSM::InstanceSite->new(-id=>$id,
188 -desc=>$key,
189 -score=>$eval,
190 -width=>$len,
191 -seq=>'ACGT');
193 return %instances;
197 =head2 next_psm
199 Title : next_psm
200 Usage : my $psm=$psmIO->next_psm();
201 Function: Reads the next PSM from the input file, associated with this object
202 Throws : Throws if there ara format violations in the input file (checking is not
203 very strict with all drivers).
204 Example :
205 Returns : Bio::Matrix::PSM::Psm object
206 Args : none
208 =cut
211 sub next_psm {
212 my $self=shift;
213 return if ($self->{_end}==1);
214 my (@lmotifsm,%index,$eval,$scheme,$sid);
215 %index= %{$self->{length}};
216 my (@instances,%instances);
217 my $line=$self->_readline;
218 $line=~s/[\t\n]//;
219 if ($line =~ /\*{10,}/) { #Endo of Section II if we do only section II
220 $self->{_end}=1;
221 return ;
223 do {
224 if ($line!~/^\s/) {
225 ($sid,$eval,$scheme)=split(/\s+/,$line,3);
227 else
228 { $scheme .=$line; }
229 $line=$self->_readline;
230 $line=~s/[\t\n]//;
231 } until ($line!~/^\s/);
232 my $pos=1;
233 $scheme=~s/\s+//g;
234 $scheme=~s/\n//g;
235 my @motifs=split(/_/,$scheme);
236 while (@motifs) {
237 my $next=shift(@motifs);
238 if (!($next=~/\D/)) {
239 last if (!@motifs);
240 $pos+=$next;
241 next;
243 my $id=$next;
244 my $score= $id=~m/\[/ ? 'strong' : 'weak' ;
245 my $frame;
246 my $strand = $id =~ m/\-\d/ ? -1 : 1 ;
247 if ($self->{_mixquery}) {
248 $frame = 0 if $id =~ m/\d+a/ ;
249 $frame = 1 if $id =~ m/\d+b/ ;
250 $frame = 2 if $id =~ m/\d+c/ ;
252 $id=~s/\D+//g;
254 my @s;
255 my $width=$index{$id};
256 #We don't know the sequence, but we know the length
257 my $seq='N' x ($width*$self->{_factor}); #Future version will have to parse Section tree nad get the real seq
258 my $instance=Bio::Matrix::PSM::InstanceSite->new
259 ( -id=>"$id\@$sid",
260 -mid=>$id,
261 -accession_number=>$sid,
262 -desc=>"Motif $id occurrance in $sid",
263 -score=>$score,
264 -seq=>$seq,
265 -alphabet => 'dna',
266 -start=>$pos,
267 -strand=>$strand);
268 $instance->frame($frame) if ($self->{_mixquery});
269 push @instances,$instance;
270 $pos+=$index{$id}*$self->{_factor};
272 my $psm= Bio::Matrix::PSM::Psm->new(-instances=> \@instances,
273 -e_val => $eval,
274 -id => $sid);
275 $self->_pushback($line);
276 return $psm;
280 =head2 write_psm
282 Title : write_psm
283 Usage : #Get SiteMatrix object somehow (see Bio::Matrix::PSM::SiteMatrix)
284 my $matrix=$psmin->next_matrix;
285 #Create the stream
286 my $psmio=new(-file=>">psms.mast",-format=>'mast');
287 $psmio->write_psm($matrix);
288 #Will warn if only PFM data is contained in $matrix, recalculate the PWM
289 #based on normal distribution (A=>0.25, C=>0.25, etc)
290 Function: writes pwm in mast format
291 Throws :
292 Example :
293 Args : SiteMatrix object
294 Returns :
296 =cut
298 sub write_psm {
299 my ($self,$matrix)=@_;
300 # my $idline=">". $matrix->id . "\n";
301 my $w=$matrix->width;
302 my $header="ALPHABET= ACGT\nlog-odds matrix: alength= 4 w= $w\n";
303 $self->_print($header);
304 unless ($matrix->get_logs_array('A')) {
305 warn "No log-odds data, available, using normal distribution to recalculate the PWM";
306 $matrix->calc_weight({A=>0.25, C=>0.25, G=>0.25,T=>0.25});
308 while (my %h=$matrix->next_pos) {
309 $self->_print (join("\t",$h{lA},$h{lC},$h{lG},$h{lT},"\n"));