4 Bio::Matrix::PSM::IO::mast - PSM mast parser implementation
8 See Bio::Matrix::PSM::IO for detailed documentation on how to
13 Parser for mast. This driver unlike meme or transfac for example is
14 dedicated more to PSM sequence matches, than to PSM themselves.
18 Section III should be parsed too, otherwise no real sequence is
19 available, so we supply 'NNNNN....' as a seq which is not right.
25 User feedback is an integral part of the evolution of this
26 and other Bioperl modules. Send your comments and suggestions preferably
27 to one of the Bioperl mailing lists. Your participation is much appreciated.
29 bioperl-l@bioperl.org - General discussion
30 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
34 Please direct usage questions or support issues to the mailing list:
36 I<bioperl-l@bioperl.org>
38 rather than to the module maintainer directly. Many experienced and
39 reponsive experts will be able look at the problem and quickly
40 address it. Please include a thorough description of the problem
41 with code and data examples if at all possible.
45 Report bugs to the Bioperl bug tracking system to help us keep track
46 the bugs and their resolution. Bug reports can be submitted via the
49 https://github.com/bioperl/bioperl-live/issues
51 =head1 AUTHOR - Stefan Kirov
57 The rest of the documentation details each of the object
58 methods. Internal methods are usually preceded with a _
62 # Let the code begin...
63 package Bio
::Matrix
::PSM
::IO
::mast
;
64 use Bio
::Matrix
::PSM
::InstanceSite
;
65 use Bio
::Matrix
::PSM
::Psm
;
69 use base
qw(Bio::Matrix::PSM::PsmHeader Bio::Matrix::PSM::IO);
74 Usage : my $psmIO = Bio::Matrix::PSM::IO->new(-format=>'mast',
76 Function: Associates a file with the appropriate parser
77 Throws : Throws if the file passed is in HTML format or if
78 some criteria for the file
81 Returns : psm object, associated with a file with matrix file
83 return : "Bio::Matrix::PSM::$format"->new(@args);
90 my $self = $class->SUPER::new
(@args);
91 my (%instances,@header,$n);
92 my ($file)=$self->_rearrange(['FILE'], @args);
93 $self->{file
} = $file;
95 $self->_initialize_io(@args) || warn "Did you intend to use STDIN?"; #Read only for now
98 return $self if ($file=~/^>/);#Just writing
99 my $buf=$self->_readline;
100 $self->throw('Cannot parse HTML format yet') if ($buf =~/^<HTML>/);
101 # this should probably be moved to its own function
102 while ( defined($buf=$self->_readline)) {
104 if ($buf=~/DATABASE AND MOTIFS/) {
105 while ($buf=$self->_readline) {
106 if ($buf=~/DATABASE/) {
109 ($n,$self->{_dbname
},$self->{_dbtype
})=split(/\s/,$buf);
110 $self->{_dbtype
}=~s/[\(\)]//g;
112 if ($buf=~/MOTIFS/) {
115 ($n,$self->{_mrsc
},$self->{_msrctype
})=split(/\s/,$buf);
116 $self->{_msrctype
}=~s/[\(\)]//g;
120 if ($self->{_msrctype
} ne $self->{_dbtype
}) {#Assume we have protein motifs, nuc DB (not handling opp.)
122 $self->{_mixquery
}=1;
125 if ($buf=~m/MOTIF WIDTH BEST POSSIBLE MATCH/) {
127 while (defined($buf=$self->_readline)) {
128 last if ($buf!~/\w/);
131 my ($id,$width,$seq)=split(/\s+/,$buf);
132 push @
{$self->{hid
}},$id;
133 $self->{length}->{$id}=$width;
134 $self->{seq
}->{$id}=$seq;
138 if ($buf=~m/section i:/i) {
142 %instances=_get_genes
($self);
143 $self->{instances
}=\
%instances;
145 $self->warn ("Your MAST analysis did not find any matches satisfying the current threshold.\nSee MAST documentation for more information.\n");
146 return $self; #The header might be useful so we return the object, not undef
150 if ($buf=~m/section ii:/i) {
156 $buf=~s/[\t+\s+]/ /g;
157 push @header,$buf unless (($buf=~/\*{10,}/)||($buf!~/\w/));
159 $self->throw('Could not read Section I, probably wrong format, make sure it is not HTML, giving up...') if !(%instances);
160 $self->warn( "This file might be an unreadable version, proceed with caution!\n") if (!grep(/\s+MAST\s+version\s+3/,@header));
162 $self->{unstructured
} = \
@header;
168 # Get the file header and put store it as a hash, which later we'll use to create
169 # the header for each Psm. See Bio::Matrix::PSM::PsmI for header function.
176 while (my $line=$self->_readline) {
177 last if ($line=~/^[\s\t*]/); # Well, ids can be nearly anything...???
180 next if ($line eq '');
182 my ($id,$key,$eval,$len)=split(/,/,$line);
184 warn "Malformed data found: $line\n";
187 $instances{$id}=Bio
::Matrix
::PSM
::InstanceSite
->new(-id
=>$id,
200 Usage : my $psm=$psmIO->next_psm();
201 Function: Reads the next PSM from the input file, associated with this object
202 Throws : Throws if there ara format violations in the input file (checking is not
203 very strict with all drivers).
205 Returns : Bio::Matrix::PSM::Psm object
213 return if ($self->{_end
}==1);
214 my (@lmotifsm,%index,$eval,$scheme,$sid);
215 %index= %{$self->{length}};
216 my (@instances,%instances);
217 my $line=$self->_readline;
219 if ($line =~ /\*{10,}/) { #Endo of Section II if we do only section II
225 ($sid,$eval,$scheme)=split(/\s+/,$line,3);
229 $line=$self->_readline;
231 } until ($line!~/^\s/);
235 my @motifs=split(/_/,$scheme);
237 my $next=shift(@motifs);
238 if (!($next=~/\D/)) {
244 my $score= $id=~m/\[/ ?
'strong' : 'weak' ;
246 my $strand = $id =~ m/\-\d/ ?
-1 : 1 ;
247 if ($self->{_mixquery
}) {
248 $frame = 0 if $id =~ m/\d+a/ ;
249 $frame = 1 if $id =~ m/\d+b/ ;
250 $frame = 2 if $id =~ m/\d+c/ ;
255 my $width=$index{$id};
256 #We don't know the sequence, but we know the length
257 my $seq='N' x
($width*$self->{_factor
}); #Future version will have to parse Section tree nad get the real seq
258 my $instance=Bio
::Matrix
::PSM
::InstanceSite
->new
261 -accession_number
=>$sid,
262 -desc
=>"Motif $id occurrance in $sid",
268 $instance->frame($frame) if ($self->{_mixquery
});
269 push @instances,$instance;
270 $pos+=$index{$id}*$self->{_factor
};
272 my $psm= Bio
::Matrix
::PSM
::Psm
->new(-instances
=> \
@instances,
275 $self->_pushback($line);
283 Usage : #Get SiteMatrix object somehow (see Bio::Matrix::PSM::SiteMatrix)
284 my $matrix=$psmin->next_matrix;
286 my $psmio=new(-file=>">psms.mast",-format=>'mast');
287 $psmio->write_psm($matrix);
288 #Will warn if only PFM data is contained in $matrix, recalculate the PWM
289 #based on normal distribution (A=>0.25, C=>0.25, etc)
290 Function: writes pwm in mast format
293 Args : SiteMatrix object
299 my ($self,$matrix)=@_;
300 # my $idline=">". $matrix->id . "\n";
301 my $w=$matrix->width;
302 my $header="ALPHABET= ACGT\nlog-odds matrix: alength= 4 w= $w\n";
303 $self->_print($header);
304 unless ($matrix->get_logs_array('A')) {
305 warn "No log-odds data, available, using normal distribution to recalculate the PWM";
306 $matrix->calc_weight({A
=>0.25, C
=>0.25, G
=>0.25,T
=>0.25});
308 while (my %h=$matrix->next_pos) {
309 $self->_print (join("\t",$h{lA
},$h{lC
},$h{lG
},$h{lT
},"\n"));