1 #------------------------------------------------------------------
3 # BioPerl module Bio::Tools::GuessSeqFormat
5 # Please direct questions and support issues to <bioperl-l@bioperl.org>
7 # Cared for by Andreas Kähäri, andreas.kahari@ebi.ac.uk
9 # You may distribute this module under the same terms as perl itself
10 #------------------------------------------------------------------
16 Bio::Tools::GuessSeqFormat - Module for determining the sequence
17 format of the contents of a file, a string, or through a
22 # To guess the format of a flat file, given a filename:
23 my $guesser = Bio::Tools::GuessSeqFormat->new( -file => $filename );
24 my $format = $guesser->guess;
26 # To guess the format from an already open filehandle:
27 my $guesser = Bio::Tools::GuessSeqFormat->new( -fh => $filehandle );
28 my $format = $guesser->guess;
29 # The filehandle will be returned to its original position. Note that this
30 # filehandle can be STDIN.
32 # To guess the format of one or several lines of text (with
34 my $guesser = Bio::Tools::GuessSeqFormat->new( -text => $linesoftext );
35 my $format = $guesser->guess;
37 # To create a Bio::Tools::GuessSeqFormat object and set the
38 # filename, filehandle, or line to parse afterwards:
39 my $guesser = Bio::Tools::GuessSeqFormat->new();
40 $guesser->file($filename);
41 $guesser->fh($filehandle);
42 $guesser->text($linesoftext);
44 # To guess in one go, given e.g. a filename:
45 my $format = Bio::Tools::GuessSeqFormat->new( -file => $filename )->guess;
49 Bio::Tools::GuessSeqFormat tries to guess the format ("swiss",
50 "pir", "fasta" etc.) of the sequence or MSA in a file, in a
51 scalar, or through a filehandle.
53 The guess() method of a Bio::Tools::GuessSeqFormat object will
54 examine the data, line by line, until it finds a line to which
55 only one format can be assigned. If no conclusive guess can be
56 made, undef is returned.
58 If the Bio::Tools::GuessSeqFormat object is given a filehandle,
59 e.g. STDIN, it will be restored to its original position on
60 return from the guess() method.
64 Tests are currently implemented for the following formats:
90 FastA sequence ("fasta")
94 FastQ sequence ("fastq")
98 FastXY/FastA alignment ("fastxy")
110 GCG Blast ("gcgblast")
114 GCG FastA ("gcgfasta")
142 Phrap assembly file ("phrap")
186 Stockholm ("stockholm")
198 Variant Call Format ("vcf")
206 User feedback is an integral part of the evolution of this and
207 other Bioperl modules. Send your comments and suggestions
208 preferably to one of the Bioperl mailing lists. Your
209 participation is much appreciated.
211 bioperl-l@bioperl.org - General discussion
212 http://bioperl.org/wiki/Mailing_lists - About the mailing lists
216 Please direct usage questions or support issues to the mailing list:
218 I<bioperl-l@bioperl.org>
220 rather than to the module maintainer directly. Many experienced and
221 responsive experts will be able to look at the problem and quickly
222 address it. Please include a thorough description of the problem
223 with code and data examples if at all possible.
225 =head2 Reporting Bugs
227 Report bugs to the Bioperl bug tracking system to help us
228 keep track of the bugs and their resolution. Bug reports can be
229 submitted via the web:
231 https://github.com/bioperl/bioperl-live/issues
235 Andreas KE<228>hE<228>ri, andreas.kahari@ebi.ac.uk
239 Heikki LehvE<228>slaiho, heikki-at-bioperl-dot-org
240 Mark A. Jensen, maj-at-fortinbras-dot-us
245 package Bio
::Tools
::GuessSeqFormat
;
251 use base
qw(Bio::Root::Root);
255 Methods available to Bio::Tools::GuessSeqFormat objects
256 are described below. Methods with names beginning with an
257 underscore are considered to be internal.
264 Usage : $guesser = Bio::Tools::GuessSeqFormat->new( ... );
265 Function : Creates a new object.
266 Example : See SYNOPSIS.
267 Returns : A new object.
268 Arguments : -file The filename of the file whose format is to
269 be guessed, e.g. STDIN, or
270 -fh An already opened filehandle from which a text
271 stream may be read, or
272 -text A scalar containing one or several lines of
273 text with embedded newlines.
275 If more than one of the above arguments are given, they
276 are tested in the order -text, -file, -fh, and the first
277 available argument will be used.
286 my $self = $class->SUPER::new
(@args);
294 $value = shift @args;
295 $self->{$attr} = $value;
304 Usage : $guesser->file($filename);
305 $filename = $guesser->file;
306 Function : Gets or sets the current filename associated with
308 Returns : The new filename.
309 Arguments : The filename of the file whose format is to be
312 A call to this method will clear the current filehandle and
313 the current lines of text associated with the object.
319 # Sets and/or returns the filename to use.
324 # Set the active filename, and clear the filehandle and
325 # text line, if present.
326 $self->{-file
} = $file;
327 $self->{-fh
} = $self->{-text
} = undef;
330 return $self->{-file
};
336 Usage : $guesser->fh($filehandle);
337 $filehandle = $guesser->fh;
338 Function : Gets or sets the current filehandle associated with
340 Returns : The new filehandle.
341 Arguments : An already opened filehandle from which a text
344 A call to this method will clear the current filename and
345 the current lines of text associated with the object.
351 # Sets and/or returns the filehandle to use.
356 # Set the active filehandle, and clear the filename and
357 # text line, if present.
359 $self->{-file
} = $self->{-text
} = undef;
369 Usage : $guesser->text($linesoftext);
370 $linesoftext = $guesser->text;
371 Function : Gets or sets the current text associated with an
373 Returns : The new lines of text.
374 Arguments : A scalar containing one or several lines of text,
375 including embedded newlines.
377 A call to this method will clear the current filename and
378 the current filehandle associated with the object.
384 # Sets and/or returns the text lines to use.
389 # Set the active text lines, and clear the filehandle
390 # and filename, if present.
391 $self->{-text
} = $text;
392 $self->{-fh
} = $self->{-file
} = undef;
395 return $self->{-text
};
401 Usage : $format = $guesser->guess;
402 @format = $guesser->guess; # if given a line of text
403 Function : Guesses the format of the data associated with the
405 Returns : A format string such as "swiss" or "pir". If a
406 format can not be found, undef is returned.
409 If the object is associated with a filehandle, the position
410 of the filehandle will be returned to its original position
411 before the method returns.
416 ace
=> { test
=> \
&_possibly_ace
},
417 blast
=> { test
=> \
&_possibly_blast
},
418 bowtie
=> { test
=> \
&_possibly_bowtie
},
419 clustalw
=> { test
=> \
&_possibly_clustalw
},
420 codata
=> { test
=> \
&_possibly_codata
},
421 embl
=> { test
=> \
&_possibly_embl
},
422 fasta
=> { test
=> \
&_possibly_fasta
},
423 fastq
=> { test
=> \
&_possibly_fastq
},
424 fastxy
=> { test
=> \
&_possibly_fastxy
},
425 game
=> { test
=> \
&_possibly_game
},
426 gcg
=> { test
=> \
&_possibly_gcg
},
427 gcgblast
=> { test
=> \
&_possibly_gcgblast
},
428 gcgfasta
=> { test
=> \
&_possibly_gcgfasta
},
429 gde
=> { test
=> \
&_possibly_gde
},
430 genbank
=> { test
=> \
&_possibly_genbank
},
431 genscan
=> { test
=> \
&_possibly_genscan
},
432 gff
=> { test
=> \
&_possibly_gff
},
433 hmmer
=> { test
=> \
&_possibly_hmmer
},
434 nexus
=> { test
=> \
&_possibly_nexus
},
435 mase
=> { test
=> \
&_possibly_mase
},
436 mega
=> { test
=> \
&_possibly_mega
},
437 msf
=> { test
=> \
&_possibly_msf
},
438 pfam
=> { test
=> \
&_possibly_pfam
},
439 phrap
=> { test
=> \
&_possibly_phrap
},
440 phylip
=> { test
=> \
&_possibly_phylip
},
441 pir
=> { test
=> \
&_possibly_pir
},
442 prodom
=> { test
=> \
&_possibly_prodom
},
443 raw
=> { test
=> \
&_possibly_raw
},
444 rsf
=> { test
=> \
&_possibly_rsf
},
445 selex
=> { test
=> \
&_possibly_selex
},
446 stockholm
=> { test
=> \
&_possibly_stockholm
},
447 swiss
=> { test
=> \
&_possibly_swiss
},
448 tab
=> { test
=> \
&_possibly_tab
},
449 vcf
=> { test
=> \
&_possibly_vcf
},
456 while (my ($fmt_key) = each (%formats)) {
457 $formats{$fmt_key}{fmt_string
} = $fmt_key;
462 if (defined $self->{-text
}) {
463 # Break the text into separate lines.
464 my $text = $self->{-text
};
465 open $fh, '<', \
$text or $self->throw("Could not read from string: $!");
467 } elsif (defined $self->{-file
}) {
468 # If given a filename, open the file.
469 my $file = $self->{-file
};
470 open $fh, '<', $file or $self->throw("Could not read file '$file': $!");
472 } elsif (defined $self->{-fh
}) {
473 # If given a filehandle, get the current position in the stream.
475 if (not seek $fh, 0, 1) { # seek to current position to determine seekability
476 # Work around non-seekable filehandles if IO::Scalar is available
477 # (adapted from http://www.perlmonks.org/?node_id=33587)
478 # IO::Mark may be an option for very large streams?
479 $self->throw("Need IO::Scalar to guess from unseekable filehandles")
480 if not eval { require IO
::Scalar
};
482 { local $/; $data = <$fh>; $.-- }; # copy raw data from fh
483 tie
*$fh, 'IO::Scalar', my $s; # replace fh by scalar-tied fh
484 print $fh $data; # write raw data to tied fh
485 seek $fh, 0, 0; # return to start of tied fh
487 $start_pos = tell $fh;
494 my $line; # The next line of the file.
495 my $match = 0; # Number of possible formats of this line.
497 last if (!defined($line = <$fh>));
498 next if ($line =~ /^\s*$/); # Skip white and empty lines.
500 $line =~ s/\r$//; # Fix for DOS files on Unix.
503 while (my ($fmt_key, $fmt) = each (%formats)) {
504 if ($fmt->{test
}($line, $lineno)) {
506 $guess = $fmt->{fmt_string
};
510 # We're done if there was only one match.
511 $done = ($match == 1);
514 if (defined $self->{-fh
}) {
515 # Go back to original position in filehandle
516 seek $fh, $start_pos, 0 or $self->throw("Could not reset filehandle $fh: $!");
518 # Close the filehandle we opened
521 return ($done ?
$guess : undef);
524 =head1 HELPER SUBROUTINES
526 All helper subroutines will, given a line of text and the line
527 number of the same line, return 1 if the line possibly is from a
528 file of the type that they perform a test of.
530 A zero return value does not mean that the line is not part
531 of a certain type of file, just that the test did not find any
532 characteristics of that type of file in the line.
536 From bioperl test data, and from
537 "http://www.isrec.isb-sib.ch/DEA/module8/B_Stevenson/Practicals/transcriptome_recon/transcriptome_recon.html".
543 my ($line, $lineno) = (shift, shift);
544 return ($line =~ /^(?:Sequence|Peptide|DNA|Protein) [":]/);
547 =head2 _possibly_blast
549 From various blast results.
555 my ($line, $lineno) = (shift, shift);
556 return ($lineno == 1 &&
557 $line =~ /^[[:upper:]]*BLAST[[:upper:]]*.*\[.*\]$/);
560 =head2 _possibly_bowtie
562 Contributed by kortsch.
568 my ($line, $lineno) = (shift, shift);
569 return ($line =~ /^[[:graph:]]+\t[-+]\t[[:graph:]]+\t\d+\t([[:alpha:]]+)\t([[:graph:]]+)\t\d+\t[[:graph:]]?/)
570 && length($1)==length($2);
573 =head2 _possibly_clustalw
575 From "http://www.ebi.ac.uk/help/formats.html".
579 sub _possibly_clustalw
581 my ($line, $lineno) = (shift, shift);
582 return ($lineno == 1 && $line =~ /CLUSTAL/);
585 =head2 _possibly_codata
587 From "http://www.ebi.ac.uk/help/formats.html".
593 my ($line, $lineno) = (shift, shift);
594 return (($lineno == 1 && $line =~ /^ENTRY/) ||
595 ($lineno == 2 && $line =~ /^SEQUENCE/) ||
596 $line =~ m{^(?:ENTRY|SEQUENCE|///)});
599 =head2 _possibly_embl
602 "http://www.ebi.ac.uk/embl/Documentation/User_manual/usrman.html#3.3".
608 my ($line, $lineno) = (shift, shift);
609 return ($lineno == 1 && $line =~ /^ID / && $line =~ /BP\.$/);
612 =head2 _possibly_fasta
614 From "http://www.ebi.ac.uk/help/formats.html".
620 my ($line, $lineno) = (shift, shift);
621 return (($lineno != 1 && $line =~ /^[A-IK-NP-Z]+$/i) ||
625 =head2 _possibly_fastq
627 From bioperl test data.
633 my ($line, $lineno) = (shift, shift);
634 return ( ($lineno == 1 && $line =~ /^@/) ||
635 ($lineno == 3 && $line =~ /^\+/) );
638 =head2 _possibly_fastxy
640 From bioperl test data.
646 my ($line, $lineno) = (shift, shift);
647 return (($lineno == 1 && $line =~ /^ FAST(?:XY|A)/) ||
648 ($lineno == 2 && $line =~ /^ version \d/));
651 =head2 _possibly_game
653 From bioperl testdata.
659 my ($line, $lineno) = (shift, shift);
660 return ($line =~ /^<!DOCTYPE game/);
665 From bioperl, Bio::SeqIO::gcg.
671 my ($line, $lineno) = (shift, shift);
672 return ($line =~ /Length: .*Type: .*Check: .*\.\.$/);
675 =head2 _possibly_gcgblast
677 From bioperl testdata.
681 sub _possibly_gcgblast
683 my ($line, $lineno) = (shift, shift);
684 return (($lineno == 1 && $line =~ /^!!SEQUENCE_LIST/) ||
686 $line =~ /^[[:upper:]]*BLAST[[:upper:]]*.*\[.*\]$/));
689 =head2 _possibly_gcgfasta
691 From bioperl testdata.
695 sub _possibly_gcgfasta
697 my ($line, $lineno) = (shift, shift);
698 return (($lineno == 1 && $line =~ /^!!SEQUENCE_LIST/) ||
699 ($lineno == 2 && $line =~ /FASTA/));
704 From "http://www.ebi.ac.uk/help/formats.html".
710 my ($line, $lineno) = (shift, shift);
711 return ($line =~ /^[{}]$/ ||
712 $line =~ /^(?
:name
|longname
|sequence
-ID
|
713 creation
-date
|direction
|strandedness
|
714 type
|offset
|group
-ID
|creator
|descrip
|
715 comment
|sequence
)/x
);
718 =head2 _possibly_genbank
720 From "http://www.ebi.ac.uk/help/formats.html".
721 Format of [apparently optional] file header from
722 "http://www.umdnj.edu/rcompweb/PA/Notes/GenbankFF.htm". (TODO: dead link)
726 sub _possibly_genbank
728 my ($line, $lineno) = (shift, shift);
729 return (($lineno == 1 && $line =~ /GENETIC SEQUENCE DATA BANK/) ||
730 ($lineno == 1 && $line =~ /^LOCUS /) ||
731 ($lineno == 2 && $line =~ /^DEFINITION /) ||
732 ($lineno == 3 && $line =~ /^ACCESSION /));
735 =head2 _possibly_genscan
737 From bioperl test data.
741 sub _possibly_genscan
743 my ($line, $lineno) = (shift, shift);
744 return (($lineno == 1 && $line =~ /^GENSCAN.*Date.*Time/) ||
745 ($line =~ /^(?:Sequence\s+\w+|Parameter matrix|Predicted genes)/));
750 From bioperl test data.
756 my ($line, $lineno) = (shift, shift);
757 return (($lineno == 1 && $line =~ /^##gff-version/) ||
758 ($lineno == 2 && $line =~ /^##date/));
761 =head2 _possibly_hmmer
763 From bioperl test data.
769 my ($line, $lineno) = (shift, shift);
770 return (($lineno == 2 && $line =~ /^HMMER/) ||
772 $line =~ /Washington University School of Medicine/));
775 =head2 _possibly_nexus
777 From "http://paup.csit.fsu.edu/nfiles.html".
783 my ($line, $lineno) = (shift, shift);
784 return ($lineno == 1 && $line =~ /^#NEXUS/);
787 =head2 _possibly_mase
789 From bioperl test data.
790 More detail from "http://www.umdnj.edu/rcompweb/PA/Notes/GenbankFF.htm" (TODO: dead link)
796 my ($line, $lineno) = (shift, shift);
797 return (($lineno == 1 && $line =~ /^;;/) ||
798 ($lineno > 1 && $line =~ /^;[^;]?/));
801 =head2 _possibly_mega
803 From the ensembl browser (AlignView data export).
809 my ($line, $lineno) = (shift, shift);
810 return ($lineno == 1 && $line =~ /^#mega$/);
816 From "http://www.ebi.ac.uk/help/formats.html".
822 my ($line, $lineno) = (shift, shift);
823 return ($line =~ m{^//} ||
824 $line =~ /MSF:.*Type:.*Check:|Name:.*Len:/);
827 =head2 _possibly_phrap
829 From "http://biodata.ccgb.umn.edu/docs/contigimage.html". (TODO: dead link)
830 From "http://genetics.gene.cwru.edu/gene508/Lec6.htm". (TODO: dead link)
831 From bioperl test data ("*.ace.1" files).
837 my ($line, $lineno) = (shift, shift);
838 return ($line =~ /^(?
:AS\
|CO\ Contig
|BQ
|AF\
|BS\
|RD\
|
844 From "http://www.ebi.ac.uk/help/formats.html".
845 The ".,()" spotted in bioperl test data.
849 sub _possibly_pir
# "NBRF/PIR" (?)
851 my ($line, $lineno) = (shift, shift);
852 return (($lineno != 1 && $line =~ /^[\sA-IK-NP-Z.,()]+\*?$/i) ||
853 $line =~ /^>(?:P1|F1|DL|DC|RL|RC|N3|N1);/);
856 =head2 _possibly_pfam
858 From bioperl test data.
864 my ($line, $lineno) = (shift, shift);
865 return ($line =~ m{^\w+/\d+-\d+\s+[A-IK-NP-Z.]+}i);
868 =head2 _possibly_phylip
870 From "http://www.ebi.ac.uk/help/formats.html". Initial space
871 allowed on first line (spotted in ensembl AlignView exported
878 my ($line, $lineno) = (shift, shift);
879 return (($lineno == 1 && $line =~ /^\s*\d+\s\d+/) ||
880 ($lineno == 2 && $line =~ /^\w\s+[A-IK-NP-Z\s]+/) ||
881 ($lineno == 3 && $line =~ /(?:^\w\s+[A-IK-NP-Z\s]+|\s+[A-IK-NP-Z\s]+)/)
885 =head2 _possibly_prodom
887 From "http://prodom.prabi.fr/prodom/current/documentation/data.php".
893 my ($line, $lineno) = (shift, shift);
894 return ($lineno == 1 && $line =~ /^ID / && $line =~ /\d+ seq\.$/);
899 From "http://www.ebi.ac.uk/help/formats.html".
905 my ($line, $lineno) = (shift, shift);
906 return ($line =~ /^[A-Za-z\s]+$/);
911 From "http://www.ebi.ac.uk/help/formats.html".
917 my ($line, $lineno) = (shift, shift);
918 return (($lineno == 1 && $line =~ /^!!RICH_SEQUENCE/) ||
920 $line =~ /^(?
:name
|type
|longname
|
921 checksum
|creation
-date
|strand
|sequence
)/x
);
924 =head2 _possibly_selex
926 From "http://www.ebc.ee/WWW/hmmer2-html/node27.html".
928 Assuming presence of Selex file header. Data exported by
929 Bioperl on Pfam and Selex formats are identical, but Pfam file
930 only holds one alignment.
936 my ($line, $lineno) = (shift, shift);
937 return (($lineno == 1 && $line =~ /^#=ID /) ||
938 ($lineno == 2 && $line =~ /^#=AC /) ||
939 ($line =~ /^#=SQ /));
942 =head2 _possibly_stockholm
944 From bioperl test data.
948 sub _possibly_stockholm
950 my ($line, $lineno) = (shift, shift);
951 return (($lineno == 1 && $line =~ /^# STOCKHOLM/) ||
952 $line =~ /^#=(?:GF|GS) /);
957 =head2 _possibly_swiss
959 From "http://ca.expasy.org/sprot/userman.html#entrystruc".
965 my ($line, $lineno) = (shift, shift);
966 return ($lineno == 1 && $line =~ /^ID / && $line =~ /AA\.$/);
971 Contributed by Heikki.
977 my ($line, $lineno) = (shift, shift);
978 return ($lineno == 1 && $line =~ /^[^\t]+\t[^\t]+/) ;
983 From "http://www.1000genomes.org/wiki/analysis/vcf4.0".
985 Assumptions made about sanity - format and date lines are line 1 and 2
986 respectively. This is not specified in the format document.
992 my ($line, $lineno) = (shift, shift);
993 return (($lineno == 1 && $line =~ /##fileformat=VCFv/) ||
994 ($lineno == 2 && $line =~ /##fileDate=/));