Merge pull request #42 from solgenomics/topic/duplicate_image_warning
[cxgn-corelibs.git] / lib / CXGN / Tools / GetGaps.pm
blob60972ad2a8ba6ab06985173c0774770b5afa835b
2 =head1 NAME
4 CXGN::Tools::GetGaps - a script to obtain information about gaps (indicated as N's) in a fasta file (such as the tomato genome)
6 =head1 DESCRIPTION
8 Implemented as a MooseX::Runnable script to be run as follows:
10 mx-run CXGN::Tools::GetGaps --fasta_file sequences.fasta --min_gap_size 20 > output.txt
12 =head1 AUTHOR
14 Lukas Mueller
17 =cut
21 package CXGN::Tools::GetGaps;
22 use Moose;
24 with 'MooseX::Runnable';
25 with 'MooseX::Getopt';
27 use Bio::SeqIO;
29 has 'min_gap_size' => (is => 'rw',
30 isa => 'Int',
31 traits=> ['Getopt'],
32 default=>10,
36 has 'fasta_file' => (is => 'rw',
37 isa => 'Str',
38 traits=>['Getopt'],
41 has 'format' => (is => 'rw',
42 isa => 'Str',
43 traits => [ 'Getopt' ],
44 default => 'txt',
48 sub run {
49 my $self = shift;
51 my $io = Bio::SeqIO->new(-format=>'largefasta', -file=>$self->fasta_file());
53 my $gap_no = 1;
54 my $old_id = "";
56 while (my $s = $io->next_seq()) {
57 my $seq = $s->seq();
58 my $id = $s->id();
60 if ($id ne $old_id) {
61 $gap_no =1;
64 warn "Processing sequence $id (".$s->length()." nucleotides)...\n";
66 my $n_region_start = 0;
67 my $n_region_end = 0;
68 foreach my $i (1..$s->length()) {
69 my $nuc = $s->subseq($i, $i);
71 if ($nuc=~/n/i) {
72 if (!$n_region_start) { $n_region_start=$i; }
75 else {
76 if ($n_region_start) { $n_region_end = $i; }
78 my $gap_size = $n_region_end - $n_region_start;
79 if ($gap_size >= $self->min_gap_size()) {
81 if ($self->format=~ /gff/i) {
82 print join "\t", ($id, "getgaps", "gap", $n_region_start, $n_region_end, 1, ".", ".", "ID=$id\_$gap_no;Parent=$id"); print "\n";
84 else {
86 print "$id\_"; printf "%06d", "$gap_no"; print "\t$id\t$n_region_start\t$n_region_end\t$gap_size\n";
88 $gap_no++;
92 $n_region_start = 0;
93 $n_region_end = 0;
99 $old_id = $id;