4 CXGN::Tools::GetGaps - a script that reports the start, end and size of gaps (runs of N's) in the sequences of a FASTA file (such as the tomato genome)
8 Implemented as a MooseX::Runnable script to be run as follows:
10 mx-run CXGN::Tools::GetGaps --fasta_file sequences.fasta --min_gap_size 20 > output.txt
# Package declaration for the gap-reporting class.
21 package CXGN
::Tools
::GetGaps
;
# Compose the MooseX::Runnable role; the usage line in the file header
# invokes this class through mx-run.
24 with
'MooseX::Runnable';
# Compose the MooseX::Getopt role.
# NOTE(review): presumably this is what maps the --fasta_file and
# --min_gap_size command-line options onto the attributes of the same
# names; confirm against the MooseX::Getopt documentation.
25 with
'MooseX::Getopt';
# Read-write attribute 'min_gap_size': the threshold each gap's size is
# compared against (>=) before the gap is printed.
# NOTE(review): the remaining attribute options (type, default,
# documentation, closing parenthesis) are not visible in this excerpt.
29 has
'min_gap_size' => (is
=> 'rw',
# Read-write attribute 'fasta_file': the value handed to Bio::SeqIO as its
# -file argument.
# NOTE(review): the remaining attribute options are not visible in this
# excerpt.
36 has
'fasta_file' => (is
=> 'rw',
# Open the input through Bio::SeqIO with the 'largefasta' format; the path
# is read from the fasta_file attribute.
# NOTE(review): the enclosing sub header (and where $self is unpacked) is
# not visible in this excerpt.
46 my $io = Bio
::SeqIO
->new(-format
=>'largefasta', -file
=>$self->fasta_file());
# Handle one sequence per iteration until next_seq() returns a false value.
51 while (my $s = $io->next_seq()) {
# Progress message including the sequence length; warn writes to STDERR,
# so it is kept apart from the rows printed below.
# NOTE(review): $id is assigned on a line not visible in this excerpt.
59 warn "Processing sequence $id (".$s->length()." nucleotides)...\n";
# Start position of the N run currently being tracked. 0 doubles as the
# "no run open" flag, which works because positions are iterated from 1.
61 my $n_region_start = 0;
# Visit every position of the sequence, from 1 to its length.
63 foreach my $i (1..$s->length()) {
# Fetch the single residue at position $i.
# NOTE(review): this issues one subseq() call per position; fetching a
# larger slice per call may be faster -- confirm cost and memory behaviour
# with the largefasta backend before changing.
64 my $nuc = $s->subseq($i, $i);
# Record $i as the start of a run only when no run is open yet.
# NOTE(review): the test on $nuc that guards this line is not visible in
# this excerpt.
67 if (!$n_region_start) { $n_region_start=$i; }
# When a run is open, store $i as its end position.
# NOTE(review): the branch this line belongs to, and the declaration of
# $n_region_end, are not visible in this excerpt.
71 if ($n_region_start) { $n_region_end = $i; }
# Length of the run, counting both the start and the end position.
73 my $gap_size = $n_region_end - $n_region_start + 1;
# Only runs at least min_gap_size long are reported.
74 if ($gap_size >= $self->min_gap_size()) {
# Output row: "<id>_<gap_no zero-padded to 6 digits>", then tab-separated
# id, start, end and size. The backslash in "$id\_" keeps Perl from reading
# the underscore as part of the variable name ($id_).
# NOTE(review): $gap_no is maintained on lines not visible here; quoting it
# as "$gap_no" is unnecessary for a %d conversion.
76 print "$id\_"; printf "%06d", "$gap_no"; print "\t$id\t$n_region_start\t$n_region_end\t$gap_size\n";