4 CXGN::Tools::GetGaps - a script to obtain information about gaps (indicated as N's) in a fasta file (such as the tomato genome)
8 Implemented as a MooseX::Runnable script to be run as follows:
10 mx-run CXGN::Tools::GetGaps --fasta_file sequences.fasta --min_gap_size 20 > output.txt
21 package CXGN
::Tools
::GetGaps
;
24 with
'MooseX::Runnable';
25 with
'MooseX::Getopt';
29 has
'min_gap_size' => (is
=> 'rw',
36 has
'fasta_file' => (is
=> 'rw',
41 has
'format' => (is
=> 'rw',
43 traits
=> [ 'Getopt' ],
51 my $io = Bio
::SeqIO
->new(-format
=>'largefasta', -file
=>$self->fasta_file());
56 while (my $s = $io->next_seq()) {
64 warn "Processing sequence $id (".$s->length()." nucleotides)...\n";
66 my $n_region_start = 0;
68 foreach my $i (1..$s->length()) {
69 my $nuc = $s->subseq($i, $i);
72 if (!$n_region_start) { $n_region_start=$i; }
76 if ($n_region_start) { $n_region_end = $i; }
78 my $gap_size = $n_region_end - $n_region_start;
79 if ($gap_size >= $self->min_gap_size()) {
81 if ($self->format=~ /gff/i) {
82 print join "\t", ($id, "getgaps", "gap", $n_region_start, $n_region_end, 1, ".", ".", "ID=$id\_$gap_no;Parent=$id"); print "\n";
86 print "$id\_"; printf "%06d", "$gap_no"; print "\t$id\t$n_region_start\t$n_region_end\t$gap_size\n";