a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / CXGN / Tools / GetGaps.pm
blob2733b50ead112fa8dff1e7d2ca3b39ea11df3fac
2 =head1 NAME
4 CXGN::Tools::GetGaps - a script to obtain information about gaps (indicated as N's) in a fasta file (such as the tomato genome)
6 =head1 DESCRIPTION
8 Implemented as a MooseX::Runnable script to be run as follows:
10 mx-run CXGN::Tools::GetGaps --fasta_file sequences.fasta --min_gap_size 20 > output.txt
12 =head1 AUTHOR
14 Lukas Mueller
17 =cut
21 package CXGN::Tools::GetGaps;
22 use Moose;
24 with 'MooseX::Runnable';
25 with 'MooseX::Getopt';
27 use Bio::SeqIO;
29 has 'min_gap_size' => (is => 'rw',
30 isa => 'Int',
31 traits=> ['Getopt'],
32 default=>10,
36 has 'fasta_file' => (is => 'rw',
37 isa => 'Str',
38 traits=>['Getopt'],
43 sub run {
44 my $self = shift;
46 my $io = Bio::SeqIO->new(-format=>'largefasta', -file=>$self->fasta_file());
48 my $gap_no = 1;
49 my $old_id = "";
51 while (my $s = $io->next_seq()) {
52 my $seq = $s->seq();
53 my $id = $s->id();
55 if ($id ne $old_id) {
56 $gap_no =1;
59 warn "Processing sequence $id (".$s->length()." nucleotides)...\n";
61 my $n_region_start = 0;
62 my $n_region_end = 0;
63 foreach my $i (1..$s->length()) {
64 my $nuc = $s->subseq($i, $i);
66 if ($nuc=~/n/i) {
67 if (!$n_region_start) { $n_region_start=$i; }
70 else {
71 if ($n_region_start) { $n_region_end = $i; }
73 my $gap_size = $n_region_end - $n_region_start + 1;
74 if ($gap_size >= $self->min_gap_size()) {
76 print "$id\_"; printf "%06d", "$gap_no"; print "\t$id\t$n_region_start\t$n_region_end\t$gap_size\n";
77 $gap_no++;
81 $n_region_start = 0;
82 $n_region_end = 0;