a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / CXGN / Tools / ExtractSeq.pm
blob668c00c613b1435c839ceb3bfcde33c7769760d0
2 use Moose;
4 use MooseX::Declare;
6 class CXGN::Tools::ExtractSeq {
8 with 'MooseX::Runnable';
9 with 'MooseX::Getopt';
11 use Bio::Tools::GFF;
12 use Bio::SeqIO;
13 use Data::Dumper;
15 has fasta_file => (is => 'rw',
16 isa => 'Str',
17 required => 1,
18 traits => [ 'Getopt' ],
19 cmd_aliases => 'i',
20 documentation => 'Input file in fasta format',
21 default => 'largefasta',
25 has format => (is =>'rw',
26 isa => 'Str',
27 required=>0,
28 traits => ['Getopt' ],
29 documentation => 'the format of the fasta file to use. Either fasta or largefasta',
32 has gff_file => (is => 'rw',
33 traits => ['Getopt'],
34 cmd_aliases => 'g',
35 isa => 'Str',
36 required => 1,
37 documentation => 'gff3 file containing gene models',
41 has length => (is=>'rw',
42 traits=> ['Getopt'],
43 cmd_aliases => 'l',
44 isa => 'Int',
45 required => 0,
46 default => 5000,
47 documentation => 'the length of the sequence to be extracted',
51 has seq_objects => (is => 'rw',
52 isa => 'HashRef',
55 has upstream => (is => 'rw',
56 traits=>['Getopt'],
59 has downstream => (is => 'rw',
60 traits=>['Getopt'],
63 has intron => (is => 'rw',
64 traits=>['Getopt'],
65 default => 0,
69 method read_sequences {
71 print STDERR "Reading the sequence data... :-) \n";
73 my %seqs = ();
74 my $seqio = Bio::SeqIO->new(-format=>$self->format(), -file=>$self->fasta_file());
75 while (my $s = $seqio->next_seq()) {
76 $seqs{$s->id()} = $s;
79 $self->seq_objects(\%seqs);
81 $seqio->close();
84 =head2 extract_upstream
86 Usage: $e->extract_upstream(Str $parent, Str $id, Int $start, Int $end, Int $strand)
87 Desc: extracts the upstream sequence of sequence $id with
88 $coord $start and $end on sequence $parent (which must
89 occur in fasta_file(). If $strand is negative, the upstream
90 sequence is cut out from the end and reverse complemented.
91 Ret: a Bio::Seq object with the upstream sequence
92 Args:
93 Side Effects:
94 Example:
96 =cut
98 method extract_upstream(Str $parent, Str $id, Int $start, Int $end, Int $strand) {
99 #my $self = shift;
101 my %seqs = %{$self->seq_objects()};
102 my $promoter = "";
103 my $length = $self->length();
104 if (!exists($seqs{$parent})) {
105 print STDERR "Don't know about $parent\n";
107 else {
108 if ($strand > 0) {
109 # forward
111 if ($length>$start) { $length = $start; }
112 my $upstream_start = $start - $length;
113 my $upstream_end = $start -1;
115 if ($upstream_start < 1) { $upstream_start = 1; }
116 if ($upstream_end < 1) { $upstream_end =1; }
118 print STDERR "Subseq $upstream_start - $upstream_end \r";
119 $promoter = $seqs{$parent}->subseq($upstream_start, $upstream_end);
122 elsif ($strand < 0) {
123 my $upstream_start = $end +1;
125 my $upstream_end = $end + $self->length();
126 if ($upstream_end > $seqs{$parent}->length()) {
127 $upstream_end = $seqs{$parent}->length();
130 print STDERR "Subseq $upstream_start - $upstream_end \r";
131 my $forward = $seqs{$parent}->subseq($upstream_start, $upstream_end);
132 my $revseq = Bio::Seq->new(-seq=>$forward);
133 $promoter = $revseq->revcom()->seq();
136 my $seq = Bio::Seq->new(-seq=>$promoter, -id=>$id."_promoter_".$length);
137 return $seq;
140 method extract_downstream(Str $parent, Str $id, Int $start, Int $end, Int $strand) {
141 my $downstream = $self->extract_upstream($parent, $id, $start, $end, $strand * (-1));
142 $downstream->seq($downstream->revcom()->seq());
143 $downstream->id($id."_downstream_".$self->length());
144 return $downstream;
147 =head2 extract_introns
149 Usage: $e->extract_introns(Str $parent, Str $id, ArrayRef $exons, Int $strand)
150 Desc: extracts the introns from the sequence $parent, which
151 must be in fasta_file(). The exons are easily parsed out of
152 a gff3 file and can be given as an arrayref of [start, end]
153 coords. Strand is either +1 or -1, in the latter case all intron
154 sequences are reverse complemented and listed in reverse order.
155 Ret: A list of Bio::Seq object containing the introns.
156 Args:
157 Side Effects:
158 Example:
160 =cut
162 method extract_introns(Str $parent, Str $id, ArrayRef $exons, Int $strand) {
164 #print "PROCESSED EXON HASH: ".Dumper($exons);
166 my @introns = ();
168 for (my $i=0; $i<@{$exons}-1; $i++) {
169 my $intron_start = $exons->[$i]->[1];
170 my $intron_end = $exons->[$i+1]->[0];
171 my $intron_seq = $self->seq_objects()->{$parent}->subseq($intron_start+1, $intron_end-1);
173 my $intron_obj = Bio::Seq->new(-seq=>$intron_seq, -id=>$id."_intron");
174 if ($strand<0) { $intron_obj = $intron_obj->revcom(); }
176 push @introns, $intron_obj
179 if ($strand < 0) {
180 @introns = reverse(@introns);
183 for (my $i = 0; $i< @introns; $i++) {
184 $introns[$i]->id( $introns[$i]->id()."_".($i+1));
187 return @introns;
192 sub run {
193 my $self = shift;
194 my $opt = shift;
195 my $args = shift;
197 $self->read_sequences();
198 print STDERR "Processing the GFF3 file...\n";
200 my $gffio = Bio::Tools::GFF->new(
201 -file=>$self->gff_file(),
202 -gff_version=>3
205 my $upstream_out = Bio::SeqIO->new(
206 -file=>">".$self->fasta_file().".upstream_".$self->length(),
207 -format=>"fasta"
209 my $downstream_out = Bio::SeqIO->new(
210 -file=>">".$self->fasta_file().".downstream_".$self->length(),
211 -format=>"fasta"
213 my $introns_out = Bio::SeqIO->new(
214 -file=>">".$self->fasta_file().".introns",
215 -format=>'fasta'
218 my %exons = ();
219 my $gene_id = "";
220 my $previous_id = "";
221 my $previous_strand = "";
222 my $previous_parent = "";
223 my $gene_parent = "";
224 my $parent = "";
225 my $strand = 0;
226 my $start = 0;
227 my $end = 0;
228 my $id = "";
229 while (my $f = $gffio->next_feature()) {
230 $start = $f->start();
231 $end = $f->end();
232 $strand = $f->strand();
234 $parent = $f->seq_id();
235 my $primary =$f->primary_tag();
236 my $source_tag = $f->source_tag();
237 $id = $f->primary_id();
239 if ($primary eq 'gene') {
240 $previous_id=$gene_id;
241 $previous_parent = $gene_parent;
242 $gene_parent = $parent;
243 $gene_id = $id;
246 print STDERR "Processing gene $gene_id [ $start, $end ] \r";
248 if ($self->upstream()) {
249 if (!$end) { print STDERR "skipping $id (end is 0)...\n"; next; }
250 my $seq = $self->extract_upstream($gene_parent, $id, $start, $end, $strand);
251 $upstream_out->write_seq($seq);
254 if ($self->downstream()) {
255 my $seq = $self->extract_downstream($gene_parent, $id, $start, $end, $strand);
256 $downstream_out->write_seq($seq);
260 if ($self->intron() && $primary eq 'exon') {
262 #print STDERR "Processing exon $id. Parent = $parent Source = $source_tag primary $primary [ $start, $end ]\n";
263 push @{$exons{$gene_id}}, [$start, $end];
265 #print "EXON HASH: ".Dumper(%exons);
267 if ($self->intron() && $primary eq 'exon' && $previous_id && ($previous_id ne $gene_id)) {
268 print STDERR "Outputting introns...($previous_id, $id)\n";
269 if (exists($exons{$previous_id})) {
272 my @introns = $self->extract_introns($previous_parent, $previous_id, [ map { $_ } @{$exons{$previous_id}} ], $strand);
273 for (my $i = 0; $i< @introns; $i++) {
274 $introns_out->write_seq($introns[$i]);
282 $previous_strand = $strand;
286 if ($self->intron()) {
287 #print STDERR "Processing last entry... $gene_id, $strand\n";
288 my @introns = $self->extract_introns($parent, $gene_id, [ map { $_ } @{$exons{$gene_id}} ], $strand);
289 foreach my $i (@introns) {
290 $introns_out->write_seq($i);
294 $introns_out->close();
295 $upstream_out->close();
296 $downstream_out->close();
298 print STDERR "\nDone\n";
300 return 0; # means success here.