6 class CXGN
::Tools
::ExtractSeq
{
8 with
'MooseX::Runnable';
15 has fasta_file
=> (is
=> 'rw',
18 traits
=> [ 'Getopt' ],
20 documentation
=> 'Input file in fasta format',
21 default => 'largefasta',
25 has format
=> (is
=>'rw',
28 traits
=> ['Getopt' ],
29 documentation
=> 'the format of the fasta file to use. Either fasta or largefasta',
32 has gff_file
=> (is
=> 'rw',
37 documentation
=> 'gff3 file containing gene models',
41 has
length => (is
=>'rw',
47 documentation
=> 'the length of the sequence to be extracted',
51 has seq_objects
=> (is
=> 'rw',
55 has upstream
=> (is
=> 'rw',
59 has downstream
=> (is
=> 'rw',
63 has intron
=> (is
=> 'rw',
69 method read_sequences
{
71 print STDERR
"Reading the sequence data... :-) \n";
74 my $seqio = Bio
::SeqIO
->new(-format
=>$self->format(), -file
=>$self->fasta_file());
75 while (my $s = $seqio->next_seq()) {
79 $self->seq_objects(\
%seqs);
84 =head2 extract_upstream
86 Usage: $e->extract_upstream(Str $parent, Str $id, Int $start, Int $end, Int $strand)
87 Desc: extracts the upstream sequence of sequence $id with
88 $coord $start and $end on sequence $parent (which must
89 occur in fasta_file(). If $strand is negative, the upstream
90 sequence is cut out from the end and reverse complemented.
91 Ret: a Bio::Seq object with the upstream sequence
98 method extract_upstream
(Str
$parent, Str
$id, Int
$start, Int
$end, Int
$strand) {
101 my %seqs = %{$self->seq_objects()};
103 my $length = $self->length();
104 if (!exists($seqs{$parent})) {
105 print STDERR
"Don't know about $parent\n";
111 if ($length>$start) { $length = $start; }
112 my $upstream_start = $start - $length;
113 my $upstream_end = $start -1;
115 if ($upstream_start < 1) { $upstream_start = 1; }
116 if ($upstream_end < 1) { $upstream_end =1; }
118 print STDERR
"Subseq $upstream_start - $upstream_end \r";
119 $promoter = $seqs{$parent}->subseq($upstream_start, $upstream_end);
122 elsif ($strand < 0) {
123 my $upstream_start = $end +1;
125 my $upstream_end = $end + $self->length();
126 if ($upstream_end > $seqs{$parent}->length()) {
127 $upstream_end = $seqs{$parent}->length();
130 print STDERR
"Subseq $upstream_start - $upstream_end \r";
131 my $forward = $seqs{$parent}->subseq($upstream_start, $upstream_end);
132 my $revseq = Bio
::Seq
->new(-seq
=>$forward);
133 $promoter = $revseq->revcom()->seq();
136 my $seq = Bio
::Seq
->new(-seq
=>$promoter, -id
=>$id."_promoter_".$length);
140 method extract_downstream
(Str
$parent, Str
$id, Int
$start, Int
$end, Int
$strand) {
141 my $downstream = $self->extract_upstream($parent, $id, $start, $end, $strand * (-1));
142 $downstream->seq($downstream->revcom()->seq());
143 $downstream->id($id."_downstream_".$self->length());
147 =head2 extract_introns
149 Usage: $e->extract_introns(Str $parent, Str $id, ArrayRef $exons, Int $strand)
150 Desc: extracts the introns from the sequence $parent, which
151 must be in fasta_file(). The exons are easily parsed out of
152 a gff3 file and can be given as an arrayref of [start, end]
153 coords. Strand is either +1 or -1, in the latter case all intron
154 sequences are reverse complemented and listed in reverse order.
155 Ret: A list of Bio::Seq object containing the introns.
162 method extract_introns
(Str
$parent, Str
$id, ArrayRef
$exons, Int
$strand) {
164 #print "PROCESSED EXON HASH: ".Dumper($exons);
168 for (my $i=0; $i<@
{$exons}-1; $i++) {
169 my $intron_start = $exons->[$i]->[1];
170 my $intron_end = $exons->[$i+1]->[0];
171 my $intron_seq = $self->seq_objects()->{$parent}->subseq($intron_start+1, $intron_end-1);
173 my $intron_obj = Bio
::Seq
->new(-seq
=>$intron_seq, -id
=>$id."_intron");
174 if ($strand<0) { $intron_obj = $intron_obj->revcom(); }
176 push @introns, $intron_obj
180 @introns = reverse(@introns);
183 for (my $i = 0; $i< @introns; $i++) {
184 $introns[$i]->id( $introns[$i]->id()."_".($i+1));
197 $self->read_sequences();
198 print STDERR
"Processing the GFF3 file...\n";
200 my $gffio = Bio
::Tools
::GFF
->new(
201 -file
=>$self->gff_file(),
205 my $upstream_out = Bio
::SeqIO
->new(
206 -file
=>">".$self->fasta_file().".upstream_".$self->length(),
209 my $downstream_out = Bio
::SeqIO
->new(
210 -file
=>">".$self->fasta_file().".downstream_".$self->length(),
213 my $introns_out = Bio
::SeqIO
->new(
214 -file
=>">".$self->fasta_file().".introns",
220 my $previous_id = "";
221 my $previous_strand = "";
222 my $previous_parent = "";
223 my $gene_parent = "";
229 while (my $f = $gffio->next_feature()) {
230 $start = $f->start();
232 $strand = $f->strand();
234 $parent = $f->seq_id();
235 my $primary =$f->primary_tag();
236 my $source_tag = $f->source_tag();
237 $id = $f->primary_id();
239 if ($primary eq 'gene') {
240 $previous_id=$gene_id;
241 $previous_parent = $gene_parent;
242 $gene_parent = $parent;
246 print STDERR
"Processing gene $gene_id [ $start, $end ] \r";
248 if ($self->upstream()) {
249 if (!$end) { print STDERR
"skipping $id (end is 0)...\n"; next; }
250 my $seq = $self->extract_upstream($gene_parent, $id, $start, $end, $strand);
251 $upstream_out->write_seq($seq);
254 if ($self->downstream()) {
255 my $seq = $self->extract_downstream($gene_parent, $id, $start, $end, $strand);
256 $downstream_out->write_seq($seq);
260 if ($self->intron() && $primary eq 'exon') {
262 #print STDERR "Processing exon $id. Parent = $parent Source = $source_tag primary $primary [ $start, $end ]\n";
263 push @
{$exons{$gene_id}}, [$start, $end];
265 #print "EXON HASH: ".Dumper(%exons);
267 if ($self->intron() && $primary eq 'exon' && $previous_id && ($previous_id ne $gene_id)) {
268 print STDERR
"Outputting introns...($previous_id, $id)\n";
269 if (exists($exons{$previous_id})) {
272 my @introns = $self->extract_introns($previous_parent, $previous_id, [ map { $_ } @
{$exons{$previous_id}} ], $strand);
273 for (my $i = 0; $i< @introns; $i++) {
274 $introns_out->write_seq($introns[$i]);
282 $previous_strand = $strand;
286 if ($self->intron()) {
287 #print STDERR "Processing last entry... $gene_id, $strand\n";
288 my @introns = $self->extract_introns($parent, $gene_id, [ map { $_ } @
{$exons{$gene_id}} ], $strand);
289 foreach my $i (@introns) {
290 $introns_out->write_seq($i);
294 $introns_out->close();
295 $upstream_out->close();
296 $downstream_out->close();
298 print STDERR
"\nDone\n";
300 return 0; # means success here.