universal isa is deprecated and throws compile error
[ITAG.git] / hacks / itag_remap_xml2gff3.pl
blob02e2a14581d2ac7b409c8d49c9c7050f45286d2f
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use autodie ':all';
6 use Carp;
7 use Getopt::Std;
9 use 5.10.0;
12 my ($outfile,@dirs) = @ARGV;
13 @dirs or die "must provide a list of directories\n";
15 my $donefile = 'xml2gff.already_done';
17 my $jobs = parallel_gthxml_to_gff3->new(
18 donefile => $donefile,
19 outfile => $outfile,
20 dirs => \@dirs
23 $jobs->max_workers( 7 );
24 $jobs->run;
26 exit;
29 ####### PACKAGES ##########################
32 BEGIN {
33 package parallel_gthxml_to_gff3;
34 use Moose;
35 use MooseX::Types::Path::Class;
36 use File::Flock;
37 use Path::Class;
38 use autodie ':all';
40 use POSIX;
41 use DB_File;
43 with 'MooseX::Workers';
45 # file where we store which xml files we have already processed
46 has 'donefile' => ( is => 'ro',
47 isa => 'Path::Class::File',
48 required => 1,
49 coerce => 1,
52 # append our gff3 output to this file
53 has 'outfile' => ( is => 'ro',
54 isa => 'Path::Class::File',
55 required => 1,
56 coerce => 1,
59 # keep a hashref of what files were done when we started,
60 # parsed from our donefile on program start
61 has 'files_done' => ( is => 'ro',
62 isa => 'HashRef',
63 lazy_build => 1,
64 ); sub _build_files_done {
65 my ( $self ) = @_;
67 return {} unless -f $self->donefile;
69 print "indexing donefile ... ";
70 tie my %done, 'DB_File', $self->donefile.'.index', O_CREAT|O_RDWR;
71 my $d = $self->donefile->openr;
72 $done{ normalize_filename($_) } = 1 while <$d>;
73 print "done.\n";
74 return \%done;
76 sub is_done {
77 return shift->files_done->{ normalize_filename(@_) };
79 sub normalize_filename {
80 $_[0] =~ m|/(\d+/\d+\.xml)$|
81 or return;
82 return $1;
85 #arrayref of dirs we are searching for files
86 has 'dirs' => ( is => 'ro',
87 isa => 'ArrayRef',
88 required => 1,
91 # run a find process to find the XML files we want
92 has 'find_handle' => ( is => 'ro',
93 isa => 'FileHandle',
94 lazy_build => 1,
95 ); sub _build_find_handle {
96 open my $f, "find ".join(' ',map dir($_)->absolute, @{shift->dirs})." -type f -and -name '*.xml' |";
97 return $f;
100 # wrap MX::Workers enqueue to just take an xml file name, and generate
101 # the worker for it
102 around enqueue => sub {
103 my ( $orig, $self, $xml_file ) = @_;
105 if( $self->is_done( $xml_file ) ) {
106 #print "$xml_file skipped\n";
107 return;
109 print "$xml_file queued ...\n";
111 $self->$orig(sub {
112 my $gff3_out_file = File::Temp->new;
113 $gff3_out_file->close;
114 eval {
115 GTH_XML_2_GFF3->gthxml_to_gff3( $xml_file, "$gff3_out_file" );
117 if( $@ ) {
118 warn "$xml_file parse failed:\n$@";
119 } else {
120 # dump the converted results to our output file
121 open my $gff3_fh,'<', "$gff3_out_file";
122 local $SIG{INT}; #< ignore interrupts
124 { my $l;
125 my $out_fh;
126 while( <$gff3_fh> ) {
127 unless( /^##gff-version/ ) {
128 $l ||= File::Flock->new( $self->outfile );
129 $out_fh ||= $self->outfile->open('>>');
130 $out_fh->print($_);
135 # record this file as done
136 { my $l = File::Flock->new( $self->donefile );
137 $self->donefile->open('>>')->print("$xml_file\n")
142 return 1;
145 sub run {
146 my $self = shift;
148 # run find first
149 my $files = $self->find_handle;
151 # print just one gff3 header in the outfile
152 $self->outfile->open('>>')->print("##gff-version 3\n");
154 $self->fill_queue;
156 POE::Kernel->run;
159 sub worker_done {
160 shift->fill_queue if rand() < 0.3;
163 sub fill_queue {
164 my $self = shift;
166 my $queue_max = $self->max_workers*5;
167 my $queue = $self->Engine->process_queue;
169 while( @$queue < $queue_max ) {
170 my $xml_file = $self->find_handle->getline;
171 last unless $xml_file;
172 chomp $xml_file;
173 redo unless $self->enqueue( $xml_file );
177 # sub worker_manager_start { warn 'started worker manager' }
178 # sub worker_manager_stop { warn 'stopped worker manager' }
179 # sub max_workers_reached { warn 'max workers reached' }
181 sub worker_stdout { shift; warn join ' ', @_; }
182 sub worker_stderr { shift; warn join ' ', @_; }
183 sub worker_error { shift; warn join ' ', @_; }
184 # sub worker_started { shift; warn join ' ', @_; }
185 # sub sig_child { shift; warn join ' ', @_; }
187 no Moose;
189 package GTH_XML_2_GFF3;
190 use strict;
191 use warnings;
192 use English qw/ -no_match_vars /;
194 use Bio::SeqIO;
195 use Bio::FeatureIO;
197 sub gthxml_to_gff3 {
198 my ( $class, $outfile, $gff3_out_file ) = @_;
200 my $pm = $class->_parse_mode;
201 $pm eq 'alignments'
202 and die "_parse_mode() cannot be 'alignments'. consider using 'alignments_merged'";
203 eval {
204 my $gth_in = Bio::FeatureIO->new( -format => 'gthxml', -file => $outfile,
205 -mode => $pm,
206 -attach_alignments => 1,
208 my $gff3_out = $class->_open_gff3_out( $gff3_out_file );
209 while ( my $f = $gth_in->next_feature ) {
211 # set each feature's source to the name of the gth subclass that's running this
212 $class->_recursive_set_source( $f, $class->_source );
214 # do additional processing on the feature if necessary
215 # (can be implemented in subclasses)
216 $class->_process_gff3_feature( $f );
218 # make some ID and Parent tags in the subfeatures
219 $class->_make_gff3_id_and_parent($f);
221 # stringify the supporting_alignment's to their IDs
222 $f->add_Annotation('supporting_alignment',$_)
223 for map { $_->value->get_Annotations('ID') }
224 $f->remove_Annotations('supporting_alignment');
226 # write the feature to the gff3 file
227 $gff3_out->write_feature($f);
229 }; if( $EVAL_ERROR ) {
230 #workaround for a gth bug. will probably be fixed when we upgrade gth
231 die $EVAL_ERROR unless $EVAL_ERROR =~ /not well-formed \(invalid token\)/;
232 open my $gff3, '>', $gff3_out_file;
233 print $gff3 <<EOF;
234 ##gff-version 3
235 # no results. genomethreader produced invalid output XML.
237 open my $out, '>', $outfile;
242 # does nothing here, but may be implemented in subclasses
243 sub _process_gff3_feature {
246 #get the source name to use in our GFF3
247 sub _source {
248 'GTH'
251 #recursively set the source on a feature and its subfeatures
252 sub _recursive_set_source {
253 my ($self,$feature,$newsource) = @_;
254 $feature->source($newsource);
255 $self->_recursive_set_source($_,$newsource) for $feature->get_SeqFeatures;
258 # object OR class method to
259 # open a gff3 outfile with the right version and sequence-region
260 sub _open_gff3_out {
261 my ($self,$outfile) = @_;
263 # handle for out merged output file
264 return Bio::FeatureIO->new( -file => ">$outfile",
265 -format => 'gff',
266 -version => 3,
271 #take a feature hierarchy, manufacture ID and Parent tags to encode
272 #the hierarchical relationships, adding them to the features
273 sub _make_gff3_id_and_parent {
274 my ($class,$feat,$parent_ID) = @_;
276 $feat->add_Annotation('Parent',Bio::Annotation::SimpleValue->new(-value => $parent_ID))
277 if defined $parent_ID;
279 #make a unique id for this thing, keeping our id counters on a
280 #per-analysis level
281 if(my $idstem = $class->_feature_id($feat,$parent_ID)) {
282 my $uniqid = $class->_unique_bio_annotation_id($idstem);
283 $feat->add_Annotation('ID',Bio::Annotation::SimpleValue->new(-value => $uniqid));
284 #recursively ID and Parent all the subfeatures, if any
285 $class->_make_gff3_id_and_parent($_,$uniqid) for $feat->get_SeqFeatures;
289 #given a stem, make a ID that's unique to this analysis
290 #by appending a number to the stem
291 my %uniq_id_ctrs;
292 sub _unique_bio_annotation_id {
293 my ($class,$idstem) = @_;
294 $uniq_id_ctrs{$class} ||= {};
295 return Bio::Annotation::SimpleValue->new(-value => $idstem.'_'.++$uniq_id_ctrs{$class}{$idstem});
298 sub _feature_id {
299 my ($class,$feat,$parent_ID) = @_;
300 if($feat->type->name eq 'mRNA') {
301 "${parent_ID}_AGS"
302 } elsif ( $feat->type->name eq 'match') {
303 #get the target name of the first subfeature's target
304 my ($target_id) = (($feat->get_SeqFeatures)[0]->get_Annotations('Target'))[0]->target_id;
305 $target_id.'_alignment'
306 } elsif ( $feat->type->name eq 'region') {
307 'PGL'
308 } else { #just name the feature for its source and type
309 my $src = $feat->source;
310 $src =~ s/GenomeThreader/GTH/; #< shorten the sources a little
311 $src =~ s/(tom|pot)ato/$1/; #< shorten the sources a little
312 $src.'_'.$feat->type->name;
316 #can be overridden
317 sub _parse_mode {
318 'both_merged';
321 } # end of BEGIN block