12 my ($outfile,@dirs) = @ARGV;
13 @dirs or die "must provide a list of directories\n";
15 my $donefile = 'xml2gff.already_done';
17 my $jobs = parallel_gthxml_to_gff3
->new(
18 donefile
=> $donefile,
23 $jobs->max_workers( 7 );
29 ####### PACKAGES ##########################
33 package parallel_gthxml_to_gff3
;
35 use MooseX
::Types
::Path
::Class
;
43 with
'MooseX::Workers';
45 # file where we store which xml files we have already processed
46 has
'donefile' => ( is
=> 'ro',
47 isa
=> 'Path::Class::File',
52 # append our gff3 output to this file
53 has
'outfile' => ( is
=> 'ro',
54 isa
=> 'Path::Class::File',
59 # keep a hashref of what files were done when we started,
60 # parsed from our donefile on program start
61 has
'files_done' => ( is
=> 'ro',
64 ); sub _build_files_done
{
67 return {} unless -f
$self->donefile;
69 print "indexing donefile ... ";
70 tie
my %done, 'DB_File', $self->donefile.'.index', O_CREAT
|O_RDWR
;
71 my $d = $self->donefile->openr;
72 $done{ normalize_filename
($_) } = 1 while <$d>;
77 return shift->files_done->{ normalize_filename
(@_) };
79 sub normalize_filename
{
80 $_[0] =~ m
|/(\d+/\d
+\
.xml
)$|
85 #arrayref of dirs we are searching for files
86 has
'dirs' => ( is
=> 'ro',
91 # run a find process to find the XML files we want
92 has
'find_handle' => ( is
=> 'ro',
95 ); sub _build_find_handle
{
96 open my $f, "find ".join(' ',map dir
($_)->absolute, @
{shift->dirs})." -type f -and -name '*.xml' |";
100 # wrap MX::Workers enqueue to just take an xml file name, and generate
102 around enqueue
=> sub {
103 my ( $orig, $self, $xml_file ) = @_;
105 if( $self->is_done( $xml_file ) ) {
106 #print "$xml_file skipped\n";
109 print "$xml_file queued ...\n";
112 my $gff3_out_file = File
::Temp
->new;
113 $gff3_out_file->close;
115 GTH_XML_2_GFF3
->gthxml_to_gff3( $xml_file, "$gff3_out_file" );
118 warn "$xml_file parse failed:\n$@";
120 # dump the converted results to our output file
121 open my $gff3_fh,'<', "$gff3_out_file";
122 local $SIG{INT
}; #< ignore interrupts
126 while( <$gff3_fh> ) {
127 unless( /^##gff-version/ ) {
128 $l ||= File
::Flock
->new( $self->outfile );
129 $out_fh ||= $self->outfile->open('>>');
135 # record this file as done
136 { my $l = File
::Flock
->new( $self->donefile );
137 $self->donefile->open('>>')->print("$xml_file\n")
149 my $files = $self->find_handle;
151 # print just one gff3 header in the outfile
152 $self->outfile->open('>>')->print("##gff-version 3\n");
160 shift->fill_queue if rand() < 0.3;
166 my $queue_max = $self->max_workers*5;
167 my $queue = $self->Engine->process_queue;
169 while( @
$queue < $queue_max ) {
170 my $xml_file = $self->find_handle->getline;
171 last unless $xml_file;
173 redo unless $self->enqueue( $xml_file );
177 # sub worker_manager_start { warn 'started worker manager' }
178 # sub worker_manager_stop { warn 'stopped worker manager' }
179 # sub max_workers_reached { warn 'max workers reached' }
181 sub worker_stdout
{ shift; warn join ' ', @_; }
182 sub worker_stderr
{ shift; warn join ' ', @_; }
183 sub worker_error
{ shift; warn join ' ', @_; }
184 # sub worker_started { shift; warn join ' ', @_; }
185 # sub sig_child { shift; warn join ' ', @_; }
189 package GTH_XML_2_GFF3
;
192 use English qw
/ -no_match_vars /;
198 my ( $class, $outfile, $gff3_out_file ) = @_;
200 my $pm = $class->_parse_mode;
202 and die "_parse_mode() cannot be 'alignments'. consider using 'alignments_merged'";
204 my $gth_in = Bio
::FeatureIO
->new( -format
=> 'gthxml', -file
=> $outfile,
206 -attach_alignments
=> 1,
208 my $gff3_out = $class->_open_gff3_out( $gff3_out_file );
209 while ( my $f = $gth_in->next_feature ) {
211 # set each feature's source to the name of the gth subclass that's running this
212 $class->_recursive_set_source( $f, $class->_source );
214 # do additional processing on the feature if necessary
215 # (can be implemented in subclasses)
216 $class->_process_gff3_feature( $f );
218 # make some ID and Parent tags in the subfeatures
219 $class->_make_gff3_id_and_parent($f);
221 # stringify the supporting_alignment's to their IDs
222 $f->add_Annotation('supporting_alignment',$_)
223 for map { $_->value->get_Annotations('ID') }
224 $f->remove_Annotations('supporting_alignment');
226 # write the feature to the gff3 file
227 $gff3_out->write_feature($f);
229 }; if( $EVAL_ERROR ) {
230 #workaround for a gth bug. will probably be fixed when we upgrade gth
231 die $EVAL_ERROR unless $EVAL_ERROR =~ /not well-formed \(invalid token\)/;
232 open my $gff3, '>', $gff3_out_file;
235 # no results. genomethreader produced invalid output XML.
237 open my $out, '>', $outfile;
242 # does nothing here, but may be implemented in subclasses
243 sub _process_gff3_feature
{
246 #get the source name to use in our GFF3
251 #recursively set the source on a feature and its subfeatures
252 sub _recursive_set_source
{
253 my ($self,$feature,$newsource) = @_;
254 $feature->source($newsource);
255 $self->_recursive_set_source($_,$newsource) for $feature->get_SeqFeatures;
258 # object OR class method to
259 # open a gff3 outfile with the right version and sequence-region
261 my ($self,$outfile) = @_;
263 # handle for out merged output file
264 return Bio
::FeatureIO
->new( -file
=> ">$outfile",
271 #take a feature hierarchy, manufacture ID and Parent tags to encode
272 #the hierarchical relationships, adding them to the features
273 sub _make_gff3_id_and_parent
{
274 my ($class,$feat,$parent_ID) = @_;
276 $feat->add_Annotation('Parent',Bio
::Annotation
::SimpleValue
->new(-value
=> $parent_ID))
277 if defined $parent_ID;
279 #make a unique id for this thing, keeping our id counters on a
281 if(my $idstem = $class->_feature_id($feat,$parent_ID)) {
282 my $uniqid = $class->_unique_bio_annotation_id($idstem);
283 $feat->add_Annotation('ID',Bio
::Annotation
::SimpleValue
->new(-value
=> $uniqid));
284 #recursively ID and Parent all the subfeatures, if any
285 $class->_make_gff3_id_and_parent($_,$uniqid) for $feat->get_SeqFeatures;
289 #given a stem, make a ID that's unique to this analysis
290 #by appending a number to the stem
292 sub _unique_bio_annotation_id
{
293 my ($class,$idstem) = @_;
294 $uniq_id_ctrs{$class} ||= {};
295 return Bio
::Annotation
::SimpleValue
->new(-value
=> $idstem.'_'.++$uniq_id_ctrs{$class}{$idstem});
299 my ($class,$feat,$parent_ID) = @_;
300 if($feat->type->name eq 'mRNA') {
302 } elsif ( $feat->type->name eq 'match') {
303 #get the target name of the first subfeature's target
304 my ($target_id) = (($feat->get_SeqFeatures)[0]->get_Annotations('Target'))[0]->target_id;
305 $target_id.'_alignment'
306 } elsif ( $feat->type->name eq 'region') {
308 } else { #just name the feature for its source and type
309 my $src = $feat->source;
310 $src =~ s/GenomeThreader/GTH/; #< shorten the sources a little
311 $src =~ s/(tom|pot)ato/$1/; #< shorten the sources a little
312 $src.'_'.$feat->type->name;
321 } # end of BEGIN block