Bio::Tools::CodonTable::_codon_is: simplify with use all()
[bioperl-live.git] / bin / bp_genbank2gff3
blob7149b844552e67275747b0c86927a98ac1c0ddc2
1 #!/usr/bin/perl
5 =pod
7 =head1 NAME
9 bp_genbank2gff3.pl -- Genbank-E<gt>gbrowse-friendly GFF3
11 =head1 SYNOPSIS
13 bp_genbank2gff3.pl [options] filename(s)
15 # process a directory containing GenBank flatfiles
16 perl bp_genbank2gff3.pl --dir path_to_files --zip
18 # process a single file, ignore explicit exons and introns
19 perl bp_genbank2gff3.pl --filter exon --filter intron file.gbk.gz
21 # process a list of files
22 perl bp_genbank2gff3.pl *gbk.gz
24 # process data from URL, with Chado GFF model (-noCDS), and pipe to database loader
25 curl ftp://ftp.ncbi.nih.gov/genomes/Saccharomyces_cerevisiae/CHR_X/NC_001142.gbk \
26 | perl bp_genbank2gff3.pl -noCDS -in stdin -out stdout \
27 | perl gmod_bulk_load_gff3.pl -dbname mychado -organism fromdata
29 Options:
30 --noinfer -r don't infer exon/mRNA subfeatures
31 --conf -i path to the curation configuration file that contains user preferences
32 for Genbank entries (must be YAML format)
33 (if --manual is passed without --ini, user will be prompted to
34 create the file if any manual input is saved)
35 --sofile -l path to to the so.obo file to use for feature type mapping
36 (--sofile live will download the latest online revision)
37 --manual -m when trying to guess the proper SO term, if more than
38 one option matches the primary tag, the converter will
39 wait for user input to choose the correct one
40 (only works with --sofile)
41 --dir -d path to a list of genbank flatfiles
42 --outdir -o location to write GFF files (can be 'stdout' or '-' for pipe)
43 --zip -z compress GFF3 output files with gzip
44 --summary -s print a summary of the features in each contig
45 --filter -x genbank feature type(s) to ignore
46 --split -y split output to separate GFF and fasta files for
47 each genbank record
48 --nolump -n separate file for each reference sequence
49 (default is to lump all records together into one
50 output file for each input file)
51 --ethresh -e error threshold for unflattener
52 set this high (>2) to ignore all unflattener errors
53 --[no]CDS -c Keep CDS-exons, or convert to alternate gene-RNA-protein-exon
54 model. --CDS is default. Use --CDS to keep default GFF gene model,
55 use --noCDS to convert to g-r-p-e.
56 --format -f Input format (SeqIO types): GenBank, Swiss or Uniprot, EMBL work
57 (GenBank is default)
58 --GFF_VERSION 3 is default, 2 and 2.5 and other Bio::Tools::GFF versions available
59 --quiet don't talk about what is being processed
60 --typesource SO sequence type for source (e.g. chromosome; region; contig)
61 --help -h display this message
64 =head1 DESCRIPTION
66 This script uses Bio::SeqFeature::Tools::Unflattener and
67 Bio::Tools::GFF to convert GenBank flatfiles to GFF3 with gene
68 containment hierarchies mapped for optimal display in gbrowse.
70 The input files are assumed to be gzipped GenBank flatfiles for refseq
71 contigs. The files may contain multiple GenBank records. Either a
72 single file or an entire directory can be processed. By default, the
73 DNA sequence is embedded in the GFF but it can be saved into separate
74 fasta file with the --split(-y) option.
76 If an input file contains multiple records, the default behaviour is
77 to dump all GFF and sequence to a file of the same name (with .gff
78 appended). Using the 'nolump' option will create a separate file for
79 each genbank record. Using the 'split' option will create separate
80 GFF and Fasta files for each genbank record.
83 =head2 Notes
85 =head3 'split' and 'nolump' produce many files
87 In cases where the input files contain many GenBank records (for
88 example, the chromosome files for the mouse genome build), a very
89 large number of output files will be produced if the 'split' or
90 'nolump' options are selected. If you do have lists of files E<gt> 6000,
91 use the --long_list option in bp_bulk_load_gff.pl or
92 bp_fast_load_gff.pl to load the gff and/ or fasta files.
94 =head3 Designed for RefSeq
96 This script is designed for RefSeq genomic sequence entries. It may
97 work for third party annotations but this has not been tested.
98 But see below, Uniprot/Swissprot works, EMBL and possibly EMBL/Ensembl
99 if you don't mind some gene model unflattener errors (dgg).
101 =head3 G-R-P-E Gene Model
103 Don Gilbert worked this over with needs to produce GFF3 suited to
104 loading to GMOD Chado databases. Most of the changes I believe are
105 suited for general use. One main chado-specific addition is the
106 --[no]cds2protein flag
108 My favorite GFF is to set the above as ON by default (disable with --nocds2prot)
109 For general use it probably should be OFF, enabled with --cds2prot.
111 This writes GFF with an alternate, but useful Gene model,
112 instead of the consensus model for GFF3
114 [ gene > mRNA> (exon,CDS,UTR) ]
116 This alternate is
118 gene > mRNA > polypeptide > exon
120 means the only feature with dna bases is the exon. The others
121 specify only location ranges on a genome. Exon of course is a child
122 of mRNA and protein/peptide.
124 The protein/polypeptide feature is an important one, having all the
125 annotations of the GenBank CDS feature, protein ID, translation, GO
126 terms, Dbxrefs to other proteins.
128 UTRs, introns, CDS-exons are all inferred from the primary exon bases
129 inside/outside appropriate higher feature ranges. Other special gene
130 model features remain the same.
132 Several other improvements and bugfixes, minor but useful are included
134 * IO pipes now work:
135 curl ftp://ncbigenomes/... | bp_genbank2gff3 --in stdin --out stdout | gff2chado ...
137 * GenBank main record fields are added to source feature, e.g. organism, date,
138 and the sourcetype, commonly chromosome for genomes, is used.
140 * Gene Model handling for ncRNA, pseudogenes are added.
142 * GFF header is cleaner, more informative.
143 --GFF_VERSION flag allows choice of v2 as well as default v3
145 * GFF ##FASTA inclusion is improved, and
146 CDS translation sequence is moved to FASTA records.
148 * FT -> GFF attribute mapping is improved.
150 * --format choice of SeqIO input formats (GenBank default).
151 Uniprot/Swissprot and EMBL work and produce useful GFF.
153 * SeqFeature::Tools::TypeMapper has a few FT -> SOFA additions
154 and more flexible usage.
156 =head1 TODO
158 =head2 Are these additions desired?
160 * filter input records by taxon (e.g. keep only organism=xxx or taxa level = classYYY
161 * handle Entrezgene, other non-sequence SeqIO structures (really should change
162 those parsers to produce consistent annotation tags).
164 =head2 Related bugfixes/tests
166 These items from Bioperl mail were tested (sample data generating
167 errors), and found corrected:
169 From: Ed Green <green <at> eva.mpg.de>
170 Subject: genbank2gff3.pl on new human RefSeq
171 Date: 2006-03-13 21:22:26 GMT
172 -- unspecified errors (sample data works now).
174 From: Eric Just <e-just <at> northwestern.edu>
175 Subject: genbank2gff3.pl
176 Date: 2007-01-26 17:08:49 GMT
177 -- bug fixed in genbank2gff3 for multi-record handling
179 This error is for a /trans_splice gene that is hard to handle, and
180 unflattner/genbank2 doesn't
182 From: Chad Matsalla <chad <at> dieselwurks.com>
183 Subject: genbank2gff3.PLS and the unflatenner - Inconsistent order?
184 Date: 2005-07-15 19:51:48 GMT
186 =head1 AUTHOR
188 Sheldon McKay (mckays@cshl.edu)
190 Copyright (c) 2004 Cold Spring Harbor Laboratory.
192 =head2 AUTHOR of hacks for GFF2Chado loading
194 Don Gilbert (gilbertd@indiana.edu)
197 =cut
199 use strict;
200 use warnings;
202 use Pod::Usage;
203 use Bio::Root::RootI;
204 use Bio::SeqIO;
205 use File::Spec;
206 use Bio::SeqFeature::Tools::Unflattener;
207 use Bio::SeqFeature::Tools::TypeMapper;
208 use Bio::SeqFeature::Tools::IDHandler;
209 use Bio::Location::SplitLocationI;
210 use Bio::Location::Simple;
211 use Bio::Tools::GFF;
212 use Getopt::Long;
213 use List::Util qw(first);
214 use Bio::OntologyIO;
215 use YAML qw(Dump LoadFile DumpFile);
216 use File::Basename;
218 use vars qw/$split @filter $zip $outdir $help $ethresh
219 $ONTOLOGY %FEATURES %DESCENDANTS @RETURN $MANUAL @GFF_LINE_FEAT
220 $CONF $YAML $TYPE_MAP $SYN_MAP $noinfer $SO_FILE
221 $file @files $dir $summary $nolump
222 $source_type %proteinfa %exonpar $didheader $verbose $DEBUG $GFF_VERSION
223 $gene_id $rna_id $tnum $ncrna_id $rnum %method %id %seen/;
225 use constant SO_URL =>
226 'http://song.cvs.sourceforge.net/viewvc/*checkout*/song/ontology/so.obo';
227 use constant ALPHABET => [qw(a b c d e f g h i j k l m n o p q r s t u v w x y z)];
228 use constant ALPHABET_TO_NUMBER => {
229 a => 0, b => 1, c => 2, d => 3, e => 4, f => 5, g => 6, h => 7, i => 8,
230 j => 9, k => 10, l => 11, m => 12, n => 13, o => 14, p => 15, q => 16,
231 r => 17, s => 18, t => 19, u => 20, v => 21, w => 22, x => 23, y => 24,
232 z => 25,
234 use constant ALPHABET_DIVISOR => 26;
235 use constant GM_NEW_TOPLEVEL => 2;
236 use constant GM_NEW_PART => 1;
237 use constant GM_DUP_PART => 0;
238 use constant GM_NOT_PART => -1;
240 # Options cycle in multiples of 2 because of left side/right side pairing.
241 # You can make this number odd, but displayed matches will still round up
242 use constant OPTION_CYCLE => 6;
244 $GFF_VERSION = 3; # allow v2 ...
245 $verbose = 1; # right default? -nov to turn off
247 # dgg: change the gene model to Gene/mRNA/Polypeptide/exons...
248 my $CDSkeep= 1; # default should be ON (prior behavior), see gene_features()
249 my $PROTEIN_TYPE = 'polypeptide'; # for noCDSkeep;
250 # protein = flybase chado usage; GMOD Perls use 'polypeptide' with software support
252 my $FORMAT="GenBank"; # swiss ; embl; genbank ; ** guess from SOURCEID **
253 my $SOURCEID= $FORMAT; # "UniProt" "GenBank" "EMBL" should work
254 # other Bio::SeqIO formats may work. TEST: EntrezGene < problematic tags; InterPro KEGG
257 my %TAG_MAP = (
258 db_xref => 'Dbxref',
259 name => 'Name',
260 note => 'Note', # also pull GO: ids into Ontology_term
261 synonym => 'Alias',
262 symbol => 'Alias', # is symbol still used?
263 # protein_id => 'Dbxref', also seen Dbxref tags: EC_number
264 # translation: handled in gene_features
268 $| = 1;
269 my $quiet= !$verbose;
270 my $ok= GetOptions( 'd|dir|input:s' => \$dir,
271 'z|zip' => \$zip,
272 'h|help' => \$help,
273 's|summary' => \$summary,
274 'r|noinfer' => \$noinfer,
275 'i|conf=s' => \$CONF,
276 'sofile=s' => \$SO_FILE,
277 'm|manual' => \$MANUAL,
278 'o|outdir|output:s'=> \$outdir,
279 'x|filter:s'=> \@filter,
280 'y|split' => \$split,
281 "ethresh|e=s"=>\$ethresh,
282 'c|CDS!' => \$CDSkeep,
283 'f|format=s' => \$FORMAT,
284 'typesource=s' => \$source_type,
285 'GFF_VERSION=s' => \$GFF_VERSION,
286 'quiet!' => \$quiet, # swap quiet to verbose
287 'DEBUG!' => \$DEBUG,
288 'n|nolump' => \$nolump);
290 my $lump = 1 unless $nolump || $split;
291 $verbose= !$quiet;
293 # look for help request
294 pod2usage(2) if $help || !$ok;
296 # keep SOURCEID as-is and change FORMAT for SeqIO types;
297 # note SeqIO uses file.suffix to guess type; not useful here
298 $SOURCEID= $FORMAT;
299 $FORMAT = "swiss" if $FORMAT =~/UniProt|trembl/;
300 $verbose =1 if($DEBUG);
302 # initialize handlers
303 my $unflattener = Bio::SeqFeature::Tools::Unflattener->new; # for ensembl genomes (-trust_grouptag=>1);
304 $unflattener->error_threshold($ethresh) if $ethresh;
305 $unflattener->verbose(1) if($DEBUG);
306 # $unflattener->group_tag('gene') if($FORMAT =~ /embl/i) ; #? ensembl only?
307 # ensembl parsing is still problematic, forget this
309 my $tm = Bio::SeqFeature::Tools::TypeMapper->new;
310 my $idh = Bio::SeqFeature::Tools::IDHandler->new;
312 # dgg
313 $source_type ||= "region"; # should really parse from FT.source contents below
315 #my $FTSOmap = $tm->FT_SO_map();
316 my $FTSOmap;
317 my $FTSOsynonyms;
319 if (defined($SO_FILE) && $SO_FILE eq 'live') {
320 print "\nDownloading the latest SO file from ".SO_URL."\n\n";
321 use LWP::UserAgent;
322 my $ua = LWP::UserAgent->new(timeout => 30);
323 my $request = HTTP::Request->new(GET => SO_URL);
324 my $response = $ua->request($request);
327 if ($response->status_line =~ /200/) {
328 use File::Temp qw/ tempfile /;
329 my ($fh, $fn) = tempfile();
330 print $fh $response->content;
331 $SO_FILE = $fn;
332 } else {
333 print "Couldn't download SO file online...skipping validation.\n"
334 . "HTTP Status was " . $response->status_line . "\n"
335 and undef $SO_FILE
339 if ($SO_FILE) {
342 my (%terms, %syn);
344 my $parser = Bio::OntologyIO->new( -format => "obo", -file => $SO_FILE );
345 $ONTOLOGY = $parser->next_ontology();
347 for ($ONTOLOGY->get_all_terms) {
348 my $feat = $_;
350 $terms{$feat->name} = $feat->name;
351 #$terms{$feat->name} = $feat;
353 my @syn = $_->each_synonym;
355 push @{$syn{$_}}, $feat->name for @syn;
356 #push @{$syn{$_}}, $feat for @syn;
359 $FTSOmap = \%terms;
360 $FTSOsynonyms = \%syn;
362 my %hardTerms = %{ $tm->FT_SO_map() };
363 map { $FTSOmap->{$_} ||= $hardTerms{$_} } keys %hardTerms;
365 } else {
366 my %terms = %{ $tm->FT_SO_map() };
367 while (my ($k,$v) = each %terms) {
368 $FTSOmap->{$k} = ref($v) ? shift @$v : $v;
372 $TYPE_MAP = $FTSOmap;
373 $SYN_MAP = $FTSOsynonyms;
376 # #convert $FTSOmap undefined to valid SO : moved to TypeMapper->map_types( -undefined => "region")
378 # stringify filter list if applicable
379 my $filter = join ' ', @filter if @filter;
381 # determine input files
382 my $stdin=0; # dgg: let dir == stdin == '-' for pipe use
383 if ($dir && ($dir eq '-' || $dir eq 'stdin')) {
384 $stdin=1; $dir=''; @files=('stdin');
386 } elsif ( $dir ) {
387 if ( -d $dir ) {
388 opendir DIR, $dir or die "could not open $dir for reading: $!";
389 @files = map { "$dir/$_";} grep { /\.gb.*/ } readdir DIR;
390 closedir DIR;
392 else {
393 die "$dir is not a directory\n";
396 else {
397 @files = @ARGV;
398 $dir = '';
401 # we should have some files by now
402 pod2usage(2) unless @files;
405 my $stdout=0; # dgg: let outdir == stdout == '-' for pipe use
406 if($outdir && ($outdir eq '-' || $outdir eq 'stdout')) {
407 warn("std. output chosen: cannot split\n") if($split);
408 warn("std. output chosen: cannot zip\n") if($zip);
409 warn("std. output chosen: cannot nolump\n") if($nolump);
410 $stdout=1; $lump=1; $split= 0; $zip= 0; # unless we pipe stdout thru gzip
412 } elsif ( $outdir && !-e $outdir ) {
413 mkdir($outdir) or die "could not create directory $outdir: $!\n";
415 elsif ( !$outdir ) {
416 $outdir = $dir || '.';
419 for my $file ( @files ) {
420 # dgg ; allow 'stdin' / '-' input ?
421 chomp $file;
422 die "$! $file" unless($stdin || -e $file);
423 print "# Input: $file\n" if($verbose);
425 my ($lump_fh, $lumpfa_fh, $outfile, $outfa);
426 if ($stdout) {
427 $lump_fh= *STDOUT; $lump="stdout$$";
428 $outfa= "stdout$$.fa"; # this is a temp file ... see below
429 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
431 } elsif ( $lump ) {
432 my ($vol,$dirs,$fileonly) = File::Spec->splitpath($file);
433 $lump = File::Spec->catfile($outdir, $fileonly.'.gff');
434 ($outfa = $lump) =~ s/\.gff/\.fa/;
435 open $lump_fh, ">$lump" or die "Could not create a lump outfile called ($lump) because ($!)\n";
436 open $lumpfa_fh, ">$outfa" or die "Could not create a lump outfile called ($outfa) because ($!)\n";
440 # open input file, unzip if req'd
441 if ($stdin) {
442 *FH = *STDIN;
443 } elsif ( $file =~ /\.gz/ ) {
444 open FH, "gunzip -c $file |";
446 else {
447 open FH, '<', $file;
450 my $in = Bio::SeqIO->new(-fh => \*FH, -format => $FORMAT, -debug=>$DEBUG);
451 my $gffio = Bio::Tools::GFF->new( -noparse => 1, -gff_version => $GFF_VERSION );
453 while ( my $seq = $in->next_seq() ) {
454 my $seq_name = $seq->accession_number;
455 my $end = $seq->length;
456 my @to_print;
458 # arrange disposition of GFF output
459 $outfile = $lump || File::Spec->catfile($outdir, $seq_name.'.gff');
460 my $out;
462 if ( $lump ) {
463 $outfile = $lump;
464 $out = $lump_fh;
466 else {
467 $outfile = File::Spec->catfile($outdir, $seq_name.'.gff');
468 open $out, ">$outfile";
471 # filter out unwanted features
472 my $source_feat= undef;
473 my @source= filter($seq); $source_feat= $source[0];
475 ($source_type,$source_feat)=
476 getSourceInfo( $seq, $source_type, $source_feat ) ;
477 # always; here we build main prot $source_feat; # if @source;
479 # abort if there are no features
480 warn "$seq_name has no features, skipping\n" and next
481 if !$seq->all_SeqFeatures;
484 $FTSOmap->{'source'} = $source_type;
485 ## $FTSOmap->{'CDS'}= $PROTEIN_TYPE; # handle this in gene_features
487 # construct a GFF header
488 # add: get source_type from attributes of source feature? chromosome=X tag
489 # also combine 1st ft line here with source ft from $seq ..
490 my($header,$info)= gff_header($seq_name, $end, $source_type, $source_feat);
491 print $out $header;
492 print "# working on $info\n" if($verbose);
494 # unflatten gene graphs, apply SO types, etc; this also does TypeMapper ..
495 unflatten_seq($seq);
497 # Note that we use our own get_all_SeqFeatures function
498 # to rescue cloned exons
500 @GFF_LINE_FEAT = ();
501 for my $feature ( get_all_SeqFeatures($seq) ) {
503 my $method = $feature->primary_tag;
504 next if($SOURCEID =~/UniProt|swiss|trembl/i && $method ne $source_type);
506 $feature->seq_id($seq->id) unless($feature->seq_id);
507 $feature->source_tag($SOURCEID);
509 # dgg; need to convert some Genbank to GFF tags: note->Note; db_xref->Dbxref;
510 ## also, pull any GO:000 ids from /note tag and put into Ontology_term
511 maptags2gff($feature);
513 # current gene name. The unflattened gene features should be in order so any
514 # exons, CDSs, etc that follow will belong to this gene
515 my $gene_name;
516 if ( $method eq 'gene' || $method eq 'pseudogene' ) {
517 @to_print= print_held($out, $gffio, \@to_print);
518 $gene_id = $gene_name= gene_name($feature);
519 } else {
520 $gene_name= gene_name($feature);
523 #?? should gene_name from /locus_tag,/gene,/product,/transposon=xxx
524 # be converted to or added as Name=xxx (if not ID= or as well)
525 ## problematic: convert_to_name ($feature); # drops /locus_tag,/gene, tags
526 convert_to_name($feature);
528 ## dgg: extended to protein|polypeptide
529 ## this test ($feature->has_tag('gene') ||) is not good: repeat_regions over genes
530 ## in yeast have that genbank tag; why?
531 ## these include pseudogene ...
533 ## Note we also have mapped types to SO, so these RNA's are now transcripts:
534 # pseudomRNA => "pseudogenic_transcript",
535 # pseudotranscript" => "pseudogenic_transcript",
536 # misc_RNA=>'processed_transcript',
538 warn "#at: $method $gene_id/$gene_name\n" if $DEBUG;
540 if ( $method =~ /(gene|RNA|CDS|exon|UTR|protein|polypeptide|transcript)/
541 || ( $gene_id && $gene_name eq $gene_id ) ) {
543 my $action = gene_features($feature, $gene_id, $gene_name); # -1, 0, 1, 2 result
544 if ($action == GM_DUP_PART) {
545 # ignore, this is dupl. exon with new parent ...
547 } elsif ($action == GM_NOT_PART) {
548 add_generic_id( $feature, $gene_name, "nocount");
549 my $gff = $gffio->gff_string($feature);
550 push @GFF_LINE_FEAT, $feature;
551 #print $out "$gff\n" if $gff;
553 } elsif ($action > 0) {
554 # hold off print because exon etc. may get 2nd, 3rd parents
555 @to_print= print_held($out, $gffio, \@to_print) if ($action == GM_NEW_TOPLEVEL);
556 push(@to_print, $feature);
561 # otherwise handle as generic feats with IDHandler labels
562 else {
563 add_generic_id( $feature, $gene_name, "");
564 my $gff= $gffio->gff_string($feature);
565 push @GFF_LINE_FEAT, $feature;
566 #print $out "$gff\n" if $gff;
570 # don't like doing this after others; do after each new gene id?
571 @to_print= print_held($out, $gffio, \@to_print);
573 gff_validate(@GFF_LINE_FEAT);
575 for my $feature (@GFF_LINE_FEAT) {
576 my $gff= $gffio->gff_string($feature);
577 print $out "$gff\n" if $gff;
580 # deal with the corresponding DNA
581 my ($fa_out,$fa_outfile);
582 my $dna = $seq->seq;
583 if($dna || %proteinfa) {
584 $method{'RESIDUES'} += length($dna);
585 $dna =~ s/(\S{60})/$1\n/g;
586 $dna .= "\n";
588 if ($split) {
589 $fa_outfile = $outfile;
590 $fa_outfile =~ s/gff$/fa/;
591 open $fa_out, ">$fa_outfile" or die $!;
592 print $fa_out ">$seq_name\n$dna" if $dna;
593 foreach my $aid (sort keys %proteinfa) {
594 my $aa= delete $proteinfa{$aid};
595 $method{'RESIDUES(tr)'} += length($aa);
596 $aa =~ s/(\S{60})/$1\n/g;
597 print $fa_out ">$aid\n$aa\n";
601 else {
602 ## problem here when multiple GB Seqs in one file; all FASTA needs to go at end of $out
603 ## see e.g. Mouse: mm_ref_chr19.gbk has NT_082868 and NT_039687 parts in one .gbk
604 ## maybe write this to temp .fa then cat to end of lumped gff $out
605 print $lumpfa_fh ">$seq_name\n$dna" if $dna;
606 foreach my $aid (sort keys %proteinfa) {
607 my $aa= delete $proteinfa{$aid};
608 $method{'RESIDUES(tr)'} += length($aa);
609 $aa =~ s/(\S{60})/$1\n/g;
610 print $lumpfa_fh ">$aid\n$aa\n";
614 %proteinfa=();
617 if ( $zip && !$lump ) {
618 system "gzip -f $outfile";
619 system "gzip -f $fa_outfile" if($fa_outfile);
620 $outfile .= '.gz';
621 $fa_outfile .= '.gz' if $split;
624 # print "\n>EOF\n" if($stdout); #?? need this if summary goes to stdout after FASTA
625 print "# GFF3 saved to $outfile" unless( !$verbose || $stdout || $lump);
626 print ($split ? "; DNA saved to $fa_outfile\n" : "\n") unless($stdout|| $lump);
628 # dgg: moved to after all inputs; here it prints cumulative sum for each record
629 #if ( $summary ) {
630 # print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
632 # for ( keys %method ) {
633 # print "# $_ $method{$_}\n";
635 # print "# \n";
636 # }
640 print "# GFF3 saved to $outfile\n" if( $verbose && $lump);
641 if ( $summary ) {
642 print "# Summary:\n# Feature\tCount\n# -------\t-----\n";
643 for ( keys %method ) {
644 print "# $_ $method{$_}\n";
646 print "# \n";
649 ## FIXME for piped output w/ split FA files ...
650 close($lumpfa_fh) if $lumpfa_fh;
651 if (!$split && $outfa && $lump_fh) {
652 print $lump_fh "##FASTA\n"; # GFF3 spec
653 open $lumpfa_fh, $outfa or warn "reading FA $outfa: $!";
654 while( <$lumpfa_fh>) {
655 print $lump_fh $_;
656 } # is $lump_fh still open?
657 close($lumpfa_fh);
658 unlink($outfa);
662 if ( $zip && $lump ) {
663 system "gzip -f $lump";
666 close FH;
673 sub typeorder {
674 return 1 if ($_[0] =~ /gene/);
675 return 2 if ($_[0] =~ /RNA|transcript/);
676 return 3 if ($_[0] =~ /protein|peptide/);
677 return 4 if ($_[0] =~ /exon|CDS/);
678 return 3; # default before exon (smallest part)
681 sub sort_by_feattype {
682 my($at,$bt)= ($a->primary_tag, $b->primary_tag);
683 return (typeorder($at) <=> typeorder($bt))
684 || ($at cmp $bt);
685 ## || ($a->name() cmp $b->name());
688 sub print_held {
689 my($out,$gffio,$to_print)= @_;
690 return unless(@$to_print);
691 @$to_print = sort sort_by_feattype @$to_print; # put exons after mRNA, otherwise chado loader chokes
692 while ( my $feature = shift @$to_print) {
693 my $gff= $gffio->gff_string($feature); # $gff =~ s/\'/./g; # dang bug in encode
694 push @GFF_LINE_FEAT, $feature;
695 #print $out "$gff\n";
697 return (); # @to_print
700 sub maptags2gff {
701 my $f = shift;
702 ## should copy/move locus_tag to Alias, if not ID/Name/Alias already
703 # but see below /gene /locus_tag usage
704 foreach my $tag (keys %TAG_MAP) {
705 if ($f->has_tag($tag)) {
706 my $newtag= $TAG_MAP{$tag};
707 my @v= $f->get_tag_values($tag);
708 $f->remove_tag($tag);
709 $f->add_tag_value($newtag,@v);
711 ## also, pull any GO:000 ids from /note tag and put into Ontology_term
712 ## ncbi syntax in CDS /note is now '[goid GO:0005886]' OR '[goid 0005624]'
713 if ($tag eq 'note') {
714 map { s/\[goid (\d+)/\[goid GO:$1/g; } @v;
715 my @go= map { m/(GO:\d+)/g } @v;
716 $f->add_tag_value('Ontology_term',@go) if(@go);
724 sub getSourceInfo {
725 my ($seq, $source_type, $sf) = @_;
727 my $is_swiss= ($SOURCEID =~/UniProt|swiss|trembl/i);
728 my $is_gene = ($SOURCEID =~/entrezgene/i);
729 my $is_rich = (ref($seq) =~ /RichSeq/);
730 my $seq_name= $seq->accession_number();
732 unless($sf) { # make one
733 $source_type= $is_swiss ? $PROTEIN_TYPE
734 : $is_gene ? "eneg" # "gene" # "region" #
735 : $is_rich ? $seq->molecule : $source_type;
736 $sf = Bio::SeqFeature::Generic->direct_new();
738 my $len = $seq->length(); $len=1 if($len<1); my $start = 1; ##$start= $len if ($len<1);
739 my $loc= $seq->can('location') ? $seq->location()
740 : new Bio::Location::Simple( -start => $start, -end => $len);
741 $sf->location( $loc );
742 $sf->primary_tag($source_type);
743 $sf->source_tag($SOURCEID);
744 $sf->seq_id( $seq_name);
745 #? $sf->display_name($seq->id()); ## Name or Alias ?
746 $sf->add_tag_value( Alias => $seq->id()); # unless id == accession
747 $seq->add_SeqFeature($sf);
748 ## $source_feat= $sf;
751 if ($sf->has_tag("chromosome")) {
752 $source_type= "chromosome";
753 my ($chrname) = $sf->get_tag_values("chromosome");
754 ## PROBLEM with Name <> ID, RefName for Gbrowse; use Alias instead
755 ## e.g. Mouse chr 19 has two IDs in NCBI genbank now
756 $sf->add_tag_value( Alias => $chrname );
759 # pull GB Comment, Description for source ft ...
760 # add reference - can be long, not plain string...
761 warn "# $SOURCEID:$seq_name fields = ", join(",", $seq->annotation->get_all_annotation_keys()),"\n" if $DEBUG;
762 # GenBank fields: keyword,comment,reference,date_changed
763 # Entrezgene fields 850293 =ALIAS_SYMBOL,RefSeq status,chromosome,SGD,dblink,Entrez Gene Status,OntologyTerm,LOCUS_SYNONYM
765 # is this just for main $seq object or for all seqfeatures ?
766 my %AnnotTagMap= (
767 'gene_name' => 'Alias',
768 'ALIAS_SYMBOL' => 'Alias', # Entrezgene
769 'LOCUS_SYNONYM' => 'Alias', #?
770 'symbol' => 'Alias',
771 'synonym' => 'Alias',
772 'dblink' => 'Dbxref',
773 'product' => 'product',
774 'Reference' => 'reference',
775 'OntologyTerm' => 'Ontology_term',
776 'comment' => 'Note',
777 'comment1' => 'Note',
778 # various map-type locations
779 # gene accession tag is named per source db !??
780 # 'Index terms' => keywords ??
784 my ($desc)= $seq->annotation->get_Annotations("desc") || ( $seq->desc() );
785 my ($date)= $seq->annotation->get_Annotations("dates")
786 || $seq->annotation->get_Annotations("update-date")
787 || $is_rich ? $seq->get_dates() : ();
788 my ($comment)= $seq->annotation->get_Annotations("comment");
789 my ($species)= $seq->annotation->get_Annotations("species");
790 if (!$species
791 && $seq->can('species')
792 && defined $seq->species()
793 && $seq->species()->can('binomial') ) {
794 $species= $seq->species()->binomial();
797 # update source feature with main GB fields
798 $sf->add_tag_value( ID => $seq_name ) unless $sf->has_tag('ID');
799 $sf->add_tag_value( Note => $desc ) if($desc && ! $sf->has_tag('Note'));
800 $sf->add_tag_value( organism => $species ) if($species && ! $sf->has_tag('organism'));
801 $sf->add_tag_value( comment1 => $comment ) if(!$is_swiss && $comment && ! $sf->has_tag('comment1'));
802 $sf->add_tag_value( date => $date ) if($date && ! $sf->has_tag('date'));
804 $sf->add_tag_value( Dbxref => $SOURCEID.':'.$seq_name ) if $is_swiss || $is_gene;
806 foreach my $atag (sort keys %AnnotTagMap) {
807 my $gtag= $AnnotTagMap{$atag}; next unless($gtag);
808 my @anno = map{
809 if (ref $_ && $_->can('get_all_values')) {
810 split( /[,;] */, join ";", $_->get_all_values)
812 elsif (ref $_ && $_->can('display_text')) {
813 split( /[,;] */, $_->display_text)
815 elsif (ref $_ && $_->can('value')) {
816 split( /[,;] */, $_->value)
818 else {
821 } $seq->annotation->get_Annotations($atag);
822 foreach(@anno) { $sf->add_tag_value( $gtag => $_ ); }
825 #my @genes = map{ split( /[,;] */, "$_"); } $seq->annotation->get_Annotations('gene_name');
826 #$sf->add_tag_value( Alias => $_ ) foreach(@genes);
828 #my @dblink= map { "$_"; } $seq->annotation->get_Annotations("dblink"); # add @all
829 #$sf->add_tag_value( Dbxref => $_ ) foreach(@dblink);
831 return (wantarray)? ($source_type,$sf) : $source_type; #?
835 sub gene_features {
836 my ($f, $gene_id, $genelinkID) = @_;
837 local $_ = $f->primary_tag;
838 $method{$_}++;
840 if ( /gene/ ) {
841 $f->add_tag_value( ID => $gene_id ) unless($f->has_tag('ID')); # check is same value!?
842 $tnum = $rnum= 0; $ncrna_id= $rna_id = '';
843 return GM_NEW_TOPLEVEL;
845 } elsif ( /mRNA/ ) {
846 return GM_NOT_PART unless $gene_id;
847 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
848 ($rna_id = $gene_id ) =~ s/gene/mRNA/;
849 $rna_id .= '.t0' . ++$tnum;
850 $f->add_tag_value( ID => $rna_id );
851 $f->add_tag_value( Parent => $gene_id );
853 } elsif ( /RNA|transcript/) {
854 ## misc_RNA here; missing exons ... flattener problem?
855 # all of {t,nc,sn}RNA can have gene models now
856 ## but problem in Worm chr: mRNA > misc_RNA > CDS with same locus tag
857 ## CDS needs to use mRNA, not misc_RNA, rna_id ...
858 ## also need to fix cases where tRNA,... lack a 'gene' parent: make this one top-level
860 if($gene_id) {
861 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
862 ($ncrna_id = $gene_id) =~ s/gene/ncRNA/;
863 $ncrna_id .= '.r0' . ++$rnum;
864 $f->add_tag_value( Parent => $gene_id );
865 $f->add_tag_value( ID => $ncrna_id );
866 } else {
867 unless ($f->has_tag('ID')) {
868 if($genelinkID) {
869 $f->add_tag_value( ID => $genelinkID ) ;
870 } else {
871 $idh->generate_unique_persistent_id($f);
874 ($ncrna_id)= $f->get_tag_values('ID');
875 return GM_NEW_TOPLEVEL;
876 # this feat now acts as gene-top-level; need to print @to_print to flush prior exons?
879 } elsif ( /exon/ ) { # can belong to any kind of RNA
880 return GM_NOT_PART unless ($rna_id||$ncrna_id);
881 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id);
882 ## we are getting duplicate Parents here, which chokes chado loader, with reason...
883 ## problem is when mRNA and ncRNA have same exons, both ids are active, called twice
884 ## check all Parents
885 for my $expar ($rna_id, $ncrna_id) {
886 next unless($expar);
887 if ( $exonpar{$expar} and $f->has_tag('Parent') ) {
888 my @vals = $f->get_tag_values('Parent');
889 next if (grep {$expar eq $_} @vals);
891 $exonpar{$expar}++;
892 $f->add_tag_value( Parent => $expar);
893 # last; #? could be both
895 # now we can skip cloned exons
896 # dgg note: multiple parents get added and printed for each unique exon
897 return GM_DUP_PART if ++$seen{$f} > 1;
899 } elsif ( /CDS|protein|polypeptide/ ) {
900 return GM_NOT_PART unless $rna_id; ## ignore $ncrna_id ??
901 return GM_NOT_PART if($genelinkID && $genelinkID ne $gene_id); #??
902 (my $pro_id = $rna_id) =~ s/\.t/\.p/;
904 if( ! $CDSkeep && /CDS/) {
905 $f->primary_tag($PROTEIN_TYPE);
907 ## duplicate problem is Location ..
908 if ($f->location->isa("Bio::Location::SplitLocationI")) {
909 # my($b,$e)=($f->start, $f->end); # is this all we need?
910 my($b,$e)=(-1,0);
911 foreach my $l ($f->location->each_Location) {
912 $b = $l->start if($b<0 || $b > $l->start);
913 $e = $l->end if($e < $l->end);
915 $f->location( Bio::Location::Simple->new(
916 -start => $b, -end => $e, -strand => $f->strand) );
919 $f->add_tag_value( Derives_from => $rna_id );
921 else {
922 $f->add_tag_value( Parent => $rna_id );
925 $f->add_tag_value( ID => $pro_id );
927 move_translation_fasta($f, $pro_id);
928 #if( $f->has_tag('translation')) {
929 # my ($aa) = $f->get_tag_values("translation");
930 # $proteinfa{$pro_id}= $aa;
931 # $f->remove_tag("translation");
932 # $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
934 } elsif ( /region/ ) {
935 $f->primary_tag('gene_component_region');
936 $f->add_tag_value( Parent => $gene_id );
937 } else {
938 return GM_NOT_PART unless $gene_id;
939 $f->add_tag_value( Parent => $gene_id );
942 ## return GM_DUP_PART if /exon/ && ++$seen{$f} > 1;
944 return GM_NEW_PART;
947 ## was generic_features > add_generic_id
948 sub add_generic_id {
949 my ($f, $ft_name, $flags) = @_;
950 my $method = $f->primary_tag;
951 $method{$method}++ unless($flags =~ /nocount/); ## double counts GM_NOT_PART from above
953 if ($f->has_tag('ID')) {
956 elsif ( $f->has_tag($method) ) {
957 my ($name) = $f->get_tag_values($method);
958 $f->add_tag_value( ID => "$method:$name" );
960 elsif($ft_name) { # is this unique ?
961 $f->add_tag_value( ID => $ft_name );
963 else {
964 $idh->generate_unique_persistent_id($f);
967 move_translation_fasta( $f, ($f->get_tag_values("ID"))[0] )
968 if($method =~ /CDS/);
970 # return $io->gff_string($f);
973 sub move_translation_fasta {
974 my ($f, $ft_id) = @_;
975 if( $ft_id && $f->has_tag('translation') ) {
976 my ($aa) = $f->get_tag_values("translation");
977 if($aa && $aa !~ /^length/) {
978 $proteinfa{$ft_id}= $aa;
979 $f->remove_tag("translation");
980 $f->add_tag_value("translation","length.".length($aa)); # hack for odd chado gbl problem
985 sub gff_header {
986 my ($name, $end, $source_type, $source_feat) = @_;
987 $source_type ||= "region";
989 my $info = "$source_type:$name";
990 my $head = "##gff-version $GFF_VERSION\n".
991 "##sequence-region $name 1 $end\n".
992 "# conversion-by bp_genbank2gff3.pl\n";
993 if ($source_feat) {
994 ## dgg: these header comment fields are not useful when have multi-records, diff organisms
995 for my $key (qw(organism Note date)) {
996 my $value;
997 if ($source_feat->has_tag($key)) {
998 ($value) = $source_feat->get_tag_values($key);
1000 if ($value) {
1001 $head .= "# $key $value\n";
1002 $info .= ", $value";
1005 $head = "" if $didheader;
1006 } else {
1007 $head .= "$name\t$SOURCEID\t$source_type\t1\t$end\t.\t.\t.\tID=$name\n";
1009 $didheader++;
1010 return (wantarray) ? ($head,$info) : $head;
1013 sub unflatten_seq {
1014 my $seq = shift;
1016 ## print "# working on $source_type:", $seq->accession, "\n";
1017 my $uh_oh = "Possible gene unflattening error with" . $seq->accession_number .
1018 ": consult STDERR\n";
1020 eval {
1021 $unflattener->unflatten_seq( -seq => $seq,
1022 -noinfer => $noinfer,
1023 -use_magic => 1 );
1026 # deal with unflattening errors
1027 if ( $@ ) {
1028 warn $seq->accession_number . " Unflattening error:\n";
1029 warn "Details: $@\n";
1030 print "# ".$uh_oh;
1033 return 0 if !$seq || !$seq->all_SeqFeatures;
1035 # map feature types to the sequence ontology
1036 ## $tm->map_types_to_SO( -seq => $seq );
1037 #$tm->map_types( -seq => $seq, -type_map => $FTSOmap, -undefined => "region" ); #dgg
1039 map_types(
1040 $tm,
1041 -seq => $seq,
1042 -type_map => $FTSOmap,
1043 -syn_map => $FTSOsynonyms,
1044 -undefined => "region"
1045 ); #nml
1049 sub filter {
1050 my $seq = shift;
1051 ## return unless $filter;
1052 my @feats;
1053 my @sources; # dgg; pick source features here; only 1 always?
1054 if ($filter) {
1055 for my $f ( $seq->remove_SeqFeatures ) {
1056 my $m = $f->primary_tag;
1057 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
1058 push @feats, $f unless $filter =~ /$m/i;
1060 $seq->add_SeqFeature($_) foreach @feats;
1061 } else {
1062 for my $f ( $seq->get_SeqFeatures ){
1063 my $m = $f->primary_tag;
1064 push @sources, $f if ($m eq 'source'); # dgg? but leave in @feats ?
1068 return @sources;
1072 # The default behaviour of Bio::FeatureHolderI:get_all_SeqFeatures
1073 # changed to filter out cloned features. We have to implement the old
1074 # method. These two subroutines were adapted from the v1.4 Bio::FeatureHolderI
1075 sub get_all_SeqFeatures {
1076 my $seq = shift;
1077 my @flatarr;
1079 foreach my $feat ( $seq->get_SeqFeatures ){
1080 push(@flatarr,$feat);
1081 _add_flattened_SeqFeatures(\@flatarr,$feat);
1083 return @flatarr;
1086 sub gene_name {
1087 my $g = shift;
1088 my $gene_id = ''; # zero it;
1090 if ($g->has_tag('locus_tag')) {
1091 ($gene_id) = $g->get_tag_values('locus_tag');
1094 elsif ($g->has_tag('gene')) {
1095 ($gene_id) = $g->get_tag_values('gene');
1097 elsif ($g->has_tag('ID')) { # for non-Genbank > Entrezgene
1098 ($gene_id) = $g->get_tag_values('ID');
1101 ## See Unflattener comment:
1102 # on rare occasions, records will have no /gene or /locus_tag
1103 # but it WILL have /product tags. These serve the same purpose
1104 # for grouping. For an example, see AY763288 (also in t/data)
1105 # eg. product=tRNA-Asp ; product=similar to crooked neck protein
1106 elsif ($g->has_tag('product')) {
1107 my ($name)= $g->get_tag_values('product');
1108 ($gene_id) = $name unless($name =~ / /); # a description not name
1111 ## dgg; also handle transposon=xxxx ID/name
1112 # ID=GenBank:repeat_region:NC_004353:1278337:1281302;transposon=HeT-A{}1685;Dbxref=FLYBASE:FBti0059746
1113 elsif ($g->has_tag('transposon')) {
1114 my ($name)= $g->get_tag_values('transposon');
1115 ($gene_id) = $name unless($name =~ / /); # a description not name
1118 return $gene_id;
1121 # same list as gene_name .. change tag to generic Name
1122 sub convert_to_name {
1123 my $g = shift;
1124 my $gene_id = ''; # zero it;
1126 if ($g->has_tag('gene')) {
1127 ($gene_id) = $g->get_tag_values('gene');
1128 $g->remove_tag('gene');
1129 $g->add_tag_value('Name', $gene_id);
1131 elsif ($g->has_tag('locus_tag')) {
1132 ($gene_id) = $g->get_tag_values('locus_tag');
1133 $g->remove_tag('locus_tag');
1134 $g->add_tag_value('Name', $gene_id);
1137 elsif ($g->has_tag('product')) {
1138 my ($name)= $g->get_tag_values('product');
1139 ($gene_id) = $name unless($name =~ / /); # a description not name
1140 ## $g->remove_tag('product');
1141 $g->add_tag_value('Name', $gene_id);
1144 elsif ($g->has_tag('transposon')) {
1145 my ($name)= $g->get_tag_values('transposon');
1146 ($gene_id) = $name unless($name =~ / /); # a description not name
1147 ## $g->remove_tag('transposon');
1148 $g->add_tag_value('Name', $gene_id);
1150 elsif ($g->has_tag('ID')) {
1151 my ($name)= $g->get_tag_values('ID');
1152 $g->add_tag_value('Name', $name);
1154 return $gene_id;
1158 sub _add_flattened_SeqFeatures {
1159 my ($arrayref,$feat) = @_;
1160 my @subs = ();
1162 if ($feat->isa("Bio::FeatureHolderI")) {
1163 @subs = $feat->get_SeqFeatures;
1165 elsif ($feat->isa("Bio::SeqFeatureI")) {
1166 @subs = $feat->sub_SeqFeature;
1168 else {
1169 warn ref($feat)." is neither a FeatureHolderI nor a SeqFeatureI. ".
1170 "Don't know how to flatten.";
1173 for my $sub (@subs) {
1174 push(@$arrayref,$sub);
1175 _add_flattened_SeqFeatures($arrayref,$sub);
1180 sub map_types {
1182 my ($self, @args) = @_;
1184 my($sf, $seq, $type_map, $syn_map, $undefmap) =
1185 $self->_rearrange([qw(FEATURE
1187 TYPE_MAP
1188 SYN_MAP
1189 UNDEFINED
1191 @args);
1193 if (!$sf && !$seq) {
1194 $self->throw("you need to pass in either -feature or -seq");
1197 my @sfs = ($sf);
1198 if ($seq) {
1199 $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
1200 @sfs = $seq->get_all_SeqFeatures;
1202 $type_map = $type_map || $self->typemap; # dgg: was type_map;
1203 foreach my $feat (@sfs) {
1205 $feat->isa("Bio::SeqFeatureI") || $self->throw("$feat NOT A SeqFeatureI");
1206 $feat->isa("Bio::FeatureHolderI") || $self->throw("$feat NOT A FeatureHolderI");
1208 my $primary_tag = $feat->primary_tag;
1210 #if ($primary_tag =~ /^pseudo(.*)$/) {
1211 # $primary_tag = $1;
1212 # $feat->primary_tag($primary_tag);
1215 my $mtype = $type_map->{$primary_tag};
1216 if ($mtype) {
1217 if (ref($mtype)) {
1218 if (ref($mtype) eq 'ARRAY') {
1219 my $soID;
1220 ($mtype, $soID) = @$mtype;
1222 if ($soID && ref($ONTOLOGY)) {
1223 my ($term) = $ONTOLOGY->find_terms(-identifier => $soID);
1224 $mtype = $term->name if $term;
1226 # if SO ID is undefined AND we have an ontology to search, we want to delete
1227 # the feature type hash entry in order to force a fuzzy search
1228 elsif (! defined $soID && ref($ONTOLOGY)) {
1229 undef $mtype;
1230 delete $type_map->{$primary_tag};
1232 elsif ($undefmap && $mtype eq 'undefined') { # dgg
1233 $mtype= $undefmap;
1236 $type_map->{$primary_tag} = $mtype if $mtype;
1238 elsif (ref($mtype) eq 'CODE') {
1239 $mtype = $mtype->($feat);
1241 else {
1242 $self->throw('must be scalar or CODE ref');
1245 elsif ($undefmap && $mtype eq 'undefined') { # dgg
1246 $mtype= $undefmap;
1248 $feat->primary_tag($mtype);
1251 if ($CONF) {
1252 conf_read();
1253 my %perfect_matches;
1254 while (my ($p_tag,$rules) = each %$YAML) {
1255 RULE:
1256 for my $rule (@$rules) {
1257 for my $tags (@$rule) {
1258 while (my ($tag,$values) = each %$tags) {
1259 for my $value (@$values) {
1260 if ($feat->has_tag($tag)) {
1261 for ($feat->get_tag_values($tag)) {
1262 next RULE unless $_ =~ /\Q$value\E/;
1264 } elsif ($tag eq 'primary_tag') {
1265 next RULE unless $value eq
1266 $feat->primary_tag;
1267 } elsif ($tag eq 'location') {
1268 next RULE unless $value eq
1269 $feat->start.'..'.$feat->end;
1270 } else { next RULE }
1274 $perfect_matches{$p_tag}++;
1277 if (scalar(keys %perfect_matches) == 1) {
1278 $mtype = $_ for keys %perfect_matches;
1279 } elsif (scalar(keys %perfect_matches) > 1) {
1280 warn "There are conflicting rules in the config file for the" .
1281 " following types: ";
1282 warn "\t$_\n" for keys %perfect_matches;
1283 warn "Until conflict resolution is built into the converter," .
1284 " you will have to manually edit the config file to remove the" .
1285 " conflict. Sorry :(. Skipping user preference for this entry";
1286 sleep(2);
1290 if ( ! $mtype && $syn_map) {
1291 if ($feat->has_tag('note')) {
1293 my @all_matches;
1295 my @note = $feat->each_tag_value('note');
1297 for my $k (keys %$syn_map) {
1299 if ($k =~ /"(.+)"/) {
1301 my $syn = $1;
1303 for my $note (@note) {
1305 # look through the notes to see if the description
1306 # is an exact match for synonyms
1307 if ( $syn eq $note ) {
1309 my @map = @{$syn_map->{$k}};
1312 my $best_guess = $map[0];
1314 unshift @{$all_matches[-1]}, [$best_guess];
1316 $mtype = $MANUAL
1317 ? manual_curation($feat, $best_guess, \@all_matches)
1318 : $best_guess;
1320 print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1321 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1322 . '#' x 78 . "\n\n";
1324 } else {
1325 # check both primary tag and and note against
1326 # SO synonyms for best matching description
1328 SO_fuzzy_match( $k, $primary_tag, $note, $syn, \@all_matches);
1335 #unless ($mtype) {
1336 for my $note (@note) {
1337 for my $name (values %$type_map) {
1338 # check primary tag against SO names for best matching
1339 # descriptions //NML also need to check against
1340 # definition && camel case split terms
1342 SO_fuzzy_match($name, $primary_tag, $note, $name, \@all_matches);
1347 if (scalar(@all_matches) && !$mtype) {
1349 my $top_matches = first { defined $_ } @{$all_matches[-1]};
1351 my $best_guess = $top_matches->[0];
1355 # if guess has quotes, it is a synonym term. we need to
1356 # look up the corresponding name term
1357 # otherwise, guess is a name, so we can use it directly
1358 if ($best_guess =~ /"(.+)"/) {
1360 $best_guess = $syn_map->{$best_guess}->[0];
1364 @RETURN = @all_matches;
1365 $mtype = $MANUAL
1366 ? manual_curation($feat, $best_guess, \@all_matches)
1367 : $best_guess;
1369 print '#' x 78 . "\nGuessing the proper SO term for GenBank"
1370 . " entry:\n\n" . GenBank_entry($feat) . "\nis:\t$mtype\n"
1371 . '#' x 78 . "\n\n";
1375 $mtype ||= $undefmap;
1376 $feat->primary_tag($mtype);
1383 sub SO_fuzzy_match {
1385 my $candidate = shift;
1386 my $primary_tag = shift;
1387 my $note = shift;
1388 my $SO_terms = shift;
1389 my $best_matches_ref = shift;
1390 my $modifier = shift;
1392 $modifier ||= '';
1394 my @feat_terms;
1396 for ( split(" |_", $primary_tag) ) {
1397 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1398 my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1399 push @feat_terms, @camelCase;
1402 for ( split(" |_", $note) ) {
1403 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z])/g;
1404 #my @camelCase = /(?:[A-Z]|[a-z])(?:[A-Z]+|[a-z]*)(?=$|[A-Z]|[;:.,])/g;
1405 (my $word = $_) =~ s/[;:.,]//g;
1406 push @feat_terms, $word;
1410 my @SO_terms = split(" |_", $SO_terms);
1412 # fuzzy match works on a simple point system. When 2 words match,
1413 # the $plus counter adds one. When they don't, the $minus counter adds
1414 # one. This is used to sort similar matches together. Better matches
1415 # are found at the end of the array, near the top.
1417 # NML: can we improve best match by using synonym tags
1418 # EXACT,RELATED,NARROW,BROAD?
1420 my ($plus, $minus) = (0, 0);
1421 my %feat_terms;
1422 my %SO_terms;
1424 #unique terms
1425 map {$feat_terms{$_} = 1} @feat_terms;
1426 map {$SO_terms{$_} = 1} @SO_terms;
1428 for my $st (keys %SO_terms) {
1429 for my $ft (keys %feat_terms) {
1431 ($st =~ m/$modifier\Q$ft\E/) ? $plus++ : $minus++;
1436 push @{$$best_matches_ref[$plus][$minus]}, $candidate if $plus;
1440 sub manual_curation {
1442 my ($feat, $default_opt, $all_matches) = @_;
1444 my @all_matches = @$all_matches;
1446 # convert all SO synonyms into names and filter
1447 # all matches into unique term list because
1448 # synonyms can map to multiple duplicate names
1450 my (@unique_SO_terms, %seen);
1451 for (reverse @all_matches) {
1452 for (@$_) {
1453 for (@$_) {
1454 #my @names;
1455 if ($_ =~ /"(.+)"/) {
1456 for (@{$SYN_MAP->{$_}}) {
1457 push @unique_SO_terms, $_ unless $seen{$_};
1458 $seen{$_}++;
1460 } else {
1461 push @unique_SO_terms, $_ unless $seen{$_};
1462 $seen{$_}++;
1468 my $s = scalar(@unique_SO_terms);
1470 my $choice = 0;
1472 my $more =
1473 "[a]uto : automatic input (selects best guess for remaining entries)\r" .
1474 "[f]ind : search for other SO terms matching your query (e.g. f gene)\r" .
1475 "[i]nput : add a specific term\r" .
1476 "[r]eset : reset to the beginning of matches\r" .
1477 "[s]kip : skip this entry (selects best guess for this entry)\r"
1480 $more .=
1481 "[n]ext : view the next ".OPTION_CYCLE." terms\r" .
1482 "[p]rev : view the previous ".OPTION_CYCLE." terms" if ($s > OPTION_CYCLE);
1484 my $msg = #"\n\n" . '-' x 156 . "\n"
1485 "The converter found $s possible matches for the following GenBank entry: ";
1487 my $directions =
1488 "Type a number to select the SO term that best matches"
1489 . " the genbank entry, or use any of the following options:\r" . '_' x 76 . "\r$more";
1492 # lookup filtered list to pull out definitions
1493 my @options = map {
1494 my $term = $_;
1495 my %term;
1496 for (['name', 'name'], ['def', 'definition'], ['synonym',
1497 'each_synonym']) {
1498 my ($label, $method) = @$_;
1499 $term{$label} = \@{[$term->$method]};
1501 [++$choice, $_->name, ($_->definition || 'none'), \%term, $_->each_synonym ];
1502 } map { $ONTOLOGY->find_terms(-name => $_) } @unique_SO_terms;
1505 my $option = options_cycle(0, OPTION_CYCLE, $msg, $feat, $directions,
1506 $default_opt, @options);
1508 if ($option eq 'skip') { return $default_opt
1509 } elsif ($option eq 'auto') {
1510 $MANUAL = 0;
1511 return $default_opt;
1512 } else { return $option }
1516 sub options_cycle {
1518 my ($start, $stop, $msg, $feat, $directions, $best_guess, @opt) = @_;
1520 #NML: really should only call GenBank_entry once. Will need to change
1521 #method to return array & shift off header
1522 my $entry = GenBank_entry($feat, "\r");
1524 my $total = scalar(@opt);
1526 ($start,$stop) = (0, OPTION_CYCLE)
1527 if ( ($start < 0) && ($stop > 0) );
1529 ($start,$stop) = (0, OPTION_CYCLE)
1530 if ( ( ($stop - $start) < OPTION_CYCLE ) && $stop < $total);
1532 ($start,$stop) = ($total - OPTION_CYCLE, $total) if $start < 0;
1533 ($start,$stop) = (0, OPTION_CYCLE) if $start >= $total;
1535 $stop = $total if $stop > $total;
1537 my $dir_copy = $directions;
1538 my $msg_copy = $msg;
1539 my $format = "format STDOUT = \n" .
1540 '-' x 156 . "\n" .
1541 '^' . '<' x 77 . '| Available Commands:' . "\n" .
1542 '$msg_copy' . "\n" .
1543 '-' x 156 . "\n" .
1544 ' ' x 78 . "|\n" .
1545 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
1546 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
1547 (' ' x 20 . '^' . '<' x 57 . '| ^' . '<' x 75 . '~' . "\n" .
1548 ' ' x 20 . '$entry,' . ' ' x 53 . '$dir_copy,' . "\n") x 1000 . ".\n";
1551 # eval throws redefined warning that breaks formatting.
1552 # Turning off warnings just for the eval to fix this.
1553 no warnings 'redefine';
1554 eval $format;
1557 write;
1559 print '-' x 156 . "\n" .
1560 'Showing results ' . ( $stop ? ( $start + 1 ) : $start ) .
1561 " - $stop of possible SO term matches: (best guess is \"$best_guess\")" .
1562 "\n" . '-' x 156 . "\n";
1564 for (my $i = $start; $i < $stop; $i+=2) {
1566 my ($left, $right) = @opt[$i,$i+1];
1568 my ($nL, $nmL, $descL, $termL, @synL) = @$left;
1570 #odd numbered lists can cause fatal undefined errors, so check
1571 #to make sure we have data
1573 my ($nR, $nmR, $descR, $termR, @synR) = ref($right) ? @$right : (undef, undef, undef);
1576 my $format = "format STDOUT = \n";
1578 $format .=
1579 ' ' x 78 . "|\n" .
1581 '@>>: name: ^' . '<' x 64 . '~' . ' |' .
1582 ( ref($right) ? ('@>>: name: ^' . '<' x 64 . '~' ) : '' ) . "\n" .
1583 '$nL,' . ' ' x 7 . '$nmL,' .
1584 ( ref($right) ? (' ' x 63 . '$nR,' . ' ' x 7 . "\$nmR,") : '' ) . "\n" .
1586 ' ' x 11 . '^' . '<' x 61 . '...~' . ' |' .
1587 (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
1588 ' ' x 11 . '$nmL,' .
1589 (ref($right) ? (' ' x 74 . '$nmR,') : '') . "\n" .
1590 #' ' x 78 . '|' . "\n" .
1593 ' def: ^' . '<' x 65 . ' |' .
1594 (ref($right) ? (' def: ^' . '<' x 64 . '~') : '') . "\n" .
1595 ' ' x 11 . '$descL,' .
1596 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1599 (' ^' . '<' x 65 . ' |' .
1600 (ref($right) ? (' ^' . '<' x 64 . '~') : '') . "\n" .
1601 ' ' x 11 . '$descL,' .
1602 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n") x 5 .
1605 ' ^' . '<' x 61 . '...~ |' .
1606 (ref($right) ? (' ^' . '<' x 61 . '...~') : '') . "\n" .
1607 ' ' x 11 . '$descL,' .
1608 (ref($right) ? (' ' x 72 . '$descR,') : '') . "\n" .
1610 ".\n";
1613 # eval throws redefined warning that breaks formatting.
1614 # Turning off warnings just for the eval to fix this.
1615 no warnings 'redefine';
1616 eval $format;
1618 write;
1621 print '-' x 156 . "\nenter a command:";
1623 while (<STDIN>) {
1625 (my $input = $_) =~ s/\s+$//;
1627 if ($input =~ /^\d+$/) {
1628 if ( $input && defined $opt[$input-1] ) {
1629 return $opt[$input-1]->[1]
1630 } else {
1631 print "\nThat number is not an option. Please enter a valid number.\n:";
1633 } elsif ($input =~ /^n/i | $input =~ /next/i ) {
1634 return options_cycle($start + OPTION_CYCLE, $stop + OPTION_CYCLE, $msg,
1635 $feat, $directions, $best_guess, @opt)
1636 } elsif ($input =~ /^p/i | $input =~ /prev/i ) {
1637 return options_cycle($start - OPTION_CYCLE, $stop - OPTION_CYCLE, $msg,
1638 $feat, $directions, $best_guess, @opt)
1639 } elsif ( $input =~ /^s/i || $input =~ /skip/i ) { return 'skip'
1640 } elsif ( $input =~ /^a/i || $input =~ /auto/i ) { return 'auto'
1641 } elsif ( $input =~ /^r/i || $input =~ /reset/i ) {
1642 return manual_curation($feat, $best_guess, \@RETURN );
1643 } elsif ( $input =~ /^f/i || $input =~ /find/i ) {
1645 my ($query, @query_results);
1647 if ($input =~ /(?:^f|find)\s+?(.*?)$/) { $query = $1;
1648 } else {
1650 #do a SO search
1651 print "Type your search query\n:";
1652 while (<STDIN>) { chomp($query = $_); last }
1655 for (keys(%$TYPE_MAP), keys(%$SYN_MAP)) {
1656 SO_fuzzy_match($_, $query, '', $_, \@query_results, '(?i)');
1659 return manual_curation($feat, $best_guess, \@query_results);
1661 } elsif ( $input =~ /^i/i || $input =~ /input/i ) {
1663 #NML fast input for later
1664 #my $query;
1665 #if ($input =~ /(?:^i|input)\s+?(.*?)$/) { $query = $1 };
1667 #manual input
1668 print "Type the term you want to use\n:";
1669 while (<STDIN>) {
1670 chomp(my $input = $_);
1672 if (! $TYPE_MAP->{$input}) {
1674 print "\"$input\" doesn't appear to be a valid SO term. Are ".
1675 "you sure you want to use it? (y or n)\n:";
1677 while (<STDIN>) {
1679 chomp(my $choice = $_);
1681 if ($choice eq 'y') {
1682 print
1683 "\nWould you like to save your preference for " .
1684 "future use (so you don't have to redo manual " .
1685 "curation for this feature everytime you run " .
1686 "the converter)? (y or n)\n";
1688 #NML: all these while loops are a mess. Really should condense it.
1689 while (<STDIN>) {
1691 chomp(my $choice = $_);
1693 if ($choice eq 'y') {
1694 curation_save($feat, $input);
1695 return $input;
1696 } elsif ($choice eq 'n') {
1697 return $input
1698 } else {
1699 print "\nDidn't recognize that command. Please " .
1700 "type y or n.\n:"
1705 } elsif ($choice eq 'n') {
1706 return options_cycle($start, $stop, $msg, $feat,
1707 $directions, $best_guess, @opt)
1708 } else {
1709 print "\nDidn't recognize that command. Please " .
1710 "type y or n.\n:"
1714 } else {
1715 print
1716 "\nWould you like to save your preference for " .
1717 "future use (so you don't have to redo manual " .
1718 "curation for this feature everytime you run " .
1719 "the converter)? (y or n)\n";
1721 #NML: all these while loops are a mess. Really should condense it.
1722 while (<STDIN>) {
1724 chomp(my $choice = $_);
1726 if ($choice eq 'y') {
1727 curation_save($feat, $input);
1728 return $input;
1729 } elsif ($choice eq 'n') {
1730 return $input
1731 } else {
1732 print "\nDidn't recognize that command. Please " .
1733 "type y or n.\n:"
1740 } else {
1741 print "\nDidn't recognize that command. Please re-enter your choice.\n:"
1747 sub GenBank_entry {
1748 my ($f, $delimiter, $num) = @_;
1750 $delimiter ||= "\n";
1753 my $entry =
1755 ($num ? ' [1] ' : ' ' x 5) . $f->primary_tag
1756 . ($num
1757 ? ' ' x (12 - length $f->primary_tag ) . ' [2] '
1758 : ' ' x (15 - length $f->primary_tag)
1760 . $f->start.'..'.$f->end
1762 . "$delimiter";
1764 if ($num) {
1765 words_tag($f, \$entry);
1766 } else {
1767 for my $tag ($f->all_tags) {
1768 for my $val ( $f->each_tag_value($tag) ) {
1769 $entry .= ' ' x 20;
1770 #$entry .= "/$tag=\"$val\"$delimiter";
1771 $entry .= $val eq '_no_value'
1772 ? "/$tag$delimiter"
1773 : "/$tag=\"$val\"$delimiter";
1779 return $entry;
1783 sub gff_validate {
1784 warn "Validating GFF file\n" if $DEBUG;
1785 my @feat = @_;
1787 my (%parent2child, %all_ids, %descendants, %reserved);
1789 for my $f (@feat) {
1790 for my $aTags (['Parent', \%parent2child], ['ID', \%all_ids]) {
1791 map { push @{$$aTags[1]->{$_}}, $f } $f->get_tag_values($$aTags[0])
1792 if $f->has_tag($$aTags[0]);
1796 if ($SO_FILE) {
1797 while (my ($parentID, $aChildren) = each %parent2child) {
1798 parent_validate($parentID, $aChildren, \%all_ids, \%descendants, \%reserved);
1802 id_validate(\%all_ids, \%reserved);
1805 sub parent_validate {
1806 my ($parentID, $aChildren, $hAll, $hDescendants, $hReserved) = @_;
1808 my $aParents = $hAll->{$parentID};
1810 map {
1811 my $child = $_;
1812 $child->add_tag_value( validation_error =>
1813 "feature tried to add Parent tag, but no Parent found with ID $parentID"
1815 my %parents;
1816 map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1817 delete $parents{$parentID};
1818 my @parents = keys %parents;
1820 $child->remove_tag('Parent');
1822 unless ($child->has_tag('ID')) {
1823 my $id = gene_name($child);
1824 $child->add_tag_value('ID', $id);
1825 push @{$hAll->{$id}}, $child
1828 $child->add_tag_value('Parent', @parents) if @parents;
1830 } @$aChildren and return unless scalar(@$aParents);
1832 my $par = join(',', map { $_->primary_tag } @$aParents);
1833 warn scalar(@$aParents)." POSSIBLE PARENT(S): $par" if $DEBUG;
1835 #NML manual curation needs to happen here
1838 my %parentsToRemove;
1840 CHILD:
1841 for my $child (@$aChildren) {
1842 my $childType = $child->primary_tag;
1844 warn "WORKING ON $childType at ".$child->start.' to '.$child->end
1845 if $DEBUG;
1847 for (my $i = 0; $i < scalar(@$aParents); $i++) {
1848 my $parent = $aParents->[$i];
1849 my $parentType = $parent->primary_tag;
1851 warn "CHECKING $childType against $parentType" if $DEBUG;
1853 #cache descendants so we don't have to do repeat searches
1854 unless ($hDescendants->{$parentType}) {
1856 for my $term ($ONTOLOGY->find_terms(
1857 -name => $parentType
1858 ) ) {
1860 map {
1861 $hDescendants->{$parentType}{$_->name}++
1862 } $ONTOLOGY->get_descendant_terms($term);
1866 # NML: hopefully temporary fix.
1867 # SO doesn't consider exon/CDS to be a child of mRNA
1868 # even though common knowledge dictates that they are
1869 # This cheat fixes the false positives for now
1870 if ($parentType eq 'mRNA') {
1871 $hDescendants->{$parentType}{'exon'} = 1;
1872 $hDescendants->{$parentType}{'CDS'} = 1;
1877 warn "\tCAN $childType at " . $child->start . ' to ' . $child->end .
1878 " be a child of $parentType?" if $DEBUG;
1879 if ($hDescendants->{$parentType}{$childType}) {
1880 warn "\tYES, $childType can be a child of $parentType" if $DEBUG;
1882 #NML need to deal with multiple children matched to multiple different
1883 #parents. This model only assumes the first parent id that matches a child will
1884 #be the reserved feature.
1886 $hReserved->{$parentID}{$parent}{'parent'} = $parent;
1887 push @{$hReserved->{$parentID}{$parent}{'children'}}, $child;
1889 #mark parent for later removal from all IDs
1890 #so we don't accidentally change any parents
1892 $parentsToRemove{$i}++;
1894 next CHILD;
1900 #NML shouldn't have to check this; somehow child can lose Parent
1901 #it's happening W3110
1902 #need to track this down
1903 if ( $child->has_tag('Parent') ) {
1905 warn "\tNO, @{[$child->primary_tag]} cannot be a child of $parentID"
1906 if $DEBUG;
1908 my %parents;
1910 map { $parents{$_} = 1 } $child->get_tag_values('Parent');
1912 delete $parents{$parentID};
1913 my @parents = keys %parents;
1915 warn 'VALIDATION ERROR '.$child->primary_tag." at ".$child->start .
1916 ' to ' . $child->end . " cannot be a child of ID $parentID"
1917 if $DEBUG;
1919 $child->add_tag_value( validation_error =>
1920 "feature cannot be a child of $parentID");
1922 $child->remove_tag('Parent');
1924 unless ($child->has_tag('ID')) {
1925 my $id = gene_name($child);
1926 $child->add_tag_value('ID', $id);
1927 push @{$hAll->{$id}}, $child
1930 $child->add_tag_value('Parent', @parents) if @parents;
1935 #delete $aParents->[$_] for keys %parentsToRemove;
1936 splice(@$aParents, $_, 1) for keys %parentsToRemove;
1939 sub id_validate {
1940 my ($hAll, $hReserved) = @_;
1943 for my $id (keys %$hAll) {
1945 #since 1 feature can have this id,
1946 #let's just shift it off and uniquify
1947 #the rest (unless it's reserved)
1949 shift @{$hAll->{$id}} unless $hReserved->{$id};
1950 for my $feat (@{$hAll->{$id}}) {
1951 id_uniquify(0, $id, $feat, $hAll);
1955 for my $parentID (keys %$hReserved) {
1957 my @keys = keys %{$hReserved->{$parentID}};
1959 shift @keys;
1961 for my $k (@keys) {
1962 my $parent = $hReserved->{$parentID}{$k}{'parent'};
1963 my $aChildren= $hReserved->{$parentID}{$k}{'children'};
1965 my $value = id_uniquify(0, $parentID, $parent, $hAll);
1966 for my $child (@$aChildren) {
1968 my %parents;
1969 map { $parents{$_}++ } $child->get_tag_values('Parent');
1970 $child->remove_tag('Parent');
1971 delete $parents{$parentID};
1972 $parents{$value}++;
1973 my @parents = keys %parents;
1974 $child->add_tag_value('Parent', @parents);
1981 sub id_uniquify {
1982 my ($count, $value, $feat, $hAll) = @_;
1984 warn "UNIQUIFYING $value" if $DEBUG;
1986 if (! $count) {
1987 $feat->add_tag_value(Alias => $value);
1988 $value .= ('.' . $feat->primary_tag)
1989 } elsif ($count == 1) {
1990 $value .= ".$count"
1991 } else {
1992 chop $value;
1993 $value .= $count
1995 $count++;
1997 warn "ENDED UP WITH $value" if $DEBUG;
1998 if ( $hAll->{$value} ) {
1999 warn "$value IS ALREADY TAKEN" if $DEBUG;
2000 id_uniquify($count, $value, $feat, $hAll);
2001 } else {
2002 #warn "something's breaking ".$feat->primary_tag.' at '.$feat->start.' to '.$feat->end;
2003 $feat->remove_tag('ID');
2004 $feat->add_tag_value('ID', $value);
2005 push @{$hAll->{$value}}, $value;
2008 $value;
2011 sub conf_read {
2013 print "\nCannot read $CONF. Change file permissions and retry, " .
2014 "or enter another file\n" and conf_locate() unless -r $CONF;
2016 print "\nCannot write $CONF. Change file permissions and retry, " .
2017 "or enter another file\n" and conf_locate() unless -w $CONF;
2019 $YAML = LoadFile($CONF);
2023 sub conf_create {
2025 my ($path, $input) = @_;
2027 print "Cannot write to $path. Change directory permissions and retry " .
2028 "or enter another save path\n" and conf_locate() unless -w $path;
2030 $CONF = $input;
2032 open(FH, '>', $CONF);
2033 close(FH);
2034 conf_read();
2039 sub conf_write { DumpFile($CONF, $YAML) }
2041 sub conf_locate {
2043 print "\nEnter the location of a previously saved config, or a new " .
2044 "path and file name to create a new config (this step is " .
2045 "necessary to save any preferences)";
2047 print "\n\nenter a command:";
2049 while (<STDIN>) {
2050 chomp(my $input = $_);
2051 my ($fn, $path, $suffix) = fileparse($input, qr/\.[^.]*/);
2053 if (-e $input && (! -d $input)) {
2055 print "\nReading $input...\n";
2056 $CONF = $input;
2058 conf_read();
2059 last;
2061 } elsif (! -d $input && $fn.$suffix) {
2063 print "Creating $input...\n";
2064 conf_create($path, $input);
2065 last;
2067 } elsif (-e $input && -d $input) {
2068 print "You only entered a directory. " .
2069 "Please enter BOTH a directory and filename\n";
2070 } else {
2071 print "$input does not appear to be a valid path. Please enter a " .
2072 "valid directory and filename\n";
2074 print "\nenter a command:";
2078 sub curation_save {
2080 my ($feat, $input) = @_;
2082 #my $error = "Enter the location of a previously saved config, or a new " .
2083 # "path and file name to create a new config (this step is " .
2084 # "necessary to save any preferences)\n";
2086 if (!$CONF) {
2087 print "\n\n";
2088 conf_locate();
2089 } elsif (! -e $CONF) {
2090 print "\n\nThe config file you have chosen doesn't exist.\n";
2091 conf_locate();
2092 } else { conf_read() }
2094 my $entry = GenBank_entry($feat, "\r", 1);
2096 my $msg = "Term entered: $input";
2097 my $directions = "Please select any/all tags that provide evidence for the term you
2098 have entered. You may enter multiple tags by separating them by commas/dashes
2099 (e.g 1,3,5-7). For tags with more than one word value (i.e 'note'), you have
2100 the option of either selecting the entire note as evidence, or specific
2101 keywords. If a tag has multiple keywords, they will be tagged alphabetically
2102 for selection. To select a specific keyword in a tag field, you must enter the
2103 tag number followed by the keyword letter (e.g 3a). Multiple keywords may be
2104 selected by entering each letter separated by commas/dashes (e.g 3b,f,4a-c). The more tags you select, the more specific the GenBank entry will have
2105 to be to match your curation. To match the GenBank entry exactly as it
2106 appears, type every number (start-end), or just type 'all'. Remember, once the converter saves your
2107 preference, you will no longer be prompted to choose a feature type for any
2108 matching entries until you edit the curation.ini file.";
2109 my $msg_copy = $msg;
2110 my $dir_copy = $directions;
2112 my $format = "format STDOUT = \n" .
2113 '-' x 156 . "\n" .
2114 '^' . '<' x 77 . '| Directions:' . "\n" .
2115 '$msg_copy' . "\n" .
2116 '-' x 156 . "\n" .
2117 ' ' x 78 . "|\n" .
2118 '^' . '<' x 77 . '| ^' . '<' x 75 . '~' . "\n" .
2119 '$entry' . ' ' x 74 . '$dir_copy,' . "\n" .
2120 (' ' x 15 . '^' . '<' x 62 . '| ^' . '<' x 75 . '~' . "\n" .
2121 ' ' x 15 . '$entry,' . ' ' x 58 . '$dir_copy,' . "\n") x 20 . ".\n";
2124 # eval throws redefined warning that breaks formatting.
2125 # Turning off warnings just for the eval to fix this.
2126 no warnings 'redefine';
2127 eval $format;
2130 write;
2131 print '-' x 156 . "\nenter a command:";
2133 my @tags = words_tag($feat);
2135 my $final = {};
2136 my $choices;
2137 while (<STDIN>) {
2139 chomp(my $choice = $_);
2141 if (scalar(keys %$final) && $choice =~ /^y/i) { last
2143 } elsif (scalar(keys %$final) && $choice =~ /^n/i) { curation_save($feat, $input)
2145 } elsif (scalar(keys %$final)) { print "\nInvalid selection. Please try again\n";
2147 } elsif ($choice eq 'all') {
2149 $choice = '';
2150 for (my $i=1; $i < scalar(@tags); $i++) {
2151 $choice .= "$i,";
2153 chop $choice;
2155 #print "CHOICE [$choice]";
2157 my @selections;
2158 for (split(/(?<=\w)[^[:alnum:]\-]+(?=\d)/, $choice)) {
2159 if ($_ =~ /(\d+)(?:\D*)-(\d+)(.*)/) {
2161 for ($1..$2) { push @selections, $_ }
2163 my $dangling_alphas = $3;
2164 alpha_expand($dangling_alphas, \@selections);
2166 } else {
2167 alpha_expand($_, \@selections);
2171 foreach my $numbers (@selections) {
2173 my @c = split(/(?=[\w])/, $numbers);
2174 s/\W+//g foreach @c;
2175 my $num;
2178 $^W = 0;
2179 $num = 0 + shift @c;
2182 my $tag = $tags[$num];
2183 if (ref $tag && scalar(@c)) {
2184 my $no_value;
2185 foreach (@c) {
2186 if (defined $tag->{$_}) {
2187 $choices .= "${num}[$_] ";
2188 my ($t,$v) = @{$tag->{$_}};
2189 push @{${$final->{$input}}[0]{$t}}, $v;
2191 } else { $no_value++ }
2194 if ($no_value) {
2195 _selection_add($tag,$final,$input,\$choices,$num);
2196 #my ($t,$v) = @{$tag->{'all'}};
2197 #unless (defined ${$final->{$input}}[0]{$t}) {
2198 #$choices .= "$num, ";
2199 #push @{${$final->{$input}}[0]{$t}}, $v
2203 $choices = substr($choices, 0, -2);
2204 $choices .= ', ';
2206 } elsif (ref $tag) {
2207 _selection_add($tag,$final,$input,\$choices,$num);
2208 #my ($t,$v) = @{$tag->{'all'}};
2209 #unless (defined ${$final->{$input}}[0]{$t}) {
2210 #$choices .= "$num, ";
2211 #push @{${$final->{$input}}[0]{$t}}, $v
2215 $choices = substr($choices, 0, -2) if $choices;
2216 if ($final) {
2217 print "\nYou have chosen the following tags:\n$choices\n";
2218 print "This will be written to the config file as:\n";
2219 print Dump $final;
2220 print "\nIs this correct? (y or n)\n";
2221 } else { print "\nInvalid selection. Please try again\n" }
2223 push @{$YAML->{$input}}, $final->{$input};
2224 conf_write();
2227 # words_tag() splits each tag value string into multiple words so that the
2228 # user can select the parts he/she wants to use for curation
2229 # it can tag 702 (a - zz) separate words; this should be enough
2231 sub words_tag {
2233 my ($feat, $entry) = @_;
2235 my @tags;
2237 @tags[1,2] = ({'all' => ['primary_tag', $feat->primary_tag]}, {'all' => ['location', $feat->start.'..'.$feat->end]});
2238 my $i = 3;
2239 foreach my $tag ($feat->all_tags) {
2240 foreach my $value ($feat->each_tag_value($tag)) {
2242 my ($string, $tagged_string);
2244 my @words = split(/(?=\w+?)/, $value);
2246 my $pos = 0;
2249 foreach my $word (@words) {
2251 (my $sanitized_word = $word) =~ s/\W+?//g;
2252 $string .= $word;
2254 my $lead = int($pos/ALPHABET_DIVISOR);
2255 my $lag = $pos % ALPHABET_DIVISOR;
2257 my $a = $lead ? ${(ALPHABET)}[$lead-1] : '';
2258 $a .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2260 $tagged_string .= " ($a) $word";
2262 $tags[$i]{$a} = [$tag, $sanitized_word];
2263 $pos++;
2266 $value = $tagged_string if scalar(@words) > 1;
2268 $$entry .= "[$i] /$tag=\"$value\"\r";
2270 $tags[$i]{'all'} = [$tag, $string];
2272 $i++;
2275 return @tags;
2279 sub alpha_expand {
2281 my ($dangling_alphas, $selections) = @_;
2283 if (defined($dangling_alphas) && $dangling_alphas =~ /(\d)*([[:alpha:]]+)-([[:alpha:]]+)/) {
2285 my $digit = $1;
2286 push @$selections, $digit if $digit;
2288 my $start = $2;
2289 my $stop = $3;
2291 my @starts = split('', $start);
2292 my @stops = split('', $stop);
2294 my ($final_start, $final_stop);
2296 for ([\$final_start, \@starts], [\$final_stop, \@stops]) {
2298 my ($final, $splits) = @$_;
2300 my $int = ${(ALPHABET_TO_NUMBER)}{$$splits[0]};
2301 my $rem;
2304 if ($$splits[1]) {
2305 $rem = ${(ALPHABET_TO_NUMBER)}{$$splits[1]};
2306 $int++
2307 } else {
2308 $rem = $int;
2309 $int = 0;
2313 $$final = $int * ALPHABET_DIVISOR;
2314 $$final += $rem;
2318 my $last_number = pop @$selections;
2319 for my $pos ($final_start..$final_stop) {
2320 my $lead = int($pos/ALPHABET_DIVISOR);
2321 my $lag = $pos % ALPHABET_DIVISOR;
2322 my $alpha = $lead ? ${(ALPHABET)}[$lead-1] : '';
2323 $alpha .= $lag ? ${(ALPHABET)}[$lag] : 'a';
2324 push @$selections, $last_number.$alpha;
2327 } elsif (defined($dangling_alphas)) {
2328 if ($dangling_alphas =~ /^\d/) {
2329 push @$selections, $dangling_alphas;
2330 } elsif ($dangling_alphas =~ /^\D/) {
2331 #print "$dangling_alphas ".Dumper @$selections;
2332 my $last_number = pop @$selections;
2333 $last_number ||= '';
2334 push @$selections, $last_number.$dangling_alphas;
2335 #$$selections[-1] .= $dangling_alphas;
2341 sub _selection_add {
2343 my ($tag, $final, $input, $choices, $num) = @_;
2344 my ($t,$v) = @{$tag->{'all'}};
2345 unless (defined ${$final->{$input}}[0]{$t}) {
2346 $$choices .= "$num, ";
2347 push @{${$final->{$input}}[0]{$t}}, $v