8 use CXGN
::Tools
::Script qw
/ in_fh out_fh /;
10 use SGN
::IntronFinder
::Homology
;
13 ###### DEFAULTS ##########
15 my $default_max_evalue = 1e-50; #< update the perldoc at the bottom of
16 # the file if you change this
17 ##########################
21 # check our external tool dependencies
22 foreach my $tool (qw
/ blastall /) {
25 "Program '$tool' must be in executable path to use this script. Aborting.\n";
29 ### parse and validate command line args
31 getopts
( 'i:o:f:e:t:', \
%opt ) or pod2usage
(1);
33 @ARGV == 1 or pod2usage
(1);
35 # check our protein db argument
36 my $protein_db_base = shift @ARGV;
39 foreach (qw
/ psq phr pin psd psi /) {
40 my $dbfile = $protein_db_base . ".$_";
41 unless ( -r
$dbfile ) {
42 warn "Protein db file '$dbfile' not found.\n";
46 $bad_db and die "Invalid protein db '$protein_db_base'. Quitting.\n";
49 # open our input and output filehandles ( see CXGN::Tools::Script )
50 my $in_fh = in_fh
$opt{i
};
51 my $out_fh = out_fh
$opt{o
};
53 # validate our max evalue
54 my $max_evalue = defined $opt{e
} ?
$opt{e
} : $default_max_evalue;
55 $max_evalue == $max_evalue
56 or die "-e option must be numeric\n";
58 # validate our -f (gene feature file)
59 my $gene_feature_file = $opt{f
};
60 if( $gene_feature_file ) {
62 or die "gene feature file '$gene_feature_file' not found\n";
64 or die "gene feature file '$gene_feature_file' not readable\n";
67 # validate our -t tempfile dir
68 my $tempfile_dir = defined $opt{t
} ?
$opt{t
} : File
::Spec
->tmpdir;
69 -d
$tempfile_dir or die "tempfile dir '$tempfile_dir' does not exist\n";
70 -w
$tempfile_dir or die "tempfile dir '$tempfile_dir' is not writable\n";
72 # fake-assedly factor this out into a module
73 SGN
::IntronFinder
::Homology
::find_introns_txt
( $in_fh, $out_fh, $max_evalue, $gene_feature_file, $tempfile_dir, $protein_db_base );
77 intron_finder_homology.pl - finds possible intron sites in transcript
78 sequences by aligning sequences to Arabidopsis using BLAST and then
79 parsing the introns from the known intron positions in Arabidopsis.
83 intron_finder_homology.pl [options] protein_blast_db_name
85 Required argument is the protein database basename to use for
86 running BLAST. This file should be Arabidopsis and the gene feature
87 file needs to correspond to the sequences in this file. (Download
94 maximum e-value for reporting a match.
98 input sequence file to read from. if not given, reads sequence
102 output file to write to. if not given, writes to stdout.
105 optional gene feature file (TODO: document further)
108 directory in which to put temp files for this run.
111 =head1 REQUIRES BLASTALL IN EXEC PATH
113 Requires that the 'blastall' executable be installed and available on
114 the executable path. Set your PATH environment variable before running
115 if you need to explicitly provide a path for finding blastall.
117 =head1 FEATURE FILE FORMAT
119 Structure of feature file. This file can be downloaded from the TAIR website at:
120 ftp://ftp.arabidopsis.org/ ... somewhere.
122 chromosome gene name TAIR accession feature start stop length orientation
123 1 AT1G05850.1 gene:2198687 GENE 1766502 1768662 2161 reverse
124 1 AT1G05850.1 gene:2198687 exon 1766502 1767249 748 reverse
125 1 AT1G05850.1 gene:2198687 ORF 1766832 1768116 1285 reverse
126 1 AT1G05850.1 gene:2198687 coding_region 1766832 1767249 418 reverse
127 1 AT1G05850.1 gene:2198687 exon 1767351 1767510 160 reverse
128 1 AT1G05850.1 gene:2198687 coding_region 1767351 1767510 160 reverse
129 1 AT1G05850.1 gene:2198687 exon 1767729 1768150 422 reverse
130 1 AT1G05850.1 gene:2198687 coding_region 1767729 1768116 388 reverse
131 1 AT1G05850.1 gene:2198687 exon 1768547 1768662 116 reverse
132 1 AT1G05770.1 gene:2198692 GENE 1725546 1726249 704 reverse
134 =head1 COPYRIGHT & LICENSE
136 Copyright 2009-2010 The Boyce Thompson Institute for Plant Research
138 This program is free software; you can redistribute it and/or modify
139 it under the same terms as Perl itself.