new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / getmite.pl
blob5f708f1e6c9d146623624fe5f594acbbf9b65f4a
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Term::ANSIColor qw(:constants);
5 use Data::Dump qw(ddx);
7 =pod
8 >theseq
9 ATTACATGGACTTCTCTGGCTACCAAT
11 blastn to bacteria (taxid:2)
13 http://www.ncbi.nlm.nih.gov/nucleotide/373881580?report=genbank&log$=nuclalign&blast_rank=1&RID=927901S401N
14 LOCUS JQ031552 76221 bp DNA circular BCT 29-JAN-2012
15 DEFINITION Aliivibrio fischeri strain KB1A-97 plasmid pKB1A97-67, complete
16 sequence.
17 ACCESSION JQ031552
18 VERSION JQ031552.1 GI:373881580
19 KEYWORDS .
20 SOURCE Aliivibrio fischeri (Vibrio fischeri)
21 ORGANISM Aliivibrio fischeri
22 Bacteria; Proteobacteria; Gammaproteobacteria; Vibrionales;
23 Vibrionaceae; Aliivibrio.
24 REFERENCE 1 (bases 1 to 76221)
25 AUTHORS Summers,A.O., Wireman,J. and Williams,L.E.
26 TITLE Direct Submission
27 JOURNAL Submitted (14-NOV-2011) Department of Microbiology, University of
28 Georgia, 527 Biological Sciences, Athens, GA 30602, USA
29 FEATURES Location/Qualifiers
30 source 1..76221
31 /organism="Aliivibrio fischeri"
32 /mol_type="genomic DNA"
33 /strain="KB1A-97"
34 /host="Euprymna scolopes"
35 /db_xref="taxon:668"
36 /plasmid="pKB1A97-67"
37 /country="USA: Hawaii"
38 /collection_date="2005"
39 /note="strain provided by Eric Stabb, University of
40 Georgia, Athens, GA, USA"
41 CDS complement(53120..53287)
42 /codon_start=1
43 /transl_table=11
44 /product="hypothetical protein"
45 /protein_id="AEY78253.1"
46 /db_xref="GI:373881650"
47 /translation="MIFVCNGLQFYCLVNRHESVINFQACLWVILVLFIYEKWHYVDLRRFNGCFEWLS"
48 >>> CDS 53565..54167
49 /codon_start=1
50 /transl_table=11
51 /product="Transposase-like protein"
52 /protein_id="AEY78254.1"
53 /db_xref="GI:373881651"
54 /translation="MDFSGYQYPSDIILQAVRYYVSYKLSTRDIEEIFTERGSAIDHS
55 TINRWVITFAPMLEQNARQLKRKVSSSWRMDETYIKIKGEWWYYYRAVDKYGDIVDFY
56 LSKERDEKAAKAFLRKAIHTNGLPDKVVIDKSGANALALHNLNVKLWLSVVFMLNLIE
57 IVDVKYLNNIVEQSYRPIKQKMVQALGWKSVEGATVTMSG"
58 CDS 54975..55841
59 /codon_start=1
60 /transl_table=11
61 /product="hypothetical protein"
62 /protein_id="AEY78255.1"
63 /db_xref="GI:373881652"
64 /translation="MQHDVGSNSDVYISCYSLNIKESGLSGAHYTISDIRKSIETVKVTSSYRHLHIEMKNSLCVCFSTLDITWVNGLEHGELLAGQFTVFDSECPISYKVTRVGSLCFVFIPKYFYEGVLQKQMVRCGMFEFVYVDAIRFILTRVNSKEDGEQLLISELLALGYLLSVLERKEEAVGKKVAFEDKVHEVIKDNMLNPSLYLDDIALILGCSKRKIQHCLSLQGVSFTKLVTKYRIEYLAEQLIRKKHSRIDVLCYESGFNSPGYASNSFKVIMGMSPKEYRCRYLAKSSVF"
65 =cut
67 # http://www.perlmonks.org/?node_id=1085446
68 #my %invert; @invert{ qw[ A C G T ] } = qw[ T G C A ];
70 my @CDS = (53565, 54167);
71 my $irMinLen = 3;
72 my $getLen = 40099;
74 $getLen = 150;
75 $irMinLen = 5;
77 # http://www.perlmonks.org/?node_id=197793
78 sub revdnacomp {
79 my $dna = shift; # or my $dna = shift @_;
80 my $revcomp = reverse($dna);
81 $revcomp =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
82 return $revcomp;
85 sub analyseIR($$) {
86 my ($seq1,$seq2) = @_;
87 my $seqlen1 = length $seq1;
88 my $seqlen2 = length $seq2;
89 my %Dat;
90 for my $irLeftPos2 (0 .. ($seqlen2-1)) {
91 #for my $irLen ($irMinLen .. int(($seqlen-$irLeftPos)/2)) {
92 for my $irLen ($irMinLen .. ($seqlen2-1-$irLeftPos2)) {
93 my $irSeq = substr $seq2,-$irLen-$irLeftPos2,$irLen;
94 my $revcmp = revdnacomp($irSeq);
95 my @thePoses;
96 my $p = 0;
97 while ($p >= 0) {
98 $p = index $seq1,$revcmp,$p+1;
99 push @thePoses,$p;
101 last if $thePoses[0] == -1;
102 pop @thePoses;
103 $Dat{"$_|$irLeftPos2"} = $irLen for @thePoses;
104 =pod
105 print "$irLen\[$irSeq]\t";
106 for (@thePoses) {
107 my $found = substr $seq1,$_,$irLen;
108 my $extra = substr $seq1,$_+$irLen,6;
109 print ".$found",GREEN,BOLD,"$extra $_",RESET;
111 print "\t$irLeftPos2\n";
112 =cut
115 #print "1:$seq1,$seq2\n";
116 return \%Dat;
118 sub printRes($$$$) {
119 my ($dat,$getLen,$seq1,$seq2) = @_;
120 my %res;
121 for my $poses (keys %{$dat}) {
122 my $irlen = $dat->{$poses};
123 ++$res{$irlen}{$poses};
125 for my $irlen (sort {$a<=>$b} keys %res) {
126 my %tmp;
127 for my $poses (keys %{$res{$irlen}}) {
128 my @Poses = split /\|/,$poses;
129 my $dis = $Poses[1] + $Poses[0];
130 my $upperPoses = join('|',$Poses[0]-1,$Poses[1]-1);
131 if (exists $res{$irlen+1}) {
132 delete $res{$irlen}{$poses} if exists $res{$irlen+1}{$upperPoses};
134 $tmp{$poses} = $dis;
136 my @Poses = keys %{$res{$irlen}};
137 if (@Poses > 0) {
138 my @tmp = sort { $tmp{$a} <=> $tmp{$b} } @Poses;
139 $res{$irlen} = \@tmp;
140 } else {
141 delete $res{$irlen};
144 for my $irlen (sort {$a<=>$b} keys %res) {
145 print "--- $irlen ---\n";
146 #ddx $res{$irlen};
147 for my $poses ( @{$res{$irlen}} ) {
148 my @Poses = split /\|/,$poses;
149 my $irL = substr $seq1,$Poses[0],$irlen;
150 my $irR = substr $seq2,-$irlen-$Poses[1],$irlen;
151 #my $revcmp = revdnacomp($irR);
152 print "$poses $irL $irR\n";
155 #print "2:$seq1,$seq2\n";
158 open I,'<','pKB1A97-67.fa' or die $?;
159 while (<I>) {
160 chomp;
161 my $Head;
162 my ($id,$desc)=split / /,$_,2;
163 if ($desc && $desc !~ /^\s*$/) {
164 $desc=~s/\t/_/g;
165 $Head="$id $desc";
166 } else { $desc='.';$Head=$id; }
167 $/=">";
168 my $seq=<I>;
169 chomp $seq;
170 $seq =~ s/\s//g;
171 $/="\n";
172 my $len=length($seq);
173 print STDERR "$id:$len,[$desc]\n";
174 $seq = uc $seq;
175 my $left = substr $seq,0,($CDS[0]-1);
176 my $right = substr $seq,($CDS[1]);
177 print "0 Len:(",length $left,',',length $right,")\n";
178 #$left = substr $left,-$getLen;
179 #$right = substr $right,0,$getLen;
180 print "1 Len:(",length $left,',',length $right,")\n";
181 my $res = analyseIR($left,$right);
182 #ddx $res;
183 printRes($res,$getLen,$left,$right);
184 =pod
185 #print "[$left]\n\n[$right]\n";
186 [TTTATTGACCGGCAGCGGTATGCGAGTGCGAACACCAACTTCTTATGGTGAATCACAAATGTACGACCAGCCTTTGCCTGTCACGCAGATGGACATGGGGCAAATTCCT
187 ... ATACACTTACCAACGCATCTTAATAGGGGTTCTGTCGCATGACAGGGAACTAAATCATCCATGAGATACTCAAAGTTCTGTTTTTTACATTAACTCGAGTATTAC]
189 [GAGGTCTGGACCCAAATTAAACGAAGGCAAGTTGGGGACGTAAATTTACCCGTATGGGAGCGCTTTCACGCGCTCGCTGCATAATTGTGTCTAGAATTCGGCACGTCTA
190 ... TGGATGCACGTATGCCTGCAATGGCAAATCTGACCGTAAAAGATGGGTTGGATTT]
191 =cut
193 close I;
195 __END__