modified: Makefile
[GalaxyCodeBases.git] / perl / etc / getPalindromic.pl
blobe0b6a66de5854330093f7422c606eb1a89bbf644
1 #!/usr/bin/env perl
2 use strict;
3 use warnings;
4 use Term::ANSIColor qw(:constants);
5 use Data::Dump qw(ddx);
7 =pod
8 >theseq
9 ATTACATGGACTTCTCTGGCTACCAAT
11 blastn to bacteria (taxid:2)
13 http://www.ncbi.nlm.nih.gov/nucleotide/373881580?report=genbank&log$=nuclalign&blast_rank=1&RID=927901S401N
14 LOCUS JQ031552 76221 bp DNA circular BCT 29-JAN-2012
15 DEFINITION Aliivibrio fischeri strain KB1A-97 plasmid pKB1A97-67, complete
16 sequence.
17 ACCESSION JQ031552
18 VERSION JQ031552.1 GI:373881580
19 KEYWORDS .
20 SOURCE Aliivibrio fischeri (Vibrio fischeri)
21 ORGANISM Aliivibrio fischeri
22 Bacteria; Proteobacteria; Gammaproteobacteria; Vibrionales;
23 Vibrionaceae; Aliivibrio.
24 REFERENCE 1 (bases 1 to 76221)
25 AUTHORS Summers,A.O., Wireman,J. and Williams,L.E.
26 TITLE Direct Submission
27 JOURNAL Submitted (14-NOV-2011) Department of Microbiology, University of
28 Georgia, 527 Biological Sciences, Athens, GA 30602, USA
29 FEATURES Location/Qualifiers
30 source 1..76221
31 /organism="Aliivibrio fischeri"
32 /mol_type="genomic DNA"
33 /strain="KB1A-97"
34 /host="Euprymna scolopes"
35 /db_xref="taxon:668"
36 /plasmid="pKB1A97-67"
37 /country="USA: Hawaii"
38 /collection_date="2005"
39 /note="strain provided by Eric Stabb, University of
40 Georgia, Athens, GA, USA"
41 CDS complement(53120..53287)
42 /codon_start=1
43 /transl_table=11
44 /product="hypothetical protein"
45 /protein_id="AEY78253.1"
46 /db_xref="GI:373881650"
47 /translation="MIFVCNGLQFYCLVNRHESVINFQACLWVILVLFIYEKWHYVDLRRFNGCFEWLS"
48 >>> CDS 53565..54167
49 /codon_start=1
50 /transl_table=11
51 /product="Transposase-like protein"
52 /protein_id="AEY78254.1"
53 /db_xref="GI:373881651"
54 /translation="MDFSGYQYPSDIILQAVRYYVSYKLSTRDIEEIFTERGSAIDHS
55 TINRWVITFAPMLEQNARQLKRKVSSSWRMDETYIKIKGEWWYYYRAVDKYGDIVDFY
56 LSKERDEKAAKAFLRKAIHTNGLPDKVVIDKSGANALALHNLNVKLWLSVVFMLNLIE
57 IVDVKYLNNIVEQSYRPIKQKMVQALGWKSVEGATVTMSG"
58 CDS 54975..55841
59 /codon_start=1
60 /transl_table=11
61 /product="hypothetical protein"
62 /protein_id="AEY78255.1"
63 /db_xref="GI:373881652"
64 /translation="MQHDVGSNSDVYISCYSLNIKESGLSGAHYTISDIRKSIETVKVTSSYRHLHIEMKNSLCVCFSTLDITWVNGLEHGELLAGQFTVFDSECPISYKVTRVGSLCFVFIPKYFYEGVLQKQMVRCGMFEFVYVDAIRFILTRVNSKEDGEQLLISELLALGYLLSVLERKEEAVGKKVAFEDKVHEVIKDNMLNPSLYLDDIALILGCSKRKIQHCLSLQGVSFTKLVTKYRIEYLAEQLIRKKHSRIDVLCYESGFNSPGYASNSFKVIMGMSPKEYRCRYLAKSSVF"
65 =cut
67 # http://www.perlmonks.org/?node_id=1085446
68 my %invert; @invert{ qw[ A C G T ] } = qw[ T G C A ];
70 sub getTE($) {
71 my $in = $_[0];
72 my %ret;
73 #print $in,"\n";
74 for my $p1 ( 1 .. length( $in ) -2 ) {
75 next unless substr( $in, $p1, 1 ) eq $invert{ substr $in, $p1+1, 1 };
76 my $pals = 0;
77 for my $p2 ( 1 .. $p1 -1 ) {
78 last unless substr( $in, $p1-$p2, 1 ) eq $invert{ substr $in, $p1+$p2+1, 1 };
79 ++$pals;
81 #if( $pals ) {
82 if( $pals > 1 ) {
83 #printf "%s at %d\n",substr( $in, $p1-$pals, ($pals+1)*2 ), $p1-$pals;
84 my $palindromicDNA = substr( $in, $p1-$pals, ($pals+1)*2 );
85 my $leftPad = $p1-$pals;
86 $leftPad = 0 if $leftPad > 272;
87 printf "%s%s @ %d [%d]", '.'x$leftPad,
88 $palindromicDNA, $p1-$pals, ($pals+1)*2;
89 push @{$ret{$palindromicDNA}},$p1-$pals;
90 if (scalar @{$ret{$palindromicDNA}} > 1) {
91 print BOLD,RED,' *',RESET;
93 print "\n";
96 return \%ret;
99 my @CDS = (53565, 54167);
100 my $getIN = 5;
101 my $getLen = 6000;
103 ($getIN,$getLen) = (0,2000);
105 my $wholeLen = $getIN + $getLen;
106 my ($leftTEs,$rightTEs,%TEs,$allTEs);
107 sub analyseTE($);
109 open I,'<','pKB1A97-67.fa' or die $?;
110 while (<I>) {
111 chomp;
112 my $Head;
113 my ($id,$desc)=split / /,$_,2;
114 if ($desc && $desc !~ /^\s*$/) {
115 $desc=~s/\t/_/g;
116 $Head="$id $desc";
117 } else { $desc='.';$Head=$id; }
118 $/=">";
119 my $seq=<I>;
120 chomp $seq;
121 $seq =~ s/\s//g;
122 $/="\n";
123 my $len=length($seq);
124 print STDERR "$id:$len,[$desc]\n";
125 $seq = uc $seq;
126 my $left = substr $seq,($CDS[0]-1-$getLen),$wholeLen;
127 my $right = substr $seq,($CDS[1]-$getIN),$wholeLen;
128 #local $Term::ANSIColor::AUTORESET = 1;
129 # print substr($left,0,$getLen),BOLD,GREEN,substr($left,$getLen),RESET,"\n";
130 # $leftTEs = getTE($left);
131 # print BOLD,GREEN,substr($right,0,$getIN),RESET,substr($right,$getIN,$getLen),"\n";
132 # $rightTEs = getTE($right);
133 $allTEs = getTE($seq);
134 analyseTE($seq);
136 close I;
138 sub checkType($$) {
139 my ($pos,$telen) = @_;
140 ++$pos; # now, 1-based
141 my $posR = $pos+$telen;
142 if ( $posR < $CDS[0] ) {
143 return $posR - $CDS[0];
144 } elsif ( $pos > $CDS[1] ) {
145 return $pos - $CDS[1];
146 } else { return 0; }
149 my (%PatDat,%PatFlag,@Patterns);
150 sub printTE($$) {
151 my ($flag,$mode) = @_;
152 for my $k (@Patterns) {
153 if ($mode == 1) {
154 next unless $PatFlag{$k} == $flag;
155 } elsif ($mode == 0) {
156 next unless $PatFlag{$k} & $flag;
157 } elsif ($mode == -1) {
158 next if $PatFlag{$k} == $flag;
159 } else { die; }
160 my $itsLen = length($k);
161 my ($Instde,@Left,@Right,@PosL,@PosR) = (0);
162 for my $p (@{$PatDat{$k}}) {
163 push @Left,$p if $p < 0;
164 push @Right,$p if $p > 0;
165 $Instde = 1 if $p == 0;
167 if (@Left==0) {
168 @Left=('NA');
169 @PosL=('NA');
170 } else {
171 push @PosL,$_-$itsLen+$CDS[0] for @Left;
173 if (@Right==0) {
174 @Right=('NA');
175 @PosR=('NA');
176 } else {
177 push @PosR,$_+$CDS[1] for @Right;
180 #print BOLD,GREEN,"$PatFlag{$k} $k\[$itsLen]: ",join(',',$Left[-1],$Right[0]),' -> ',join(',',$PosL[-1],$PosR[0]),($Instde?' *':''),RESET,"\n";
181 print BOLD,GREEN,"$k\[$itsLen]: ",join(',',$Left[-1],$Right[0]),' -> ',join(',',$PosL[-1],$PosR[0]),($Instde?' *':''),RESET,"\n";
182 if ((@Left + @Right)>2) {
183 print "\t",join(',',@PosL),' | ',join(',',@PosR);
184 #print " <- ",join(',',@Left),' | ',join(',',@Right);
186 print "\n";
190 sub analyseTE($) {
191 my $seq = $_[0];
192 =pod
193 for my $k (keys %{$leftTEs}) {
194 $TEs{$k} = [ ${$leftTEs}{$k} ];
196 for my $k (keys %{$rightTEs}) {
197 push @{$TEs{$k}},${$rightTEs}{$k};
199 for my $k (keys %TEs) {
200 next unless scalar @{$TEs{$k}} == 2;
201 print '>',$k,"\t";
202 ddx $TEs{$k};
204 =cut
205 %TEs = %{$allTEs};
207 @Patterns = sort { length($b)<=>length($a) || $a cmp $b } keys %TEs;
208 (%PatDat,%PatFlag)=();
209 for my $k (@Patterns) {
210 my @itsPoses;
211 my $itsLen = length($k);
212 print "\n$k\[$itsLen]: ";
213 while ($seq =~ /(?=$k)/g) { # http://www.perlmonks.org/?node_id=1090633
214 my $p = pos $seq;
215 my $chk = checkType($p,$itsLen);
216 print "$p,$chk\t";
217 push @{$PatDat{$k}},$chk;
218 $PatFlag{$k} |= 1 if $chk <0;
219 $PatFlag{$k} |= 2 if $chk >0;
220 $PatFlag{$k} |= 4 if $chk ==0;
222 print "\n";
224 #ddx \%PatFlag;
225 ddx \@Patterns;
226 print BOLD,RED,'-'x5,'Both','-'x10,"\n";
227 printTE(3,1);
228 print BOLD,RED,'-'x5,'Instde','-'x10,"\n";
229 printTE(4,0);
230 #print BOLD,RED,'-'x5,'Single','-'x10,"\n";
231 #printTE(3,-1);
234 __END__
236 #http://repo.hackerzvoice.net/depot_madchat/esprit/texture/hallucinati/finding%20DNA%20palindroms.htm
238 #!/usr/bin/perl
239 $filename = "668plasmids.fa";
240 open (TEXT, $filename)||die"Cannot";
241 $line = " ";
242 $count = 0;
243 for $n (5..20)
245 $re = qr /[CAGT]{$n}/;
246 $regexes[$n-5] = $re;
248 NEXTLINE: while ($count < 1000)
250 $line = <TEXT> ;
251 $count++;
252 foreach my $value (@regexes)
254 $start = 0;
255 while ($line =~ /$value/g)
257 $endline = $';
258 $match = $&;
259 $revmatch = reverse($match);
260 $revmatch =~ tr/CAGT/GTCA/;
261 if ($endline =~ /^([CAGT]{0,15})($revmatch)/)
263 $start = 1;
264 $palindrome = $match . "*" . $1 . "*" . $2;
265 $palhash{$palindrome}++;
268 if ($start == 0)
270 goto NEXTLINE;
274 close TEXT;
275 while(($key, $value) = each (%palhash))
277 print "$key => $value\n";
279 exit;