new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / justonce / ssr.pl
blob6fa1256569e3593ac76f02ccb2e7e5c17df2b0fe
1 #!/usr/bin/perl
2 use strict;
3 use warnings;
4 use Data::Dump qw(ddx);
6 my @patterns = qw[
7 atag
8 attc
9 cagt
10 ccgg
11 tagc
12 aact
13 gaga
14 gcac
15 tggc
16 gcga
17 acgt
18 aaa
19 ata
20 gga
21 cga
22 tgc
23 taa
24 tac
25 gac
26 ggc
27 cca
28 cac
30 sub revcom($) {
31 my $str = $_[0];
32 $str =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
33 my $rev = reverse $str;
34 $rev =~ tr/[](){}<>/][)(}{></;
35 return $rev;
37 my $maxLen = 500;
38 my $minLen = 100; # 150
39 my $flanking = 200;
40 print "[@patterns]\n";
41 my (@bothpatterns,%t);
42 push @bothpatterns,revcom($_) for @patterns;
43 ++$t{$_} for (@patterns,@bothpatterns);
44 @bothpatterns = sort { length($a) <=> length($b) || $a cmp $b } keys %t;
45 print "[@bothpatterns]\n";
47 my $in = shift;
48 #my $out = shift;
49 my $out;
50 $in =~ /_(\w+)\.mfa\.gz$/;
51 $out =$1.'.lst';
52 print "From [$in] to [$out]\n";
54 open IN, "-|", "zcat $in";
55 open O,'>',$out;
57 sub getpattern($$) {
58 my ($seq,$pattLen) = @_;
59 #my $pattLen = length $patt;
60 my $mincpy = int($minLen/$pattLen);
61 my $maxcpy = int(1+$maxLen/$pattLen);
62 my (%count, $size);
63 $size=$pattLen;
64 #$mincpy=2;
65 #$flanking = 10;
66 $count{uc(substr($seq, $_, $size))}++ for 0..(length($seq) - $size);
67 #ddx \%count;
68 for (sort {$count{$b} <=> $count{$a} || $a cmp $b} keys %count) {
69 last if $count{$_} < $mincpy;
70 next if /^N*$/i;
71 my $pattern = $_;
72 my (@poses,@groups,$lastpos,$groupid);
73 while ($seq =~ /$_/ig) {
74 push @poses,pos($seq)-$size;
76 $lastpos = shift @poses;
77 $groupid = 0;
78 for (@poses) {
79 if ($_-$lastpos == $size) {
80 push @{$groups[$groupid]},$_; # lacking the 1st one.
81 } else {
82 ++$groupid;
84 $lastpos = $_;
86 for (@groups) {
87 next unless defined $_;
88 my @eachposes = @$_;
89 if (@eachposes > $mincpy-1) {
90 my $start = $eachposes[0]-$size;
91 my $left = substr($seq, $start-$flanking, $flanking);
92 my $right = substr($seq, $eachposes[-1]+$size,$flanking);
93 my $ssrlen = (1+@eachposes) * $size;
94 my $ssrseq = substr($seq, $start, $ssrlen);
95 my $all = "$left ($ssrseq) $right";
96 print O join("\t",$pattern, $ssrlen, 1+$start,$all),"\n";
99 #print "$_ :";
100 #print ' '.(pos($seq)-$size) while $seq =~ /$_/g;
101 #print " ($count{$_} matches)\n";
103 =pod
104 while($seq=~m/($patt){$mincpy,$maxcpy}/ig){
105 #while($seq=~m/([ATCG]+)(\1){$mincpy,$maxcpy}/ig){
106 #push(@index,$-[2]);
107 #pos($seq)=$-[0];
108 my $len = length $&;
109 print O join("\t",$patt,$len,$-[0],$&,substr($seq, $-[0]-$flanking, $flanking),substr($seq, $+[0],$flanking) ),"\n";# if $len >= $minLen and $len <= $maxLen;
111 =cut
115 $/ = ">";
116 my $junk = <IN>;
117 while ( my $record = <IN> ) {
118 chomp $record; # Remove the ">" from the end of $record, and realize that the ">" is already gone from the begining of the record
119 my ($defLine, @seqLines) = split /\n/, $record;
120 my $sequence = join('',@seqLines); # Concatenates all elements of the @seqLines array into a single string.
121 $sequence =~ s/\s//g;
122 print "$defLine\n"; # Print your definition; remember the ">" has already been removed. Remember to print a newline.
123 my $seqlen = length($sequence);
124 print "Seq Length: $seqlen\n"; # Print the sequence length and a newline
125 print O ">$defLine $seqlen\n";
126 #getpattern($sequence,$_) for (@bothpatterns);
127 getpattern($sequence,$_) for (3,4);
128 print O "\n";
131 $/ = "\n";
133 print "\n";
135 close IN;
136 close O;
139 __END__
140 scp galaxy@192.168.0.83:/Users/Galaxy/t/ssr.pl .
142 find *.mfa.gz|while read a;do perl ssr.pl "$a";done
143 awk '{print $2}' *.lst|sort -n|uniq -c
145 wc -l *.lst|grep -v ' 2 '|grep lst|awk '{print $2}'|xargs tar -czvf ssr.tgz