new file: cell2loc.py
[GalaxyCodeBases.git] / perl / etc / justonce / refmt.pl
blob382857a998345b27ac40e4cce3c23d8ab06a58d9
1 #!/bin/env perl
2 use strict;
3 use warnings;
4 use Bio::AlignIO;
5 use Data::Dump qw(ddx);
7 die "Usage: $0 <input msf> <output>\n" if @ARGV < 2;
8 my ($inf,$outf)=@ARGV;
10 open OUT,'>',$outf or die;
12 my $in = Bio::AlignIO->new(-file => $inf,
13 -format => "msf" );
15 my $alnObj = $in->next_aln(); # get entire alignment data
17 my (%Seq,$cns,$combined,@CNS,$t,@ids);
18 foreach my $seqObj ($alnObj->each_seq) {
19 print join(',',$seqObj->display_id), "\n";
20 push @ids,$seqObj->display_id;
21 my $seq = $seqObj->seq;
22 $Seq{$seqObj->display_id}=[-1,0,$seq];
23 my @bases = split //,$seq;
24 $t=0;
25 for my $b (@bases) {
26 if ($b ne '.') {
27 if (exists $CNS[$t]) {
28 ++$CNS[$t]->{$b};
29 } else {
30 $CNS[$t] = {$b => 1};
32 if ($Seq{$seqObj->display_id}->[0] == -1) {
33 $Seq{$seqObj->display_id}->[0] = $t + 1;
36 ++$t;
38 $seq =~ s/\.+$//g;
39 $Seq{$seqObj->display_id}->[1] = length($seq);
42 ddx \@CNS;
43 #ddx \%Seq;
44 print "@ids\n";
46 #/share/users/huxs/git/toGit/perl/perlib/etc/Galaxy/Data.pm
47 our %REV_IUB = (A => 'A',
48 T => 'T',
49 C => 'C',
50 G => 'G',
51 AC => 'M',
52 AG => 'R',
53 AT => 'W',
54 CG => 'S',
55 CT => 'Y',
56 'GT' => 'K',
57 ACG => 'V',
58 ACT => 'H',
59 AGT => 'D',
60 CGT => 'B',
61 ACGT=> 'N',
62 N => 'N'
65 my %ATGtoBIN = ( A => 1,C => 2,G => 4,T => 8 );
67 $cns=$combined='';
68 my @CNSbases;
69 for my $base (@CNS) {
70 my %thebase = %{$base};
71 $t = (sort { $thebase{$b} <=> $thebase{$a} } keys %thebase)[0];
72 push @CNSbases,$t;
73 $cns .= $t;
74 $t = join '',(sort { $a cmp $b } keys %thebase);
75 $combined .= $REV_IUB{$t} or die;
78 print "$cns\n$combined\n";
80 my $Main = (split /\*/,$ids[0])[0];
81 print OUT <<HEAD;
82 <?xml version="1.0" encoding="UTF-8"?>
83 <reference schema="350">
84 <name>${Main}_Alleles</name>
85 <library>${Main}</library>
86 <comments>No Comment</comments>
87 <version>2012_Tang</version>
88 <start>0</start>
89 <max_deletion>5</max_deletion>
90 <consensus>$cns</consensus>
91 <combined>$combined</combined>
93 <allele-list>
94 HEAD
96 for my $id (@ids) {
97 my ($start,$stop,$seq) = @{$Seq{$id}};
98 my @bases = split //,$seq;
99 $t=0;
100 my @variants=();
101 for my $b (@bases) {
102 if ($b ne '.' and $b ne $CNSbases[$t]) {
103 push @variants,join(' ',$t+1,$ATGtoBIN{$b});
104 print "$id:$t $b ne $CNSbases[$t]\n";
106 ++$t;
108 $t = join(' ',@variants);
109 print OUT <<ITEM;
110 <allele>
111 <name>$id</name>
112 <type>0</type>
113 <start>$start</start>
114 <stop>$stop</stop>
115 <variants>$t</variants>
116 </allele>
117 ITEM
120 print OUT '</allele-list>
121 </reference>
123 close OUT;