modified: n.fq
[GalaxyCodeBases.git] / tools / sfmixer / sfmix2fa.pl
blob252f6fe49be8298b55e48358c56e332fc7b42f07
1 #! /usr/bin/perl -w
3 use strict;
4 use warnings;
5 use Getopt::Long;
6 #use File::Basename;
8 my %opts = ();
10 GetOptions(\%opts, "faFile:s", "len:i", "insert:i", "outDir:s", "oriChrName:s", "chrname:s", "project:s");
12 unless (defined $opts{faFile} && $opts{outDir} && $opts{project}) {
13 print "\n\tthis program will generate a new fa file based on the original fa file. The scaffolds will be merged into several chromosomes using \"n\" to sperate them\n\tthe output will be the new fa file and a list with which the scaffold will be traced back\n";
14 print "\n\t-faFile\t\tthe original fa file\n";
15 print "\t-outDir\t\tthe output directory\n";
16 print "\t-project\tproject name\n";
17 print "\t-len\t\tthe len of each new chromosome after merge [10000000]\n";
18 print "\t-insert\t\tthe number of \"n\" insert into the new chromosome between two scaffolds [75]\n";
19 print "\t-oriChrName\tthe original name pattern (case insensitive) [Chr]\n";
20 print "\t-chrname\tthe new chromosome name pattern [ChrNew]\n";
21 print "Example:\n\tperl $0 -faFile /share/tmp/pub/Genome/cucumberBGI/Cucumber.original.fa -outDir /share/raid11/zhanghao/software/preProcess/ -project Cucumber -chrname LG_M\n";
22 print "\toutput: /share/raid11/zhanghao/software/preProcess/Cucumber.merge.fa\n\t\t/share/raid11/zhanghao/software/preProcess/Cucumber.merge.list\n";
23 print "\tperl $0 -faFile /share/tmp/pub/Genome/SorghumBicolor/sbi1.fasta -outDir /share/raid11/zhanghao/software/preProcess/ -project SorghumBicolor -oriChrName chromosome -chrname chromosome_M\n";
24 print "\tAuther: Hao Zhang\tTime: 22:32 21/04/2009\n";
25 print " fixed bye lijun3 Mon Jan 4 17:50:33 CST 2010\n";
26 exit 0;
29 my $fa_file = $opts{faFile};
30 my $chr_len = $opts{len};
31 my $insert_size = $opts{insert};
32 my $out_dir = $opts{outDir};
33 my $scaffold_tem = $opts{oriChrName};
34 my $chr_tem = $opts{chrname};
35 my $project_name = $opts{project};
37 $chr_len ||= 10000000;
38 my $offset = $chr_len/1000;
39 $insert_size ||= 75;
40 $scaffold_tem ||= "Chr";
41 $chr_tem ||= "ChrNew";
42 my $base_per_line = 50;
43 #print"$scaffold_tem\t$chr_tem\n";
45 my $chr_num = 1;
46 my $this_chr_index = 1;
47 my $merging = 0;
48 my $current_chr = "";
49 my $current_scaf = "";
50 my %hChrInfo = ();
51 my %hScaName = (); #{chr} => (sca1, sca2, sca3...)
53 my $starttime = time();
55 open FA, $fa_file or die "$!";
56 #my $file_basename = basename $fa_file;
57 #$file_basename =~ s/\.fa//;
58 open MERGEFA, ">$out_dir/$project_name.merge.fa" or die "$!";
59 my $base_per_line_mark = 1;
60 while (my $line = <FA>) {
61 chomp $line;
62 if ($line =~ /^>/) {
63 if ($line !~ /$scaffold_tem/i) {
64 my $scaffold_name = (split /\s+/, $line)[0];
65 $scaffold_name =~ s/^>//;
66 if ($this_chr_index > ($chr_len - $offset)) { #finish merging one chromosome, and will start merge a new one.
67 $hChrInfo{$chr_num}{$current_scaf} .= "\t$this_chr_index";
68 ++$chr_num;
70 if($chr_num < 10)
72 $current_chr = "${chr_tem}0$chr_num";
74 else
76 $current_chr = "$chr_tem$chr_num";
79 $current_scaf = $scaffold_name;
80 $this_chr_index = 1;
81 $hChrInfo{$chr_num}{$current_scaf} = $this_chr_index;
82 push @{$hScaName{$chr_num}}, $current_scaf;
83 print MERGEFA "\n\>$current_chr\n";
85 else { #still the previous one
86 if ((1 == $this_chr_index) and (1 == $chr_num)) { #the first scaffold
87 $current_chr = "${chr_tem}0$chr_num";
88 $current_scaf = $scaffold_name;
89 $hChrInfo{$chr_num}{$current_scaf} = $this_chr_index;
90 push @{$hScaName{$chr_num}}, $current_scaf;
91 print MERGEFA ">$current_chr\n";
93 else {
94 my $end_index = $this_chr_index - 1;
95 $hChrInfo{$chr_num}{$current_scaf} .= "\t$end_index";
96 for(my $count = $this_chr_index; ($this_chr_index - $count) < $insert_size; ++$this_chr_index) {
97 print MERGEFA "n";
98 if (0 == ($this_chr_index % $base_per_line)) {
99 print MERGEFA "\n";
102 $current_scaf = $scaffold_name;
103 $hChrInfo{$chr_num}{$current_scaf} = $this_chr_index;
104 push @{$hScaName{$chr_num}}, $current_scaf;
107 $merging = 1;
108 } #end if
109 else {
110 $merging = 0;
111 print MERGEFA "$line\n";
114 else {
115 if (0 == $merging) {
116 print MERGEFA "$line\n";
117 if ($base_per_line_mark) {
118 my @count_base_per_line = unpack "(a)*", $line;
119 $base_per_line = @count_base_per_line;
120 $base_per_line_mark = 0;
123 else {
124 my @bases = unpack "(a)*", $line;
125 foreach my $base (@bases) {
126 print MERGEFA "$base";
127 if(0 == ($this_chr_index % $base_per_line)) {
128 print MERGEFA "\n";
130 ++$this_chr_index;
134 } #end while
135 my $end_index = $this_chr_index - 1;
136 $hChrInfo{$chr_num}{$current_scaf} .= "\t$end_index";
137 close MERGEFA;
138 close FA;
140 open LIST, ">$out_dir/$project_name.merge.list" or die "$!";
141 open CHRORDER, ">$out_dir/$project_name.merge.chrorder" or die "$!";
142 foreach my $chr (sort {$a<=>$b} keys %hChrInfo) {
143 print CHRORDER "$chr_tem$chr\n";
144 foreach my $scaf (@{$hScaName{$chr}}) {
145 print LIST "$chr_tem$chr\t$scaf\t$hChrInfo{$chr}{$scaf}\n";
148 close CHRORDER;
149 close LIST;
151 my $endtime = time();
152 my $processtime = $endtime - $starttime;
153 print "take $processtime seconds to process $fa_file\n";