modified: myjupyterlab.sh
[GalaxyCodeBases.git] / makefiles / genmkf4bwakit.pl
blob0b420a906b5286393be3a14e568bc1f18801b080
1 #!/usr/bin/env perl
2 use strict;
3 #use warnings;
4 #use IO::Unread qw(unread);
5 use Data::Dump qw(ddx);
7 #die "Usage: $0 <fq.gz path>\n" if @ARGV < 1;
8 my ($inp)=@ARGV;
9 $inp='fq';
11 my $Ref = 'ref/hs37d5.fa';
12 my $BwaKit = "./bwa.kit/run-bwamem -sad -t12";
13 my $SamTools = './bwa.kit/samtools';
14 my $SamtoolsMerge = "$SamTools merge -l 9";
15 my $readsCounter = '/share/users/huxs/git/toGit/c_cpp/faststater/readsCounter';
16 my $GATK = 'java -Xmx16g -jar /opt/jar/GenomeAnalysisTK.jar';
18 opendir my($dh), $inp or die "Couldn't open dir '$inp': $!";
19 my @files = readdir $dh;
20 closedir $dh;
21 # http://perlmeme.org/faqs/file_io/directory_listing.html
23 @files = grep(/\.f(ast|)q\b/i, @files);
25 #ddx \@files;
27 my (%Pairs,@AllTargets);
28 for ( @files ) {
29 m~^([^/]+?)([\W_.]*?)([12])?(\.clean)?(\.f(ast|)q(\.gz)?)$~i or die $_;
30 my ($m,$rp,$r,$ext) = ($1,$2,$3,$5);
31 #$m =~ s/\W//g and die $m; # 减号也是\W, >_<
32 print "$1, $2, $3, $4, $5, $6, $7\t$m,$rp,$r,$ext, $_\n";
33 die unless ($r == 1) or ($r==2); # no SE now
34 push @{$Pairs{$m}},[$_,$r];
37 my (%SamplesFN,%Samples);
38 for (keys %Pairs) {
39 my ($sid) = (split /_/,$_)[-2];
40 push @{$SamplesFN{$sid}},$_;
42 @AllTargets = sort keys %SamplesFN;
44 ddx \%Pairs;
45 ddx \%SamplesFN;
46 print join(',',@AllTargets),"\n";
48 open S,'>','Samples.list.pre';
49 print S "$_\t\n" for @AllTargets;
50 close S;
52 if (-s 'Samples.list') {
53 open S,'<','Samples.list';
54 while (<S>) {
55 chomp;
56 my ($id,$sample)=split /\t/;
57 $Samples{$sample} = $SamplesFN{$id};
59 } else {
60 $Samples{$_} = $SamplesFN{$_} for @AllTargets;
62 ddx \%Samples;
63 print join(',',@AllTargets),"\n";
64 @AllTargets = sort keys %Samples;
65 print join(',',@AllTargets),"\n";
67 open M,'>','Makefile' or die $!;
68 print M 'all: ',join(' ',@AllTargets,'_AdditionalTG_'),"\n";
69 my (@AdditionalTG,@PHONY);
71 mkdir 'bam';
72 mkdir 'stat';
73 mkdir 'merged';
75 for my $sid ( @AllTargets ) {
76 my @Runs = @{$Samples{$sid}};
77 for my $Target (@Runs) {
78 my @Files = @{$Pairs{$Target}};
79 die if @Files != 2; # no SE now
80 my (@fqFiles,@newFiles);
81 for (@Files) {
82 my ($fqname,$read12) = @$_;
83 $newFiles[$read12-1] = "${Target}_$read12";
84 $fqFiles[$read12-1] = $fqname;
85 print M "
86 stat/${Target}_$read12.fqstat: $inp/$fqname
87 \t$readsCounter -o stat/${Target}_$read12.fqstat $inp/$fqname\n";
88 push @AdditionalTG, "stat/${Target}_$read12.fqstat";
90 print M "
91 bam/${Target}.aln.bam: ",join(' ',(map { "$inp/$_" } @fqFiles)),"
92 \t$BwaKit -R'\@RG\\tID:$Target\\tLB:${sid}-1\\tPL:ILLUMINA\\tSM:$sid\\tPI:350' -o bam/${Target} $Ref ",join(' ',(map { "$inp/$_" } @fqFiles) )," | sh
95 print M "
96 merged/$sid.bam: ",join(' ',(map { "bam/$_.aln.bam" } @Runs)),"
97 \t$SamtoolsMerge merged/$sid.bam ",join(' ',(map { "bam/$_.aln.bam" } @Runs)),"
98 \t$SamTools index merged/$sid.bam
100 print M "
101 merged/$sid.bam.bai: merged/$sid.bam
102 \t$SamTools index merged/$sid.bam
104 push @PHONY,$sid;
105 print M "$sid: merged/$sid.bam.bai\n";
108 print M "\n";
109 print M join(' ','_AdditionalTG_:',@AdditionalTG),"\n";
110 print M ".PHONY: all clean",join(' ',@PHONY),"\n";
111 print M "clean:
112 \t-rm bam/*.aln.*.bam
114 close M;
116 __END__
117 make -j4
119 $ ./bwa.kit/run-bwamem -sad -t24 -R'@RG\tID:FCAP086\tLB:FCAP086\tPL:ILLUMINA\tSM:FCAP086' -o bam/FCAP086_H2LGFCCXX_L3 ref/Felis_catus80_chr.fa fq/FCAP086_H2LGFCCXX_L3_1.fq.gz fq/FCAP086_H2LGFCCXX_L3_2.fq.gz
120 ./bwa.kit/seqtk mergepe fq/FCAP086_H2LGFCCXX_L3_1.fq.gz fq/FCAP086_H2LGFCCXX_L3_2.fq.gz \
121 | ./bwa.kit/trimadap 2> bam/FCAP086_H2LGFCCXX_L3.log.trim \
122 | ./bwa.kit/bwa mem -p -t24 -R'@RG\tID:FCAP086\tLB:FCAP086\tPL:ILLUMINA\tSM:FCAP086' ref/Felis_catus80_chr.fa - 2> bam/FCAP086_H2LGFCCXX_L3.log.bwamem \
123 | ./bwa.kit/samblaster 2> bam/FCAP086_H2LGFCCXX_L3.log.dedup \
124 | ./bwa.kit/samtools sort -@ 4 -m1G - bam/FCAP086_H2LGFCCXX_L3.aln;
126 ARR=`echo {10814..10823}| tr ' ' ,` && pidstat -h -r -u -p $ARR 1
130 @RG Read group. Unordered multiple @RG lines are allowed.
131 ID* Read group identifier. Each @RG line must have a unique ID. The value of ID is used in the RG tags of alignment records. Must be unique among all read groups in header section. Read group IDs may be modified when merging SAM files in order to handle collisions.
132 CN Name of sequencing center producing the read.
133 DS Description.
134 DT Date the run was produced (ISO8601 date or date/time).
135 FO Flow order. The array of nucleotide bases that correspond to the nucleotides used for each flow of each read. Multi-base flows are encoded in IUPAC format, and non-nucleotide flows by various other characters. Format: /\*|[ACMGRSVTWYHKDBN]+/
136 KS The array of nucleotide bases that correspond to the key sequence of each read.
137 LB Library.
138 PG Programs used for processing the read group.
139 PI Predicted median insert size.
140 PL Platform/technology used to produce the reads. Valid values: CAPILLARY, LS454, ILLUMINA, SOLID, HELICOS, IONTORRENT, ONT, and PACBIO.
141 PM Platform model. Free-form text providing further details of the platform/technology used.
142 PU Platform unit (e.g. flowcell-barcode.lane for Illumina or slide for SOLiD). Unique identifier.
143 SM Sample. Use pool name where a pool is being sequenced
145 http://gatkforums.broadinstitute.org/discussion/1317/collected-faqs-about-bam-files
146 Dad's data:
147 @RG ID:FLOWCELL1.LANE1 PL:ILLUMINA LB:LIB-DAD-1 SM:DAD PI:200
148 @RG ID:FLOWCELL1.LANE2 PL:ILLUMINA LB:LIB-DAD-1 SM:DAD PI:200
149 @RG ID:FLOWCELL1.LANE3 PL:ILLUMINA LB:LIB-DAD-2 SM:DAD PI:400
150 @RG ID:FLOWCELL1.LANE4 PL:ILLUMINA LB:LIB-DAD-2 SM:DAD PI:400
152 Mom's data:
153 @RG ID:FLOWCELL1.LANE5 PL:ILLUMINA LB:LIB-MOM-1 SM:MOM PI:200
154 @RG ID:FLOWCELL1.LANE6 PL:ILLUMINA LB:LIB-MOM-1 SM:MOM PI:200
155 @RG ID:FLOWCELL1.LANE7 PL:ILLUMINA LB:LIB-MOM-2 SM:MOM PI:400
156 @RG ID:FLOWCELL1.LANE8 PL:ILLUMINA LB:LIB-MOM-2 SM:MOM PI:400