4 #use IO::Unread qw(unread);
5 use Data
::Dump
qw(ddx);
7 #die "Usage: $0 <fq.gz path>\n" if @ARGV < 1;
11 my $Ref = 'ref/hs37d5.fa';
12 my $BwaKit = "./bwa.kit/run-bwamem -sad -t12";
13 my $SamTools = './bwa.kit/samtools';
14 my $SamtoolsMerge = "$SamTools merge -l 9";
15 my $readsCounter = '/share/users/huxs/git/toGit/c_cpp/faststater/readsCounter';
16 my $GATK = 'java -Xmx16g -jar /opt/jar/GenomeAnalysisTK.jar';
18 opendir my($dh), $inp or die "Couldn't open dir '$inp': $!";
19 my @files = readdir $dh;
21 # http://perlmeme.org/faqs/file_io/directory_listing.html
23 @files = grep(/\.f(ast|)q\b/i, @files);
27 my (%Pairs,@AllTargets);
29 m
~^([^/]+?
)([\W_
.]*?
)([12])?
(\
.clean
)?
(\
.f
(ast
|)q
(\
.gz
)?
)$~i
or die $_;
30 my ($m,$rp,$r,$ext) = ($1,$2,$3,$5);
31 #$m =~ s/\W//g and die $m; # 减号也是\W, >_<
32 print "$1, $2, $3, $4, $5, $6, $7\t$m,$rp,$r,$ext, $_\n";
33 die unless ($r == 1) or ($r==2); # no SE now
34 push @
{$Pairs{$m}},[$_,$r];
37 my (%SamplesFN,%Samples);
39 my ($sid) = (split /_/,$_)[-2];
40 push @
{$SamplesFN{$sid}},$_;
42 @AllTargets = sort keys %SamplesFN;
46 print join(',',@AllTargets),"\n";
48 open S
,'>','Samples.list.pre';
49 print S
"$_\t\n" for @AllTargets;
52 if (-s
'Samples.list') {
53 open S
,'<','Samples.list';
56 my ($id,$sample)=split /\t/;
57 $Samples{$sample} = $SamplesFN{$id};
60 $Samples{$_} = $SamplesFN{$_} for @AllTargets;
63 print join(',',@AllTargets),"\n";
64 @AllTargets = sort keys %Samples;
65 print join(',',@AllTargets),"\n";
67 open M
,'>','Makefile' or die $!;
68 print M
'all: ',join(' ',@AllTargets,'_AdditionalTG_'),"\n";
69 my (@AdditionalTG,@PHONY);
75 for my $sid ( @AllTargets ) {
76 my @Runs = @
{$Samples{$sid}};
77 for my $Target (@Runs) {
78 my @Files = @
{$Pairs{$Target}};
79 die if @Files != 2; # no SE now
80 my (@fqFiles,@newFiles);
82 my ($fqname,$read12) = @
$_;
83 $newFiles[$read12-1] = "${Target}_$read12";
84 $fqFiles[$read12-1] = $fqname;
86 stat/${Target}_$read12.fqstat: $inp/$fqname
87 \t$readsCounter -o stat/${Target}_$read12.fqstat $inp/$fqname\n";
88 push @AdditionalTG, "stat/${Target}_$read12.fqstat";
91 bam/${Target}.aln.bam: ",join(' ',(map { "$inp/$_" } @fqFiles)),"
92 \t$BwaKit -R'\@RG\\tID:$Target\\tLB:${sid}-1\\tPL:ILLUMINA\\tSM:$sid\\tPI:350' -o bam/${Target} $Ref ",join(' ',(map { "$inp/$_" } @fqFiles) )," | sh
96 merged/$sid.bam: ",join(' ',(map { "bam/$_.aln.bam" } @Runs)),"
97 \t$SamtoolsMerge merged/$sid.bam ",join(' ',(map { "bam/$_.aln.bam" } @Runs)),"
98 \t$SamTools index merged/$sid.bam
101 merged/$sid.bam.bai: merged/$sid.bam
102 \t$SamTools index merged/$sid.bam
105 print M
"$sid: merged/$sid.bam.bai\n";
109 print M
join(' ','_AdditionalTG_:',@AdditionalTG),"\n";
110 print M
".PHONY: all clean",join(' ',@PHONY),"\n";
112 \t-rm bam/*.aln.*.bam
119 $ ./bwa.kit/run
-bwamem
-sad
-t24
-R
'@RG\tID:FCAP086\tLB:FCAP086\tPL:ILLUMINA\tSM:FCAP086' -o bam
/FCAP086_H2LGFCCXX_L3
ref/Felis_catus80_chr
.fa fq
/FCAP086_H2LGFCCXX_L3_1.fq.gz fq/FCAP086_H2LGFCCXX_L3_2
.fq
.gz
120 ./bwa.kit/seqtk mergepe fq
/FCAP086_H2LGFCCXX_L3_1.fq.gz fq/FCAP086_H2LGFCCXX_L3_2
.fq
.gz \
121 | ./bwa.kit/trimadap
2> bam
/FCAP086_H2LGFCCXX_L3
.log.trim \
122 | ./bwa.kit/bwa mem
-p
-t24
-R
'@RG\tID:FCAP086\tLB:FCAP086\tPL:ILLUMINA\tSM:FCAP086' ref/Felis_catus80_chr.fa - 2> bam/FCAP086_H2LGFCCXX_L3
.log.bwamem \
123 | ./bwa.kit/samblaster
2> bam
/FCAP086_H2LGFCCXX_L3
.log.dedup \
124 | ./bwa.kit/samtools
sort -@
4 -m1G
- bam
/FCAP086_H2LGFCCXX_L3
.aln
;
126 ARR
=`echo {10814..10823}| tr ' ' ,` && pidstat
-h
-r
-u
-p
$ARR 1
130 @RG Read group
. Unordered multiple
@RG lines are allowed
.
131 ID
* Read group identifier
. Each
@RG line must have a unique ID
. The value of ID is used
in the RG tags of alignment records
. Must be unique among all
read groups
in header section
. Read group IDs may be modified
when merging SAM files
in order to handle collisions
.
132 CN Name of sequencing center producing the
read.
134 DT Date the run was produced
(ISO8601 date
or date
/time).
135 FO Flow order
. The array of nucleotide bases that correspond to the nucleotides used
for each flow of
each read. Multi
-base flows are encoded
in IUPAC format
, and non
-nucleotide flows by various other characters
. Format
: /\*|[ACMGRSVTWYHKDBN]+/
136 KS The array of nucleotide bases that correspond to the key sequence of
each read.
138 PG Programs used
for processing the
read group
.
139 PI Predicted median insert size
.
140 PL Platform
/technology used to produce the reads
. Valid
values: CAPILLARY
, LS454
, ILLUMINA
, SOLID
, HELICOS
, IONTORRENT
, ONT
, and PACBIO
.
141 PM Platform model
. Free
-form text providing further details of the platform
/technology used
.
142 PU Platform unit
(e
.g
. flowcell
-barcode
.lane
for Illumina
or slide
for SOLiD
). Unique identifier
.
143 SM Sample
. Use pool name where a pool is being sequenced
145 http
://gatkforums
.broadinstitute
.org
/discussion/1317/collected
-faqs
-about
-bam
-files
147 @RG ID:FLOWCELL1.LANE1 PL:ILLUMINA LB:LIB-DAD-1 SM:DAD PI:200
148 @RG ID:FLOWCELL1.LANE2 PL:ILLUMINA LB:LIB-DAD-1 SM:DAD PI:200
149 @RG ID:FLOWCELL1.LANE3 PL:ILLUMINA LB:LIB-DAD-2 SM:DAD PI:400
150 @RG ID:FLOWCELL1.LANE4 PL:ILLUMINA LB:LIB-DAD-2 SM:DAD PI:400
153 @RG ID
:FLOWCELL1
.LANE5 PL
:ILLUMINA LB
:LIB
-MOM
-1 SM
:MOM PI
:200
154 @RG ID
:FLOWCELL1
.LANE6 PL
:ILLUMINA LB
:LIB
-MOM
-1 SM
:MOM PI
:200
155 @RG ID
:FLOWCELL1
.LANE7 PL
:ILLUMINA LB
:LIB
-MOM
-2 SM
:MOM PI
:400
156 @RG ID
:FLOWCELL1
.LANE8 PL
:ILLUMINA LB
:LIB
-MOM
-2 SM
:MOM PI
:400