modified: myjupyterlab.sh
[GalaxyCodeBases.git] / perl / etc / justonce / oyk / fetch1k1.pl
blobefcd3d04e29b9cdea9de16cf5648c5ba7a08729d
1 #!/usr/bin/env perl
2 =pod
3 Author: Hu Xuesong @ BGI <huxuesong@genomics.org.cn>
4 Version: 1.0.0 @ 20180516
5 =cut
6 use strict;
7 use warnings;
9 use Data::Dump qw(ddx);
11 die "Usage: $0 <in.tsv> ...\n[!]ChrMT are not allowed now.\n" if @ARGV<1;
13 my (%Pos19d,%Pos38d);
14 while (<>) {
15 next if /^#/;
16 chomp;
17 my ($rs,$chr19,$pos19,$chr38,$pos38,@d) = split /\t/,$_;
18 $chr19 =~ s/^chr//; # We have no marker on ChrM now.
19 $Pos19d{$chr19}{$pos19} = $rs;
20 $Pos38d{$chr38}{$pos38} = $rs if $chr38 ne 'unlocalized';
22 #ddx \%Pos19d;
24 my $hg19fa = '/share/FGI2017B/pub/Ref/Human/hs37d5.fna.bgz';
25 my $hg38fa = '/share/FGI2017B/pub/Ref/Human/GRCh38_no_alt_analysis_set.fna.bgz';
26 my @ChrIDs19 = (1..22,'X','Y','MT');
27 my @ChrIDs38 = map {"chr$_"} (1..22,'X','Y','M');
29 sub dealfa($$$$) {
30 my ($PosH,$ChrIDsA,$faf,$outf) = @_;
31 #ddx ($outf,$faf); ddx $ChrIDsA;
32 #ddx $PosH;
33 open O,'>',$outf or die $?;
34 for my $chr (@$ChrIDsA) {
35 my @Poses = sort { $a <=> $b } keys %{$PosH->{$chr}};
36 for my $p (@Poses) {
37 my $left = $p - 500;
38 my $right = $p + 500;
39 die if $left < 1;
40 open( SEQ,"-|","samtools faidx $faf $chr:$left-$right") or die $!;
41 my $head = <SEQ>;
42 if ($head ne ">$chr:$left-$right\n") {
43 die;
45 my $theSeq;
46 while (<SEQ>) {
47 chomp;
48 s/\s//g;
49 $theSeq .= $_;
51 my $top = substr $theSeq,0,500;
52 my $base = substr $theSeq,500,1;
53 my $bottom = substr $theSeq,501,500;
54 print O ">$PosH->{$chr}{$p} $chr:$p $left-$right\n${top}${base}\n$bottom\n";
55 # `samtools faidx` says "[E::fai_build_core] Different line length in sequence 'rs10453900'" if not in standard line length.
58 close O;
60 dealfa(\%Pos19d,\@ChrIDs19,$hg19fa,'nippt.out.hg19.fa');
61 dealfa(\%Pos38d,\@ChrIDs38,$hg38fa,'nippt.out.GRCh38.fa');
63 __END__
64 $ grep -v GL hs37d5.fna.bgz.fai
65 1 249250621 52 60 61
66 2 243199373 253404903 60 61
67 3 198022430 500657651 60 61
68 4 191154276 701980507 60 61
69 5 180915260 896320740 60 61
70 6 171115067 1080251307 60 61
71 7 159138663 1254218344 60 61
72 8 146364022 1416009371 60 61
73 9 141213431 1564812846 60 61
74 10 135534747 1708379889 60 61
75 11 135006516 1846173603 60 61
76 12 133851895 1983430282 60 61
77 13 115169878 2119513096 60 61
78 14 107349540 2236602526 60 61
79 15 102531392 2345741279 60 61
80 16 90354753 2449981581 60 61
81 17 81195210 2541842300 60 61
82 18 78077248 2624390817 60 61
83 19 59128983 2703769406 60 61
84 20 63025520 2763883926 60 61
85 21 48129895 2827959925 60 61
86 22 51304566 2876892038 60 61
87 X 155270560 2929051733 60 61
88 Y 59373566 3086910193 60 61
89 MT 16569 3147273397 70 71
90 NC_007605 171823 3153506529 60 61
91 hs37d5 35477943 3153681224 60 61
93 $ egrep -v 'chrUn|_random' GRCh38_no_alt_analysis_set.fna.bgz.fai
94 chr1 248956422 112 70 71
95 chr2 242193529 252513167 70 71
96 chr3 198295559 498166716 70 71
97 chr4 190214555 699295181 70 71
98 chr5 181538259 892227221 70 71
99 chr6 170805979 1076358996 70 71
100 chr7 159345973 1249605173 70 71
101 chr8 145138636 1411227630 70 71
102 chr9 138394717 1558439788 70 71
103 chr10 133797422 1698811686 70 71
104 chr11 135086622 1834520613 70 71
105 chr12 133275309 1971537157 70 71
106 chr13 114364328 2106716512 70 71
107 chr14 107043718 2222714743 70 71
108 chr15 101991189 2331287770 70 71
109 chr16 90338345 2434736088 70 71
110 chr17 83257441 2526365093 70 71
111 chr18 80373285 2610812039 70 71
112 chr19 58617616 2692333639 70 71
113 chr20 64444167 2751788762 70 71
114 chr21 46709983 2817153685 70 71
115 chr22 50818468 2864531079 70 71
116 chrX 156040895 2916075638 70 71
117 chrY 57227415 3074345836 70 71
118 chrM 16569 3132390908 70 71
119 chrEBV 171823 3144056708 70 71