make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / download_genotypes.pl
blobae97e010b746ec5cc0d80ad7b9b33cd98e40c54d
1 #!/usr/bin/perl
3 =head1 NAME
5 download_genotypes.pl - downloads a genotyping file (vcf or dosage) using a file with a list of accession names and a genotyping protocol id.
7 =head1 SYNOPSIS
9 perl bin/download_genotypes.pl -h [dbhost] -d [dbname] -i [infile] -o [outfile] -p [genotyping_protocol]
11 =head2 REQUIRED ARGUMENTS
13 -h host name e.g. "localhost"
14 -d database name e.g. "cxgn_cassava"
15 -p genotyping protocol name
16 -i path to infile
17 -o path to output file
18 -f format [default vcf]
20 =head2 OPTIONAL ARGUMENTS
22 -q web cluster queue
23 -t cluster shared temp dir
24 -c cache root dir
25 -b basepath
27 =head1 AUTHOR
29 Lukas Mueller
31 =cut
33 use strict;
34 use warnings;
36 use Bio::Chado::Schema;
37 use Getopt::Std;
38 use Data::Dumper;
40 use SGN::Model::Cvterm;
41 use CXGN::DB::InsertDBH;
42 use CXGN::Dataset::File;
43 use CXGN::List;
45 our ($opt_h, $opt_d, $opt_p, $opt_i, $opt_o, $opt_q, $opt_t, $opt_c, $opt_b, $opt_f);
47 getopts("h:d:p:i:o:q:t:c:b:f:");
49 my $dbhost = $opt_h;
50 my $dbname = $opt_d;
51 my $in_file = $opt_i;
52 my $out_file = $opt_o;
53 my $protocol_name = $opt_p;
54 my $web_cluster_queue = $opt_q || '';
55 my $cluster_shared_tempdir = $opt_t || '/tmp';
56 my $cluster_host = $opt_c || 'localhost';
57 my $format = $opt_f || "vcf";
58 my $basepath = $opt_b || '/home/production/cxgn/sgn';
60 my $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
61 dbname=>$dbname,
62 dbargs => {AutoCommit => 1,
63 RaiseError => 1,
66 } );
68 my $schema= Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() });
70 my $q = "SELECT nd_protocol_id, name FROM nd_protocol WHERE name = ?";
72 my $h = $dbh->prepare($q);
73 $h->execute($protocol_name);
75 my $protocol_exists;
77 my $protocol_id;
78 while (my ($pr_id, $pr_name) = $h->fetchrow_array()) {
79 print STDERR "\nFound genotyping protocol: $pr_name -- id: $pr_id\n";
80 $protocol_exists = 1;
81 $protocol_id = $pr_id;
84 if (!$protocol_exists) {
85 die "\n\nGENOTYPING PROTOCOL $protocol_name does not exist in the database\n\n";
88 my @accession_names;
90 if ($in_file) {
91 open(my $F, "< :encoding(UTF-8)", $in_file) || die "Can't open file $in_file\n";
93 while (<$F>) {
94 chomp;
95 push @accession_names, $_;
97 close($F);
100 my $s = Bio::Chado::Schema->connect( sub { $dbh->get_actual_dbh() } );
101 my $p = CXGN::People::Schema->connect( sub { $dbh->get_actual_dbh() });
103 my @accession_ids;
105 foreach my $a (@accession_names) {
106 my $row = $s->resultset('Stock::Stock')->find( { uniquename => $a });
107 if (!$row) {
108 print STDERR "Accession $a does not exist! Skipping!\n";
110 else {
111 push @accession_ids, $row->stock_id();
115 my $ds = CXGN::Dataset::File->new( { people_schema => $p, schema => $s } );
117 if ($in_file) {
118 $ds->accessions(\@accession_ids);
121 $ds->genotyping_protocols([ $protocol_id ]);
123 if ($format eq "vcf") {
124 my $fh = $ds->retrieve_genotypes_vcf($protocol_id, $out_file, '/tmp',
125 $cluster_shared_tempdir, 'Slurm',
126 $cluster_host, $web_cluster_queue,
127 $basepath, 1);
129 elsif ($format eq "dosage") {
130 my $fh = $ds->retrieve_genotypes($protocol_id, $out_file, '/tmp',
131 $cluster_shared_tempdir, 'Slurm',
132 $cluster_host, $web_cluster_queue,
133 $basepath, 1);
135 else {
136 print STDERR "Unknown format $format.\n";
139 print STDERR "Done.\n";