bin/load_genotypes.pl

   1 #!/usr/bin/perl
   2
   3 =head1
   4
   5 load_genotypes.pl - loading genotypes into cxgn databases, based on the load_cassava_snps.pl script by Naama.
   6
   7 =head1 SYNOPSIS
   8
   9     load_genotypes.pl -H [dbhost] -D [dbname] -i [infile] -p [project name] -y [year] g [population name] -m [protocol name] -t
  10
  11 =head1 COMMAND-LINE OPTIONS
  12   ARGUMENTS
  13  -H host name (required) e.g. "localhost"
  14  -D database name (required) e.g. "cxgn_cassava"
  15  -i path to infile (required)
  16  -p project name (required) e.g. "SNP genotyping 2012 Cornell Biotech"
  17  -y project year (required) e.g. "2012"
  18  -g population name (required) e.g. "NaCRRI training population"
  19  -m protocol name (required) e.g. "GBS ApeKI Cassava genome v6"
  20   FLAGS
  21  -x delete old genotypes for accessions that have new genotypes
  22  -a add accessions that are not in the databaseA
  23  -v order markers using version sort
  24  -s sort markers according to custom sort order (see script source)
  25  -t Test run . Rolling back at the end.
  26
  27 =head1 DESCRIPTION
  28
  29 This script loads genotype data into the Chado genotype table it encodes the genotype + marker name in a json format in the genotyope.uniquename field for easy parsing by a Perl program. The genotypes are linked to the relevant stock using nd_experiment_genotype. Each column in the spreadsheet, which represents a single accession (stock) is stored as a single genotype entry and linked to the stock via nd_experiment_genotype. Stock names are stored in the stock table if cannot be found, and linked to a population stock with the name supplied in opt_g
  30
  31 =head1 AUTHOR
  32
  33  Naama Menda (nm249@cornell.edu) - July 2012
  34  Modified by Lukas, Jan 2015
  35
  36 =cut
  37
  38 use strict;
  39
  40 use Getopt::Std;
  41 use Data::Dumper;
  42 use JSON::Any;
  43 use JSON::PP;
  44 use Carp qw /croak/ ;
  45 use Try::Tiny;
  46 use Pod::Usage;
  47
  48 use Bio::Chado::Schema;
  49 use CXGN::People::Person;
  50 use CXGN::DB::InsertDBH;
  51 use CXGN::Genotype;
  52 use CXGN::GenotypeIO;
  53 use Sort::Versions;
  54 use SGN::Model::Cvterm;
  55
  56 our ($opt_H, $opt_D, $opt_i, $opt_t, $opt_p, $opt_y, $opt_g, $opt_a, $opt_x, $opt_v, $opt_s, $opt_m);
  57
  58 getopts('H:i:tD:p:y:g:axsm:');
  59
  60 my $dbhost = $opt_H;
  61 my $dbname = $opt_D;
  62 my $file = $opt_i;
  63 my $population_name = $opt_g;
  64 my $protocol_name = $opt_m;
  65
  66 print STDERR "Input file: $file\n";
  67 print STDERR "DB host: $dbhost\n";
  68 print STDERR "DB name: $dbname\n";
  69 print STDERR "Population name: $population_name\n";
  70 print STDERR "Project year: $opt_y\n";
  71 print STDERR "Add missing accessions: $opt_a\n";
  72 print STDERR "Delete old duplicate phenotypes: $opt_x\n";
  73 print STDERR "Rollback: $opt_t\n";
  74
  75 if (!$opt_H || !$opt_D || !$opt_i || !$opt_g) {
  76     pod2usage(-verbose => 2, -message => "Must provide options -H (hostname), -D (database name), -i (input file) , and -g (populations name for associating accessions in your SNP file) \n");
  77 }
  78
  79 my $dbh = CXGN::DB::InsertDBH->new( { dbhost=>$dbhost,
  80                                       dbname=>$dbname,
  81                                       dbargs => {AutoCommit => 1,
  82                                                  RaiseError => 1}
  83                                     }
  84     );
  85
  86
  87 my $schema= Bio::Chado::Schema->connect(  sub { $dbh->get_actual_dbh() } );
  88 $dbh->do('SET search_path TO public,sgn');
  89
  90 # getting the last database ids for resetting at the end in case of rolling back
  91 #
  92 my $last_nd_experiment_id = $schema->resultset('NaturalDiversity::NdExperiment')->get_column('nd_experiment_id')->max;
  93 my $last_cvterm_id = $schema->resultset('Cv::Cvterm')->get_column('cvterm_id')->max;
  94 my $last_nd_experiment_project_id = $schema->resultset('NaturalDiversity::NdExperimentProject')->get_column('nd_experiment_project_id')->max;
  95 my $last_nd_experiment_stock_id = $schema->resultset('NaturalDiversity::NdExperimentStock')->get_column('nd_experiment_stock_id')->max;
  96 my $last_nd_experiment_genotype_id = $schema->resultset('NaturalDiversity::NdExperimentGenotype')->get_column('nd_experiment_genotype_id')->max;
  97 my $last_genotype_id = $schema->resultset('Genetic::Genotype')->get_column('genotype_id')->max;
  98 my $last_project_id = $schema->resultset('Project::Project')->get_column('project_id')->max;
  99
 100 my %seq  = (
 101     'nd_experiment_nd_experiment_id_seq' => $last_nd_experiment_id,
 102     'cvterm_cvterm_id_seq' => $last_cvterm_id,
 103     'nd_experiment_project_nd_experiment_project_id_seq' => $last_nd_experiment_project_id,
 104     'nd_experiment_stock_nd_experiment_stock_id_seq' => $last_nd_experiment_stock_id,
 105     'nd_experiment_genotype_nd_experiment_genotype_id_seq' => $last_nd_experiment_genotype_id,
 106     'genotype_genotype_id_seq' => $last_genotype_id,
 107     'project_project_id_seq'   => $last_project_id,
 108     );
 109
 110 my $accession_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'accession', 'stock_type');
 111
 112 my $population_cvterm =  SGN::Model::Cvterm->get_cvterm_row($schema, 'training population', 'stock_type');
 113
 114
 115 my $igd_number_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'igd number', 'genotype_property');
 116
 117 #store a project
 118 my $project = $schema->resultset("Project::Project")->find_or_create(
 119     {
 120         name => $opt_p,
 121         description => $opt_p,
 122     } ) ;
 123 $project->create_projectprops( { 'project year' => $opt_y }, { autocreate => 1 } );
 124
 125 # find the cvterm for a genotyping experiment
 126
 127 my $geno_cvterm = SGN::Model::Cvterm->get_cvterm_row($schema, 'genotyping_experiment', 'experiment_type');
 128
 129 my $protocol_row = $schema->resultset("NaturalDiversity::NdProtocol")->find_or_create(
 130     { name => $protocol_name,
 131       type_id => $geno_cvterm->cvterm_id
 132     });
 133
 134 my $protocol_id = $protocol_row->nd_protocol_id();
 135
 136 # find the cvterm for the SNP calling experiment
 137 my $snp_genotype = SGN::Model::Cvterm->get_cvterm_row($schema, 'snp genotyping', 'genotype_property');
 138
 139 my $geolocation = $schema->resultset("NaturalDiversity::NdGeolocation")->find_or_create(
 140     {
 141         description => 'Cornell Biotech', #add this as an option
 142     } ) ;
 143
 144
 145 my $organism = $schema->resultset("Organism::Organism")->find_or_create(
 146     {
 147         genus   => 'Manihot',
 148         species => 'Manihot esculenta',
 149     } );
 150
 151 my $population_members = SGN::Model::Cvterm->get_cvterm_row($schema, 'member_of', 'stock_relationship');
 152
 153 my $organism_id = $organism->organism_id();
 154 ########################
 155
 156 #new spreadsheet,
 157 #my $spreadsheet=CXGN::Tools::File::Spreadsheet->new($file);
 158
 159 print STDERR "Reading genotype information...\n";
 160 my $gtio = CXGN::GenotypeIO->new( { file => $file, format => "dosage_transposed" });
 161
 162 #my @rows = $spreadsheet->row_labels();
 163 #my @columns = $spreadsheet->column_labels();
 164
 165 my $coderef = sub {
 166     while (my $gt = $gtio->next())  {
 167         my ($accession_name, $igd_number) = split(/:/, $gt->name());
 168         my $db_name = $accession_name;
 169
 170         $db_name =~ s/(.*?)\.(.*)/$1/;
 171
 172         if ($accession_name eq "marker" || $accession_name =~ /BLANK/i ) {next;}
 173
 174         #print Dumper($gt->rawscores);
 175
 176         print STDERR "Looking for accession $accession_name ($db_name)\n";
 177         my %json;
 178         my $cassava_stock;
 179         my $stock_name;
 180         my $stock_rs = $schema->resultset("Stock::Stock")->search(
 181             { 'lower(me.uniquename)' => { like => lc($db_name) },
 182
 183             });
 184
 185
 186         if ($stock_rs->count() == 0) {
 187
 188             print STDERR "No uniquename found for $db_name, checking synonyms...\n";
 189             $stock_rs = $schema->resultset("Stock::Stock")->search(
 190                 {
 191                     -and => [
 192                          'lower(type.name)'       => { like => '%synonym%' },
 193                          'lower(stockprops.value)' => { like => lc($db_name) },
 194                         ],
 195                 },
 196                 { join => { 'stockprops' => 'type'} ,
 197                   distinct => 1
 198                 }
 199                 );
 200         }
 201         if ($stock_rs->count() >1 ) {
 202             print STDERR "ERROR: found multiple accession synonyms found for that accession name, skipping $accession_name! \n";
 203             while ( my $st = $stock_rs->next) {
 204                 print STDERR "stock name = " . $st->uniquename . "\n";
 205             }
 206             next();
 207         }
 208
 209         if ($stock_rs->count() == 1) {
 210             $cassava_stock = $stock_rs->first();
 211             $stock_name = $cassava_stock->uniquename();
 212         }
 213
 214         if ($stock_rs->count ==0)  {
 215             print STDERR "No synonym was found for $db_name\n";
 216
 217             #store the plant accession in the stock table if $opt_a
 218             #
 219             if (!$opt_a) {
 220                 print STDERR "WARNING! Accession $accession_name (using: $db_name) not found.\n";
 221                 print STDERR "Use option -a to add automatically.\n";
 222                 next();
 223             }
 224             else {
 225                 $cassava_stock = $schema->resultset("Stock::Stock")->create(
 226                     { organism_id => $organism_id,
 227                       name       => $db_name,
 228                       uniquename => $db_name,
 229                       type_id     => $accession_cvterm->cvterm_id,
 230                     } );
 231
 232             }
 233         }
 234
 235         my $population_stock = $schema->resultset("Stock::Stock")->find_or_create(
 236             { organism_id => $organism_id,
 237               name       => $population_name,
 238               uniquename => $population_name,
 239               type_id => $population_cvterm->cvterm_id,
 240             } );
 241
 242         my $has_rel_rs = $schema->resultset("Stock::StockRelationship")->search(
 243             {
 244                 type_id => $population_members->cvterm_id(),
 245                 subject_id => $cassava_stock->stock_id(),
 246                 object_id => $population_stock->stock_id(),
 247             });
 248
 249         if ($has_rel_rs->count() == 0) {
 250             $cassava_stock->find_or_create_related('stock_relationship_objects', {
 251                 type_id => $population_members->cvterm_id(),
 252                 subject_id => $cassava_stock->stock_id(),
 253                 object_id => $population_stock->stock_id(),
 254                                                    } );
 255         }
 256         ###############
 257         print STDERR "cassava stock name = " . $cassava_stock->name . "\n";
 258         my $experiment = $schema->resultset('NaturalDiversity::NdExperiment')->create(
 259             {
 260                 nd_geolocation_id => $geolocation->nd_geolocation_id(),
 261                 type_id => $geno_cvterm->cvterm_id(),
 262             } );
 263
 264         print STDERR "Linking to protocol...\n";
 265         my $nd_experiment_protocol = $schema->resultset('NaturalDiversity::NdExperimentProtocol')->create(
 266             {
 267                 nd_experiment_id => $experiment->nd_experiment_id(),
 268                 nd_protocol_id => $protocol_id,
 269             });
 270
 271
 272         #link to the project
 273         $experiment->find_or_create_related('nd_experiment_projects', {
 274             project_id => $project->project_id()
 275                                             } );
 276         #link the experiment to the stock
 277         $experiment->find_or_create_related('nd_experiment_stocks' , {
 278             stock_id => $cassava_stock->stock_id(),
 279             type_id  =>  $geno_cvterm->cvterm_id(),
 280                                             });
 281         if ($opt_x) {
 282             print STDERR "OPTION -x: REMOVING OLD GENOTYPE... \n";
 283             my $has_genotype_rs =  $schema->resultset('NaturalDiversity::NdExperimentStock')->search_related('nd_experiment')->search_related('nd_experiment_genotypes')->search_related('genotype')->search_related('genotypeprops')->search( { 'me.stock_id' => $cassava_stock->stock_id() });
 284
 285             while (my $has_genotype = $has_genotype_rs->next()) {
 286                 print STDERR "Note: -x option: removing already present genotype for $db_name.\n";
 287                 my $genotypeprop_rs = $schema->resultset('Genetic::Genotypeprop')->search(
 288                     { genotype_id => $has_genotype->genotype_id() } );
 289                 while (my $genotypeprop = $genotypeprop_rs->next()) {
 290                     print STDERR "DELETING GENOTYPE PROP ".$genotypeprop->genotypeprop_id()."\n";
 291                     $genotypeprop->delete();
 292                 }
 293                 my $genotype = $schema->resultset('Genetic::Genotypeprop')->search(
 294                     { 'me.genotype_id' => $has_genotype->genotype_id(),  } );
 295
 296                 print STDERR "DELETING GENOTYPE: ".$has_genotype->genotype_id()."\n";
 297                 $genotype->delete();
 298
 299                 #my $nd_experiment_genotypes = $schema->resultset('Genetic::NdExperimentGenotypes')->search( { nd_experiment_id => $has_genotype_rs->nd_experiment_id(), });
 300
 301             }
 302         }
 303
 304         my @markers = @{$gtio->markers()};
 305
 306         if ($opt_s) {
 307             @markers = sort bychr @{$gtio->markers()};
 308         }
 309         foreach my $marker_name (@markers) {
 310             #print STDERR "markername: $marker_name\n";
 311             #print STDERR Dumper($gt->rawscores);
 312             my $base_calls = $gt->rawscores->{$marker_name}; #($marker_namefg, $accession_name);
 313             #print STDERR "BASE CALL: $base_calls\n";
 314             $base_calls =~ s/\s+//g;
 315             if ($base_calls !~/[0-9.]+|NA/i) {
 316                 print STDERR "SKIPPING BASECALL $base_calls\n";
 317             }
 318
 319             $json{$marker_name} = $base_calls;
 320         }
 321
 322         my $json_string;
 323         if ($opt_v) {
 324             my $json_obj = JSON::PP->new;
 325             print STDERR "Sorting and encoding markers and values... \n\n";
 326             $json_string = $json_obj->sort_by(sub {versioncmp($JSON::PP::a,$JSON::PP::b)})->encode(\%json);
 327         } else {
 328             my $json_obj = JSON::Any->new;
 329             $json_string = $json_obj->encode(\%json);
 330         }
 331
 332         print STDERR "Storing new genotype for stock " . $cassava_stock->name . " \n\n";
 333         my $genotype = $schema->resultset("Genetic::Genotype")->find_or_create(
 334             {
 335                 name        => $cassava_stock->name . "|" . $experiment->nd_experiment_id,
 336                 uniquename  => $cassava_stock->name . "|" . $experiment->nd_experiment_id,
 337                 description => "Cassava SNP genotypes for stock " . "(name = " . $cassava_stock->name . ", id = " . $cassava_stock->stock_id . ")",
 338                 type_id     => $snp_genotype->cvterm_id,
 339             }
 340             );
 341         $genotype->create_genotypeprops( { 'snp genotyping' => $json_string } , {autocreate =>1 , allow_duplicate_values => 1 } );
 342         $genotype->create_genotypeprops( { 'igd number' => $igd_number } , {autocreate =>1 , allow_duplicate_values => 1 } );
 343         #link the genotype to the nd_experiment
 344         my $nd_experiment_genotype = $experiment->find_or_create_related('nd_experiment_genotypes', { genotype_id => $genotype->genotype_id() } );
 345
 346     }
 347 };
 348
 349 try {
 350     $schema->txn_do($coderef);
 351     if (!$opt_t) { print "Transaction succeeded! Commiting genotyping experiments! \n\n"; }
 352 } catch {
 353     # Transaction failed
 354     foreach my $value ( keys %seq ) {
 355         my $maxval= $seq{$value} || 0;
 356         if ($maxval) { $dbh->do("SELECT setval ('$value', $maxval, true)") ;  }
 357         else {  $dbh->do("SELECT setval ('$value', 1, false)");  }
 358     }
 359     die "An error occured! Rolling back  and reseting database sequences!" . $_ . "\n";
 360 };
 361
 362 sub bychr {
 363     my @a = split "\t", $a;
 364     my @b = split "\t", $b;
 365
 366     my $a_chr;
 367     my $a_coord;
 368     my $b_chr;
 369     my $b_coord;
 370
 371     if ($a[1] =~ /^[A-Za-z]+(\d+)[_-](\d+)$/) {
 372         $a_chr = $1;
 373         $a_coord = $2;
 374     }
 375
 376     if ($b[1] =~ /[A-Za-z]+(\d+)[_-](\d+)/) {
 377         $b_chr = $1;
 378         $b_coord = $2;
 379     }
 380
 381     if ($a_chr eq $b_chr) {
 382         return $a_coord <=> $b_coord;
 383     }
 384     else {
 385         return $a_chr <=> $b_chr;
 386     }
 387 }