Merge pull request #2222 from solgenomics/topic/solgs/cluster-fix-171018
[sgn.git] / bin / load_fish_stuff.pl
blobdef977bc0dd1e35028a89110e352a7555a29a85c
1 #!/usr/bin/perl
3 =head1 NAME
5 load_fish_stuff.pl
6 load fish data and images into database
8 =cut
10 =head1 SYPNOSIS
12 load_fish_stuff.pl -i fishlist -t -D database -H host -I image_dir
14 =head2 I<Flags:>
16 =over
18 =item -i
20 B<input> a list of paths to the *BAC_Locations.csv files that contain the data to be loaded. This can be generated by
21 find -name *ocations.csv > fishlist
23 =item -t
25 B<test> test the loading but do not actually put it in the database
27 =item -D
29 B<database> the database where the fish data will be loaded
31 =item -H
33 B<host>
35 =item -I
37 B<image_dir> the directory to store the images
39 =back
41 =cut
43 =head1 DESCRIPTION
45 This script loads fish data and images. The columns in the csv files are expected to be in the following comma-delimited order:
47 BAC_ID number,Photo_ID,SC_ID,SC_rel_len%,SC_AR,BAC_loc_arm,BAC%from_kc
49 Make sure to set DBHOST, DBNAME, DBUSER, and DBPASS accordingly. Also, check dir that images are stored.
51 =cut
53 =head1 AUTHORS
55 Lukas Mueller and Suzy Strickler
57 =cut
59 =head1 METHODS
61 load_fish_stuff.pl
64 =cut
67 use strict;
68 use File::Slurp;
69 use Getopt::Std;
70 use File::Basename;
71 use CXGN::DB::InsertDBH;
72 use CXGN::Genomic::Clone;
73 use CXGN::Image;
75 our ($opt_i, $opt_D, $opt_H, $opt_I, $opt_t, $opt_h);
77 getopts('i:D:H:I:th');
79 if (!$opt_i && !$opt_D && !$opt_H && !$opt_I && !$opt_t && !$opt_h){
80 print "Wrong script usage. Printing help\n\n";
81 help();
84 if ($opt_h) {
85 help();
88 #Get dir to store images
89 my $image_dir = $opt_I || die("INPUT ERROR: Image directory was not supplied (-I </dir/>).\n");
91 #list of all "Locations.csv" files ex: 01-26-12/Tomato_10P/Tomato_10P_BAC_Locations.csv
92 my $master_file = $opt_i || die("INPUT ERROR: Input option was not supplied (-i <fish_list>).\n");
93 my @files = read_file($master_file);
95 our $dbh = CXGN::DB::InsertDBH->new(
97 dbname=>$opt_D,
98 dbhost=>$opt_H,
99 dbargs => {AutoCommit => 0,
100 RaiseError => 1}
101 }) ;
103 ###To Add:check for duplicates in file to be loaded
105 foreach my $file (@files) {
106 chomp $file;
108 my $path = dirname($file); #path will now be determined from csv file
109 my @fishinfo = read_file($file);
111 foreach my $fi (@fishinfo) {
112 my $lib;
114 if ($fi =~ m/^"/ || $fi =~ m/^,/){
115 next; #should check that it is in this order: BAC ID number, etc
118 else {
119 if ($fi =~ m/SL_EcoRI/i || $fi =~ m/SL_s/i || $fi =~ m/SL_MboI/i || $fi =~ m/SL_FOS/i){
120 $lib = '';
123 else {
124 $lib = "LE_HBA";
127 my ($bac_id, $photo_id, $sc_id, $sc_rel_len, $sc_ar, $bac_loc_arm, $per_from_kin) = split (/,/, $fi);
128 #$sc_rel_len and $sc_ar are not loaded
130 my $clone_id = CXGN::Genomic::Clone->retrieve_from_clone_name("$lib$bac_id");
132 if (!$clone_id) {
133 print STDERR "$lib$bac_id not found. Skipping!!!\n";
134 next;
137 my @images = '';
138 @images = glob("$path/*$bac_id/Photo_ID_$photo_id/*.jpg");
139 my $num_images = @images;
141 #Check that there are 3 image files in the directory
142 if ($num_images != 3){
143 print STDERR "Images in $path associated with experient $photo_id are missing. SKIPPING!\n";
144 next;
147 else {
149 my $fish_result_id;
151 $bac_loc_arm =~ tr/sSlL/PPQQ/;
153 print STDERR "\n\nFound clone_id $clone_id for experiment $photo_id on $sc_id $bac_loc_arm at $per_from_kin\n".(join "\n",@images)."\n";
154 if ($opt_t) { next;}
156 eval {
157 $fish_result_id = insert_fish_result(
159 dbh => $dbh,
160 chromo_num => $sc_id,
161 chromo_arm => $bac_loc_arm,
162 experiment_name => $photo_id,
163 percent_from_centromere => $per_from_kin / 100, # in the db, stored as 0..1
164 fish_experimenter_name => 'fish_stack',
165 clone_id => $clone_id,
169 print "this is fish result id " . $fish_result_id . "\n";
171 my $image_id;
173 foreach my $image(@images) {
174 my $i = CXGN::Image->new(dbh=>$dbh, image_dir=>$image_dir); #'/data/prod/public/images/image_files');
175 if ($i->process_image($image, "fish", $fish_result_id)) {
176 $i->set_description("FISH Localization of $bac_id on chromosome $sc_id arm $bac_loc_arm at $per_from_kin");
177 $i->set_sp_person_id(233);
178 $i->set_obsolete('f');
179 $i->store();
180 $image_id = $i->get_image_id();
182 insert_fish_result_image($fish_result_id, $image_id);
185 else {
186 print "Could not store image $image. Skipping!!!\n";
191 if ($@) {
192 $dbh->rollback();
193 print STDERR "Saving of $bac_id, chr $sc_id arm $bac_loc_arm at dist $per_from_kin failed due to $@";
196 else {
197 print STDERR "Committing info for $sc_id.\n";
198 $dbh->commit();
205 sub insert_fish_result {
206 my $fd = shift;
208 #Check if record is already in database
209 my $check_dup_query = "SELECT * FROM sgn.fish_result WHERE clone_id = ? AND experiment_name = ?";
210 my $cdq = $dbh->prepare($check_dup_query);
212 $cdq->execute(
213 $fd->{clone_id},
214 $fd->{experiment_name}
217 my $rows = $cdq->rows();
219 #Insert the record if it is not already in database
220 if ($rows != 0){
221 print STDERR "This record " . $fd->{experiment_name} . " is already in database. SKIPPING\n";
222 next;
225 else {
226 my $result_insert_query = #fish_result is table name
227 "INSERT INTO sgn.fish_result
228 (chromo_num, chromo_arm,
229 experiment_name, percent_from_centromere,
230 clone_id, fish_experimenter_id, map_id)
231 SELECT ?, ?, ?, ?, ?,
232 (SELECT fish_experimenter_id
233 FROM sgn.fish_experimenter
234 WHERE fish_experimenter_name = ?),
235 (SELECT map_id
236 FROM sgn.map
237 WHERE short_name = 'Tomato FISH map')";
238 my $dbh = $fd->{dbh};
239 my $sth = $dbh->prepare($result_insert_query);
241 print STDERR "name=" . $fd->{experiment_name} . "\n";
242 $sth->execute( #This is the data inserted for the ? in the sql above
243 $fd->{chromo_num},
244 $fd->{chromo_arm},
245 $fd->{experiment_name},
246 $fd->{percent_from_centromere},
247 $fd->{clone_id},
248 $fd->{fish_experimenter_name},
251 my $frh = $dbh->prepare("select currval('fish_result_fish_result_id_seq')");
252 $frh->execute();
253 my ($fish_result_id) = $frh->fetchrow_array();
255 return $fish_result_id;
259 sub insert_fish_result_image {
260 my $fish_result_id = shift;
261 my $image_id = shift;
262 print STDERR "This is the image id " . $image_id . "\n";
264 my $q = "INSERT INTO fish_result_image (fish_result_id, image_id) VALUES (?,?)";
265 my $frih = $dbh->prepare($q);
266 $frih->execute($fish_result_id, $image_id);
271 =head2 help
273 Usage: help()
274 Desc: print help
275 Ret: none
276 Args: none
277 Side_Effects: exit of the script
278 Example: if (!@ARGV) {
279 help();
282 =cut
284 sub help {
285 print STDERR <<EOF;
288 Description:
290 This script loads fish data and images into the database.
292 Usage:
294 load_fish_stuff.pl [-h] -i fish_list -t test -D database -H host -I image_dir
296 Examples:
298 load_fish_stuff.pl -i fish_list.csv -t -D sandbox -H localhost -I /data/prod/public/images/image_files_sandbox
300 Flags:
302 -i <input> input list of locations of fish csv files
303 -t <test> test mode, does not actually load data
304 -D <database> the database to load the fish data
305 -H <host>
306 -I <image_dir> the directory to store the images
309 exit (1);