port locus_display.t to SGN::Test::WWW::Mechanize
[sgn.git] / bin / load_map_data.pl
blobc2845a72f0b0f488844587d5ead434c4cd2ad2e5
1 #!/usr/bin/perl
3 # basic script to load maps
5 # usage: load_map_data <dbhost> <dbname> <COMMIT|ROLLBACK> <map_id> <map file>
6 # example: load_map_data.pl db-devel sandbox COMMIT 9 map-file.csv
8 # copy and edit this file as necessary
9 # common changes include the following:
10 # specified linkage groups
11 # different column headings
12 # "pos" field versus separate "chromosome" and "position" fields
14 =head1 NAME
16 load_map_data.pl - a script to load maps into the SGN database.
18 =head1 DESCRIPTION
20 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
22 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
24 Options:
26 =over 5
28 =item -H
30 The hostname of the server hosting the database.
32 =item -D
34 the name of the database
36 =item -r
38 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
40 =item -i
42 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
44 =item -n
46 required if -i is not used. Provides the map name.
48 =back
50 The tab-delimited map file has the following columns:
52 Marker
53 Confidence
54 Linkage_group
55 Position
56 Protocol
59 =head1 AUTHORS
61 John Binns, Adri Mills, Lukas Mueller (among others).
63 Current maintainer: Lukas Mueller.
65 =cut
67 use strict;
69 use Getopt::Std;
70 use CXGN::Tools::List qw | str_in |;
71 use CXGN::Tools::File::Spreadsheet;
72 use CXGN::Tools::Text;
73 use CXGN::Marker::Modifiable;
74 use CXGN::Marker::Tools;
75 use CXGN::Marker::Location;
76 use CXGN::Map::Version;
77 use CXGN::DB::Connection;
78 use CXGN::DB::InsertDBH;
79 use Data::Dumper;
82 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n);
84 getopts('H:D:i:rn:');
86 my $map_id;
87 my $map_file;
88 # specify linkage groups
89 # example: my $linkage_groups = ['1','2','3','4','5'];
90 my $linkage_groups = [ qw | 1 2 3 4 5 6 7 8 9 10 11 12 | ];
92 # unless ($ARGV[0]) {
93 # print "Usage: load_map_data <dbhost> <dbname> "
94 # ."<COMMIT|ROLLBACK> <map_id> <map file>\n" and exit();
95 # }
96 # unless ($ARGV[0] eq 'db-devel' or $ARGV[0] eq 'db') {
97 # die "First argument must be valid database host";
98 # }
99 # unless ($ARGV[1] eq 'sandbox' or $ARGV[1] eq 'cxgn') {
100 # die "Second argument must be valid database name";
102 # unless ($ARGV[2] eq 'COMMIT' or $ARGV[2] eq 'ROLLBACK') {
103 # die "Third argument must be either COMMIT or ROLLBACK";
105 # unless ($map_id=$ARGV[3]) {
106 # die "Fourth argument must be the map_id";
108 # unless ($map_file=$ARGV[4]) {
109 # die "Fifth argument must be the map positions file";
112 $map_id = $opt_i;
114 $map_file = shift;
116 if (!$opt_H && !$opt_D) {
117 die "-H and -D parameters are required.\n";
120 my $dbh=CXGN::DB::InsertDBH::connect
121 ({dbhost=>$opt_H,dbname=>$opt_D,dbschema=>'sgn'});
124 eval {
125 if (!$map_id) {
126 print "No map_id was provided. Insert a new map? ";
127 my $key = <STDIN>;
128 if ($key =~ /Y/i) {
129 print "Inserting a new map...\n";
131 my $map = CXGN::Map->new_map($dbh, $opt_n);
133 $map_id = $map->get_map_id();
135 print "New map_id: $map_id\n";
138 else {
139 exit();
145 # we are creating a new map version every time we run this script,
146 # as long as the transaction goes through
147 my $new_map_version = CXGN::Map::Version->
148 #new($dbh,{map_id=>$map_id});
149 new($dbh,{map_id=>$map_id},$linkage_groups);
150 # saving the new map version
151 my $map_version_id = $new_map_version->insert_into_database();
152 print $new_map_version->as_string();
153 # make an object to give us the values from the spreadsheet
154 my $ss = CXGN::Tools::File::Spreadsheet->new($map_file);
155 my @markers = $ss->row_labels(); # row labels are the marker names
156 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
157 # make sure the spreadsheet is how we expect it to be
158 @columns = qw | MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL | # modify columns as necessary
159 or die"Column headings are not as expected";
161 # for all the (uncleaned) marker names in the spreadsheet
162 for my $dirty_marker_name(@markers) {
163 # clean it, meaning, get the subscript if it's there,
164 # and convert the name into a canonical form so we can find it
165 # if it already exists with a different spelling or something
166 my ($marker_name,$subscript) =
167 CXGN::Marker::Tools::clean_marker_name($dirty_marker_name);
168 # get as many IDs as you can for a marker with a name like this
169 my @marker_ids = CXGN::Marker::Tools::marker_name_to_ids($dbh,$marker_name);
170 # if we get more than one, our script needs to be made smarter
171 # so it can differentiate between them, or maybe one of them
172 # needs to be deleted from the database
173 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
174 # just get the first ID in the list (if the list is longer than 1, we've already died)
175 my($marker_id) = @marker_ids;
176 my $marker;
177 if($marker_id) { # if we have found an existing marker
178 # make a modifiable marker object from it
179 $marker = CXGN::Marker::Modifiable->new($dbh,$marker_id);
181 else { # if we are making a new marker
182 # make a modifiable marker object and start to populate it
183 $marker = CXGN::Marker::Modifiable->new($dbh);
184 $marker->set_marker_name($marker_name); #give it a name
185 # marker must exist before locations can be added for it.
186 # this is a db constraint. if you didn't do this, this script would die later.
187 my $inserts = $marker->store_new_data();
188 # if any data was inserted for this marker (presumably it was,
189 # because we're making a new one)
190 if ($inserts and @{$inserts}) { print"New marker inserted: $marker_name\n" }
191 else { die "Oops, I thought I was inserting some new data" }
192 $marker_id=$marker->marker_id();
194 print $marker->name_that_marker()."\n";
195 my $loc=$marker->new_location(); #create a new location object
197 # some files have pos which contains chromsome and position
198 #my $pos=$ss->value_at($dirty_marker_name,'Position')
199 # get position from spreadsheet
200 #or die "No position found for $marker_name";
201 # extract linkage group name and cm position from string like '01.035'
202 #my ($chromosome,$position) =
203 #CXGN::Marker::Tools::lg_name_and_position($pos);
205 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP')
206 # get chromosome from spreadsheet
207 or die"No chromosome found for $marker_name";
210 if (! str_in($chromosome, @$linkage_groups)) {
211 print STDERR "$marker_name skipped because linkage_group is $chromosome...\n";
212 next;
214 # some have separate fields for chromsome and position
215 my $position = $ss->value_at($dirty_marker_name,'POSITION');
216 # get position from spreadsheet
217 if (!$position) {
218 print STDERR "No position found for $marker_name\n";
219 next;
223 my $conf;
224 # get confidence from spreadsheet
225 $conf = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $conf='uncalculated';
226 if ($conf=~/^(\d+)$/) {
227 if ($conf == 0) { $conf = "I"; }
228 elsif ($conf == 1) { $conf = "I(LOD2)"; }
229 elsif ($conf == 2) { $conf = "CF(LOD3)"; }
230 elsif ($conf == 3) { $conf = "F(LOD3)"; }
231 else { $conf = "uncalculated"; }
233 # get protocol from spreadsheet
234 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
235 # some entries have been mapped to the same location by more than
236 # one method separated in the spreadsheet by plus signs
237 my @protocols=split(/\+/,$protocols_string);
238 if (@protocols) {
239 print "Protocols found: ".CXGN::Tools::Text::list_to_string(@protocols)."\n";
241 else { die "Protocols not found for '$dirty_marker_name'" }
242 for my $protocol(@protocols) {
243 $protocol =~ tr/[a-z]/[A-Z]/;
244 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
245 or $protocol eq 'SNP' or $protocol eq 'SSR'
246 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i )
248 $protocol = 'unknown';
251 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
253 # set the marker_id that will be at this location
254 $loc->marker_id($marker_id);
255 # set the map_version_id this location is found on
256 # (this must be done before calling function lg_name)
257 $loc->map_version_id($map_version_id);
258 # set the linkage group name for this marker location
259 # (the map_version_id must be already set for this to work,
260 # else how would it be able to know different linkage groups on
261 # different map versions from each other, when they all have the same names?)
262 $loc->lg_name($chromosome);
264 #set the position of the marker on this linkage group
265 $loc->position($position);
267 # set the confidence with which this marker is mapped at this position
268 $loc->confidence($conf);
270 # set the subscript for this location, because the same marker
271 # can be mapped to multiple locations, and these locations must be distinguishable
272 $loc->subscript($subscript);
274 # this method call represents the insertion into the
275 # marker_experiment table. this is currently a troublesome
276 # issue because this marker was probably mapped here via a
277 # PCR or RFLP experiment. where is this experiment data?
278 # well, it's in another spreadsheet, to be loaded later,
279 # or maybe it has already been loaded. if it was previously
280 # loaded, it was matched up with an old map version. how can we
281 # match that existing PCR/RFLP data up with this new map
282 # version? well, it will have to be done later by some other script.
284 $marker->add_experiment({location=>$loc,protocol=>$protocol});
287 # store whatever new data you have (in this case, the new data
288 # is the location we just assigned the marker)
289 my $inserts = $marker->store_new_data();
292 # if any data was inserted for this marker (presumably it was,
293 # since we're adding locations on a brand new map version)
294 if ($inserts and @{$inserts}) {
295 print "New marker data inserted:\n".Dumper($inserts);
296 print $loc->as_string();
299 else { die "Oops, I thought I was inserting some new data" }
302 # deprecate the old map version and make the new one we just made the current one
303 $new_map_version->set_current();
308 if ($@) {
309 print $@;
310 print "Failed; rolling back.\n";
311 $dbh->rollback();
313 else {
314 print"Succeeded.\n";
315 if (!$opt_r) {
316 print"Committing.\n";
317 $dbh->commit();
319 else {
320 print"Rolling back.\n";
321 $dbh->rollback();