Committing Lukas changes before site update
[sgn.git] / bin / load_map_data.pl
blobcedb42858cd6a07d361302575802b61dfb7af28a
1 #!/usr/bin/perl
3 # basic script to load maps
6 # copy and edit this file as necessary
7 # common changes include the following:
8 # specified linkage groups
9 # different column headings
10 # "pos" field versus separate "chromosome" and "position" fields
12 =head1 NAME
14 load_map_data.pl - a script to load maps into the SGN database.
16 =head1 DESCRIPTION
18 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
20 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
22 Options:
24 =over 5
26 =item -H
28 The hostname of the server hosting the database.
30 =item -D
32 the name of the database
34 =item -r
36 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
38 =item -i
40 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
42 =item -v
44 add data to map version provided. Conflicts with -i and -n.
46 =item -n
48 required if -i is not used. Provides the map name.
50 =item -l
52 specify name of linkage groups as a comma separated list:
53 1,2,3,4,5,6,7,8,9,10,11,12
54 default is names from one to twelve.
56 =item -f
58 force to 'unknown' protocol type if no protocol is provided.
60 =back
62 The tab-delimited map file has the following columns:
64 MARKER
65 CONFIDENCE
66 LINKAGE_GROUP
67 POSITION (must be a float! 0.0 )
68 PROTOCOL
71 =head1 AUTHORS
73 John Binns, Adri Mills, Lukas Mueller, Naama Menda (among others).
75 Current maintainer: Lukas Mueller/Naama Menda.
77 =cut
79 use strict;
81 use Getopt::Std;
82 use CXGN::Tools::List qw | str_in |;
83 use CXGN::Tools::File::Spreadsheet;
84 use CXGN::Tools::Text;
85 use CXGN::Marker::Modifiable;
86 use CXGN::Marker::Tools;
87 use CXGN::Marker::Location;
88 use CXGN::Map::Version;
89 use CXGN::DB::Connection;
90 use CXGN::DB::InsertDBH;
91 use Data::Dumper;
94 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n, $opt_f, $opt_v, $opt_l);
96 getopts('H:D:i:rn:fv:l:');
98 my $map_id;
99 my $map_file;
100 # specify linkage groups
101 # example: my $linkage_groups = ['1','2','3','4','5'];
103 my $linkage_groups;
104 if ($opt_l) {
105 $linkage_groups = [ split /\s*\,\s*/, $opt_l ];
107 else {
108 $linkage_groups = [ qw | 1 2 3 4 5 6 7 8 9 10 11 12 | ];
111 $map_id = $opt_i;
112 my $map_version_id = $opt_v;
113 $map_file = shift;
115 if (!$opt_H && !$opt_D) {
116 die "-H and -D parameters are required.\n";
118 my $dbh = CXGN::DB::InsertDBH->new({
119 dbname => $opt_D,
120 dbhost => $opt_H,
121 dbargs => {AutoCommit => 0,
122 RaiseError => 1}
125 eval {
126 if (!$map_id && !$map_version_id) {
127 print "No map_id was provided. Insert a new map? ";
128 my $key = <STDIN>;
129 if ($key =~ /Y/i) {
130 print "Inserting a new map...\n";
132 my $map = CXGN::Map->new_map($dbh, $opt_n);
134 $map_id = $map->get_map_id();
136 print "New map_id: $map_id\n";
139 else {
140 exit();
144 # we are creating a new map version every time we run this script,
145 # as long as the transaction goes through
146 my $new_map_version;
148 if ($map_id) {
149 $new_map_version = CXGN::Map::Version->
150 #new($dbh,{map_id=>$map_id});
151 new($dbh,{map_id=>$map_id},$linkage_groups);
152 # saving the new map version
153 $map_version_id = $new_map_version->insert_into_database();
154 print "map version = " . $new_map_version->as_string() . "\n";
155 # make an object to give us the values from the spreadsheet
157 elsif ($map_version_id) {
158 $new_map_version = CXGN::Map::Version->
159 new($dbh, {map_version_id=>$map_version_id});
161 my $ss = CXGN::Tools::File::Spreadsheet->new($map_file);
162 my @markers = $ss->row_labels(); # row labels are the marker names
163 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
164 # make sure the spreadsheet is how we expect it to be
165 @columns = qw | MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL | # modify columns as necessary
166 or die"Column headings are not as expected";
168 # for all the (uncleaned) marker names in the spreadsheet
170 for my $dirty_marker_name(@markers) {
172 # clean it, meaning, get the subscript if it's there,
173 # and convert the name into a canonical form so we can find it
174 # if it already exists with a different spelling or something
176 my ($marker_name,$subscript) = ($dirty_marker_name, "");
177 #CXGN::Marker::Tools::clean_marker_name($dirty_marker_name);
179 # get as many IDs as you can for a marker with a name like this
181 my @marker_ids = CXGN::Marker::Tools::marker_name_to_ids($dbh,$marker_name);
182 # if we get more than one, our script needs to be made smarter
183 # so it can differentiate between them, or maybe one of them
184 # needs to be deleted from the database
186 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
187 # just get the first ID in the list (if the list is longer than 1,
188 # we've already died)
190 my ($marker_id) = @marker_ids;
191 my $marker;
193 if($marker_id) {
194 # if we have found an existing marker
195 # make a modifiable marker object from it
197 print "Found existing marker: $marker_id, $marker_name\n";
198 $marker = CXGN::Marker::Modifiable->new($dbh,$marker_id);
200 else {
201 # if we are making a new marker
202 # make a modifiable marker object and start to populate it
204 print "Loading new marker id from marker $marker_name\n";
205 $marker = CXGN::Marker::Modifiable->new($dbh);
206 $marker->set_marker_name($marker_name); #give it a name
209 # marker must exist before locations can be added for it.
210 # this is a db constraint. if you didn't do this, this script
211 # would die later.
213 my $inserts = $marker->store_new_data();
215 # if any data was inserted for this marker (presumably it was,
216 # because we're making a new one)
218 if ($inserts and @{$inserts}) { print"New marker inserted: $marker_name\n" }
219 else { die "Oops, I thought I was inserting some new data" }
220 $marker_id=$marker->marker_id();
224 print $marker->get_name()."\n";
226 my $loc=$marker->new_location(); #create a new location object
228 # some files have pos which contains chromsome and position
229 #my $pos=$ss->value_at($dirty_marker_name,'Position')
230 # get position from spreadsheet
231 #or die "No position found for $marker_name";
232 # extract linkage group name and cm position from string like '01.035'
233 #my ($chromosome,$position) =
234 #CXGN::Marker::Tools::lg_name_and_position($pos);
236 # foreach my $me (@{$marker->current_mapping_experiments}) {
237 # print $me->{protocol}."\n";
240 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP'); # get chromosome from spreadsheet
241 if (!defined($chromosome)) { die"No chromosome found for $marker_name"; }
243 if (! str_in($chromosome, @$linkage_groups)) {
244 print STDERR "$marker_name skipped because linkage_group is $chromosome...\n";
245 next;
248 # some have separate fields for chromsome and position
250 my $position = $ss->value_at($dirty_marker_name,'POSITION');
251 # get position from spreadsheet
252 if (!defined($position)) {
253 print STDERR "No position found for $marker_name\n";
254 next;
257 my $confidence;
259 # get confidence from spreadsheet
261 $confidence = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $confidence='uncalculated';
262 if ($confidence=~/^(\d+)$/) {
263 if ($confidence == 0) { $confidence = "I"; }
264 elsif ($confidence == 1) { $confidence = "I(LOD2)"; }
265 elsif ($confidence == 2) { $confidence = "CF(LOD3)"; }
266 elsif ($confidence == 3) { $confidence = "F(LOD3)"; }
267 else { $confidence = "uncalculated"; }
269 # get protocol from spreadsheet
271 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
273 # some entries have been mapped to the same location by more than
274 # one method separated in the spreadsheet by plus signs
276 my @protocols=split(/\+/,$protocols_string);
277 if (@protocols) {
278 print "Protocols found: ".CXGN::Tools::Text::list_to_string(@protocols)."\n";
280 else {
281 if ($opt_f) {
282 print STDERR "Protocols not found for '$dirty_marker_name'";
283 @protocols = ('unknown' x scalar(@protocols));
285 else {
286 die "no protocol found for $dirty_marker_name. Use -f to force protocol to unknown.";
289 for my $protocol(@protocols) {
290 $protocol = uc($protocol);
291 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
292 or $protocol eq 'SNP' or $protocol eq 'SSR'
293 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i or $protocol =~ /INDEL/i or $protocol =~ /ASPE/i or $protocol =~ /qtl/i )
295 print STDERR "UNKNOWN protocol ($protocol)\n! ";
296 $protocol = 'unknown';
299 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
300 print "Protocol = $protocol\n";
302 # set the marker_id that will be at this location
304 $loc->marker_id($marker_id);
306 # set the map_version_id this location is found on
307 # (this must be done before calling function lg_name)
309 $loc->map_version_id($map_version_id);
311 # set the linkage group name for this marker location
312 # (the map_version_id must be already set for this to work,
313 # else how would it be able to know different linkage groups on
314 # different map versions from each other, when they all have the same names?)
316 $loc->lg_name($chromosome);
318 #set the position of the marker on this linkage group
320 $loc->position($position);
322 # set the confidence with which this marker is mapped at this position
324 $loc->confidence($confidence);
326 # set the subscript for this location, because the same marker
327 # can be mapped to multiple locations, and these locations must be distinguishable
329 $loc->subscript($subscript);
331 # this method call represents the insertion into the
332 # marker_experiment table. this is currently a troublesome
333 # issue because this marker was probably mapped here via a
334 # PCR or RFLP experiment. where is this experiment data?
335 # well, it's in another spreadsheet, to be loaded later,
336 # or maybe it has already been loaded. if it was previously
337 # loaded, it was matched up with an old map version. how can we
338 # match that existing PCR/RFLP data up with this new map
339 # version? well, it will have to be done later by some other script.
341 print "Adding new experiment , marker_name = $marker_name, location = " . $loc->position . " protocol = '". $protocol . "'\n";
342 $marker->add_experiment({location=>$loc,protocol=>$protocol});
345 # store whatever new data you have (in this case, the new data
346 # is the location we just assigned the marker)
348 print "Storing new marker data...\n";
349 my $inserts = $marker->store_new_data();
352 # if any data was inserted for this marker (presumably it was,
353 # since we're adding locations on a brand new map version)
355 if ($inserts and @{$inserts}) {
356 print "New marker data inserted:\n";#.Dumper($inserts);
357 print $loc->as_string();
360 else { die "Oops, I thought I was inserting some new data" }
363 # deprecate the old map version and make the new one we just made the current one
365 $new_map_version->set_current();
370 if ($@) {
371 print $@;
372 print "Failed; rolling back.\n";
373 $dbh->rollback();
375 else {
376 print"Succeeded.\n";
377 if (!$opt_r) {
378 print"Committing.\n";
379 $dbh->commit();
381 else {
382 print"Rolling back.\n";
383 $dbh->rollback();