3 # basic script to load maps
6 # copy and edit this file as necessary
7 # common changes include the following:
8 # specified linkage groups
9 # different column headings
10 # "pos" field versus separate "chromosome" and "position" fields
14 load_map_data.pl - a script to load maps into the SGN database.
18 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
20 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
28 The hostname of the server hosting the database.
32 the name of the database
36 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
40 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
44 add data to map version provided. Conflicts with -i and -n.
48 required if -i is not used. Provides the map name.
52 specify name of linkage groups as a comma separated list:
53 1,2,3,4,5,6,7,8,9,10,11,12
54 default is names from one to twelve.
58 force to 'unknown' protocol type if no protocol is provided.
62 The tab-delimited map file has the following columns:
67 POSITION (must be a float! 0.0 )
73 John Binns, Adri Mills, Lukas Mueller, Naama Menda (among others).
75 Current maintainer: Lukas Mueller/Naama Menda.
82 use CXGN
::Tools
::List qw
| str_in
|;
83 use CXGN
::Tools
::File
::Spreadsheet
;
84 use CXGN
::Tools
::Text
;
85 use CXGN
::Marker
::Modifiable
;
86 use CXGN
::Marker
::Tools
;
87 use CXGN
::Marker
::Location
;
88 use CXGN
::Map
::Version
;
89 use CXGN
::DB
::Connection
;
90 use CXGN
::DB
::InsertDBH
;
94 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n, $opt_f, $opt_v, $opt_l);
96 getopts
('H:D:i:rn:fv:l:');
100 # specify linkage groups
101 # example: my $linkage_groups = ['1','2','3','4','5'];
105 $linkage_groups = [ split /\s*\,\s*/, $opt_l ];
108 $linkage_groups = [ qw
| 1 2 3 4 5 6 7 8 9 10 11 12 | ];
112 my $map_version_id = $opt_v;
115 if (!$opt_H && !$opt_D) {
116 die "-H and -D parameters are required.\n";
118 my $dbh = CXGN
::DB
::InsertDBH
->new({
121 dbargs
=> {AutoCommit
=> 0,
126 if (!$map_id && !$map_version_id) {
127 print "No map_id was provided. Insert a new map? ";
130 print "Inserting a new map...\n";
132 my $map = CXGN
::Map
->new_map($dbh, $opt_n);
134 $map_id = $map->get_map_id();
136 print "New map_id: $map_id\n";
144 # we are creating a new map version every time we run this script,
145 # as long as the transaction goes through
149 $new_map_version = CXGN
::Map
::Version
->
150 #new($dbh,{map_id=>$map_id});
151 new
($dbh,{map_id
=>$map_id},$linkage_groups);
152 # saving the new map version
153 $map_version_id = $new_map_version->insert_into_database();
154 print "map version = " . $new_map_version->as_string() . "\n";
155 # make an object to give us the values from the spreadsheet
157 elsif ($map_version_id) {
158 $new_map_version = CXGN
::Map
::Version
->
159 new
($dbh, {map_version_id
=>$map_version_id});
161 my $ss = CXGN
::Tools
::File
::Spreadsheet
->new($map_file);
162 my @markers = $ss->row_labels(); # row labels are the marker names
163 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
164 # make sure the spreadsheet is how we expect it to be
165 @columns = qw
| MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL
| # modify columns as necessary
166 or die"Column headings are not as expected";
168 # for all the (uncleaned) marker names in the spreadsheet
170 for my $dirty_marker_name(@markers) {
172 # clean it, meaning, get the subscript if it's there,
173 # and convert the name into a canonical form so we can find it
174 # if it already exists with a different spelling or something
176 my ($marker_name,$subscript) = ($dirty_marker_name, "");
177 #CXGN::Marker::Tools::clean_marker_name($dirty_marker_name);
179 # get as many IDs as you can for a marker with a name like this
181 my @marker_ids = CXGN
::Marker
::Tools
::marker_name_to_ids
($dbh,$marker_name);
182 # if we get more than one, our script needs to be made smarter
183 # so it can differentiate between them, or maybe one of them
184 # needs to be deleted from the database
186 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
187 # just get the first ID in the list (if the list is longer than 1,
188 # we've already died)
190 my ($marker_id) = @marker_ids;
194 # if we have found an existing marker
195 # make a modifiable marker object from it
197 print "Found existing marker: $marker_id, $marker_name\n";
198 $marker = CXGN
::Marker
::Modifiable
->new($dbh,$marker_id);
201 # if we are making a new marker
202 # make a modifiable marker object and start to populate it
204 print "Loading new marker id from marker $marker_name\n";
205 $marker = CXGN
::Marker
::Modifiable
->new($dbh);
206 $marker->set_marker_name($marker_name); #give it a name
209 # marker must exist before locations can be added for it.
210 # this is a db constraint. if you didn't do this, this script
213 my $inserts = $marker->store_new_data();
215 # if any data was inserted for this marker (presumably it was,
216 # because we're making a new one)
218 if ($inserts and @
{$inserts}) { print"New marker inserted: $marker_name\n" }
219 else { die "Oops, I thought I was inserting some new data" }
220 $marker_id=$marker->marker_id();
224 print $marker->get_name()."\n";
226 my $loc=$marker->new_location(); #create a new location object
228 # some files have pos which contains chromsome and position
229 #my $pos=$ss->value_at($dirty_marker_name,'Position')
230 # get position from spreadsheet
231 #or die "No position found for $marker_name";
232 # extract linkage group name and cm position from string like '01.035'
233 #my ($chromosome,$position) =
234 #CXGN::Marker::Tools::lg_name_and_position($pos);
236 # foreach my $me (@{$marker->current_mapping_experiments}) {
237 # print $me->{protocol}."\n";
240 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP'); # get chromosome from spreadsheet
241 if (!defined($chromosome)) { die"No chromosome found for $marker_name"; }
243 if (! str_in
($chromosome, @
$linkage_groups)) {
244 print STDERR
"$marker_name skipped because linkage_group is $chromosome...\n";
248 # some have separate fields for chromsome and position
250 my $position = $ss->value_at($dirty_marker_name,'POSITION');
251 # get position from spreadsheet
252 if (!defined($position)) {
253 print STDERR
"No position found for $marker_name\n";
259 # get confidence from spreadsheet
261 $confidence = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $confidence='uncalculated';
262 if ($confidence=~/^(\d+)$/) {
263 if ($confidence == 0) { $confidence = "I"; }
264 elsif ($confidence == 1) { $confidence = "I(LOD2)"; }
265 elsif ($confidence == 2) { $confidence = "CF(LOD3)"; }
266 elsif ($confidence == 3) { $confidence = "F(LOD3)"; }
267 else { $confidence = "uncalculated"; }
269 # get protocol from spreadsheet
271 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
273 # some entries have been mapped to the same location by more than
274 # one method separated in the spreadsheet by plus signs
276 my @protocols=split(/\+/,$protocols_string);
278 print "Protocols found: ".CXGN
::Tools
::Text
::list_to_string
(@protocols)."\n";
282 print STDERR
"Protocols not found for '$dirty_marker_name'";
283 @protocols = ('unknown' x
scalar(@protocols));
286 die "no protocol found for $dirty_marker_name. Use -f to force protocol to unknown.";
289 for my $protocol(@protocols) {
290 $protocol = uc($protocol);
291 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
292 or $protocol eq 'SNP' or $protocol eq 'SSR'
293 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i or $protocol =~ /INDEL/i or $protocol =~ /ASPE/i or $protocol =~ /qtl/i )
295 print STDERR
"UNKNOWN protocol ($protocol)\n! ";
296 $protocol = 'unknown';
299 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
300 print "Protocol = $protocol\n";
302 # set the marker_id that will be at this location
304 $loc->marker_id($marker_id);
306 # set the map_version_id this location is found on
307 # (this must be done before calling function lg_name)
309 $loc->map_version_id($map_version_id);
311 # set the linkage group name for this marker location
312 # (the map_version_id must be already set for this to work,
313 # else how would it be able to know different linkage groups on
314 # different map versions from each other, when they all have the same names?)
316 $loc->lg_name($chromosome);
318 #set the position of the marker on this linkage group
320 $loc->position($position);
322 # set the confidence with which this marker is mapped at this position
324 $loc->confidence($confidence);
326 # set the subscript for this location, because the same marker
327 # can be mapped to multiple locations, and these locations must be distinguishable
329 $loc->subscript($subscript);
331 # this method call represents the insertion into the
332 # marker_experiment table. this is currently a troublesome
333 # issue because this marker was probably mapped here via a
334 # PCR or RFLP experiment. where is this experiment data?
335 # well, it's in another spreadsheet, to be loaded later,
336 # or maybe it has already been loaded. if it was previously
337 # loaded, it was matched up with an old map version. how can we
338 # match that existing PCR/RFLP data up with this new map
339 # version? well, it will have to be done later by some other script.
341 print "Adding new experiment , marker_name = $marker_name, location = " . $loc->position . " protocol = '". $protocol . "'\n";
342 $marker->add_experiment({location
=>$loc,protocol
=>$protocol});
345 # store whatever new data you have (in this case, the new data
346 # is the location we just assigned the marker)
348 print "Storing new marker data...\n";
349 my $inserts = $marker->store_new_data();
352 # if any data was inserted for this marker (presumably it was,
353 # since we're adding locations on a brand new map version)
355 if ($inserts and @
{$inserts}) {
356 print "New marker data inserted:\n";#.Dumper($inserts);
357 print $loc->as_string();
360 else { die "Oops, I thought I was inserting some new data" }
363 # deprecate the old map version and make the new one we just made the current one
365 $new_map_version->set_current();
372 print "Failed; rolling back.\n";
378 print"Committing.\n";
382 print"Rolling back.\n";