3 # basic script to load maps
6 # copy and edit this file as necessary
7 # common changes include the following:
8 # specified linkage groups
9 # different column headings
10 # "pos" field versus separate "chromosome" and "position" fields
14 load_map_data.pl - a script to load maps into the SGN database.
18 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
20 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
28 The hostname of the server hosting the database.
32 the name of the database
36 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
40 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
44 add data to map version provided. Conflicts with -i and -n.
48 required if -i is not used. Provides the map name.
52 specify name of linkage groups as a comma separated list:
53 1,2,3,4,5,6,7,8,9,10,11,12
54 default is names from one to twelve.
58 force to 'unknown' protocol type if no protocol is provided.
62 The tab-delimited map file has the following columns:
67 POSITION (must be a float! 0.0 )
73 John Binns, Adri Mills, Lukas Mueller, Naama Menda (among others).
75 Current maintainer: Lukas Mueller/Naama Menda.
82 use CXGN
::Tools
::List qw
| str_in
|;
83 use CXGN
::Tools
::File
::Spreadsheet
;
84 use CXGN
::Tools
::Text
;
85 use CXGN
::Marker
::Modifiable
;
86 use CXGN
::Marker
::Tools
;
87 use CXGN
::Marker
::Location
;
88 use CXGN
::Map
::Version
;
89 use CXGN
::DB
::Connection
;
90 use CXGN
::DB
::InsertDBH
;
94 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n, $opt_f, $opt_v, $opt_l);
96 getopts
('H:D:i:rn:fv:l:');
100 # specify linkage groups
101 # example: my $linkage_groups = ['1','2','3','4','5'];
105 $linkage_groups = [ split /\s*\,\s*/, $opt_l ];
108 $linkage_groups = [ qw
| 1 2 3 4 5 6 7 8 9 10 11 12 | ];
112 my $map_version_id = $opt_v;
115 if (!$opt_H && !$opt_D) {
116 die "-H and -D parameters are required.\n";
118 my $dbh = CXGN
::DB
::InsertDBH
->new({
121 dbargs
=> {AutoCommit
=> 0,
126 if (!$map_id && !$map_version_id) {
127 print "No map_id was provided. Insert a new map? ";
130 print "Inserting a new map...\n";
132 my $map = CXGN
::Map
->new_map($dbh, $opt_n);
134 $map_id = $map->get_map_id();
136 print "New map_id: $map_id\n";
144 # we are creating a new map version every time we run this script,
145 # as long as the transaction goes through
149 $new_map_version = CXGN
::Map
::Version
->
150 #new($dbh,{map_id=>$map_id});
151 new
($dbh,{map_id
=>$map_id},$linkage_groups);
152 # saving the new map version
153 $map_version_id = $new_map_version->insert_into_database();
154 print "map version = " . $new_map_version->as_string() . "\n";
155 # make an object to give us the values from the spreadsheet
157 elsif ($map_version_id) {
158 $new_map_version = CXGN
::Map
::Version
->
159 new
($dbh, {map_version_id
=>$map_version_id});
161 my $ss = CXGN
::Tools
::File
::Spreadsheet
->new($map_file);
162 my @markers = $ss->row_labels(); # row labels are the marker names
163 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
164 # make sure the spreadsheet is how we expect it to be
165 @columns = qw
| MARKER LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL
| # modify columns as necessary
166 # @columns = qw | MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL | # modify columns as necessary
167 or die"Column headings are not as expected";
169 # for all the (uncleaned) marker names in the spreadsheet
171 for my $dirty_marker_name(@markers) {
173 # clean it, meaning, get the subscript if it's there,
174 # and convert the name into a canonical form so we can find it
175 # if it already exists with a different spelling or something
177 my ($marker_name,$subscript) = ($dirty_marker_name, "");
178 #CXGN::Marker::Tools::clean_marker_name($dirty_marker_name);
180 # get as many IDs as you can for a marker with a name like this
182 my @marker_ids = CXGN
::Marker
::Tools
::marker_name_to_ids
($dbh,$marker_name);
183 # if we get more than one, our script needs to be made smarter
184 # so it can differentiate between them, or maybe one of them
185 # needs to be deleted from the database
187 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
188 # just get the first ID in the list (if the list is longer than 1,
189 # we've already died)
191 my ($marker_id) = @marker_ids;
195 # if we have found an existing marker
196 # make a modifiable marker object from it
198 print "Found existing marker: $marker_id, $marker_name\n";
199 $marker = CXGN
::Marker
::Modifiable
->new($dbh,$marker_id);
202 # if we are making a new marker
203 # make a modifiable marker object and start to populate it
205 print "Loading new marker id from marker $marker_name\n";
206 $marker = CXGN
::Marker
::Modifiable
->new($dbh);
207 $marker->set_marker_name($marker_name); #give it a name
210 # marker must exist before locations can be added for it.
211 # this is a db constraint. if you didn't do this, this script
214 my $inserts = $marker->store_new_data();
216 # if any data was inserted for this marker (presumably it was,
217 # because we're making a new one)
219 if ($inserts and @
{$inserts}) { print"New marker inserted: $marker_name\n" }
220 else { die "Oops, I thought I was inserting some new data" }
221 $marker_id=$marker->marker_id();
225 print $marker->get_name()."\n";
227 my $loc=$marker->new_location(); #create a new location object
229 # some files have pos which contains chromsome and position
230 #my $pos=$ss->value_at($dirty_marker_name,'Position')
231 # get position from spreadsheet
232 #or die "No position found for $marker_name";
233 # extract linkage group name and cm position from string like '01.035'
234 #my ($chromosome,$position) =
235 #CXGN::Marker::Tools::lg_name_and_position($pos);
237 # foreach my $me (@{$marker->current_mapping_experiments}) {
238 # print $me->{protocol}."\n";
241 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP'); # get chromosome from spreadsheet
242 if (!defined($chromosome)) { die"No chromosome found for $marker_name"; }
244 if (! str_in
($chromosome, @
$linkage_groups)) {
245 print STDERR
"$marker_name skipped because linkage_group is $chromosome...\n";
249 # some have separate fields for chromsome and position
251 my $position = $ss->value_at($dirty_marker_name,'POSITION');
252 # get position from spreadsheet
253 if (!defined($position)) {
254 print STDERR
"No position found for $marker_name\n";
260 # get confidence from spreadsheet
262 $confidence = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $confidence='uncalculated';
263 if ($confidence=~/^(\d+)$/) {
264 if ($confidence == 0) { $confidence = "I"; }
265 elsif ($confidence == 1) { $confidence = "I(LOD2)"; }
266 elsif ($confidence == 2) { $confidence = "CF(LOD3)"; }
267 elsif ($confidence == 3) { $confidence = "F(LOD3)"; }
268 else { $confidence = "uncalculated"; }
270 # get protocol from spreadsheet
272 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
274 # some entries have been mapped to the same location by more than
275 # one method separated in the spreadsheet by plus signs
277 my @protocols=split(/\+/,$protocols_string);
279 print "Protocols found: ".CXGN
::Tools
::Text
::list_to_string
(@protocols)."\n";
283 print STDERR
"Protocols not found for '$dirty_marker_name'";
284 @protocols = ('unknown' x
scalar(@protocols));
287 die "no protocol found for $dirty_marker_name. Use -f to force protocol to unknown.";
290 for my $protocol(@protocols) {
291 $protocol = uc($protocol);
292 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
293 or $protocol eq 'SNP' or $protocol eq 'SSR'
294 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i or $protocol =~ /INDEL/i or $protocol =~ /ASPE/i or $protocol =~ /qtl/i )
296 print STDERR
"UNKNOWN protocol ($protocol)\n! ";
297 $protocol = 'unknown';
300 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
301 print "Protocol = $protocol\n";
303 # set the marker_id that will be at this location
305 $loc->marker_id($marker_id);
307 # set the map_version_id this location is found on
308 # (this must be done before calling function lg_name)
310 $loc->map_version_id($map_version_id);
312 # set the linkage group name for this marker location
313 # (the map_version_id must be already set for this to work,
314 # else how would it be able to know different linkage groups on
315 # different map versions from each other, when they all have the same names?)
317 $loc->lg_name($chromosome);
319 #set the position of the marker on this linkage group
321 $loc->position($position);
323 # set the confidence with which this marker is mapped at this position
325 $loc->confidence($confidence);
327 # set the subscript for this location, because the same marker
328 # can be mapped to multiple locations, and these locations must be distinguishable
330 $loc->subscript($subscript);
332 # this method call represents the insertion into the
333 # marker_experiment table. this is currently a troublesome
334 # issue because this marker was probably mapped here via a
335 # PCR or RFLP experiment. where is this experiment data?
336 # well, it's in another spreadsheet, to be loaded later,
337 # or maybe it has already been loaded. if it was previously
338 # loaded, it was matched up with an old map version. how can we
339 # match that existing PCR/RFLP data up with this new map
340 # version? well, it will have to be done later by some other script.
342 print "Adding new experiment , marker_name = $marker_name, location = " . $loc->position . " protocol = '". $protocol . "'\n";
343 $marker->add_experiment({location
=>$loc,protocol
=>$protocol});
346 # store whatever new data you have (in this case, the new data
347 # is the location we just assigned the marker)
349 print "Storing new marker data...\n";
350 my $inserts = $marker->store_new_data();
353 # if any data was inserted for this marker (presumably it was,
354 # since we're adding locations on a brand new map version)
356 if ($inserts and @
{$inserts}) {
357 print "New marker data inserted:\n";#.Dumper($inserts);
358 print $loc->as_string();
361 else { die "Oops, I thought I was inserting some new data" }
364 # deprecate the old map version and make the new one we just made the current one
366 $new_map_version->set_current();
373 print "Failed; rolling back.\n";
379 print"Committing.\n";
383 print"Rolling back.\n";