3 # basic script to load maps
6 # copy and edit this file as necessary
7 # common changes include the following:
8 # specified linkage groups
9 # different column headings
10 # "pos" field versus separate "chromosome" and "position" fields
14 load_map_data.pl - a script to load maps into the SGN database.
18 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
20 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
28 The hostname of the server hosting the database.
32 the name of the database
36 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
40 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
44 add data to map version provided. Conflicts with -i and -n.
48 required if -i is not used. Provides the map name.
52 specify name of linkage groups as a comma separated list:
53 1,2,3,4,5,6,7,8,9,10,11,12
54 default is names from one to twelve.
58 force to 'unknown' protocol type if no protocol is provided.
62 The tab-delimited map file has the following columns:
67 POSITION (must be a float! 0.0 )
73 John Binns, Adri Mills, Lukas Mueller, Naama Menda (among others).
75 Current maintainer: Lukas Mueller/Naama Menda.
82 use CXGN
::Tools
::List qw
| str_in
|;
83 use CXGN
::Tools
::File
::Spreadsheet
;
84 use CXGN
::Tools
::Text
;
85 use CXGN
::Marker
::Modifiable
;
86 use CXGN
::Marker
::Tools
;
87 use CXGN
::Marker
::Location
;
88 use CXGN
::Map
::Version
;
89 use CXGN
::DB
::Connection
;
90 use CXGN
::DB
::InsertDBH
;
94 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n, $opt_f, $opt_v, $opt_l);
96 getopts
('H:D:i:rn:fv:l:');
100 # specify linkage groups
101 # example: my $linkage_groups = ['1','2','3','4','5'];
105 $linkage_groups = [ split /\s*\,\s*/, $opt_l ];
108 $linkage_groups = [ qw
| 1 2 3 4 5 6 7 8 9 10 11 12 | ];
112 my $map_version_id = $opt_v;
115 if (!$opt_H && !$opt_D) {
116 die "-H and -D parameters are required.\n";
118 my $dbh = CXGN
::DB
::InsertDBH
->new({
121 dbargs
=> {AutoCommit
=> 0,
126 if (!$map_id && !$map_version_id) {
127 print "No map_id was provided. Insert a new map? ";
130 print "Inserting a new map...\n";
132 my $map = CXGN
::Map
->new_map($dbh, $opt_n);
134 $map_id = $map->get_map_id();
136 print "New map_id: $map_id\n";
144 # we are creating a new map version every time we run this script,
145 # as long as the transaction goes through
149 $new_map_version = CXGN
::Map
::Version
->
150 #new($dbh,{map_id=>$map_id});
151 new
($dbh,{map_id
=>$map_id},$linkage_groups);
152 # saving the new map version
153 $map_version_id = $new_map_version->insert_into_database();
154 print "map version = " . $new_map_version->as_string() . "\n";
155 # make an object to give us the values from the spreadsheet
157 elsif ($map_version_id) {
158 $new_map_version = CXGN
::Map
::Version
->
159 new
($dbh, {map_version_id
=>$map_version_id});
161 my $ss = CXGN
::Tools
::File
::Spreadsheet
->new($map_file);
162 my @markers = $ss->row_labels(); # row labels are the marker names
163 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
164 # make sure the spreadsheet is how we expect it to be
165 @columns = qw
| MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL
| # modify columns as necessary
166 or die"Column headings are not as expected";
168 # for all the (uncleaned) marker names in the spreadsheet
169 for my $dirty_marker_name(@markers) {
170 # clean it, meaning, get the subscript if it's there,
171 # and convert the name into a canonical form so we can find it
172 # if it already exists with a different spelling or something
173 my ($marker_name,$subscript) =
174 CXGN
::Marker
::Tools
::clean_marker_name
($dirty_marker_name);
175 # get as many IDs as you can for a marker with a name like this
176 my @marker_ids = CXGN
::Marker
::Tools
::marker_name_to_ids
($dbh,$marker_name);
177 # if we get more than one, our script needs to be made smarter
178 # so it can differentiate between them, or maybe one of them
179 # needs to be deleted from the database
180 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
181 # just get the first ID in the list (if the list is longer than 1, we've already died)
182 my($marker_id) = @marker_ids;
184 if($marker_id) { # if we have found an existing marker
185 # make a modifiable marker object from it
186 print "Found existing marker: $marker_id, $marker_name\n";
187 $marker = CXGN
::Marker
::Modifiable
->new($dbh,$marker_id);
189 else { # if we are making a new marker
190 # make a modifiable marker object and start to populate it
191 $marker = CXGN
::Marker
::Modifiable
->new($dbh);
192 $marker->set_marker_name($marker_name); #give it a name
193 print "Loading new marker id from marker $marker_name\n";
194 # marker must exist before locations can be added for it.
195 # this is a db constraint. if you didn't do this, this script would die later.
196 my $inserts = $marker->store_new_data();
197 # if any data was inserted for this marker (presumably it was,
198 # because we're making a new one)
199 if ($inserts and @
{$inserts}) { print"New marker inserted: $marker_name\n" }
200 else { die "Oops, I thought I was inserting some new data" }
201 $marker_id=$marker->marker_id();
204 print $marker->name_that_marker()."\n";
205 my $loc=$marker->new_location(); #create a new location object
207 # some files have pos which contains chromsome and position
208 #my $pos=$ss->value_at($dirty_marker_name,'Position')
209 # get position from spreadsheet
210 #or die "No position found for $marker_name";
211 # extract linkage group name and cm position from string like '01.035'
212 #my ($chromosome,$position) =
213 #CXGN::Marker::Tools::lg_name_and_position($pos);
215 # foreach my $me (@{$marker->current_mapping_experiments}) {
216 # print $me->{protocol}."\n";
219 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP')
220 # get chromosome from spreadsheet
221 or die"No chromosome found for $marker_name";
224 if (! str_in
($chromosome, @
$linkage_groups)) {
225 print STDERR
"$marker_name skipped because linkage_group is $chromosome...\n";
228 # some have separate fields for chromsome and position
229 my $position = $ss->value_at($dirty_marker_name,'POSITION');
230 # get position from spreadsheet
231 if (!defined($position)) {
232 print STDERR
"No position found for $marker_name\n";
237 # get confidence from spreadsheet
238 $conf = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $conf='uncalculated';
239 if ($conf=~/^(\d+)$/) {
240 if ($conf == 0) { $conf = "I"; }
241 elsif ($conf == 1) { $conf = "I(LOD2)"; }
242 elsif ($conf == 2) { $conf = "CF(LOD3)"; }
243 elsif ($conf == 3) { $conf = "F(LOD3)"; }
244 else { $conf = "uncalculated"; }
246 # get protocol from spreadsheet
247 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
248 # some entries have been mapped to the same location by more than
249 # one method separated in the spreadsheet by plus signs
250 my @protocols=split(/\+/,$protocols_string);
252 print "Protocols found: ".CXGN
::Tools
::Text
::list_to_string
(@protocols)."\n";
256 print STDERR
"Protocols not found for '$dirty_marker_name'";
257 @protocols = ('unknown');
260 die "no protocol found for $dirty_marker_name. Use -f to force protocol to unknown.";
263 for my $protocol(@protocols) {
264 $protocol =~ tr/[a-z]/[A-Z]/;
265 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
266 or $protocol eq 'SNP' or $protocol eq 'SSR'
267 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i or $protocol =~ /INDEL/i or $protocol =~ /ASPE/i or $protocol =~ /qtl/i )
269 print STDERR
"UNKNOWN protocol ($protocol)\n! ";
270 $protocol = 'unknown';
273 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
274 print "Protocol = $protocol\n";
275 # set the marker_id that will be at this location
276 $loc->marker_id($marker_id);
277 # set the map_version_id this location is found on
278 # (this must be done before calling function lg_name)
279 $loc->map_version_id($map_version_id);
280 # set the linkage group name for this marker location
281 # (the map_version_id must be already set for this to work,
282 # else how would it be able to know different linkage groups on
283 # different map versions from each other, when they all have the same names?)
284 $loc->lg_name($chromosome);
286 #set the position of the marker on this linkage group
287 $loc->position($position);
289 # set the confidence with which this marker is mapped at this position
290 $loc->confidence($conf);
292 # set the subscript for this location, because the same marker
293 # can be mapped to multiple locations, and these locations must be distinguishable
294 $loc->subscript($subscript);
296 # this method call represents the insertion into the
297 # marker_experiment table. this is currently a troublesome
298 # issue because this marker was probably mapped here via a
299 # PCR or RFLP experiment. where is this experiment data?
300 # well, it's in another spreadsheet, to be loaded later,
301 # or maybe it has already been loaded. if it was previously
302 # loaded, it was matched up with an old map version. how can we
303 # match that existing PCR/RFLP data up with this new map
304 # version? well, it will have to be done later by some other script.
305 print "Adding new experiment , marker_name = $marker_name, location = " . $loc->position . " protocol = '". $protocol . "'\n";
306 $marker->add_experiment({location
=>$loc,protocol
=>$protocol});
309 # store whatever new data you have (in this case, the new data
310 # is the location we just assigned the marker)
311 my $inserts = $marker->store_new_data();
314 # if any data was inserted for this marker (presumably it was,
315 # since we're adding locations on a brand new map version)
316 if ($inserts and @
{$inserts}) {
317 print "New marker data inserted:\n";#.Dumper($inserts);
318 print $loc->as_string();
321 else { die "Oops, I thought I was inserting some new data" }
324 # deprecate the old map version and make the new one we just made the current one
325 $new_map_version->set_current();
332 print "Failed; rolling back.\n";
338 print"Committing.\n";
342 print"Rolling back.\n";