3 # basic script to load maps
6 # copy and edit this file as necessary
7 # common changes include the following:
8 # specified linkage groups
9 # different column headings
10 # "pos" field versus separate "chromosome" and "position" fields
14 load_map_data.pl - a script to load maps into the SGN database.
18 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
20 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
28 The hostname of the server hosting the database.
32 the name of the database
36 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
40 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
44 add data to map version provided. Conflicts with -i and -n.
48 required if -i is not used. Provides the map name.
52 force to 'unknown' protocol type if no protocol is provided.
56 The tab-delimited map file has the following columns:
61 POSITION (must be a float! 0.0 )
67 John Binns, Adri Mills, Lukas Mueller, Naama Menda (among others).
69 Current maintainer: Lukas Mueller/Naama Menda.
76 use CXGN
::Tools
::List qw
| str_in
|;
77 use CXGN
::Tools
::File
::Spreadsheet
;
78 use CXGN
::Tools
::Text
;
79 use CXGN
::Marker
::Modifiable
;
80 use CXGN
::Marker
::Tools
;
81 use CXGN
::Marker
::Location
;
82 use CXGN
::Map
::Version
;
83 use CXGN
::DB
::Connection
;
84 use CXGN
::DB
::InsertDBH
;
88 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n, $opt_f, $opt_v);
90 getopts
('H:D:i:rn:fv:');
94 # specify linkage groups
95 # example: my $linkage_groups = ['1','2','3','4','5'];
96 my $linkage_groups = [ qw
| 1 2 3 4 5 6 7 8 9 10 11 12 | ];
99 my $map_version_id = $opt_v;
102 if (!$opt_H && !$opt_D) {
103 die "-H and -D parameters are required.\n";
105 my $dbh = CXGN
::DB
::InsertDBH
->new({
108 dbargs
=> {AutoCommit
=> 0,
113 if (!$map_id && !$map_version_id) {
114 print "No map_id was provided. Insert a new map? ";
117 print "Inserting a new map...\n";
119 my $map = CXGN
::Map
->new_map($dbh, $opt_n);
121 $map_id = $map->get_map_id();
123 print "New map_id: $map_id\n";
131 # we are creating a new map version every time we run this script,
132 # as long as the transaction goes through
136 $new_map_version = CXGN
::Map
::Version
->
137 #new($dbh,{map_id=>$map_id});
138 new
($dbh,{map_id
=>$map_id},$linkage_groups);
139 # saving the new map version
140 $map_version_id = $new_map_version->insert_into_database();
141 print "map version = " . $new_map_version->as_string() . "\n";
142 # make an object to give us the values from the spreadsheet
144 elsif ($map_version_id) {
145 $new_map_version = CXGN
::Map
::Version
->
146 new
($dbh, {map_version_id
=>$map_version_id});
148 my $ss = CXGN
::Tools
::File
::Spreadsheet
->new($map_file);
149 my @markers = $ss->row_labels(); # row labels are the marker names
150 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
151 # make sure the spreadsheet is how we expect it to be
152 @columns = qw
| MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL
| # modify columns as necessary
153 or die"Column headings are not as expected";
155 # for all the (uncleaned) marker names in the spreadsheet
156 for my $dirty_marker_name(@markers) {
157 # clean it, meaning, get the subscript if it's there,
158 # and convert the name into a canonical form so we can find it
159 # if it already exists with a different spelling or something
160 my ($marker_name,$subscript) =
161 CXGN
::Marker
::Tools
::clean_marker_name
($dirty_marker_name);
162 # get as many IDs as you can for a marker with a name like this
163 my @marker_ids = CXGN
::Marker
::Tools
::marker_name_to_ids
($dbh,$marker_name);
164 # if we get more than one, our script needs to be made smarter
165 # so it can differentiate between them, or maybe one of them
166 # needs to be deleted from the database
167 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
168 # just get the first ID in the list (if the list is longer than 1, we've already died)
169 my($marker_id) = @marker_ids;
171 if($marker_id) { # if we have found an existing marker
172 # make a modifiable marker object from it
173 print "Found existing marker: $marker_id, $marker_name\n";
174 $marker = CXGN
::Marker
::Modifiable
->new($dbh,$marker_id);
176 else { # if we are making a new marker
177 # make a modifiable marker object and start to populate it
178 $marker = CXGN
::Marker
::Modifiable
->new($dbh);
179 $marker->set_marker_name($marker_name); #give it a name
180 print "Loading new marker id from marker $marker_name\n";
181 # marker must exist before locations can be added for it.
182 # this is a db constraint. if you didn't do this, this script would die later.
183 my $inserts = $marker->store_new_data();
184 # if any data was inserted for this marker (presumably it was,
185 # because we're making a new one)
186 if ($inserts and @
{$inserts}) { print"New marker inserted: $marker_name\n" }
187 else { die "Oops, I thought I was inserting some new data" }
188 $marker_id=$marker->marker_id();
191 print $marker->name_that_marker()."\n";
192 my $loc=$marker->new_location(); #create a new location object
194 # some files have pos which contains chromsome and position
195 #my $pos=$ss->value_at($dirty_marker_name,'Position')
196 # get position from spreadsheet
197 #or die "No position found for $marker_name";
198 # extract linkage group name and cm position from string like '01.035'
199 #my ($chromosome,$position) =
200 #CXGN::Marker::Tools::lg_name_and_position($pos);
202 # foreach my $me (@{$marker->current_mapping_experiments}) {
203 # print $me->{protocol}."\n";
206 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP')
207 # get chromosome from spreadsheet
208 or die"No chromosome found for $marker_name";
211 if (! str_in
($chromosome, @
$linkage_groups)) {
212 print STDERR
"$marker_name skipped because linkage_group is $chromosome...\n";
215 # some have separate fields for chromsome and position
216 my $position = $ss->value_at($dirty_marker_name,'POSITION');
217 # get position from spreadsheet
218 if (!defined($position)) {
219 print STDERR
"No position found for $marker_name\n";
224 # get confidence from spreadsheet
225 $conf = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $conf='uncalculated';
226 if ($conf=~/^(\d+)$/) {
227 if ($conf == 0) { $conf = "I"; }
228 elsif ($conf == 1) { $conf = "I(LOD2)"; }
229 elsif ($conf == 2) { $conf = "CF(LOD3)"; }
230 elsif ($conf == 3) { $conf = "F(LOD3)"; }
231 else { $conf = "uncalculated"; }
233 # get protocol from spreadsheet
234 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
235 # some entries have been mapped to the same location by more than
236 # one method separated in the spreadsheet by plus signs
237 my @protocols=split(/\+/,$protocols_string);
239 print "Protocols found: ".CXGN
::Tools
::Text
::list_to_string
(@protocols)."\n";
243 print STDERR
"Protocols not found for '$dirty_marker_name'";
244 @protocols = ('unknown');
247 die "no protocol found for $dirty_marker_name. Use -f to force protocol to unknown.";
250 for my $protocol(@protocols) {
251 $protocol =~ tr/[a-z]/[A-Z]/;
252 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
253 or $protocol eq 'SNP' or $protocol eq 'SSR'
254 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i or $protocol =~ /INDEL/i or $protocol =~ /ASPE/i or $protocol =~ /qtl/i )
256 print STDERR
"UNKNOWN protocol ($protocol)\n! ";
257 $protocol = 'unknown';
260 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
261 print "Protocol = $protocol\n";
262 # set the marker_id that will be at this location
263 $loc->marker_id($marker_id);
264 # set the map_version_id this location is found on
265 # (this must be done before calling function lg_name)
266 $loc->map_version_id($map_version_id);
267 # set the linkage group name for this marker location
268 # (the map_version_id must be already set for this to work,
269 # else how would it be able to know different linkage groups on
270 # different map versions from each other, when they all have the same names?)
271 $loc->lg_name($chromosome);
273 #set the position of the marker on this linkage group
274 $loc->position($position);
276 # set the confidence with which this marker is mapped at this position
277 $loc->confidence($conf);
279 # set the subscript for this location, because the same marker
280 # can be mapped to multiple locations, and these locations must be distinguishable
281 $loc->subscript($subscript);
283 # this method call represents the insertion into the
284 # marker_experiment table. this is currently a troublesome
285 # issue because this marker was probably mapped here via a
286 # PCR or RFLP experiment. where is this experiment data?
287 # well, it's in another spreadsheet, to be loaded later,
288 # or maybe it has already been loaded. if it was previously
289 # loaded, it was matched up with an old map version. how can we
290 # match that existing PCR/RFLP data up with this new map
291 # version? well, it will have to be done later by some other script.
292 print "Adding new experiment , marker_name = $marker_name, location = " . $loc->position . " protocol = '". $protocol . "'\n";
293 $marker->add_experiment({location
=>$loc,protocol
=>$protocol});
296 # store whatever new data you have (in this case, the new data
297 # is the location we just assigned the marker)
298 my $inserts = $marker->store_new_data();
301 # if any data was inserted for this marker (presumably it was,
302 # since we're adding locations on a brand new map version)
303 if ($inserts and @
{$inserts}) {
304 print "New marker data inserted:\n";#.Dumper($inserts);
305 print $loc->as_string();
308 else { die "Oops, I thought I was inserting some new data" }
311 # deprecate the old map version and make the new one we just made the current one
312 $new_map_version->set_current();
319 print "Failed; rolling back.\n";
325 print"Committing.\n";
329 print"Rolling back.\n";