3 # basic script to load maps
6 # copy and edit this file as necessary
7 # common changes include the following:
8 # specified linkage groups
9 # different column headings
10 # "pos" field versus separate "chromosome" and "position" fields
14 load_map_data.pl - a script to load maps into the SGN database.
18 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
20 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
28 The hostname of the server hosting the database.
32 the name of the database
36 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
40 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
44 required if -i is not used. Provides the map name.
48 force to 'unknown' protocol type if no protocol is provided.
52 The tab-delimited map file has the following columns:
57 POSITION (must be a float! 0.0 )
63 John Binns, Adri Mills, Lukas Mueller, Naama Menda (among others).
65 Current maintainer: Lukas Mueller/Naama Menda.
72 use CXGN
::Tools
::List qw
| str_in
|;
73 use CXGN
::Tools
::File
::Spreadsheet
;
74 use CXGN
::Tools
::Text
;
75 use CXGN
::Marker
::Modifiable
;
76 use CXGN
::Marker
::Tools
;
77 use CXGN
::Marker
::Location
;
78 use CXGN
::Map
::Version
;
79 use CXGN
::DB
::Connection
;
80 use CXGN
::DB
::InsertDBH
;
84 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n, $opt_f);
86 getopts
('H:D:i:rn:f');
90 # specify linkage groups
91 # example: my $linkage_groups = ['1','2','3','4','5'];
92 my $linkage_groups = [ qw
| 1 2 3 4 5 6 7 8 9 10 11 12 | ];
98 if (!$opt_H && !$opt_D) {
99 die "-H and -D parameters are required.\n";
101 my $dbh = CXGN
::DB
::InsertDBH
->new({
104 dbargs
=> {AutoCommit
=> 0,
110 print "No map_id was provided. Insert a new map? ";
113 print "Inserting a new map...\n";
115 my $map = CXGN
::Map
->new_map($dbh, $opt_n);
117 $map_id = $map->get_map_id();
119 print "New map_id: $map_id\n";
127 # we are creating a new map version every time we run this script,
128 # as long as the transaction goes through
129 my $new_map_version = CXGN
::Map
::Version
->
130 #new($dbh,{map_id=>$map_id});
131 new
($dbh,{map_id
=>$map_id},$linkage_groups);
132 # saving the new map version
133 my $map_version_id = $new_map_version->insert_into_database();
134 print "map version = " . $new_map_version->as_string() . "\n";
135 # make an object to give us the values from the spreadsheet
136 my $ss = CXGN
::Tools
::File
::Spreadsheet
->new($map_file);
137 my @markers = $ss->row_labels(); # row labels are the marker names
138 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
139 # make sure the spreadsheet is how we expect it to be
140 @columns = qw
| MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL
| # modify columns as necessary
141 or die"Column headings are not as expected";
143 # for all the (uncleaned) marker names in the spreadsheet
144 for my $dirty_marker_name(@markers) {
145 # clean it, meaning, get the subscript if it's there,
146 # and convert the name into a canonical form so we can find it
147 # if it already exists with a different spelling or something
148 my ($marker_name,$subscript) =
149 CXGN
::Marker
::Tools
::clean_marker_name
($dirty_marker_name);
150 # get as many IDs as you can for a marker with a name like this
151 my @marker_ids = CXGN
::Marker
::Tools
::marker_name_to_ids
($dbh,$marker_name);
152 # if we get more than one, our script needs to be made smarter
153 # so it can differentiate between them, or maybe one of them
154 # needs to be deleted from the database
155 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
156 # just get the first ID in the list (if the list is longer than 1, we've already died)
157 my($marker_id) = @marker_ids;
159 if($marker_id) { # if we have found an existing marker
160 # make a modifiable marker object from it
161 print "Found existing marker: $marker_id, $marker_name\n";
162 $marker = CXGN
::Marker
::Modifiable
->new($dbh,$marker_id);
164 else { # if we are making a new marker
165 # make a modifiable marker object and start to populate it
166 $marker = CXGN
::Marker
::Modifiable
->new($dbh);
167 $marker->set_marker_name($marker_name); #give it a name
168 print "Loading new marker id from marker $marker_name\n";
169 # marker must exist before locations can be added for it.
170 # this is a db constraint. if you didn't do this, this script would die later.
171 my $inserts = $marker->store_new_data();
172 # if any data was inserted for this marker (presumably it was,
173 # because we're making a new one)
174 if ($inserts and @
{$inserts}) { print"New marker inserted: $marker_name\n" }
175 else { die "Oops, I thought I was inserting some new data" }
176 $marker_id=$marker->marker_id();
179 print $marker->name_that_marker()."\n";
180 my $loc=$marker->new_location(); #create a new location object
182 # some files have pos which contains chromsome and position
183 #my $pos=$ss->value_at($dirty_marker_name,'Position')
184 # get position from spreadsheet
185 #or die "No position found for $marker_name";
186 # extract linkage group name and cm position from string like '01.035'
187 #my ($chromosome,$position) =
188 #CXGN::Marker::Tools::lg_name_and_position($pos);
190 # foreach my $me (@{$marker->current_mapping_experiments}) {
191 # print $me->{protocol}."\n";
194 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP')
195 # get chromosome from spreadsheet
196 or die"No chromosome found for $marker_name";
199 if (! str_in
($chromosome, @
$linkage_groups)) {
200 print STDERR
"$marker_name skipped because linkage_group is $chromosome...\n";
203 # some have separate fields for chromsome and position
204 my $position = $ss->value_at($dirty_marker_name,'POSITION');
205 # get position from spreadsheet
206 if (!defined($position)) {
207 print STDERR
"No position found for $marker_name\n";
212 # get confidence from spreadsheet
213 $conf = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $conf='uncalculated';
214 if ($conf=~/^(\d+)$/) {
215 if ($conf == 0) { $conf = "I"; }
216 elsif ($conf == 1) { $conf = "I(LOD2)"; }
217 elsif ($conf == 2) { $conf = "CF(LOD3)"; }
218 elsif ($conf == 3) { $conf = "F(LOD3)"; }
219 else { $conf = "uncalculated"; }
221 # get protocol from spreadsheet
222 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
223 # some entries have been mapped to the same location by more than
224 # one method separated in the spreadsheet by plus signs
225 my @protocols=split(/\+/,$protocols_string);
227 print "Protocols found: ".CXGN
::Tools
::Text
::list_to_string
(@protocols)."\n";
231 print STDERR
"Protocols not found for '$dirty_marker_name'";
232 @protocols = ('unknown');
235 die "no protocol found for $dirty_marker_name. Use -f to force protocol to unknown.";
238 for my $protocol(@protocols) {
239 $protocol =~ tr/[a-z]/[A-Z]/;
240 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
241 or $protocol eq 'SNP' or $protocol eq 'SSR'
242 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i or $protocol =~ /INDEL/i or $protocol =~ /ASPE/i or $protocol =~ /qtl/i )
244 print STDERR
"UNKNOWN protocol ($protocol)\n! ";
245 $protocol = 'unknown';
248 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
249 print "Protocol = $protocol\n";
250 # set the marker_id that will be at this location
251 $loc->marker_id($marker_id);
252 # set the map_version_id this location is found on
253 # (this must be done before calling function lg_name)
254 $loc->map_version_id($map_version_id);
255 # set the linkage group name for this marker location
256 # (the map_version_id must be already set for this to work,
257 # else how would it be able to know different linkage groups on
258 # different map versions from each other, when they all have the same names?)
259 $loc->lg_name($chromosome);
261 #set the position of the marker on this linkage group
262 $loc->position($position);
264 # set the confidence with which this marker is mapped at this position
265 $loc->confidence($conf);
267 # set the subscript for this location, because the same marker
268 # can be mapped to multiple locations, and these locations must be distinguishable
269 $loc->subscript($subscript);
271 # this method call represents the insertion into the
272 # marker_experiment table. this is currently a troublesome
273 # issue because this marker was probably mapped here via a
274 # PCR or RFLP experiment. where is this experiment data?
275 # well, it's in another spreadsheet, to be loaded later,
276 # or maybe it has already been loaded. if it was previously
277 # loaded, it was matched up with an old map version. how can we
278 # match that existing PCR/RFLP data up with this new map
279 # version? well, it will have to be done later by some other script.
280 print "Adding new experiment , marker_name = $marker_name, location = " . $loc->position . " protocol = '". $protocol . "'\n";
281 $marker->add_experiment({location
=>$loc,protocol
=>$protocol});
284 # store whatever new data you have (in this case, the new data
285 # is the location we just assigned the marker)
286 my $inserts = $marker->store_new_data();
289 # if any data was inserted for this marker (presumably it was,
290 # since we're adding locations on a brand new map version)
291 if ($inserts and @
{$inserts}) {
292 print "New marker data inserted:\n";#.Dumper($inserts);
293 print $loc->as_string();
296 else { die "Oops, I thought I was inserting some new data" }
299 # deprecate the old map version and make the new one we just made the current one
300 $new_map_version->set_current();
307 print "Failed; rolling back.\n";
313 print"Committing.\n";
317 print"Rolling back.\n";