make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / load_map_data.pl
blobd230dd78fcf62ea46ccc9ffd8facf84b65ccff84
1 #!/usr/bin/perl
3 # basic script to load maps
6 # copy and edit this file as necessary
7 # common changes include the following:
8 # specified linkage groups
9 # different column headings
10 # "pos" field versus separate "chromosome" and "position" fields
12 =head1 NAME
14 load_map_data.pl - a script to load maps into the SGN database.
16 =head1 DESCRIPTION
18 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
20 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
22 Options:
24 =over 5
26 =item -H
28 The hostname of the server hosting the database.
30 =item -D
32 the name of the database
34 =item -r
36 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
38 =item -i
40 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
42 =item -v
44 add data to map version provided. Conflicts with -i and -n.
46 =item -n
48 required if -i is not used. Provides the map name.
50 =item -l
52 specify name of linkage groups as a comma separated list:
53 1,2,3,4,5,6,7,8,9,10,11,12
54 default is names from one to twelve.
56 =item -f
58 force to 'unknown' protocol type if no protocol is provided.
60 =back
62 The tab-delimited map file has the following columns:
64 MARKER
65 CONFIDENCE
66 LINKAGE_GROUP
67 POSITION (must be a float! 0.0 )
68 PROTOCOL
71 =head1 AUTHORS
73 John Binns, Adri Mills, Lukas Mueller, Naama Menda (among others).
75 Current maintainer: Lukas Mueller/Naama Menda.
77 =cut
79 use strict;
81 use Getopt::Std;
82 use CXGN::Tools::List qw | str_in |;
83 use CXGN::Tools::File::Spreadsheet;
84 use CXGN::Tools::Text;
85 use CXGN::Marker::Modifiable;
86 use CXGN::Marker::Tools;
87 use CXGN::Marker::Location;
88 use CXGN::Map::Version;
89 use CXGN::DB::Connection;
90 use CXGN::DB::InsertDBH;
91 use Data::Dumper;
94 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n, $opt_f, $opt_v, $opt_l);
96 getopts('H:D:i:rn:fv:l:');
98 my $map_id;
99 my $map_file;
100 # specify linkage groups
101 # example: my $linkage_groups = ['1','2','3','4','5'];
103 my $linkage_groups;
104 if ($opt_l) {
105 $linkage_groups = [ split /\s*\,\s*/, $opt_l ];
107 else {
108 $linkage_groups = [ qw | 1 2 3 4 5 6 7 8 9 10 11 12 | ];
111 $map_id = $opt_i;
112 my $map_version_id = $opt_v;
113 $map_file = shift;
115 if (!$opt_H && !$opt_D) {
116 die "-H and -D parameters are required.\n";
118 my $dbh = CXGN::DB::InsertDBH->new({
119 dbname => $opt_D,
120 dbhost => $opt_H,
121 dbargs => {AutoCommit => 0,
122 RaiseError => 1}
125 eval {
126 if (!$map_id && !$map_version_id) {
127 print "No map_id was provided. Insert a new map? ";
128 my $key = <STDIN>;
129 if ($key =~ /Y/i) {
130 print "Inserting a new map...\n";
132 my $map = CXGN::Map->new_map($dbh, $opt_n);
134 $map_id = $map->get_map_id();
136 print "New map_id: $map_id\n";
139 else {
140 exit();
144 # we are creating a new map version every time we run this script,
145 # as long as the transaction goes through
146 my $new_map_version;
148 if ($map_id) {
149 $new_map_version = CXGN::Map::Version->
150 #new($dbh,{map_id=>$map_id});
151 new($dbh,{map_id=>$map_id},$linkage_groups);
152 # saving the new map version
153 $map_version_id = $new_map_version->insert_into_database();
154 print "map version = " . $new_map_version->as_string() . "\n";
155 # make an object to give us the values from the spreadsheet
157 elsif ($map_version_id) {
158 $new_map_version = CXGN::Map::Version->
159 new($dbh, {map_version_id=>$map_version_id});
161 my $ss = CXGN::Tools::File::Spreadsheet->new($map_file);
162 my @markers = $ss->row_labels(); # row labels are the marker names
163 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
164 # make sure the spreadsheet is how we expect it to be
165 @columns = qw | MARKER LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL | # modify columns as necessary
166 # @columns = qw | MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL | # modify columns as necessary
167 or die"Column headings are not as expected";
169 # for all the (uncleaned) marker names in the spreadsheet
171 for my $dirty_marker_name(@markers) {
173 # clean it, meaning, get the subscript if it's there,
174 # and convert the name into a canonical form so we can find it
175 # if it already exists with a different spelling or something
177 my ($marker_name,$subscript) = ($dirty_marker_name, "");
178 #CXGN::Marker::Tools::clean_marker_name($dirty_marker_name);
180 # get as many IDs as you can for a marker with a name like this
182 my @marker_ids = CXGN::Marker::Tools::marker_name_to_ids($dbh,$marker_name);
183 # if we get more than one, our script needs to be made smarter
184 # so it can differentiate between them, or maybe one of them
185 # needs to be deleted from the database
187 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
188 # just get the first ID in the list (if the list is longer than 1,
189 # we've already died)
191 my ($marker_id) = @marker_ids;
192 my $marker;
194 if($marker_id) {
195 # if we have found an existing marker
196 # make a modifiable marker object from it
198 print "Found existing marker: $marker_id, $marker_name\n";
199 $marker = CXGN::Marker::Modifiable->new($dbh,$marker_id);
201 else {
202 # if we are making a new marker
203 # make a modifiable marker object and start to populate it
205 print "Loading new marker id from marker $marker_name\n";
206 $marker = CXGN::Marker::Modifiable->new($dbh);
207 $marker->set_marker_name($marker_name); #give it a name
210 # marker must exist before locations can be added for it.
211 # this is a db constraint. if you didn't do this, this script
212 # would die later.
214 my $inserts = $marker->store_new_data();
216 # if any data was inserted for this marker (presumably it was,
217 # because we're making a new one)
219 if ($inserts and @{$inserts}) { print"New marker inserted: $marker_name\n" }
220 else { die "Oops, I thought I was inserting some new data" }
221 $marker_id=$marker->marker_id();
225 print $marker->get_name()."\n";
227 my $loc=$marker->new_location(); #create a new location object
229 # some files have pos which contains chromsome and position
230 #my $pos=$ss->value_at($dirty_marker_name,'Position')
231 # get position from spreadsheet
232 #or die "No position found for $marker_name";
233 # extract linkage group name and cm position from string like '01.035'
234 #my ($chromosome,$position) =
235 #CXGN::Marker::Tools::lg_name_and_position($pos);
237 # foreach my $me (@{$marker->current_mapping_experiments}) {
238 # print $me->{protocol}."\n";
241 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP'); # get chromosome from spreadsheet
242 if (!defined($chromosome)) { die"No chromosome found for $marker_name"; }
244 if (! str_in($chromosome, @$linkage_groups)) {
245 print STDERR "$marker_name skipped because linkage_group is $chromosome...\n";
246 next;
249 # some have separate fields for chromsome and position
251 my $position = $ss->value_at($dirty_marker_name,'POSITION');
252 # get position from spreadsheet
253 if (!defined($position)) {
254 print STDERR "No position found for $marker_name\n";
255 next;
258 my $confidence;
260 # get confidence from spreadsheet
262 $confidence = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $confidence='uncalculated';
263 if ($confidence=~/^(\d+)$/) {
264 if ($confidence == 0) { $confidence = "I"; }
265 elsif ($confidence == 1) { $confidence = "I(LOD2)"; }
266 elsif ($confidence == 2) { $confidence = "CF(LOD3)"; }
267 elsif ($confidence == 3) { $confidence = "F(LOD3)"; }
268 else { $confidence = "uncalculated"; }
270 # get protocol from spreadsheet
272 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
274 # some entries have been mapped to the same location by more than
275 # one method separated in the spreadsheet by plus signs
277 my @protocols=split(/\+/,$protocols_string);
278 if (@protocols) {
279 print "Protocols found: ".CXGN::Tools::Text::list_to_string(@protocols)."\n";
281 else {
282 if ($opt_f) {
283 print STDERR "Protocols not found for '$dirty_marker_name'";
284 @protocols = ('unknown' x scalar(@protocols));
286 else {
287 die "no protocol found for $dirty_marker_name. Use -f to force protocol to unknown.";
290 for my $protocol(@protocols) {
291 $protocol = uc($protocol);
292 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
293 or $protocol eq 'SNP' or $protocol eq 'SSR'
294 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i or $protocol =~ /INDEL/i or $protocol =~ /ASPE/i or $protocol =~ /qtl/i )
296 print STDERR "UNKNOWN protocol ($protocol)\n! ";
297 $protocol = 'unknown';
300 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
301 print "Protocol = $protocol\n";
303 # set the marker_id that will be at this location
305 $loc->marker_id($marker_id);
307 # set the map_version_id this location is found on
308 # (this must be done before calling function lg_name)
310 $loc->map_version_id($map_version_id);
312 # set the linkage group name for this marker location
313 # (the map_version_id must be already set for this to work,
314 # else how would it be able to know different linkage groups on
315 # different map versions from each other, when they all have the same names?)
317 $loc->lg_name($chromosome);
319 #set the position of the marker on this linkage group
321 $loc->position($position);
323 # set the confidence with which this marker is mapped at this position
325 $loc->confidence($confidence);
327 # set the subscript for this location, because the same marker
328 # can be mapped to multiple locations, and these locations must be distinguishable
330 $loc->subscript($subscript);
332 # this method call represents the insertion into the
333 # marker_experiment table. this is currently a troublesome
334 # issue because this marker was probably mapped here via a
335 # PCR or RFLP experiment. where is this experiment data?
336 # well, it's in another spreadsheet, to be loaded later,
337 # or maybe it has already been loaded. if it was previously
338 # loaded, it was matched up with an old map version. how can we
339 # match that existing PCR/RFLP data up with this new map
340 # version? well, it will have to be done later by some other script.
342 print "Adding new experiment , marker_name = $marker_name, location = " . $loc->position . " protocol = '". $protocol . "'\n";
343 $marker->add_experiment({location=>$loc,protocol=>$protocol});
346 # store whatever new data you have (in this case, the new data
347 # is the location we just assigned the marker)
349 print "Storing new marker data...\n";
350 my $inserts = $marker->store_new_data();
353 # if any data was inserted for this marker (presumably it was,
354 # since we're adding locations on a brand new map version)
356 if ($inserts and @{$inserts}) {
357 print "New marker data inserted:\n";#.Dumper($inserts);
358 print $loc->as_string();
361 else { die "Oops, I thought I was inserting some new data" }
364 # deprecate the old map version and make the new one we just made the current one
366 $new_map_version->set_current();
371 if ($@) {
372 print $@;
373 print "Failed; rolling back.\n";
374 $dbh->rollback();
376 else {
377 print"Succeeded.\n";
378 if (!$opt_r) {
379 print"Committing.\n";
380 $dbh->commit();
382 else {
383 print"Rolling back.\n";
384 $dbh->rollback();