evidence_with param is already a dbxref
[sgn.git] / bin / load_map_data.pl
blob757bdeb899b354539a50a56328f4a2d1ce610d63
1 #!/usr/bin/perl
3 # basic script to load maps
6 # copy and edit this file as necessary
7 # common changes include the following:
8 # specified linkage groups
9 # different column headings
10 # "pos" field versus separate "chromosome" and "position" fields
12 =head1 NAME
14 load_map_data.pl - a script to load maps into the SGN database.
16 =head1 DESCRIPTION
18 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
20 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
22 Options:
24 =over 5
26 =item -H
28 The hostname of the server hosting the database.
30 =item -D
32 the name of the database
34 =item -r
36 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
38 =item -i
40 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
42 =item -v
44 add data to map version provided. Conflicts with -i and -n.
46 =item -n
48 required if -i is not used. Provides the map name.
50 =item -l
52 specify name of linkage groups as a comma separated list:
53 1,2,3,4,5,6,7,8,9,10,11,12
54 default is names from one to twelve.
56 =item -f
58 force to 'unknown' protocol type if no protocol is provided.
60 =back
62 The tab-delimited map file has the following columns:
64 MARKER
65 CONFIDENCE
66 LINKAGE_GROUP
67 POSITION (must be a float! 0.0 )
68 PROTOCOL
71 =head1 AUTHORS
73 John Binns, Adri Mills, Lukas Mueller, Naama Menda (among others).
75 Current maintainer: Lukas Mueller/Naama Menda.
77 =cut
79 use strict;
81 use Getopt::Std;
82 use CXGN::Tools::List qw | str_in |;
83 use CXGN::Tools::File::Spreadsheet;
84 use CXGN::Tools::Text;
85 use CXGN::Marker::Modifiable;
86 use CXGN::Marker::Tools;
87 use CXGN::Marker::Location;
88 use CXGN::Map::Version;
89 use CXGN::DB::Connection;
90 use CXGN::DB::InsertDBH;
91 use Data::Dumper;
94 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n, $opt_f, $opt_v, $opt_l);
96 getopts('H:D:i:rn:fv:l:');
98 my $map_id;
99 my $map_file;
100 # specify linkage groups
101 # example: my $linkage_groups = ['1','2','3','4','5'];
103 my $linkage_groups;
104 if ($opt_l) {
105 $linkage_groups = [ split /\s*\,\s*/, $opt_l ];
107 else {
108 $linkage_groups = [ qw | 1 2 3 4 5 6 7 8 9 10 11 12 | ];
111 $map_id = $opt_i;
112 my $map_version_id = $opt_v;
113 $map_file = shift;
115 if (!$opt_H && !$opt_D) {
116 die "-H and -D parameters are required.\n";
118 my $dbh = CXGN::DB::InsertDBH->new({
119 dbname => $opt_D,
120 dbhost => $opt_H,
121 dbargs => {AutoCommit => 0,
122 RaiseError => 1}
125 eval {
126 if (!$map_id && !$map_version_id) {
127 print "No map_id was provided. Insert a new map? ";
128 my $key = <STDIN>;
129 if ($key =~ /Y/i) {
130 print "Inserting a new map...\n";
132 my $map = CXGN::Map->new_map($dbh, $opt_n);
134 $map_id = $map->get_map_id();
136 print "New map_id: $map_id\n";
139 else {
140 exit();
144 # we are creating a new map version every time we run this script,
145 # as long as the transaction goes through
146 my $new_map_version;
148 if ($map_id) {
149 $new_map_version = CXGN::Map::Version->
150 #new($dbh,{map_id=>$map_id});
151 new($dbh,{map_id=>$map_id},$linkage_groups);
152 # saving the new map version
153 $map_version_id = $new_map_version->insert_into_database();
154 print "map version = " . $new_map_version->as_string() . "\n";
155 # make an object to give us the values from the spreadsheet
157 elsif ($map_version_id) {
158 $new_map_version = CXGN::Map::Version->
159 new($dbh, {map_version_id=>$map_version_id});
161 my $ss = CXGN::Tools::File::Spreadsheet->new($map_file);
162 my @markers = $ss->row_labels(); # row labels are the marker names
163 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
164 # make sure the spreadsheet is how we expect it to be
165 @columns = qw | MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL | # modify columns as necessary
166 or die"Column headings are not as expected";
168 # for all the (uncleaned) marker names in the spreadsheet
169 for my $dirty_marker_name(@markers) {
170 # clean it, meaning, get the subscript if it's there,
171 # and convert the name into a canonical form so we can find it
172 # if it already exists with a different spelling or something
173 my ($marker_name,$subscript) =
174 CXGN::Marker::Tools::clean_marker_name($dirty_marker_name);
175 # get as many IDs as you can for a marker with a name like this
176 my @marker_ids = CXGN::Marker::Tools::marker_name_to_ids($dbh,$marker_name);
177 # if we get more than one, our script needs to be made smarter
178 # so it can differentiate between them, or maybe one of them
179 # needs to be deleted from the database
180 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
181 # just get the first ID in the list (if the list is longer than 1, we've already died)
182 my($marker_id) = @marker_ids;
183 my $marker;
184 if($marker_id) { # if we have found an existing marker
185 # make a modifiable marker object from it
186 print "Found existing marker: $marker_id, $marker_name\n";
187 $marker = CXGN::Marker::Modifiable->new($dbh,$marker_id);
189 else { # if we are making a new marker
190 # make a modifiable marker object and start to populate it
191 $marker = CXGN::Marker::Modifiable->new($dbh);
192 $marker->set_marker_name($marker_name); #give it a name
193 print "Loading new marker id from marker $marker_name\n";
194 # marker must exist before locations can be added for it.
195 # this is a db constraint. if you didn't do this, this script would die later.
196 my $inserts = $marker->store_new_data();
197 # if any data was inserted for this marker (presumably it was,
198 # because we're making a new one)
199 if ($inserts and @{$inserts}) { print"New marker inserted: $marker_name\n" }
200 else { die "Oops, I thought I was inserting some new data" }
201 $marker_id=$marker->marker_id();
204 print $marker->name_that_marker()."\n";
205 my $loc=$marker->new_location(); #create a new location object
207 # some files have pos which contains chromsome and position
208 #my $pos=$ss->value_at($dirty_marker_name,'Position')
209 # get position from spreadsheet
210 #or die "No position found for $marker_name";
211 # extract linkage group name and cm position from string like '01.035'
212 #my ($chromosome,$position) =
213 #CXGN::Marker::Tools::lg_name_and_position($pos);
215 # foreach my $me (@{$marker->current_mapping_experiments}) {
216 # print $me->{protocol}."\n";
219 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP')
220 # get chromosome from spreadsheet
221 or die"No chromosome found for $marker_name";
224 if (! str_in($chromosome, @$linkage_groups)) {
225 print STDERR "$marker_name skipped because linkage_group is $chromosome...\n";
226 next;
228 # some have separate fields for chromsome and position
229 my $position = $ss->value_at($dirty_marker_name,'POSITION');
230 # get position from spreadsheet
231 if (!defined($position)) {
232 print STDERR "No position found for $marker_name\n";
233 next;
236 my $conf;
237 # get confidence from spreadsheet
238 $conf = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $conf='uncalculated';
239 if ($conf=~/^(\d+)$/) {
240 if ($conf == 0) { $conf = "I"; }
241 elsif ($conf == 1) { $conf = "I(LOD2)"; }
242 elsif ($conf == 2) { $conf = "CF(LOD3)"; }
243 elsif ($conf == 3) { $conf = "F(LOD3)"; }
244 else { $conf = "uncalculated"; }
246 # get protocol from spreadsheet
247 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
248 # some entries have been mapped to the same location by more than
249 # one method separated in the spreadsheet by plus signs
250 my @protocols=split(/\+/,$protocols_string);
251 if (@protocols) {
252 print "Protocols found: ".CXGN::Tools::Text::list_to_string(@protocols)."\n";
254 else {
255 if ($opt_f) {
256 print STDERR "Protocols not found for '$dirty_marker_name'";
257 @protocols = ('unknown');
259 else {
260 die "no protocol found for $dirty_marker_name. Use -f to force protocol to unknown.";
263 for my $protocol(@protocols) {
264 $protocol =~ tr/[a-z]/[A-Z]/;
265 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
266 or $protocol eq 'SNP' or $protocol eq 'SSR'
267 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i or $protocol =~ /INDEL/i or $protocol =~ /ASPE/i or $protocol =~ /qtl/i )
269 print STDERR "UNKNOWN protocol ($protocol)\n! ";
270 $protocol = 'unknown';
273 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
274 print "Protocol = $protocol\n";
275 # set the marker_id that will be at this location
276 $loc->marker_id($marker_id);
277 # set the map_version_id this location is found on
278 # (this must be done before calling function lg_name)
279 $loc->map_version_id($map_version_id);
280 # set the linkage group name for this marker location
281 # (the map_version_id must be already set for this to work,
282 # else how would it be able to know different linkage groups on
283 # different map versions from each other, when they all have the same names?)
284 $loc->lg_name($chromosome);
286 #set the position of the marker on this linkage group
287 $loc->position($position);
289 # set the confidence with which this marker is mapped at this position
290 $loc->confidence($conf);
292 # set the subscript for this location, because the same marker
293 # can be mapped to multiple locations, and these locations must be distinguishable
294 $loc->subscript($subscript);
296 # this method call represents the insertion into the
297 # marker_experiment table. this is currently a troublesome
298 # issue because this marker was probably mapped here via a
299 # PCR or RFLP experiment. where is this experiment data?
300 # well, it's in another spreadsheet, to be loaded later,
301 # or maybe it has already been loaded. if it was previously
302 # loaded, it was matched up with an old map version. how can we
303 # match that existing PCR/RFLP data up with this new map
304 # version? well, it will have to be done later by some other script.
305 print "Adding new experiment , marker_name = $marker_name, location = " . $loc->position . " protocol = '". $protocol . "'\n";
306 $marker->add_experiment({location=>$loc,protocol=>$protocol});
309 # store whatever new data you have (in this case, the new data
310 # is the location we just assigned the marker)
311 my $inserts = $marker->store_new_data();
314 # if any data was inserted for this marker (presumably it was,
315 # since we're adding locations on a brand new map version)
316 if ($inserts and @{$inserts}) {
317 print "New marker data inserted:\n";#.Dumper($inserts);
318 print $loc->as_string();
321 else { die "Oops, I thought I was inserting some new data" }
324 # deprecate the old map version and make the new one we just made the current one
325 $new_map_version->set_current();
330 if ($@) {
331 print $@;
332 print "Failed; rolling back.\n";
333 $dbh->rollback();
335 else {
336 print"Succeeded.\n";
337 if (!$opt_r) {
338 print"Committing.\n";
339 $dbh->commit();
341 else {
342 print"Rolling back.\n";
343 $dbh->rollback();