add option (-v) to add more new markers to a given map_version.
[sgn.git] / bin / load_map_data.pl
blobe8c47d08cb8490770faa7fe00bb00e0138aacc75
1 #!/usr/bin/perl
3 # basic script to load maps
6 # copy and edit this file as necessary
7 # common changes include the following:
8 # specified linkage groups
9 # different column headings
10 # "pos" field versus separate "chromosome" and "position" fields
12 =head1 NAME
14 load_map_data.pl - a script to load maps into the SGN database.
16 =head1 DESCRIPTION
18 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
20 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
22 Options:
24 =over 5
26 =item -H
28 The hostname of the server hosting the database.
30 =item -D
32 the name of the database
34 =item -r
36 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
38 =item -i
40 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
42 =item -v
44 add data to map version provided. Conflicts with -i and -n.
46 =item -n
48 required if -i is not used. Provides the map name.
50 =item -f
52 force to 'unknown' protocol type if no protocol is provided.
54 =back
56 The tab-delimited map file has the following columns:
58 MARKER
59 CONFIDENCE
60 LINKAGE_GROUP
61 POSITION (must be a float! 0.0 )
62 PROTOCOL
65 =head1 AUTHORS
67 John Binns, Adri Mills, Lukas Mueller, Naama Menda (among others).
69 Current maintainer: Lukas Mueller/Naama Menda.
71 =cut
73 use strict;
75 use Getopt::Std;
76 use CXGN::Tools::List qw | str_in |;
77 use CXGN::Tools::File::Spreadsheet;
78 use CXGN::Tools::Text;
79 use CXGN::Marker::Modifiable;
80 use CXGN::Marker::Tools;
81 use CXGN::Marker::Location;
82 use CXGN::Map::Version;
83 use CXGN::DB::Connection;
84 use CXGN::DB::InsertDBH;
85 use Data::Dumper;
88 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n, $opt_f, $opt_v);
90 getopts('H:D:i:rn:fv:');
92 my $map_id;
93 my $map_file;
94 # specify linkage groups
95 # example: my $linkage_groups = ['1','2','3','4','5'];
96 my $linkage_groups = [ qw | 1 2 3 4 5 6 7 8 9 10 11 12 | ];
98 $map_id = $opt_i;
99 my $map_version_id = $opt_v;
100 $map_file = shift;
102 if (!$opt_H && !$opt_D) {
103 die "-H and -D parameters are required.\n";
105 my $dbh = CXGN::DB::InsertDBH->new({
106 dbname => $opt_D,
107 dbhost => $opt_H,
108 dbargs => {AutoCommit => 0,
109 RaiseError => 1}
112 eval {
113 if (!$map_id && !$map_version_id) {
114 print "No map_id was provided. Insert a new map? ";
115 my $key = <STDIN>;
116 if ($key =~ /Y/i) {
117 print "Inserting a new map...\n";
119 my $map = CXGN::Map->new_map($dbh, $opt_n);
121 $map_id = $map->get_map_id();
123 print "New map_id: $map_id\n";
126 else {
127 exit();
131 # we are creating a new map version every time we run this script,
132 # as long as the transaction goes through
133 my $new_map_version;
135 if ($map_id) {
136 $new_map_version = CXGN::Map::Version->
137 #new($dbh,{map_id=>$map_id});
138 new($dbh,{map_id=>$map_id},$linkage_groups);
139 # saving the new map version
140 $map_version_id = $new_map_version->insert_into_database();
141 print "map version = " . $new_map_version->as_string() . "\n";
142 # make an object to give us the values from the spreadsheet
144 elsif ($map_version_id) {
145 $new_map_version = CXGN::Map::Version->
146 new($dbh, {map_version_id=>$map_version_id});
148 my $ss = CXGN::Tools::File::Spreadsheet->new($map_file);
149 my @markers = $ss->row_labels(); # row labels are the marker names
150 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
151 # make sure the spreadsheet is how we expect it to be
152 @columns = qw | MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL | # modify columns as necessary
153 or die"Column headings are not as expected";
155 # for all the (uncleaned) marker names in the spreadsheet
156 for my $dirty_marker_name(@markers) {
157 # clean it, meaning, get the subscript if it's there,
158 # and convert the name into a canonical form so we can find it
159 # if it already exists with a different spelling or something
160 my ($marker_name,$subscript) =
161 CXGN::Marker::Tools::clean_marker_name($dirty_marker_name);
162 # get as many IDs as you can for a marker with a name like this
163 my @marker_ids = CXGN::Marker::Tools::marker_name_to_ids($dbh,$marker_name);
164 # if we get more than one, our script needs to be made smarter
165 # so it can differentiate between them, or maybe one of them
166 # needs to be deleted from the database
167 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
168 # just get the first ID in the list (if the list is longer than 1, we've already died)
169 my($marker_id) = @marker_ids;
170 my $marker;
171 if($marker_id) { # if we have found an existing marker
172 # make a modifiable marker object from it
173 print "Found existing marker: $marker_id, $marker_name\n";
174 $marker = CXGN::Marker::Modifiable->new($dbh,$marker_id);
176 else { # if we are making a new marker
177 # make a modifiable marker object and start to populate it
178 $marker = CXGN::Marker::Modifiable->new($dbh);
179 $marker->set_marker_name($marker_name); #give it a name
180 print "Loading new marker id from marker $marker_name\n";
181 # marker must exist before locations can be added for it.
182 # this is a db constraint. if you didn't do this, this script would die later.
183 my $inserts = $marker->store_new_data();
184 # if any data was inserted for this marker (presumably it was,
185 # because we're making a new one)
186 if ($inserts and @{$inserts}) { print"New marker inserted: $marker_name\n" }
187 else { die "Oops, I thought I was inserting some new data" }
188 $marker_id=$marker->marker_id();
191 print $marker->name_that_marker()."\n";
192 my $loc=$marker->new_location(); #create a new location object
194 # some files have pos which contains chromsome and position
195 #my $pos=$ss->value_at($dirty_marker_name,'Position')
196 # get position from spreadsheet
197 #or die "No position found for $marker_name";
198 # extract linkage group name and cm position from string like '01.035'
199 #my ($chromosome,$position) =
200 #CXGN::Marker::Tools::lg_name_and_position($pos);
202 # foreach my $me (@{$marker->current_mapping_experiments}) {
203 # print $me->{protocol}."\n";
206 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP')
207 # get chromosome from spreadsheet
208 or die"No chromosome found for $marker_name";
211 if (! str_in($chromosome, @$linkage_groups)) {
212 print STDERR "$marker_name skipped because linkage_group is $chromosome...\n";
213 next;
215 # some have separate fields for chromsome and position
216 my $position = $ss->value_at($dirty_marker_name,'POSITION');
217 # get position from spreadsheet
218 if (!defined($position)) {
219 print STDERR "No position found for $marker_name\n";
220 next;
223 my $conf;
224 # get confidence from spreadsheet
225 $conf = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $conf='uncalculated';
226 if ($conf=~/^(\d+)$/) {
227 if ($conf == 0) { $conf = "I"; }
228 elsif ($conf == 1) { $conf = "I(LOD2)"; }
229 elsif ($conf == 2) { $conf = "CF(LOD3)"; }
230 elsif ($conf == 3) { $conf = "F(LOD3)"; }
231 else { $conf = "uncalculated"; }
233 # get protocol from spreadsheet
234 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
235 # some entries have been mapped to the same location by more than
236 # one method separated in the spreadsheet by plus signs
237 my @protocols=split(/\+/,$protocols_string);
238 if (@protocols) {
239 print "Protocols found: ".CXGN::Tools::Text::list_to_string(@protocols)."\n";
241 else {
242 if ($opt_f) {
243 print STDERR "Protocols not found for '$dirty_marker_name'";
244 @protocols = ('unknown');
246 else {
247 die "no protocol found for $dirty_marker_name. Use -f to force protocol to unknown.";
250 for my $protocol(@protocols) {
251 $protocol =~ tr/[a-z]/[A-Z]/;
252 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
253 or $protocol eq 'SNP' or $protocol eq 'SSR'
254 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i or $protocol =~ /INDEL/i or $protocol =~ /ASPE/i or $protocol =~ /qtl/i )
256 print STDERR "UNKNOWN protocol ($protocol)\n! ";
257 $protocol = 'unknown';
260 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
261 print "Protocol = $protocol\n";
262 # set the marker_id that will be at this location
263 $loc->marker_id($marker_id);
264 # set the map_version_id this location is found on
265 # (this must be done before calling function lg_name)
266 $loc->map_version_id($map_version_id);
267 # set the linkage group name for this marker location
268 # (the map_version_id must be already set for this to work,
269 # else how would it be able to know different linkage groups on
270 # different map versions from each other, when they all have the same names?)
271 $loc->lg_name($chromosome);
273 #set the position of the marker on this linkage group
274 $loc->position($position);
276 # set the confidence with which this marker is mapped at this position
277 $loc->confidence($conf);
279 # set the subscript for this location, because the same marker
280 # can be mapped to multiple locations, and these locations must be distinguishable
281 $loc->subscript($subscript);
283 # this method call represents the insertion into the
284 # marker_experiment table. this is currently a troublesome
285 # issue because this marker was probably mapped here via a
286 # PCR or RFLP experiment. where is this experiment data?
287 # well, it's in another spreadsheet, to be loaded later,
288 # or maybe it has already been loaded. if it was previously
289 # loaded, it was matched up with an old map version. how can we
290 # match that existing PCR/RFLP data up with this new map
291 # version? well, it will have to be done later by some other script.
292 print "Adding new experiment , marker_name = $marker_name, location = " . $loc->position . " protocol = '". $protocol . "'\n";
293 $marker->add_experiment({location=>$loc,protocol=>$protocol});
296 # store whatever new data you have (in this case, the new data
297 # is the location we just assigned the marker)
298 my $inserts = $marker->store_new_data();
301 # if any data was inserted for this marker (presumably it was,
302 # since we're adding locations on a brand new map version)
303 if ($inserts and @{$inserts}) {
304 print "New marker data inserted:\n";#.Dumper($inserts);
305 print $loc->as_string();
308 else { die "Oops, I thought I was inserting some new data" }
311 # deprecate the old map version and make the new one we just made the current one
312 $new_map_version->set_current();
317 if ($@) {
318 print $@;
319 print "Failed; rolling back.\n";
320 $dbh->rollback();
322 else {
323 print"Succeeded.\n";
324 if (!$opt_r) {
325 print"Committing.\n";
326 $dbh->commit();
328 else {
329 print"Rolling back.\n";
330 $dbh->rollback();