Make the old glossarysearch.pl redirect to the new controller
[sgn.git] / bin / load_map_data.pl
blobff91b21c5713b6ec4c0291097c014c3b591e15a4
1 #!/usr/bin/perl
3 # basic script to load maps
6 # copy and edit this file as necessary
7 # common changes include the following:
8 # specified linkage groups
9 # different column headings
10 # "pos" field versus separate "chromosome" and "position" fields
12 =head1 NAME
14 load_map_data.pl - a script to load maps into the SGN database.
16 =head1 DESCRIPTION
18 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
20 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
22 Options:
24 =over 5
26 =item -H
28 The hostname of the server hosting the database.
30 =item -D
32 the name of the database
34 =item -r
36 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
38 =item -i
40 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
42 =item -n
44 required if -i is not used. Provides the map name.
46 =item -f
48 force to 'unknown' protocol type if no protocol is provided.
50 =back
52 The tab-delimited map file has the following columns:
54 MARKER
55 CONFIDENCE
56 LINKAGE_GROUP
57 POSITION (must be a float! 0.0 )
58 PROTOCOL
61 =head1 AUTHORS
63 John Binns, Adri Mills, Lukas Mueller, Naama Menda (among others).
65 Current maintainer: Lukas Mueller/Naama Menda.
67 =cut
69 use strict;
71 use Getopt::Std;
72 use CXGN::Tools::List qw | str_in |;
73 use CXGN::Tools::File::Spreadsheet;
74 use CXGN::Tools::Text;
75 use CXGN::Marker::Modifiable;
76 use CXGN::Marker::Tools;
77 use CXGN::Marker::Location;
78 use CXGN::Map::Version;
79 use CXGN::DB::Connection;
80 use CXGN::DB::InsertDBH;
81 use Data::Dumper;
84 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n, $opt_f);
86 getopts('H:D:i:rn:f');
88 my $map_id;
89 my $map_file;
90 # specify linkage groups
91 # example: my $linkage_groups = ['1','2','3','4','5'];
92 my $linkage_groups = [ qw | 1 2 3 4 5 6 7 8 9 10 11 12 | ];
94 $map_id = $opt_i;
96 $map_file = shift;
98 if (!$opt_H && !$opt_D) {
99 die "-H and -D parameters are required.\n";
101 my $dbh = CXGN::DB::InsertDBH->new({
102 dbname => $opt_D,
103 dbhost => $opt_H,
104 dbargs => {AutoCommit => 0,
105 RaiseError => 1}
108 eval {
109 if (!$map_id) {
110 print "No map_id was provided. Insert a new map? ";
111 my $key = <STDIN>;
112 if ($key =~ /Y/i) {
113 print "Inserting a new map...\n";
115 my $map = CXGN::Map->new_map($dbh, $opt_n);
117 $map_id = $map->get_map_id();
119 print "New map_id: $map_id\n";
122 else {
123 exit();
127 # we are creating a new map version every time we run this script,
128 # as long as the transaction goes through
129 my $new_map_version = CXGN::Map::Version->
130 #new($dbh,{map_id=>$map_id});
131 new($dbh,{map_id=>$map_id},$linkage_groups);
132 # saving the new map version
133 my $map_version_id = $new_map_version->insert_into_database();
134 print "map version = " . $new_map_version->as_string() . "\n";
135 # make an object to give us the values from the spreadsheet
136 my $ss = CXGN::Tools::File::Spreadsheet->new($map_file);
137 my @markers = $ss->row_labels(); # row labels are the marker names
138 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
139 # make sure the spreadsheet is how we expect it to be
140 @columns = qw | MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL | # modify columns as necessary
141 or die"Column headings are not as expected";
143 # for all the (uncleaned) marker names in the spreadsheet
144 for my $dirty_marker_name(@markers) {
145 # clean it, meaning, get the subscript if it's there,
146 # and convert the name into a canonical form so we can find it
147 # if it already exists with a different spelling or something
148 my ($marker_name,$subscript) =
149 CXGN::Marker::Tools::clean_marker_name($dirty_marker_name);
150 # get as many IDs as you can for a marker with a name like this
151 my @marker_ids = CXGN::Marker::Tools::marker_name_to_ids($dbh,$marker_name);
152 # if we get more than one, our script needs to be made smarter
153 # so it can differentiate between them, or maybe one of them
154 # needs to be deleted from the database
155 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
156 # just get the first ID in the list (if the list is longer than 1, we've already died)
157 my($marker_id) = @marker_ids;
158 my $marker;
159 if($marker_id) { # if we have found an existing marker
160 # make a modifiable marker object from it
161 print "Found existing marker: $marker_id, $marker_name\n";
162 $marker = CXGN::Marker::Modifiable->new($dbh,$marker_id);
164 else { # if we are making a new marker
165 # make a modifiable marker object and start to populate it
166 $marker = CXGN::Marker::Modifiable->new($dbh);
167 $marker->set_marker_name($marker_name); #give it a name
168 print "Loading new marker id from marker $marker_name\n";
169 # marker must exist before locations can be added for it.
170 # this is a db constraint. if you didn't do this, this script would die later.
171 my $inserts = $marker->store_new_data();
172 # if any data was inserted for this marker (presumably it was,
173 # because we're making a new one)
174 if ($inserts and @{$inserts}) { print"New marker inserted: $marker_name\n" }
175 else { die "Oops, I thought I was inserting some new data" }
176 $marker_id=$marker->marker_id();
179 print $marker->name_that_marker()."\n";
180 my $loc=$marker->new_location(); #create a new location object
182 # some files have pos which contains chromsome and position
183 #my $pos=$ss->value_at($dirty_marker_name,'Position')
184 # get position from spreadsheet
185 #or die "No position found for $marker_name";
186 # extract linkage group name and cm position from string like '01.035'
187 #my ($chromosome,$position) =
188 #CXGN::Marker::Tools::lg_name_and_position($pos);
190 # foreach my $me (@{$marker->current_mapping_experiments}) {
191 # print $me->{protocol}."\n";
194 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP')
195 # get chromosome from spreadsheet
196 or die"No chromosome found for $marker_name";
199 if (! str_in($chromosome, @$linkage_groups)) {
200 print STDERR "$marker_name skipped because linkage_group is $chromosome...\n";
201 next;
203 # some have separate fields for chromsome and position
204 my $position = $ss->value_at($dirty_marker_name,'POSITION');
205 # get position from spreadsheet
206 if (!defined($position)) {
207 print STDERR "No position found for $marker_name\n";
208 next;
211 my $conf;
212 # get confidence from spreadsheet
213 $conf = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $conf='uncalculated';
214 if ($conf=~/^(\d+)$/) {
215 if ($conf == 0) { $conf = "I"; }
216 elsif ($conf == 1) { $conf = "I(LOD2)"; }
217 elsif ($conf == 2) { $conf = "CF(LOD3)"; }
218 elsif ($conf == 3) { $conf = "F(LOD3)"; }
219 else { $conf = "uncalculated"; }
221 # get protocol from spreadsheet
222 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
223 # some entries have been mapped to the same location by more than
224 # one method separated in the spreadsheet by plus signs
225 my @protocols=split(/\+/,$protocols_string);
226 if (@protocols) {
227 print "Protocols found: ".CXGN::Tools::Text::list_to_string(@protocols)."\n";
229 else {
230 if ($opt_f) {
231 print STDERR "Protocols not found for '$dirty_marker_name'";
232 @protocols = ('unknown');
234 else {
235 die "no protocol found for $dirty_marker_name. Use -f to force protocol to unknown.";
238 for my $protocol(@protocols) {
239 $protocol =~ tr/[a-z]/[A-Z]/;
240 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
241 or $protocol eq 'SNP' or $protocol eq 'SSR'
242 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i or $protocol =~ /INDEL/i or $protocol =~ /ASPE/i or $protocol =~ /qtl/i )
244 print STDERR "UNKNOWN protocol ($protocol)\n! ";
245 $protocol = 'unknown';
248 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
249 print "Protocol = $protocol\n";
250 # set the marker_id that will be at this location
251 $loc->marker_id($marker_id);
252 # set the map_version_id this location is found on
253 # (this must be done before calling function lg_name)
254 $loc->map_version_id($map_version_id);
255 # set the linkage group name for this marker location
256 # (the map_version_id must be already set for this to work,
257 # else how would it be able to know different linkage groups on
258 # different map versions from each other, when they all have the same names?)
259 $loc->lg_name($chromosome);
261 #set the position of the marker on this linkage group
262 $loc->position($position);
264 # set the confidence with which this marker is mapped at this position
265 $loc->confidence($conf);
267 # set the subscript for this location, because the same marker
268 # can be mapped to multiple locations, and these locations must be distinguishable
269 $loc->subscript($subscript);
271 # this method call represents the insertion into the
272 # marker_experiment table. this is currently a troublesome
273 # issue because this marker was probably mapped here via a
274 # PCR or RFLP experiment. where is this experiment data?
275 # well, it's in another spreadsheet, to be loaded later,
276 # or maybe it has already been loaded. if it was previously
277 # loaded, it was matched up with an old map version. how can we
278 # match that existing PCR/RFLP data up with this new map
279 # version? well, it will have to be done later by some other script.
280 print "Adding new experiment , marker_name = $marker_name, location = " . $loc->position . " protocol = '". $protocol . "'\n";
281 $marker->add_experiment({location=>$loc,protocol=>$protocol});
284 # store whatever new data you have (in this case, the new data
285 # is the location we just assigned the marker)
286 my $inserts = $marker->store_new_data();
289 # if any data was inserted for this marker (presumably it was,
290 # since we're adding locations on a brand new map version)
291 if ($inserts and @{$inserts}) {
292 print "New marker data inserted:\n";#.Dumper($inserts);
293 print $loc->as_string();
296 else { die "Oops, I thought I was inserting some new data" }
299 # deprecate the old map version and make the new one we just made the current one
300 $new_map_version->set_current();
305 if ($@) {
306 print $@;
307 print "Failed; rolling back.\n";
308 $dbh->rollback();
310 else {
311 print"Succeeded.\n";
312 if (!$opt_r) {
313 print"Committing.\n";
314 $dbh->commit();
316 else {
317 print"Rolling back.\n";
318 $dbh->rollback();