3 # basic script to load maps
5 # usage: load_map_data <dbhost> <dbname> <COMMIT|ROLLBACK> <map_id> <map file>
6 # example: load_map_data.pl db-devel sandbox COMMIT 9 map-file.csv
8 # copy and edit this file as necessary
9 # common changes include the following:
10 # specified linkage groups
11 # different column headings
12 # "pos" field versus separate "chromosome" and "position" fields
16 load_map_data.pl - a script to load maps into the SGN database.
20 usage: load_map_data -H dbhost -D dbname [-r] [-n "map name"] [-i map_id] <map file>
22 example: load_map_data.pl -H devel -D sandbox -r -i 9 map-file.csv
30 The hostname of the server hosting the database.
34 the name of the database
38 (optional) if present, rollback after the script terminates. Database should not be affected. Good for test runs.
42 (optional) the map_id. If not present, will insert a brand new map (confirm dialog).
46 required if -i is not used. Provides the map name.
50 The tab-delimited map file has the following columns:
61 John Binns, Adri Mills, Lukas Mueller (among others).
63 Current maintainer: Lukas Mueller.
70 use CXGN
::Tools
::List qw
| str_in
|;
71 use CXGN
::Tools
::File
::Spreadsheet
;
72 use CXGN
::Tools
::Text
;
73 use CXGN
::Marker
::Modifiable
;
74 use CXGN
::Marker
::Tools
;
75 use CXGN
::Marker
::Location
;
76 use CXGN
::Map
::Version
;
77 use CXGN
::DB
::Connection
;
78 use CXGN
::DB
::InsertDBH
;
82 our ($opt_H, $opt_D, $opt_i, $opt_r, $opt_n);
88 # specify linkage groups
89 # example: my $linkage_groups = ['1','2','3','4','5'];
90 my $linkage_groups = [ qw
| 1 2 3 4 5 6 7 8 9 10 11 12 | ];
93 # print "Usage: load_map_data <dbhost> <dbname> "
94 # ."<COMMIT|ROLLBACK> <map_id> <map file>\n" and exit();
96 # unless ($ARGV[0] eq 'db-devel' or $ARGV[0] eq 'db') {
97 # die "First argument must be valid database host";
99 # unless ($ARGV[1] eq 'sandbox' or $ARGV[1] eq 'cxgn') {
100 # die "Second argument must be valid database name";
102 # unless ($ARGV[2] eq 'COMMIT' or $ARGV[2] eq 'ROLLBACK') {
103 # die "Third argument must be either COMMIT or ROLLBACK";
105 # unless ($map_id=$ARGV[3]) {
106 # die "Fourth argument must be the map_id";
108 # unless ($map_file=$ARGV[4]) {
109 # die "Fifth argument must be the map positions file";
116 if (!$opt_H && !$opt_D) {
117 die "-H and -D parameters are required.\n";
120 my $dbh=CXGN
::DB
::InsertDBH
::connect
121 ({dbhost
=>$opt_H,dbname
=>$opt_D,dbschema
=>'sgn'});
126 print "No map_id was provided. Insert a new map? ";
129 print "Inserting a new map...\n";
131 my $map = CXGN
::Map
->new_map($dbh, $opt_n);
133 $map_id = $map->get_map_id();
135 print "New map_id: $map_id\n";
145 # we are creating a new map version every time we run this script,
146 # as long as the transaction goes through
147 my $new_map_version = CXGN
::Map
::Version
->
148 #new($dbh,{map_id=>$map_id});
149 new
($dbh,{map_id
=>$map_id},$linkage_groups);
150 # saving the new map version
151 my $map_version_id = $new_map_version->insert_into_database();
152 print $new_map_version->as_string();
153 # make an object to give us the values from the spreadsheet
154 my $ss = CXGN
::Tools
::File
::Spreadsheet
->new($map_file);
155 my @markers = $ss->row_labels(); # row labels are the marker names
156 my @columns = $ss->column_labels(); # column labels are the headings for the data columns
157 # make sure the spreadsheet is how we expect it to be
158 @columns = qw
| MARKER MARKER_ID LINKAGE_GROUP POSITION CONFIDENCE PROTOCOL
| # modify columns as necessary
159 or die"Column headings are not as expected";
161 # for all the (uncleaned) marker names in the spreadsheet
162 for my $dirty_marker_name(@markers) {
163 # clean it, meaning, get the subscript if it's there,
164 # and convert the name into a canonical form so we can find it
165 # if it already exists with a different spelling or something
166 my ($marker_name,$subscript) =
167 CXGN
::Marker
::Tools
::clean_marker_name
($dirty_marker_name);
168 # get as many IDs as you can for a marker with a name like this
169 my @marker_ids = CXGN
::Marker
::Tools
::marker_name_to_ids
($dbh,$marker_name);
170 # if we get more than one, our script needs to be made smarter
171 # so it can differentiate between them, or maybe one of them
172 # needs to be deleted from the database
173 if (@marker_ids>1) { die "Too many IDs found for marker '$marker_name'" }
174 # just get the first ID in the list (if the list is longer than 1, we've already died)
175 my($marker_id) = @marker_ids;
177 if($marker_id) { # if we have found an existing marker
178 # make a modifiable marker object from it
179 $marker = CXGN
::Marker
::Modifiable
->new($dbh,$marker_id);
181 else { # if we are making a new marker
182 # make a modifiable marker object and start to populate it
183 $marker = CXGN
::Marker
::Modifiable
->new($dbh);
184 $marker->set_marker_name($marker_name); #give it a name
185 # marker must exist before locations can be added for it.
186 # this is a db constraint. if you didn't do this, this script would die later.
187 my $inserts = $marker->store_new_data();
188 # if any data was inserted for this marker (presumably it was,
189 # because we're making a new one)
190 if ($inserts and @
{$inserts}) { print"New marker inserted: $marker_name\n" }
191 else { die "Oops, I thought I was inserting some new data" }
192 $marker_id=$marker->marker_id();
194 print $marker->name_that_marker()."\n";
195 my $loc=$marker->new_location(); #create a new location object
197 # some files have pos which contains chromsome and position
198 #my $pos=$ss->value_at($dirty_marker_name,'Position')
199 # get position from spreadsheet
200 #or die "No position found for $marker_name";
201 # extract linkage group name and cm position from string like '01.035'
202 #my ($chromosome,$position) =
203 #CXGN::Marker::Tools::lg_name_and_position($pos);
205 my $chromosome=$ss->value_at($dirty_marker_name,'LINKAGE_GROUP')
206 # get chromosome from spreadsheet
207 or die"No chromosome found for $marker_name";
210 if (! str_in
($chromosome, @
$linkage_groups)) {
211 print STDERR
"$marker_name skipped because linkage_group is $chromosome...\n";
214 # some have separate fields for chromsome and position
215 my $position = $ss->value_at($dirty_marker_name,'POSITION');
216 # get position from spreadsheet
218 print STDERR
"No position found for $marker_name\n";
224 # get confidence from spreadsheet
225 $conf = $ss->value_at($dirty_marker_name,'CONFIDENCE') or $conf='uncalculated';
226 if ($conf=~/^(\d+)$/) {
227 if ($conf == 0) { $conf = "I"; }
228 elsif ($conf == 1) { $conf = "I(LOD2)"; }
229 elsif ($conf == 2) { $conf = "CF(LOD3)"; }
230 elsif ($conf == 3) { $conf = "F(LOD3)"; }
231 else { $conf = "uncalculated"; }
233 # get protocol from spreadsheet
234 my $protocols_string=uc($ss->value_at($dirty_marker_name,'PROTOCOL'));
235 # some entries have been mapped to the same location by more than
236 # one method separated in the spreadsheet by plus signs
237 my @protocols=split(/\+/,$protocols_string);
239 print "Protocols found: ".CXGN
::Tools
::Text
::list_to_string
(@protocols)."\n";
241 else { die "Protocols not found for '$dirty_marker_name'" }
242 for my $protocol(@protocols) {
243 $protocol =~ tr/[a-z]/[A-Z]/;
244 unless ($protocol eq 'AFLP' or $protocol eq 'CAPS' or $protocol eq 'RAPD'
245 or $protocol eq 'SNP' or $protocol eq 'SSR'
246 or $protocol eq 'RFLP' or $protocol eq 'PCR' or $protocol eq 'DCAPS' or $protocol =~/DArt/i or $protocol =~ /OPA/i )
248 $protocol = 'unknown';
251 if ($protocol eq 'DCAPS') { $protocol = 'dCAPS' }
253 # set the marker_id that will be at this location
254 $loc->marker_id($marker_id);
255 # set the map_version_id this location is found on
256 # (this must be done before calling function lg_name)
257 $loc->map_version_id($map_version_id);
258 # set the linkage group name for this marker location
259 # (the map_version_id must be already set for this to work,
260 # else how would it be able to know different linkage groups on
261 # different map versions from each other, when they all have the same names?)
262 $loc->lg_name($chromosome);
264 #set the position of the marker on this linkage group
265 $loc->position($position);
267 # set the confidence with which this marker is mapped at this position
268 $loc->confidence($conf);
270 # set the subscript for this location, because the same marker
271 # can be mapped to multiple locations, and these locations must be distinguishable
272 $loc->subscript($subscript);
274 # this method call represents the insertion into the
275 # marker_experiment table. this is currently a troublesome
276 # issue because this marker was probably mapped here via a
277 # PCR or RFLP experiment. where is this experiment data?
278 # well, it's in another spreadsheet, to be loaded later,
279 # or maybe it has already been loaded. if it was previously
280 # loaded, it was matched up with an old map version. how can we
281 # match that existing PCR/RFLP data up with this new map
282 # version? well, it will have to be done later by some other script.
284 $marker->add_experiment({location
=>$loc,protocol
=>$protocol});
287 # store whatever new data you have (in this case, the new data
288 # is the location we just assigned the marker)
289 my $inserts = $marker->store_new_data();
292 # if any data was inserted for this marker (presumably it was,
293 # since we're adding locations on a brand new map version)
294 if ($inserts and @
{$inserts}) {
295 print "New marker data inserted:\n".Dumper
($inserts);
296 print $loc->as_string();
299 else { die "Oops, I thought I was inserting some new data" }
302 # deprecate the old map version and make the new one we just made the current one
303 $new_map_version->set_current();
310 print "Failed; rolling back.\n";
316 print"Committing.\n";
320 print"Rolling back.\n";