cv for average = 0
[sgn.git] / cgi-bin / sequencing / agp.pl
blob11735a7b576537cbb9d554a228bd404ee1f16470
1 use strict;
3 use File::Temp qw//;
4 use File::Spec;
6 use Bio::DB::GenBank;
8 use CXGN::TomatoGenome::BACPublish qw/tpf_agp_files/;
9 use CXGN::BioTools::AGP qw/agp_parse agp_write/;
10 use CXGN::Genomic::Clone;
11 use CXGN::Genomic::CloneIdentifiers qw/parse_clone_ident/;
12 #use CXGN::Login; # does not seem to be used
13 use CXGN::Page;
14 use CXGN::Page::FormattingHelpers qw( info_section_html
15 page_title_html
16 columnar_table_html
17 info_table_html
18 modesel
19 html_break_string
21 use CXGN::People;
22 use CXGN::Publish qw/publish/;
23 use CXGN::Tools::List qw/str_in/;
26 #tpf_agp is in the current directory
27 use CXGN::TomatoGenome::tpf_agp qw(
28 format_validation_report
29 filename
30 published_ftp_download_links
31 tabdelim_to_html
32 tabdelim_to_array
33 modtime_string
36 my $page = CXGN::Page->new("AGP Display","Rob");
38 #get input arguments and validate
39 my ($chr) = $page->get_arguments(qw/chr/);
40 $chr += 0; #force chr to be numeric
41 $chr ||= 1;
43 #if we have a file upload, validate it according to its type and store it
44 if(my $upload = $page->get_upload ) {
45 #get the active login
46 # my $sp_person_id = CXGN::Login->new()->verify_session()
47 # or $page->error_page("Login ID not found");
48 # my $sp = CXGN::People::Person->new($sp_person_id);
49 # $sp->get_username
50 # or $page->error_page("Username not found.","","", "Username not found for sp_person_id '$sp_person_id'");
52 # #check that the login has permission to upload for this chromosome
53 # unless($sp->get_user_type eq 'curator'
54 # || grep {$_ == $chr && $_ <= 12 && $_ >= 1} $sp->get_projects_associated_with_person) {
55 # $page->error_page('your account does not have privileges to upload files for that chromosome');
56 # }
58 my $validation_report = validate_and_publish_agp($upload->fh,$chr);
59 $page->message_page('AGP validation failed',$validation_report) if $validation_report;
63 $page->header('Assembly Display',"Chromosome Assemblies");
64 print <<EOHTML;
65 <p>The listings below have been developed and submitted to SGN by the individual <a href="/about/tomato_sequencing.pl">tomato chromosome sequencing projects</a>.</p>
66 <p>The AGP file describes the current best known sequence assembly of this chromosome. Descriptions of this file format are available <a href="http://www.ncbi.nlm.nih.gov/genome/guide/Assembly/AGP_Specification.html">from NCBI</a> and <a href="http://genome.ucsc.edu/goldenPath/datorg.html">from UCSC</a>.</p>
67 <p>These files are also available for download <a href="ftp://ftp.sgn.cornell.edu/tomato_genome/agp">via FTP</a>.</p>
68 EOHTML
70 print qq|<div style="text-align: center; font-weight: bold; margin-bottom: 0.5em">Chromosome</div>\n|;
71 print modesel( [map {["?chr=$_",$_]} 1..12], $chr-1);
72 print "<br />\n";
74 #now show the current tpf and agp files
75 my (undef,$published_agp) = tpf_agp_files($chr);
76 my $agp_modtime = modtime_string($published_agp);
77 my (undef,$published_agp_ftp) = published_ftp_download_links($chr);
78 print info_section_html( title => 'Accessioned Golden Path (AGP)',
79 subtitle => "$agp_modtime $published_agp_ftp",
80 contents => agp_to_html($published_agp) || <<EOHTML,
81 <center><b>No AGP file is currently available for chromosome $chr</b></center>
82 EOHTML
84 print info_section_html( title => "Other Chromosome $chr Resources",
85 contents => <<EOHTML,
86 <ul>
87 <li>Tomato maps: <a href="/cview/index.pl">Genetic</a> | <a href="/cview/map.pl?map_id=9&amp;physical=1">Physical</a> | <a href="/cview/map.pl?map_id=13">BAC FISH results</a></li>
88 <li><a href="/organism/Solanum_lycopersicum/clone_sequencing?chr=$chr">Chromosome $chr BAC list</a></li>
89 <li><a href="tpf.pl?chr=$chr">Chromosome $chr Tiling Path (TPF)</a></li>
90 </ul>
91 EOHTML
93 $page->footer;
96 sub validate_and_publish_agp {
97 my ($agp_fh,$chr) = @_;
99 #warn "we got a '$agp_fh', which refs to ".ref($agp_fh);
101 #parse the AGP, validate its syntax
102 my @errors;
103 my $lines = agp_parse($agp_fh, validate_syntax => 1, error_array => \@errors )
104 or return format_validation_report(@errors);
106 #now check the report for consistency in other things
107 my $err = sub {
108 my $linerec = shift;
109 push @errors, "<AGP>:$linerec->{linenum}: ".shift;
112 foreach our $l (@$lines) {
113 next if $l->{comment}; #ignore comments for this step
115 my $err = sub {
116 push @errors, "<AGP>:$l->{linenum}: ".shift;
119 #check that we have the correct object identifier
120 $l->{objname} eq "S.lycopersicum-chr$chr"
121 or $err->("first column must be 'S.lycopersicum-chr$chr', not '$l->{objname}'");
123 if(my $ident = $l->{ident}) {
124 my $p = parse_clone_ident($ident,'agi_bac_with_chrom','agi_bac','versioned_bac_seq');
126 #try to treat it as a versioned genbank accession. look it
127 #up, get its clone name, then compare its sequence to the most
128 #recent sequence we have on file for that clone. error if
129 #they don't match
130 my $c = $p ? CXGN::Genomic::Clone->retrieve_from_parsed_name($p)
131 || $err->("Unknown sequence identifier '$ident'")
132 : lookup_and_match_genbank_accession($ident,$err);
134 unless( ref $c ) {
135 $err->("Cannot find a BAC sequence for identifier '$ident'");
136 } else {
137 $c->chromosome_num == $chr
138 or $err->("BAC referenced in identifier '$ident' is not currently assigned to chromosome $chr");
139 my $latest_ver = $c->latest_sequence_version;
140 !$p->{version} || $p->{version} == $latest_ver
141 or $err->("The sequence version for '$ident' differs from the latest sequence version in the SGN database, which is '$latest_ver'.");
143 #rewrite the identifier to the SGN-type identifier
144 $l->{ident} = $c->latest_sequence_name
145 or $err->("The sequence for '$ident' has not yet been loaded into the SGN database. If you have not yet submitted it, please do so before publishing this assembly. If you have already submitted it, please wait a few hours to ensure the loading is completed, then try uploading this file again.");
147 # check that the length of this element is not longer than the element referred to
148 my $seq = $c->seq
149 or $err->("Could not fetch seq for '$ident'");
150 my $seqlen = length($seq);
151 $seqlen >= $l->{length}
152 or $err->("length of this component ($l->{length} bases) is too large to be covered by the sequence specified ($ident, $seqlen bases)");
158 #splat if we have errors
159 @errors
160 and return format_validation_report(@errors);
162 #otherwise, go ahead and rewrite and publish
163 my $tmp = File::Temp->new( UNLINK => 1, SUFFIX => '.agp' );
164 agp_write($lines,$tmp);
165 close $tmp;
167 my $publishing_filename = filename($chr,'agp')
168 or die "no filename to publish to??";
169 publish(['cp', "$tmp", $publishing_filename]);
171 return;
174 sub agp_to_html {
175 my ($filename) = @_;
176 if( $filename && -f $filename ) {
177 return tabdelim_to_html($filename, 'AGP',
178 sub {
179 my ($contents) = @_;
180 if ($contents =~ /^#/) {
181 'color: green'
182 } # elsif ( $contents eq 'N' ) {
183 # ''
184 # } elsif( $contents eq 'yes' ){
185 # 'color: green; font-weight: bold'
186 # } elsif( $contents eq 'F' ) {
187 # 'background: #b0b0e4; font-weight: bold;'
188 # } elsif( $contents eq 'D' ) {
189 # 'background: #bfbfbf'
190 # } elsif( $contents eq 'contig' ) {
191 # 'color: gray'
198 #lookup the given string as a versioned genbank accession, return the
199 #clone object if you can find it in genbank, and it has the same
200 #sequence as the latest sequence we have on file for that bac
201 sub lookup_and_match_genbank_accession {
202 my ($accession,$err) = @_;
204 unless($accession =~ /^[a-z]{2}\d+\.\d+$/i) {
205 $err->("Unknown identifier '$accession'");
206 return;
209 my $gb = Bio::DB::GenBank->new;
210 my $seq = eval{$gb->get_Seq_by_version($accession)}; #this dies if the acc doesn't exist
211 warn $@ if $@;
212 unless($seq) {
213 $err->("'$accession' looks like it might be a GenBank accession, but is not found in GenBank.");
214 return;
217 #find the word in the description line that is our clone identifier
218 my $clone;
219 foreach my $word (split /\s+/,$seq->desc) {
220 if( my $p = parse_clone_ident($word,'agi_bac_with_chrom','agi_bac') ) {
221 $p->{match} = $accession;
222 $clone = CXGN::Genomic::Clone->retrieve_from_parsed_name($p);
223 last;
226 unless($clone) {
227 $err->("Could not match GenBank accession '$accession' to a clone name. Is a clone name included in the DEFINITION field for that record?");
228 return;
231 unless($clone->latest_sequence_name) {
232 $err->("GenBank record for $accession purports to be for SGN BAC identifier ".$clone->clone_name_with_chromosome.", but no sequence has been submitted to SGN for this clone. Please upload it to SGN.");
233 return;
236 #now make sure the clones have the same sequence
237 my @clone_seqs = $clone->seq;
238 # warn "checking our seq ".length($clone_seqs[0])." against their seq ".length($seq->seq);
239 unless(@clone_seqs == 1 && compare_gb_and_sgn_seqs($seq->seq,$clone_seqs[0])) {
240 $err->("Sequence in GenBank for $accession does not match SGN sequence for ".$clone->latest_sequence_name.". Please update either SGN or GenBank such that they match, or use SGN versioned sequence identifiers (e.g. C01HBa0001A01.1).");
241 # warn "seq for $accession is\n".$clone_seqs[0]
244 #if we get here, it all miraculously matched up
245 return $clone;
249 #compare a genbank and an sgn sequence, allowing for X-masked
250 #nucleotides that may be present in the sgn sequence
251 sub compare_gb_and_sgn_seqs {
252 my ($gb_seq,$sgn_seq) = @_;
254 #easy if they're straight equal
255 return 1 if $gb_seq eq $sgn_seq;
257 #otherwise, do a match taking into account the X's in the SGN sequence
259 return 0 unless
260 length($gb_seq) == length($sgn_seq)
261 && $sgn_seq =~ /[Xx]/;
263 $gb_seq = lc $gb_seq;
264 $sgn_seq = lc $sgn_seq;
266 for(my $i=0;$i<length($gb_seq);$i++) {
267 my $gb = substr($gb_seq,$i,1);
268 my $sgn = substr($sgn_seq,$i,1);
269 if($gb ne $sgn && $sgn ne 'x') {
270 return 0;
273 return 1;