8 use CXGN
::TomatoGenome
::BACPublish qw
/tpf_agp_files/;
9 use CXGN
::BioTools
::AGP qw
/agp_parse agp_write/;
10 use CXGN
::Genomic
::Clone
;
11 use CXGN
::Genomic
::CloneIdentifiers qw
/parse_clone_ident/;
12 #use CXGN::Login; # does not seem to be used
14 use CXGN
::Page
::FormattingHelpers
qw( info_section_html
22 use CXGN
::Publish qw
/publish/;
23 use CXGN
::Tools
::List qw
/str_in/;
26 #tpf_agp is in the current directory
27 use CXGN
::TomatoGenome
::tpf_agp
qw(
28 format_validation_report
30 published_ftp_download_links
36 my $page = CXGN
::Page
->new("AGP Display","Rob");
38 #get input arguments and validate
39 my ($chr) = $page->get_arguments(qw
/chr/);
40 $chr += 0; #force chr to be numeric
43 #if we have a file upload, validate it according to its type and store it
44 if(my $upload = $page->get_upload ) {
46 # my $sp_person_id = CXGN::Login->new()->verify_session()
47 # or $page->error_page("Login ID not found");
48 # my $sp = CXGN::People::Person->new($sp_person_id);
50 # or $page->error_page("Username not found.","","", "Username not found for sp_person_id '$sp_person_id'");
52 # #check that the login has permission to upload for this chromosome
53 # unless($sp->get_user_type eq 'curator'
54 # || grep {$_ == $chr && $_ <= 12 && $_ >= 1} $sp->get_projects_associated_with_person) {
55 # $page->error_page('your account does not have privileges to upload files for that chromosome');
58 my $validation_report = validate_and_publish_agp
($upload->fh,$chr);
59 $page->message_page('AGP validation failed',$validation_report) if $validation_report;
63 $page->header('Assembly Display',"Chromosome Assemblies");
65 <p>The listings below have been developed and submitted to SGN by the individual <a href="/about/tomato_sequencing.pl">tomato chromosome sequencing projects</a>.</p>
66 <p>The AGP file describes the current best known sequence assembly of this chromosome. Descriptions of this file format are available <a href="http://www.ncbi.nlm.nih.gov/genome/guide/Assembly/AGP_Specification.html">from NCBI</a> and <a href="http://genome.ucsc.edu/goldenPath/datorg.html">from UCSC</a>.</p>
67 <p>These files are also available for download <a href="ftp://ftp.sgn.cornell.edu/tomato_genome/agp">via FTP</a>.</p>
70 print qq|<div style
="text-align: center; font-weight: bold; margin-bottom: 0.5em">Chromosome
</div
>\n|;
71 print modesel
( [map {["?chr=$_",$_]} 1..12], $chr-1);
74 #now show the current tpf and agp files
75 my (undef,$published_agp) = tpf_agp_files
($chr);
76 my $agp_modtime = modtime_string
($published_agp);
77 my (undef,$published_agp_ftp) = published_ftp_download_links
($chr);
78 print info_section_html
( title
=> 'Accessioned Golden Path (AGP)',
79 subtitle
=> "$agp_modtime $published_agp_ftp",
80 contents
=> agp_to_html
($published_agp) || <<EOHTML,
81 <center><b>No AGP file is currently available for chromosome $chr</b></center>
84 print info_section_html
( title
=> "Other Chromosome $chr Resources",
87 <li>Tomato maps: <a href="/cview/index.pl">Genetic</a> | <a href="/cview/map.pl?map_id=9&physical=1">Physical</a> | <a href="/cview/map.pl?map_id=13">BAC FISH results</a></li>
88 <li><a href="/tomato/genome_data.pl?chr=$chr">Chromosome $chr BACs in Genome Browser</a></li>
89 <li><a href="tpf.pl?chr=$chr">Chromosome $chr Tiling Path (TPF)</a></li>
96 sub validate_and_publish_agp
{
97 my ($agp_fh,$chr) = @_;
99 #warn "we got a '$agp_fh', which refs to ".ref($agp_fh);
101 #parse the AGP, validate its syntax
103 my $lines = agp_parse
($agp_fh, validate_syntax
=> 1, error_array
=> \
@errors )
104 or return format_validation_report
(@errors);
106 #now check the report for consistency in other things
109 push @errors, "<AGP>:$linerec->{linenum}: ".shift;
112 foreach our $l (@
$lines) {
113 next if $l->{comment
}; #ignore comments for this step
116 push @errors, "<AGP>:$l->{linenum}: ".shift;
119 #check that we have the correct object identifier
120 $l->{objname
} eq "S.lycopersicum-chr$chr"
121 or $err->("first column must be 'S.lycopersicum-chr$chr', not '$l->{objname}'");
123 if(my $ident = $l->{ident
}) {
124 my $p = parse_clone_ident
($ident,'agi_bac_with_chrom','agi_bac','versioned_bac_seq');
126 #try to treat it as a versioned genbank accession. look it
127 #up, get its clone name, then compare its sequence to the most
128 #recent sequence we have on file for that clone. error if
130 my $c = $p ? CXGN
::Genomic
::Clone
->retrieve_from_parsed_name($p)
131 || $err->("Unknown sequence identifier '$ident'")
132 : lookup_and_match_genbank_accession
($ident,$err);
135 $err->("Cannot find a BAC sequence for identifier '$ident'");
137 $c->chromosome_num == $chr
138 or $err->("BAC referenced in identifier '$ident' is not currently assigned to chromosome $chr");
139 my $latest_ver = $c->latest_sequence_version;
140 !$p->{version
} || $p->{version
} == $latest_ver
141 or $err->("The sequence version for '$ident' differs from the latest sequence version in the SGN database, which is '$latest_ver'.");
143 #rewrite the identifier to the SGN-type identifier
144 $l->{ident
} = $c->latest_sequence_name
145 or $err->("The sequence for '$ident' has not yet been loaded into the SGN database. If you have not yet submitted it, please do so before publishing this assembly. If you have already submitted it, please wait a few hours to ensure the loading is completed, then try uploading this file again.");
147 # check that the length of this element is not longer than the element referred to
149 or $err->("Could not fetch seq for '$ident'");
150 my $seqlen = length($seq);
151 $seqlen >= $l->{length}
152 or $err->("length of this component ($l->{length} bases) is too large to be covered by the sequence specified ($ident, $seqlen bases)");
158 #splat if we have errors
160 and return format_validation_report
(@errors);
162 #otherwise, go ahead and rewrite and publish
163 my $tmp = File
::Temp
->new( UNLINK
=> 1, SUFFIX
=> '.agp' );
164 agp_write
($lines,$tmp);
167 my $publishing_filename = filename
($chr,'agp')
168 or die "no filename to publish to??";
169 publish
(['cp', "$tmp", $publishing_filename]);
176 if( $filename && -f
$filename ) {
177 return tabdelim_to_html
($filename, 'AGP',
180 if ($contents =~ /^#/) {
182 } # elsif ( $contents eq 'N' ) {
184 # } elsif( $contents eq 'yes' ){
185 # 'color: green; font-weight: bold'
186 # } elsif( $contents eq 'F' ) {
187 # 'background: #b0b0e4; font-weight: bold;'
188 # } elsif( $contents eq 'D' ) {
189 # 'background: #bfbfbf'
190 # } elsif( $contents eq 'contig' ) {
198 #lookup the given string as a versioned genbank accession, return the
199 #clone object if you can find it in genbank, and it has the same
200 #sequence as the latest sequence we have on file for that bac
201 sub lookup_and_match_genbank_accession
{
202 my ($accession,$err) = @_;
204 unless($accession =~ /^[a-z]{2}\d+\.\d+$/i) {
205 $err->("Unknown identifier '$accession'");
209 my $gb = Bio
::DB
::GenBank
->new;
210 my $seq = eval{$gb->get_Seq_by_version($accession)}; #this dies if the acc doesn't exist
213 $err->("'$accession' looks like it might be a GenBank accession, but is not found in GenBank.");
217 #find the word in the description line that is our clone identifier
219 foreach my $word (split /\s+/,$seq->desc) {
220 if( my $p = parse_clone_ident
($word,'agi_bac_with_chrom','agi_bac') ) {
221 $p->{match
} = $accession;
222 $clone = CXGN
::Genomic
::Clone
->retrieve_from_parsed_name($p);
227 $err->("Could not match GenBank accession '$accession' to a clone name. Is a clone name included in the DEFINITION field for that record?");
231 unless($clone->latest_sequence_name) {
232 $err->("GenBank record for $accession purports to be for SGN BAC identifier ".$clone->clone_name_with_chromosome.", but no sequence has been submitted to SGN for this clone. Please upload it to SGN.");
236 #now make sure the clones have the same sequence
237 my @clone_seqs = $clone->seq;
238 # warn "checking our seq ".length($clone_seqs[0])." against their seq ".length($seq->seq);
239 unless(@clone_seqs == 1 && compare_gb_and_sgn_seqs
($seq->seq,$clone_seqs[0])) {
240 $err->("Sequence in GenBank for $accession does not match SGN sequence for ".$clone->latest_sequence_name.". Please update either SGN or GenBank such that they match, or use SGN versioned sequence identifiers (e.g. C01HBa0001A01.1).");
241 # warn "seq for $accession is\n".$clone_seqs[0]
244 #if we get here, it all miraculously matched up
249 #compare a genbank and an sgn sequence, allowing for X-masked
250 #nucleotides that may be present in the sgn sequence
251 sub compare_gb_and_sgn_seqs
{
252 my ($gb_seq,$sgn_seq) = @_;
254 #easy if they're straight equal
255 return 1 if $gb_seq eq $sgn_seq;
257 #otherwise, do a match taking into account the X's in the SGN sequence
260 length($gb_seq) == length($sgn_seq)
261 && $sgn_seq =~ /[Xx]/;
263 $gb_seq = lc $gb_seq;
264 $sgn_seq = lc $sgn_seq;
266 for(my $i=0;$i<length($gb_seq);$i++) {
267 my $gb = substr($gb_seq,$i,1);
268 my $sgn = substr($sgn_seq,$i,1);
269 if($gb ne $sgn && $sgn ne 'x') {