a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / CXGN / Tools / Identifiers.pm
blobeb80bb1e60cbceed8713714549ed71aee055eef4
1 package CXGN::Tools::Identifiers;
2 use strict;
3 use warnings;
5 =head1 NAME
7 CXGN::Tools::Identifiers - useful functions for dealing with
8 identifiers, like SGN-E23412
10 =head1 SYNOPSIS
12 my $nsname = identifier_namespace('SGN-E23412');
13 #returns 'sgn_e'
15 my $url = identifier_url('SGN-E23412');
16 #$url is now '/search/est.pl?request_type=7&request=575150'
18 my $link = link_identifier('SGN-E575150');
19 #$link is now
20 #'<a href="/search/est.pl?request_type=7&request=575150">SGN-E575150</a>'
22 my $clean = clean_identifier('SGNE3423');
23 #returns SGN-E3423, or undef if the identifier was not recognized
25 my $contents = parse_identifier('SGN-E12345');
27 =head1 DESCRIPTION
29 This module contains easy-to-use functions for working with the often
30 malformed strings of text that purport to be identifiers of some sort.
32 Note that support for identification of SGN marker names like 'TG23' has been removed. The only marker identifiers supported now are SGN-M type identifiers.
35 =head2 Supported Namespaces
37 =over 12
39 =item sgn_u
41 SGN unigene identifiers 'SGN-U2342'
43 =item cgn_u
45 CGN unigene identifers 'CGN-U122539'
47 =item sgn_e
49 SGN EST identifiers 'SGN-E234223'
51 =item sgn_s
53 SGN Microarray spot identifiers 'SGN-S1241'
55 =item sgn_m
57 SGN Marker identifier 'SGN-M1347'
59 =item sgn_t
61 SGN Trace (chromatogram) identifiers 'SGN-T1241'
63 =item sgn_locus_sequence
65 An SGN locus sequence identifier like 'Potato_SGNlocusID_2206_AM040153'
66 which is parsed as { id => 2206, species => 'Potato', ext_id => 'AM040153' }
68 =item microarray_spot
70 microarray spot identifiers like '1-1-1.2.3.4'
72 =item est
74 other kinds of EST identifiers like 'cLEC-23-A23'
76 =item bac_end
78 BAC end identifiers like 'LE_HBa0123A12_SP6_2342'
80 =item bac
82 BAC identifiers like 'LE_HBa0123A12'
84 By default, if a BAC has been assigned to a sequencing
85 project, its clean_identifier and link_identifier will
86 replace the LE_ or SL_ species name at the beginning
87 with a C##, where ## is the zero-padded chromosome number.
89 $CXGN::Tools::Identifier::insert_bac_chr_nums can be used
90 to enable or disable this. To disable, set it to a
91 false value. To enable, set true. Defaults to true.
93 =item bac_sequence
95 BAC sequence identifiers like 'LE_HBa0123A12.1'
97 =item bac_fragment
99 BAC fragment identifiers (identifies contigs in a bac that is
100 still partially assembled) like 'LE_HBa0123A12.1-4'
102 =item tomato_bac_contig
104 A tomato BAC contig, e.g. C12.4_contig1
106 =item generic_scaffold
108 A generic scaffold identifier, e.g. scaffold12345
110 =item tair_locus
112 TAIR locus identifiers like 'At1g67700.1'
114 =item species_binomial
116 e.g. 'Arabidopsis thaliana', 'Solanum lycopersicum'
118 =item genbank_gi
120 A genbank identifier containing a stable GI identification number.
121 Examples include gi|108883260|gb|EAT47485.1| or just gi|108883260|
122 or GI:108883260
124 =item genbank_accession
126 A genbank identifier denoting a sequence accession, containing a
127 namespace and an identifier, such as gb|EAT47485.1|.
129 =item genbank_cdd
131 A genbank identifier denoting an entry in the NCBI Conserved Domain
132 Database (CDD).
134 =item swissprot_accession
136 A uniprot identifier composed by 'sp'+pipe+uniprot_accession+pipe+entry_name
137 for example sp|P22142|NDUS2_NEUCR means accession=P22142 and entry_name=NDUS2_NEUCR
138 (sp means swissprot)
140 =item uniref_accession
142 A uniprot accession is contains a combination of number and letters,
143 for example: UniRef90_P22142, the accsssion will be P22142
145 =back
147 =cut
149 ############ NAMESPACE DEFINITIONS ############
150 # To add a namespace:
151 # 1. make is_<namespace>, url_<namespace>, and clean_<namespace>
152 # functions for your namespace at the end of this file
153 # 2. add its name to @namespace_list below
155 #NOTE: the ordering of this list is the order in which a given
156 #identifier is checked for membership in each class
158 # removed sgn_marker from this list because it always returns
159 # a valid identifier
160 our @namespace_list = qw/
161 sgn_u
162 cgn_u
163 sgn_e
164 sgn_s
165 sgn_m
166 sgn_t
167 sgn_locus_sequence
168 microarray_spot
170 bac_end
171 bac_fragment
172 bac_sequence
174 tomato_bac_contig
175 generic_scaffold
176 tair_gene_model
177 tair_locus
178 go_term
179 interpro_accession
180 swissprot_accession
181 uniref_accession
182 genbank_gi
183 species_binomial
184 genbank_cdd
185 genbank_accession
188 #return 1 if the given namespace is in this list
189 sub _is_valid_namespace {
190 my ($ns) = @_;
191 return 1 if grep {$ns eq $_} @namespace_list;
192 return 0;
195 =head1 FUNCTIONS
197 All functions are EXPORT_OK.
199 =cut
201 # ABOUT THE ARCHITECTURE OF THIS MODULE
203 # each namespace supported by this module
204 # has:
205 # a.) an entry in @namespace_list
206 # b.) a is_<namespace> function
207 # c.) a url_<namespace> function
209 # All of these are at the bottom of this file,
210 # where it says NAMESPACE DEFINITIONS.
212 use Carp;
213 use Tie::UrlEncoder;
214 our %urlencode;
216 BEGIN {
217 our @EXPORT_OK = qw/ identifier_url
218 link_identifier
219 identifier_namespace
220 clean_identifier
221 list_namespaces
222 parse_identifier
223 unique_identifier
226 use base qw/Exporter/;
228 use CXGN::Genomic::Clone;
229 use CXGN::Genomic::CloneIdentifiers qw/ parse_clone_ident assemble_clone_ident /;
230 use CXGN::Genomic::GSS;
231 #use CXGN::Marker::Tools qw/clean_marker_name marker_name_to_ids/;
232 use CXGN::Tools::Text qw/trim/;
235 =head2 identifier_url
237 Usage: my $url = identifier_url('SGN-E12141');
238 Desc : get an information URL for an identifier.
239 Ret : a string containing an absolute or relative URL,
240 suitable for putting in a href= in HTML
241 Args : identifier string,
242 (optional) namespace name if you know it
243 Side Effects: might look things up in the database
245 =cut
247 sub identifier_url {
248 my ($ident,$ns) = @_;
249 $ident = trim($ident);
250 $ns ||= identifier_namespace($ident)
251 or return;
252 return unless _is_valid_namespace($ns);
253 #clean up the identifier if we can
254 $ident = clean_identifier($ident,$ns) || $ident;
255 no strict 'refs';
256 return "url_$ns"->($ident);
259 =head2 link_identifier
261 Usage: my $link = link_identifier('SGN-E575150');
262 #returns '<a href="/search/est.pl?request_type=7&request=575150">SGN-E575150</a>'
263 Desc : calls identifier_url() to get a URL for your identifier,
264 then returns a complete HTML link to you, like
265 Ret : an html link, or undef if the link could not be made
266 Args : single string containing an identifier,
267 (optional) namespace name if you know it
268 Side Effects: might look things up in the database
270 =cut
272 sub link_identifier {
273 my ($ident,$ns) = @_;
274 $ident = trim($ident);
275 $ns ||= identifier_namespace($ident)
276 or return;
277 $ident = clean_identifier($ident,$ns) || $ident;
278 my $url = identifier_url($ident,$ns)
279 or return;
280 #clean up the identifier if we can
281 my $fns = ucfirst join ' ',split /_/,$ns;
282 return qq|<a title="search for this $fns" href="$url">$ident</a>|;
285 =head2 identifier_namespace
287 Usage: my $ns = identifier_namespace('SGN-U1231');
288 #returns 'sgn_u'
289 Desc : get the namespace
290 Ret : a string containing the name of the namespace,
291 or undef if it cannot identify the namespace
292 Args : a string identifier
293 Side Effects: might look things up in the database
295 =cut
297 #see bottom for namespace definitions
298 sub identifier_namespace {
299 my ($identifier) = @_;
300 $identifier = trim($identifier)
301 or return;
302 #identifiers have to be more than 2 chars, and they can't be all numbers
303 length($identifier) > 2 && $identifier =~ /\D/
304 or return;
305 foreach my $ns (our @namespace_list) {
306 no strict 'refs';
307 return $ns if "is_$ns"->($identifier);
308 #warn "$identifier is not in $ns\n";
310 return;
313 =head2 clean_identifier
315 Usage: my $newident = clean_identifier('SGNE1231');
316 Desc : attempt to guess the namespace of the identifier,
317 and clean up any irregularities in it to put
318 it in its canonical form
319 Ret : a cleaned string, or undef if the identifier
320 is not in any recognized namespace
321 Args : identifier to be cleaned
322 Side Effects: may look things up in the database
324 =cut
326 sub clean_identifier {
327 my ($ident,$ns) = @_;
328 $ident = trim($ident);
329 $ns ||= identifier_namespace($ident)
330 or return;
331 return unless _is_valid_namespace($ns);
332 no strict 'refs';
333 return "clean_$ns"->($ident);
336 =head2 list_namespaces
338 Usage: my @namespaces = list_namespaces;
339 Desc : get the list of namespace names supported by this module
340 Ret : list of valid namespace names
341 Args : none
342 Side Effects: none
344 =cut
346 sub list_namespaces {
347 return @namespace_list;
351 =head2 parse_identifier
353 Usage: my $data = parse_identifier($identifier, $namespace );
354 Desc : many identifiers have data in them, for example, an SGN-E has the EST id
355 in it, and a bac name (LE_HBa0001A02) has the organism, library, plate,
356 row, and column in it. This function parses that data out and gives it
357 to you, as it appears in the string. You might consider running
358 clean_identifier() on what you give this function.
359 Args : identifier to parse,
360 optional list of namespace names it could be a member of,
361 guesses the namespace if not provided
362 Ret : nothing if the identifier could not be parsed,
363 otherwise a hashref of data in the identifier, which varies in its
364 contents, looking like
365 { namespace => 'namespace_name',
366 <other data in the identifier>
368 Side Effects: none
369 Example:
371 my $data = parse_identifier('C03HBa0001A02');
372 #and now $data contains
373 $data = { namespace => 'bac',
374 lib => 'LE_HBa',
375 plate => 1,
376 row => 'A',
377 col => 2,
378 clonetype => 'bac',
379 match => 'C03HBa0001A02',
380 chr => 3,
383 =cut
385 sub parse_identifier {
386 my ($ident, $ns ) = @_;
388 $ident = trim( $ident);
389 $ns ||= identifier_namespace($ident)
390 or return;
392 return unless _is_valid_namespace($ns);
394 no strict 'refs';
395 my $p = "parse_$ns"->($ident)
396 or return;
397 $p->{namespace} = $ns;
398 return $p;
401 =head1 NAMESPACE FUNCTIONS
403 These functions are not exported, and
404 are only used internally by this module.
405 To add a namespace, follow the instructions inside this
406 file (they are in comments, not POD).
408 =cut
410 #for instructions, see ABOUT THE ARCHITECTURE OF THIS MODULE above
412 =head2 is_E<lt>namespaceE<gt>
414 Usage: is_sgn_e('SGN-E2342');
415 Desc : check if an identifier is in a given namespace
416 Ret : 1 if the given identifier is in that namespace,
417 0 otherwise
418 Args : identifier string
419 Side Effects: may look up things in the database
421 =head2 url_E<lt>namespaceE<gt>
423 Usage: url_sgn_e('SGN-E2342');
424 Desc : get the info URL for a given identifier,
425 Ret : string with the URL, or undef if no
426 url is available for this identifier
427 Args : identifier string
428 Side Effects: may look up things in the database
430 NOTE: These functions will ONLY be called if it has already
431 been determined that the identifier is in that namespace.
433 =head2 clean_E<lt>namespaceE<gt>
435 Usage: my $clean = clean_sgn_e('sgne12311');
436 #returns 'SGN-E12311'
437 Desc : clean up any irregularities in the identifier string
438 Ret : cleaned up identifier string. Should never fail,
439 since this function will only be called on identifiers
440 that are definitely in that namespace.
441 Args : identifier string
442 Side Effects: may look things up in the database
444 NOTE: These functions will ONLY be called if it has already
445 been determined that the identifier is in that namespace.
446 If your is_<namespace> function says it's that type of
447 identifier, your clean_<namespace> function had better
448 be able to clean it.
450 =cut
452 ######## sgn_u
453 sub is_sgn_u {
454 is_letter_identifier('sgn','u',shift);
456 sub url_sgn_u {
457 "/search/unigene.pl?unigene_id=".$urlencode{uc($_[0])};
459 sub clean_sgn_u {
460 clean_letter_identifier('sgn','u',shift);
462 sub parse_sgn_u {
463 parse_letter_identifier('sgn','u',shift);
465 ######## cgn_u
466 sub is_cgn_u {
467 is_letter_identifier('cgn','u',shift);
469 sub url_cgn_u {
470 my ($cgnid) = shift =~ /(\d+)/ or return undef;
471 return "/search/unigene.pl?unigene_id=CGN-U$cgnid";
473 sub clean_cgn_u {
474 clean_letter_identifier('cgn','u',shift);
476 sub parse_cgn_u {
477 parse_letter_identifier('cgn','u',shift);
479 ######### sgn_e
480 sub is_sgn_e {
481 is_letter_identifier('sgn','e',shift);
483 sub url_sgn_e {
484 "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=automatic&search=Search";
486 sub clean_sgn_e {
487 clean_letter_identifier('sgn','e',shift);
489 sub parse_sgn_e {
490 parse_letter_identifier('sgn','e',shift);
492 ######### sgn_s
493 sub is_sgn_s {
494 is_letter_identifier('sgn','s',shift);
496 sub url_sgn_s {
497 "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=14&search=Search";
499 sub clean_sgn_s {
500 clean_letter_identifier('sgn','s',shift);
502 sub parse_sgn_s {
503 parse_letter_identifier('sgn','s',shift);
505 ######### sgn_m
506 sub is_sgn_m {
507 is_letter_identifier('sgn','m',shift);
509 sub url_sgn_m {
510 my $id = shift;
511 $id =~ s/sgn.*m(\d+)$/$1/i;
512 return "/search/markers/markerinfo.pl?marker_id=$urlencode{$id}";
514 sub clean_sgn_m {
515 clean_letter_identifier('sgn','m',shift);
517 sub parse_sgn_m {
518 parse_letter_identifier('sgn','m',shift);
520 ######### sgn_t
521 sub is_sgn_t {
522 is_letter_identifier('sgn','t',shift);
524 sub url_sgn_t {
525 "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=9&search=Search";
527 sub clean_sgn_t {
528 clean_letter_identifier('sgn','t',shift);
530 sub parse_sgn_t {
531 parse_letter_identifier('sgn','t',shift);
533 ######### sgn_locus_sequence
534 sub is_sgn_locus_sequence {
535 return 1 if shift =~ /[a-z]+_SGNlocusID_\d+_.+/i;
536 return 0;
538 sub url_sgn_locus_sequence {
539 my ($id) = shift =~ /SGNlocusID_(\d+)/i;
540 return "/phenome/locus_display.pl?locus_id=$id";
542 sub clean_sgn_locus_sequence {
543 my $p = parse_sgn_locus_sequence(shift);
544 return join'_',$p->{species},'SGNlocusID',$p->{id},$p->{ext_id};
546 sub parse_sgn_locus_sequence {
547 my ($species,undef,$id,$extid) = split /_/,shift,4;
548 return { species => ucfirst($species),
549 id => $id+0,
550 ext_id => $extid,
553 ######### microarray_spot
554 sub is_microarray_spot {
555 return 1 if shift =~ /^\d-\d-\d+\.\d+\.\d+\.\d+$/;
556 return 0;
558 sub url_microarray_spot {
559 "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=14&search=Search";
561 sub clean_microarray_spot {
562 shift; #no cleaning is done here
564 sub parse_microarray_spot {
565 warn 'WARNING: parsing not yet implemented for microarray_spot';
566 return;
568 ######### est
569 sub is_est {
570 # XXX: stupid stupid stupid. Coffee clones have names like
571 # ccc<garbagegarbagegarbage>.
572 return 0 if $_[0] =~ m|^ccc|i;
573 return 1 if $_[0] =~ /^(c[A-Z]{2,3}|TUS)[^A-Z\d]*[0-9]+[^A-Z\d]*[A-P][^A-Z\d]*[0-9]{1,2}$/i;
574 return 0;
576 sub url_est {
577 "/search/est.pl?request_from=0&request_id=$urlencode{$_[0]}&request_type=automatic";
579 sub clean_est {
580 my $ident = shift;
581 $ident = uc($ident);
582 $ident =~ s/^C/c/;
584 if ($ident =~ /^([A-Z]{3,4})[^A-Z\d]*([0-9]+)[^A-Z\d]*([A-P])[^A-Z\d]*([0-9]{1,2})$/i) {
585 $ident = "$1-$2-$3$4";
587 return $ident;
589 sub parse_est {
590 warn 'WARNING: parsing not yet implemented for est';
591 return;
593 ######### bac_end
594 sub is_bac_end {
595 my $parsed = parse_clone_ident(shift,'bac_end')
596 or return 0;
598 return 1;
600 sub url_bac_end {
601 my $ident = shift;
602 my $parsed = parse_clone_ident($ident,'bac_end')
603 or confess 'not a valid bac end name';
604 return "/maps/physical/clone_read_info.pl?chrid=$parsed->{chromat_id}";
606 sub clean_bac_end {
607 my $ident = shift;
608 my $parsed = parse_clone_ident($ident,'bac_end')
609 or confess 'not a valid bac end name';
610 my $gss = CXGN::Genomic::GSS->retrieve_from_parsed_name($parsed)
611 or confess "could not fetch gss for ident '$ident'";
612 return $gss->external_identifier;
614 sub parse_bac_end {
615 parse_clone_ident(shift,'bac_end');
617 #bac
618 sub is_bac {
619 my ($ident) = @_;
620 my $parsed = parse_clone_ident($ident,qw/agi_bac agi_bac_with_chrom old_cornell sanger_bac/)
621 or return 0;
622 #must match the whole identifier, cause we sometimes tack on
623 #things to the ends of the names
624 return 0 unless $parsed->{match} eq $ident and !defined($parsed->{version}) and !defined($parsed->{fragment});
625 my $clone = _bac_cache($parsed)
626 or return 0;
627 return 1;
629 sub _bac_cache {
630 #single-element cache of the last BAC ident we returned. this speeds
631 #up runs of multiple queries for the same bac
632 my ($parsed) = @_;
633 our $last_key;
634 our $last_clone;
635 my $key = join(',',@{$parsed}{qw/lib plate row col clonetype/});
636 if($last_key && $last_key eq $key) {
637 # warn "cache hit $key\n";
638 return $last_clone;
639 } else {
640 # warn "cache miss $key\n";
641 $last_key = $key;
642 return $last_clone = CXGN::Genomic::Clone->retrieve_from_parsed_name($parsed);
645 sub url_bac {
646 my ($ident) = @_;
647 my $parsed = parse_clone_ident($ident,qw/agi_bac agi_bac_with_chrom old_cornell sanger_bac/)
648 or return undef;
649 #must match the whole identifier, cause we sometimes tack on
650 #things to the ends of the names
651 return undef unless $parsed->{match} eq $ident and !defined($parsed->{version}) and !defined($parsed->{fragment});
652 my $clone = _bac_cache($parsed)
653 or return undef;
654 return "/maps/physical/clone_info.pl?id=".$clone->clone_id;
656 our $insert_bac_chr_nums = 1;
657 sub clean_bac {
658 my ($ident) = @_;
659 my $parsed = parse_clone_ident($ident,qw/agi_bac agi_bac_with_chrom old_cornell sanger_bac/)
660 or return undef;
661 #must match the whole identifier, cause we sometimes tack on
662 #things to the ends of the names
663 return undef unless $parsed->{match} eq $ident and !defined($parsed->{version}) and !defined($parsed->{fragment});
664 my $clone = _bac_cache($parsed)
665 or return undef;
666 return (our $insert_bac_chr_nums) && $clone->chromosome_num
667 ? $clone->clone_name_with_chromosome : $clone->clone_name;
670 sub parse_bac {
671 parse_clone_ident(shift,qw/agi_bac agi_bac_with_chrom old_cornell sanger_bac/);
674 #tomato_bac_contig
675 sub is_tomato_bac_contig {
676 my ($ident) = @_;
677 return 1 if $ident =~ /^C\d+\.\d+[^a-z\d]?contig\d+$/i;
678 return 0;
680 sub url_tomato_bac_contig {
681 my ($ident) = @_;
682 return;
684 sub clean_tomato_bac_contig {
685 my ($ident) = @_;
686 $ident = uc $ident;
687 $ident =~ s/CONTIG/contig/;
688 $ident =~ s/[^a-z\d]?contig/_contig/;
689 return $ident;
691 sub parse_tomato_bac_contig {
692 my ($ident) = @_;
693 $ident =~ /^C(\d+)\.(\d+)[^a-z\d]?contig(\d+)$/i
694 or return;
695 return { chr => $1+0,
696 chr_ver => $2+0,
697 ver => $2+0,
698 ctg_num => $3+0,
701 #generic_scaffold
702 sub is_generic_scaffold {
703 return 1 if $_[0] =~ /^scaffold\d+$/i;
704 return 0;
706 sub url_generic_scaffold {
707 my ($ident) = @_;
708 return;
710 sub clean_generic_scaffold {
711 my ($ident) = @_;
712 my ($d) = $ident =~ /(\d+)/;
713 $d ||= 0;
714 return 'scaffold'.($d+0);
716 sub parse_generic_scaffold {
717 my ($ident) = @_;
718 $ident =~ /^scaffold(\d+)$/i
719 or return;
720 return { scaffold_num => $1 };
722 #bac_sequence
723 sub is_bac_sequence {
724 my ($ident) = @_;
725 my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
726 or return 0;
727 #must match the whole identifier, cause we sometimes tack on
728 #things to the ends of the names
729 return 0 unless $parsed->{match} eq $ident and defined($parsed->{version});
730 my $clone = _bac_cache($parsed)
731 or return 0;
732 return 1;
734 sub url_bac_sequence {
735 my ($ident) = @_;
736 my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
737 or return;
738 # use Data::Dumper;
739 # die "$ident -> ",Dumper($parsed);
740 #must match the whole identifier, cause we sometimes tack on
741 #things to the ends of the names
742 return unless $parsed->{match} eq $ident and defined($parsed->{version});
743 my $clone = _bac_cache($parsed)
744 or return;
745 return "/maps/physical/clone_info.pl?id=".$clone->clone_id;
747 sub clean_bac_sequence {
748 my ($ident) = @_;
749 my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
750 or return;
751 #must match the whole identifier, cause we sometimes tack on
752 #things to the ends of the names
753 return undef unless $parsed->{match} eq $ident and defined($parsed->{version});
754 my $clone = _bac_cache($parsed)
755 or return;
756 return assemble_clone_ident('versioned_bac_seq',$parsed);
758 sub parse_bac_sequence {
759 parse_clone_ident(shift,qw/versioned_bac_seq/);
761 #bac_fragment
762 sub is_bac_fragment {
763 my ($ident) = @_;
764 my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
765 or return 0;
766 #must match the whole identifier, cause we sometimes tack on
767 #things to the ends of the names
768 return 0 unless $parsed->{match} eq $ident and defined($parsed->{version}) and defined($parsed->{fragment});
769 my $clone = _bac_cache($parsed)
770 or return 0;
771 return 1;
773 sub url_bac_fragment {
774 my ($ident) = @_;
775 my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
776 or return;
777 # use Data::Dumper;
778 # die "$ident -> ",Dumper($parsed);
779 #must match the whole identifier, cause we sometimes tack on
780 #things to the ends of the names
781 return unless $parsed->{match} eq $ident and defined($parsed->{version}) and defined($parsed->{fragment});
782 my $clone = _bac_cache($parsed)
783 or return;
784 return "/maps/physical/clone_info.pl?id=".$clone->clone_id;
786 sub clean_bac_fragment {
787 my ($ident) = @_;
788 my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
789 or return;
790 #must match the whole identifier, cause we sometimes tack on
791 #things to the ends of the names
792 return undef unless $parsed->{match} eq $ident and defined($parsed->{version}) and defined($parsed->{fragment});
793 my $clone = _bac_cache($parsed)
794 or return;
795 return assemble_clone_ident(versioned_bac_seq => $parsed);
797 sub parse_bac_fragment {
798 parse_clone_ident(shift,qw/ versioned_bac_seq / );
800 #tair_locus
801 sub is_tair_locus {
802 return 1 if shift =~ /^AT[1-5MC]G\d{5}$/i;
803 return 0;
805 sub url_tair_locus {
806 my ($locusname) = @_;
807 # $locusname =~ s/\.\d+$//;
808 "http://arabidopsis.org/servlets/TairObject?type=locus&name=$urlencode{$locusname}"
810 sub clean_tair_locus {
811 my $ident = shift;
812 $ident =~ s/^at/At/i; #properly capitalize the first at
813 return $ident;
815 sub parse_tair_locus {
816 my ($ident) = @_;
817 warn 'WARNING: parsing not yet implemented for TAIR locus';
818 return;
820 #tair_gene_model
821 sub is_tair_gene_model {
822 return 1 if shift =~ /^AT[1-5MC]G\d{5}\.\d+$/i;
823 return 0;
825 sub url_tair_gene_model {
826 my $name = shift;
827 "http://arabidopsis.org/servlets/TairObject?type=gene&name=$urlencode{$name}"
829 sub clean_tair_gene_model {
830 my $ident = shift;
831 $ident =~ s/^at/At/i; #properly capitalize the first at
832 return $ident;
834 sub parse_tair_gene_model {
835 warn 'WARNING: parsing not yet implemented for tair gene model';
836 return;
838 #species binomial
839 sub is_species_binomial {
840 return 1 if shift =~ /^[a-z]+ [a-z]+$|^[a-z]\.\s*[a-z]+$/i;
841 return 0;
843 sub url_species_binomial {
844 _wikipedia_link(@_);
846 sub clean_species_binomial {
847 my $ident = shift;
848 $ident =~ s/\.(?=\S)/\. /g;
849 $ident =~ s/\s+/ /g;
850 $ident = lc($ident);
851 $ident = ucfirst($ident);
852 return $ident;
854 sub parse_species_binomial {
855 my ($ident) = @_;
856 my @w = split qr/\W+/, $ident;
857 return unless @w == 2;
858 return { genus => $w[0],
859 species => $w[1],
862 sub _wikipedia_link {
863 my ($ident) = @_;
864 $ident =~ s/\s+/_/g;
865 return 'http://en.wikipedia.org/wiki/Special:Search/'.$ident;
867 #uniprotKB-swissprot_accession
868 sub is_swissprot_accession {
869 my ($ident) = @_;
870 return 1 if $ident =~ /^sp\|\w+\|\w+_\w+$/i;
872 sub url_swissprot_accession {
873 my ($ident) = @_;
874 $ident = clean_swissprot_accession($ident) or return;
875 return "http://www.uniprot.org/uniprot/" . $ident;
877 sub clean_swissprot_accession {
878 my ($ident) = @_;
879 my $clean;
880 if ($ident =~ m/^sp\|(\w+)\|\w+_\w+$/i) {
881 $clean = $1;
882 } else {
883 $clean=$ident;
885 return $clean;
887 sub parse_swissprot_accession {
888 my ($ident)=@_;
889 unless ($ident =~ m/^sp\|(\w+)\|\w+_\w+$/i) {
890 return { id => $1};
891 } else {
892 return;
896 #uniprot_uniref_accession
897 sub is_uniref_accession {
898 my ($ident) = @_;
899 return 1 if $ident =~ m/^UniRef\d+_\w+$/i;
901 sub url_uniref_accession {
902 my ($ident) = @_;
903 $ident = clean_uniref_accession($ident) or return;
904 return "http://www.uniprot.org/uniprot/".$ident;
906 sub clean_uniref_accession {
907 my ($ident) = @_;
908 my $clean;
909 if ($ident =~ m/^UniRef\d+_(\w+)$/i) {
910 $clean = $1;
911 } else {
912 $clean=$ident;
914 return $clean;
916 sub parse_uniref_accession {
917 my ($ident)=@_;
918 unless ($ident =~ m/^UniRef\d+_(\w+)$/i) {
919 return { id => $1};
920 } else {
921 return;
926 #genbank_gi
927 sub is_genbank_gi {
928 my ($ident) = @_;
929 return 1 if $ident =~ /^gi[\|:]\d+[\|:]?$/i;
930 return 0;
932 sub url_genbank_gi {
933 my ($ident) = @_;
934 $ident = clean_genbank_gi($ident) or return;
935 "http://www.ncbi.nlm.nih.gov/gquery/gquery.fcgi?term=$urlencode{$ident}"
937 sub clean_genbank_gi {
938 my ($ident) = @_;
939 if($ident =~ /^gi[\|:](\d+)$/i) {
940 return "gi|$1|";
941 } else {
942 return clean_genbank($ident)
945 sub parse_genbank_gi {
946 my ($ident) = @_;
947 $ident =~ /^gi[\|:](\d+)[\|:]?$/i
948 or return;
949 return { gi => $1 + 0 };
951 #genbank_accession
952 sub is_genbank_accession {
953 my ($ident) = @_;
955 return 1 if
956 $ident =~ /([a-z]{2,3})\|+\w+\d+(\.\d+)?\|?/i
957 || $ident =~ /^[A-Z_]{1,5}\d{4,}(\.\d+)?$/;
958 return 0;
960 sub url_genbank_accession {
961 my ($ident) = @_;
962 $ident = clean_genbank_accession($ident) or return;
964 return "http://www.ncbi.nlm.nih.gov/gquery/gquery.fcgi?term=$urlencode{$ident}";
966 sub clean_genbank_accession { clean_genbank(@_) };
967 sub parse_genbank_accession {
968 my ($ident) = @_;
970 my %parsed;
972 if( $ident =~ /[\|:]/ ) {
973 my @fields = split /[\|:]+/, $ident;
974 pop @fields if $fields[-1] =~ /^\[\d+\]$/;
976 while( my $field = shift @fields ) {
977 if( lc $field eq 'gi' ) {
978 my $gi = shift @fields;
979 $parsed{gi} = $gi + 0;
981 elsif( lc $field eq 'gb' ) {
982 my $acc = shift @fields;
983 my $locus = shift @fields;
984 $parsed{locus} = $locus if defined $locus;
985 my $accver = _gb_acc_ver($acc);
986 @parsed{keys %$accver} = values %$accver;
988 else {
989 if( $fields[0] && $field !~ /\d/ ) {
990 $parsed{$field} = shift @fields;
991 } else {
992 $parsed{unknown} ||= [];
993 push @{$parsed{unknown}},$field;
997 return \%parsed;
998 } else {
999 return _gb_acc_ver($ident);
1002 sub _gb_acc_ver {
1003 my ($id) = @_;
1004 my %parsed;
1005 if( $id =~ /^([\w_]+\d+)\.(\d+)$/ ) {
1006 return { accession => $1,
1007 version => $2+0,
1010 else {
1011 return { accession => $id };
1015 sub clean_genbank {
1016 my ($ident) = @_;
1017 $ident =~ s/^([a-z]{2,3})\|/lc($1).'|'/ie; #lowercase initial gi and namespace idents
1018 $ident =~ s/\|([a-z]{2,3})\|/'|'.lc($1).'|'/ie; #lowercase internal gi and namespace idents
1019 $ident =~ s/\[\d+\]//; #remove any bracketed gi numbers
1020 return $ident;
1023 #genbank_cdd
1024 sub is_genbank_cdd {
1025 my ($ident) = @_;
1026 return 1 if $ident =~ /^(gnl\|)?cdd[\|:]\d+[\|:]?$/i;
1027 return 0;
1029 sub url_genbank_cdd {
1030 my ($ident) = @_;
1031 $ident = clean_genbank_cdd($ident) or return;
1032 my $p = parse_genbank_cdd($ident) or return;
1033 return "http://www.ncbi.nlm.nih.gov/sites/entrez/query.fcgi?db=cdd&term=$p->{id}";
1035 sub clean_genbank_cdd {
1036 my ($ident) = @_;
1037 if($ident =~ /(\d+)/i) {
1038 return "gnl|CDD|$1";
1039 } else {
1040 return clean_genbank($ident)
1043 sub parse_genbank_cdd {
1044 my ($ident) = @_;
1045 $ident =~ /cdd[\|:](\d+)[\|:]?$/i
1046 or return;
1047 return { id => $1 + 0 };
1051 #interpro accession
1052 sub is_interpro_accession {
1053 my ($ident) = @_;
1054 return 1 if $ident =~ /^IPR\d+$/i;
1055 return 0;
1057 sub url_interpro_accession {
1058 my ($ident) = @_;
1059 $ident = clean_interpro_accession($ident) or return;
1060 return "http://www.ebi.ac.uk/interpro/IEntry?ac=" . $ident;
1062 sub clean_interpro_accession {
1063 my ($ident) = @_;
1064 return uc($ident);
1066 sub parse_interpro_accession {
1067 my ($ident) = @_;
1068 return unless $ident =~ /^IPR(\d+)$/;
1069 return { id => $1+0 };
1073 #go term
1074 sub is_go_term {
1075 my ($ident) = @_;
1076 return 1 if $ident =~ /^GO[^a-zA-Z\d]?\d{3,}$/i;
1077 return 0;
1079 sub url_go_term {
1080 my ($ident) = @_;
1081 $ident = clean_go_term($ident) or return;
1082 return "http://www.geneontology.org/cgi-bin/chooser.cgi?search_query=$urlencode{$ident}&search_constraint=terms";
1084 sub clean_go_term {
1085 my ($ident) = @_;
1086 my @d = $ident =~ /\d+/g;
1087 return sprintf('GO:%07d',join('',@d));
1089 sub parse_go_term {
1090 my ($ident) = @_;
1091 return unless $ident =~ /^GO/;
1092 my @d = $ident =~ /(\d+)/g;
1093 return { id => join('',@d)+0 };
1097 #### NAMESPACE HELPERS ###
1099 #return 1 if the identifier is a SGN-X234232 identifier
1100 #where X is the letter of your choice
1101 sub is_letter_identifier {
1102 my ($dbname,$letter,$identifier) = @_;
1103 $dbname = uc($dbname);
1104 return 1 if $identifier =~ /^$dbname?\W?$letter\d{1,9}/i;
1105 return 0;
1108 sub quick_search_url {
1109 "/search/quick_search.pl?term=".$urlencode{+shift}
1112 sub clean_letter_identifier {
1113 my ($dbname,$letter,$identifier) = @_;
1114 $dbname = uc($dbname);
1115 $letter = uc($letter);
1116 my ($digits) = $identifier =~ /(\d+)/
1117 or return;
1118 $digits += 0;
1119 return "$dbname-$letter$digits";
1122 sub parse_letter_identifier {
1123 my ($dbname,$letter,$identifier) = @_;
1124 $dbname = uc($dbname);
1125 $letter = uc($letter);
1126 my ($digits) = $identifier =~ /(\d+)/
1127 or return;
1128 return { id => $digits + 0 };
1133 1;#do not remove