1 package CXGN
::Tools
::Identifiers
;
7 CXGN::Tools::Identifiers - useful functions for dealing with
8 identifiers, like SGN-E23412
12 my $nsname = identifier_namespace('SGN-E23412');
15 my $url = identifier_url('SGN-E23412');
16 #$url is now '/search/est.pl?request_type=7&request=575150'
18 my $link = link_identifier('SGN-E575150');
20 #'<a href="/search/est.pl?request_type=7&request=575150">SGN-E575150</a>'
22 my $clean = clean_identifier('SGNE3423');
23 #returns SGN-E3423, or undef if the identifier was not recognized
25 my $contents = parse_identifier('SGN-E12345');
29 This module contains easy-to-use functions for working with the often
30 malformed strings of text that purport to be identifiers of some sort.
32 Note that support for identification of SGN marker names like 'TG23' has been removed. The only marker identifiers supported now are SGN-M type identifiers.
35 =head2 Supported Namespaces
41 SGN unigene identifiers 'SGN-U2342'
45 CGN unigene identifers 'CGN-U122539'
49 SGN EST identifiers 'SGN-E234223'
53 SGN Microarray spot identifiers 'SGN-S1241'
57 SGN Marker identifier 'SGN-M1347'
61 SGN Trace (chromatogram) identifiers 'SGN-T1241'
63 =item sgn_locus_sequence
65 An SGN locus sequence identifier like 'Potato_SGNlocusID_2206_AM040153'
66 which is parsed as { id => 2206, species => 'Potato', ext_id => 'AM040153' }
70 microarray spot identifiers like '1-1-1.2.3.4'
74 other kinds of EST identifiers like 'cLEC-23-A23'
78 BAC end identifiers like 'LE_HBa0123A12_SP6_2342'
82 BAC identifiers like 'LE_HBa0123A12'
84 By default, if a BAC has been assigned to a sequencing
85 project, its clean_identifier and link_identifier will
86 replace the LE_ or SL_ species name at the beginning
87 with a C##, where ## is the zero-padded chromosome number.
89 $CXGN::Tools::Identifier::insert_bac_chr_nums can be used
90 to enable or disable this. To disable, set it to a
91 false value. To enable, set true. Defaults to true.
95 BAC sequence identifiers like 'LE_HBa0123A12.1'
99 BAC fragment identifiers (identifies contigs in a bac that is
100 still partially assembled) like 'LE_HBa0123A12.1-4'
102 =item tomato_bac_contig
104 A tomato BAC contig, e.g. C12.4_contig1
106 =item generic_scaffold
108 A generic scaffold identifier, e.g. scaffold12345
112 TAIR locus identifiers like 'At1g67700.1'
114 =item species_binomial
116 e.g. 'Arabidopsis thaliana', 'Solanum lycopersicum'
120 A genbank identifier containing a stable GI identification number.
121 Examples include gi|108883260|gb|EAT47485.1| or just gi|108883260|
124 =item genbank_accession
126 A genbank identifier denoting a sequence accession, containing a
127 namespace and an identifier, such as gb|EAT47485.1|.
131 A genbank identifier denoting an entry in the NCBI Conserved Domain
134 =item swissprot_accession
136 A uniprot identifier composed by 'sp'+pipe+uniprot_accession+pipe+entry_name
137 for example sp|P22142|NDUS2_NEUCR means accession=P22142 and entry_name=NDUS2_NEUCR
140 =item uniref_accession
142 A uniprot accession is contains a combination of number and letters,
143 for example: UniRef90_P22142, the accsssion will be P22142
149 ############ NAMESPACE DEFINITIONS ############
150 # To add a namespace:
151 # 1. make is_<namespace>, url_<namespace>, and clean_<namespace>
152 # functions for your namespace at the end of this file
153 # 2. add its name to @namespace_list below
155 #NOTE: the ordering of this list is the order in which a given
156 #identifier is checked for membership in each class
158 # removed sgn_marker from this list because it always returns
160 our @namespace_list = qw
/
188 #return 1 if the given namespace is in this list
189 sub _is_valid_namespace
{
191 return 1 if grep {$ns eq $_} @namespace_list;
197 All functions are EXPORT_OK.
201 # ABOUT THE ARCHITECTURE OF THIS MODULE
203 # each namespace supported by this module
205 # a.) an entry in @namespace_list
206 # b.) a is_<namespace> function
207 # c.) a url_<namespace> function
209 # All of these are at the bottom of this file,
210 # where it says NAMESPACE DEFINITIONS.
217 our @EXPORT_OK = qw
/ identifier_url
226 use base qw
/Exporter/;
228 use CXGN
::Genomic
::Clone
;
229 use CXGN
::Genomic
::CloneIdentifiers qw
/ parse_clone_ident assemble_clone_ident /;
230 use CXGN
::Genomic
::GSS
;
231 #use CXGN::Marker::Tools qw/clean_marker_name marker_name_to_ids/;
232 use CXGN
::Tools
::Text qw
/trim/;
235 =head2 identifier_url
237 Usage: my $url = identifier_url('SGN-E12141');
238 Desc : get an information URL for an identifier.
239 Ret : a string containing an absolute or relative URL,
240 suitable for putting in a href= in HTML
241 Args : identifier string,
242 (optional) namespace name if you know it
243 Side Effects: might look things up in the database
248 my ($ident,$ns) = @_;
249 $ident = trim
($ident);
250 $ns ||= identifier_namespace
($ident)
252 return unless _is_valid_namespace
($ns);
253 #clean up the identifier if we can
254 $ident = clean_identifier
($ident,$ns) || $ident;
256 return "url_$ns"->($ident);
259 =head2 link_identifier
261 Usage: my $link = link_identifier('SGN-E575150');
262 #returns '<a href="/search/est.pl?request_type=7&request=575150">SGN-E575150</a>'
263 Desc : calls identifier_url() to get a URL for your identifier,
264 then returns a complete HTML link to you, like
265 Ret : an html link, or undef if the link could not be made
266 Args : single string containing an identifier,
267 (optional) namespace name if you know it
268 Side Effects: might look things up in the database
272 sub link_identifier
{
273 my ($ident,$ns) = @_;
274 $ident = trim
($ident);
275 $ns ||= identifier_namespace
($ident)
277 $ident = clean_identifier
($ident,$ns) || $ident;
278 my $url = identifier_url
($ident,$ns)
280 #clean up the identifier if we can
281 my $fns = ucfirst join ' ',split /_/,$ns;
282 return qq|<a title
="search for this $fns" href
="$url">$ident</a
>|;
285 =head2 identifier_namespace
287 Usage: my $ns = identifier_namespace('SGN-U1231');
289 Desc : get the namespace
290 Ret : a string containing the name of the namespace,
291 or undef if it cannot identify the namespace
292 Args : a string identifier
293 Side Effects: might look things up in the database
297 #see bottom for namespace definitions
298 sub identifier_namespace
{
299 my ($identifier) = @_;
300 $identifier = trim
($identifier)
302 #identifiers have to be more than 2 chars, and they can't be all numbers
303 length($identifier) > 2 && $identifier =~ /\D/
305 foreach my $ns (our @namespace_list) {
307 return $ns if "is_$ns"->($identifier);
308 #warn "$identifier is not in $ns\n";
313 =head2 clean_identifier
315 Usage: my $newident = clean_identifier('SGNE1231');
316 Desc : attempt to guess the namespace of the identifier,
317 and clean up any irregularities in it to put
318 it in its canonical form
319 Ret : a cleaned string, or undef if the identifier
320 is not in any recognized namespace
321 Args : identifier to be cleaned
322 Side Effects: may look things up in the database
326 sub clean_identifier
{
327 my ($ident,$ns) = @_;
328 $ident = trim
($ident);
329 $ns ||= identifier_namespace
($ident)
331 return unless _is_valid_namespace
($ns);
333 return "clean_$ns"->($ident);
336 =head2 list_namespaces
338 Usage: my @namespaces = list_namespaces;
339 Desc : get the list of namespace names supported by this module
340 Ret : list of valid namespace names
346 sub list_namespaces
{
347 return @namespace_list;
351 =head2 parse_identifier
353 Usage: my $data = parse_identifier($identifier, $namespace );
354 Desc : many identifiers have data in them, for example, an SGN-E has the EST id
355 in it, and a bac name (LE_HBa0001A02) has the organism, library, plate,
356 row, and column in it. This function parses that data out and gives it
357 to you, as it appears in the string. You might consider running
358 clean_identifier() on what you give this function.
359 Args : identifier to parse,
360 optional list of namespace names it could be a member of,
361 guesses the namespace if not provided
362 Ret : nothing if the identifier could not be parsed,
363 otherwise a hashref of data in the identifier, which varies in its
364 contents, looking like
365 { namespace => 'namespace_name',
366 <other data in the identifier>
371 my $data = parse_identifier('C03HBa0001A02');
372 #and now $data contains
373 $data = { namespace => 'bac',
379 match => 'C03HBa0001A02',
385 sub parse_identifier
{
386 my ($ident, $ns ) = @_;
388 $ident = trim
( $ident);
389 $ns ||= identifier_namespace
($ident)
392 return unless _is_valid_namespace
($ns);
395 my $p = "parse_$ns"->($ident)
397 $p->{namespace
} = $ns;
401 =head1 NAMESPACE FUNCTIONS
403 These functions are not exported, and
404 are only used internally by this module.
405 To add a namespace, follow the instructions inside this
406 file (they are in comments, not POD).
410 #for instructions, see ABOUT THE ARCHITECTURE OF THIS MODULE above
412 =head2 is_E<lt>namespaceE<gt>
414 Usage: is_sgn_e('SGN-E2342');
415 Desc : check if an identifier is in a given namespace
416 Ret : 1 if the given identifier is in that namespace,
418 Args : identifier string
419 Side Effects: may look up things in the database
421 =head2 url_E<lt>namespaceE<gt>
423 Usage: url_sgn_e('SGN-E2342');
424 Desc : get the info URL for a given identifier,
425 Ret : string with the URL, or undef if no
426 url is available for this identifier
427 Args : identifier string
428 Side Effects: may look up things in the database
430 NOTE: These functions will ONLY be called if it has already
431 been determined that the identifier is in that namespace.
433 =head2 clean_E<lt>namespaceE<gt>
435 Usage: my $clean = clean_sgn_e('sgne12311');
436 #returns 'SGN-E12311'
437 Desc : clean up any irregularities in the identifier string
438 Ret : cleaned up identifier string. Should never fail,
439 since this function will only be called on identifiers
440 that are definitely in that namespace.
441 Args : identifier string
442 Side Effects: may look things up in the database
444 NOTE: These functions will ONLY be called if it has already
445 been determined that the identifier is in that namespace.
446 If your is_<namespace> function says it's that type of
447 identifier, your clean_<namespace> function had better
454 is_letter_identifier
('sgn','u',shift);
457 "/search/unigene.pl?unigene_id=".$urlencode{uc($_[0])};
460 clean_letter_identifier
('sgn','u',shift);
463 parse_letter_identifier
('sgn','u',shift);
467 is_letter_identifier
('cgn','u',shift);
470 my ($cgnid) = shift =~ /(\d+)/ or return undef;
471 return "/search/unigene.pl?unigene_id=CGN-U$cgnid";
474 clean_letter_identifier
('cgn','u',shift);
477 parse_letter_identifier
('cgn','u',shift);
481 is_letter_identifier
('sgn','e',shift);
484 "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=automatic&search=Search";
487 clean_letter_identifier
('sgn','e',shift);
490 parse_letter_identifier
('sgn','e',shift);
494 is_letter_identifier
('sgn','s',shift);
497 "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=14&search=Search";
500 clean_letter_identifier
('sgn','s',shift);
503 parse_letter_identifier
('sgn','s',shift);
507 is_letter_identifier
('sgn','m',shift);
511 $id =~ s/sgn.*m(\d+)$/$1/i;
512 return "/search/markers/markerinfo.pl?marker_id=$urlencode{$id}";
515 clean_letter_identifier
('sgn','m',shift);
518 parse_letter_identifier
('sgn','m',shift);
522 is_letter_identifier
('sgn','t',shift);
525 "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=9&search=Search";
528 clean_letter_identifier
('sgn','t',shift);
531 parse_letter_identifier
('sgn','t',shift);
533 ######### sgn_locus_sequence
534 sub is_sgn_locus_sequence
{
535 return 1 if shift =~ /[a-z]+_SGNlocusID_\d+_.+/i;
538 sub url_sgn_locus_sequence
{
539 my ($id) = shift =~ /SGNlocusID_(\d+)/i;
540 return "/phenome/locus_display.pl?locus_id=$id";
542 sub clean_sgn_locus_sequence
{
543 my $p = parse_sgn_locus_sequence
(shift);
544 return join'_',$p->{species
},'SGNlocusID',$p->{id
},$p->{ext_id
};
546 sub parse_sgn_locus_sequence
{
547 my ($species,undef,$id,$extid) = split /_/,shift,4;
548 return { species
=> ucfirst($species),
553 ######### microarray_spot
554 sub is_microarray_spot
{
555 return 1 if shift =~ /^\d-\d-\d+\.\d+\.\d+\.\d+$/;
558 sub url_microarray_spot
{
559 "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=14&search=Search";
561 sub clean_microarray_spot
{
562 shift; #no cleaning is done here
564 sub parse_microarray_spot
{
565 warn 'WARNING: parsing not yet implemented for microarray_spot';
570 # XXX: stupid stupid stupid. Coffee clones have names like
571 # ccc<garbagegarbagegarbage>.
572 return 0 if $_[0] =~ m
|^ccc
|i
;
573 return 1 if $_[0] =~ /^(c[A-Z]{2,3}|TUS)[^A-Z\d]*[0-9]+[^A-Z\d]*[A-P][^A-Z\d]*[0-9]{1,2}$/i;
577 "/search/est.pl?request_from=0&request_id=$urlencode{$_[0]}&request_type=automatic";
584 if ($ident =~ /^([A-Z]{3,4})[^A-Z\d]*([0-9]+)[^A-Z\d]*([A-P])[^A-Z\d]*([0-9]{1,2})$/i) {
585 $ident = "$1-$2-$3$4";
590 warn 'WARNING: parsing not yet implemented for est';
595 my $parsed = parse_clone_ident
(shift,'bac_end')
602 my $parsed = parse_clone_ident
($ident,'bac_end')
603 or confess
'not a valid bac end name';
604 return "/maps/physical/clone_read_info.pl?chrid=$parsed->{chromat_id}";
608 my $parsed = parse_clone_ident
($ident,'bac_end')
609 or confess
'not a valid bac end name';
610 my $gss = CXGN
::Genomic
::GSS
->retrieve_from_parsed_name($parsed)
611 or confess
"could not fetch gss for ident '$ident'";
612 return $gss->external_identifier;
615 parse_clone_ident
(shift,'bac_end');
620 my $parsed = parse_clone_ident
($ident,qw
/agi_bac agi_bac_with_chrom old_cornell sanger_bac/)
622 #must match the whole identifier, cause we sometimes tack on
623 #things to the ends of the names
624 return 0 unless $parsed->{match
} eq $ident and !defined($parsed->{version
}) and !defined($parsed->{fragment
});
625 my $clone = _bac_cache
($parsed)
630 #single-element cache of the last BAC ident we returned. this speeds
631 #up runs of multiple queries for the same bac
635 my $key = join(',',@
{$parsed}{qw
/lib plate row col clonetype/});
636 if($last_key && $last_key eq $key) {
637 # warn "cache hit $key\n";
640 # warn "cache miss $key\n";
642 return $last_clone = CXGN
::Genomic
::Clone
->retrieve_from_parsed_name($parsed);
647 my $parsed = parse_clone_ident
($ident,qw
/agi_bac agi_bac_with_chrom old_cornell sanger_bac/)
649 #must match the whole identifier, cause we sometimes tack on
650 #things to the ends of the names
651 return undef unless $parsed->{match
} eq $ident and !defined($parsed->{version
}) and !defined($parsed->{fragment
});
652 my $clone = _bac_cache
($parsed)
654 return "/maps/physical/clone_info.pl?id=".$clone->clone_id;
656 our $insert_bac_chr_nums = 1;
659 my $parsed = parse_clone_ident
($ident,qw
/agi_bac agi_bac_with_chrom old_cornell sanger_bac/)
661 #must match the whole identifier, cause we sometimes tack on
662 #things to the ends of the names
663 return undef unless $parsed->{match
} eq $ident and !defined($parsed->{version
}) and !defined($parsed->{fragment
});
664 my $clone = _bac_cache
($parsed)
666 return (our $insert_bac_chr_nums) && $clone->chromosome_num
667 ?
$clone->clone_name_with_chromosome : $clone->clone_name;
671 parse_clone_ident
(shift,qw
/agi_bac agi_bac_with_chrom old_cornell sanger_bac/);
675 sub is_tomato_bac_contig
{
677 return 1 if $ident =~ /^C\d+\.\d+[^a-z\d]?contig\d+$/i;
680 sub url_tomato_bac_contig
{
684 sub clean_tomato_bac_contig
{
687 $ident =~ s/CONTIG/contig/;
688 $ident =~ s/[^a-z\d]?contig/_contig/;
691 sub parse_tomato_bac_contig
{
693 $ident =~ /^C(\d+)\.(\d+)[^a-z\d]?contig(\d+)$/i
695 return { chr => $1+0,
702 sub is_generic_scaffold
{
703 return 1 if $_[0] =~ /^scaffold\d+$/i;
706 sub url_generic_scaffold
{
710 sub clean_generic_scaffold
{
712 my ($d) = $ident =~ /(\d+)/;
714 return 'scaffold'.($d+0);
716 sub parse_generic_scaffold
{
718 $ident =~ /^scaffold(\d+)$/i
720 return { scaffold_num
=> $1 };
723 sub is_bac_sequence
{
725 my $parsed = parse_clone_ident
($ident,qw
/versioned_bac_seq/)
727 #must match the whole identifier, cause we sometimes tack on
728 #things to the ends of the names
729 return 0 unless $parsed->{match
} eq $ident and defined($parsed->{version
});
730 my $clone = _bac_cache
($parsed)
734 sub url_bac_sequence
{
736 my $parsed = parse_clone_ident
($ident,qw
/versioned_bac_seq/)
739 # die "$ident -> ",Dumper($parsed);
740 #must match the whole identifier, cause we sometimes tack on
741 #things to the ends of the names
742 return unless $parsed->{match
} eq $ident and defined($parsed->{version
});
743 my $clone = _bac_cache
($parsed)
745 return "/maps/physical/clone_info.pl?id=".$clone->clone_id;
747 sub clean_bac_sequence
{
749 my $parsed = parse_clone_ident
($ident,qw
/versioned_bac_seq/)
751 #must match the whole identifier, cause we sometimes tack on
752 #things to the ends of the names
753 return undef unless $parsed->{match
} eq $ident and defined($parsed->{version
});
754 my $clone = _bac_cache
($parsed)
756 return assemble_clone_ident
('versioned_bac_seq',$parsed);
758 sub parse_bac_sequence
{
759 parse_clone_ident
(shift,qw
/versioned_bac_seq/);
762 sub is_bac_fragment
{
764 my $parsed = parse_clone_ident
($ident,qw
/versioned_bac_seq/)
766 #must match the whole identifier, cause we sometimes tack on
767 #things to the ends of the names
768 return 0 unless $parsed->{match
} eq $ident and defined($parsed->{version
}) and defined($parsed->{fragment
});
769 my $clone = _bac_cache
($parsed)
773 sub url_bac_fragment
{
775 my $parsed = parse_clone_ident
($ident,qw
/versioned_bac_seq/)
778 # die "$ident -> ",Dumper($parsed);
779 #must match the whole identifier, cause we sometimes tack on
780 #things to the ends of the names
781 return unless $parsed->{match
} eq $ident and defined($parsed->{version
}) and defined($parsed->{fragment
});
782 my $clone = _bac_cache
($parsed)
784 return "/maps/physical/clone_info.pl?id=".$clone->clone_id;
786 sub clean_bac_fragment
{
788 my $parsed = parse_clone_ident
($ident,qw
/versioned_bac_seq/)
790 #must match the whole identifier, cause we sometimes tack on
791 #things to the ends of the names
792 return undef unless $parsed->{match
} eq $ident and defined($parsed->{version
}) and defined($parsed->{fragment
});
793 my $clone = _bac_cache
($parsed)
795 return assemble_clone_ident
(versioned_bac_seq
=> $parsed);
797 sub parse_bac_fragment
{
798 parse_clone_ident
(shift,qw
/ versioned_bac_seq / );
802 return 1 if shift =~ /^AT[1-5MC]G\d{5}$/i;
806 my ($locusname) = @_;
807 # $locusname =~ s/\.\d+$//;
808 "http://arabidopsis.org/servlets/TairObject?type=locus&name=$urlencode{$locusname}"
810 sub clean_tair_locus
{
812 $ident =~ s/^at/At/i; #properly capitalize the first at
815 sub parse_tair_locus
{
817 warn 'WARNING: parsing not yet implemented for TAIR locus';
821 sub is_tair_gene_model
{
822 return 1 if shift =~ /^AT[1-5MC]G\d{5}\.\d+$/i;
825 sub url_tair_gene_model
{
827 "http://arabidopsis.org/servlets/TairObject?type=gene&name=$urlencode{$name}"
829 sub clean_tair_gene_model
{
831 $ident =~ s/^at/At/i; #properly capitalize the first at
834 sub parse_tair_gene_model
{
835 warn 'WARNING: parsing not yet implemented for tair gene model';
839 sub is_species_binomial
{
840 return 1 if shift =~ /^[a-z]+ [a-z]+$|^[a-z]\.\s*[a-z]+$/i;
843 sub url_species_binomial
{
846 sub clean_species_binomial
{
848 $ident =~ s/\.(?=\S)/\. /g;
851 $ident = ucfirst($ident);
854 sub parse_species_binomial
{
856 my @w = split qr/\W+/, $ident;
857 return unless @w == 2;
858 return { genus
=> $w[0],
862 sub _wikipedia_link
{
865 return 'http://en.wikipedia.org/wiki/Special:Search/'.$ident;
867 #uniprotKB-swissprot_accession
868 sub is_swissprot_accession
{
870 return 1 if $ident =~ /^sp\|\w+\|\w+_\w+$/i;
872 sub url_swissprot_accession
{
874 $ident = clean_swissprot_accession
($ident) or return;
875 return "http://www.uniprot.org/uniprot/" . $ident;
877 sub clean_swissprot_accession
{
880 if ($ident =~ m/^sp\|(\w+)\|\w+_\w+$/i) {
887 sub parse_swissprot_accession
{
889 unless ($ident =~ m/^sp\|(\w+)\|\w+_\w+$/i) {
896 #uniprot_uniref_accession
897 sub is_uniref_accession
{
899 return 1 if $ident =~ m/^UniRef\d+_\w+$/i;
901 sub url_uniref_accession
{
903 $ident = clean_uniref_accession
($ident) or return;
904 return "http://www.uniprot.org/uniprot/".$ident;
906 sub clean_uniref_accession
{
909 if ($ident =~ m/^UniRef\d+_(\w+)$/i) {
916 sub parse_uniref_accession
{
918 unless ($ident =~ m/^UniRef\d+_(\w+)$/i) {
929 return 1 if $ident =~ /^gi[\|:]\d+[\|:]?$/i;
934 $ident = clean_genbank_gi
($ident) or return;
935 "http://www.ncbi.nlm.nih.gov/gquery/gquery.fcgi?term=$urlencode{$ident}"
937 sub clean_genbank_gi
{
939 if($ident =~ /^gi[\|:](\d+)$/i) {
942 return clean_genbank
($ident)
945 sub parse_genbank_gi
{
947 $ident =~ /^gi[\|:](\d+)[\|:]?$/i
949 return { gi
=> $1 + 0 };
952 sub is_genbank_accession
{
956 $ident =~ /([a-z]{2,3})\|+\w+\d+(\.\d+)?\|?/i
957 || $ident =~ /^[A-Z_]{1,5}\d{4,}(\.\d+)?$/;
960 sub url_genbank_accession
{
962 $ident = clean_genbank_accession
($ident) or return;
964 return "http://www.ncbi.nlm.nih.gov/gquery/gquery.fcgi?term=$urlencode{$ident}";
966 sub clean_genbank_accession
{ clean_genbank
(@_) };
967 sub parse_genbank_accession
{
972 if( $ident =~ /[\|:]/ ) {
973 my @fields = split /[\|:]+/, $ident;
974 pop @fields if $fields[-1] =~ /^\[\d+\]$/;
976 while( my $field = shift @fields ) {
977 if( lc $field eq 'gi' ) {
978 my $gi = shift @fields;
979 $parsed{gi
} = $gi + 0;
981 elsif( lc $field eq 'gb' ) {
982 my $acc = shift @fields;
983 my $locus = shift @fields;
984 $parsed{locus
} = $locus if defined $locus;
985 my $accver = _gb_acc_ver
($acc);
986 @parsed{keys %$accver} = values %$accver;
989 if( $fields[0] && $field !~ /\d/ ) {
990 $parsed{$field} = shift @fields;
992 $parsed{unknown
} ||= [];
993 push @
{$parsed{unknown
}},$field;
999 return _gb_acc_ver
($ident);
1005 if( $id =~ /^([\w_]+\d+)\.(\d+)$/ ) {
1006 return { accession
=> $1,
1011 return { accession
=> $id };
1017 $ident =~ s/^([a-z]{2,3})\|/lc($1).'|'/ie; #lowercase initial gi and namespace idents
1018 $ident =~ s/\|([a-z]{2,3})\|/'|'.lc($1).'|'/ie; #lowercase internal gi and namespace idents
1019 $ident =~ s/\[\d+\]//; #remove any bracketed gi numbers
1024 sub is_genbank_cdd
{
1026 return 1 if $ident =~ /^(gnl\|)?cdd[\|:]\d+[\|:]?$/i;
1029 sub url_genbank_cdd
{
1031 $ident = clean_genbank_cdd
($ident) or return;
1032 my $p = parse_genbank_cdd
($ident) or return;
1033 return "http://www.ncbi.nlm.nih.gov/sites/entrez/query.fcgi?db=cdd&term=$p->{id}";
1035 sub clean_genbank_cdd
{
1037 if($ident =~ /(\d+)/i) {
1038 return "gnl|CDD|$1";
1040 return clean_genbank
($ident)
1043 sub parse_genbank_cdd
{
1045 $ident =~ /cdd[\|:](\d+)[\|:]?$/i
1047 return { id
=> $1 + 0 };
1052 sub is_interpro_accession
{
1054 return 1 if $ident =~ /^IPR\d+$/i;
1057 sub url_interpro_accession
{
1059 $ident = clean_interpro_accession
($ident) or return;
1060 return "http://www.ebi.ac.uk/interpro/IEntry?ac=" . $ident;
1062 sub clean_interpro_accession
{
1066 sub parse_interpro_accession
{
1068 return unless $ident =~ /^IPR(\d+)$/;
1069 return { id
=> $1+0 };
1076 return 1 if $ident =~ /^GO[^a-zA-Z\d]?\d{3,}$/i;
1081 $ident = clean_go_term
($ident) or return;
1082 return "http://www.geneontology.org/cgi-bin/chooser.cgi?search_query=$urlencode{$ident}&search_constraint=terms";
1086 my @d = $ident =~ /\d+/g;
1087 return sprintf('GO:%07d',join('',@d));
1091 return unless $ident =~ /^GO/;
1092 my @d = $ident =~ /(\d+)/g;
1093 return { id
=> join('',@d)+0 };
1097 #### NAMESPACE HELPERS ###
1099 #return 1 if the identifier is a SGN-X234232 identifier
1100 #where X is the letter of your choice
1101 sub is_letter_identifier
{
1102 my ($dbname,$letter,$identifier) = @_;
1103 $dbname = uc($dbname);
1104 return 1 if $identifier =~ /^$dbname?\W?$letter\d{1,9}/i;
1108 sub quick_search_url
{
1109 "/search/quick_search.pl?term=".$urlencode{+shift}
1112 sub clean_letter_identifier
{
1113 my ($dbname,$letter,$identifier) = @_;
1114 $dbname = uc($dbname);
1115 $letter = uc($letter);
1116 my ($digits) = $identifier =~ /(\d+)/
1119 return "$dbname-$letter$digits";
1122 sub parse_letter_identifier
{
1123 my ($dbname,$letter,$identifier) = @_;
1124 $dbname = uc($dbname);
1125 $letter = uc($letter);
1126 my ($digits) = $identifier =~ /(\d+)/
1128 return { id
=> $digits + 0 };