lib/CXGN/Tools/Identifiers.pm

   1 package CXGN::Tools::Identifiers;
   2 use strict;
   3 use warnings;
   4
   5 =head1 NAME
   6
   7 CXGN::Tools::Identifiers - useful functions for dealing with
   8 identifiers, like SGN-E23412
   9
  10 =head1 SYNOPSIS
  11
  12   my $nsname = identifier_namespace('SGN-E23412');
  13   #returns 'sgn_e'
  14
  15   my $url = identifier_url('SGN-E23412');
  16   #$url is now '/search/est.pl?request_type=7&request=575150'
  17
  18   my $link = link_identifier('SGN-E575150');
  19   #$link is now
  20   #'<a href="/search/est.pl?request_type=7&request=575150">SGN-E575150</a>'
  21
  22   my $clean = clean_identifier('SGNE3423');
  23   #returns SGN-E3423, or undef if the identifier was not recognized
  24
  25   my $contents = parse_identifier('SGN-E12345');
  26
  27 =head1 DESCRIPTION
  28
  29 This module contains easy-to-use functions for working with the often
  30 malformed strings of text that purport to be identifiers of some sort.
  31
  32 Note that support for identification of SGN marker names like 'TG23' has been removed. The only marker identifiers supported now are SGN-M type identifiers.
  33
  34
  35 =head2 Supported Namespaces
  36
  37 =over 12
  38
  39 =item sgn_u
  40
  41 SGN unigene identifiers 'SGN-U2342'
  42
  43 =item cgn_u
  44
  45 CGN unigene identifiers 'CGN-U122539'
  46
  47 =item sgn_e
  48
  49 SGN EST identifiers 'SGN-E234223'
  50
  51 =item sgn_s
  52
  53 SGN Microarray spot identifiers 'SGN-S1241'
  54
  55 =item sgn_m
  56
  57 SGN Marker identifier 'SGN-M1347'
  58
  59 =item sgn_t
  60
  61 SGN Trace (chromatogram) identifiers 'SGN-T1241'
  62
  63 =item sgn_locus_sequence
  64
  65 An SGN locus sequence identifier like 'Potato_SGNlocusID_2206_AM040153'
  66 which is parsed as { id => 2206, species => 'Potato', ext_id => 'AM040153' }
  67
  68 =item microarray_spot
  69
  70 microarray spot identifiers like '1-1-1.2.3.4'
  71
  72 =item est
  73
  74 other kinds of EST identifiers like 'cLEC-23-A23'
  75
  76 =item bac_end
  77
  78 BAC end identifiers like 'LE_HBa0123A12_SP6_2342'
  79
  80 =item bac
  81
  82 BAC identifiers like 'LE_HBa0123A12'
  83
  84 By default, if a BAC has been assigned to a sequencing
  85 project, its clean_identifier and link_identifier will
  86 replace the LE_ or SL_ species name at the beginning
  87 with a C##, where ## is the zero-padded chromosome number.
  88
  89 $CXGN::Tools::Identifiers::insert_bac_chr_nums can be used
  90 to enable or disable this.  To disable, set it to a
  91 false value.  To enable, set true.  Defaults to true.
  92
  93 =item bac_sequence
  94
  95 BAC sequence identifiers like 'LE_HBa0123A12.1'
  96
  97 =item bac_fragment
  98
  99 BAC fragment identifiers (identifies contigs in a bac that is
 100 still partially assembled) like 'LE_HBa0123A12.1-4'
 101
 102 =item tomato_bac_contig
 103
 104 A tomato BAC contig, e.g. C12.4_contig1
 105
 106 =item generic_scaffold
 107
 108 A generic scaffold identifier, e.g. scaffold12345
 109
 110 =item tair_locus
 111
 112 TAIR locus identifiers like 'At1g67700'
 113
 114 =item species_binomial
 115
 116 e.g. 'Arabidopsis thaliana', 'Solanum lycopersicum'
 117
 118 =item genbank_gi
 119
 120 A genbank identifier containing a stable GI identification number.
 121 Examples include gi|108883260|gb|EAT47485.1| or just gi|108883260|
 122 or GI:108883260
 123
 124 =item genbank_accession
 125
 126 A genbank identifier denoting a sequence accession, containing a
 127 namespace and an identifier, such as gb|EAT47485.1|.
 128
 129 =item genbank_cdd
 130
 131 A genbank identifier denoting an entry in the NCBI Conserved Domain
 132 Database (CDD).
 133
 134 =item swissprot_accession
 135
 136 A uniprot identifier composed by 'sp'+pipe+uniprot_accession+pipe+entry_name
 137 for example sp|P22142|NDUS2_NEUCR means accession=P22142 and entry_name=NDUS2_NEUCR
 138 (sp means swissprot)
 139
 140 =item uniref_accession
 141
 142 A UniRef accession contains a combination of numbers and letters,
 143 for example: UniRef90_P22142, where the accession will be P22142
 144
 145 =back
 146
 147 =cut
 148
 149 ############ NAMESPACE DEFINITIONS ############
 150 # To add a namespace:
 151 # 1. make is_<namespace>, url_<namespace>, clean_<namespace>, and
 152 #    parse_<namespace> functions for your namespace at the end of this file
 153 # 2. add its name to @namespace_list below
 154
#NOTE: the ordering of this list is the order in which a given
#identifier is checked for membership in each class;
#identifier_namespace() returns the first namespace whose
#is_<namespace> function accepts the identifier, so more specific
#namespaces must come before more permissive ones

# removed sgn_marker from this list because it always returns
# a valid identifier
our @namespace_list = qw/
                         sgn_u
                         cgn_u
                         sgn_e
                         sgn_s
                         sgn_m
                         sgn_t
                         sgn_locus_sequence
                         microarray_spot
                         est
                         bac_end
                         bac_fragment
                         bac_sequence
                         bac
                         tomato_bac_contig
                         generic_scaffold
                         tair_gene_model
                         tair_locus
                         go_term
                         interpro_accession
                         swissprot_accession
                         uniref_accession
                         genbank_gi
                         species_binomial
                         genbank_cdd
                         genbank_accession
                        /;
 187
 188 #return 1 if the given namespace is in this list
 189 sub _is_valid_namespace {
 190   my ($ns) = @_;
 191   return 1 if grep {$ns eq $_} @namespace_list;
 192   return 0;
 193 }
 194
 195 =head1 FUNCTIONS
 196
 197 All functions are EXPORT_OK.
 198
 199 =cut
 200
 201 # ABOUT THE ARCHITECTURE OF THIS MODULE
 202 #
 203 # each namespace supported by this module
 204 # has:
 205 #   a.) an entry in @namespace_list
 206 #   b.) a is_<namespace> function
 207 #   c.) url_<namespace>, clean_<namespace>, and parse_<namespace> functions
 208 #
 209 # All of these are at the bottom of this file,
 210 # where it says NAMESPACE DEFINITIONS.
 211
 212 use Carp;
 213 use Tie::UrlEncoder;
 214 our %urlencode;
 215
# Fill in @EXPORT_OK inside a BEGIN block so that the list is already
# populated at compile time.  Only on-request exports are declared
# here; no @EXPORT (default export) list is set in this file.
#
# NOTE(review): unique_identifier is listed below, but no sub with
# that name is visible in this file -- confirm it is defined
# elsewhere, or drop it from the list.
BEGIN {
  our @EXPORT_OK = qw/  identifier_url
                        link_identifier
                        identifier_namespace
                        clean_identifier
                        list_namespaces
                        parse_identifier
                        unique_identifier
                     /;
}
 226 use base qw/Exporter/;
 227
 228 use CXGN::Genomic::Clone;
 229 use CXGN::Genomic::CloneIdentifiers qw/ parse_clone_ident assemble_clone_ident /;
 230 use CXGN::Genomic::GSS;
 231 #use CXGN::Marker::Tools qw/clean_marker_name marker_name_to_ids/;
 232 use CXGN::Tools::Text qw/trim/;
 233
 234
 235 =head2 identifier_url
 236
 237   Usage: my $url = identifier_url('SGN-E12141');
 238   Desc : get an information URL for an identifier.
 239   Ret  : a string containing an absolute or relative URL,
 240          suitable for putting in a href= in HTML
 241   Args : identifier string,
 242          (optional) namespace name if you know it
 243   Side Effects: might look things up in the database
 244
 245 =cut
 246
 247 sub identifier_url {
 248   my ($ident,$ns) = @_;
 249   $ident = trim($ident);
 250   $ns ||= identifier_namespace($ident)
 251     or return;
 252   return unless _is_valid_namespace($ns);
 253   #clean up the identifier if we can
 254   $ident = clean_identifier($ident,$ns) || $ident;
 255   no strict 'refs';
 256   return "url_$ns"->($ident);
 257 }
 258
 259 =head2 link_identifier
 260
 261   Usage: my $link = link_identifier('SGN-E575150');
 262          #returns '<a href="/search/est.pl?request_type=7&request=575150">SGN-E575150</a>'
 263   Desc : calls identifier_url() to get a URL for your identifier,
 264          then returns a complete HTML link to you
 265   Ret  : an html link, or undef if the link could not be made
 266   Args : single string containing an identifier,
 267          (optional) namespace name if you know it
 268   Side Effects: might look things up in the database
 269
 270 =cut
 271
 272 sub link_identifier {
 273   my ($ident,$ns) = @_;
 274   $ident = trim($ident);
 275   $ns ||= identifier_namespace($ident)
 276     or return;
 277   $ident = clean_identifier($ident,$ns) || $ident;
 278   my $url = identifier_url($ident,$ns)
 279     or return;
 280   #clean up the identifier if we can
 281   my $fns = ucfirst join ' ',split /_/,$ns;
 282   return qq|<a title="search for this $fns" href="$url">$ident</a>|;
 283 }
 284
 285 =head2 identifier_namespace
 286
 287   Usage: my $ns = identifier_namespace('SGN-U1231');
 288          #returns 'sgn_u'
 289   Desc : get the namespace
 290   Ret  : a string containing the name of the namespace,
 291          or undef if it cannot identify the namespace
 292   Args : a string identifier
 293   Side Effects: might look things up in the database
 294
 295 =cut
 296
 297 #see bottom for namespace definitions
 298 sub identifier_namespace {
 299   my ($identifier) = @_;
 300   $identifier = trim($identifier)
 301     or return;
 302   #identifiers have to be more than 2 chars, and they can't be all numbers
 303   length($identifier) > 2 && $identifier =~ /\D/
 304     or return;
 305   foreach my $ns (our @namespace_list) {
 306     no strict 'refs';
 307     return $ns if "is_$ns"->($identifier);
 308     #warn "$identifier is not in $ns\n";
 309   }
 310   return;
 311 }
 312
 313 =head2 clean_identifier
 314
 315   Usage: my $newident = clean_identifier('SGNE1231');
 316   Desc : attempt to guess the namespace of the identifier,
 317          and clean up any irregularities in it to put
 318          it in its canonical form
 319   Ret  : a cleaned string, or undef if the identifier
 320          is not in any recognized namespace
 321   Args : identifier to be cleaned
 322   Side Effects: may look things up in the database
 323
 324 =cut
 325
 326 sub clean_identifier {
 327   my ($ident,$ns) = @_;
 328   $ident = trim($ident);
 329   $ns ||= identifier_namespace($ident)
 330     or return;
 331   return unless _is_valid_namespace($ns);
 332   no strict 'refs';
 333   return "clean_$ns"->($ident);
 334 }
 335
 336 =head2 list_namespaces
 337
 338   Usage: my @namespaces = list_namespaces;
 339   Desc : get the list of namespace names supported by this module
 340   Ret  : list of valid namespace names
 341   Args : none
 342   Side Effects: none
 343
 344 =cut
 345
 346 sub list_namespaces {
 347   return @namespace_list;
 348 }
 349
 350
 351 =head2 parse_identifier
 352
 353   Usage: my $data = parse_identifier($identifier, $namespace );
 354   Desc : many identifiers have data in them, for example, an SGN-E has the EST id
 355          in it, and a bac name (LE_HBa0001A02) has the organism, library, plate,
 356          row, and column in it.  This function parses that data out and gives it
 357          to you, as it appears in the string.  You might consider running
 358          clean_identifier() on what you give this function.
 359   Args : identifier to parse,
 360          optional namespace name the identifier is a member of,
 361                   guesses the namespace if not provided
 362   Ret  : nothing if the identifier could not be parsed,
 363          otherwise a hashref of data in the identifier, which varies in its
 364          contents, looking like
 365            {  namespace => 'namespace_name',
 366               <other data in the identifier>
 367            }
 368   Side Effects: none
 369   Example:
 370
 371      my $data = parse_identifier('C03HBa0001A02');
 372      #and now $data contains
 373      $data = { namespace => 'bac',
 374                lib       => 'LE_HBa',
 375                plate     => 1,
 376                row       => 'A',
 377                col       => 2,
 378                clonetype => 'bac',
 379                match     => 'C03HBa0001A02',
 380                chr       => 3,
 381              };
 382
 383 =cut
 384
 385 sub parse_identifier {
 386   my ($ident, $ns ) = @_;
 387
 388   $ident = trim( $ident);
 389   $ns ||= identifier_namespace($ident)
 390     or return;
 391
 392   return unless _is_valid_namespace($ns);
 393
 394   no strict 'refs';
 395   my $p = "parse_$ns"->($ident)
 396     or return;
 397   $p->{namespace} = $ns;
 398   return $p;
 399 }
 400
 401 =head1 NAMESPACE FUNCTIONS
 402
 403 These functions are not exported, and
 404 are only used internally by this module.
 405 To add a namespace, follow the instructions inside this
 406 file (they are in comments, not POD).
 407
 408 =cut
 409
 410 #for instructions, see ABOUT THE ARCHITECTURE OF THIS MODULE above
 411
 412 =head2 is_E<lt>namespaceE<gt>
 413
 414   Usage: is_sgn_e('SGN-E2342');
 415   Desc : check if an identifier is in a given namespace
 416   Ret  : 1 if the given identifier is in that namespace,
 417          0 otherwise
 418   Args : identifier string
 419   Side Effects: may look up things in the database
 420
 421 =head2 url_E<lt>namespaceE<gt>
 422
 423   Usage: url_sgn_e('SGN-E2342');
 424   Desc : get the info URL for a given identifier,
 425   Ret  : string with the URL, or undef if no
 426          url is available for this identifier
 427   Args : identifier string
 428   Side Effects: may look up things in the database
 429
 430   NOTE: These functions will ONLY be called if it has already
 431         been determined that the identifier is in that namespace.
 432
 433 =head2 clean_E<lt>namespaceE<gt>
 434
 435   Usage: my $clean = clean_sgn_e('sgne12311');
 436          #returns 'SGN-E12311'
 437   Desc : clean up any irregularities in the identifier string
 438   Ret  : cleaned up identifier string.  Should never fail,
 439          since this function will only be called on identifiers
 440          that are definitely in that namespace.
 441   Args : identifier string
 442   Side Effects: may look things up in the database
 443
 444   NOTE: These functions will ONLY be called if it has already
 445         been determined that the identifier is in that namespace.
 446         If your is_<namespace> function says it's that type of
 447         identifier, your clean_<namespace> function had better
 448         be able to clean it.
 449
 450 =cut
 451
 452 ######## sgn_u
 453 sub is_sgn_u {
 454   is_letter_identifier('sgn','u',shift);
 455 }
 456 sub url_sgn_u {
 457   "/search/unigene.pl?unigene_id=".$urlencode{uc($_[0])};
 458 }
 459 sub clean_sgn_u {
 460   clean_letter_identifier('sgn','u',shift);
 461 }
 462 sub parse_sgn_u {
 463   parse_letter_identifier('sgn','u',shift);
 464 }
 465 ######## cgn_u
 466 sub is_cgn_u {
 467   is_letter_identifier('cgn','u',shift);
 468 }
 469 sub url_cgn_u {
 470   my ($cgnid) = shift =~ /(\d+)/ or return undef;
 471   return "/search/unigene.pl?unigene_id=CGN-U$cgnid";
 472 }
 473 sub clean_cgn_u {
 474   clean_letter_identifier('cgn','u',shift);
 475 }
 476 sub parse_cgn_u {
 477   parse_letter_identifier('cgn','u',shift);
 478 }
 479 ######### sgn_e
 480 sub is_sgn_e {
 481   is_letter_identifier('sgn','e',shift);
 482 }
 483 sub url_sgn_e {
 484   "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=automatic&search=Search";
 485 }
 486 sub clean_sgn_e {
 487   clean_letter_identifier('sgn','e',shift);
 488 }
 489 sub parse_sgn_e {
 490   parse_letter_identifier('sgn','e',shift);
 491 }
 492 ######### sgn_s
 493 sub is_sgn_s {
 494   is_letter_identifier('sgn','s',shift);
 495 }
 496 sub url_sgn_s {
 497   "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=14&search=Search";
 498 }
 499 sub clean_sgn_s {
 500   clean_letter_identifier('sgn','s',shift);
 501 }
 502 sub parse_sgn_s {
 503   parse_letter_identifier('sgn','s',shift);
 504 }
 505 ######### sgn_m
 506 sub is_sgn_m {
 507   is_letter_identifier('sgn','m',shift);
 508 }
 509 sub url_sgn_m {
 510     my $id = shift;
 511     $id =~ s/sgn.*m(\d+)$/$1/i;
 512     return "/marker/SGN-M$id/details";
 513 }
 514 sub clean_sgn_m {
 515   clean_letter_identifier('sgn','m',shift);
 516 }
 517 sub parse_sgn_m {
 518   parse_letter_identifier('sgn','m',shift);
 519 }
 520 ######### sgn_t
 521 sub is_sgn_t {
 522   is_letter_identifier('sgn','t',shift);
 523 }
 524 sub url_sgn_t {
 525   "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=9&search=Search";
 526 }
 527 sub clean_sgn_t {
 528   clean_letter_identifier('sgn','t',shift);
 529 }
 530 sub parse_sgn_t {
 531   parse_letter_identifier('sgn','t',shift);
 532 }
 533 ######### sgn_locus_sequence
 534 sub is_sgn_locus_sequence {
 535   return 1 if shift =~ /[a-z]+_SGNlocusID_\d+_.+/i;
 536   return 0;
 537 }
 538 sub url_sgn_locus_sequence {
 539   my ($id) = shift =~ /SGNlocusID_(\d+)/i;
 540   return "/phenome/locus_display.pl?locus_id=$id";
 541 }
 542 sub clean_sgn_locus_sequence {
 543   my $p = parse_sgn_locus_sequence(shift);
 544   return join'_',$p->{species},'SGNlocusID',$p->{id},$p->{ext_id};
 545 }
 546 sub parse_sgn_locus_sequence {
 547   my ($species,undef,$id,$extid) = split /_/,shift,4;
 548   return { species => ucfirst($species),
 549            id  => $id+0,
 550            ext_id => $extid,
 551          };
 552 }
 553 ######### microarray_spot
 554 sub is_microarray_spot {
 555   return 1 if shift =~ /^\d-\d-\d+\.\d+\.\d+\.\d+$/;
 556   return 0;
 557 }
 558 sub url_microarray_spot {
 559   "/search/est.pl?request_id=$urlencode{$_[0]}&request_from=0&request_type=14&search=Search";
 560 }
 561 sub clean_microarray_spot {
 562   shift; #no cleaning is done here
 563 }
 564 sub parse_microarray_spot {
 565   warn 'WARNING: parsing not yet implemented for microarray_spot';
 566   return;
 567 }
 568 ######### est
 569 sub is_est {
 570   # XXX: stupid stupid stupid.  Coffee clones have names like
 571   # ccc<garbagegarbagegarbage>.
 572   return 0 if $_[0] =~ m|^ccc|i;
 573   return 1 if $_[0] =~ /^(c[A-Z]{2,3}|TUS)[^A-Z\d]*[0-9]+[^A-Z\d]*[A-P][^A-Z\d]*[0-9]{1,2}$/i;
 574   return 0;
 575 }
 576 sub url_est {
 577   "/search/est.pl?request_from=0&request_id=$urlencode{$_[0]}&request_type=automatic";
 578 }
 579 sub clean_est {
 580   my $ident = shift;
 581   $ident = uc($ident);
 582   $ident =~ s/^C/c/;
 583
 584   if ($ident =~ /^([A-Z]{3,4})[^A-Z\d]*([0-9]+)[^A-Z\d]*([A-P])[^A-Z\d]*([0-9]{1,2})$/i) {
 585     $ident = "$1-$2-$3$4";
 586   }
 587   return $ident;
 588 }
 589 sub parse_est {
 590   warn 'WARNING: parsing not yet implemented for est';
 591   return;
 592 }
 593 ######### bac_end
 594 sub is_bac_end {
 595   my $parsed = parse_clone_ident(shift,'bac_end')
 596     or return 0;
 597
 598   return 1;
 599 }
 600 sub url_bac_end {
 601   my $ident = shift;
 602   my $parsed = parse_clone_ident($ident,'bac_end')
 603     or confess 'not a valid bac end name';
 604   return "/maps/physical/clone_read_info.pl?chrid=$parsed->{chromat_id}";
 605 }
 606 sub clean_bac_end {
 607   my $ident = shift;
 608   my $parsed = parse_clone_ident($ident,'bac_end')
 609     or confess 'not a valid bac end name';
 610   my $gss = CXGN::Genomic::GSS->retrieve_from_parsed_name($parsed)
 611     or confess "could not fetch gss for ident '$ident'";
 612   return $gss->external_identifier;
 613 }
 614 sub parse_bac_end {
 615   parse_clone_ident(shift,'bac_end');
 616 }
 617 #bac
 618 sub is_bac {
 619   my ($ident) = @_;
 620   my $parsed = parse_clone_ident($ident,qw/agi_bac agi_bac_with_chrom old_cornell sanger_bac/)
 621       or return 0;
 622   #must match the whole identifier, cause we sometimes tack on
 623   #things to the ends of the names
 624   return 0 unless $parsed->{match} eq $ident and !defined($parsed->{version}) and !defined($parsed->{fragment});
 625   my $clone = _bac_cache($parsed)
 626     or return 0;
 627   return 1;
 628 }
 629 sub _bac_cache {
 630   #single-element cache of the last BAC ident we returned.  this speeds
 631   #up runs of multiple queries for the same bac
 632   my ($parsed) = @_;
 633   our $last_key;
 634   our $last_clone;
 635   my $key = join(',',@{$parsed}{qw/lib plate row col clonetype/});
 636   if($last_key && $last_key eq $key) {
 637 #    warn "cache hit $key\n";
 638     return $last_clone;
 639   } else {
 640 #    warn "cache miss $key\n";
 641     $last_key = $key;
 642     return $last_clone = CXGN::Genomic::Clone->retrieve_from_parsed_name($parsed);
 643   }
 644 }
 645 sub url_bac {
 646   my ($ident) = @_;
 647   my $parsed = parse_clone_ident($ident,qw/agi_bac agi_bac_with_chrom old_cornell sanger_bac/)
 648       or return undef;
 649   #must match the whole identifier, cause we sometimes tack on
 650   #things to the ends of the names
 651   return undef unless $parsed->{match} eq $ident and !defined($parsed->{version}) and !defined($parsed->{fragment});
 652   my $clone = _bac_cache($parsed)
 653     or return undef;
 654   return "/maps/physical/clone_info.pl?id=".$clone->clone_id;
 655 }
 656 our $insert_bac_chr_nums = 1;
 657 sub clean_bac {
 658   my ($ident) = @_;
 659   my $parsed = parse_clone_ident($ident,qw/agi_bac agi_bac_with_chrom old_cornell sanger_bac/)
 660     or return undef;
 661   #must match the whole identifier, cause we sometimes tack on
 662   #things to the ends of the names
 663   return undef unless $parsed->{match} eq $ident and !defined($parsed->{version}) and !defined($parsed->{fragment});
 664   my $clone = _bac_cache($parsed)
 665     or return undef;
 666   return (our $insert_bac_chr_nums) && $clone->chromosome_num
 667     ? $clone->clone_name_with_chromosome : $clone->clone_name;
 668
 669 }
 670 sub parse_bac {
 671   parse_clone_ident(shift,qw/agi_bac agi_bac_with_chrom old_cornell sanger_bac/);
 672 }
 673
 674 #tomato_bac_contig
 675 sub is_tomato_bac_contig {
 676   my ($ident) = @_;
 677   return 1 if $ident =~ /^C\d+\.\d+[^a-z\d]?contig\d+$/i;
 678   return 0;
 679 }
 680 sub url_tomato_bac_contig {
 681   my ($ident) = @_;
 682   return;
 683 }
 684 sub clean_tomato_bac_contig {
 685   my ($ident) = @_;
 686   $ident = uc $ident;
 687   $ident =~ s/CONTIG/contig/;
 688   $ident =~ s/[^a-z\d]?contig/_contig/;
 689   return $ident;
 690 }
 691 sub parse_tomato_bac_contig {
 692   my ($ident)  = @_;
 693   $ident =~ /^C(\d+)\.(\d+)[^a-z\d]?contig(\d+)$/i
 694     or return;
 695   return { chr        => $1+0,
 696            chr_ver    => $2+0,
 697            ver        => $2+0,
 698            ctg_num    => $3+0,
 699          };
 700 }
 701 #generic_scaffold
 702 sub is_generic_scaffold {
 703   return 1 if $_[0] =~ /^scaffold\d+$/i;
 704   return 0;
 705 }
 706 sub url_generic_scaffold {
 707   my ($ident) = @_;
 708   return;
 709 }
 710 sub clean_generic_scaffold {
 711   my ($ident) = @_;
 712   my ($d) = $ident =~ /(\d+)/;
 713   $d ||= 0;
 714   return 'scaffold'.($d+0);
 715 }
 716 sub parse_generic_scaffold {
 717   my ($ident)  = @_;
 718   $ident =~ /^scaffold(\d+)$/i
 719     or return;
 720   return { scaffold_num    => $1 };
 721 }
 722 #bac_sequence
 723 sub is_bac_sequence {
 724   my ($ident) = @_;
 725   my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
 726     or return 0;
 727   #must match the whole identifier, cause we sometimes tack on
 728   #things to the ends of the names
 729   return 0 unless $parsed->{match} eq $ident and defined($parsed->{version});
 730   my $clone = _bac_cache($parsed)
 731     or return 0;
 732   return 1;
 733 }
 734 sub url_bac_sequence {
 735   my ($ident) = @_;
 736   my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
 737         or return;
 738 #   use Data::Dumper;
 739 #   die "$ident -> ",Dumper($parsed);
 740   #must match the whole identifier, cause we sometimes tack on
 741   #things to the ends of the names
 742   return unless $parsed->{match} eq $ident and defined($parsed->{version});
 743   my $clone = _bac_cache($parsed)
 744     or return;
 745   return "/maps/physical/clone_info.pl?id=".$clone->clone_id;
 746 }
 747 sub clean_bac_sequence {
 748   my ($ident) = @_;
 749   my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
 750         or return;
 751   #must match the whole identifier, cause we sometimes tack on
 752   #things to the ends of the names
 753   return undef unless $parsed->{match} eq $ident and defined($parsed->{version});
 754   my $clone = _bac_cache($parsed)
 755     or return;
 756   return assemble_clone_ident('versioned_bac_seq',$parsed);
 757 }
 758 sub parse_bac_sequence {
 759   parse_clone_ident(shift,qw/versioned_bac_seq/);
 760 }
 761 #bac_fragment
 762 sub is_bac_fragment {
 763   my ($ident) = @_;
 764   my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
 765     or return 0;
 766   #must match the whole identifier, cause we sometimes tack on
 767   #things to the ends of the names
 768   return 0 unless $parsed->{match} eq $ident and defined($parsed->{version}) and defined($parsed->{fragment});
 769   my $clone = _bac_cache($parsed)
 770     or return 0;
 771   return 1;
 772 }
 773 sub url_bac_fragment {
 774   my ($ident) = @_;
 775   my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
 776         or return;
 777 #   use Data::Dumper;
 778 #   die "$ident -> ",Dumper($parsed);
 779   #must match the whole identifier, cause we sometimes tack on
 780   #things to the ends of the names
 781   return unless $parsed->{match} eq $ident and defined($parsed->{version}) and defined($parsed->{fragment});
 782   my $clone = _bac_cache($parsed)
 783     or return;
 784   return "/maps/physical/clone_info.pl?id=".$clone->clone_id;
 785 }
 786 sub clean_bac_fragment {
 787   my ($ident) = @_;
 788   my $parsed = parse_clone_ident($ident,qw/versioned_bac_seq/)
 789         or return;
 790   #must match the whole identifier, cause we sometimes tack on
 791   #things to the ends of the names
 792   return undef unless $parsed->{match} eq $ident and defined($parsed->{version}) and defined($parsed->{fragment});
 793   my $clone = _bac_cache($parsed)
 794     or return;
 795   return assemble_clone_ident(versioned_bac_seq => $parsed);
 796 }
 797 sub parse_bac_fragment {
 798   parse_clone_ident(shift,qw/ versioned_bac_seq / );
 799 }
 800 #tair_locus
 801 sub is_tair_locus {
 802   return 1 if shift =~ /^AT[1-5MC]G\d{5}$/i;
 803   return 0;
 804 }
 805 sub url_tair_locus {
 806   my ($locusname) = @_;
 807 #  $locusname =~ s/\.\d+$//;
 808   "http://arabidopsis.org/servlets/TairObject?type=locus&name=$urlencode{$locusname}"
 809 }
 810 sub clean_tair_locus {
 811   my $ident = shift;
 812   $ident =~ s/^at/At/i; #properly capitalize the first at
 813   return $ident;
 814 }
 815 sub parse_tair_locus {
 816   my ($ident) = @_;
 817   warn 'WARNING: parsing not yet implemented for TAIR locus';
 818   return;
 819 }
 820 #tair_gene_model
 821 sub is_tair_gene_model {
 822   return 1 if shift =~ /^AT[1-5MC]G\d{5}\.\d+$/i;
 823   return 0;
 824 }
 825 sub url_tair_gene_model {
 826   my $name = shift;
 827   "http://arabidopsis.org/servlets/TairObject?type=gene&name=$urlencode{$name}"
 828 }
 829 sub clean_tair_gene_model {
 830   my $ident = shift;
 831   $ident =~ s/^at/At/i; #properly capitalize the first at
 832   return $ident;
 833 }
 834 sub parse_tair_gene_model {
 835   warn 'WARNING: parsing not yet implemented for tair gene model';
 836   return;
 837 }
 838 #species binomial
 839 sub is_species_binomial {
 840   return 1 if shift =~ /^[a-z]+ [a-z]+$|^[a-z]\.\s*[a-z]+$/i;
 841   return 0;
 842 }
 843 sub url_species_binomial {
 844   _wikipedia_link(@_);
 845 }
 846 sub clean_species_binomial {
 847   my $ident = shift;
 848   $ident =~ s/\.(?=\S)/\. /g;
 849   $ident =~ s/\s+/ /g;
 850   $ident = lc($ident);
 851   $ident = ucfirst($ident);
 852   return $ident;
 853 }
 854 sub parse_species_binomial {
 855   my ($ident) = @_;
 856   my @w = split qr/\W+/, $ident;
 857   return unless @w == 2;
 858   return { genus   => $w[0],
 859            species => $w[1],
 860          };
 861 }
 862 sub _wikipedia_link {
 863   my ($ident) = @_;
 864   $ident =~ s/\s+/_/g;
 865   return 'http://en.wikipedia.org/wiki/Special:Search/'.$ident;
 866 }
 867 #uniprotKB-swissprot_accession
 868 sub is_swissprot_accession {
 869     my ($ident) = @_;
 870     return 1 if $ident =~ /^sp\|\w+\|\w+_\w+$/i;
 871 }
 872 sub url_swissprot_accession {
 873     my ($ident) = @_;
 874     $ident = clean_swissprot_accession($ident) or return;
 875     return "http://www.uniprot.org/uniprot/" . $ident;
 876 }
 877 sub clean_swissprot_accession {
 878     my ($ident) = @_;
 879     my $clean;
 880     if ($ident =~ m/^sp\|(\w+)\|\w+_\w+$/i) {
 881         $clean = $1;
 882     } else {
 883         $clean=$ident;
 884     }
 885     return $clean;
 886 }
 887 sub parse_swissprot_accession {
 888     my ($ident)=@_;
 889     unless ($ident =~ m/^sp\|(\w+)\|\w+_\w+$/i) {
 890         return { id => $1};
 891     } else {
 892         return;
 893     }
 894 }
 895
 896 #uniprot_uniref_accession
 897 sub is_uniref_accession {
 898     my ($ident) = @_;
 899     return 1 if $ident =~ m/^UniRef\d+_\w+$/i;
 900 }
 901 sub url_uniref_accession {
 902     my ($ident) = @_;
 903     $ident = clean_uniref_accession($ident) or return;
 904     return "http://www.uniprot.org/uniprot/".$ident;
 905 }
 906 sub clean_uniref_accession {
 907     my ($ident) = @_;
 908     my $clean;
 909     if ($ident =~ m/^UniRef\d+_(\w+)$/i) {
 910         $clean = $1;
 911     } else {
 912         $clean=$ident;
 913     }
 914     return $clean;
 915 }
 916 sub parse_uniref_accession {
 917     my ($ident)=@_;
 918     unless ($ident =~ m/^UniRef\d+_(\w+)$/i) {
 919         return { id => $1};
 920     } else {
 921         return;
 922     }
 923 }
 924
 925
 926 #genbank_gi
 927 sub is_genbank_gi {
 928   my ($ident) = @_;
 929   return 1 if $ident =~ /^gi[\|:]\d+[\|:]?$/i;
 930   return 0;
 931 }
 932 sub url_genbank_gi {
 933   my ($ident) = @_;
 934   $ident = clean_genbank_gi($ident) or return;
 935   "http://www.ncbi.nlm.nih.gov/gquery/gquery.fcgi?term=$urlencode{$ident}"
 936 }
 937 sub clean_genbank_gi {
 938   my ($ident) = @_;
 939   if($ident =~ /^gi[\|:](\d+)$/i) {
 940     return "gi|$1|";
 941   } else {
 942     return clean_genbank($ident)
 943   }
 944 }
 945 sub parse_genbank_gi {
 946   my ($ident) = @_;
 947   $ident =~ /^gi[\|:](\d+)[\|:]?$/i
 948     or return;
 949   return { gi => $1 + 0 };
 950 }
 951 #genbank_accession
 952 sub is_genbank_accession {
 953   my ($ident) = @_;
 954
 955    return 1 if
 956      $ident =~ /([a-z]{2,3})\|+\w+\d+(\.\d+)?\|?/i
 957       || $ident =~ /^[A-Z_]{1,5}\d{4,}(\.\d+)?$/;
 958   return 0;
 959 }
 960 sub url_genbank_accession {
 961   my ($ident) = @_;
 962   $ident = clean_genbank_accession($ident) or return;
 963
 964   return "http://www.ncbi.nlm.nih.gov/gquery/gquery.fcgi?term=$urlencode{$ident}";
 965 }
 966 sub clean_genbank_accession { clean_genbank(@_) };
 967 sub parse_genbank_accession {
 968   my ($ident) = @_;
 969
 970   my %parsed;
 971
 972   if( $ident =~ /[\|:]/ ) {
 973     my @fields = split /[\|:]+/, $ident;
 974     pop @fields if $fields[-1] =~ /^\[\d+\]$/;
 975
 976     while( my $field = shift @fields ) {
 977       if( lc $field eq 'gi' ) {
 978         my $gi = shift @fields;
 979         $parsed{gi} = $gi + 0;
 980       }
 981       elsif( lc $field eq 'gb' ) {
 982         my $acc = shift @fields;
 983         my $locus = shift @fields;
 984         $parsed{locus} = $locus if defined $locus;
 985         my $accver = _gb_acc_ver($acc);
 986         @parsed{keys %$accver} = values %$accver;
 987       }
 988       else {
 989         if( $fields[0] && $field !~ /\d/ ) {
 990           $parsed{$field} = shift @fields;
 991         } else {
 992           $parsed{unknown} ||= [];
 993           push @{$parsed{unknown}},$field;
 994         }
 995       }
 996     }
 997     return \%parsed;
 998   } else {
 999     return _gb_acc_ver($ident);
1000   }
1001 }
# Split 'ACCESSION.VERSION' into its two parts.
#   Ret: { accession => ..., version => N } when there is a numeric
#        version suffix, otherwise { accession => <the whole string> }
sub _gb_acc_ver {
  my ($id) = @_;
  return { accession => $id }
    unless $id =~ /^(\w+\d+)\.(\d+)$/;
  return { accession => $1,
           version   => $2+0,
         };
}
1014
# Shared cleanup for the genbank namespaces.
#   Args: identifier string
#   Ret : the identifier with a leading and the first internal 2-3
#         letter tag lower-cased, and the first bracketed number removed
sub clean_genbank {
  my ($ident) = @_;
  my $clean = $ident;
  $clean =~ s/^([a-z]{2,3})\|/\L$1\E|/i;     #lowercase initial gi and namespace idents
  $clean =~ s/\|([a-z]{2,3})\|/|\L$1\E|/i;   #lowercase internal gi and namespace idents
  $clean =~ s/\[\d+\]//;                     #remove any bracketed gi numbers
  return $clean;
}
1022
#genbank_cdd
# Conserved Domain Database entries: 'cdd' plus a number, with an
# optional 'gnl|' prefix and either '|' or ':' as the separator.
sub is_genbank_cdd {
  my ($ident) = @_;
  return $ident =~ /^(gnl\|)?cdd[\|:]\d+[\|:]?$/i ? 1 : 0;
}
sub url_genbank_cdd {
  my ($ident) = @_;
  #clean first, so that the parse below sees the canonical form
  my $clean = clean_genbank_cdd($ident);
  return unless $clean;
  my $parsed = parse_genbank_cdd($clean);
  return unless $parsed;
  return 'http://www.ncbi.nlm.nih.gov/sites/entrez/query.fcgi?db=cdd&term=' . $parsed->{id};
}
sub clean_genbank_cdd {
  my ($ident) = @_;
  #canonical form is built from the first run of digits; with no
  #digits at all, fall back on the generic genbank cleanup
  return clean_genbank($ident)
    unless $ident =~ /(\d+)/i;
  return "gnl|CDD|$1";
}
sub parse_genbank_cdd {
  my ($ident) = @_;
  return unless $ident =~ /cdd[\|:](\d+)[\|:]?$/i;
  return { id => $1 + 0 };
}
1049
1050
#interpro accession
# 'IPR' followed by digits, in any letter case.
sub is_interpro_accession {
  my ($ident) = @_;
  return 1 if $ident =~ /^IPR\d+$/i;
  return 0;
}
sub url_interpro_accession {
  my ($ident) = @_;
  $ident = clean_interpro_accession($ident) or return;
  return "http://www.ebi.ac.uk/interpro/IEntry?ac=" . $ident;
}
sub clean_interpro_accession {
  my ($ident) = @_;
  #the canonical form is all upper case
  return uc($ident);
}
# Ret: { id => <number> }, or nothing if the identifier is not an
#      IPR accession
sub parse_interpro_accession {
  my ($ident) = @_;
  #case-insensitive, to agree with is_interpro_accession; with a
  #case-sensitive match here, an uncleaned identifier like 'ipr000123'
  #was recognized as this namespace but could not be parsed
  return unless $ident =~ /^IPR(\d+)$/i;
  return { id => $1+0 };
}
1071
1072
#go term
# 'GO', at most one separator character, and three or more digits,
# in any letter case.
sub is_go_term {
  my ($ident) = @_;
  return 1 if $ident =~ /^GO[^a-zA-Z\d]?\d{3,}$/i;
  return 0;
}
sub url_go_term {
  my ($ident) = @_;
  $ident = clean_go_term($ident) or return;
  return "http://www.geneontology.org/cgi-bin/chooser.cgi?search_query=$urlencode{$ident}&search_constraint=terms";
}
sub clean_go_term {
  my ($ident) = @_;
  #canonical form is 'GO:' plus the digits zero-padded to 7 places
  my @d = $ident =~ /\d+/g;
  return sprintf('GO:%07d',join('',@d));
}
# Ret: { id => <number> }, or nothing if the identifier does not
#      start with 'GO'
sub parse_go_term {
  my ($ident) = @_;
  #case-insensitive, to agree with is_go_term; with a case-sensitive
  #match here, an uncleaned identifier like 'go:0001234' was
  #recognized as this namespace but could not be parsed
  return unless $ident =~ /^GO/i;
  my @d = $ident =~ /(\d+)/g;
  return { id => join('',@d)+0 };
}
1095
1096
1097 #### NAMESPACE HELPERS ###
1098
#return 1 if the identifier is a SGN-X234232 identifier
#where X is the letter of your choice
#  Args: database name (e.g. 'sgn'), type letter, identifier string
#  Ret : 1 or 0
#
# NOTE(review): in the pattern, the '?' after the database name binds
# to its last character only, so 'SGE123' and 'SG-E123' are accepted
# while a bare 'E123' is not; and since there is no end anchor, any
# text may follow the digits and the {1,9} upper bound rejects
# nothing.  Confirm whether this leniency is intended before
# tightening it.
sub is_letter_identifier {
  my ($dbname,$letter,$identifier) = @_;
  $dbname = uc($dbname);
  return $identifier =~ /^$dbname?\W?$letter\d{1,9}/i ? 1 : 0;
}
1107
# URL of the quick search page for a search term.
#   Args: search term
#   Ret : URL string with the term url-encoded
sub quick_search_url {
  my ($term) = @_;
  return '/search/quick_search.pl?term=' . $urlencode{$term};
}
1111
# Put a letter identifier in its canonical DB-L<number> form.
#   Args: database name (e.g. 'sgn'), type letter, identifier string
#   Ret : e.g. 'SGN-E12311', built from the first run of digits in the
#         identifier with leading zeros dropped; nothing if the
#         identifier contains no digits
sub clean_letter_identifier {
  my ($dbname,$letter,$identifier) = @_;
  return unless $identifier =~ /(\d+)/;
  my $number = $1 + 0;
  return uc($dbname) . '-' . uc($letter) . $number;
}
1121
# Pull the numeric id out of a letter identifier.
#   Args: database name, type letter (neither affects the result),
#         identifier string
#   Ret : { id => <number> } from the first run of digits in the
#         identifier, or nothing if it contains no digits
sub parse_letter_identifier {
  my ($dbname,$letter,$identifier) = @_;
  return unless $identifier =~ /(\d+)/;
  return { id => $1 + 0 };
}
1130
1131
1132 ###
1133 1;#do not remove
1134 ###