lib/CXGN/Tools/Text.pm

   1
   2 =head1 NAME
   3
   4 CXGN::Tools::Text
   5
   6 =head1 DESCRIPTION
   7
   8 Various tools for interpreting and displaying text strings.
   9
  10 =head1 FUNCTIONS
  11
  12 =head2 list_to_string
  13
  14 Takes a list, puts it into a string with commas and the word "and" before the last item.
  15
  16 =head2 is_all_letters
  17
  18 Takes a string, returns 1 if the string is all letters, 0 if not.
  19
  20 =head2 is_number
  21
  22 Takes a string, tests to see if it meets this pattern: optional + or
  23 -, 0 or more digits, followed by either: "." and one or more digits,
  24 or, just one or more digits. This should catch most normal ways that a
  25 user would enter a number. This function might be improved by
  26 returning the number that was contained in the string instead of just
  27 "1" (in case perl can't cast it on its own... i've never checked to
  28 see if perl can parse an initial "+" for instance)
  29
  30 =head2 trim
  31
  32 Takes a string and returns the string without leading or trailing
  33 whitespaces.
  34
  35 =head2 remove_all_whitespaces
  36
Takes a string and returns it with all whitespace removed, including
the whitespace between words: 'a spaced sentence' becomes
'aspacedsentence'.
  40
  41 =head2 strip_unprintables
  42
  43 [NOT YET IMPLEMENTED -apparently as of March 2009] This function is
  44 still under development. It is meant to clean input of escape
  45 characters in preparation for display, database insertion,
  46 etc. However, different machines apparently are working with different
  47 character sets, so the higher characters cannot be cleaned reliably
  48 (when i tried to clean higher characters, this function produced
  49 different output on my machine than it did on the devel machine). For
  50 now, it just cleans out the lower characters.
  51
  52
  53 =head2 abbr_latin
  54
  55   Desc: abbreviate some latin words in your string and return
  56         the new abbreviated version
  57   Args: string
  58   Ret : string with abbreviations
  59   Side Effects: none
  60   Example:
  61
  62    my $tomato = 'Lycopersicon esculentum';
  63    my $abbr = abbr_latin($tomato);
  64    print $abbr,"\n";
  65    #will print 'L. esculentum'
  66
  67   Currently abbreviates Solanum, Lycopersicon, Capsicum, Nicotiana,
  68   and Coffea.
  69
  70 =cut
  71
  72 package CXGN::Tools::Text;
  73 use strict;
  74 use Carp;
  75
  76 BEGIN {
  77     our @EXPORT_OK = qw/
  78       list_to_string
  79       is_all_letters
  80       is_number
  81       is_garbage
  82       trim
  83       commify_number
  84       remove_all_whitespaces
  85       strip_unprintables
  86       abbr_latin
  87       to_tsquery_string
  88       from_tsquery_string
  89       parse_pg_arraystr
  90       sanitize_string
  91       truncate_string
  92       /;
  93 }
  94 our @EXPORT_OK;
  95 use base qw/Exporter/;
  96
  97 #returns the contents of the array in a string of the form "$_[0], $_[1],...., and $_[end]"
  98 sub list_to_string {
  99         ( @_ == 0 ) ? ''
 100       : ( @_ == 1 ) ? $_[0]
 101       : ( @_ == 2 ) ? join( " and ", @_ )
 102       :               join( ", ", @_[ 0 .. ( $#_ - 1 ) ], "and $_[-1]" );
 103 }
 104
 105 #test a string to see if it is one continuous string of letters
 106 sub is_all_letters {
 107     my ($string) = @_;
 108     if ( defined($string)
 109         && $string =~ /^[A-Za-z]+$/i
 110       )    #if there are one or more letters with no spaces in the string
 111     {
 112         return 1;
 113     }
 114     else { return 0; }
 115 }
 116
 117 #test a string to see if it is a number
 118 sub is_number {
 119     my ($string) = @_;
 120     if ( defined($string)
 121         && $string =~ /^([+\-]?)\d*(\.\d+|\d+)$/
 122       ) #optional + or -, 0 or more digits, followed by (. and one or more digits) or (just one or more digits)
 123     {
 124         return 1;
 125     }
 126     else { return 0; }
 127 }
 128
 129 #trim whitespace from string
 130 sub trim {
 131     my ($string) = @_;
 132     $string =~ s/^\s+|\s+$//g if defined $string;
 133     return $string;
 134 }
 135
 136 #remove_all all whitespace in string
 137 sub remove_all_whitespaces {
 138     my ($string) = @_;
 139     if ( defined($string) ) {
 140         $string =~ s/\s+//g;
 141     }
 142     return $string;
 143 }
 144
 145 sub abbr_latin {
 146     my ($string) = @_;
 147     if ( defined($string) ) {
 148         $string =~ s/Solanum/S\./g;
 149         $string =~ s/Lycopersicon/L\./g;
 150         $string =~ s/Capsicum/C\./g;
 151         $string =~ s/Nicotiana/N\./g;
 152         $string =~ s/Coffea/C\./g;
 153     }
 154     return $string;
 155 }
 156
 157 =head2 sanitize_string
 158
 159  Usage:        my $sanitized = sanitize_string($dirty)
 160  Desc:         removes {, }, <, >, and ; characters from
 161                string $dirty and returns the sanitized
 162                string.
 163  Side Effects:
 164  Example:
 165
 166 =cut
 167
 168 sub sanitize_string {
 169     my $s = shift;
 170     $s = trim($s);
 171     $s =~ s/\}|\{|\>|\<|\;//g;
 172     return $s;
 173 }
 174
 175
 176 =head2 function format_field_text()
 177
 178   Synopsis:
 179   Arguments:
 180   Returns:
 181   Side effects:
 182   Description:  formats a post or topic text for display.
 183                 Note that it converts certain embedded tags to
 184                 html links. This function does not assure security
 185                 - use the get_encoded_arguments in the CXGN::Page
 186                 object for that purpose.
 187
 188                Tags supported:
 189                [url][/url]
 190                [link][ref][\ref][\link]     the difference between [link] and [ilink] is that [link] add
 191                [ilink][ref][\ref][\ilink]   http:// if do not find it. [ilink] not.
 192                [i][/i]
 193                \n
 194
 195 =cut
 196
 197
 198 sub format_field_text {
 199     my $post_text = shift;
 200
 201     # support vB script url tag
 202     while ($post_text =~ /\[url\](.*?)\[\/url\]/g ) {
 203         my $link = $1;
 204         my $replace_link = $link;
 205         if ($link !~ /^http/i) {
 206             $replace_link = "http:\/\/$link";
 207         }
 208         $link=~ s/\?/\\?/g;
 209         $post_text =~ s/\[url\]$link\[\/url\]/\<a href=\"$replace_link\"\>$replace_link\<\/a\>/g;
 210
 211     }
 212     while ($post_text =~ /\[link\](.*?)\[ref\](.*?)\[\/ref\]\[\/link\]/g ) {
 213         my $link = $1;
 214         my $ref=$2;
 215         my $replace_link = $link;
 216         if ($link !~ /^http/i) {
 217             $replace_link = "http:\/\/$link";
 218         }
 219         $link=~ s/\?/\\?/g;
 220         $post_text =~ s/\[link\]$link\[ref\]$ref\[\/ref\]\[\/link\]/\<a href=\"$replace_link\"\>$ref<\/a\>/g;
 221     }
 222     ## New tag, internal link. [ilink] that works in the same way that link but do not any http:// if do not find it
 223     while ($post_text =~ /\[ilink\](.*?)\[ref\](.*?)\[\/ref\]\[\/ilink\]/g ) {
 224         my $link = $1;
 225         my $ref=$2;
 226         my $replace_link = $link;
 227         $link=~ s/\?/\\?/g;
 228         $post_text =~ s/\[ilink\]$link\[ref\]$ref\[\/ref\]\[\/ilink\]/\<a href=\"$replace_link\"\>$ref<\/a\>/g;
 229     }
 230     # italics tag
 231     while ($post_text =~ /\[i\](.*?)\[\/i\]/g ) {
 232         my $itext = $1;
 233         my $replace_text = $itext;
 234
 235         $itext=~ s/\?/\\?/g;
 236         $post_text =~ s/\[i\]$itext\[\/i\]/\<i\>$replace_text\<\/i\>/g;
 237     }
 238     # convert newlines to <br /> tags
 239     #
 240     $post_text =~ s/\n/\<br \/\>/g;
 241     return $post_text;
 242 }
 243
 244
 245
 246 =head2 to_tsquery_string
 247
 248   Desc: format a plain-text string for feeding to Postgres to_tsquery
 249         function
 250   Args: list of strings to convert
 251   Ret : in scalar context: the first converted string,
 252         in list context:   list of converted strings
 253   Side Effects: none
 254   Example:
 255
 256     my $teststring = 'gi|b4ogus123|blah is bogus & I hate it!';
 257     to_tsquery_string($teststring);
 258     #returns 'gi\\|b4ogus123\\|blah|is|bogus|\\&|I|hate|it\\!'
 259
 260 =cut
 261
 262 sub to_tsquery_string {
 263     ($_) = @_;
 264
 265     $_ = trim($_);
 266
 267     # Escape pipes
 268     s/\|/\\\|/g;
 269
 270     # Escape ampersands and exclamation points
 271     s/([&!])/\\\\$1/g;
 272
 273     # Escape parentheses and colons.
 274     s/([():])/\\$1/g;
 275
 276     # And together all strings
 277     s/\s+/&/g;
 278     return $_;
 279 }
 280
 281 =head2 from_tsquery_string
 282
 283   Desc: attempt to recover the original string from the product
 284         of to_tsquery_string()
 285   Args: list of strings
 286   Ret : list of de-munged strings
 287   Side Effects: none
 288   Example:
 289
 290 =cut
 291
 292 sub from_tsquery_string {
 293     my @args = @_;
 294
 295     foreach (@args) {
 296         next unless defined $_;
 297         s/(?<!\\)&/ /g;        #& not preceded by backslashes is a space
 298         s/\\\\([^\\])/$1/g;    #anything double-backslashed
 299         s/\\(.)/$1/g;          #anything single-backslashed
 300     }
 301     return wantarray ? @args : $args[0];
 302 }
 303
 304 =head2 parse_pg_arraystr
 305
 306   Usage: my $arrayref = parse_pg_arraystr('{1234,543}');
 307   Desc : parse the string representation of a postgres array, returning
 308          an arrayref
 309   Args : string representation of postgres array
 310   Ret  : an arrayref
 311   Side Effects: none
 312
 313 =cut
 314
 315 sub parse_pg_arraystr {
 316     my ($str) = @_;
 317
 318     return [] unless $str;
 319
 320     my $origstr = $str;
 321
 322     #remove leading and trailing braces
 323     $str =~ s/^{//
 324       or croak "malformed array string '$origstr'";
 325     $str =~ s/}$//
 326       or croak "malformed array string '$origstr'";
 327
 328     return [
 329         do {
 330             if ( $str =~ /^"/ ) {
 331                 $str =~ s/^"|"$//g;
 332                 split /","/, $str;
 333             }
 334             else {
 335                 split /,/, $str;
 336             }
 337           }
 338     ];
 339 }
 340
 341
 342 =head2 commify_number
 343
 344   Args: a number
 345   Ret : a string containing the commified version of it
 346
 347   Example: commify_number(230400) returns '230,400'
 348
 349 =cut
 350
 351 sub commify_number {
 352   local $_  = shift;
 353   return undef unless defined $_;
 354   1 while s/^(-?\d+)(\d{3})/$1,$2/;
 355   $_;
 356 }
 357
 358
 359 =head2 truncate_string
 360
 361   Desc: truncate a string that might be long so that it fits in a manageable
 362         length, adding an arbitrary string (default '&hellip;') to the end if
 363         necessary.  If the string is shorter than the given truncation
 364         length, simply returns the string unaltered.  If the truncated
 365         string would have whitespace between the end of the given
 366         string and the addon string, drops that whitespace.
 367   Args: string to truncate, optional truncation length (default 50),
 368         optional truncation addon (default '...')
 369   Ret : in scalar context:   truncated string
 370         in list context:     (truncated string,
 371                               boolean telling whether string was truncated)
 372
 373   Example:
 374     truncate_string('Honk if you love ducks',6);
 375     #would return
 376     'Honk i&hellip;'
 377
 378     truncate_string('Honk if you love cats',5);
 379     #would return
 380     'Honk&hellip;'
 381     #because this function drops trailing whitespace
 382
 383 =cut
 384
 385 sub truncate_string {
 386   my ($string,$length,$addon) = @_;
 387   $length ||= 50;
 388   $addon ||= '...';
 389
 390   my $was_truncated = 0;
 391   if( length($string) > $length) {
 392     $string = substr($string,0,$length).$addon;
 393     $was_truncated = 1;
 394   }
 395
 396   return wantarray ? ($string,$was_truncated) : $string;
 397 }
 398
 399 =head1 AUTHOR
 400
 401 john binns - zombieite@gmail.com
 402 Robert Buels - rmb32@cornell.edu
 403
 404 =cut
 405
 406 ###
 407 1;    #do not remove
 408 ###