a class to extract sequences from the genome
[cxgn-corelibs.git] / lib / CXGN / Tools / Text.pm
bloba71d76e228da2cbfc43fd69313ed2eb185e3b12a
2 =head1 NAME
4 CXGN::Tools::Text
6 =head1 DESCRIPTION
8 Various tools for interpreting and displaying text strings.
10 =head1 FUNCTIONS
12 =head2 list_to_string
14 Takes a list, puts it into a string with commas and the word "and" before the last item.
16 =head2 is_all_letters
18 Takes a string, returns 1 if the string is all letters, 0 if not.
20 =head2 is_number
22 Takes a string, tests to see if it meets this pattern: optional + or
23 -, 0 or more digits, followed by either: "." and one or more digits,
24 or, just one or more digits. This should catch most normal ways that a
25 user would enter a number. This function might be improved by
26 returning the number that was contained in the string instead of just
27 "1" (in case perl can't cast it on its own... i've never checked to
28 see if perl can parse an initial "+" for instance)
30 =head2 trim
32 Takes a string and returns the string without leading or trailing
33 whitespaces.
35 =head2 remove_all_whitespaces
37 Takes a string and returns it without any whitespaces in it at all
38 anymore. If you sent in a spaced sentence, your spaces would be
39 removed.
41 =head2 strip_unprintables
43 [NOT YET IMPLEMENTED -apparently as of March 2009] This function is
44 still under development. It is meant to clean input of escape
45 characters in preparation for display, database insertion,
46 etc. However, different machines apparently are working with different
47 character sets, so the higher characters cannot be cleaned reliably
48 (when i tried to clean higher characters, this function produced
49 different output on my machine than it did on the devel machine). For
50 now, it just cleans out the lower characters.
53 =head2 abbr_latin
55 Desc: abbreviate some latin words in your string and return
56 the new abbreviated version
57 Args: string
58 Ret : string with abbreviations
59 Side Effects: none
60 Example:
62 my $tomato = 'Lycopersicon esculentum';
63 my $abbr = abbr_latin($tomato);
64 print $abbr,"\n";
65 #will print 'L. esculentum'
67 Currently abbreviates Solanum, Lycopersicon, Capsicum, Nicotiana,
68 and Coffea.
70 =cut
72 package CXGN::Tools::Text;
73 use strict;
74 use Carp;
76 BEGIN {
77 our @EXPORT_OK = qw/
78 list_to_string
79 is_all_letters
80 is_number
81 is_garbage
82 trim
83 commify_number
84 remove_all_whitespaces
85 strip_unprintables
86 abbr_latin
87 to_tsquery_string
88 from_tsquery_string
89 parse_pg_arraystr
90 sanitize_string
91 truncate_string
94 our @EXPORT_OK;
95 use base qw/Exporter/;
97 #returns the contents of the array in a string of the form "$_[0], $_[1],...., and $_[end]"
98 sub list_to_string {
99 ( @_ == 0 ) ? ''
100 : ( @_ == 1 ) ? $_[0]
101 : ( @_ == 2 ) ? join( " and ", @_ )
102 : join( ", ", @_[ 0 .. ( $#_ - 1 ) ], "and $_[-1]" );
105 #test a string to see if it is one continuous string of letters
106 sub is_all_letters {
107 my ($string) = @_;
108 if ( defined($string)
109 && $string =~ /^[A-Za-z]+$/i
110 ) #if there are one or more letters with no spaces in the string
112 return 1;
114 else { return 0; }
117 #test a string to see if it is a number
118 sub is_number {
119 my ($string) = @_;
120 if ( defined($string)
121 && $string =~ /^([+\-]?)\d*(\.\d+|\d+)$/
122 ) #optional + or -, 0 or more digits, followed by (. and one or more digits) or (just one or more digits)
124 return 1;
126 else { return 0; }
129 #trim whitespace from string
130 sub trim {
131 my ($string) = @_;
132 $string =~ s/^\s+|\s+$//g if defined $string;
133 return $string;
136 #remove_all all whitespace in string
137 sub remove_all_whitespaces {
138 my ($string) = @_;
139 if ( defined($string) ) {
140 $string =~ s/\s+//g;
142 return $string;
145 sub abbr_latin {
146 my ($string) = @_;
147 if ( defined($string) ) {
148 $string =~ s/Solanum/S\./g;
149 $string =~ s/Lycopersicon/L\./g;
150 $string =~ s/Capsicum/C\./g;
151 $string =~ s/Nicotiana/N\./g;
152 $string =~ s/Coffea/C\./g;
154 return $string;
157 =head2 sanitize_string
159 Usage: my $sanitized = sanitize_string($dirty)
160 Desc: removes {, }, <, >, and ; characters from
161 string $dirty and returns the sanitized
162 string.
163 Side Effects:
164 Example:
166 =cut
168 sub sanitize_string {
169 my $s = shift;
170 $s = trim($s);
171 $s =~ s/\}|\{|\>|\<|\;//g;
172 return $s;
176 =head2 function format_field_text()
178 Synopsis:
179 Arguments:
180 Returns:
181 Side effects:
182 Description: formats a post or topic text for display.
183 Note that it converts certain embedded tags to
184 html links. This function does not assure security
185 - use the get_encoded_arguments in the CXGN::Page
186 object for that purpose.
188 Tags supported:
189 [url][/url]
190 [link][ref][\ref][\link] the difference between [link] and [ilink] is that [link] add
191 [ilink][ref][\ref][\ilink] http:// if do not find it. [ilink] not.
192 [i][/i]
195 =cut
198 sub format_field_text {
199 my $post_text = shift;
201 # support vB script url tag
202 while ($post_text =~ /\[url\](.*?)\[\/url\]/g ) {
203 my $link = $1;
204 my $replace_link = $link;
205 if ($link !~ /^http/i) {
206 $replace_link = "http:\/\/$link";
208 $link=~ s/\?/\\?/g;
209 $post_text =~ s/\[url\]$link\[\/url\]/\<a href=\"$replace_link\"\>$replace_link\<\/a\>/g;
212 while ($post_text =~ /\[link\](.*?)\[ref\](.*?)\[\/ref\]\[\/link\]/g ) {
213 my $link = $1;
214 my $ref=$2;
215 my $replace_link = $link;
216 if ($link !~ /^http/i) {
217 $replace_link = "http:\/\/$link";
219 $link=~ s/\?/\\?/g;
220 $post_text =~ s/\[link\]$link\[ref\]$ref\[\/ref\]\[\/link\]/\<a href=\"$replace_link\"\>$ref<\/a\>/g;
222 ## New tag, internal link. [ilink] that works in the same way that link but do not any http:// if do not find it
223 while ($post_text =~ /\[ilink\](.*?)\[ref\](.*?)\[\/ref\]\[\/ilink\]/g ) {
224 my $link = $1;
225 my $ref=$2;
226 my $replace_link = $link;
227 $link=~ s/\?/\\?/g;
228 $post_text =~ s/\[ilink\]$link\[ref\]$ref\[\/ref\]\[\/ilink\]/\<a href=\"$replace_link\"\>$ref<\/a\>/g;
230 # italics tag
231 while ($post_text =~ /\[i\](.*?)\[\/i\]/g ) {
232 my $itext = $1;
233 my $replace_text = $itext;
235 $itext=~ s/\?/\\?/g;
236 $post_text =~ s/\[i\]$itext\[\/i\]/\<i\>$replace_text\<\/i\>/g;
238 # convert newlines to <br /> tags
240 $post_text =~ s/\n/\<br \/\>/g;
241 return $post_text;
246 =head2 to_tsquery_string
248 Desc: format a plain-text string for feeding to Postgres to_tsquery
249 function
250 Args: list of strings to convert
251 Ret : in scalar context: the first converted string,
252 in list context: list of converted strings
253 Side Effects: none
254 Example:
256 my $teststring = 'gi|b4ogus123|blah is bogus & I hate it!';
257 to_tsquery_string($teststring);
258 #returns 'gi\\|b4ogus123\\|blah|is|bogus|\\&|I|hate|it\\!'
260 =cut
262 sub to_tsquery_string {
263 ($_) = @_;
265 $_ = trim($_);
267 # Escape pipes
268 s/\|/\\\|/g;
270 # Escape ampersands and exclamation points
271 s/([&!])/\\\\$1/g;
273 # Escape parentheses and colons.
274 s/([():])/\\$1/g;
276 # And together all strings
277 s/\s+/&/g;
278 return $_;
281 =head2 from_tsquery_string
283 Desc: attempt to recover the original string from the product
284 of to_tsquery_string()
285 Args: list of strings
286 Ret : list of de-munged strings
287 Side Effects: none
288 Example:
290 =cut
292 sub from_tsquery_string {
293 my @args = @_;
295 foreach (@args) {
296 next unless defined $_;
297 s/(?<!\\)&/ /g; #& not preceded by backslashes is a space
298 s/\\\\([^\\])/$1/g; #anything double-backslashed
299 s/\\(.)/$1/g; #anything single-backslashed
301 return wantarray ? @args : $args[0];
304 =head2 parse_pg_arraystr
306 Usage: my $arrayref = parse_pg_arraystr('{1234,543}');
307 Desc : parse the string representation of a postgres array, returning
308 an arrayref
309 Args : string representation of postgres array
310 Ret : an arrayref
311 Side Effects: none
313 =cut
315 sub parse_pg_arraystr {
316 my ($str) = @_;
318 return [] unless $str;
320 my $origstr = $str;
322 #remove leading and trailing braces
323 $str =~ s/^{//
324 or croak "malformed array string '$origstr'";
325 $str =~ s/}$//
326 or croak "malformed array string '$origstr'";
328 return [
329 do {
330 if ( $str =~ /^"/ ) {
331 $str =~ s/^"|"$//g;
332 split /","/, $str;
334 else {
335 split /,/, $str;
342 =head2 commify_number
344 Args: a number
345 Ret : a string containing the commified version of it
347 Example: commify_number(230400) returns '230,400'
349 =cut
351 sub commify_number {
352 local $_ = shift;
353 return undef unless defined $_;
354 1 while s/^(-?\d+)(\d{3})/$1,$2/;
359 =head2 truncate_string
361 Desc: truncate a string that might be long so that it fits in a manageable
362 length, adding an arbitrary string (default '&hellip;') to the end if
363 necessary. If the string is shorter than the given truncation
364 length, simply returns the string unaltered. If the truncated
365 string would have whitespace between the end of the given
366 string and the addon string, drops that whitespace.
367 Args: string to truncate, optional truncation length (default 50),
368 optional truncation addon (default '...')
369 Ret : in scalar context: truncated string
370 in list context: (truncated string,
371 boolean telling whether string was truncated)
373 Example:
374 truncate_string('Honk if you love ducks',6);
375 #would return
376 'Honk i&hellip;'
378 truncate_string('Honk if you love cats',5);
379 #would return
380 'Honk&hellip;'
381 #because this function drops trailing whitespace
383 =cut
385 sub truncate_string {
386 my ($string,$length,$addon) = @_;
387 $length ||= 50;
388 $addon ||= '...';
390 my $was_truncated = 0;
391 if( length($string) > $length) {
392 $string = substr($string,0,$length).$addon;
393 $was_truncated = 1;
396 return wantarray ? ($string,$was_truncated) : $string;
399 =head1 AUTHOR
401 john binns - zombieite@gmail.com
402 Robert Buels - rmb32@cornell.edu
404 =cut
407 1; #do not remove