Merge pull request #5248 from solgenomics/topic/batch_update_trials
[sgn.git] / cgi-bin / tools / ssr / show_ssr.pl
blob1fecfaa39ba6a93edf702ae7c3df2a8dc7ef0f91
2 =head1 NAME
4 show_ssr.pl - find SSR sequences
6 =head1 DESCRIPTION
8 Generates an simple table with SSR sequences from data submitted through the input page (index.pl in this directory). It is a web script that supports the following cgi parameters:
10 =over 5
12 =item seq_data
14 seq_data is the actual data uploaded; it should be in FASTA format.
16 =item upload
18 upload is an optional file upload containing a FASTA formatted sequence as input
20 =item ssr_min
22 denotes the minimum length needed to be found.
24 =item ssr_max
26 denotes the maximum length needed to be found.
28 =item ssr_repeat
30 denotes the minimum number of times a substring must be repeated to be found.
32 =item type
34 =over 5
36 =item 'html' for HTML output
38 =item 'tab' for tab delimited output
40 =back 5
42 =back
44 =head1 AUTHOR(S)
46 Search code by Chenwei Lin (cl295@cornell.edu) with edits by Robert Albright (rfa5). Documentation and HTML interface by Robert Albright (rfa5@cornell.edu).
48 =cut
50 use strict;
51 use CXGN::Page;
52 use CXGN::Page::FormattingHelpers qw/ page_title_html
53 blue_section_html /;
54 use Bio::SeqIO;
56 our $page = CXGN::Page->new("SSR Search Results", "rfa5");
57 use CatalystX::GlobalContext qw( $c );
59 if ($c->req->method() ne "POST") {
60 post_only($page);
61 exit;
65 my ($output_type, $unit_low, $unit_high, $repeat_time, $seq_data) = $page->get_arguments("type", "ssr_min", "ssr_max", "ssr_repeat", "seq_data");
67 # Check for input errors
68 my $upload = $page->get_upload("upload");
69 my $upload_fh;
71 # print STDERR "Uploading file $args{upload_file}...\n";
72 if (defined $upload) {
73 $upload_fh = $upload->fh();
75 while (<$upload_fh>) {
76 $seq_data .=$_;
80 if ($seq_data eq "") {
81 user_error($page, "No sequence was entered!");
85 # Perform SSR search
86 my %out = search($unit_low, $unit_high, $repeat_time, $seq_data);
88 if ($output_type eq "tab") {
89 print " ";
90 print $out{tab};
91 } elsif ($output_type eq "html") {
92 $page->header();
93 print page_title_html("SSR Search Results");
94 print blue_section_html("$out{num} SSRs Found",'<table width="100%" cellpadding="5" cellspacing="0" border="0" summary=""><tr><td>' . $out{html} . '</td></tr></table>');
95 $page -> footer();
96 } else {
97 user_error($page, "Unrecognized output type!");
100 sub search {
101 (@_ == 4) or die "Please supply minimum unit length, maximum unit length, minimum repeat time, input sequence (in blast format), true if output should be HTML, false if tab delimited.";
103 my ($unit_low, $unit_high, $repeat_time, $inseq) = @_;
104 $repeat_time = $repeat_time - 1;
105 my %seq = ();
107 # open input string as a file
108 open IN, "+<", \$inseq or die "Couldn't open input file \n";
109 my $seqio = Bio::SeqIO->new( -format=>"fasta",
110 -fh => \*IN,
113 #Search and write result to output file
114 my $result_table = '<table width="100%"><tr><th>Sequence</th><th>Motif</th><th>Repeat</th><th>Start</th><th>Length</th></tr>'."\n";
115 my $result_tab = "Sequence\tMotif\tRepeat\tStart\tLength\n";
117 open OUT_TAB, "+>>", \$result_tab or die "Couldn't open output file\n";
118 open OUT_TABLE, "+>>", \$result_table or die "Couldn't open output file\n";
120 my $numResults=0;
121 while (my $seqobj = $seqio->next_seq()) {
122 my $id = $seqobj->display_id(); #substr ($_, 1);
123 my $length = $seqobj->length(); # length $seq{$_};
124 my $seq = $seqobj->seq();
126 # print OUT_TABLE '<tr><td colspan="5">'.$seq.'</td></tr>';
128 #seach for patterns repeats
129 while ($seq =~ /([ATGC]{$unit_low,$unit_high}?)(\1{$repeat_time,})/g){
130 my $actual_repeat_time = (length $&)/(length $1);
131 my $start = (length $`) + 1;
132 my $match = $1;
133 my $fullmatch =$&;
135 #screen out single nucleotide repeat
136 if (!($match =~ /^A+$|^T+$|^G+$|^C+$/)) {
137 #for patterns longer than 2 necleotides, print out all the hits
138 if (length $match >=3){
139 print OUT_TAB "$id\t$1\t$actual_repeat_time\t$start\t$length\n";
140 print OUT_TABLE "<tr><td>$id</td><td>$1</td><td>$actual_repeat_time</td><td>$start</td><td>$length</td></tr>\n";
141 $numResults++;
143 #If the repeat time specified in command line is less than 4, for patterns of 2 nucleotides, print out matches of at least 4 repeats.
144 elsif ($actual_repeat_time >3){
145 print OUT_TAB "$id\t$1\t$actual_repeat_time\t$start\t$length\n";
146 print OUT_TABLE "<tr><td>$id</td><td>$1</td><td>$actual_repeat_time</td><td>$start</td><td>$length</td></tr>\n";
147 $numResults++;
152 print OUT_TABLE "</table>\n";
153 close OUT_TABLE;
154 close OUT_TAB;
156 my %result = ();
157 $result{html} = $result_table;
158 $result{tab} = $result_tab;
159 $result{num} = $numResults;
160 return %result;
163 sub post_only {
164 my ($page) = @_;
166 $page->header();
168 print <<EOF;
170 <h4>SGN SSR Interface Error</h4>
172 <p>SSR subsystem can only accept HTTP POST requests</p>
176 $page->footer();
179 sub user_error {
180 my ($page, $reason) = @_;
182 $page->header();
184 print <<EOF;
186 <h4>SGN SSR Interface Error</h4>
188 <p>$reason</p>
191 $page->footer();
192 exit(0);