getting the person_id correctly , now the metadata row can be deleted
[sgn.git] / cgi-bin / tools / dna_digestion / digestion.pl
blobca2dfc2a3cb12c48f15614168b1d551861a59c32
1 #Program that inputs a DNA sequence and the name of a restriction enzyme
2 #Shows where the enzyme would cut the DNA
3 #Written by Emily Hart 7/27/06
4 #Refactored by Johnathon Schultz 4/08/07
6 use strict;
7 use warnings;
8 use CXGN::Page;
9 use CXGN::Page::FormattingHelpers;
10 use CXGN::DB::Connection;
11 our $dbh=CXGN::DB::Connection->new();
12 our $page=CXGN::Page->new("Sol Genomics Network","john");
13 our ($dna_seq, $enz_name, $action)=$page->get_encoded_arguments("dna_seq", "enz_name", "action");
15 if($action eq "digest"){
16 if (!is_valid_dna($dna_seq)){
17 # $page-> message_page("Invalid DNA Sequence", "Please make sure that your sequence contains only the letters A, G, T, and C.");
18 $page->header("Sol Genomics Network");
19 print "<div align='center'>Invalid DNA Sequence<br />Please make sure that your sequence contains only the letter A, G, T, and C.</div>";
20 $page->footer();
22 elsif(!is_valid_enzyme($enz_name, $dbh)){
23 # $page-> message_page("Invalid Enzyme", "Please select an enzyme from the list.");
24 $page->header("Sol Genomics Network");
25 print "<div align='center'>Invalid Enzyme<br /> Please select an enzyme from the list.</div>";
26 $page->footer();
28 elsif(!(find_cut_sites(is_valid_dna($dna_seq), $enz_name) || $enz_name eq "All")){
29 no_cuts_found($enz_name);
30 print "<h1>Error 1</h1>";
32 # $page-> message_page("No Cuts Found", "Your sequence is not cut by $enz_name.");
34 else{
35 my $dna = "";
36 if($enz_name eq "All"){
37 my $enzymes = get_all_enzymes($dbh);
38 foreach my $enz(@$enzymes){
39 my $result = find_cut_sites(is_valid_dna($dna_seq), $enz);
40 if($result){
41 $dna .= "Your sequence was cut by $enz at:<br /> $result";
43 # print "<h1>$enz</h1>";
46 else{
47 $dna = find_cut_sites(is_valid_dna($dna_seq), $enz_name);
49 if($dna){
50 $page->header("Sol Genomics Network");
51 print "<div align='center'>Cuts Found!<br /> <tt>$dna</tt></div>";
52 $page->footer();
54 else{
55 no_cuts_found($enz_name);
59 else{
60 $page->header("Sol Genomics Network","Virtual DNA Digestion");
61 display($dbh);
62 $page->footer();
65 sub no_cuts_found{
66 $page->header("Sol Genomics Network");
67 my $message = "<div align='center'>No cuts found.<br /> Your sequence is not cut by ";
68 if($enz_name eq "All"){
69 $message .= "any enzymes.</div>";
71 else{
72 $message .= "$enz_name.</div>";
74 print $message;
75 $page->footer();
78 #shows the main page
79 sub display{
80 my $dbh = shift();
81 print "<p>Enter the sequence of DNA that you would like to have virtually digested and select the enzyme that you would like to use. Please do not enter a DNA sequence containing \"N\"s.</p>";
82 print '<p>Enter DNA sequence:</p><form action="#" method="post"><textarea name="dna_seq" rows="10" cols="60" wrap="virtual"></textarea>';
83 my $menu = enz_menu($dbh);
84 print "<p>Select enzyme: &nbsp; $menu</p>";
85 print '<input type="hidden" name="action" value="digest" /><input type="submit" value="Submit" /><input type="reset" value="Reset" /></form>';
88 #gets the names of the enzymes form the database and puts them into a drop down list
89 sub enz_menu{
90 my $dbh = shift();
91 $dbh -> do("set search_path=sgn");
92 my $enzyme_ids = $dbh ->selectcol_arrayref("SELECT DISTINCT enzyme_id FROM enzyme_restriction_sites WHERE restriction_site IS NOT NULL");
93 my $formatted_enz_ids = join ", ", @$enzyme_ids;
94 my $enzyme_names = $dbh -> selectcol_arrayref("SELECT enzyme_name FROM enzymes WHERE enzyme_id IN($formatted_enz_ids) ORDER BY enzyme_name");
95 my $menu_string = '<select name="enz_name">';
96 $menu_string .= "<option value='All' default='True'>All Enzymes</option>";
97 foreach my $e(@$enzyme_names){
98 $menu_string .= "<option value = \"$e\">$e</option>";
100 $menu_string .= "</select>";
101 return $menu_string;
103 #if the DNA sequence contains only A,G,T,C, returns the sequence
104 #otherwise returns false
105 sub is_valid_dna{
106 my ($dna) = @_;
107 $dna = uc $dna;
108 $dna =~ s/\s//g;
109 if($dna =~ m/^[ACGT]+$/){
110 return $dna;
112 else{
113 return "";
117 #returns true if the enzyme is valid, false otherwise
118 sub is_valid_enzyme{
119 my ($enz, $dbh) = @_;
120 $dbh -> do("set search_path=sgn");
121 my $ids = $dbh ->selectcol_arrayref("SELECT DISTINCT enzyme_id FROM enzyme_restriction_sites WHERE restriction_site IS NOT NULL");
122 my $formatted_enz_ids = join ", ", @$ids;
123 my $enzyme_names = $dbh -> selectcol_arrayref("SELECT enzyme_name FROM enzymes WHERE enzyme_id IN($formatted_enz_ids) ORDER BY enzyme_name");
124 my $is_valid = 0;
125 foreach my $e(@$enzyme_names){
126 if($e eq $enz){
127 $is_valid = 1;
130 return $is_valid || $enz_name eq "All";
131 #Return the truth of this statement: The enzyme "is valid" or is equal to "All"
134 #returns an array of all possible sequences for the given enzyme
135 sub get_enz_seq_arr{
136 my ($enz_name) = @_;
137 $dbh -> do("set search_path=sgn");
138 my $ids_ref = $dbh -> selectcol_arrayref("SELECT enzyme_id FROM enzymes WHERE enzyme_name = '$enz_name'");
139 my $enz_id = $ids_ref->[0];
140 my $seqs_ref = $dbh -> selectcol_arrayref("SELECT restriction_site FROM enzyme_restriction_sites WHERE enzyme_id = ?", undef, $enz_id);
141 foreach my $seq(@$seqs_ref){
142 $seq =~ s/\s//g;
144 return $seqs_ref;
147 #finds where the enzyme matches the dna sequence and returns an array of the indexes where they first match
148 sub find_matches{
149 my ($dna_seq, $enz_name) = @_;
150 my $seqs_ref = get_enz_seq_arr($enz_name);
151 my @matches = ();
152 for(my $i=0; $i<length($dna_seq); $i++){
153 foreach my $enz_seq(@$seqs_ref){
154 $enz_seq =~ s/\^//g;
155 $enz_seq =~ s/(.*)\(\d*\/\d*\)/$1/;
156 my $enz_length = length($enz_seq);
157 my $sub_seq = substr($dna_seq, $i, $enz_length);
158 if($enz_seq eq $sub_seq){
159 push @matches, $i;
163 return \@matches;
166 #returns the reverse compliment of the given DNA sequence
167 sub reverse_compliment{
168 my ($dna_seq) = @_;
169 my %base_pair = ("A" => "T",
170 "T" => "A",
171 "G" => "C",
172 "C" => "G");
173 my $compliment = "";
174 my @seq = split "", $dna_seq;
175 foreach my $s(@seq){
176 $compliment.=$base_pair{$s};
178 my $rev_comp = reverse($compliment);
179 return $rev_comp;
182 #returns finished, formatted DNA
183 sub find_cut_sites{
184 my($top_seq, $enz_name) = @_;
185 my $rev_bottom_seq = reverse_compliment($top_seq);
186 my $enzyme_seq_ref = get_enz_seq_arr($enz_name);
187 my $enzyme_seq = $enzyme_seq_ref->[0];
188 my $length_before_cut = this_length_before_cut($enzyme_seq);
189 my $top_seq_cuts = find_matches($top_seq, $enz_name);
190 foreach my $c(@$top_seq_cuts){
191 my $cut = $c+$length_before_cut;
192 $top_seq = insert_cut($top_seq, $cut);
193 if (other_length_before_cut($enzyme_seq)){
194 my $bottom_seq = reverse($rev_bottom_seq);
195 $bottom_seq = insert_cut($bottom_seq, other_length_before_cut($enzyme_seq));
196 $rev_bottom_seq = reverse($bottom_seq);
199 my $rev_bottom_seq_cuts = find_matches($rev_bottom_seq, $enz_name);
200 foreach my $c(@$rev_bottom_seq_cuts){
201 my $cut = $c+$length_before_cut;
202 $rev_bottom_seq = insert_cut($rev_bottom_seq, $cut);
203 if (other_length_before_cut($enzyme_seq)){
204 my $rev_top_seq = insert_cut(reverse($top_seq), other_length_before_cut($enzyme_seq));
205 $top_seq = reverse($rev_top_seq);
208 my $bottom_seq = reverse($rev_bottom_seq);
209 if(has_cut_sites($top_seq) || has_cut_sites($bottom_seq)){
210 if(length($top_seq) < 100 && length($bottom_seq) < 100){
211 return "$top_seq<br />$bottom_seq<br />";
213 else{
214 return wrap_text($top_seq, $bottom_seq);
217 return "";
220 #returns true if a DNA sequence contains a ^
221 sub has_cut_sites{
222 my($dna_seq) = @_;
223 my @dna = split "", $dna_seq;
224 my $has_cut_sites = 0;
225 foreach my $d(@dna){
226 if($d eq "^"){
227 $has_cut_sites = 1;
230 return $has_cut_sites;
233 #returns how far after the match the strand will be cut
234 sub this_length_before_cut{
235 my ($enzyme_seq) = @_;
236 $enzyme_seq =~ s/\s//g;
237 my $length_before_cut = "";
238 if($enzyme_seq =~ m/^(.*)\^.*$/){
239 $length_before_cut = length($1);
241 elsif($enzyme_seq =~ m/(.*)\((\d*)\/\d*\)/){
242 $length_before_cut = length($1) + $2;
244 return $length_before_cut;
247 #for enzymes which cut the strand and its compliment, returns how far after the match the other strand will be cut
248 sub other_length_before_cut{
249 my ($enzyme_seq) = @_;
250 my $length_before_cut = "";
251 if($enzyme_seq =~ m/(.*)\(\d*\/(\d*)\)/){
252 $length_before_cut = length($1) + $2;
254 return $length_before_cut;
257 #insert a cut before the index given, ignoring ^s
258 sub insert_cut{
259 my ($dna_seq, $index) = @_;
260 my $length = length($dna_seq);
261 if($index > length($dna_seq)){
262 return $dna_seq;
264 my @dna = split "", $dna_seq;
265 my $letters_count = -1;
266 my $char_count = -1;
267 foreach my $d(@dna){
268 $char_count++;
269 if ($d =~ m/[ATGC]/){
270 $letters_count++;
272 if ($letters_count == $index){
273 last;
276 if($letters_count == $index-1){
277 return $dna_seq."^";
279 elsif($letters_count < $index-1){
280 return $dna_seq;
282 else{
283 $dna_seq =~ s/^(.{$char_count})(.*)$/$1^$2/;
284 return $dna_seq;
288 #Returns the names of all the enzymes in the database.
289 sub get_all_enzymes{
290 my ($dbh) = @_;
291 $dbh->do("set search_path=sgn");
292 my $enzymes = $dbh->selectcol_arrayref("SELECT DISTINCT enzyme_name from enzymes");
293 return $enzymes;
296 #Wraps the dna text around the screen instead of letting it run off into the abyss.
297 #I know this is like the worst possible way to write this code but, its the only way I can think of for doing it
298 #Since I don't understand a lot of the perl "subtleties"
299 sub wrap_text{
300 my ($top, $bottom) = @_;
301 print CXGN::Page::FormattingHelpers->html_break_string($top,100,":");
302 my $delimitedtop = CXGN::Page::FormattingHelpers->html_break_string($top,100,":");
303 my $delimitedbottom = CXGN::Page::FormattingHelpers->html_break_string($bottom,100,":");
304 my @toplines = split(/:/,$delimitedtop);
305 my @bottomlines = split(/:/,$delimitedbottom);
306 my $result = "";
307 for(my $i = 0; $i < length(@toplines) && $i < length(@bottomlines); $i++){
308 $result .= "$toplines[$i]<br />$bottomlines[$i]<br /><br />";
310 return $result;