cgi-bin/tools/dna_digestion/digestion.pl

   1 #Program that inputs a DNA sequence and the name of a restriction enzyme
   2 #Shows where the enzyme would cut the DNA
   3 #Written by Emily Hart 7/27/06
   4 #Refactored by Johnathon Schultz 4/08/07
   5
   6 use strict;
   7 use warnings;
   8 use CXGN::Page;
   9 use CXGN::Page::FormattingHelpers;
  10 use CXGN::DB::Connection;
  11 our $dbh=CXGN::DB::Connection->new();
  12 our $page=CXGN::Page->new("Sol Genomics Network","john");
  13 our ($dna_seq, $enz_name, $action)=$page->get_encoded_arguments("dna_seq", "enz_name", "action");
  14
  15 if($action eq "digest"){
  16     if (!is_valid_dna($dna_seq)){
  17 #       $page-> message_page("Invalid DNA Sequence", "Please make sure that your sequence contains only the letters A, G, T, and C.");
  18         $page->header("Sol Genomics Network");
  19         print "<div align='center'>Invalid DNA Sequence<br />Please make sure that your sequence contains only the letter A, G, T, and C.</div>";
  20         $page->footer();
  21     }
  22     elsif(!is_valid_enzyme($enz_name, $dbh)){
  23 #       $page-> message_page("Invalid Enzyme", "Please select an enzyme from the list.");
  24         $page->header("Sol Genomics Network");
  25         print "<div align='center'>Invalid Enzyme<br /> Please select an enzyme from the list.</div>";
  26         $page->footer();
  27     }
  28     elsif(!(find_cut_sites(is_valid_dna($dna_seq), $enz_name) || $enz_name eq "All")){
  29         no_cuts_found($enz_name);
  30         print "<h1>Error 1</h1>";
  31
  32 #       $page-> message_page("No Cuts Found", "Your sequence is not cut by $enz_name.");
  33     }
  34     else{
  35         my $dna = "";
  36         if($enz_name eq "All"){
  37             my $enzymes = get_all_enzymes($dbh);
  38             foreach my $enz(@$enzymes){
  39                 my $result = find_cut_sites(is_valid_dna($dna_seq), $enz);
  40                 if($result){
  41                     $dna .= "Your sequence was cut by $enz at:<br /> $result";
  42                 }
  43 #               print "<h1>$enz</h1>";
  44             }
  45         }
  46         else{
  47             $dna = find_cut_sites(is_valid_dna($dna_seq), $enz_name);
  48         }
  49         if($dna){
  50             $page->header("Sol Genomics Network");
  51             print "<div align='center'>Cuts Found!<br /> <tt>$dna</tt></div>";
  52             $page->footer();
  53         }
  54         else{
  55             no_cuts_found($enz_name);
  56         }
  57     }
  58 }
  59 else{
  60     $page->header("Sol Genomics Network","Virtual DNA Digestion");
  61     display($dbh);
  62     $page->footer();
  63 }
  64
  65 sub no_cuts_found{
  66         $page->header("Sol Genomics Network");
  67         my $message = "<div align='center'>No cuts found.<br /> Your sequence is not cut by ";
  68         if($enz_name eq "All"){
  69             $message .= "any enzymes.</div>";
  70         }
  71         else{
  72             $message .= "$enz_name.</div>";
  73         }
  74         print $message;
  75         $page->footer();
  76 }
  77
  78 #shows the main page
  79 sub display{
  80     my $dbh = shift();
  81     print "<p>Enter the sequence of DNA that you would like to have virtually digested and select the enzyme that you would like to use.  Please do not enter a DNA sequence containing \"N\"s.</p>";
  82     print '<p>Enter DNA sequence:</p><form action="#" method="post"><textarea name="dna_seq" rows="10" cols="60" wrap="virtual"></textarea>';
  83     my $menu = enz_menu($dbh);
  84     print "<p>Select enzyme: &nbsp; $menu</p>";
  85     print '<input type="hidden" name="action" value="digest" /><input type="submit" value="Submit" /><input type="reset" value="Reset" /></form>';
  86 }
  87
  88 #gets the names of the enzymes form the database and puts them into a drop down list
  89 sub enz_menu{
  90     my $dbh = shift();
  91     $dbh -> do("set search_path=sgn");
  92     my $enzyme_ids = $dbh ->selectcol_arrayref("SELECT DISTINCT enzyme_id FROM enzyme_restriction_sites WHERE restriction_site IS NOT NULL");
  93     my $formatted_enz_ids = join ", ", @$enzyme_ids;
  94     my $enzyme_names = $dbh -> selectcol_arrayref("SELECT enzyme_name FROM enzymes WHERE enzyme_id IN($formatted_enz_ids) ORDER BY enzyme_name");
  95     my $menu_string = '<select name="enz_name">';
  96     $menu_string .= "<option value='All' default='True'>All Enzymes</option>";
  97     foreach my $e(@$enzyme_names){
  98         $menu_string .= "<option value = \"$e\">$e</option>";
  99     }
 100     $menu_string .= "</select>";
 101     return $menu_string;
 102 }
 103 #if the DNA sequence contains only A,G,T,C, returns the sequence
 104 #otherwise returns false
 105 sub is_valid_dna{
 106     my ($dna) = @_;
 107     $dna = uc $dna;
 108     $dna =~ s/\s//g;
 109     if($dna =~ m/^[ACGT]+$/){
 110         return $dna;
 111     }
 112     else{
 113         return "";
 114     }
 115 }
 116
 117 #returns true if the enzyme is valid, false otherwise
 118 sub is_valid_enzyme{
 119     my ($enz, $dbh) = @_;
 120     $dbh -> do("set search_path=sgn");
 121     my $ids = $dbh ->selectcol_arrayref("SELECT DISTINCT enzyme_id FROM enzyme_restriction_sites WHERE restriction_site IS NOT NULL");
 122     my $formatted_enz_ids = join ", ", @$ids;
 123     my $enzyme_names = $dbh -> selectcol_arrayref("SELECT enzyme_name FROM enzymes WHERE enzyme_id IN($formatted_enz_ids) ORDER BY enzyme_name");
 124     my $is_valid = 0;
 125     foreach my $e(@$enzyme_names){
 126         if($e eq $enz){
 127             $is_valid = 1;
 128         }
 129     }
 130     return $is_valid || $enz_name eq "All";
 131     #Return the truth of this statement: The enzyme "is valid" or is equal to "All"
 132 }
 133
 134 #returns an array of all possible sequences for the given enzyme
 135 sub get_enz_seq_arr{
 136     my ($enz_name) = @_;
 137     $dbh -> do("set search_path=sgn");
 138     my $ids_ref = $dbh -> selectcol_arrayref("SELECT enzyme_id FROM enzymes WHERE enzyme_name = '$enz_name'");
 139     my $enz_id = $ids_ref->[0];
 140     my $seqs_ref = $dbh -> selectcol_arrayref("SELECT restriction_site FROM enzyme_restriction_sites WHERE enzyme_id = ?", undef, $enz_id);
 141     foreach my $seq(@$seqs_ref){
 142         $seq =~ s/\s//g;
 143     }
 144     return $seqs_ref;
 145 }
 146
 147 #finds where the enzyme matches the dna sequence and returns an array of the indexes where they first match
 148 sub find_matches{
 149     my ($dna_seq, $enz_name) = @_;
 150     my $seqs_ref = get_enz_seq_arr($enz_name);
 151     my @matches = ();
 152     for(my $i=0; $i<length($dna_seq); $i++){
 153         foreach my $enz_seq(@$seqs_ref){
 154             $enz_seq =~ s/\^//g;
 155             $enz_seq =~ s/(.*)\(\d*\/\d*\)/$1/;
 156             my $enz_length = length($enz_seq);
 157             my $sub_seq = substr($dna_seq, $i, $enz_length);
 158             if($enz_seq eq $sub_seq){
 159                 push @matches, $i;
 160             }
 161         }
 162     }
 163     return \@matches;
 164 }
 165
 166 #returns the reverse compliment of the given DNA sequence
 167 sub reverse_compliment{
 168     my ($dna_seq) = @_;
 169     my %base_pair = ("A" => "T",
 170                       "T" => "A",
 171                       "G" => "C",
 172                       "C" => "G");
 173     my $compliment = "";
 174     my @seq = split "", $dna_seq;
 175     foreach my $s(@seq){
 176         $compliment.=$base_pair{$s};
 177     }
 178     my $rev_comp = reverse($compliment);
 179     return $rev_comp;
 180 }
 181
 182 #returns finished, formatted DNA
 183 sub find_cut_sites{
 184     my($top_seq, $enz_name) = @_;
 185     my $rev_bottom_seq = reverse_compliment($top_seq);
 186     my $enzyme_seq_ref = get_enz_seq_arr($enz_name);
 187     my $enzyme_seq = $enzyme_seq_ref->[0];
 188     my $length_before_cut = this_length_before_cut($enzyme_seq);
 189     my $top_seq_cuts = find_matches($top_seq, $enz_name);
 190     foreach my $c(@$top_seq_cuts){
 191         my $cut = $c+$length_before_cut;
 192         $top_seq = insert_cut($top_seq, $cut);
 193         if (other_length_before_cut($enzyme_seq)){
 194             my $bottom_seq = reverse($rev_bottom_seq);
 195             $bottom_seq = insert_cut($bottom_seq, other_length_before_cut($enzyme_seq));
 196             $rev_bottom_seq = reverse($bottom_seq);
 197         }
 198     }
 199     my $rev_bottom_seq_cuts = find_matches($rev_bottom_seq, $enz_name);
 200     foreach my $c(@$rev_bottom_seq_cuts){
 201         my $cut = $c+$length_before_cut;
 202         $rev_bottom_seq = insert_cut($rev_bottom_seq, $cut);
 203         if (other_length_before_cut($enzyme_seq)){
 204             my $rev_top_seq = insert_cut(reverse($top_seq), other_length_before_cut($enzyme_seq));
 205             $top_seq = reverse($rev_top_seq);
 206         }
 207     }
 208     my $bottom_seq = reverse($rev_bottom_seq);
 209     if(has_cut_sites($top_seq) || has_cut_sites($bottom_seq)){
 210         if(length($top_seq) < 100 && length($bottom_seq) < 100){
 211             return "$top_seq<br />$bottom_seq<br />";
 212         }
 213         else{
 214             return wrap_text($top_seq, $bottom_seq);
 215         }
 216     }
 217     return "";
 218 }
 219
 220 #returns true if a DNA sequence contains a ^
 221 sub has_cut_sites{
 222     my($dna_seq) = @_;
 223     my @dna = split "", $dna_seq;
 224     my $has_cut_sites = 0;
 225     foreach my $d(@dna){
 226         if($d eq "^"){
 227             $has_cut_sites = 1;
 228         }
 229     }
 230     return $has_cut_sites;
 231 }
 232
 233 #returns how far after the match the strand will be cut
 234 sub this_length_before_cut{
 235     my ($enzyme_seq) = @_;
 236     $enzyme_seq =~ s/\s//g;
 237     my $length_before_cut = "";
 238     if($enzyme_seq =~ m/^(.*)\^.*$/){
 239         $length_before_cut = length($1);
 240     }
 241     elsif($enzyme_seq =~ m/(.*)\((\d*)\/\d*\)/){
 242         $length_before_cut = length($1) + $2;
 243     }
 244     return $length_before_cut;
 245 }
 246
 247 #for enzymes which cut the strand and its compliment, returns how far after the match the other strand will be cut
 248 sub other_length_before_cut{
 249     my ($enzyme_seq) = @_;
 250     my $length_before_cut = "";
 251     if($enzyme_seq =~ m/(.*)\(\d*\/(\d*)\)/){
 252         $length_before_cut = length($1) + $2;
 253     }
 254     return $length_before_cut;
 255 }
 256
 257 #insert a cut before the index given, ignoring ^s
 258 sub insert_cut{
 259     my ($dna_seq, $index) = @_;
 260     my $length = length($dna_seq);
 261     if($index > length($dna_seq)){
 262         return $dna_seq;
 263     }
 264     my @dna = split "", $dna_seq;
 265     my $letters_count = -1;
 266     my $char_count = -1;
 267     foreach my $d(@dna){
 268         $char_count++;
 269         if ($d =~ m/[ATGC]/){
 270             $letters_count++;
 271         }
 272         if ($letters_count == $index){
 273             last;
 274         }
 275     }
 276     if($letters_count == $index-1){
 277         return $dna_seq."^";
 278     }
 279     elsif($letters_count < $index-1){
 280         return $dna_seq;
 281     }
 282     else{
 283         $dna_seq =~ s/^(.{$char_count})(.*)$/$1^$2/;
 284         return $dna_seq;
 285     }
 286 }
 287
 288 #Returns the names of all the enzymes in the database.
 289 sub get_all_enzymes{
 290     my ($dbh) = @_;
 291     $dbh->do("set search_path=sgn");
 292     my $enzymes = $dbh->selectcol_arrayref("SELECT DISTINCT enzyme_name from enzymes");
 293     return $enzymes;
 294 }
 295
 296 #Wraps the dna text around the screen instead of letting it run off into the abyss.
 297 #I know this is like the worst possible way to write this code but, its the only way I can think of for doing it
 298 #Since I don't understand a lot of the perl "subtleties"
 299 sub wrap_text{
 300     my ($top, $bottom) = @_;
 301     print CXGN::Page::FormattingHelpers->html_break_string($top,100,":");
 302     my $delimitedtop = CXGN::Page::FormattingHelpers->html_break_string($top,100,":");
 303     my $delimitedbottom = CXGN::Page::FormattingHelpers->html_break_string($bottom,100,":");
 304     my @toplines = split(/:/,$delimitedtop);
 305     my @bottomlines = split(/:/,$delimitedbottom);
 306     my $result = "";
 307     for(my $i = 0; $i < length(@toplines) && $i < length(@bottomlines); $i++){
 308         $result .= "$toplines[$i]<br />$bottomlines[$i]<br /><br />";
 309     }
 310     return $result;
 311 }