Merge pull request #42 from solgenomics/topic/duplicate_image_warning
[cxgn-corelibs.git] / lib / CXGN / Phylo / File.pm
blobc316c8707e61ea53562e71288db0c92f1b7c087a
2 =head1 NAME
4 CXGN::Phylo::File - a class to read different tree files.
6 =head1 DESCRIPTION
8 my $file = CXGN::Phylo::File->new($filename);
9 my @node_names = $file->get_node_names();
10 my $tree_string = $file -> get_tree_string();
13 =head1 AUTHOR
15 Lukas Mueller (lam87@cornell.edu)
17 =cut
19 package CXGN::Phylo::File;
21 sub new {
22 my $class = shift;
23 my $file = shift;
25 my $args={};
26 my $self = bless $args, $class;
28 my $newick="";
29 my %ids;
30 my $in_translation=0;
31 my $in_tree = 0;
32 $self->set_file_type($self->determine_filetype($file));
34 if ($self->get_file_type() eq "nexus") {
35 print STDERR "READING a NEXUS FILE!\n\n";
36 open (F, "<$file") || die "Can't open file \"$file\".\n";
37 while (<F>) {
38 chomp;
39 if (/^>/) { next; } # skip lines that start with >
40 if (/^\#/) { next; } # and #
41 if (/^\[|^\]/) { next; } # and [ or ]
42 if (/Translate/i) { $in_translation = 1;} # lets get the node names
43 if (/^\s+\;/ && $in_translation) { $in_translation = 0; } # until that section is over
44 if (($in_translation) && /^\s*(\d+)\s+(.[A-Za-z._\-0-9]+),?/) { $ids{$1}=$2; } # the leaf node names are coded with a number
45 if (/^\s*tree/i) { $in_tree = 1; } # finally, the tree!
46 if (/^\s*End;/i) { $in_tree = 0; }
47 if ($in_tree) { $newick .= $_ } #allow newick to span multiple lines (forgiving)
49 close(F);
51 # throw away trash chars before the newick expression begins
53 #translate the newick:
54 if(keys %ids > 0){
55 foreach my $k (keys %ids){
56 my $v = $ids{$k};
57 $newick =~ s/\b$k\b/$v/;
60 $newick =~ s/\n|\r//g;
61 $newick =~ s/^(tree.*?)(\(.*)$/$2/;
63 $self->set_node_names(\%ids);
65 else {
66 # print STDERR "Reading plain newick file $file...\n";
67 open (F, "<".$file) || die "Can't open file $file\n";
68 while (<F>) {
69 chomp;
70 $newick .=$_;
72 close(F);
74 $self->set_tree_string($newick);
76 return $self;
79 =head2 function determine_filetype()
81 Synopsis:
82 Arguments: a filename, possibly including a path
83 Returns: "nexus" if the file is of type nexus
84 "newick" if the file contains a plain newick expression
85 Side effects: none
86 Description:
88 =cut
90 sub determine_filetype {
91 my $self = shift;
92 my $filename = shift;
93 open(TEST, "<$filename") || die "Can't open file $filename ...";
94 my $line = <TEST>;
95 close(TEST);
96 if ($line =~ /NEXUS/i) { return "nexus"; }
97 if ($line =~ /\(/) { return "newick"; }
98 return undef;
102 =head2 function get_tree_string()
104 Synopsis:
105 Arguments:
106 Returns:
107 Side effects:
108 Description:
110 =cut
112 sub get_tree_string {
113 my $self=shift;
114 return $self->{tree_string};
117 =head2 function set_tree_string()
119 Synopsis:
120 Arguments:
121 Returns:
122 Side effects:
123 Description:
125 =cut
127 sub set_tree_string {
128 my $self=shift;
129 $self->{tree_string}=shift;
132 =head2 function get_file_type()
134 Synopsis:
135 Arguments:
136 Returns:
137 Side effects:
138 Description:
140 =cut
142 sub get_file_type {
143 my $self=shift;
144 return $self->{file_type};
147 =head2 function set_file_type()
149 Synopsis:
150 Arguments:
151 Returns:
152 Side effects:
153 Description:
155 =cut
157 sub set_file_type {
158 my $self=shift;
159 $self->{file_type}=shift;
162 =head2 function get_node_names()
164 Synopsis:
165 Arguments:
166 Returns:
167 Side effects:
168 Description:
170 =cut
172 sub get_node_names {
173 my $self=shift;
174 return $self->{node_names};
177 =head2 function set_node_names()
179 Synopsis:
180 Arguments:
181 Returns:
182 Side effects:
183 Description:
185 =cut
187 sub set_node_names {
188 my $self=shift;
189 $self->{node_names}=shift;
192 =head2 function get_tree()
194 Synopsis:
195 Arguments:
196 Returns: a tree object
197 Side effects:
198 Description:
200 =cut
202 sub get_tree {
203 my $self=shift;
204 my $tree_parser=CXGN::Phylo::Parse_newick->new($self->get_tree_string());
205 $self->{tree}=$tree_parser->parse();
206 # if it was a nexus file, replace the node names with the actual
207 # names which are available through get_node_names().
209 my $trans_hash_ref = $self->get_node_names();
210 foreach my $k (keys %$trans_hash_ref) {
211 my $node = $self->{tree}->get_node_by_name($k);
212 if ($node) {
213 $node->set_name($$trans_hash_ref{$k});
216 return $self->{tree};