start fixing test for multi cat phenotype upload.
[sgn.git] / lib / CXGN / Bulk / ArraySpotEST.pm
blobe5642852d04f0d6dd6f6ab49c1f20bb4546765c5
1 # Array Spot EST download script for SGN database
2 # Lukas Mueller, August 12, 2003
4 # This bulk download option handles the query
5 # Of Array Spot of type EST.
6 # Many of its methods are in the Bulk object.
8 # Modified July 15, 2005
9 # Modified more August 11, 2005
10 # Summer Intern Caroline N. Nyenke
12 # Modified July 7, 2006
13 # Summer Intern Emily Hart
15 # Modified July 3rd, 2007
16 # Alexander Naydich and Matthew Crumb
18 =head1 NAME
20 /CXGN/Bulk/ArraySpotEST.pm
21 (A subclass of Bulk)
23 =head1 DESCRIPTION
25 This perl script is used on the bulk download page. The script collects
26 identifiers submitted by the user and returns information based on the
27 Array Spot EST Ids entered. It then determines the information the user is
28 searching for (SGN_M, Chip Name, TUS, Clone Name, SGN_C, SGN_T, SGN_U,
29 Builder Number, Manual Annotation, Automatic Annotation, and
30 Estimated Sequence) and preforms the appropriate querying of the
31 database. The results of the database query are formated and presented
32 to the user on a separate page. Options of viewing or downloading
33 in text or fasta are available.
35 =cut
38 use strict;
39 use CXGN::Bulk;
41 package CXGN::Bulk::ArraySpotEST;
42 use base "CXGN::Bulk";
44 sub new
46 my $class = shift;
47 my $self = $class->SUPER::new(@_);
48 return $self;
51 =head2 process_parameters
53 Desc:
54 Args: none
55 Ret : 1 if the parameters were OK, 0 if not
57 Modifies some of the parameters received set in get_parameters. Preparing
58 data for the database query.
60 =cut
62 sub process_parameters {
63 my $self = shift;
65 #do some simple parameter checking
66 print STDERR "IDS (ESTs) $self->{ids}\n\n";
67 return 0 if ($self->{idType} eq "");
68 #return 0 if ($self->{ids_string} !~ /\w/);
70 # @output_list defines the identity on order of all fields that can be output
72 my @output_list = qw/ SGN_S chipname TUS clone_name
73 SGN_C SGN_T SGN_E SGN_U build_nr
74 manual_annotation automatic_annotation
75 evalue /;
77 my %links = (
78 clone_name => "/search/est.pl?request_type=10&search=Search&request_id=",
79 SGN_U => "/search/unigene.pl?unigene_id=",
82 $self->{links} = \%links;
83 my @output_fields = ();
85 $self->debug("Type of identifier: ".($self->{idType})."");
87 # @output_fields is the sub-set of fields that will actually be output.
88 for my $o (@output_list)
90 if (my $value = $self->{$o})
92 if ($value eq "on")
94 push @output_fields, $o;
99 if ($self->{sequence} eq "on") { push @output_fields, $self->{seq_type}; }
101 $self->{output_list} = \@output_list;
102 $self->{output_fields} = \@output_fields;
104 #make sure the input string isn't too big
105 return 0 if length( $self->{ids_string} ) > 1_000_000;
107 # clean up data retrieved
108 my $ids = $self -> {ids};
109 $ids =~ s/\n+/ /g;
110 $ids =~ s/\s+/ /g; # compress multiple returns into one
111 $ids =~ s/\r+/ /g; # convert carriage returns to space
112 my @ids = split /\s+/, $ids;
113 return 0 if @ids > 10_000; #limit to 10_000 ids to process
114 $self->debug("IDs to be processed:");
115 for my $i (@ids)
117 $i =~ s/^\d\-\d\-(.*)$/$1/;
118 $self->debug($i);
120 my $has_valid_id = 0;
121 for my $i(@ids)
123 if ($i ne "")
125 $has_valid_id = 1;
128 return 0 unless $has_valid_id;
129 $self->{ids} = \@ids;
131 return 1; #params were OK if we got here
134 =head2 proces_sids
136 Desc: sub process_[idType]_ids
137 Args: default;
138 Ret : data from database printed to a file;
140 Queries database using Persistent (see perldoc Persistent) and
141 object oriented perl to obtain data on Bulk Objects using formatted
142 IDs.
144 =cut
146 sub process_ids {
147 my $self = shift;
148 my $db = $self->{db};
149 my @output_fields = @{$self -> {output_fields}};
150 my @return_data = ();
151 my @notfound = ();
152 my ($dump_fh, $notfound_fh) = $self -> create_dumpfile();
153 # start querying the database
154 my $current_time= time() - $self -> {query_start_time};
155 $self->debug("Time point 6: $current_time");
157 my $in_ids = 'IN ('.join(',',(map {$db->quote($_)} @{$self->{ids}})).')'; #makes fragment of SQL query
158 my $query = get_query($in_ids, $self->{build_id});
160 #warn "using query \n",$query;
162 my $sth = $db -> prepare($query);
164 $self -> {query_start_time} = time();
165 $sth -> execute();
166 $current_time = time() - $self->{query_start_time};
168 # execute the query and get the data.
169 while (my $row = $sth -> fetchrow_hashref()) {
170 # crop est_seq if qc_report data is available
172 if ( defined($row->{start}) && defined($row->{length}) ) {
173 my $start = $row->{start};
174 my $length = $row->{length};
175 $row->{"est_seq"}=substr($row->{est_seq}, $start, $length);
178 $row->{sgn_u}="SGN-U$row->{sgn_u}" if defined($row->{sgn_u});
179 $row->{sgn_c}="SGN-C$row->{sgn_c}" if defined($row->{sgn_c});
180 $row->{sgn_t}="SGN-T$row->{sgn_t} ($row->{direction})" if defined($row->{sgn_t});
182 @return_data = map ($row->{lc($_)}, @{$self -> {output_fields}});
183 # the pesky manual annotation field contains carriage returns!!!
184 foreach my $r (@return_data) {
185 $r =~ s/\n//g;
187 print $dump_fh (join "\t", @return_data)."\n";
189 close($notfound_fh);
190 close($dump_fh);
192 $self->{query_time} = time() - $self -> {query_start_time}
196 =head2 get_query
198 Desc:
199 Args: default;
200 Ret : data from database printed to a file;
202 Queries database using SQL to obtain data on Bulk Objects using formatted
203 IDs.
205 =cut
207 sub get_query
209 my ($in_ids, $build_id) = @_;
210 #person might have picked a specific build they're interested in
211 my $build_condition = ($build_id eq 'all') ? '' : <<EOSQL;
212 AND (unigene_build.unigene_build_id = $build_id
213 OR unigene_build.unigene_build_id IS NULL)
214 EOSQL
216 return <<EOSQL
217 SELECT clone.clone_name,
218 clone.clone_id as SGN_C,
219 seqread.read_id as SGN_T,
220 seqread.direction as direction,
221 est.est_id as SGN_E,
222 (unigene_build.build_nr) as build_nr,
223 unigene.unigene_id as SGN_U,
224 microarray.chip_name as chipname,
225 microarray.spot_id as SGN_S,
226 microarray.content_specific_tag as TUS,
227 est.seq as est_seq,
228 qc_report.hqi_start as start,
229 qc_report.hqi_length as length,
230 (SELECT array_to_string(array(SELECT '"' || m.annotation_text || '"'
231 || ' -- ' || a.first_name || ' ' || a.last_name
232 FROM manual_annotations as m
233 JOIN sgn_people.sp_person as a
234 ON(m.author_id=a.sp_person_id)
235 WHERE m.annotation_target_id = clone.clone_id
236 AND (m.annotation_target_type_id=1
237 OR m.annotation_target_type_id IS NULL)
238 LIMIT 5
240 ' AND ')
241 ) AS manual_annotation,
242 (SELECT array_to_string(array(SELECT 'MATCHED '
243 || dl.defline
244 || ' (evalue:'
245 || bh.evalue
246 || ')'
247 FROM blast_annotations as ba
248 JOIN blast_hits as bh USING(blast_annotation_id)
249 JOIN blast_defline as dl USING(defline_id)
250 WHERE ba.apply_id=unigene.unigene_id
251 AND ba.blast_target_id=1
252 AND ba.apply_type=15
253 LIMIT 5
255 ' AND ')
256 ) AS automatic_annotation,
257 est.status
258 FROM microarray
259 LEFT JOIN clone ON (clone.clone_id=microarray.clone_id)
260 LEFT JOIN seqread ON (clone.clone_id=seqread.clone_id)
261 LEFT JOIN est ON (seqread.read_id=est.read_id)
262 LEFT JOIN qc_report ON (est.est_id=qc_report.est_id)
263 LEFT JOIN unigene_member ON (est.est_id=unigene_member.est_id)
264 LEFT JOIN unigene ON (unigene_member.unigene_id=unigene.unigene_id)
265 LEFT JOIN unigene_build ON (unigene_build.unigene_build_id=unigene.unigene_build_id)
266 LEFT JOIN unigene_consensi ON (unigene.consensi_id=unigene_consensi.consensi_id)
267 WHERE microarray.spot_id $in_ids
268 AND (est.status=0 OR est.status IS NULL)
269 AND (est.flags=0 OR est.flags IS NULL)
270 AND (unigene_build.status='C' OR unigene_build.status IS NULL)
271 $build_condition
272 GROUP BY
273 clone.clone_name,
274 clone.clone_id,
275 seqread.read_id,
276 seqread.direction,
277 est.est_id,
278 (unigene_build.build_nr),
279 unigene.unigene_id,
280 microarray.chip_name,
281 microarray.spot_id,
282 microarray.content_specific_tag,
283 est.seq,
284 qc_report.hqi_start,
285 qc_report.hqi_length,
286 unigene_consensi.seq,
287 est.status
288 ORDER BY microarray.spot_id
289 EOSQL