remove erroneously-commited next i had as part of my FISH loader debugging
[sgn.git] / lib / CXGN / Bulk / ArraySpotEST.pm
bloba9d03a401d883ef879ff0baed0a58ea6972337cc
1 # Array Spot EST download script for SGN database
2 # Lukas Mueller, August 12, 2003
4 # This bulk download option handles the query
5 # Of Array Spot of type EST.
6 # Many of its methods are in the Bulk object.
8 # Modified July 15, 2005
9 # Modified more August 11, 2005
10 # Summer Intern Caroline N. Nyenke
12 # Modified July 7, 2006
13 # Summer Intern Emily Hart
15 # Modified July 3rd, 2007
16 # Alexander Naydich and Matthew Crumb
18 =head1 NAME
20 /CXGN/Bulk/ArraySpotEST.pm
21 (A subclass of Bulk)
23 =head1 DESCRIPTION
25 This perl script is used on the bulk download page. The script collects
26 identifiers submitted by the user and returns information based on the
27 Array Spot EST Ids entered. It then determines the information the user is
28 searching for (SGN_M, Chip Name, TUS, Clone Name, SGN_C, SGN_T, SGN_U,
29 Builder Number, Manual Annotation, Automatic Annotation, and
30 Estimated Sequence) and preforms the appropriate querying of the
31 database. The results of the database query are formated and presented
32 to the user on a separate page. Options of viewing or downloading
33 in text or fasta are available.
35 =cut
38 use strict;
39 use CXGN::Bulk;
41 package CXGN::Bulk::ArraySpotEST;
42 use base "CXGN::Bulk";
44 sub new
46 my $class = shift;
47 my $self = $class->SUPER::new(@_);
48 return $self;
51 =head2 process_parameters
53 Desc:
54 Args: none
55 Ret : 1 if the parameters were OK, 0 if not
57 Modifies some of the parameters received set in get_parameters. Preparing
58 data for the database query.
60 =cut
62 sub process_parameters
64 my $self = shift;
66 #do some simple parameter checking
68 return 0 if ($self->{idType} eq "");
69 return 0 if ($self->{ids_string} !~ /\w/);
71 # @output_list defines the identity on order of all fields that can be output
73 my @output_list = qw/ SGN_S chipname TUS clone_name
74 SGN_C SGN_T SGN_E SGN_U build_nr
75 manual_annotation automatic_annotation
76 evalue /;
78 my %links = (
79 clone_name => "/search/est.pl?request_type=10&search=Search&request_id=",
80 SGN_U => "/search/unigene.pl?unigene_id=",
83 $self->{links} = \%links;
84 my @output_fields = ();
86 $self->debug("Type of identifier: ".($self->{idType})."");
88 # @output_fields is the sub-set of fields that will actually be output.
89 for my $o (@output_list)
91 if (my $value = $self->{$o})
93 if ($value eq "on")
95 push @output_fields, $o;
100 if ($self->{sequence} eq "on") { push @output_fields, $self->{seq_type}; }
102 $self->{output_list} = \@output_list;
103 $self->{output_fields} = \@output_fields;
105 #make sure the input string isn't too big
106 return 0 if length( $self->{ids_string} ) > 1_000_000;
108 # clean up data retrieved
109 my $ids = $self -> {ids_string};
110 $ids =~ s/\n+/ /g;
111 $ids =~ s/\s+/ /g; # compress multiple returns into one
112 $ids =~ s/\r+/ /g; # convert carriage returns to space
113 my @ids = split /\s+/, $ids;
114 return 0 if @ids > 10_000; #limit to 10_000 ids to process
115 $self->debug("IDs to be processed:");
116 for my $i (@ids)
118 $i =~ s/^\d\-\d\-(.*)$/$1/;
119 $self->debug($i);
121 my $has_valid_id = 0;
122 for my $i(@ids)
124 if ($i ne "")
126 $has_valid_id = 1;
129 return 0 unless $has_valid_id;
130 $self->{ids} = \@ids;
132 return 1; #params were OK if we got here
135 =head2 proces_sids
137 Desc: sub process_[idType]_ids
138 Args: default;
139 Ret : data from database printed to a file;
141 Queries database using Persistent (see perldoc Persistent) and
142 object oriented perl to obtain data on Bulk Objects using formatted
143 IDs.
145 =cut
147 sub process_ids {
148 my $self = shift;
149 my $db = $self->{db};
150 my @output_fields = @{$self -> {output_fields}};
151 my @return_data = ();
152 my @notfound = ();
153 my ($dump_fh, $notfound_fh) = $self -> create_dumpfile();
154 # start querying the database
155 my $current_time= time() - $self -> {query_start_time};
156 $self->debug("Time point 6: $current_time");
158 my $in_ids = 'IN ('.join(',',(map {$db->quote($_)} @{$self->{ids}})).')'; #makes fragment of SQL query
159 my $query = get_query($in_ids, $self->{build_id});
161 #warn "using query \n",$query;
163 my $sth = $db -> prepare($query);
165 $self -> {query_start_time} = time();
166 $sth -> execute();
167 $current_time = time() - $self->{query_start_time};
169 # execute the query and get the data.
170 while (my $row = $sth -> fetchrow_hashref()) {
171 # crop est_seq if qc_report data is available
173 if ( defined($row->{start}) && defined($row->{length}) ) {
174 my $start = $row->{start};
175 my $length = $row->{length};
176 $row->{"est_seq"}=substr($row->{est_seq}, $start, $length);
179 $row->{sgn_u}="SGN-U$row->{sgn_u}" if defined($row->{sgn_u});
180 $row->{sgn_c}="SGN-C$row->{sgn_c}" if defined($row->{sgn_c});
181 $row->{sgn_t}="SGN-T$row->{sgn_t} ($row->{direction})" if defined($row->{sgn_t});
183 @return_data = map ($row->{lc($_)}, @{$self -> {output_fields}});
184 # the pesky manual annotation field contains carriage returns!!!
185 foreach my $r (@return_data) {
186 $r =~ s/\n//g;
188 print $dump_fh (join "\t", @return_data)."\n";
190 close($notfound_fh);
191 close($dump_fh);
193 $self->{query_time} = time() - $self -> {query_start_time}
197 =head2 get_query
199 Desc:
200 Args: default;
201 Ret : data from database printed to a file;
203 Queries database using SQL to obtain data on Bulk Objects using formatted
204 IDs.
206 =cut
208 sub get_query
210 my ($in_ids, $build_id) = @_;
211 #person might have picked a specific build they're interested in
212 my $build_condition = ($build_id eq 'all') ? '' : <<EOSQL;
213 AND (unigene_build.unigene_build_id = $build_id
214 OR unigene_build.unigene_build_id IS NULL)
215 EOSQL
217 return <<EOSQL
218 SELECT clone.clone_name,
219 clone.clone_id as SGN_C,
220 seqread.read_id as SGN_T,
221 seqread.direction as direction,
222 est.est_id as SGN_E,
223 (unigene_build.build_nr) as build_nr,
224 unigene.unigene_id as SGN_U,
225 microarray.chip_name as chipname,
226 microarray.spot_id as SGN_S,
227 microarray.content_specific_tag as TUS,
228 est.seq as est_seq,
229 qc_report.hqi_start as start,
230 qc_report.hqi_length as length,
231 (SELECT array_to_string(array(SELECT '"' || m.annotation_text || '"'
232 || ' -- ' || a.first_name || ' ' || a.last_name
233 FROM manual_annotations as m
234 JOIN sgn_people.sp_person as a
235 ON(m.author_id=a.sp_person_id)
236 WHERE m.annotation_target_id = clone.clone_id
237 AND (m.annotation_target_type_id=1
238 OR m.annotation_target_type_id IS NULL)
239 LIMIT 5
241 ' AND ')
242 ) AS manual_annotation,
243 (SELECT array_to_string(array(SELECT 'MATCHED '
244 || dl.defline
245 || ' (evalue:'
246 || bh.evalue
247 || ')'
248 FROM blast_annotations as ba
249 JOIN blast_hits as bh USING(blast_annotation_id)
250 JOIN blast_defline as dl USING(defline_id)
251 WHERE ba.apply_id=unigene.unigene_id
252 AND ba.blast_target_id=1
253 AND ba.apply_type=15
254 LIMIT 5
256 ' AND ')
257 ) AS automatic_annotation,
258 est.status
259 FROM microarray
260 LEFT JOIN clone ON (clone.clone_id=microarray.clone_id)
261 LEFT JOIN seqread ON (clone.clone_id=seqread.clone_id)
262 LEFT JOIN est ON (seqread.read_id=est.read_id)
263 LEFT JOIN qc_report ON (est.est_id=qc_report.est_id)
264 LEFT JOIN unigene_member ON (est.est_id=unigene_member.est_id)
265 LEFT JOIN unigene ON (unigene_member.unigene_id=unigene.unigene_id)
266 LEFT JOIN unigene_build ON (unigene_build.unigene_build_id=unigene.unigene_build_id)
267 LEFT JOIN unigene_consensi ON (unigene.consensi_id=unigene_consensi.consensi_id)
268 WHERE microarray.spot_id $in_ids
269 AND (est.status=0 OR est.status IS NULL)
270 AND (est.flags=0 OR est.flags IS NULL)
271 AND (unigene_build.status='C' OR unigene_build.status IS NULL)
272 $build_condition
273 GROUP BY
274 clone.clone_name,
275 clone.clone_id,
276 seqread.read_id,
277 seqread.direction,
278 est.est_id,
279 (unigene_build.build_nr),
280 unigene.unigene_id,
281 microarray.chip_name,
282 microarray.spot_id,
283 microarray.content_specific_tag,
284 est.seq,
285 qc_report.hqi_start,
286 qc_report.hqi_length,
287 unigene_consensi.seq,
288 est.status
289 ORDER BY microarray.spot_id
290 EOSQL