make test pass for multicat parsing with two xlsx files for testing.
[sgn.git] / bin / dump_markers_to_fasta.pl
blob0acd3e1a60e4ce67da9099860bcb3c1f2471b18f
1 #!/usr/bin/perl -w
3 # This script will output all marker and primer sequences to fasta.
4 # These fasta files can be used to generate blast databases.
6 use strict;
7 use CXGN::DB::Connection;
8 use Getopt::Std;
10 # optional map_version_id
11 our ($opt_v);
12 getopts('v:');
14 my $dbh = CXGN::DB::Connection->new();
16 my $DEBUG = 1;
18 my @marker_types = ("COS","P","TM","RFLP - fwd","RFLP - rev","PCR - fwd","PCR - rev","COSII","EST clones","EST markers","Unigenes - singletons","Unigenes - contigs", "SNP", "SNP");
20 my $map_version_id = $opt_v;
22 # test whether map_version_id is valid
23 if ($map_version_id) {
25 my $query = "SELECT map_version_id FROM map_version WHERE map_version_id = ?";
26 my $sth = $dbh->prepare($query);
27 $sth->execute($map_version_id);
29 my $exists = $sth->fetchrow();
31 unless ($exists) { die "invalid map_version_id specified\n" }
34 # modify queries to accomodate specific map_version_id
35 my ($join,$where,$where_modified) = ("","","");
37 if ($map_version_id) {
38 $join = "join marker_experiment using (marker_id) join marker_location using (location_id)";
39 $where = "and map_version_id = ?";
40 $where_modified = "where map_version_id = ?";
43 my @queries = (
45 # cos markers
46 "SELECT c.cos_id as name, c.marker_id, coalesce(substring(e.seq from qc_report.hqi_start::integer+1 for qc_report.hqi_length::integer), e.seq) as sequence FROM cos_markers AS c LEFT JOIN est AS e ON (c.est_read_id=e.read_id) INNER JOIN qc_report on (e.est_id=qc_report.est_id) $join where marker_id > 0 and cos_id is not null and coalesce(substring(seq, hqi_start::integer+1, hqi_length::integer), seq) is not null and coalesce(substring(seq, hqi_start::integer+1, hqi_length::integer), seq) != '' $where",
48 # p markers
49 "SELECT p.p_mrkr_name as name, p.marker_id, e.seq as sequence FROM p_markers AS p INNER JOIN seqread AS s ON p.est_clone_id=s.clone_id INNER JOIN est AS e ON s.read_id=e.read_id $join where marker_id > 0 and p_mrkr_name is not null and seq is not null and seq != '' $where",
51 # tm_markers
52 "SELECT tm.tm_name as name, tm.marker_id, r.fasta_sequence as sequence FROM tm_markers AS tm INNER JOIN rflp_sequences AS r ON tm.seq_id=r.seq_id $join where tm_name is not null and marker_id > 0 and fasta_sequence is not null and fasta_sequence != '' $where",
54 # rflp markers (forward)
55 "SELECT rflp_name||'-F' as name, marker_id, fasta_sequence as sequence FROM rflp_markers AS r LEFT JOIN rflp_sequences AS fs ON r.forward_seq_id=fs.seq_id $join where fasta_sequence is not null and fasta_sequence != '' $where",
57 # rflp markers (reverse)
58 "SELECT rflp_name||'-R' as name, marker_id, fasta_sequence as sequence FROM rflp_markers AS r LEFT JOIN rflp_sequences AS fs ON r.reverse_seq_id=fs.seq_id $join where fasta_sequence is not null and fasta_sequence != '' $where",
60 # pcr primers (forward)
61 "select alias||'-FPRIMER' as name, marker_id, sequence from marker_alias inner join pcr_experiment using(marker_id) inner join pcr_experiment_sequence using(pcr_experiment_id) join sequence using(sequence_id) join cvterm on (pcr_experiment_sequence.type_id=cvterm.cvterm_id) $join where sequence is not null $where and cvterm.name='forward_primer'",
63 # pcr primers (reverse)
64 "select alias||'-RPRIMER' as name, marker_id, sequence from marker_alias inner join pcr_experiment using(marker_id) inner join pcr_experiment_sequence using(pcr_experiment_id) join sequence using(sequence_id) $join join cvterm on (pcr_experiment_sequence.type_id=cvterm.cvterm_id) where sequence is not null $where and cvterm.name='reverse_primer'",
66 # cosii markers?
67 "select alias, marker_alias.marker_id, seq from marker_alias inner join cosii_ortholog using(marker_id) inner join unigene using(unigene_id) inner join unigene_consensi using(consensi_id) $join $where_modified",
69 # this one works good for EST clones (we're skipping genomic clones, of course)
70 "select alias, marker_id, coalesce(substring(seq, hqi_start::integer+1, hqi_length::integer), seq) as sequence from marker_alias inner join marker_derived_from using(marker_id) inner join clone on(clone_id=id_in_source and derived_from_source_id=1) inner join seqread using(clone_id) inner join est using(read_id) inner join qc_report using(est_id) $join where coalesce(substring(seq, hqi_start::integer+1, hqi_length::integer), seq) is not null and coalesce(substring(seq, hqi_start::integer+1, hqi_length::integer), seq) != '' $where",
72 # this one SHOULD work for est markers linked by read, but the sequences are missing
73 "select alias, marker_id, coalesce(substring(e.seq from qc_report.hqi_start::integer+1 for qc_report.hqi_length::integer), e.seq) as sequence from marker_alias inner join marker_derived_from as d using(marker_id) inner join seqread on(id_in_source=read_id) inner join est as e on(e.read_id=seqread.read_id) inner join qc_report using(est_id) $join where derived_from_source_id=2 and coalesce(substring(seq, hqi_start::integer+1, hqi_length::integer), seq) is not null and coalesce(substring(seq, hqi_start::integer+1, hqi_length::integer), seq) != '' $where",
75 # unigenes - singletons
76 "select alias, marker_id, COALESCE(substring(seq FROM (hqi_start)::int+1 FOR (hqi_length)::int ),seq) as sequence from marker_alias inner join marker_derived_from using(marker_id) inner join unigene on(unigene_id=id_in_source and derived_from_source_id=3) LEFT JOIN unigene_member USING (unigene_id) LEFT JOIN est USING (est_id) LEFT JOIN qc_report USING (est_id) $join where unigene.nr_members=1 and coalesce(substring(seq, hqi_start::integer+1, hqi_length::integer), seq) is not null and coalesce(substring(seq, hqi_start::integer+1, hqi_length::integer), seq) != '' $where",
78 # unigenes - real unigenes
79 "select alias, marker_id, seq as sequence from marker_alias inner join marker_derived_from using(marker_id) inner join unigene on(unigene_id=id_in_source and derived_from_source_id=3) LEFT JOIN unigene_consensi USING (consensi_id) $join where seq is not null and seq != '' $where",
81 #SNP markers - left
82 "select alias||'-5prime_flanking_region' as name, marker_id, sequence from marker_alias inner join pcr_experiment using(marker_id) inner join pcr_experiment_sequence using(pcr_experiment_id) join sequence using(sequence_id) $join join cvterm on (pcr_experiment_sequence.type_id=cvterm.cvterm_id) where sequence is not null $where and cvterm.name='five_prime_flanking_region'",
84 #SNP markers - right
85 "select alias||'-3prime_flanking_region' as name, marker_id, sequence from marker_alias inner join pcr_experiment using(marker_id) inner join pcr_experiment_sequence using(pcr_experiment_id) join sequence using(sequence_id) $join join cvterm on (pcr_experiment_sequence.type_id=cvterm.cvterm_id) where sequence is not null $where and cvterm.name='three_prime_flanking_region'"
89 # using a hash makes sure the identifiers are unique. In the event of
90 # a conflict, the later result takes precedence over the earlier.
92 my %marker_hash;
94 foreach my $index (0..$#queries) {
96 my $query = $queries[$index];
97 my $marker_type = $marker_types[$index];
99 my $sth = $dbh->prepare($query);
101 if ($map_version_id) { $sth->execute($map_version_id) }
102 else { $sth->execute() }
104 while (my ($name, $marker_id, $seq) = $sth->fetchrow_array()) {
105 $marker_hash{$marker_type}{$marker_id}{seq} = $seq; # for SGN-M$id format
106 $marker_hash{$marker_type}{$marker_id}{name} = $name; # marker alias
110 $dbh->disconnect(42);
112 # now print the fasta file! Woo!
114 my %marker_count = ();
116 foreach my $marker_type (sort keys %marker_hash){
118 foreach my $marker (sort keys %{$marker_hash{$marker_type}}) {
120 my $seq = $marker_hash{$marker_type}{$marker}{seq};
121 my $name = $marker_hash{$marker_type}{$marker}{name};
123 if ($seq =~ /^\s*$/) {
124 # uh-oh!
125 warn "$marker has an empty sequence!\n";
126 next;
129 if ($seq =~ /[^atgcnrymkswbdhv]/i){
130 # allowing IUPAC nucleotide codes - http://www.mun.ca/biochem/courses/3107/symbols.html
131 # AFLP primers are often written as "MSEI-CAG", where the first part is a restriction enzyme.
132 # For now we'll just skip these. There aren't many.
133 warn "$marker contains non-nucleotide characters ($marker_hash{$marker_type}{$marker}).\n";
134 next;
137 $marker_count{$marker}++;
138 # if marker has been seen before, append count onto ID
139 my $count = ($marker_count{$marker} > 1) ? "-$marker_count{$marker}" : "";
141 # regular plain ol fasta file
142 print ">SGN-M$marker$count $name ($marker_type)\n$seq\n";