3 # This script will convert from SGD format to GFF format
4 # See http://db.yeastgenome.org/schema/Schema.html
9 # Hard-coded chromosome lengths that could not be obtained directly from the
# features table. Keys are chromosome names; each value is printed as the end
# coordinate of that chromosome's record.
10 my %CHROMOSOMES = (I
=> 230_203
,
# Chromosome names indexed by (chromosome number - 1); feature records are
# resolved through this list as $ROMAN[$chromosome-1].
my @ROMAN = (
    'I',    'II',  'III', 'IV',
    'V',    'VI',  'VII', 'VIII',
    'IX',   'X',   'XI',  'XII',
    'XIII', 'XIV', 'XV',  'XVI',
    'Mit',
);
# Help requested: the first command-line argument starts with "-h" or "--h"
# (for example -h or --help). The lines that follow are the usage text.
# NOTE(review): $ARGV[0] is undefined when no arguments are given; consider
# checking @ARGV before matching.
30 if ($ARGV[0] =~ /^--?h/) {
32 Usage: $0 <SGD features file>
34 This script massages the SGD sequence annotation flat files located at
35 ftp://genome-ftp.stanford.edu/pub/yeast/data_dump/feature/chromosomal_features.tab
36 into a version of the GFF format suitable for display by the generic
39 To use this script, get the SGD chromosomal_features.tab file from the
40 FTP site listed above, and run the following command:
42 % process_sgd.pl chromosomal_features.tab > yeast.gff
44 The yeast.gff file can then be loaded into a Bio::DB::GFF database
45 using the following command:
47 % bulk_load_gff.pl -d <databasename> yeast.gff
# First print out the chromosomes, one GFF line per entry in %CHROMOSOMES.
# The lengths are hard coded because they are not available in the features
# table; each chromosome record spans 1..length and names itself in the
# group column.
for my $chrom (sort keys %CHROMOSOMES) {
    # Keep the group attribute on a single line: a line break inside it
    # would split the tab-delimited record in two.
    my $group = qq(Sequence "$chrom");
    print join("\t",
               $chrom, 'chromosome', 'Component',
               1, $CHROMOSOMES{$chrom},
               '.', '.', '.',
               $group), "\n";
}
# Convert the current input record (held in $_) into GFF lines.
# This is hard because the SGD idea of a feature doesn't really map onto the GFF idea.
my($id,$gene,$aliases,$type,$chromosome,$start,$stop,$strand,$sgdid,$sgdid2,$description,$date) = split "\t";

# The chromosome column is a 1-based number; translate it to its name.
my $ref = $ROMAN[$chromosome-1];

# Columns missing from a short record come back undefined from split;
# treat them as empty so the substitutions and joins below stay quiet.
$description = '' unless defined $description;
$aliases     = '' unless defined $aliases;

# Escape the characters that delimit values and attributes in the group column.
$description =~ s/"/\\"/g;
$description =~ s/;/\\;/g;

# Strand 'W' maps to '+'; anything else maps to '-'. For '-' features the
# coordinates are swapped so that start <= stop in the output.
$strand = $strand eq 'W' ? '+' : '-';
($start,$stop) = ($stop,$start) if $strand eq '-';
die "Strand logic is messed up" if $stop < $start;

my @aliases = split(/\|/,$aliases);

# Features that carry a gene name additionally get a 'gene' record with the
# name, the note and one Alias attribute per alias. Records with an empty
# gene column are skipped here so no 'Gene ""' group is emitted.
if (defined $gene && length $gene) {
    # Each attribute stays on one line; a line break inside the group
    # column would split the GFF record.
    my $alias_attrs = join " ; ", map { qq(Alias "$_") } @aliases;
    my $group = qq(Gene "$gene" ; Note "$description");
    $group .= " ; $alias_attrs" if $alias_attrs;
    print join("\t",$ref,'sgd','gene',$start,$stop,'.',$strand,'.',$group),"\n";
}

# The per-type record has no Alias attributes, so fold the aliases into its note.
$description .= "\\; AKA @aliases" if @aliases;

print join("\t",$ref,'sgd',$type,$start,$stop,'.',$strand,'.',qq($type "$id" ; Note "$description")),"\n";
87 bp_process_sgd.pl - Massage SGD annotation flat files into a version suitable for the Generic Genome Browser
91 % bp_process_sgd.pl chromosomal_features.tab > yeast.gff
95 This script massages the SGD sequence annotation flat files located at
96 ftp://genome-ftp.stanford.edu/pub/yeast/data_dump/feature/chromosomal_features.tab
97 into a version of the GFF format suitable for display by the Generic Genome Browser.
100 To use this script, get the SGD chromosomal_features.tab file from the
101 FTP site listed above, and run the following command:
103 % bp_process_sgd.pl chromosomal_features.tab > yeast.gff
105 The yeast.gff file can then be loaded into a Bio::DB::GFF database
106 using the following command:
108 % bulk_load_gff.pl -d <databasename> yeast.gff
112 L<Bio::DB::GFF>, L<bulk_load_gff.pl>, L<load_gff.pl>
116 Lincoln Stein, lstein@cshl.org
118 Copyright (c) 2002 Cold Spring Harbor Laboratory
120 This library is free software; you can redistribute it and/or modify
121 it under the same terms as Perl itself. See DISCLAIMER.txt for
122 disclaimers of warranty.