# PROGRAM : simplealign.pl
# PURPOSE : Simple driver for Bio::SimpleAlign
# AUTHOR  : Ewan Birney birney@sanger.ac.uk
# CREATED : Tue Oct 27 1998
# If you have installed bioperl using the standard
# makefile system, everything should be fine.
#
# If not, edit the use lib "...." line below to point to the directory
# containing your Bioperl modules.
#use lib "/nfs/disk21/birney/prog/bioperl/";

# Modified 3/5/01 to use AlignIO by Peter Schattner schattner@alum.mit.edu
# This uses the internal DATA stream (the text past the __END__ tag
# at the end of this file) to load in the data. We then do some
# reformatting, sort the alignment in a different way, and take a quick
# look inside the alignment. All pretty simple ;)
# The SimpleAlign module does not do the following things:
#   a) give you sensible ways of asking whether a column is
#      all gaps, or how conserved it is
#   b) provide ways of editing the alignment
#   c) make alignments
#
# a) and b) are probably best done by UnivAlign from Georg Fuellen.
# c) is done for pairwise alignments in Bio::Tools::pSW; you can
# also read in output from programs like clustal and hmmer.
use strict;
use warnings;
use Bio::AlignIO;    # harmless if the file has already loaded it

# Driver body: read one alignment from the DATA section, write it to
# STDOUT in MSF and Pfam formats, then report how many subsequences each
# sequence name contributes to the alignment.

# Open the embedded alignment (the text after __END__) as a Pfam stream.
my $str = Bio::AlignIO->newFh( '-fh' => \*DATA, '-format' => 'pfam' );

# Pull the alignment object off the stream; every later step needs it.
my $al = <$str>;
die "simplealign.pl: could not read an alignment from the DATA section\n"
    unless defined $al;

# write out a MSF file
my $out = Bio::AlignIO->newFh( '-fh' => \*STDOUT, '-format' => 'msf' );
print {$out} $al
    or warn "simplealign.pl: failed to write the alignment in msf format\n";

# order alphabetically by name, then by start/end
$al->sort_alphabetically();

# write in Pfam format now...
my $out2 = Bio::AlignIO->newFh( '-fh' => \*STDOUT, '-format' => 'pfam' );
print {$out2} $al
    or warn "simplealign.pl: failed to write the alignment in pfam format\n";

# now set the display name to be
# name_# like roa1_human_1, roa1_human_2 etc
# This **doesn't** change the underlying names of the
# sequences you'll be glad to hear.
$al->set_displayname_count();

# dump again, this time with the numbered display names
print {$out2} $al
    or warn "simplealign.pl: failed to write the renamed alignment in pfam format\n";

# get into the alignment and get things out
# we just want to see how many unique names
# there are in this alignment
my %count_of;

# loop over the alignment
# NOTE(review): newer BioPerl releases spell this each_seq(); confirm
# against the installed Bio::SimpleAlign before renaming.
for my $seq ( $al->eachSeq() ) {
    # increment a hash on the name by one each time
    # NOTE(review): the loop body was missing from the source and is
    # reconstructed from the comment above; confirm id() is the
    # intended accessor.
    $count_of{ $seq->id() }++;
}

# Sorted so the report comes out in the same order on every run.
for my $id ( sort keys %count_of ) {
    print "$id has $count_of{$id} subsequences in this alignment\n";
}

# Everything after the marker below is the Pfam-format alignment that the
# DATA filehandle reads.
__END__
92 GR10_BRANA
/8-79 CFVGGL
......AWATGDAELERTFS
.....Q
.FGEV
..IDSKIIND
.............RETGRSRGFGFVTFKDEKSMKDAIDEMNG
.K
...ELDGRTITV
93 HUD_HUMAN
/48-119 LIVNYL
......PQNMTQEEFRSLFG
.....S
.IGEI
..ESCKLVRD
.............KITGQSLGYGFVNYIDPKDAEKAINTLNG
.L
...RLQTKTIKV
94 IF32_SCHPO
/41-124 VVIEGAP
....VVEEAKQQDFFRFLSSKVLAK
.IGKVKENGFYMPFE
.........EKNGK
..KMSLGLVFADFENVDGADLCVQELDGKQ
...ILKNHTFVV
95 IF32_YEAST
/79-157 IVVNGAPVIPSAKVPVLKKALTSLFS
.....K
.AGKV
..VNMEFPID
.............EATGKTKGFLFVECGSMNDAKKIIKSFHGKR
...LDLKHRLFL
96 IF4B_HUMAN
/98-168 AFLGNL
......PYDVTEESIKEFFR
.....G
.LNIS
...AVRLPR
............EPSNPERLKGFGYAEFEDLDSLLSALS
.LNE
.E
...SLGNRRIRV
97 LA_DROME
/151-225 AYAKGF
......PLDSQISELLDFTA
.....N
.YDKV
..VNLTMRNS
.........YDKPTKSYKFKGSIFLTFETKDQAKAFLE
.QEK
.I
...VYKERELLR
98 LA_HUMAN
/113-182 VYIKGF
......PTDATLDDIKEWLE
.....D
.KGQV
..LNIQMRR
..............TLHKAFKGSIFVVFDSIESAKKFVE
.TPG
.Q
...KYKETDLLI
99 MEI2_SCHPO
/197-265 LFVTNL
......PRIVPYATLLELFS
.....K
.LGDV
..KGIDTSSL
.................STDGICIVAFFDIRQAIQAAKSLRSQR
...FFNDRLLYF
100 MODU_DROME
/177-246 VFVTNL
......PNEYLHKDLVALFA
.....K
.FGRL
..SALQRFTN
................LNGNKSVLIAFDTSTGAEAVLQAKPKAL
...TLGDNVLSV
101 MODU_DROME
/260-326 VVVGLI
......GPNITKDDLKTFFE
.....K
.VAPV
..EAVTISSN
.................RLMPRAFVRLASVDDIPKALK
.LHS
.T
...ELFSRFITV
102 MODU_DROME
/342-410 LVVENVG
....KHESYSSDALEKIFK
.....K
.FGDV
..EEIDVVC
..................SKAVLAFVTFKQSDAATKALAQLDG
.K
...TVNKFEWKL
103 MODU_DROME
/422-484 ILVTNL
......TSDATEADLRKVFN
.....D
.SGEI
..ESIIMLG
.....................QKAVVKFKDDEGFCKSFL
.ANE
.S
...IVNNAPIFI
104 MSSP_HUMAN
/31-102 LYIRGL
......PPHTTDQDLVKLCQ
.....P
.YGKI
..VSTKAILD
.............KTTNKCKGYGFVDFDSPAAAQKAVSALKA
.S
...GVQAQKAKQ
105 NAM8_YEAST
/165-237 IFVGDL
......APNVTESQLFELFI
.....NRYAST
..SHAKIVHD
.............QVTGMSKGYGFVKFTNSDEQQLALSEMQG
.V
...FLNGRAIKV
106 NONA_DROME
/304-369 LYVGNL
......TNDITDDELREMFK
.....P
.YGEI
..SEIFSNLD
...................KNFTFLKVDYHPNAEKAKRALDG
.S
...MRKGRQLRV
107 NONA_DROME
/378-448 LRVSNL
......TPFVSNELLYKSFE
.....I
.FGPI
..ERASITVD
..............DRGKHMGEGIVEFAKKSSASACLRMCNE
.K
...CFFLTASLR
108 NOP3_YEAST
/127-190 LFVRPF
......PLDVQESELNEIFG
.....P
.FGPM
..KEVKILN
.....................GFAFVEFEEAESAAKAIEEVHG
.K
...SFANQPLEV
109 NOP3_YEAST
/202-270 ITMKNL
......PEGCSWQDLKDLAR
.....E
.NSLE
..TTFSSVN
................TRDFDGTGALEFPSEEILVEALERLNN
.I
...EFRGSVITV
110 NOP4_YEAST
/28-98 LFVRSI
......PQDVTDEQLADFFS
.....N
.FAPI
..KHAVVVKD
..............TNKRSRGFGFVSFAVEDDTKEALAKARK
.T
...KFNGHILRV
111 NOP4_YEAST
/292-363 VFVRNV
......PYDATEESLAPHFS
.....K
.FGSV
..KYALPVID
.............KSTGLAKGTAFVAFKDQYTYNECIKNAPA
.A
...GSTSLLIGD
112 NSR1_YEAST
/170-241 IFVGRL
......SWSIDDEWLKKEFE
.....H
.IGGV
..IGARVIYE
.............RGTDRSRGYGYVDFENKSYAEKAIQEMQG
.K
...EIDGRPINC
113 NSR1_YEAST
/269-340 LFLGNL
......SFNADRDAIFELFA
.....K
.HGEV
..VSVRIPTH
.............PETEQPKGFGYVQFSNMEDAKKALDALQG
.E
...YIDNRPVRL
114 NUCL_CHICK
/283-352 LFVKNL
......TPTKDYEELRTAIK
.....EFFGKK
...NLQVSEV
..............RIGSSKRFGYVDFLSAEDMDKALQ
.LNG
.K
...KLMGLEIKL
115 PABP_DROME
/4-75 LYVGDL
......PQDVNESGLFDKFS
.....S
.AGPV
..LSIRVCRD
.............VITRRSLGYAYVNFQQPADAERALDTMNF
.D
...LVRNKPIRI
116 PABP_DROME
/92-162 VFIKNL
......DRAIDNKAIYDTFS
.....A
.FGNI
..LSCKVATD
..............EKGNSKGYGFVHFETEEAANTSIDKVNG
.M
...LLNGKKVYV
117 PABP_DROME
/183-254 VYVKNF
......TEDFDDEKLKEFFE
.....P
.YGKI
..TSYKVMS
..............KEDGKSKGFGFVAFETTEAAEAAVQALNGKD
...MGEGKSLYV
118 PABP_SCHPO
/249-319 VYIKNL
......DTEITEQEFSDLFG
.....Q
.FGEI
..TSLSLVKD
..............QNDKPRGFGFVNYANHECAQKAVDELND
.K
...EYKGKKLYV
119 PES4_YEAST
/93-164 LFIGDL
......HETVTEETLKGIFK
.....K
.YPSF
..VSAKVCLD
.............SVTKKSLGHGYLNFEDKEEAEKAMEELNY
.T
...KVNGKEIRI
120 PES4_YEAST
/305-374 IFIKNL
......PTITTRDDILNFFS
.....E
.VGPI
..KSIYLSN
...............ATKVKYLWAFVTYKNSSDSEKAIKRYNN
.F
...YFRGKKLLV
121 PR24_YEAST
/43-111 VLVKNL
......PKSYNQNKVYKYFK
.....H
.CGPI
..IHVDVAD
...............SLKKNFRFARIEFARYDGALAAIT
.KTH
.K
...VVGQNEIIV
122 PR24_YEAST
/119-190 LWMTNF
......PPSYTQRNIRDLLQ
.....D
.INVV
.ALSIRLPSL
..............RFNTSRRFAYIDVTSKEDARYCVEKLNG
.L
...KIEGYTLVT
123 PR24_YEAST
/212-284 IMIRNL
.....STELLDENLLRESFE
.....G
.FGSI
..EKINIPAG
............QKEHSFNNCCAFMVFENKDSAERALQ
.MNR
.S
...LLGNREISV
124 PSF_HUMAN
/373-443 LSVRNL
......SPYVSNELLEEAFS
.....Q
.FGPI
..ERAVVIVD
..............DRGRSTGKGIVEFASKPAARKAFERCSE
.G
...VFLLTTTPR
125 PTB_HUMAN
/61-128 IHIRKL
......PIDVTEGEVISLGL
.....P
.FGKV
..TNLLMLKG
...................KNQAFIEMNTEEAANTMVN
.YYT
.SVTPVLRGQPIYI
126 PTB_HUMAN
/186-253 IIVENL
......FYPVTLDVLHQIFS
.....K
.FGTV
....LKIIT
...............FTKNNQFQALLQYADPVSAQHAKLSLDG
.Q
...NIYNACCTL
127 PUB1_YEAST
/76-146 LYVGNL
......DKAITEDILKQYFQ
.....V
.GGPI
..ANIKIMID
..............KNNKNVNYAFVEYHQSHDANIALQTLNG
.K
...QIENNIVKI
128 PUB1_YEAST
/163-234 LFVGDL
......NVNVDDETLRNAFK
.....D
.FPSY
..LSGHVMWD
.............MQTGSSRGYGFVSFTSQDDAQNAMDSMQG
.Q
...DLNGRPLRI
129 PUB1_YEAST
/342-407 AYIGNI
......PHFATEADLIPLFQ
.....N
.FGFI
..LDFKHYPE
...................KGCCFIKYDTHEQAAVCIVALAN
.F
...PFQGRNLRT
130 RB97_DROME
/34-104 LFIGGL
......APYTTEENLKLFYG
.....Q
.WGKV
..VDVVVMRD
.............AATKRSRGFGFITYTKSLMVDRAQE
..NRPH
...IIDGKTVEA
131 RN12_YEAST
/200-267 IVIKFQ
......GPALTEEEIYSLFR
.....R
.YGTI
....IDIFP
...............PTAANNNVAKVRYRSFRGAISAKNCVSG
.I
...EIHNTVLHI
132 RN15_YEAST
/20-91 VYLGSI
......PYDQTEEQILDLCS
.....N
.VGPV
..INLKMMFD
.............PQTGRSKGYAFIEFRDLESSASAVRNLNG
.Y
...QLGSRFLKC
133 RNP1_YEAST
/37-109 LYVGNL
......PKNCRKQDLRDLFE
.....PNYGKI
..TINMLKKK
.............PLKKPLKRFAFIEFQEGVNLKKVKEKMNG
.K
...IFMNEKIVI
134 RO28_NICSY
/99-170 LFVGNL
......PYDIDSEGLAQLFQ
.....Q
.AGVV
..EIAEVIYN
.............RETDRSRGFGFVTMSTVEEADKAVELYSQ
.Y
...DLNGRLLTV
135 RO33_NICSY
/116-187 LYVGNL
......PFSMTSSQLSEIFA
.....E
.AGTV
..ANVEIVYD
.............RVTDRSRGFAFVTMGSVEEAKEAIRLFDG
.S
...QVGGRTVKV
136 RO33_NICSY
/219-290 LYVANL
......SWALTSQGLRDAFA
.....D
.QPGF
..MSAKVIYD
.............RSSGRSRGFGFITFSSAEAMNSALDTMNE
.V
...ELEGRPLRL
137 ROA1_BOVIN
/106-176 IFVGGI
......KEDTEEHHLRDYFE
.....Q
.YGKI
..EVIEIMTD
.............RGSGKKRGFAFVTFDDHDSVDKIVI
.QKY
.H
...TVNGHNCEV
138 ROC_HUMAN
/18-82 VFIGNL
.....NTLVVKKSDVEAIFS
.....K
.YGKI
..VGCSVHK
.....................GFAFVQYVNERNARAAVAGEDG
.R
...MIAGQVLDI
139 ROF_HUMAN
/113-183 VRLRGL
......PFGCTKEEIVQFFS
.....G
.LEIV
.PNGITLPVD
..............PEGKITGEAFVQFASQELAEKALG
.KHK
.E
...RIGHRYIEV
140 ROG_HUMAN
/10-81 LFIGGL
......NTETNEKALEAVFG
.....K
.YGRI
..VEVLLMKD
.............RETNKSRGFAFVTFESPADAKDAARDMNG
.K
...SLDGKAIKV
141 RT19_ARATH
/33-104 LYIGGL
......SPGTDEHSLKDAFS
.....S
.FNGV
..TEARVMTN
.............KVTGRSRGYGFVNFISEDSANSAISAMNG
.Q
...ELNGFNISV
142 RU17_DROME
/104-175 LFIARI
......NYDTSESKLRREFE
.....F
.YGPI
..KKIVLIHD
.............QESGKPKGYAFIEYEHERDMHAAYKHADG
.K
...KIDSKRVLV
143 RU1A_HUMAN
/12-84 IYINNLNE
..KIKKDELKKSLYAIFS
.....Q
.FGQI
..LDILVSR
................SLKMRGQAFVIFKEVSSATNALRSMQG
.F
...PFYDKPMRI
144 RU1A_HUMAN
/210-276 LFLTNL
......PEETNELMLSMLFN
.....Q
.FPGF
..KEVRLVPG
..................RHDIAFVEFDNEVQAGAARDALQG
.F
...KITQNNAMK
145 RU1A_YEAST
/229-293 LLIQNL
......PSGTTEQLLSQILG
.....N
.EALV
...EIRLVSV
...................RNLAFVEYETVADATKIKNQLGS
.T
...YKLQNNDVT
146 RU2B_HUMAN
/9-81 IYINNMND
..KIKKEELKRSLYALFS
.....Q
.FGHV
..VDIVALK
................TMKMRGQAFVIFKELGSSTNALRQLQG
.F
...PFYGKPMRI
147 RU2B_HUMAN
/153-220 LFLNNL
......PEETNEMMLSMLFN
.....Q
.FPGF
..KEVRLVPG
..................RHDIAFVEFENDGQAGAARDALQGFK
...ITPSHAMKI
148 SC35_CHICK
/16-87 LKVDNL
......TYRTSPDTLRRVFE
.....K
.YGRV
..GDVYIPRD
.............RYTKESRGFAFVRFHDKRDAEDAMDAMDG
.A
...VLDGRELRV
149 SP33_HUMAN
/17-85 IYVGNL
......PPDIRTKDIEDVFY
.....K
.YGAI
..RDIDLKNR
................RGGPPFAFVEFEDPRDAEDAVYGRDG
.Y
...DYDGYRLRV
150 SP33_HUMAN
/122-186 VVVSGL
......PPSGSWQDLKDHMR
.....E
.AGDV
..CYADVYRD
....................GTGVVEFVRKEDMTYAVRKLDN
.T
...KFRSHEGET
151 SQD_DROME
/58-128 LFVGGL
......SWETTEKELRDHFG
.....K
.YGEI
..ESINVKTD
.............PQTGRSRGFAFIVFTNTEAIDKVSA
.ADE
.H
...IINSKKVDP
152 SQD_DROME
/138-208 IFVGGL
......TTEISDEEIKTYFG
.....Q
.FGNI
..VEVEMPLD
.............KQKSQRKGFCFITFDSEQVVTDLLK
.TPK
.Q
...KIAGKEVDV
153 SR55_DROME
/5-68 VYVGGL
......PYGVRERDLERFFK
.....G
.YGRT
..RDILIKN
.....................GYGFVEFEDYRDADDAVYELNG
.K
...ELLGERVVV
154 SSB1_YEAST
/39-114 IFIGNV
......AHECTEDDLKQLFV
.....EEFGDE
..VSVEIPIK
..........EHTDGHIPASKHALVKFPTKIDFDNIKENYDT
.K
...VVKDREIHI
155 SXLF_DROME
/127-198 LIVNYL
......PQDMTDRELYALFR
.....A
.IGPI
..NTCRIMRD
.............YKTGYSFGYAFVDFTSEMDSQRAIKVLNG
.I
...TVRNKRLKV
156 SXLF_DROME
/213-285 LYVTNL
......PRTITDDQLDTIFG
.....K
.YGSI
..VQKNILRD
.............KLTGRPRGVAFVRYNKREEAQEAISALNNVI
...PEGGSQPLS
157 TIA1_HUMAN
/9-78 LYVGNL
......SRDVTEALILQLFS
.....Q
.IGPC
..KNCKMIMD
...............TAGNDPYCFVEFHEHRHAAAALAAMNG
.R
...KIMGKEVKV
158 TIA1_HUMAN
/97-168 VFVGDL
......SPQITTEDIKAAFA
.....P
.FGRI
..SDARVVKD
.............MATGKSKGYGFVSFFNKWDAENAIQQMGG
.Q
...WLGGRQIRT
159 TIA1_HUMAN
/205-270 VYCGGV
......TSGLTEQLMRQTFS
.....P
.FGQI
..MEIRVFPD
...................KGYSFVRFNSHESAAHAIVSVNG
.T
...TIEGHVVKC
160 TRA2_DROME
/99-170 IGVFGL
......NTNTSQHKVRELFN
.....K
.YGPI
..ERIQMVID
.............AQTQRSRGFCFIYFEKLSDARAAKDSCSG
.I
...EVDGRRIRV
161 U2AF_HUMAN
/261-332 LFIGGL
......PNYLNDDQVKELLT
.....S
.FGPL
..KAFNLVKD
.............SATGLSKGYAFCEYVDINVTDQAIAGLNG
.M
...QLGDKKLLV
162 U2AF_SCHPO
/312-383 IYISNL
......PLNLGEDQVVELLK
.....P
.FGDL
..LSFQLIKN
.............IADGSSKGFCFCEFKNPSDAEVAISGLDG
.K
...DTYGNKLHA
163 U2AG_HUMAN
/67-142 CAVSDVEM
..QEHYDEFFEEVFTEME
.....EKYGEV
..EEMNVCDN
..............LGDHLVGNVYVKFRREEDAEKAVIDLNN
.R
...WFNGQPIHA
164 WHI3_YEAST
/540-614 LYVGNL
......PSDATEQELRQLFS
.....G
.QEGF
..RRLSFRNK
..........NTTSNGHSHGPMCFVEFDDVSFATRALAELYG
.R
...QLPRSTVSS
165 X16_HUMAN
/12-78 VYVGNL
......GNNGNKTELERAFG
.....Y
.YGPL
..RSVWVARN
..................PPGFAFVEFEDPRDAADAVRELDG
.R
...TLCGCRVRV
166 YHC4_YEAST
/348-415 IFVGQL
......DKETTREELNRRFS
.....T
.HGKI
..QDINLIFK
.................PTNIFAFIKYETEEAAAAALESENH
.A
...IFLNKTMHV
167 YHH5_YEAST
/315-384 ILVKNL
......PSDTTQEEVLDYFS
.....T
.IGPI
..KSVFISEK
...............QANTPHKAFVTYKNEEESKKAQKCLNK
.T
...IFKNHTIWV
168 YIS1_YEAST
/66-136 IFVGNI
......TPDVTPEQIEDHFK
.....D
.CGQI
..KRITLLYD
.............RNTGTPKGYGYIEFESPAYREKALQ
.LNG
.G
...ELKGKKIAV
169 YIS5_YEAST
/33-104 IYIGNL
......NRELTEGDILTVFS
.....E
.YGVP
..VDVILSRD
.............ENTGESQGFAYLKYEDQRSTILAVDNLNG
.F
...KIGGRALKI
170 ARP2_PLAFA
/364-438 VEVTYLF
....STYLVNGQTL
..IYS
.....N
.ISVV
....LVILY
........HQKFKETVLGRNSGFGFVSYDNVISAQHAIQFMNG
.Y
...FVNNKYLKV
171 CABA_MOUSE
/77-147 MFVGGL
......SWDTSKKDLKDYFT
.....K
.FGEV
..VDCTIKMD
.............PNTGRSRGFGFILFKDSSSVEKVLD
.QKE
.H
...RLDGRVIDP
172 CABA_MOUSE
/161-231 IFVGGL
......NPEATEEKIREYFG
.....Q
.FGEI
..EAIELPID
.............PKLNKRRGFVFITFKEEDPVKKVLE
.KKF
.H
...TVSGSKCEI
173 CPO_DROME
/453-526 LFVSGL
......PMDAKPRELYLLFR
.....A
.YEGY
..EGSLLKV
............TSKNGKTASPVGFVTFHTRAGAEAAKQDLQGVR
...FDPDMPQTI
174 CST2_HUMAN
/18-89 VFVGNI
......PYEATEEQLKDIFS
.....E
.VGPV
..VSFRLVYD
.............RETGKPKGYGFCEYQDQETALSAMRNLNG
.R
...EFSGRALRV
175 D111_ARATH
/281-360 LLLRNMVG
.PGQVDDELEDEVGGECA
.....K
.YGTV
..TRVLIFE
..........ITEPNFPVHEAVRIFVQFSRPEETTKALVDLDG
.R
...YFGGRTVRA
176 ELAV_DROME
/250-322 LYVSGL
......PKTMTQQELEAIFA
.....P
.FGAI
..ITSRILQN
............AGNDTQTKGVGFIRFDKREEATRAIIALNG
.T
...TPSSCTDPI
177 ELAV_DROME
/404-475 IFIYNL
......APETEEAALWQLFG
.....P
.FGAV
..QSVKIVKD
.............PTTNQCKGYGFVSMTNYDEAAMAIRALNG
.Y
...TMGNRVLQV
178 EWS_HUMAN
/363-442 IYVQGL
......NDSVTLDDLADFFK
.....Q
.CGVV
..K
.MNKRTG
....QPMIHIYLDKETGKPKGDATVSYEDPPTAKAAVEWFDG
.K
...DFQGSKLKV
179 GBP2_YEAST
/124-193 IFVRNL
......TFDCTPEDLKELFG
.....T
.VGEV
..VEADIIT
...............SKGHHRGMGTVEFTKNESVQDAISKFDG
.A
...LFMDRKLMV
180 GBP2_YEAST
/221-291 VFIINL
......PYSMNWQSLKDMFK
.....E
.CGHV
..LRADVELD
..............FNGFSRGFGSVIYPTEDEMIRAIDTFNG
.M
...EVEGRVLEV
181 GBP2_YEAST
/351-421 IYCSNL
......PFSTARSDLFDLFG
.....P
.IGKI
..NNAELKP
..............QENGQPTGVAVVEYENLVDADFCIQKLNN
.Y
...NYGGCSLQI