Merge varuzza@lambda:lisp/prog/bioinfo
[biolisp.git] / ucsc.lisp
blobefa79e3712f18344527676bbbacc380e85675967
1 (in-package :bioinfo)
3 (defparameter *ucsc-dir*
4 (merge-pathnames #p"lbhc/ucsc/db/" (user-homedir-pathname)))
6 (defmacro def-ucsc-file-mapper ((name file) &body body)
7 `(defun ,name (fun &key (file (merge-pathnames *ucsc-dir* ,file)) limit)
8 ,(concatenate 'string "Read UCSC " file " file")
9 (let ((count 1))
10 (with-open-typed-file (stream file)
11 (each-line (line stream)
12 (let ((data (split-sequence-array #\Tab line)))
13 (when (and limit (= count limit))
14 (return-from ,name))
15 (incf count)
16 (funcall fun (progn
17 ,@body))))))))
20 (def-ucsc-file-mapper (map-chrom-info "chromInfo.txt.gz")
21 (list
22 (make-instance 'feature
23 :seqid (aref data 0)
24 :name (aref data 0)
25 :start 1
26 :strand "+"
27 :end (read-from-string (aref data 1))
28 :source "UCSC"
29 :type "contig")
30 (make-instance 'feature
31 :seqid (aref data 0)
32 :id (aref data 0)
33 :name (aref data 0)
34 :start 1
35 :strand "+"
36 :end (read-from-string (aref data 1))
37 :source "UCSC"
38 :type "chromosome")))
40 ;; TODO: Guess db from id
42 (defparameter *dbxref-db-patterns*
43 (mapcar #'(lambda (pair)
44 (list
45 (create-scanner (first pair))
46 (second pair)))
47 '(("^NP_" :NCBI_NP)
48 ("^[A-Z][A-Z0-9]{5}(-[0-9]+)?$" :protein_id)
49 ("^[0-9]+$" :NCBI_gi)
50 ("^NM_" :NCBI_NM))))
52 (defun dbxref-db (id)
53 (dolist (pair *dbxref-db-patterns*)
54 (when (scan (first pair) id)
55 (return-from dbxref-db (second pair))))
56 (return-from dbxref-db :unknow))
58 (defun format-dbxref (id &optional db)
59 (format nil "~a:~a" (if db db (dbxref-db id)) id))
61 (defun ucsc-know-gene-make-exons (data)
62 (let ((starts (remove "" (split-sequence #\, (aref data 8)) :test #'equal))
63 (ends (split-sequence #\, (aref data 9)))
64 (i 0))
66 (mapcar #'(lambda (start end)
67 (make-instance 'feature
68 :seqid (aref data 1)
69 :id (format nil "exon_~a_~d" (aref data 0) (incf i))
70 :start (read-from-string start)
71 :end (read-from-string end)
72 :strand (aref data 2)
73 :source "UCSC"
74 :type "exon"))
75 starts ends)))
78 (defun ucsc-know-gene-make-mRNA (data)
79 (list
80 (make-instance 'feature
81 :seqid (aref data 1) ; chrom
82 :id (format nil "mrna_~a" (aref data 0))
83 :name (aref data 0)
84 :strand (aref data 2) ; strand
85 :start (read-from-string (aref data 5)) ; cdsStart
86 :end (read-from-string (aref data 6)) ; cdsEnd
87 :source "UCSC"
88 :children (ucsc-know-gene-make-exons data)
89 :type "mRNA")))
92 (def-ucsc-file-mapper (map-know-gene "knownGene.txt.gz")
93 (let ((dbxrefs nil))
94 (when (> (length (aref data 10)) 0)
95 (push (format-dbxref (aref data 10)) dbxrefs))
96 (push (format-dbxref (aref data 11) :ucsc_align) dbxrefs)
98 (make-instance 'feature
99 :seqid (aref data 1) ; chrom
100 :id (aref data 0)
101 :name (aref data 0)
102 :strand (aref data 2) ; strand
103 :start (read-from-string (aref data 3)) ; txStart
104 :end (read-from-string (aref data 4)) ; txEnd
105 :source "UCSC"
106 :children (ucsc-know-gene-make-mRNA data)
107 :type "gene"
108 :dbxrefs dbxrefs)))
111 (def-output-fun convert-gff (stream)
112 (flet ((save-gff (feat)
113 (gff feat stream)))
114 (puts "##gff-version 3" stream)
115 (map-chrom-info #'save-gff)
116 (map-know-gene #'save-gff :limit 100)))