3 (defparameter *ucsc-dir
*
4 (merge-pathnames #p
"lbhc/ucsc/db/" (user-homedir-pathname)))
6 (defmacro def-ucsc-file-mapper
((name file
) &body body
)
7 `(defun ,name
(fun &key
(file (merge-pathnames *ucsc-dir
* ,file
)) limit
)
8 ,(concatenate 'string
"Read UCSC " file
" file")
10 (with-open-typed-file (stream file
)
11 (each-line (line stream
)
12 (let ((data (split-sequence-array #\Tab line
)))
13 (when (and limit
(= count limit
))
20 (def-ucsc-file-mapper (map-chrom-info "chromInfo.txt.gz")
22 (make-instance 'feature
27 :end
(read-from-string (aref data
1))
30 (make-instance 'feature
36 :end
(read-from-string (aref data
1))
40 ;; TODO: Guess db from id
42 (defparameter *dbxref-db-patterns
*
43 (mapcar #'(lambda (pair)
45 (create-scanner (first pair
))
48 ("^[A-Z][A-Z0-9]{5}(-[0-9]+)?$" :protein_id
)
53 (dolist (pair *dbxref-db-patterns
*)
54 (when (scan (first pair
) id
)
55 (return-from dbxref-db
(second pair
))))
56 (return-from dbxref-db
:unknow
))
58 (defun format-dbxref (id &optional db
)
59 (format nil
"~a:~a" (if db db
(dbxref-db id
)) id
))
61 (defun ucsc-know-gene-make-exons (data)
62 (let ((starts (remove "" (split-sequence #\
, (aref data
8)) :test
#'equal
))
63 (ends (split-sequence #\
, (aref data
9)))
66 (mapcar #'(lambda (start end
)
67 (make-instance 'feature
69 :id
(format nil
"exon_~a_~d" (aref data
0) (incf i
))
70 :start
(read-from-string start
)
71 :end
(read-from-string end
)
78 (defun ucsc-know-gene-make-mRNA (data)
80 (make-instance 'feature
81 :seqid
(aref data
1) ; chrom
82 :id
(format nil
"mrna_~a" (aref data
0))
84 :strand
(aref data
2) ; strand
85 :start
(read-from-string (aref data
5)) ; cdsStart
86 :end
(read-from-string (aref data
6)) ; cdsEnd
88 :children
(ucsc-know-gene-make-exons data
)
92 (def-ucsc-file-mapper (map-know-gene "knownGene.txt.gz")
94 (when (> (length (aref data
10)) 0)
95 (push (format-dbxref (aref data
10)) dbxrefs
))
96 (push (format-dbxref (aref data
11) :ucsc_align
) dbxrefs
)
98 (make-instance 'feature
99 :seqid
(aref data
1) ; chrom
102 :strand
(aref data
2) ; strand
103 :start
(read-from-string (aref data
3)) ; txStart
104 :end
(read-from-string (aref data
4)) ; txEnd
106 :children
(ucsc-know-gene-make-mRNA data
)
111 (def-output-fun convert-gff
(stream)
112 (flet ((save-gff (feat)
114 (puts "##gff-version 3" stream
)
115 (map-chrom-info #'save-gff
)
116 (map-know-gene #'save-gff
:limit
100)))