1 ;; -*- Mode: emacs-lisp; lexical-binding: t -*- ;;
2 ;; Copyright Leo Butler (leo.butler@member.fsf.org) 2015
3 ;; Released under terms of GPLv3+
5 ;; Generate the unicodedata-txt.lisp file from UnicodeData.txt
8 ;; emacs -nw -q --eval "(progn (byte-compile-file \"unicodedata.el\" t) (unicode-init) (unicode-print-lisp))"
11 (defun unicode->list
(file)
12 (interactive "sFile? ")
13 (cl-labels ((string->number
(x)
14 (string-to-number x
16))
20 (let ((n (string->number
(car x
))))
21 (cons (car x
) (cons n
(cons (char n
) (cdr x
)))))))
22 (let ((u (save-excursion
23 (or (get-buffer file
) (find-file-literally file
))
24 (buffer-substring-no-properties (point-min) (point-max)))))
25 (mapcar #'assemble-list
26 (mapcar (lambda(s) (split-string s
";" nil nil
))
27 (split-string u
"\n" t
))))))
29 (defun unicode-hashtable->list
(ht)
31 (maphash #'(lambda(k v
)
36 (eval-when (compile load
)
37 (defvar unicode-code-value-hex
)
38 (defvar unicode-code-value
)
39 (defvar unicode-code-character
)
40 (defvar unicode-character-name
)
41 (defvar unicode-general-category
))
43 (defun unicode-find (predicate)
44 "Creates a function that applies PREDICATE(K V) to the key K and value V of each element
45 in a hashtable HT. Analogous to CL-REMOVE-IF-NOT."
46 (let ((this `(lambda (ht)
47 (let ((oht (make-hash-table :test
#'eq
)))
48 (maphash #'(lambda (k v
)
49 (if (funcall #',predicate k v
)
50 (puthash k v oht
))) ht
)
52 (cl-coerce this
'function
)))
54 (defun unicode->hashtable
(file)
55 (interactive "sFile? ")
56 (let ((ht (make-hash-table :size
29215 :test
#'equal
))
57 (l (unicode->list file
)))
59 for h
= (funcall unicode-code-value-hex c
)
60 for v
= (funcall unicode-code-value c
)
61 for ch
= (funcall unicode-character-name c
)
62 for g
= (funcall unicode-general-category c
)
63 for gc
= (gethash g ht
)
68 (puthash g
(push c gc
) ht
))
71 (eval-when (compile load
)
72 (defvar unicode-data-hashtable
) ;; (unicode->hashtable unicode-data-txt-buffer))
73 (defvar unicode-data-list
) ;; (unicode->list unicode-data-txt-buffer))
74 (defvar unicode-data-txt-url
"http://www.unicode.org/Public/UNIDATA/UnicodeData.txt"
75 "URL of UnicodeData.txt file.")
76 (defvar unicode-data-txt-buffer
"UnicodeData.txt" "Name of buffer created by UNICODEDATA.TXT.GET."))
78 (defun unicodedata.txt.get
(&optional url proceed further-callbacks
)
79 (interactive "sURL? ")
80 (setq url
(or url unicode-data-txt-url
))
81 (let* ((transform (lambda (buffer)
82 (with-current-buffer buffer
83 (setq unicode-data-hashtable
(unicode->hashtable
(buffer-name)))
84 (message "Done processing %s." unicode-data-txt-buffer
))))
85 (callback (lambda (status)
87 (search-forward-regexp "^0000;" (point-max))
89 (kill-whole-line (- (1+ (line-number-at-pos))))
91 (rename-buffer unicode-data-txt-buffer
)
92 (funcall transform unicode-data-txt-buffer
)
93 (cl-loop for f in further-callbacks do
(funcall f
))))))
94 (cond ((find-file-literally unicode-data-txt-buffer
)
95 (funcall transform unicode-data-txt-buffer
))
96 ((or proceed
(y-or-n-p "Retrieve UNICODE DATA online? "))
97 (url-retrieve url callback
))
99 (message "Stopped.")))))
101 (defun unicode-search-character-name (regex &optional ht
)
103 (let ((ht (or ht unicode-data-hashtable
))
105 (cl-loop for k being the hash-keys of ht
106 if
(and (stringp k
) (string-match regex k
))
107 do
(push (gethash k ht
) l
))
109 (defun unicode-search-value (n &optional ht
)
111 (let ((ht (or ht unicode-data-hashtable
)))
113 (defun unicode-search-value-hex (n &optional ht
)
115 (unicode-search-value (string-to-number n
16) ht
))
116 (defun unicode-search-category (c &optional ht
)
118 (let ((ht (or ht unicode-data-hashtable
)))
121 ;; see http://www.unicode.org/Public//3.0-Update1/UnicodeData-3.0.1.html#General%20Category
122 (defun unicode-math-characters ()
124 (let ((ht (make-hash-table :size
8000 :test
#'eq
))
125 (l (append (unicode-search-category "Sm") ;; math symbol
126 (unicode-search-category "Mn") ;; non-spacing mark
127 ;;(unicode-search-category "Me") ;; enclosing mark
128 (unicode-search-category "No") ;; number other
129 (unicode-search-category "Nl") ;; number letter
130 (unicode-search-category "Nd") ;; number decimal digit
131 (unicode-search-category "Pc") ;; number decimal digit
132 (unicode-search-category "Pc") ;; Punctuation, Connector
133 (unicode-search-category "Pd") ;; Punctuation, Dash
134 (unicode-search-category "Ps") ;; Punctuation, Open
135 (unicode-search-category "Pe") ;; Punctuation, Close
136 (unicode-search-category "Pi") ;; Punctuation, Initial quote (may behave like Ps or Pe depending on usage)
137 (unicode-search-category "Pf") ;; Punctuation, Final quote (may behave like Ps or Pe depending on usage)
138 ;;(unicode-search-category "Po") ;; Punctuation, Other
139 (unicode-search-character-name (regexp-opt '("APL FUNCTIONAL SYMBOL" "MATHEMATICAL" "GREEK" "LATIN"))))))
141 do
(puthash (funcall unicode-code-value i
) i ht
))
144 (eval-when (compile load
)
145 (defvar unicode-math-characters nil
) ;; (unicode-math-characters))
146 (defvar unicode-data-output-file
"unicodedata-txt.lisp"))
148 (defun unicode-print-lisp (&optional data-file ht
)
150 (setq ht
(or ht
(unicode-math-characters))
151 data-file
(or data-file unicode-data-output-file
))
152 (if (get-buffer data-file
) (kill-buffer data-file
))
153 (if (file-exists-p data-file
) (delete-file data-file
))
156 (let* ((standard-output (current-buffer))
157 (coding-system-for-read 'utf-8-unix
)
158 (coding-system-for-write 'utf-8-unix
)
159 (print-as-symbol (lambda (x) (if (equal "" x
) (format " nil") (format " |%s|" x
))))
160 (print-as-hex (lambda (x) (if (equal "" x
) (format " nil") (progn (with-output-to-string (princ " #x")(princ x
))))))
161 (print-car (lambda (x)
162 (princ "'(#x")(princ (car x
))))
163 (print-rest (lambda (x)
165 (funcall print-as-symbol
(nth 2 x
))
166 (funcall print-as-symbol
(nth 3 x
))
167 (funcall print-as-symbol
(nth 4 x
))
168 (funcall print-as-symbol
(nth 5 x
))
169 (funcall print-as-symbol
(nth 6 x
))
170 (funcall print-as-symbol
(nth 7 x
))
171 (funcall print-as-symbol
(nth 8 x
))
172 (funcall print-as-symbol
(nth 9 x
))
173 (funcall print-as-symbol
(nth 10 x
))
174 (funcall print-as-symbol
(nth 11 x
))
175 (funcall print-as-symbol
(nth 12 x
))
176 (funcall print-as-symbol
(nth 13 x
))
177 (funcall print-as-hex
(nth 14 x
))
178 (funcall print-as-hex
(nth 15 x
))
181 (funcall print-car x
) (funcall print-rest x
))))
182 (insert ";; -*- mode:lisp; coding: utf-8 -*-\n")
183 (insert (format ";; Derived from %s\n#.(list\n" unicode-data-txt-url
))
184 (mapc printer
(cl-loop for k being the hash-keys of ht
185 for v
= (gethash k ht
)
186 if
(and (numberp k
) (> k
127))
189 (setq s
(buffer-substring-no-properties (point-min) (point-max)))))
190 (with-current-buffer (find-file-literally data-file
)
191 (insert (encode-coding-string s
'utf-8-unix
))
195 (defun unicode-init ()
197 (mapc #'(lambda(x) (set (car x
) `(lambda (l) (nth ,(cadr x
) l
))))
198 '((unicode-code-value-hex 0)
199 (unicode-code-value 1)
200 (unicode-code-character 2)
201 (unicode-character-name 3)
202 (unicode-general-category 4)))
203 (unicodedata.txt.get nil t
)
204 (unicode->hashtable unicode-data-txt-buffer
)
207 ;; end of unicodedata.el ;;