gnu/dist/gettext/gettext-tools/libuniname/gen-uninames

   1 #!/usr/local/bin/clisp -C
   2
   3 ;;; Creation of CLISP's uni_names.h from the UnicodeData.txt table.
   4 ;;; Bruno Haible 2000-12-28
   5
   6 (defparameter add-comments nil)
   7
   8 (defstruct unicode-char
   9   (code nil :type integer)
  10   (name nil :type string)
  11   word-indices
  12   word-indices-index
  13 )
  14
  15 (defstruct word-list
  16   (hashed nil :type hash-table)
  17   (sorted nil :type list)
  18   size                          ; number of characters total
  19   length                        ; number of words
  20 )
  21
  22 (defun main (inputfile outputfile)
  23   (declare (type string inputfile outputfile))
  24   #+UNICODE (setq *default-file-encoding* charset:utf-8)
  25   (let ((all-chars '()))
  26     ;; Read all characters and names from the input file.
  27     (with-open-file (istream inputfile :direction :input)
  28       (loop
  29         (let ((line (read-line istream nil nil)))
  30           (unless line (return))
  31           (let* ((i1 (position #\; line))
  32                  (i2 (position #\; line :start (1+ i1)))
  33                  (code-string (subseq line 0 i1))
  34                  (code (parse-integer code-string :radix 16))
  35                  (name-string (subseq line (1+ i1) i2)))
  36             ; Ignore characters whose name starts with "<".
  37             (unless (eql (char name-string 0) #\<)
  38               ; Also ignore Hangul syllables; they are treated specially.
  39               (unless (<= #xAC00 code #xD7A3)
  40                 ; Also ignore CJK compatibility ideographs; they are treated
  41                 ; specially as well.
  42                 (unless (or (<= #xF900 code #xFA2D) (<= #xFA30 code #xFA6A)
  43                             (<= #x2F800 code #x2FA1D))
  44                   ; Transform the code so that it fits in 16 bits. In
  45                   ; Unicode 3.1 the following ranges are used.
  46                   ;   0x00000..0x033FF  >>12=  0x00..0x03  -> 0x0..0x3
  47                   ;   0x0A000..0x0A4FF  >>12=  0x0A        -> 0x4
  48                   ;   0x0F900..0x0FFFF  >>12=  0x0F        -> 0x5
  49                   ;   0x10300..0x104FF  >>12=  0x10        -> 0x6
  50                   ;   0x1D000..0x1D7DD  >>12=  0x1D        -> 0x7
  51                   ;   0x2F800..0x2FAFF  >>12=  0x2F        -> 0x8
  52                   ;   0xE0000..0xE00FF  >>12=  0xE0        -> 0x9
  53                   (flet ((transform (x)
  54                            (dpb
  55                              (case (ash x -12)
  56                                ((#x00 #x01 #x02 #x03) (ash x -12))
  57                                (#x0A 4)
  58                                (#x0F 5)
  59                                (#x10 6)
  60                                (#x1D 7)
  61                                (#x2F 8)
  62                                (#xE0 9)
  63                                (t (error "Update the transform function for 0x~5,'0X" x))
  64                              )
  65                              (byte 8 12)
  66                              x
  67                         )) )
  68                     (push (make-unicode-char :code (transform code)
  69                                              :name name-string)
  70                           all-chars
  71             ) ) ) ) )
  72     ) ) ) )
  73     (setq all-chars (nreverse all-chars))
  74     ;; Split into words.
  75     (let ((words-by-length (make-array 0 :adjustable t)))
  76       (dolist (name (list* "HANGUL SYLLABLE" "CJK COMPATIBILITY" (mapcar #'unicode-char-name all-chars)))
  77         (let ((i1 0))
  78           (loop
  79             (when (>= i1 (length name)) (return))
  80             (let ((i2 (or (position #\Space name :start i1) (length name))))
  81               (let* ((word (subseq name i1 i2))
  82                      (len (length word)))
  83                 (when (>= len (length words-by-length))
  84                   (adjust-array words-by-length (1+ len))
  85                 )
  86                 (unless (aref words-by-length len)
  87                   (setf (aref words-by-length len)
  88                         (make-word-list
  89                           :hashed (make-hash-table :test #'equal)
  90                           :sorted '()
  91                 ) )     )
  92                 (let ((word-list (aref words-by-length len)))
  93                   (unless (gethash word (word-list-hashed word-list))
  94                     (setf (gethash word (word-list-hashed word-list)) t)
  95                     (push word (word-list-sorted word-list))
  96                 ) )
  97               )
  98               (setq i1 (1+ i2))
  99       ) ) ) )
 100       ;; Sort the word lists.
 101       (dotimes (len (length words-by-length))
 102         (unless (aref words-by-length len)
 103           (setf (aref words-by-length len)
 104                 (make-word-list
 105                   :hashed (make-hash-table :test #'equal)
 106                   :sorted '()
 107         ) )     )
 108         (let ((word-list (aref words-by-length len)))
 109           (setf (word-list-sorted word-list)
 110                 (sort (word-list-sorted word-list) #'string<)
 111           )
 112           (setf (word-list-size word-list)
 113                 (reduce #'+ (mapcar #'length (word-list-sorted word-list)))
 114           )
 115           (setf (word-list-length word-list)
 116                 (length (word-list-sorted word-list))
 117       ) ) )
 118       ;; Output the tables.
 119       (with-open-file (ostream outputfile :direction :output
 120                        #+UNICODE :external-format #+UNICODE charset:ascii)
 121         (format ostream "/*~%")
 122         (format ostream " * ~A~%" (file-namestring outputfile))
 123         (format ostream " *~%")
 124         (format ostream " * Unicode character name table.~%")
 125         (format ostream " * Generated automatically by the gen-uninames utility.~%")
 126         (format ostream " */~%")
 127         (format ostream "~%")
 128         (format ostream "static const char unicode_name_words[~D] = {~%"
 129                         (let ((sum 0))
 130                           (dotimes (len (length words-by-length))
 131                             (let ((word-list (aref words-by-length len)))
 132                               (incf sum (word-list-size word-list))
 133                           ) )
 134                           sum
 135         )               )
 136         (dotimes (len (length words-by-length))
 137           (let ((word-list (aref words-by-length len)))
 138             (dolist (word (word-list-sorted word-list))
 139               (format ostream " ~{ '~C',~}~%" (coerce word 'list))
 140         ) ) )
 141         (format ostream "};~%")
 142         (format ostream "#define UNICODE_CHARNAME_NUM_WORDS ~D~%"
 143                         (let ((sum 0))
 144                           (dotimes (len (length words-by-length))
 145                             (let ((word-list (aref words-by-length len)))
 146                               (incf sum (word-list-length word-list))
 147                           ) )
 148                           sum
 149         )               )
 150         #| ; Redundant data
 151         (format ostream "static const uint16_t unicode_name_word_offsets[~D] = {~%"
 152                         (let ((sum 0))
 153                           (dotimes (len (length words-by-length))
 154                             (let ((word-list (aref words-by-length len)))
 155                               (incf sum (word-list-length word-list))
 156                           ) )
 157                           sum
 158         )               )
 159         (dotimes (len (length words-by-length))
 160           (let ((word-list (aref words-by-length len)))
 161             (when (word-list-sorted word-list)
 162               (format ostream " ")
 163               (do ((l (word-list-sorted word-list) (cdr l))
 164                    (offset 0 (+ offset (length (car l)))))
 165                   ((endp l))
 166                 (format ostream "~<~% ~0,79:; ~D,~>" offset)
 167               )
 168               (format ostream "~%")
 169         ) ) )
 170         (format ostream "};~%")
 171         |#
 172         (format ostream "static const struct { uint16_t extra_offset; uint16_t ind_offset; } unicode_name_by_length[~D] = {~%"
 173                         (1+ (length words-by-length))
 174         )
 175         (let ((extra-offset 0)
 176               (ind-offset 0))
 177           (dotimes (len (length words-by-length))
 178             (let ((word-list (aref words-by-length len)))
 179               (format ostream "  { ~D, ~D },~%" extra-offset ind-offset)
 180               (incf extra-offset (word-list-size word-list))
 181               (incf ind-offset (word-list-length word-list))
 182           ) )
 183           (format ostream "  { ~D, ~D }~%" extra-offset ind-offset)
 184         )
 185         (format ostream "};~%")
 186         (let ((ind-offset 0))
 187           (dotimes (len (length words-by-length))
 188             (let ((word-list (aref words-by-length len)))
 189               (dolist (word (word-list-sorted word-list))
 190                 (setf (gethash word (word-list-hashed word-list)) ind-offset)
 191                 (incf ind-offset)
 192         ) ) ) )
 193         (dolist (word '("HANGUL" "SYLLABLE" "CJK" "COMPATIBILITY"))
 194           (format ostream "#define UNICODE_CHARNAME_WORD_~A ~D~%" word
 195                           (gethash word (word-list-hashed (aref words-by-length (length word))))
 196         ) )
 197         ;; Compute the word-indices for every unicode-char.
 198         (dolist (uc all-chars)
 199           (let ((name (unicode-char-name uc))
 200                 (indices '()))
 201             (let ((i1 0))
 202               (loop
 203                 (when (>= i1 (length name)) (return))
 204                 (let ((i2 (or (position #\Space name :start i1) (length name))))
 205                   (let* ((word (subseq name i1 i2))
 206                          (len (length word)))
 207                     (push (gethash word (word-list-hashed (aref words-by-length len)))
 208                           indices
 209                     )
 210                   )
 211                   (setq i1 (1+ i2))
 212             ) ) )
 213             (setf (unicode-char-word-indices uc)
 214                   (coerce (nreverse indices) 'vector)
 215             )
 216         ) )
 217         ;; Sort the list of unicode-chars by word-indices.
 218         (setq all-chars
 219               (sort all-chars
 220                     (lambda (vec1 vec2)
 221                       (let ((len1 (length vec1))
 222                             (len2 (length vec2)))
 223                         (do ((i 0 (1+ i)))
 224                             (nil)
 225                           (if (< i len2)
 226                             (if (< i len1)
 227                               (cond ((< (aref vec1 i) (aref vec2 i)) (return t))
 228                                     ((> (aref vec1 i) (aref vec2 i)) (return nil))
 229                               )
 230                               (return t)
 231                             )
 232                             (return nil)
 233                     ) ) ) )
 234                     :key #'unicode-char-word-indices
 235         )     )
 236         ;; Output the word-indices.
 237         (format ostream "static const uint16_t unicode_names[~D] = {~%"
 238                         (reduce #'+ (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
 239         )
 240         (let ((i 0))
 241           (dolist (uc all-chars)
 242             (format ostream " ~{ ~D,~}"
 243                             (maplist (lambda (r) (+ (* 2 (car r)) (if (cdr r) 1 0)))
 244                                      (coerce (unicode-char-word-indices uc) 'list)
 245                             )
 246             )
 247             (when add-comments
 248               (format ostream "~40T/* ~A */" (unicode-char-name uc))
 249             )
 250             (format ostream "~%")
 251             (setf (unicode-char-word-indices-index uc) i)
 252             (incf i (length (unicode-char-word-indices uc)))
 253         ) )
 254         (format ostream "};~%")
 255         (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_name_to_code[~D] = {~%"
 256                         (length all-chars)
 257         )
 258         (dolist (uc all-chars)
 259           (format ostream "  { 0x~4,'0X, ~D },"
 260                           (unicode-char-code uc)
 261                           (unicode-char-word-indices-index uc)
 262           )
 263           (when add-comments
 264             (format ostream "~21T/* ~A */" (unicode-char-name uc))
 265           )
 266           (format ostream "~%")
 267         )
 268         (format ostream "};~%")
 269         (format ostream "static const struct { uint16_t code; uint16_t name; } unicode_code_to_name[~D] = {~%"
 270                         (length all-chars)
 271         )
 272         (dolist (uc (sort (copy-list all-chars) #'< :key #'unicode-char-code))
 273           (format ostream "  { 0x~4,'0X, ~D },"
 274                           (unicode-char-code uc)
 275                           (unicode-char-word-indices-index uc)
 276           )
 277           (when add-comments
 278             (format ostream "~21T/* ~A */" (unicode-char-name uc))
 279           )
 280           (format ostream "~%")
 281         )
 282         (format ostream "};~%")
 283         (format ostream "#define UNICODE_CHARNAME_MAX_LENGTH ~D~%"
 284                         (reduce #'max (mapcar (lambda (uc) (length (unicode-char-name uc))) all-chars))
 285         )
 286         (format ostream "#define UNICODE_CHARNAME_MAX_WORDS ~D~%"
 287                         (reduce #'max (mapcar (lambda (uc) (length (unicode-char-word-indices uc))) all-chars))
 288         )
 289       )
 290 ) ) )
 291
 292 (main (first *args*) (second *args*))