Windows installer: Update README.txt.
[maxima.git] / share / stringproc / unicode-sniffer.lisp
blob35543d92696b373caf494f8c9735c6f4157f3b98
1 ;; unicode-sniffer.lisp -- attempt to determine Unicode encoding by inspecting bytes
2 ;; copyright 2018 by Robert Dodier
3 ;; I release this work under terms of the GNU General Public License, version 2
5 ;; Adapted from: https://en.wikipedia.org/wiki/Byte_order_mark
6 (defparameter unicode-signatures
7 ;; Sort signatures in order of decreasing length,
8 ;; so longer signatures are tested first.
9 ;; This really only makes a difference for UTF-16le vs UTF-32le,
10 ;; but it is harmless in other cases.
11 (sort
12 '(((#xEF #xBB #xBF) . #+clisp charset:utf-8 #-clisp :utf-8)
14 ((#xFE #xFF) . #+clisp charset:unicode-16-big-endian #+ecl :UCS-2BE #+cmucl :utf-16-be #-(or clisp ecl cmucl) :utf-16be)
15 ((#xFF #xFE) . #+clisp charset:unicode-16-little-endian #+ecl :UCS-2LE #+cmucl :utf-16-le #-(or clisp ecl cmucl) :utf-16le)
17 ((#x00 #x00 #xFE #xFF) . #+clisp charset:unicode-32-big-endian #+ecl :UCS-4BE #+cmucl :utf-32-be #-(or clisp ecl cmucl) :utf-32be)
18 ((#xFF #xFE #x00 #x00) . #+clisp charset:unicode-32-little-endian #+ecl :UCS-4LE #+cmucl :utf-32-le #-(or clisp ecl cmucl) :utf-32le)
20 ;; UTF-7 not known to SBCL, CCL, ECL, or CMUCL
21 ((#x2B #x2F #x76 #x38) . #+clisp charset:utf-7 #-clisp :utf-7)
22 ((#x2B #x2F #x76 #x39) . #+clisp charset:utf-7 #-clisp :utf-7)
23 ((#x2B #x2F #x76 #x2B) . #+clisp charset:utf-7 #-clisp :utf-7)
24 ((#x2B #x2F #x76 #x2F) . #+clisp charset:utf-7 #-clisp :utf-7)
25 ((#x2B #x2F #x76 #x38 #x2D) . #+clisp charset:utf-7 #-clisp :utf-7)
27 ;; UTF-1 not known to Clisp, SBCL, CCL, ECL, or CMUCL
28 ((#xF7 #x64 #x4C) . :utf-1)
30 ;; UTF-EBCDIC not known to Clisp, SBCL, CCL, ECL, or CMUCL
31 ;; SBCL knows "US-EBCDIC" but UTF-EBCDIC is different (right?) so not known to SBCL either
32 ((#xDD #x73 #x66 #x73) . :utf-ebcdic)
34 ;; SCSU not known to Clisp, SBCL, CCL, ECL, or CMUCL
35 ((#x0E #xFE #xFF) . :scsu)
37 ;; BOCU not known to Clisp, SBCL, CCL, ECL, or CMUCL
38 ((#xFB #xEE #x28) . :bocu-1)
40 ;; :CP936 is a subset of :GB-18030 according to Wikipedia, so this is a "best fit"
41 ;; GB-18030 and CP936 not known to CMUCL
42 ((#x84 #x31 #x95 #x33) . #+clisp charset:cp936 #+(or ccl sbcl) :cp936 #+ecl :|cp936| #+abcl :gb18030 #-(or clisp ccl sbcl ecl abcl) :gb-18030))
43 #'(lambda (a b) (> (length (car a)) (length (car b))))))
45 (defun sniffer-match (initial-bytes signature-bytes)
46 (let*
47 ((m (length signature-bytes))
48 (initial-bytes-subseq (subseq initial-bytes 0 m))
49 (byte-pairs (mapcar #'(lambda (a b) (list a b)) initial-bytes-subseq signature-bytes)))
50 (loop for p in byte-pairs
51 do (if (not (equal (first p) (second p)))
52 (return-from sniffer-match nil)))
53 t))
55 (defun sniffer-match-search (initial-bytes)
56 (loop for x in unicode-signatures
57 do (if (sniffer-match initial-bytes (car x))
58 (return-from sniffer-match-search (cdr x)))))
60 ;; Given a file name F, returns a Unicode encoding designator
61 ;; if the initial bytes of F match any signature in the UNICODE-SIGNATURES table,
62 ;; otherwise NIL.
64 (defun unicode-sniffer (f)
65 (with-open-file (s f :element-type '(unsigned-byte 8))
66 (let*
67 ((signature-length-max (apply #'max (mapcar #'(lambda (x) (length (car x))) unicode-signatures)))
68 (initial-bytes (loop repeat signature-length-max collect (read-byte s nil))))
69 (sniffer-match-search initial-bytes))))
71 ;; Expose UNICODE-SNIFFER to Maxima user.
72 ;; Returns the symbol name (i.e., a string) of the encoding,
73 ;; if any was found, otherwise false.
75 (defun $inferred_encoding (f)
76 (let ((e (unicode-sniffer f)))
77 (if e (symbol-name e) "DEFAULT")))
79 ;; Try to verify that the inferred encoding is among
80 ;; the encodings known to this Lisp implementation.
81 ;; If there is no known method to check the encoding
82 ;; for this Lisp implementation, return 'UNKNOWN.
83 ;; Otherwise this function returns a generalized Boolean.
85 (defun check-encoding (e)
86 ;; work around ECL bug #435: "UCS-4LE not on list of basic encodings"
87 #+ecl (or (eq e ':ucs-4le) (member e (ext:all-encodings)))
88 #+ccl (ccl:lookup-character-encoding e)
89 #+clisp (equal (symbol-package e) (find-package :charset))
90 ;; CMUCL: flatten table of encodings and look for E among preferred names and their synonyms
91 #+cmucl (member e (apply #'append (mapcar (lambda (l) (if (cdr l) (cons (car l) (cadr l)) l)) (ext:list-all-external-formats))))
92 #+sbcl (check-encoding-sbcl e)
93 #+gcl nil ;; GCL 2.6.12 does not recognize :external-format in OPEN
94 ;; work around ABCL bug: "SYSTEM:AVAILABLE-ENCODINGS symbols strangeness" (https://github.com/armedbear/abcl/issues/82)
95 #+abcl (member (symbol-name e) (mapcar #'symbol-name (system:available-encodings)) :test #'string=)
96 #-(or ecl ccl clisp cmucl sbcl gcl abcl) 'unknown)
98 #+sbcl (defun check-encoding-sbcl (e)
99 (let ((x sb-impl::*external-formats*))
100 (cond
101 ;; not sure when SBCL switched over from hash table to array ... try to handle both
102 ((hash-table-p x) (gethash e x))
103 ((arrayp x)
104 (some
105 #'identity
106 (mapcar (lambda (l) (member e l))
107 (loop for ef across x
108 when (sb-impl::external-format-p ef)
109 collect (sb-impl::ef-names ef)))))
110 (t (merror "CHECK-ENCODING: I don't know how to check encoding for this version of SBCL.")))))
112 ;; Expose CHECK-ENCODING to Maxima user.
113 ;; Argument is an encoding symbol name, such as that returned by $INFERRED_ENCODING.
114 ;; Returns true if the encoding is recognized by the Lisp implementation,
115 ;; false if the encoding is not recognized or the argument is null;
116 ;; if there is no known method to check the encoding, print an error message.
118 ;; CMUCL: symbols for encodings aren't known until this function is called.
119 #+cmucl (ext:list-all-external-formats)
121 (defun $recognized_encoding_p (e)
122 (let ((e-up (string-upcase e)) (e-down (string-downcase e)))
124 (not (null (string= e-up "DEFAULT")))
125 (let ((s (or
126 (find-symbol e #+clisp :charset #-clisp :keyword)
127 (find-symbol e-up #+clisp :charset #-clisp :keyword)
128 (find-symbol e-down #+clisp :charset #-clisp :keyword))))
129 (when s
130 (let ((x (check-encoding s)))
131 (cond
132 ((eq x 'unknown)
133 (merror (intl:gettext "recognized_encoding_p: I don't know how to verify encoding for this Lisp implementation.")))
135 (not (null x))))))))))