1 ;; unicode-sniffer.lisp -- attempt to determine Unicode encoding by inspecting bytes
2 ;; copyright 2018 by Robert Dodier
3 ;; I release this work under terms of the GNU General Public License, version 2
5 ;; Adapted from: https://en.wikipedia.org/wiki/Byte_order_mark
6 (defparameter unicode-signatures
7 ;; Sort signatures in order of decreasing length,
8 ;; so longer signatures are tested first.
9 ;; This really only makes a difference for UTF-16le vs UTF-32le,
10 ;; but it is harmless in other cases.
12 '(((#xEF
#xBB
#xBF
) .
#+clisp charset
:utf-8
#-clisp
:utf-8
)
14 ((#xFE
#xFF
) .
#+clisp charset
:unicode-16-big-endian
#+ecl
:UCS-2BE
#+cmucl
:utf-16-be
#-
(or clisp ecl cmucl
) :utf-16be
)
15 ((#xFF
#xFE
) .
#+clisp charset
:unicode-16-little-endian
#+ecl
:UCS-2LE
#+cmucl
:utf-16-le
#-
(or clisp ecl cmucl
) :utf-16le
)
17 ((#x00
#x00
#xFE
#xFF
) .
#+clisp charset
:unicode-32-big-endian
#+ecl
:UCS-4BE
#+cmucl
:utf-32-be
#-
(or clisp ecl cmucl
) :utf-32be
)
18 ((#xFF
#xFE
#x00
#x00
) .
#+clisp charset
:unicode-32-little-endian
#+ecl
:UCS-4LE
#+cmucl
:utf-32-le
#-
(or clisp ecl cmucl
) :utf-32le
)
20 ;; UTF-7 not known to SBCL, CCL, ECL, or CMUCL
21 ((#x2B
#x2F
#x76
#x38
) .
#+clisp charset
:utf-7
#-clisp
:utf-7
)
22 ((#x2B
#x2F
#x76
#x39
) .
#+clisp charset
:utf-7
#-clisp
:utf-7
)
23 ((#x2B
#x2F
#x76
#x2B
) .
#+clisp charset
:utf-7
#-clisp
:utf-7
)
24 ((#x2B
#x2F
#x76
#x2F
) .
#+clisp charset
:utf-7
#-clisp
:utf-7
)
25 ((#x2B
#x2F
#x76
#x38
#x2D
) .
#+clisp charset
:utf-7
#-clisp
:utf-7
)
27 ;; UTF-1 not known to Clisp, SBCL, CCL, ECL, or CMUCL
28 ((#xF7
#x64
#x4C
) .
:utf-1
)
30 ;; UTF-EBCDIC not known to Clisp, SBCL, CCL, ECL, or CMUCL
31 ;; SBCL knows "US-EBCDIC" but UTF-EBCDIC is different (right?) so not known to SBCL either
32 ((#xDD
#x73
#x66
#x73
) .
:utf-ebcdic
)
34 ;; SCSU not known to Clisp, SBCL, CCL, ECL, or CMUCL
35 ((#x0E
#xFE
#xFF
) .
:scsu
)
37 ;; BOCU not known to Clisp, SBCL, CCL, ECL, or CMUCL
38 ((#xFB
#xEE
#x28
) .
:bocu-1
)
40 ;; :CP936 is a subset of :GB-18030 according to Wikipedia, so this is a "best fit"
41 ;; GB-18030 and CP936 not known to CMUCL
42 ((#x84
#x31
#x95
#x33
) .
#+clisp charset
:cp936
#+(or ccl sbcl
) :cp936
#+ecl
:|cp936|
#+abcl
:gb18030
#-
(or clisp ccl sbcl ecl abcl
) :gb-18030
))
43 #'(lambda (a b
) (> (length (car a
)) (length (car b
))))))
45 (defun sniffer-match (initial-bytes signature-bytes
)
47 ((m (length signature-bytes
))
48 (initial-bytes-subseq (subseq initial-bytes
0 m
))
49 (byte-pairs (mapcar #'(lambda (a b
) (list a b
)) initial-bytes-subseq signature-bytes
)))
50 (loop for p in byte-pairs
51 do
(if (not (equal (first p
) (second p
)))
52 (return-from sniffer-match nil
)))
55 (defun sniffer-match-search (initial-bytes)
56 (loop for x in unicode-signatures
57 do
(if (sniffer-match initial-bytes
(car x
))
58 (return-from sniffer-match-search
(cdr x
)))))
60 ;; Given a file name F, returns a Unicode encoding designator
61 ;; if the initial bytes of F match any signature in the UNICODE-SIGNATURES table,
64 (defun unicode-sniffer (f)
65 (with-open-file (s f
:element-type
'(unsigned-byte 8))
67 ((signature-length-max (apply #'max
(mapcar #'(lambda (x) (length (car x
))) unicode-signatures
)))
68 (initial-bytes (loop repeat signature-length-max collect
(read-byte s nil
))))
69 (sniffer-match-search initial-bytes
))))
71 ;; Expose UNICODE-SNIFFER to Maxima user.
72 ;; Returns the symbol name (i.e., a string) of the encoding,
73 ;; if any was found, otherwise false.
75 (defun $inferred_encoding
(f)
76 (let ((e (unicode-sniffer f
)))
77 (if e
(symbol-name e
) "DEFAULT")))
79 ;; Try to verify that the inferred encoding is among
80 ;; the encodings known to this Lisp implementation.
81 ;; If there is no known method to check the encoding
82 ;; for this Lisp implementation, return 'UNKNOWN.
83 ;; Otherwise this function returns a generalized Boolean.
85 (defun check-encoding (e)
86 ;; work around ECL bug #435: "UCS-4LE not on list of basic encodings"
87 #+ecl
(or (eq e
':ucs-4le
) (member e
(ext:all-encodings
)))
88 #+ccl
(ccl:lookup-character-encoding e
)
89 #+clisp
(equal (symbol-package e
) (find-package :charset
))
90 ;; CMUCL: flatten table of encodings and look for E among preferred names and their synonyms
91 #+cmucl
(member e
(apply #'append
(mapcar (lambda (l) (if (cdr l
) (cons (car l
) (cadr l
)) l
)) (ext:list-all-external-formats
))))
92 #+sbcl
(check-encoding-sbcl e
)
93 #+gcl nil
;; GCL 2.6.12 does not recognize :external-format in OPEN
94 ;; work around ABCL bug: "SYSTEM:AVAILABLE-ENCODINGS symbols strangeness" (https://github.com/armedbear/abcl/issues/82)
95 #+abcl
(member (symbol-name e
) (mapcar #'symbol-name
(system:available-encodings
)) :test
#'string
=)
96 #-
(or ecl ccl clisp cmucl sbcl gcl abcl
) 'unknown
)
98 #+sbcl
(defun check-encoding-sbcl (e)
99 (let ((x sb-impl
::*external-formats
*))
101 ;; not sure when SBCL switched over from hash table to array ... try to handle both
102 ((hash-table-p x
) (gethash e x
))
106 (mapcar (lambda (l) (member e l
))
107 (loop for ef across x
108 when
(sb-impl::external-format-p ef
)
109 collect
(sb-impl::ef-names ef
)))))
110 (t (merror "CHECK-ENCODING: I don't know how to check encoding for this version of SBCL.")))))
112 ;; Expose CHECK-ENCODING to Maxima user.
113 ;; Argument is an encoding symbol name, such as that returned by $INFERRED_ENCODING.
114 ;; Returns true if the encoding is recognized by the Lisp implementation,
115 ;; false if the encoding is not recognized or the argument is null;
116 ;; if there is no known method to check the encoding, print an error message.
118 ;; CMUCL: symbols for encodings aren't known until this function is called.
119 #+cmucl
(ext:list-all-external-formats
)
121 (defun $recognized_encoding_p
(e)
122 (let ((e-up (string-upcase e
)) (e-down (string-downcase e
)))
124 (not (null (string= e-up
"DEFAULT")))
126 (find-symbol e
#+clisp
:charset
#-clisp
:keyword
)
127 (find-symbol e-up
#+clisp
:charset
#-clisp
:keyword
)
128 (find-symbol e-down
#+clisp
:charset
#-clisp
:keyword
))))
130 (let ((x (check-encoding s
)))
133 (merror (intl:gettext
"recognized_encoding_p: I don't know how to verify encoding for this Lisp implementation.")))
135 (not (null x
))))))))))