From 81cf8dd429c9f62b03addc006d3d7ba529862f55 Mon Sep 17 00:00:00 2001 From: David Lichteblau Date: Sat, 19 May 2007 14:24:28 +0200 Subject: [PATCH] patterns lueppen --- clex.lisp | 14 +++-- compact.lisp | 58 ++++++++++----------- cxml-rng.asd | 1 + nppcre.lisp | 163 +++++++++++++++------------------------------------------- types.lisp | 2 +- validate.lisp | 4 +- 6 files changed, 85 insertions(+), 157 deletions(-) diff --git a/clex.lisp b/clex.lisp index dff32bc..e30492b 100644 --- a/clex.lisp +++ b/clex.lisp @@ -36,12 +36,12 @@ ;; - Disable *full-table-p* by default. ;; - Added SBCL case to the CMUCL workarounds. -(defpackage :clex +(defpackage :cxml-clex (:use :cl :runes) (:export #:deflexer #:backup #:begin #:initial #:bag)) -(in-package :CLEX) +(in-package :cxml-clex) ;;; NOTE -- It turns out that this code is a magintude slower under CMUCL ;;; compared to CLISP or ACL. Probably they do not have a good implementation of @@ -73,9 +73,14 @@ (range* (max amin bmax) amax)) result)))) -(defun ranges- (aa b) +(defun ranges-range (aa b) (mapcan (lambda (a) (range- a b)) aa)) +(defun ranges- (aa b) + (dolist (l b) + (setf aa (ranges-range aa l))) + aa) + (defun partition-range (a pos) (multiple-value-bind (min max) (destructure-range a) (if (and (< min pos) (<= pos max)) @@ -102,8 +107,7 @@ (dolist (k (state-transitions this) (push (cons new that) (state-transitions this))) (when (eq (cdr k) that) - (dolist (l (car k)) ;avoid duplicates - (setf new (ranges- new l))) + (setf new (ranges- new (car k))) ;avoid duplicates (setf (car k) (append new (car k))) (return nil))) ;; split existing ranges to remove overlap diff --git a/compact.lisp b/compact.lisp index a4bded6..9dd4a40 100644 --- a/compact.lisp +++ b/compact.lisp @@ -114,7 +114,7 @@ (defun nc-name-p (str) (and (cxml-types::namep str) (cxml::nc-name-p str))) -(clex:deflexer rng +(cxml-clex:deflexer rng ( ;; NCName (letter+extras @@ -155,71 +155,71 @@ ((* space)) - ((and "##") (clex:begin 'documentation-line)) + ((and "##") (cxml-clex:begin 'documentation-line)) ((and "##" newline)) - ((clex::in documentation-line newline) (clex:begin 'clex:initial)) - ((clex::in documentation-line comment-char) - (return (values 'documentation-line clex:bag))) + ((cxml-clex::in documentation-line newline) (cxml-clex:begin 'cxml-clex:initial)) + ((cxml-clex::in documentation-line comment-char) + (return (values 'documentation-line cxml-clex:bag))) - ((and #\# init-comment-char) (clex:begin 'comment)) + ((and #\# init-comment-char) (cxml-clex:begin 'comment)) ((and #\# newline)) - ((clex::in comment newline) (clex:begin 'clex:initial)) - ((clex::in comment comment-char)) + ((cxml-clex::in comment newline) (cxml-clex:begin 'cxml-clex:initial)) + ((cxml-clex::in comment comment-char)) ((and "'''" (* (or string-char #\' #\")) "'''") (return - (values 'literal-segment (subseq clex:bag 3 (- (length clex:bag) 3))))) + (values 'literal-segment (subseq cxml-clex:bag 3 (- (length cxml-clex:bag) 3))))) ((and #\' (* (or string-char #\")) #\') - (when (or (find (code-char 13) clex:bag) - (find (code-char 10) clex:bag)) + (when (or (find (code-char 13) cxml-clex:bag) + (find (code-char 10) cxml-clex:bag)) (rng-error nil "disallowed newline in string literal")) (return - (values 'literal-segment (subseq clex:bag 1 (- (length clex:bag) 1))))) + (values 'literal-segment (subseq cxml-clex:bag 1 (- (length cxml-clex:bag) 1))))) ((and #\" #\" #\" (* (or string-char #\' #\")) #\" #\" #\") (return - (values 'literal-segment (subseq clex:bag 3 (- (length clex:bag) 3))))) + (values 'literal-segment (subseq cxml-clex:bag 3 (- (length cxml-clex:bag) 3))))) ((and #\" (* (or string-char #\')) #\") - (when (or (find (code-char 13) clex:bag) - (find (code-char 10) clex:bag)) + (when (or (find (code-char 13) cxml-clex:bag) + (find (code-char 10) cxml-clex:bag)) (rng-error nil "disallowed newline in string literal")) (return - (values 'literal-segment (subseq clex:bag 1 (- (length clex:bag) 1))))) + (values 'literal-segment (subseq cxml-clex:bag 1 (- (length cxml-clex:bag) 1))))) ((and name-start-char (* name-char)) (return (cond - ((find clex:bag *keywords* :test #'equal) - (let ((sym (intern (string-upcase clex:bag) :keyword))) + ((find cxml-clex:bag *keywords* :test #'equal) + (let ((sym (intern (string-upcase cxml-clex:bag) :keyword))) (values sym sym))) - ((find #\: clex:bag) - (let* ((pos (position #\: clex:bag)) - (prefix (subseq clex:bag 0 pos)) - (lname (subseq clex:bag (1+ pos )))) + ((find #\: cxml-clex:bag) + (let* ((pos (position #\: cxml-clex:bag)) + (prefix (subseq cxml-clex:bag 0 pos)) + (lname (subseq cxml-clex:bag (1+ pos )))) (when (find #\: lname) (rng-error "too many colons")) (unless (and (nc-name-p prefix)) (rng-error nil "not an ncname: ~A" prefix)) - (let ((ch (clex::getch))) + (let ((ch (cxml-clex::getch))) (cond ((and (equal lname "") (eql ch #\*)) (values 'nsname prefix)) (t - (clex::backup ch) + (cxml-clex::backup ch) (unless (and (nc-name-p lname)) (rng-error nil "not an ncname: ~A" lname)) (values 'cname (cons prefix lname))))))) (t - (unless (nc-name-p clex:bag) - (rng-error nil "not an ncname: ~A" clex:bag)) - (values 'identifier clex:bag))))) + (unless (nc-name-p cxml-clex:bag) + (rng-error nil "not an ncname: ~A" cxml-clex:bag)) + (values 'identifier cxml-clex:bag))))) ((and #\\ name-start-char (* name-char)) - (let ((str (subseq clex:bag 1))) + (let ((str (subseq cxml-clex:bag 1))) (unless (nc-name-p str) - (rng-error nil "not an ncname: ~A" clex:bag)) + (rng-error nil "not an ncname: ~A" cxml-clex:bag)) (return (values 'identifier str)))) (#\= (double '=)) diff --git a/cxml-rng.asd b/cxml-rng.asd index c1ec9e9..e64adff 100644 --- a/cxml-rng.asd +++ b/cxml-rng.asd @@ -17,6 +17,7 @@ :components ((:file "package") (:file "floats") + (:file "unicode") (:file "nppcre") (:file "types") (:file "parse") diff --git a/nppcre.lisp b/nppcre.lisp index d8454bd..b58f2c3 100644 --- a/nppcre.lisp +++ b/nppcre.lisp @@ -46,29 +46,22 @@ (eval-when (:compile-toplevel :load-toplevel :execute) (defparameter *standard-optimize-settings* '(optimize))) -(defvar *real-convert-aux* #'cl-ppcre::convert-aux) -(defvar *convert-char-class-to-hash* #'cl-ppcre::convert-char-class-to-hash) +(defvar *in-pattern-parser-p* nil) -;;; zzz Evil hacks! +(defvar *convert-char-class-to-hash* #'cl-ppcre::convert-char-class-to-hash) -(format t "Patching CL-PPCRE::CONVERT-AUX~%") -(setf (fdefinition 'cl-ppcre::convert-aux) - (lambda (tree) - (if (typep tree 'cl-ppcre::regex) - tree - (funcall *real-convert-aux* tree)))) +;;; zzz Evil hack! (format t "Patching CL-PPCRE::CONVERT-CHAR-CLASS-TO-HASH~%") (setf (fdefinition 'cl-ppcre::convert-char-class-to-hash) (lambda (list) - (let ((hash (funcall *convert-char-class-to-hash* - (remove-if (lambda (x) - (typep x 'cl-ppcre::char-class)) - list)))) - (dolist (x list) - (when (typep x 'cl-ppcre::char-class) - (add-char-class-to-hash x hash))) - hash))) + (when *in-pattern-parser-p* + (setf list (mapcan (lambda (x) + (if (symbolp x) + (symbol-value x) + x)) + list))) + (funcall *convert-char-class-to-hash* list))) (defun signal-ppcre-syntax-error (fmt &rest args) (error "invalid pattern: ~?" fmt args)) @@ -84,62 +77,15 @@ (t (coerce ,=string= 'simple-string)))))) -(defvar *initial-name-char* - (make-instance 'cl-ppcre::char-class - :hash (cl-ppcre::make-char-hash #'cxml::name-start-rune-p) - :case-insensitive-p nil - :invertedp nil - :word-char-class-p nil)) - -(defvar *non-initial-name-char* - (make-instance 'cl-ppcre::char-class - :hash (cl-ppcre::make-char-hash #'cxml::name-start-rune-p) - :case-insensitive-p nil - :invertedp t - :word-char-class-p nil)) - -(defvar *name-char* - (make-instance 'cl-ppcre::char-class - :hash (cl-ppcre::make-char-hash #'cxml::name-rune-p) - :case-insensitive-p nil - :invertedp nil - :word-char-class-p nil)) - -(defvar *non-name-char* - (make-instance 'cl-ppcre::char-class - :hash (cl-ppcre::make-char-hash #'cxml::name-rune-p) - :case-insensitive-p nil - :invertedp t - :word-char-class-p nil)) - (declaim (inline map-char-to-special-class)) (defun map-char-to-special-char-class (chr) (declare #.*standard-optimize-settings*) "Maps escaped characters like \"\\d\" to the tokens which represent their associated character classes." (case chr - ((#\.) - :non-newline) - ((#\i) - *initial-name-char*) - ((#\I) - *non-initial-name-char*) - ((#\c) - *name-char*) - ((#\C) - *non-name-char*) - ((#\d) - :digit-class) - ((#\D) - :non-digit-class) - ((#\w) - :word-char-class) - ((#\W) - :non-word-char-class) - ((#\s) - :whitespace-char-class) - ((#\S) - :non-whitespace-char-class))) + (#\. '\.) + (#\s '\\s) (#\i '\\i) (#\c '\\c) (#\d '\\d) (#\w '\\w) + (#\S '^s) (#\I '^i) (#\C '^c) (#\D '^d) (#\W '^w))) (locally (declare #.*standard-optimize-settings*) @@ -273,6 +219,14 @@ handled elsewhere." ;; all other characters aren't affected by a backslash chr)))) +(defun convert-substraction (r s) + (flet ((rangify (x) + (etypecase x + (character `((:range ,x ,x))) + (list (assert (eq (car x) :range)) (list x)) + (symbol (copy-list (symbol-value x)))))) + (ranges- (mapcan #'rangify r) (mapcan #'rangify s)))) + (defun collect-char-class (lexer) (declare #.*standard-optimize-settings*) "Reads and consumes characters from regex string until a right @@ -315,9 +269,15 @@ we're inside a range or not." (when (looking-at-p lexer #\[) (incf (lexer-pos lexer)) (return-from collect-char-class - (convert-substraction - (nreverse list) - (collect-char-class lexer)))) + (prog1 + (convert-substraction + (nreverse list) + (collect-char-class lexer)) + (unless + (eql (next-char-non-extended lexer) #\]) + (signal-ppcre-syntax-error* + start-pos + "Missing right bracket to close character class"))))) (push #\- list)) (setq hyphen-seen nil)) (otherwise @@ -337,6 +297,16 @@ we're inside a range or not." ;; reverse the list to preserve the order intended ;; by the author of the regex string (return-from collect-char-class (nreverse list))) + ((and hyphen-seen (char= c #\[)) + (return-from collect-char-class + (prog1 + (convert-substraction + (nreverse list) + (collect-char-class lexer)) + (unless (eql (next-char-non-extended lexer) #\]) + (signal-ppcre-syntax-error* + start-pos + "Missing right bracket to close character class"))))) ((and (char= c #\-) last-char (not hyphen-seen)) @@ -672,7 +642,8 @@ Will return or (:ALTERNATION )." (defun parse-pattern (string) (declare #.*standard-optimize-settings*) "Translate the regex string STRING into a parse tree." - (let* ((lexer (make-lexer string)) + (let* ((*in-pattern-parser-p* t) + (lexer (make-lexer string)) (parse-tree (reverse-strings (reg-expr lexer)))) ;; check whether we've consumed the whole regex string (if (end-of-string-p lexer) @@ -681,58 +652,8 @@ Will return or (:ALTERNATION )." (lexer-pos lexer) "Expected end of string")))) -(defun invert-char-class (char-class) - (with-slots (cl-ppcre::hash cl-ppcre::invertedp) char-class - (make-instance 'cl-ppcre::char-class - :hash (cl-ppcre::merge-inverted-hash (make-hash-table) - cl-ppcre::hash) - :invertedp (not cl-ppcre::invertedp) - :case-insensitive-p nil - :word-char-class-p nil))) - -(defun char-class (regex) - (etypecase regex - (cl-ppcre::char-class regex) - (cl-ppcre::str - (assert (eql 1 (cl-ppcre::len regex))) - (let ((hash (make-hash-table))) - (setf (gethash (elt (cl-ppcre::str regex) 0) hash) t) - (make-instance 'cl-ppcre::char-class - :hash hash - :invertedp nil - :case-insensitive-p nil - :word-char-class-p nil))))) - -(defun listify (x) - (if (listp x) x (list x))) - -(defun convert-substraction (r s) - (let ((result - (char-class (cl-ppcre::convert (cons :char-class (listify r))))) - (minus - (char-class (cl-ppcre::convert (cons :char-class (listify s)))))) - (when (cl-ppcre::invertedp result) - (setf result (invert-char-class result))) - (when (cl-ppcre::invertedp minus) - (setf minus (invert-char-class minus))) - (let ((hash (cl-ppcre::hash result))) - (loop - for char being each hash-key in (cl-ppcre::hash minus) - do (remhash char hash)) - (when (> (hash-table-count hash) (/ cl-ppcre::*regex-char-code-limit* 2)) - (setf result (invert-char-class result)))) - (list result))) - (defmethod pattern-scanner ((str string)) (cl-ppcre:create-scanner (parse-pattern str))) (defmethod pattern-scanner ((scanner function)) scanner) - -(defun add-char-class-to-hash (class hash) - (cl-ppcre::merge-hash hash - (if (cl-ppcre::invertedp class) - (cl-ppcre::merge-inverted-hash - (make-hash-table) - (cl-ppcre::hash class)) - (cl-ppcre::hash class)))) diff --git a/types.lisp b/types.lisp index c8e306f..d73c933 100644 --- a/types.lisp +++ b/types.lisp @@ -1688,7 +1688,7 @@ ;;; language (defmacro precompile (pattern) - `(load-time-value (pattern-scanner ,pattern))) + `(load-time-value (list (pattern-scanner ,pattern)))) (defxsd (language-type "language") (xsd-token-type) ((patterns :initform (precompile "[a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*"))) diff --git a/validate.lisp b/validate.lisp index d0bf580..b90ec13 100644 --- a/validate.lisp +++ b/validate.lisp @@ -780,7 +780,9 @@ (defmethod expectation ((pattern attribute) s) (write-string "an attribute " s) - (describe-name (pattern-name pattern) s)) + (describe-name (pattern-name pattern) s) + (format s "~% with a value of ") + (expectation (pattern-child pattern) s)) (defmethod expectation ((pattern choice) s) (expectation (pattern-a pattern) s) -- 2.11.4.GIT