diff options
Diffstat (limited to 'vim/bundle/slimv/slime/nregex.lisp')
-rw-r--r-- | vim/bundle/slimv/slime/nregex.lisp | 523 |
1 files changed, 523 insertions, 0 deletions
diff --git a/vim/bundle/slimv/slime/nregex.lisp b/vim/bundle/slimv/slime/nregex.lisp new file mode 100644 index 0000000..43586ef --- /dev/null +++ b/vim/bundle/slimv/slime/nregex.lisp @@ -0,0 +1,523 @@ +;;; +;;; This code was written by: +;;; +;;; Lawrence E. Freil <lef@freil.com> +;;; National Science Center Foundation +;;; Augusta, Georgia 30909 +;;; +;;; This program was released into the public domain on 2005-08-31. +;;; (See the slime-devel mailing list archive for details.) +;;; +;;; nregex.lisp - My 4/8/92 attempt at a Lisp based regular expression +;;; parser. +;;; +;;; This regular expression parser operates by taking a +;;; regular expression and breaking it down into a list +;;; consisting of lisp expressions and flags. The list +;;; of lisp expressions is then taken in turned into a +;;; lambda expression that can be later applied to a +;;; string argument for parsing. +;;;; +;;;; Modifications made 6 March 2001 By Chris Double (chris@double.co.nz) +;;;; to get working with Corman Lisp 1.42, add package statement and export +;;;; relevant functions. +;;;; + +(in-package :cl-user) + +;; Renamed to slime-nregex avoid name clashes with other versions of +;; this file. -- he + +;;;; CND - 6/3/2001 +(defpackage slime-nregex + (:use #:common-lisp) + (:export + #:regex + #:regex-compile + )) + +;;;; CND - 6/3/2001 +(in-package :slime-nregex) + +;;; +;;; First we create a copy of macros to help debug the beast +(eval-when (:compile-toplevel :load-toplevel :execute) +(defvar *regex-debug* nil) ; Set to nil for no debugging code +) + +(defmacro info (message &rest args) + (if *regex-debug* + `(format *standard-output* ,message ,@args))) + +;;; +;;; Declare the global variables for storing the paren index list. +;;; +(defvar *regex-groups* (make-array 10)) +(defvar *regex-groupings* 0) + +;;; +;;; Declare a simple interface for testing. You probably wouldn't want +;;; to use this interface unless you were just calling this once. +;;; +(defun regex (expression string) + "Usage: (regex <expression> <string) + This function will call regex-compile on the expression and then apply + the string to the returned lambda list." + (let ((findit (cond ((stringp expression) + (regex-compile expression)) + ((listp expression) + expression))) + (result nil)) + (if (not (funcall (if (functionp findit) + findit + (eval `(function ,findit))) string)) + (return-from regex nil)) + (if (= *regex-groupings* 0) + (return-from regex t)) + (dotimes (i *regex-groupings*) + (push (funcall 'subseq + string + (car (aref *regex-groups* i)) + (cadr (aref *regex-groups* i))) + result)) + (reverse result))) + +;;; +;;; Declare some simple macros to make the code more readable. +;;; +(defvar *regex-special-chars* "?*+.()[]\\${}") + +(defmacro add-exp (list) + "Add an item to the end of expression" + `(setf expression (append expression ,list))) + +;;; +;;; Define a function that will take a quoted character and return +;;; what the real character should be plus how much of the source +;;; string was used. If the result is a set of characters, return an +;;; array of bits indicating which characters should be set. If the +;;; expression is one of the sub-group matches return a +;;; list-expression that will provide the match. +;;; +(defun regex-quoted (char-string &optional (invert nil)) + "Usage: (regex-quoted <char-string> &optional invert) + Returns either the quoted character or a simple bit vector of bits set for + the matching values" + (let ((first (char char-string 0)) + (result (char char-string 0)) + (used-length 1)) + (cond ((eql first #\n) + (setf result #\NewLine)) + ((eql first #\c) + (setf result #\Return)) + ((eql first #\t) + (setf result #\Tab)) + ((eql first #\d) + (setf result #*0000000000000000000000000000000000000000000000001111111111000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) + ((eql first #\D) + (setf result #*1111111111111111111111111111111111111111111111110000000000111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) + ((eql first #\w) + (setf result #*0000000000000000000000000000000000000000000000001111111111000000011111111111111111111111111000010111111111111111111111111110000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) + ((eql first #\W) + (setf result #*1111111111111111111111111111111111111111111111110000000000111111100000000000000000000000000111101000000000000000000000000001111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) + ((eql first #\b) + (setf result #*0000000001000000000000000000000011000000000010100000000000100000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) + ((eql first #\B) + (setf result #*1111111110111111111111111111111100111111111101011111111111011111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) + ((eql first #\s) + (setf result #*0000000001100000000000000000000010000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000)) + ((eql first #\S) + (setf result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111)) + ((and (>= (char-code first) (char-code #\0)) + (<= (char-code first) (char-code #\9))) + (if (and (> (length char-string) 2) + (and (>= (char-code (char char-string 1)) (char-code #\0)) + (<= (char-code (char char-string 1)) (char-code #\9)) + (>= (char-code (char char-string 2)) (char-code #\0)) + (<= (char-code (char char-string 2)) (char-code #\9)))) + ;; + ;; It is a single character specified in octal + ;; + (progn + (setf result (do ((x 0 (1+ x)) + (return 0)) + ((= x 2) return) + (setf return (+ (* return 8) + (- (char-code (char char-string x)) + (char-code #\0)))))) + (setf used-length 3)) + ;; + ;; We have a group number replacement. + ;; + (let ((group (- (char-code first) (char-code #\0)))) + (setf result `((let ((nstring (subseq string (car (aref *regex-groups* ,group)) + (cadr (aref *regex-groups* ,group))))) + (if (< length (+ index (length nstring))) + (return-from compare nil)) + (if (not (string= string nstring + :start1 index + :end1 (+ index (length nstring)))) + (return-from compare nil) + (incf index (length nstring))))))))) + (t + (setf result first))) + (if (and (vectorp result) invert) + (bit-xor result #*1111111110011111111111111111111101111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 t)) + (values result used-length))) + +;;; +;;; Now for the main regex compiler routine. +;;; +(defun regex-compile (source &key (anchored nil)) + "Usage: (regex-compile <expression> [ :anchored (t/nil) ]) + This function take a regular expression (supplied as source) and + compiles this into a lambda list that a string argument can then + be applied to. It is also possible to compile this lambda list + for better performance or to save it as a named function for later + use" + (info "Now entering regex-compile with \"~A\"~%" source) + ;; + ;; This routine works in two parts. + ;; The first pass take the regular expression and produces a list of + ;; operators and lisp expressions for the entire regular expression. + ;; The second pass takes this list and produces the lambda expression. + (let ((expression '()) ; holder for expressions + (group 1) ; Current group index + (group-stack nil) ; Stack of current group endings + (result nil) ; holder for built expression. + (fast-first nil)) ; holder for quick unanchored scan + ;; + ;; If the expression was an empty string then it alway + ;; matches (so lets leave early) + ;; + (if (= (length source) 0) + (return-from regex-compile + '(lambda (&rest args) + (declare (ignore args)) + t))) + ;; + ;; If the first character is a caret then set the anchored + ;; flags and remove if from the expression string. + ;; + (cond ((eql (char source 0) #\^) + (setf source (subseq source 1)) + (setf anchored t))) + ;; + ;; If the first sequence is .* then also set the anchored flags. + ;; (This is purely for optimization, it will work without this). + ;; + (if (>= (length source) 2) + (if (string= source ".*" :start1 0 :end1 2) + (setf anchored t))) + ;; + ;; Also, If this is not an anchored search and the first character is + ;; a literal, then do a quick scan to see if it is even in the string. + ;; If not then we can issue a quick nil, + ;; otherwise we can start the search at the matching character to skip + ;; the checks of the non-matching characters anyway. + ;; + ;; If I really wanted to speed up this section of code it would be + ;; easy to recognize the case of a fairly long multi-character literal + ;; and generate a Boyer-Moore search for the entire literal. + ;; + ;; I generate the code to do a loop because on CMU Lisp this is about + ;; twice as fast a calling position. + ;; + (if (and (not anchored) + (not (position (char source 0) *regex-special-chars*)) + (not (and (> (length source) 1) + (position (char source 1) *regex-special-chars*)))) + (setf fast-first `((if (not (dotimes (i length nil) + (if (eql (char string i) + ,(char source 0)) + (return (setf start i))))) + (return-from final-return nil))))) + ;; + ;; Generate the very first expression to save the starting index + ;; so that group 0 will be the entire string matched always + ;; + (add-exp '((setf (aref *regex-groups* 0) + (list index nil)))) + ;; + ;; Loop over each character in the regular expression building the + ;; expression list as we go. + ;; + (do ((eindex 0 (1+ eindex))) + ((= eindex (length source))) + (let ((current (char source eindex))) + (info "Now processing character ~A index = ~A~%" current eindex) + (case current + ((#\.) + ;; + ;; Generate code for a single wild character + ;; + (add-exp '((if (>= index length) + (return-from compare nil) + (incf index))))) + ((#\$) + ;; + ;; If this is the last character of the expression then + ;; anchor the end of the expression, otherwise let it slide + ;; as a standard character (even though it should be quoted). + ;; + (if (= eindex (1- (length source))) + (add-exp '((if (not (= index length)) + (return-from compare nil)))) + (add-exp '((if (not (and (< index length) + (eql (char string index) #\$))) + (return-from compare nil) + (incf index)))))) + ((#\*) + (add-exp '(ASTRISK))) + + ((#\+) + (add-exp '(PLUS))) + + ((#\?) + (add-exp '(QUESTION))) + + ((#\() + ;; + ;; Start a grouping. + ;; + (incf group) + (push group group-stack) + (add-exp `((setf (aref *regex-groups* ,(1- group)) + (list index nil)))) + (add-exp `(,group))) + ((#\)) + ;; + ;; End a grouping + ;; + (let ((group (pop group-stack))) + (add-exp `((setf (cadr (aref *regex-groups* ,(1- group))) + index))) + (add-exp `(,(- group))))) + ((#\[) + ;; + ;; Start of a range operation. + ;; Generate a bit-vector that has one bit per possible character + ;; and then on each character or range, set the possible bits. + ;; + ;; If the first character is carat then invert the set. + (let* ((invert (eql (char source (1+ eindex)) #\^)) + (bitstring (make-array 256 :element-type 'bit + :initial-element + (if invert 1 0))) + (set-char (if invert 0 1))) + (if invert (incf eindex)) + (do ((x (1+ eindex) (1+ x))) + ((eql (char source x) #\]) (setf eindex x)) + (info "Building range with character ~A~%" (char source x)) + (cond ((and (eql (char source (1+ x)) #\-) + (not (eql (char source (+ x 2)) #\]))) + (if (>= (char-code (char source x)) + (char-code (char source (+ 2 x)))) + (error "Invalid range \"~A-~A\". Ranges must be in acending order" + (char source x) (char source (+ 2 x)))) + (do ((j (char-code (char source x)) (1+ j))) + ((> j (char-code (char source (+ 2 x)))) + (incf x 2)) + (info "Setting bit for char ~A code ~A~%" (code-char j) j) + (setf (sbit bitstring j) set-char))) + (t + (cond ((not (eql (char source x) #\])) + (let ((char (char source x))) + ;; + ;; If the character is quoted then find out what + ;; it should have been + ;; + (if (eql (char source x) #\\ ) + (let ((length)) + (multiple-value-setq (char length) + (regex-quoted (subseq source x) invert)) + (incf x length))) + (info "Setting bit for char ~A code ~A~%" char (char-code char)) + (if (not (vectorp char)) + (setf (sbit bitstring (char-code (char source x))) set-char) + (bit-ior bitstring char t)))))))) + (add-exp `((let ((range ,bitstring)) + (if (>= index length) + (return-from compare nil)) + (if (= 1 (sbit range (char-code (char string index)))) + (incf index) + (return-from compare nil))))))) + ((#\\ ) + ;; + ;; Intreprete the next character as a special, range, octal, group or + ;; just the character itself. + ;; + (let ((length) + (value)) + (multiple-value-setq (value length) + (regex-quoted (subseq source (1+ eindex)) nil)) + (cond ((listp value) + (add-exp value)) + ((characterp value) + (add-exp `((if (not (and (< index length) + (eql (char string index) + ,value))) + (return-from compare nil) + (incf index))))) + ((vectorp value) + (add-exp `((let ((range ,value)) + (if (>= index length) + (return-from compare nil)) + (if (= 1 (sbit range (char-code (char string index)))) + (incf index) + (return-from compare nil))))))) + (incf eindex length))) + (t + ;; + ;; We have a literal character. + ;; Scan to see how many we have and if it is more than one + ;; generate a string= verses as single eql. + ;; + (let* ((lit "") + (term (dotimes (litindex (- (length source) eindex) nil) + (let ((litchar (char source (+ eindex litindex)))) + (if (position litchar *regex-special-chars*) + (return litchar) + (progn + (info "Now adding ~A index ~A to lit~%" litchar + litindex) + (setf lit (concatenate 'string lit + (string litchar))))))))) + (if (= (length lit) 1) + (add-exp `((if (not (and (< index length) + (eql (char string index) ,current))) + (return-from compare nil) + (incf index)))) + ;; + ;; If we have a multi-character literal then we must + ;; check to see if the next character (if there is one) + ;; is an astrisk or a plus or a question mark. If so then we must not use this + ;; character in the big literal. + (progn + (if (or (eql term #\*) + (eql term #\+) + (eql term #\?)) + (setf lit (subseq lit 0 (1- (length lit))))) + (add-exp `((if (< length (+ index ,(length lit))) + (return-from compare nil)) + (if (not (string= string ,lit :start1 index + :end1 (+ index ,(length lit)))) + (return-from compare nil) + (incf index ,(length lit))))))) + (incf eindex (1- (length lit)))))))) + ;; + ;; Plug end of list to return t. If we made it this far then + ;; We have matched! + (add-exp '((setf (cadr (aref *regex-groups* 0)) + index))) + (add-exp '((return-from final-return t))) + ;; +;;; (print expression) + ;; + ;; Now take the expression list and turn it into a lambda expression + ;; replacing the special flags with lisp code. + ;; For example: A BEGIN needs to be replace by an expression that + ;; saves the current index, then evaluates everything till it gets to + ;; the END then save the new index if it didn't fail. + ;; On an ASTRISK I need to take the previous expression and wrap + ;; it in a do that will evaluate the expression till an error + ;; occurs and then another do that encompases the remainder of the + ;; regular expression and iterates decrementing the index by one + ;; of the matched expression sizes and then returns nil. After + ;; the last expression insert a form that does a return t so that + ;; if the entire nested sub-expression succeeds then the loop + ;; is broken manually. + ;; + (setf result (copy-tree nil)) + ;; + ;; Reversing the current expression makes building up the + ;; lambda list easier due to the nexting of expressions when + ;; and astrisk has been encountered. + (setf expression (reverse expression)) + (do ((elt 0 (1+ elt))) + ((>= elt (length expression))) + (let ((piece (nth elt expression))) + ;; + ;; Now check for PLUS, if so then ditto the expression and then let the + ;; ASTRISK below handle the rest. + ;; + (cond ((eql piece 'PLUS) + (cond ((listp (nth (1+ elt) expression)) + (setf result (append (list (nth (1+ elt) expression)) + result))) + ;; + ;; duplicate the entire group + ;; NOTE: This hasn't been implemented yet!! + (t + (error "GROUP repeat hasn't been implemented yet~%"))))) + (cond ((listp piece) ;Just append the list + (setf result (append (list piece) result))) + ((eql piece 'QUESTION) ; Wrap it in a block that won't fail + (cond ((listp (nth (1+ elt) expression)) + (setf result + (append `((progn (block compare + ,(nth (1+ elt) + expression)) + t)) + result)) + (incf elt)) + ;; + ;; This is a QUESTION on an entire group which + ;; hasn't been implemented yet!!! + ;; + (t + (error "Optional groups not implemented yet~%")))) + ((or (eql piece 'ASTRISK) ; Do the wild thing! + (eql piece 'PLUS)) + (cond ((listp (nth (1+ elt) expression)) + ;; + ;; This is a single character wild card so + ;; do the simple form. + ;; + (setf result + `((let ((oindex index)) + (block compare + (do () + (nil) + ,(nth (1+ elt) expression))) + (do ((start index (1- start))) + ((< start oindex) nil) + (let ((index start)) + (block compare + ,@result)))))) + (incf elt)) + (t + ;; + ;; This is a subgroup repeated so I must build + ;; the loop using several values. + ;; + )) + ) + (t t)))) ; Just ignore everything else. + ;; + ;; Now wrap the result in a lambda list that can then be + ;; invoked or compiled, however the user wishes. + ;; + (if anchored + (setf result + `(lambda (string &key (start 0) (end (length string))) + (setf *regex-groupings* ,group) + (block final-return + (block compare + (let ((index start) + (length end)) + ,@result))))) + (setf result + `(lambda (string &key (start 0) (end (length string))) + (setf *regex-groupings* ,group) + (block final-return + (let ((length end)) + ,@fast-first + (do ((marker start (1+ marker))) + ((> marker end) nil) + (let ((index marker)) + (if (block compare + ,@result) + (return t))))))))))) + +;; (provide 'nregex) |