;;; -*-lisp-*- ;;; ;;; Efficient buffering character scanner ;;; ;;; (c) 2009 Straylight/Edgeware ;;; ;;;----- Licensing notice --------------------------------------------------- ;;; ;;; This file is part of the Sensble Object Design, an object system for C. ;;; ;;; SOD is free software; you can redistribute it and/or modify ;;; it under the terms of the GNU General Public License as published by ;;; the Free Software Foundation; either version 2 of the License, or ;;; (at your option) any later version. ;;; ;;; SOD is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;;; GNU General Public License for more details. ;;; ;;; You should have received a copy of the GNU General Public License ;;; along with SOD; if not, write to the Free Software Foundation, ;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. (cl:in-package #:sod-parser) ;;;-------------------------------------------------------------------------- ;;; Infrastructure types. (eval-when (:compile-toplevel :load-toplevel :execute) (defconstant charbuf-size 4096 "Number of characters in a character buffer.")) (deftype charbuf () "Type of character buffers." `(simple-string ,charbuf-size)) (deftype charbuf-index () "Type of indices into character buffers." `(integer 0 ,charbuf-size)) (declaim (inline make-charbuf)) (defun make-charbuf () "Return a fresh uninitialized character buffer." (make-array charbuf-size :element-type 'character)) (defstruct charbuf-chain-link "A link in the charbuf scanner's buffer chain. Usually the scanner doesn't bother maintaining a buffer chain; but if we've rewound to a captured place then we need to be able to retrace our steps on to later buffers. It turns out to be easier to have an explicit link to the next structure in the chain than to maintain a spine of cons cells, so we do that; the only other things we need are the buffer itself and its length, which might be shorter than `charbuf-size', e.g., if we hit end-of-file." (next nil :type (or charbuf-chain-link null)) (buf nil :type (or charbuf (member nil :eof)) :read-only t) (size 0 :type charbuf-index :read-only t)) (export 'charbuf-scanner-place-p) (defstruct charbuf-scanner-place "A captured place we can return to later. We remember the buffer-chain link, so that we can retrace our steps up to the present. We also need the index at which we continue reading characters; and the line and column numbers to resume from." (scanner nil :type charbuf-scanner :read-only t) (link nil :type charbuf-chain-link :read-only t) (index 0 :type charbuf-index :read-only t) (line 0 :type fixnum :read-only t) (column 0 :type fixnum :read-only t)) (defmethod file-location ((place charbuf-scanner-place)) (make-file-location (scanner-filename (charbuf-scanner-place-scanner place)) (charbuf-scanner-place-line place) (charbuf-scanner-place-column place))) ;;;-------------------------------------------------------------------------- ;;; Main class. (export 'charbuf-scanner) (defclass charbuf-scanner (character-scanner) ((stream :initarg :stream :type stream) (buf :initform nil :type (or charbuf (member nil :eof))) (size :initform 0 :type (integer 0 #.charbuf-size)) (index :initform 0 :type (integer 0 #.charbuf-size)) (captures :initform 0 :type (and fixnum unsigned-byte)) (tail :initform nil :type (or charbuf-chain-link null)) (unread :initform nil :type (or charbuf-chain-link nil)) (filename :initarg :filename :type (or string null) :reader scanner-filename) (line :initarg :line :initform 1 :type fixnum :reader scanner-line) (column :initarg :column :initform 0 :type fixnum :reader scanner-column)) (:documentation "An efficient rewindable scanner for character streams. The scanner should be used via the parser protocol. The following notes describe the class's slots and the invariants maintained by the class. The scanner reads characters from STREAM. It reads in chunks, `charbuf-size' characters at a time, into freshly allocated arrays. At the beginning of time, BUF is nil; and SIZE is 0, indicating that a new buffer needs to be read in; this anomalous situation is remedied during instance initialization. At all times thereafter: * If SIZE > 0 then BUF is a `charbuf' containing characters. * (<= 0 INDEX SIZE charbuf-size). When the current buffer is finished with, another one is fetched. If we've rewound the scanner to a captured place, then there'll be a chain of buffers starting at TAIL (which corresponds to the current buffer); and we should use its NEXT buffer when we've finished this one. If there is no next buffer then we should acquire a new one and fill it from the input stream. If there is an outstanding captured place then we must also create a buffer chain entry for this new buffer and link it onto the chain. If there aren't outstanding captures then we don't need to bother with any of that -- earlier places certainly can't be captured and a capture of the current position can allocate its own buffer chain entry. Which leaves us with the need to determine whether there are outstanding captures. We simply maintain a counter, and rely on the client releasing captured places properly when he's finished. In practice, this is usually done using the `peek' parser macro so there isn't a problem.")) ;;;-------------------------------------------------------------------------- ;;; Utilities. (defgeneric charbuf-scanner-fetch (scanner) (:documentation "Refill the scanner buffer. This is an internal method, which is really only a method so that the compiler will optimize slot references. Replace the current buffer with the next one, either from the buffer chain (if we're currently rewound) or with a new buffer from the stream.")) (defmethod charbuf-scanner-fetch ((scanner charbuf-scanner)) (with-slots (stream buf size index tail captures) scanner (loop (acond ;; If we've hit the end of the line, stop. ((eq buf :eof) (return nil)) ;; If there's another buffer, we should check it out. ((and tail (charbuf-chain-link-next tail)) (setf tail it buf (charbuf-chain-link-buf it) size (charbuf-chain-link-size it) index 0)) ;; No joy: try reading more stuff from the input stream. (t (let* ((new (make-charbuf)) (n (read-sequence new stream :start 0 :end charbuf-size))) ;; If there's nothing coming in then store a magical marker. (when (zerop n) (setf new :eof)) ;; If there's someone watching, link a new entry onto the chain. ;; There must, under these circumstances, be a `tail'. (if (plusp captures) (let ((next (make-charbuf-chain-link :buf new :size n))) (setf (charbuf-chain-link-next tail) next tail next)) (setf tail nil)) ;; Store the new state. (setf buf new size n index 0)))) ;; If there's stuff in the current buffer, we're done. (when (< index size) (return t))))) (export 'charbuf-scanner-map) (defgeneric charbuf-scanner-map (scanner func &optional fail) (:documentation "Read characters from the SCANNER's raw buffers. This is intended to be an efficient and versatile interface for reading characters from a scanner in bulk. The FUNC is invoked repeatedly with three arguments: a simple string BUF and two nonnegative fixnums START and END, indicating that the subsequence of BUF between START (inclusive) and END (exclusive) should be processed. The FUNC returns two values: a generalized boolean DONEP and a nonnegative fixnum USED. If DONEP is false then USED is ignored: the function has consumed the entire buffer and wishes to read more. If DONEP is true then the condition (<= START USED END) must hold; the FUNC has consumed the buffer as far as USED (exclusive) and has completed successfully; the values DONEP and `t' are returned as the result of `charbuf-scanner-map'. If end-of-file is encountered before FUNC completes successfully then FAIL is called with no arguments, and `charbuf-scanner-map' returns whatever FAIL returns. Observe that, if FAIL returns a second value of nil, then `charbuf-scanner-map' is usable as a parser expression.")) (defmethod charbuf-scanner-map ((scanner charbuf-scanner) func &optional fail) (with-slots (buf index size) scanner (flet ((offer (buf start end) ;; Pass the buffer to the function, and see what it thought. (multiple-value-bind (donep used) (funcall func buf start end) ;; Update the position as far as the function read. (with-slots (line column) scanner (let ((l line) (c column) (limit (if donep used end))) (do ((i start (1+ i))) ((>= i limit)) (setf (values l c) (update-position (char buf i) l c))) (setf line l column c))) ;; If the function is finished then update our state and ;; return. (when donep (setf index used) (when (>= index size) (charbuf-scanner-fetch scanner)) (return-from charbuf-scanner-map (values donep t)))))) ;; If there's anything in the current buffer, offer it to the function. (when (< index size) (offer buf index size)) ;; Repeatedly fetch new buffers and offer them to the function. ;; Because the buffers are fresh, we know that we must process them ;; from the beginning. Note that `offer' will exit if FUNC has ;; finished, so we don't need to worry about that. (loop (unless (charbuf-scanner-fetch scanner) (return (if fail (funcall fail) (values nil nil)))) (offer buf 0 size))))) ;;;-------------------------------------------------------------------------- ;;; Initialization. (defmethod shared-initialize :after ((scanner charbuf-scanner) slot-names &key) ;; Grab the filename from the underlying stream if we don't have a better ;; guess. (default-slot (scanner 'filename slot-names) (with-slots (stream) scanner (aif (stream-pathname stream) (namestring it) nil))) ;; Get ready with the first character. (charbuf-scanner-fetch scanner)) ;;;-------------------------------------------------------------------------- ;;; Scanner protocol implementation. (defmethod scanner-at-eof-p ((scanner charbuf-scanner)) (with-slots (buf) scanner (eq buf :eof))) (defmethod scanner-current-char ((scanner charbuf-scanner)) (with-slots (buf index) scanner (schar buf index))) (defmethod scanner-step ((scanner charbuf-scanner)) (with-slots (buf size index line column) scanner ;; If there's a current character then update the position from it. When ;; is there a current character? When the index is valid. (when (< index size) (setf (values line column) (update-position (schar buf index) line column))) ;; Now move the position on. If there's still a character left then we ;; win; otherwise fetch another buffer. (or (< (incf index) size) (charbuf-scanner-fetch scanner)))) (defmethod scanner-unread ((scanner charbuf-scanner) char) (with-slots (buf index size unread tail line column) scanner (cond ;; First, let's rewind the buffer index. This isn't going to work if ;; the index is already zero. (Note that this implies that INDEX is ;; zero in the remaining cases.) ((plusp index) (decf index)) ;; Plan B. Maybe we've been here before, in which case we'll have left ;; the appropriate state kicking about already. Note that, according ;; to the `unread' rules, the character must be the same as last time, ;; so we can just reuse the whole thing unchanged. Also, note that ;; the NEXT field in UNREAD is not nil due to the way that we construct ;; the link below. ((and unread (eql (charbuf-chain-link-next unread) tail)) (setf tail unread size 1 buf (charbuf-chain-link-buf unread))) ;; Nope, we've not been here, at least not recently. We'll concoct a ;; new buffer and put the necessary stuff in it. Store it away for ;; later so that repeated read/unread oscillations at this position ;; don't end up consing enormous arrays too much. (t (let* ((next (or tail (make-charbuf-chain-link :buf buf :size size))) (fake (make-charbuf)) (this (make-charbuf-chain-link :buf fake :size 1 :next next))) (setf (schar fake 0) char buf fake size 1 tail this unread this)))) ;; That's that sorted; now we have to fiddle the position. (setf (values line column) (backtrack-position char line column)))) (defmethod scanner-capture-place ((scanner charbuf-scanner)) (with-slots (buf size index captures tail line column) scanner (incf captures) (unless tail (setf tail (make-charbuf-chain-link :buf buf :size size))) (make-charbuf-scanner-place :scanner scanner :link tail :index index :line line :column column))) (defmethod scanner-restore-place ((scanner charbuf-scanner) place) (with-slots (buf size index tail line column) scanner (let ((link (charbuf-scanner-place-link place))) (setf buf (charbuf-chain-link-buf link) size (charbuf-chain-link-size link) index (charbuf-scanner-place-index place) line (charbuf-scanner-place-line place) column (charbuf-scanner-place-column place) tail link)))) (defmethod scanner-release-place ((scanner charbuf-scanner) place) (declare (ignore place)) (with-slots (captures) scanner (decf captures))) (defstruct (charbuf-slice (:constructor make-charbuf-slice (buf &optional (start 0) %end &aux (end (or %end (length buf)))))) (buf nil :type (or charbuf (eql :eof)) :read-only t) (start 0 :type (and fixnum unsigned-byte) :read-only t) (end 0 :type (and fixnum unsigned-byte) :read-only t)) (declaim (inline charbuf-slice-length)) (defun charbuf-slice-length (slice) (- (charbuf-slice-end slice) (charbuf-slice-start slice))) (defun concatenate-charbuf-slices (slices) (let* ((len (reduce #'+ slices :key #'charbuf-slice-length :initial-value 0)) (string (make-array len :element-type 'character)) (i 0)) (dolist (slice slices) (let ((buf (charbuf-slice-buf slice)) (end (charbuf-slice-end slice))) (do ((j (charbuf-slice-start slice) (1+ j))) ((>= j end)) (setf (schar string i) (schar buf j)) (incf i)))) string)) (defmethod scanner-interval ((scanner charbuf-scanner) place-a &optional place-b) (let* ((slices nil) (place-b (or place-b (with-slots (index tail) scanner (make-charbuf-scanner-place :scanner scanner :link tail :index index)))) (last-link (charbuf-scanner-place-link place-b))) (flet ((bad () (error "Incorrect places ~S and ~S to `scanner-interval'." place-a place-b))) (do ((link (charbuf-scanner-place-link place-a) (charbuf-chain-link-next link)) (start (charbuf-scanner-place-index place-a) 0)) ((eq link last-link) (let ((end (charbuf-scanner-place-index place-b))) (when (< end start) (bad)) (push (make-charbuf-slice (charbuf-chain-link-buf link) start end) slices) (concatenate-charbuf-slices (nreverse slices)))) (when (null link) (bad)) (push (make-charbuf-slice (charbuf-chain-link-buf link) start (charbuf-chain-link-size link)) slices))))) ;;;-------------------------------------------------------------------------- ;;; Specialized streams. (export 'charbuf-scanner-stream) (defclass charbuf-scanner-stream (character-scanner-stream) ((scanner :initarg :scanner :type charbuf-scanner))) (defmethod make-scanner-stream ((scanner charbuf-scanner)) (make-instance 'charbuf-scanner-stream :scanner scanner)) (defmethod stream-read-sequence ((stream charbuf-scanner-stream) (seq string) #+clisp &key #-clisp &optional (start 0) end) (with-slots (scanner) stream (unless end (setf end (length seq))) (let ((i start) (n (- end start))) (labels ((copy (i buf start end) (do ((j i (1+ j)) (k start (1+ k))) ((>= k end)) (setf (char seq j) (schar buf k)))) (snarf (buf start end) (let ((m (- end start))) (cond ((< m n) (copy i buf start end) (decf n m) (incf i m) (values nil 0)) (t (copy i buf start (+ start n)) (incf i n) (values t n)))))) (charbuf-scanner-map scanner #'snarf) i)))) (defmethod stream-read-line ((stream charbuf-scanner-stream)) (with-slots (scanner) stream (let ((slices nil)) (flet ((snarf (buf start end) (let ((pos (position #\newline buf :start start :end end))) (push (make-charbuf-slice buf start (or pos end)) slices) (if pos (values (concatenate-charbuf-slices (nreverse slices)) (1+ pos)) (values nil 0)))) (fail () (values (concatenate-charbuf-slices (nreverse slices)) t))) (charbuf-scanner-map scanner #'snarf #'fail))))) ;;;----- That's all, folks --------------------------------------------------