--- /dev/null
+;;; -*-lisp-*-
+;;;
+;;; Efficient buffering character scanner
+;;;
+;;; (c) 2009 Straylight/Edgeware
+;;;
+
+;;;----- Licensing notice ---------------------------------------------------
+;;;
+;;; This file is part of the Sensble Object Design, an object system for C.
+;;;
+;;; SOD is free software; you can redistribute it and/or modify
+;;; it under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 2 of the License, or
+;;; (at your option) any later version.
+;;;
+;;; SOD is distributed in the hope that it will be useful,
+;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with SOD; if not, write to the Free Software Foundation,
+;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+(cl:in-package #:sod-parser)
+
+;;;--------------------------------------------------------------------------
+;;; Infrastructure types.
+
+(defconstant charbuf-size 4096
+ "Number of characters in a character buffer.")
+
+(deftype charbuf ()
+ "Type of character buffers."
+ `(simple-string ,charbuf-size))
+
+(deftype charbuf-index ()
+ "Type of indices into character buffers."
+ `(integer 0 ,charbuf-size))
+
+(declaim (inline make-charbuf))
+(defun make-charbuf ()
+ "Return a fresh uninitialized character buffer."
+ (make-array charbuf-size :element-type 'character))
+
+(defstruct charbuf-chain-link
+ "A link in the charbuf scanner's buffer chain.
+
+ Usually the scanner doesn't bother maintaining a buffer chain; but if
+ we've rewound to a captured place then we need to be able to retrace our
+ steps on to later buffers.
+
+ It turns out to be easier to have an explicit link to the next structure
+ in the chain than to maintain a spine of cons cells, so we do that; the
+ only other things we need are the buffer itself and its length, which
+ might be shorter than `charbuf-size', e.g., if we hit end-of-file."
+ (next nil :type (or charbuf-chain-link null))
+ (buf nil :type (or charbuf (member nil :eof)) :read-only t)
+ (size 0 :type charbuf-index :read-only t))
+
+(export 'charbuf-scanner-place-p)
+(defstruct charbuf-scanner-place
+ "A captured place we can return to later.
+
+ We remember the buffer-chain link, so that we can retrace our steps up to
+ the present. We also need the index at which we continue reading
+ characters; and the line and column numbers to resume from."
+ (link nil :type charbuf-chain-link :read-only t)
+ (index 0 :type charbuf-index :read-only t)
+ (line 0 :type fixnum :read-only t)
+ (column 0 :type fixnum :read-only t))
+
+;;;--------------------------------------------------------------------------
+;;; Main class.
+
+(export 'charbuf-scanner)
+(defclass charbuf-scanner (character-scanner)
+ ((stream :initarg :stream :type stream)
+ (buf :initform nil :type (or charbuf (member nil :eof)))
+ (size :initform 0 :type (integer 0 #.charbuf-size))
+ (index :initform 0 :type (integer 0 #.charbuf-size))
+ (captures :initform 0 :type (and fixnum unsigned-byte))
+ (tail :initform nil :type (or charbuf-chain-link null))
+ (unread :initform nil :type (or charbuf-chain-link nil))
+ (filename :initarg :filename :type (or string null)
+ :reader scanner-filename)
+ (line :initarg :line :initform 1 :type fixnum :reader scanner-line)
+ (column :initarg :column :initform 0 :type fixnum :reader scanner-column))
+ (:documentation
+ "An efficient rewindable scanner for character streams.
+
+ The scanner should be used via the parser protocol. The following notes
+ describe the class's slots and the invariants maintained by the class.
+
+ The scanner reads characters from STREAM. It reads in chunks,
+ `charbuf-size' characters at a time, into freshly allocated arrays. At
+ the beginning of time, BUF is nil; and SIZE is 0, indicating that a new
+ buffer needs to be read in; this anomalous situation is remedied during
+ instance initialization. At all times thereafter:
+
+ * If SIZE > 0 then BUF is a `charbuf' containing characters.
+
+ * (<= 0 INDEX SIZE charbuf-size).
+
+ When the current buffer is finished with, another one is fetched. If
+ we've rewound the scanner to a captured place, then there'll be a chain of
+ buffers starting at TAIL (which corresponds to the current buffer); and we
+ should use its NEXT buffer when we've finished this one.
+
+ If there is no next buffer then we should acquire a new one and fill it
+ from the input stream. If there is an outstanding captured place then we
+ must also create a buffer chain entry for this new buffer and link it onto
+ the chain. If there aren't outstanding captures then we don't need to
+ bother with any of that -- earlier places certainly can't be captured and
+ a capture of the current position can allocate its own buffer chain
+ entry.
+
+ Which leaves us with the need to determine whether there are outstanding
+ captures. We simply maintain a counter, and rely on the client releasing
+ captured places properly when he's finished. In practice, this is usually
+ done using the `peek' parser macro so there isn't a problem."))
+
+;;;--------------------------------------------------------------------------
+;;; Utilities.
+
+(defgeneric charbuf-scanner-fetch (scanner)
+ (:documentation
+ "Refill the scanner buffer.
+
+ This is an internal method, which is really only a method so that the
+ compiler will optimize slot references.
+
+ Replace the current buffer with the next one, either from the buffer chain
+ (if we're currently rewound) or with a new buffer from the stream."))
+
+(defmethod charbuf-scanner-fetch ((scanner charbuf-scanner))
+ (with-slots (stream buf size index tail captures) scanner
+ (loop
+ (acond
+
+ ;; If we've hit the end of the line, stop.
+ ((eq buf :eof)
+ (return nil))
+
+ ;; If there's another buffer, we should check it out.
+ ((and tail (charbuf-chain-link-next tail))
+ (setf tail it
+ buf (charbuf-chain-link-buf it)
+ size (charbuf-chain-link-size it)
+ index 0))
+
+ ;; No joy: try reading more stuff from the input stream.
+ (t
+ (let* ((new (make-charbuf))
+ (n (read-sequence new stream :start 0 :end charbuf-size)))
+
+ ;; If there's nothing coming in then store a magical marker.
+ (when (zerop n) (setf new :eof))
+
+ ;; If there's someone watching, link a new entry onto the chain.
+ ;; There must, under these circumstances, be a `tail'.
+ (if (plusp captures)
+ (let ((next (make-charbuf-chain-link :buf new :size n)))
+ (setf (charbuf-chain-link-next tail) next
+ tail next))
+ (setf tail nil))
+
+ ;; Store the new state.
+ (setf buf new
+ size n
+ index 0))))
+
+ ;; If there's stuff in the current buffer, we're done.
+ (when (< index size)
+ (return t)))))
+
+(export 'charbuf-scanner-map)
+(defgeneric charbuf-scanner-map (scanner func &optional fail)
+ (:documentation
+ "Read characters from the SCANNER's raw buffers.
+
+ This is intended to be an efficient and versatile interface for reading
+ characters from a scanner in bulk. The FUNC is invoked repeatedly with
+ three arguments: a simple string BUF and two nonnegative fixnums START and
+ END, indicating that the subsequence of BUF between START (inclusive) and
+ END (exclusive) should be processed. The FUNC returns two values: a
+ generalized boolean DONEP and a nonnegative fixnum USED. If DONEP is
+ false then USED is ignored: the function has consumed the entire buffer
+ and wishes to read more. If DONEP is true then the condition (<= START
+ USED END) must hold; the FUNC has consumed the buffer as far as USED
+ (exclusive) and has completed successfully; the values DONEP and `t' are
+ returned as the result of CHARBUF-SCANNER-MAP.
+
+ If end-of-file is encountered before FUNC completes successfully then FAIL
+ is called with no arguments, and CHARBUF-SCANNER-MAP returns whatever
+ FAIL returns.
+
+ Observe that, if FAIL returns a second value of nil, then
+ `charbuf-scanner-map' is usable as a parser expression."))
+
+(defmethod charbuf-scanner-map
+ ((scanner charbuf-scanner) func &optional fail)
+ (with-slots (buf index size) scanner
+ (flet ((offer (buf start end)
+
+ ;; Pass the buffer to the function, and see what it thought.
+ (multiple-value-bind (donep used) (funcall func buf start end)
+
+ ;; Update the position as far as the function read.
+ (with-slots (line column) scanner
+ (let ((l line) (c column) (limit (if donep used end)))
+ (do ((i start (1+ i)))
+ ((>= i limit))
+ (setf (values l c)
+ (update-position (char buf i) l c)))
+ (setf line l column c)))
+
+ ;; If the function is finished then update our state and
+ ;; return.
+ (when donep
+ (setf index used)
+ (when (>= index size)
+ (charbuf-scanner-fetch scanner))
+ (return-from charbuf-scanner-map (values donep t))))))
+
+ ;; If there's anything in the current buffer, offer it to the function.
+ (when (< index size)
+ (offer buf index size))
+
+ ;; Repeatedly fetch new buffers and offer them to the function.
+ ;; Because the buffers are fresh, we know that we must process them
+ ;; from the beginning. Note that `offer' will exit if FUNC has
+ ;; finished, so we don't need to worry about that.
+ (loop
+ (unless (charbuf-scanner-fetch scanner)
+ (return (if fail (funcall fail) (values nil nil))))
+ (offer buf 0 size)))))
+
+;;;--------------------------------------------------------------------------
+;;; Initialization.
+
+(defmethod shared-initialize :after
+ ((scanner charbuf-scanner) slot-names &key)
+
+ ;; Grab the filename from the underlying stream if we don't have a better
+ ;; guess.
+ (default-slot (scanner 'filename slot-names)
+ (with-slots (stream) scanner
+ (aif (stream-pathname stream) (namestring it) nil)))
+
+ ;; Get ready with the first character.
+ (charbuf-scanner-fetch scanner))
+
+;;;--------------------------------------------------------------------------
+;;; Scanner protocol implementation.
+
+(defmethod scanner-at-eof-p ((scanner charbuf-scanner))
+ (with-slots (buf) scanner
+ (eq buf :eof)))
+
+(defmethod scanner-current-char ((scanner charbuf-scanner))
+ (with-slots (buf index) scanner
+ (schar buf index)))
+
+(defmethod scanner-step ((scanner charbuf-scanner))
+ (with-slots (buf size index line column) scanner
+
+ ;; If there's a current character then update the position from it. When
+ ;; is there a current character? When the index is valid.
+ (when (< index size)
+ (setf (values line column)
+ (update-position (schar buf index) line column)))
+
+ ;; Now move the position on. If there's still a character left then we
+ ;; win; otherwise fetch another buffer.
+ (or (< (incf index) size)
+ (charbuf-scanner-fetch scanner))))
+
+(defmethod scanner-unread ((scanner charbuf-scanner) char)
+ (with-slots (buf index size unread tail line column) scanner
+ (cond
+
+ ;; First, let's rewind the buffer index. This isn't going to work if
+ ;; the index is already zero. (Note that this implies that INDEX is
+ ;; zero in the remaining cases.)
+ ((plusp index)
+ (decf index))
+
+ ;; Plan B. Maybe we've been here before, in which case we'll have left
+ ;; the appropriate state kicking about already. Note that, according
+ ;; to the `unread' rules, the character must be the same as last time,
+ ;; so we can just reuse the whole thing unchanged. Also, note that
+ ;; the NEXT field in UNREAD is not nil due to the way that we construct
+ ;; the link below.
+ ((and unread (eql (charbuf-chain-link-next unread) tail))
+ (setf tail unread size 1
+ buf (charbuf-chain-link-buf unread)))
+
+ ;; Nope, we've not been here, at least not recently. We'll concoct a
+ ;; new buffer and put the necessary stuff in it. Store it away for
+ ;; later so that repeated read/unread oscillations at this position
+ ;; don't end up consing enormous arrays too much.
+ (t
+ (let* ((next (or tail (make-charbuf-chain-link :buf buf :size size)))
+ (fake (make-charbuf))
+ (this (make-charbuf-chain-link :buf fake :size 1 :next next)))
+ (setf (schar fake 0) char buf fake size 1
+ tail this unread this))))
+
+ ;; That's that sorted; now we have to fiddle the position.
+ (setf (values line column) (backtrack-position char line column))))
+
+(defmethod scanner-capture-place ((scanner charbuf-scanner))
+ (with-slots (buf size index captures tail line column) scanner
+ (incf captures)
+ (unless tail
+ (setf tail (make-charbuf-chain-link :buf buf :size size)))
+ (make-charbuf-scanner-place :link tail :index index
+ :line line :column column)))
+
+(defmethod scanner-restore-place ((scanner charbuf-scanner) place)
+ (with-slots (buf size index tail line column) scanner
+ (let ((link (charbuf-scanner-place-link place)))
+ (setf buf (charbuf-chain-link-buf link)
+ size (charbuf-chain-link-size link)
+ index (charbuf-scanner-place-index place)
+ line (charbuf-scanner-place-line place)
+ column (charbuf-scanner-place-column place)
+ tail link))))
+
+(defmethod scanner-release-place ((scanner charbuf-scanner) place)
+ (with-slots (captures) scanner
+ (decf captures)))
+
+(defstruct (charbuf-slice
+ (:constructor make-charbuf-slice
+ (buf &optional (start 0) %end
+ &aux (end (or %end (length buf))))))
+ (buf nil :type (or charbuf (eql :eof)) :read-only t)
+ (start 0 :type (and fixnum unsigned-byte) :read-only t)
+ (end 0 :type (and fixnum unsigned-byte) :read-only t))
+
+(declaim (inline charbuf-slice-length))
+(defun charbuf-slice-length (slice)
+ (- (charbuf-slice-end slice) (charbuf-slice-start slice)))
+
+(defun concatenate-charbuf-slices (slices)
+ (let* ((len (reduce #'+ slices
+ :key #'charbuf-slice-length
+ :initial-value 0))
+ (string (make-array len :element-type 'character))
+ (i 0))
+ (dolist (slice slices)
+ (let ((buf (charbuf-slice-buf slice))
+ (end (charbuf-slice-end slice)))
+ (do ((j (charbuf-slice-start slice) (1+ j)))
+ ((>= j end))
+ (setf (schar string i) (schar buf j))
+ (incf i))))
+ string))
+
+(defmethod scanner-interval
+ ((scanner charbuf-scanner) place-a &optional place-b)
+ (let* ((slices nil)
+ (place-b (or place-b
+ (with-slots (index tail) scanner
+ (make-charbuf-scanner-place :link tail
+ :index index))))
+ (last-link (charbuf-scanner-place-link place-b)))
+ (flet ((bad ()
+ (error "Incorrect places ~S and ~S to SCANNER-INTERVAL."
+ place-a place-b)))
+ (do ((link (charbuf-scanner-place-link place-a)
+ (charbuf-chain-link-next link))
+ (start (charbuf-scanner-place-index place-a) 0))
+ ((eq link last-link)
+ (let ((end (charbuf-scanner-place-index place-b)))
+ (when (< end start)
+ (bad))
+ (push (make-charbuf-slice (charbuf-chain-link-buf link)
+ start end)
+ slices)
+ (concatenate-charbuf-slices (nreverse slices))))
+ (when (null link) (bad))
+ (push (make-charbuf-slice (charbuf-chain-link-buf link)
+ start
+ (charbuf-chain-link-size link))
+ slices)))))
+
+;;;--------------------------------------------------------------------------
+;;; Specialized streams.
+
+(export 'charbuf-scanner-stream)
+(defclass charbuf-scanner-stream (character-scanner-stream)
+ ((scanner :initarg :scanner :type charbuf-scanner)))
+
+(defmethod make-scanner-stream ((scanner charbuf-scanner))
+ (make-instance 'charbuf-scanner-stream :scanner scanner))
+
+(defmethod stream-read-sequence
+ ((stream charbuf-scanner-stream) (seq string) &optional (start 0) end)
+ (with-slots (scanner) stream
+ (unless end (setf end (length seq)))
+ (let ((i start) (n (- end start)))
+ (labels ((copy (i buf start end)
+ (do ((j i (1+ j))
+ (k start (1+ k)))
+ ((>= k end))
+ (setf (char seq j) (schar buf k))))
+ (snarf (buf start end)
+ (let ((m (- end start)))
+ (cond ((< m n)
+ (copy i buf start end) (decf n m) (incf i m)
+ (values nil 0))
+ (t
+ (copy i buf start (+ start n)) (incf i n)
+ (values t n))))))
+ (charbuf-scanner-map scanner #'snarf)
+ i))))
+
+(defmethod stream-read-line ((stream charbuf-scanner-stream))
+ (with-slots (scanner) stream
+ (let ((slices nil))
+ (flet ((snarf (buf start end)
+ (let ((pos (position #\newline buf :start start :end end)))
+ (push (make-charbuf-slice buf start (or pos end)) slices)
+ (if pos
+ (values (concatenate-charbuf-slices (nreverse slices))
+ (1+ pos))
+ (values nil 0))))
+ (fail ()
+ (values (concatenate-charbuf-slices (nreverse slices)) t)))
+ (charbuf-scanner-map scanner #'snarf #'fail)))))
+
+;;;----- That's all, folks --------------------------------------------------