| 1 | ;;; -*-lisp-*- |
| 2 | ;;; |
| 3 | ;;; Efficient buffering character scanner |
| 4 | ;;; |
| 5 | ;;; (c) 2009 Straylight/Edgeware |
| 6 | ;;; |
| 7 | |
| 8 | ;;;----- Licensing notice --------------------------------------------------- |
| 9 | ;;; |
| 10 | ;;; This file is part of the Sensble Object Design, an object system for C. |
| 11 | ;;; |
| 12 | ;;; SOD is free software; you can redistribute it and/or modify |
| 13 | ;;; it under the terms of the GNU General Public License as published by |
| 14 | ;;; the Free Software Foundation; either version 2 of the License, or |
| 15 | ;;; (at your option) any later version. |
| 16 | ;;; |
| 17 | ;;; SOD is distributed in the hope that it will be useful, |
| 18 | ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | ;;; GNU General Public License for more details. |
| 21 | ;;; |
| 22 | ;;; You should have received a copy of the GNU General Public License |
| 23 | ;;; along with SOD; if not, write to the Free Software Foundation, |
| 24 | ;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| 25 | |
| 26 | (cl:in-package #:sod-parser) |
| 27 | |
| 28 | ;;;-------------------------------------------------------------------------- |
| 29 | ;;; Infrastructure types. |
| 30 | |
| 31 | (eval-when (:compile-toplevel :load-toplevel :execute) |
| 32 | (defconstant charbuf-size 4096 |
| 33 | "Number of characters in a character buffer.")) |
| 34 | |
| 35 | (deftype charbuf () |
| 36 | "Type of character buffers." |
| 37 | `(simple-string ,charbuf-size)) |
| 38 | |
| 39 | (deftype charbuf-index () |
| 40 | "Type of indices into character buffers." |
| 41 | `(integer 0 ,charbuf-size)) |
| 42 | |
| 43 | (declaim (inline make-charbuf)) |
| 44 | (defun make-charbuf () |
| 45 | "Return a fresh uninitialized character buffer." |
| 46 | (make-array charbuf-size :element-type 'character)) |
| 47 | |
| 48 | (defstruct charbuf-chain-link |
| 49 | "A link in the charbuf scanner's buffer chain. |
| 50 | |
| 51 | Usually the scanner doesn't bother maintaining a buffer chain; but if |
| 52 | we've rewound to a captured place then we need to be able to retrace our |
| 53 | steps on to later buffers. |
| 54 | |
| 55 | It turns out to be easier to have an explicit link to the next structure |
| 56 | in the chain than to maintain a spine of cons cells, so we do that; the |
| 57 | only other things we need are the buffer itself and its length, which |
| 58 | might be shorter than `charbuf-size', e.g., if we hit end-of-file." |
| 59 | (next nil :type (or charbuf-chain-link null)) |
| 60 | (buf nil :type (or charbuf (member nil :eof)) :read-only t) |
| 61 | (size 0 :type charbuf-index :read-only t)) |
| 62 | |
| 63 | (export 'charbuf-scanner-place-p) |
| 64 | (defstruct charbuf-scanner-place |
| 65 | "A captured place we can return to later. |
| 66 | |
| 67 | We remember the buffer-chain link, so that we can retrace our steps up to |
| 68 | the present. We also need the index at which we continue reading |
| 69 | characters; and the line and column numbers to resume from." |
| 70 | (scanner nil :type charbuf-scanner :read-only t) |
| 71 | (link nil :type charbuf-chain-link :read-only t) |
| 72 | (index 0 :type charbuf-index :read-only t) |
| 73 | (line 0 :type fixnum :read-only t) |
| 74 | (column 0 :type fixnum :read-only t)) |
| 75 | |
| 76 | (defmethod file-location ((place charbuf-scanner-place)) |
| 77 | (make-file-location (scanner-filename |
| 78 | (charbuf-scanner-place-scanner place)) |
| 79 | (charbuf-scanner-place-line place) |
| 80 | (charbuf-scanner-place-column place))) |
| 81 | |
| 82 | ;;;-------------------------------------------------------------------------- |
| 83 | ;;; Main class. |
| 84 | |
| 85 | (export 'charbuf-scanner) |
| 86 | (defclass charbuf-scanner (character-scanner) |
| 87 | ((stream :initarg :stream :type stream) |
| 88 | (buf :initform nil :type (or charbuf (member nil :eof))) |
| 89 | (size :initform 0 :type (integer 0 #.charbuf-size)) |
| 90 | (index :initform 0 :type (integer 0 #.charbuf-size)) |
| 91 | (captures :initform 0 :type (and fixnum unsigned-byte)) |
| 92 | (tail :initform nil :type (or charbuf-chain-link null)) |
| 93 | (unread :initform nil :type (or charbuf-chain-link nil)) |
| 94 | (filename :initarg :filename :type (or string null) |
| 95 | :reader scanner-filename) |
| 96 | (line :initarg :line :initform 1 :type fixnum :reader scanner-line) |
| 97 | (column :initarg :column :initform 0 :type fixnum :reader scanner-column)) |
| 98 | (:documentation |
| 99 | "An efficient rewindable scanner for character streams. |
| 100 | |
| 101 | The scanner should be used via the parser protocol. The following notes |
| 102 | describe the class's slots and the invariants maintained by the class. |
| 103 | |
| 104 | The scanner reads characters from STREAM. It reads in chunks, |
| 105 | `charbuf-size' characters at a time, into freshly allocated arrays. At |
| 106 | the beginning of time, BUF is nil; and SIZE is 0, indicating that a new |
| 107 | buffer needs to be read in; this anomalous situation is remedied during |
| 108 | instance initialization. At all times thereafter: |
| 109 | |
| 110 | * If SIZE > 0 then BUF is a `charbuf' containing characters. |
| 111 | |
| 112 | * (<= 0 INDEX SIZE charbuf-size). |
| 113 | |
| 114 | When the current buffer is finished with, another one is fetched. If |
| 115 | we've rewound the scanner to a captured place, then there'll be a chain of |
| 116 | buffers starting at TAIL (which corresponds to the current buffer); and we |
| 117 | should use its NEXT buffer when we've finished this one. |
| 118 | |
| 119 | If there is no next buffer then we should acquire a new one and fill it |
| 120 | from the input stream. If there is an outstanding captured place then we |
| 121 | must also create a buffer chain entry for this new buffer and link it onto |
| 122 | the chain. If there aren't outstanding captures then we don't need to |
| 123 | bother with any of that -- earlier places certainly can't be captured and |
| 124 | a capture of the current position can allocate its own buffer chain |
| 125 | entry. |
| 126 | |
| 127 | Which leaves us with the need to determine whether there are outstanding |
| 128 | captures. We simply maintain a counter, and rely on the client releasing |
| 129 | captured places properly when he's finished. In practice, this is usually |
| 130 | done using the `peek' parser macro so there isn't a problem.")) |
| 131 | |
| 132 | ;;;-------------------------------------------------------------------------- |
| 133 | ;;; Utilities. |
| 134 | |
| 135 | (defgeneric charbuf-scanner-fetch (scanner) |
| 136 | (:documentation |
| 137 | "Refill the scanner buffer. |
| 138 | |
| 139 | This is an internal method, which is really only a method so that the |
| 140 | compiler will optimize slot references. |
| 141 | |
| 142 | Replace the current buffer with the next one, either from the buffer chain |
| 143 | (if we're currently rewound) or with a new buffer from the stream.")) |
| 144 | |
| 145 | (defmethod charbuf-scanner-fetch ((scanner charbuf-scanner)) |
| 146 | (with-slots (stream buf size index tail captures) scanner |
| 147 | (loop |
| 148 | (acond |
| 149 | |
| 150 | ;; If we've hit the end of the line, stop. |
| 151 | ((eq buf :eof) |
| 152 | (return nil)) |
| 153 | |
| 154 | ;; If there's another buffer, we should check it out. |
| 155 | ((and tail (charbuf-chain-link-next tail)) |
| 156 | (setf tail it |
| 157 | buf (charbuf-chain-link-buf it) |
| 158 | size (charbuf-chain-link-size it) |
| 159 | index 0)) |
| 160 | |
| 161 | ;; No joy: try reading more stuff from the input stream. |
| 162 | (t |
| 163 | (let* ((new (make-charbuf)) |
| 164 | (n (read-sequence new stream :start 0 :end charbuf-size))) |
| 165 | |
| 166 | ;; If there's nothing coming in then store a magical marker. |
| 167 | (when (zerop n) (setf new :eof)) |
| 168 | |
| 169 | ;; If there's someone watching, link a new entry onto the chain. |
| 170 | ;; There must, under these circumstances, be a `tail'. |
| 171 | (if (plusp captures) |
| 172 | (let ((next (make-charbuf-chain-link :buf new :size n))) |
| 173 | (setf (charbuf-chain-link-next tail) next |
| 174 | tail next)) |
| 175 | (setf tail nil)) |
| 176 | |
| 177 | ;; Store the new state. |
| 178 | (setf buf new |
| 179 | size n |
| 180 | index 0)))) |
| 181 | |
| 182 | ;; If there's stuff in the current buffer, we're done. |
| 183 | (when (< index size) |
| 184 | (return t))))) |
| 185 | |
| 186 | (export 'charbuf-scanner-map) |
| 187 | (defgeneric charbuf-scanner-map (scanner func &optional fail) |
| 188 | (:documentation |
| 189 | "Read characters from the SCANNER's raw buffers. |
| 190 | |
| 191 | This is intended to be an efficient and versatile interface for reading |
| 192 | characters from a scanner in bulk. The FUNC is invoked repeatedly with |
| 193 | three arguments: a simple string BUF and two nonnegative fixnums START and |
| 194 | END, indicating that the subsequence of BUF between START (inclusive) and |
| 195 | END (exclusive) should be processed. The FUNC returns two values: a |
| 196 | generalized boolean DONEP and a nonnegative fixnum USED. If DONEP is |
| 197 | false then USED is ignored: the function has consumed the entire buffer |
| 198 | and wishes to read more. If DONEP is true then the condition (<= START |
| 199 | USED END) must hold; the FUNC has consumed the buffer as far as USED |
| 200 | (exclusive) and has completed successfully; the values DONEP and `t' are |
| 201 | returned as the result of `charbuf-scanner-map'. |
| 202 | |
| 203 | If end-of-file is encountered before FUNC completes successfully then FAIL |
| 204 | is called with no arguments, and `charbuf-scanner-map' returns whatever |
| 205 | FAIL returns. |
| 206 | |
| 207 | Observe that, if FAIL returns a second value of nil, then |
| 208 | `charbuf-scanner-map' is usable as a parser expression.")) |
| 209 | |
| 210 | (defmethod charbuf-scanner-map |
| 211 | ((scanner charbuf-scanner) func &optional fail) |
| 212 | (with-slots (buf index size) scanner |
| 213 | (flet ((offer (buf start end) |
| 214 | |
| 215 | ;; Pass the buffer to the function, and see what it thought. |
| 216 | (multiple-value-bind (donep used) (funcall func buf start end) |
| 217 | |
| 218 | ;; Update the position as far as the function read. |
| 219 | (with-slots (line column) scanner |
| 220 | (let ((l line) (c column) (limit (if donep used end))) |
| 221 | (do ((i start (1+ i))) |
| 222 | ((>= i limit)) |
| 223 | (setf (values l c) |
| 224 | (update-position (char buf i) l c))) |
| 225 | (setf line l column c))) |
| 226 | |
| 227 | ;; If the function is finished then update our state and |
| 228 | ;; return. |
| 229 | (when donep |
| 230 | (setf index used) |
| 231 | (when (>= index size) |
| 232 | (charbuf-scanner-fetch scanner)) |
| 233 | (return-from charbuf-scanner-map (values donep t)))))) |
| 234 | |
| 235 | ;; If there's anything in the current buffer, offer it to the function. |
| 236 | (when (< index size) |
| 237 | (offer buf index size)) |
| 238 | |
| 239 | ;; Repeatedly fetch new buffers and offer them to the function. |
| 240 | ;; Because the buffers are fresh, we know that we must process them |
| 241 | ;; from the beginning. Note that `offer' will exit if FUNC has |
| 242 | ;; finished, so we don't need to worry about that. |
| 243 | (loop |
| 244 | (unless (charbuf-scanner-fetch scanner) |
| 245 | (return (if fail (funcall fail) (values nil nil)))) |
| 246 | (offer buf 0 size))))) |
| 247 | |
| 248 | ;;;-------------------------------------------------------------------------- |
| 249 | ;;; Initialization. |
| 250 | |
| 251 | (defmethod shared-initialize :after |
| 252 | ((scanner charbuf-scanner) slot-names &key) |
| 253 | |
| 254 | ;; Grab the filename from the underlying stream if we don't have a better |
| 255 | ;; guess. |
| 256 | (default-slot (scanner 'filename slot-names) |
| 257 | (with-slots (stream) scanner |
| 258 | (aif (stream-pathname stream) (namestring it) nil))) |
| 259 | |
| 260 | ;; Get ready with the first character. |
| 261 | (charbuf-scanner-fetch scanner)) |
| 262 | |
| 263 | ;;;-------------------------------------------------------------------------- |
| 264 | ;;; Scanner protocol implementation. |
| 265 | |
| 266 | (defmethod scanner-at-eof-p ((scanner charbuf-scanner)) |
| 267 | (with-slots (buf) scanner |
| 268 | (eq buf :eof))) |
| 269 | |
| 270 | (defmethod scanner-current-char ((scanner charbuf-scanner)) |
| 271 | (with-slots (buf index) scanner |
| 272 | (schar buf index))) |
| 273 | |
| 274 | (defmethod scanner-step ((scanner charbuf-scanner)) |
| 275 | (with-slots (buf size index line column) scanner |
| 276 | |
| 277 | ;; If there's a current character then update the position from it. When |
| 278 | ;; is there a current character? When the index is valid. |
| 279 | (when (< index size) |
| 280 | (setf (values line column) |
| 281 | (update-position (schar buf index) line column))) |
| 282 | |
| 283 | ;; Now move the position on. If there's still a character left then we |
| 284 | ;; win; otherwise fetch another buffer. |
| 285 | (or (< (incf index) size) |
| 286 | (charbuf-scanner-fetch scanner)))) |
| 287 | |
| 288 | (defmethod scanner-unread ((scanner charbuf-scanner) char) |
| 289 | (with-slots (buf index size unread tail line column) scanner |
| 290 | (cond |
| 291 | |
| 292 | ;; First, let's rewind the buffer index. This isn't going to work if |
| 293 | ;; the index is already zero. (Note that this implies that INDEX is |
| 294 | ;; zero in the remaining cases.) |
| 295 | ((plusp index) |
| 296 | (decf index)) |
| 297 | |
| 298 | ;; Plan B. Maybe we've been here before, in which case we'll have left |
| 299 | ;; the appropriate state kicking about already. Note that, according |
| 300 | ;; to the `unread' rules, the character must be the same as last time, |
| 301 | ;; so we can just reuse the whole thing unchanged. Also, note that |
| 302 | ;; the NEXT field in UNREAD is not nil due to the way that we construct |
| 303 | ;; the link below. |
| 304 | ((and unread (eql (charbuf-chain-link-next unread) tail)) |
| 305 | (setf tail unread size 1 |
| 306 | buf (charbuf-chain-link-buf unread))) |
| 307 | |
| 308 | ;; Nope, we've not been here, at least not recently. We'll concoct a |
| 309 | ;; new buffer and put the necessary stuff in it. Store it away for |
| 310 | ;; later so that repeated read/unread oscillations at this position |
| 311 | ;; don't end up consing enormous arrays too much. |
| 312 | (t |
| 313 | (let* ((next (or tail (make-charbuf-chain-link :buf buf :size size))) |
| 314 | (fake (make-charbuf)) |
| 315 | (this (make-charbuf-chain-link :buf fake :size 1 :next next))) |
| 316 | (setf (schar fake 0) char buf fake size 1 |
| 317 | tail this unread this)))) |
| 318 | |
| 319 | ;; That's that sorted; now we have to fiddle the position. |
| 320 | (setf (values line column) (backtrack-position char line column)))) |
| 321 | |
| 322 | (defmethod scanner-capture-place ((scanner charbuf-scanner)) |
| 323 | (with-slots (buf size index captures tail line column) scanner |
| 324 | (incf captures) |
| 325 | (unless tail |
| 326 | (setf tail (make-charbuf-chain-link :buf buf :size size))) |
| 327 | (make-charbuf-scanner-place :scanner scanner :link tail :index index |
| 328 | :line line :column column))) |
| 329 | |
| 330 | (defmethod scanner-restore-place ((scanner charbuf-scanner) place) |
| 331 | (with-slots (buf size index tail line column) scanner |
| 332 | (let ((link (charbuf-scanner-place-link place))) |
| 333 | (setf buf (charbuf-chain-link-buf link) |
| 334 | size (charbuf-chain-link-size link) |
| 335 | index (charbuf-scanner-place-index place) |
| 336 | line (charbuf-scanner-place-line place) |
| 337 | column (charbuf-scanner-place-column place) |
| 338 | tail link)))) |
| 339 | |
| 340 | (defmethod scanner-release-place ((scanner charbuf-scanner) place) |
| 341 | (declare (ignore place)) |
| 342 | (with-slots (captures) scanner |
| 343 | (decf captures))) |
| 344 | |
| 345 | (defstruct (charbuf-slice |
| 346 | (:constructor make-charbuf-slice |
| 347 | (buf &optional (start 0) %end |
| 348 | &aux (end (or %end (length buf)))))) |
| 349 | (buf nil :type (or charbuf (eql :eof)) :read-only t) |
| 350 | (start 0 :type (and fixnum unsigned-byte) :read-only t) |
| 351 | (end 0 :type (and fixnum unsigned-byte) :read-only t)) |
| 352 | |
| 353 | (declaim (inline charbuf-slice-length)) |
| 354 | (defun charbuf-slice-length (slice) |
| 355 | (- (charbuf-slice-end slice) (charbuf-slice-start slice))) |
| 356 | |
| 357 | (defun concatenate-charbuf-slices (slices) |
| 358 | (let* ((len (reduce #'+ slices |
| 359 | :key #'charbuf-slice-length |
| 360 | :initial-value 0)) |
| 361 | (string (make-array len :element-type 'character)) |
| 362 | (i 0)) |
| 363 | (dolist (slice slices) |
| 364 | (let ((buf (charbuf-slice-buf slice)) |
| 365 | (end (charbuf-slice-end slice))) |
| 366 | (do ((j (charbuf-slice-start slice) (1+ j))) |
| 367 | ((>= j end)) |
| 368 | (setf (schar string i) (schar buf j)) |
| 369 | (incf i)))) |
| 370 | string)) |
| 371 | |
| 372 | (defmethod scanner-interval |
| 373 | ((scanner charbuf-scanner) place-a &optional place-b) |
| 374 | (let* ((slices nil) |
| 375 | (place-b (or place-b |
| 376 | (with-slots (index tail) scanner |
| 377 | (make-charbuf-scanner-place :scanner scanner |
| 378 | :link tail |
| 379 | :index index)))) |
| 380 | (last-link (charbuf-scanner-place-link place-b))) |
| 381 | (flet ((bad () |
| 382 | (error "Incorrect places ~S and ~S to `scanner-interval'." |
| 383 | place-a place-b))) |
| 384 | (do ((link (charbuf-scanner-place-link place-a) |
| 385 | (charbuf-chain-link-next link)) |
| 386 | (start (charbuf-scanner-place-index place-a) 0)) |
| 387 | ((eq link last-link) |
| 388 | (let ((end (charbuf-scanner-place-index place-b))) |
| 389 | (when (< end start) |
| 390 | (bad)) |
| 391 | (push (make-charbuf-slice (charbuf-chain-link-buf link) |
| 392 | start end) |
| 393 | slices) |
| 394 | (concatenate-charbuf-slices (nreverse slices)))) |
| 395 | (when (null link) (bad)) |
| 396 | (push (make-charbuf-slice (charbuf-chain-link-buf link) |
| 397 | start |
| 398 | (charbuf-chain-link-size link)) |
| 399 | slices))))) |
| 400 | |
| 401 | ;;;-------------------------------------------------------------------------- |
| 402 | ;;; Specialized streams. |
| 403 | |
| 404 | (export 'charbuf-scanner-stream) |
| 405 | (defclass charbuf-scanner-stream (character-scanner-stream) |
| 406 | ((scanner :initarg :scanner :type charbuf-scanner))) |
| 407 | |
| 408 | (defmethod make-scanner-stream ((scanner charbuf-scanner)) |
| 409 | (make-instance 'charbuf-scanner-stream :scanner scanner)) |
| 410 | |
| 411 | (defmethod stream-read-sequence |
| 412 | ((stream charbuf-scanner-stream) (seq string) |
| 413 | #+clisp &key #-clisp &optional (start 0) end) |
| 414 | (with-slots (scanner) stream |
| 415 | (unless end (setf end (length seq))) |
| 416 | (let ((i start) (n (- end start))) |
| 417 | (labels ((copy (i buf start end) |
| 418 | (do ((j i (1+ j)) |
| 419 | (k start (1+ k))) |
| 420 | ((>= k end)) |
| 421 | (setf (char seq j) (schar buf k)))) |
| 422 | (snarf (buf start end) |
| 423 | (let ((m (- end start))) |
| 424 | (cond ((< m n) |
| 425 | (copy i buf start end) (decf n m) (incf i m) |
| 426 | (values nil 0)) |
| 427 | (t |
| 428 | (copy i buf start (+ start n)) (incf i n) |
| 429 | (values t n)))))) |
| 430 | (charbuf-scanner-map scanner #'snarf) |
| 431 | i)))) |
| 432 | |
| 433 | (defmethod stream-read-line ((stream charbuf-scanner-stream)) |
| 434 | (with-slots (scanner) stream |
| 435 | (let ((slices nil)) |
| 436 | (flet ((snarf (buf start end) |
| 437 | (let ((pos (position #\newline buf :start start :end end))) |
| 438 | (push (make-charbuf-slice buf start (or pos end)) slices) |
| 439 | (if pos |
| 440 | (values (concatenate-charbuf-slices (nreverse slices)) |
| 441 | (1+ pos)) |
| 442 | (values nil 0)))) |
| 443 | (fail () |
| 444 | (values (concatenate-charbuf-slices (nreverse slices)) t))) |
| 445 | (charbuf-scanner-map scanner #'snarf #'fail))))) |
| 446 | |
| 447 | ;;;----- That's all, folks -------------------------------------------------- |