Commit | Line | Data |
---|---|---|
dea4d055 MW |
1 | ;;; -*-lisp-*- |
2 | ;;; | |
3 | ;;; Efficient buffering character scanner | |
4 | ;;; | |
5 | ;;; (c) 2009 Straylight/Edgeware | |
6 | ;;; | |
7 | ||
8 | ;;;----- Licensing notice --------------------------------------------------- | |
9 | ;;; | |
10 | ;;; This file is part of the Sensble Object Design, an object system for C. | |
11 | ;;; | |
12 | ;;; SOD is free software; you can redistribute it and/or modify | |
13 | ;;; it under the terms of the GNU General Public License as published by | |
14 | ;;; the Free Software Foundation; either version 2 of the License, or | |
15 | ;;; (at your option) any later version. | |
16 | ;;; | |
17 | ;;; SOD is distributed in the hope that it will be useful, | |
18 | ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ;;; GNU General Public License for more details. | |
21 | ;;; | |
22 | ;;; You should have received a copy of the GNU General Public License | |
23 | ;;; along with SOD; if not, write to the Free Software Foundation, | |
24 | ;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
25 | ||
26 | (cl:in-package #:sod-parser) | |
27 | ||
28 | ;;;-------------------------------------------------------------------------- | |
29 | ;;; Infrastructure types. | |
30 | ||
1d8cc67a MW |
31 | (eval-when (:compile-toplevel :load-toplevel :execute) |
32 | (defconstant charbuf-size 4096 | |
33 | "Number of characters in a character buffer.")) | |
dea4d055 MW |
34 | |
35 | (deftype charbuf () | |
36 | "Type of character buffers." | |
37 | `(simple-string ,charbuf-size)) | |
38 | ||
39 | (deftype charbuf-index () | |
40 | "Type of indices into character buffers." | |
41 | `(integer 0 ,charbuf-size)) | |
42 | ||
43 | (declaim (inline make-charbuf)) | |
44 | (defun make-charbuf () | |
45 | "Return a fresh uninitialized character buffer." | |
46 | (make-array charbuf-size :element-type 'character)) | |
47 | ||
48 | (defstruct charbuf-chain-link | |
49 | "A link in the charbuf scanner's buffer chain. | |
50 | ||
51 | Usually the scanner doesn't bother maintaining a buffer chain; but if | |
52 | we've rewound to a captured place then we need to be able to retrace our | |
53 | steps on to later buffers. | |
54 | ||
55 | It turns out to be easier to have an explicit link to the next structure | |
56 | in the chain than to maintain a spine of cons cells, so we do that; the | |
57 | only other things we need are the buffer itself and its length, which | |
58 | might be shorter than `charbuf-size', e.g., if we hit end-of-file." | |
59 | (next nil :type (or charbuf-chain-link null)) | |
60 | (buf nil :type (or charbuf (member nil :eof)) :read-only t) | |
61 | (size 0 :type charbuf-index :read-only t)) | |
62 | ||
63 | (export 'charbuf-scanner-place-p) | |
64 | (defstruct charbuf-scanner-place | |
65 | "A captured place we can return to later. | |
66 | ||
67 | We remember the buffer-chain link, so that we can retrace our steps up to | |
68 | the present. We also need the index at which we continue reading | |
69 | characters; and the line and column numbers to resume from." | |
70 | (link nil :type charbuf-chain-link :read-only t) | |
71 | (index 0 :type charbuf-index :read-only t) | |
72 | (line 0 :type fixnum :read-only t) | |
73 | (column 0 :type fixnum :read-only t)) | |
74 | ||
75 | ;;;-------------------------------------------------------------------------- | |
76 | ;;; Main class. | |
77 | ||
78 | (export 'charbuf-scanner) | |
79 | (defclass charbuf-scanner (character-scanner) | |
80 | ((stream :initarg :stream :type stream) | |
81 | (buf :initform nil :type (or charbuf (member nil :eof))) | |
82 | (size :initform 0 :type (integer 0 #.charbuf-size)) | |
83 | (index :initform 0 :type (integer 0 #.charbuf-size)) | |
84 | (captures :initform 0 :type (and fixnum unsigned-byte)) | |
85 | (tail :initform nil :type (or charbuf-chain-link null)) | |
86 | (unread :initform nil :type (or charbuf-chain-link nil)) | |
87 | (filename :initarg :filename :type (or string null) | |
88 | :reader scanner-filename) | |
bf090e02 | 89 | (line :initarg :line :initform 1 :type fixnum :reader scanner-line) |
dea4d055 MW |
90 | (column :initarg :column :initform 0 :type fixnum :reader scanner-column)) |
91 | (:documentation | |
92 | "An efficient rewindable scanner for character streams. | |
93 | ||
94 | The scanner should be used via the parser protocol. The following notes | |
95 | describe the class's slots and the invariants maintained by the class. | |
96 | ||
97 | The scanner reads characters from STREAM. It reads in chunks, | |
98 | `charbuf-size' characters at a time, into freshly allocated arrays. At | |
99 | the beginning of time, BUF is nil; and SIZE is 0, indicating that a new | |
100 | buffer needs to be read in; this anomalous situation is remedied during | |
101 | instance initialization. At all times thereafter: | |
102 | ||
103 | * If SIZE > 0 then BUF is a `charbuf' containing characters. | |
104 | ||
105 | * (<= 0 INDEX SIZE charbuf-size). | |
106 | ||
107 | When the current buffer is finished with, another one is fetched. If | |
108 | we've rewound the scanner to a captured place, then there'll be a chain of | |
109 | buffers starting at TAIL (which corresponds to the current buffer); and we | |
110 | should use its NEXT buffer when we've finished this one. | |
111 | ||
112 | If there is no next buffer then we should acquire a new one and fill it | |
113 | from the input stream. If there is an outstanding captured place then we | |
114 | must also create a buffer chain entry for this new buffer and link it onto | |
115 | the chain. If there aren't outstanding captures then we don't need to | |
116 | bother with any of that -- earlier places certainly can't be captured and | |
117 | a capture of the current position can allocate its own buffer chain | |
118 | entry. | |
119 | ||
120 | Which leaves us with the need to determine whether there are outstanding | |
121 | captures. We simply maintain a counter, and rely on the client releasing | |
122 | captured places properly when he's finished. In practice, this is usually | |
123 | done using the `peek' parser macro so there isn't a problem.")) | |
124 | ||
125 | ;;;-------------------------------------------------------------------------- | |
126 | ;;; Utilities. | |
127 | ||
128 | (defgeneric charbuf-scanner-fetch (scanner) | |
129 | (:documentation | |
130 | "Refill the scanner buffer. | |
131 | ||
132 | This is an internal method, which is really only a method so that the | |
133 | compiler will optimize slot references. | |
134 | ||
135 | Replace the current buffer with the next one, either from the buffer chain | |
136 | (if we're currently rewound) or with a new buffer from the stream.")) | |
137 | ||
138 | (defmethod charbuf-scanner-fetch ((scanner charbuf-scanner)) | |
139 | (with-slots (stream buf size index tail captures) scanner | |
140 | (loop | |
141 | (acond | |
142 | ||
143 | ;; If we've hit the end of the line, stop. | |
144 | ((eq buf :eof) | |
145 | (return nil)) | |
146 | ||
147 | ;; If there's another buffer, we should check it out. | |
148 | ((and tail (charbuf-chain-link-next tail)) | |
149 | (setf tail it | |
150 | buf (charbuf-chain-link-buf it) | |
151 | size (charbuf-chain-link-size it) | |
152 | index 0)) | |
153 | ||
154 | ;; No joy: try reading more stuff from the input stream. | |
155 | (t | |
156 | (let* ((new (make-charbuf)) | |
157 | (n (read-sequence new stream :start 0 :end charbuf-size))) | |
158 | ||
159 | ;; If there's nothing coming in then store a magical marker. | |
160 | (when (zerop n) (setf new :eof)) | |
161 | ||
162 | ;; If there's someone watching, link a new entry onto the chain. | |
163 | ;; There must, under these circumstances, be a `tail'. | |
164 | (if (plusp captures) | |
165 | (let ((next (make-charbuf-chain-link :buf new :size n))) | |
166 | (setf (charbuf-chain-link-next tail) next | |
167 | tail next)) | |
168 | (setf tail nil)) | |
169 | ||
170 | ;; Store the new state. | |
171 | (setf buf new | |
172 | size n | |
173 | index 0)))) | |
174 | ||
175 | ;; If there's stuff in the current buffer, we're done. | |
176 | (when (< index size) | |
177 | (return t))))) | |
178 | ||
179 | (export 'charbuf-scanner-map) | |
180 | (defgeneric charbuf-scanner-map (scanner func &optional fail) | |
181 | (:documentation | |
182 | "Read characters from the SCANNER's raw buffers. | |
183 | ||
184 | This is intended to be an efficient and versatile interface for reading | |
185 | characters from a scanner in bulk. The FUNC is invoked repeatedly with | |
186 | three arguments: a simple string BUF and two nonnegative fixnums START and | |
187 | END, indicating that the subsequence of BUF between START (inclusive) and | |
188 | END (exclusive) should be processed. The FUNC returns two values: a | |
189 | generalized boolean DONEP and a nonnegative fixnum USED. If DONEP is | |
190 | false then USED is ignored: the function has consumed the entire buffer | |
191 | and wishes to read more. If DONEP is true then the condition (<= START | |
192 | USED END) must hold; the FUNC has consumed the buffer as far as USED | |
193 | (exclusive) and has completed successfully; the values DONEP and `t' are | |
3109662a | 194 | returned as the result of `charbuf-scanner-map'. |
dea4d055 MW |
195 | |
196 | If end-of-file is encountered before FUNC completes successfully then FAIL | |
3109662a | 197 | is called with no arguments, and `charbuf-scanner-map' returns whatever |
dea4d055 MW |
198 | FAIL returns. |
199 | ||
200 | Observe that, if FAIL returns a second value of nil, then | |
201 | `charbuf-scanner-map' is usable as a parser expression.")) | |
202 | ||
203 | (defmethod charbuf-scanner-map | |
204 | ((scanner charbuf-scanner) func &optional fail) | |
205 | (with-slots (buf index size) scanner | |
206 | (flet ((offer (buf start end) | |
207 | ||
208 | ;; Pass the buffer to the function, and see what it thought. | |
209 | (multiple-value-bind (donep used) (funcall func buf start end) | |
210 | ||
211 | ;; Update the position as far as the function read. | |
212 | (with-slots (line column) scanner | |
213 | (let ((l line) (c column) (limit (if donep used end))) | |
214 | (do ((i start (1+ i))) | |
215 | ((>= i limit)) | |
216 | (setf (values l c) | |
217 | (update-position (char buf i) l c))) | |
218 | (setf line l column c))) | |
219 | ||
220 | ;; If the function is finished then update our state and | |
221 | ;; return. | |
222 | (when donep | |
223 | (setf index used) | |
224 | (when (>= index size) | |
225 | (charbuf-scanner-fetch scanner)) | |
226 | (return-from charbuf-scanner-map (values donep t)))))) | |
227 | ||
228 | ;; If there's anything in the current buffer, offer it to the function. | |
229 | (when (< index size) | |
230 | (offer buf index size)) | |
231 | ||
232 | ;; Repeatedly fetch new buffers and offer them to the function. | |
233 | ;; Because the buffers are fresh, we know that we must process them | |
234 | ;; from the beginning. Note that `offer' will exit if FUNC has | |
235 | ;; finished, so we don't need to worry about that. | |
236 | (loop | |
237 | (unless (charbuf-scanner-fetch scanner) | |
238 | (return (if fail (funcall fail) (values nil nil)))) | |
239 | (offer buf 0 size))))) | |
240 | ||
241 | ;;;-------------------------------------------------------------------------- | |
242 | ;;; Initialization. | |
243 | ||
244 | (defmethod shared-initialize :after | |
245 | ((scanner charbuf-scanner) slot-names &key) | |
246 | ||
247 | ;; Grab the filename from the underlying stream if we don't have a better | |
248 | ;; guess. | |
249 | (default-slot (scanner 'filename slot-names) | |
250 | (with-slots (stream) scanner | |
251 | (aif (stream-pathname stream) (namestring it) nil))) | |
252 | ||
253 | ;; Get ready with the first character. | |
254 | (charbuf-scanner-fetch scanner)) | |
255 | ||
256 | ;;;-------------------------------------------------------------------------- | |
257 | ;;; Scanner protocol implementation. | |
258 | ||
259 | (defmethod scanner-at-eof-p ((scanner charbuf-scanner)) | |
260 | (with-slots (buf) scanner | |
261 | (eq buf :eof))) | |
262 | ||
263 | (defmethod scanner-current-char ((scanner charbuf-scanner)) | |
264 | (with-slots (buf index) scanner | |
265 | (schar buf index))) | |
266 | ||
267 | (defmethod scanner-step ((scanner charbuf-scanner)) | |
268 | (with-slots (buf size index line column) scanner | |
269 | ||
270 | ;; If there's a current character then update the position from it. When | |
271 | ;; is there a current character? When the index is valid. | |
272 | (when (< index size) | |
273 | (setf (values line column) | |
274 | (update-position (schar buf index) line column))) | |
275 | ||
276 | ;; Now move the position on. If there's still a character left then we | |
277 | ;; win; otherwise fetch another buffer. | |
278 | (or (< (incf index) size) | |
279 | (charbuf-scanner-fetch scanner)))) | |
280 | ||
281 | (defmethod scanner-unread ((scanner charbuf-scanner) char) | |
282 | (with-slots (buf index size unread tail line column) scanner | |
283 | (cond | |
284 | ||
285 | ;; First, let's rewind the buffer index. This isn't going to work if | |
286 | ;; the index is already zero. (Note that this implies that INDEX is | |
287 | ;; zero in the remaining cases.) | |
288 | ((plusp index) | |
289 | (decf index)) | |
290 | ||
291 | ;; Plan B. Maybe we've been here before, in which case we'll have left | |
292 | ;; the appropriate state kicking about already. Note that, according | |
293 | ;; to the `unread' rules, the character must be the same as last time, | |
294 | ;; so we can just reuse the whole thing unchanged. Also, note that | |
295 | ;; the NEXT field in UNREAD is not nil due to the way that we construct | |
296 | ;; the link below. | |
297 | ((and unread (eql (charbuf-chain-link-next unread) tail)) | |
298 | (setf tail unread size 1 | |
299 | buf (charbuf-chain-link-buf unread))) | |
300 | ||
301 | ;; Nope, we've not been here, at least not recently. We'll concoct a | |
302 | ;; new buffer and put the necessary stuff in it. Store it away for | |
303 | ;; later so that repeated read/unread oscillations at this position | |
304 | ;; don't end up consing enormous arrays too much. | |
305 | (t | |
306 | (let* ((next (or tail (make-charbuf-chain-link :buf buf :size size))) | |
307 | (fake (make-charbuf)) | |
308 | (this (make-charbuf-chain-link :buf fake :size 1 :next next))) | |
309 | (setf (schar fake 0) char buf fake size 1 | |
310 | tail this unread this)))) | |
311 | ||
312 | ;; That's that sorted; now we have to fiddle the position. | |
313 | (setf (values line column) (backtrack-position char line column)))) | |
314 | ||
315 | (defmethod scanner-capture-place ((scanner charbuf-scanner)) | |
316 | (with-slots (buf size index captures tail line column) scanner | |
317 | (incf captures) | |
318 | (unless tail | |
319 | (setf tail (make-charbuf-chain-link :buf buf :size size))) | |
320 | (make-charbuf-scanner-place :link tail :index index | |
321 | :line line :column column))) | |
322 | ||
323 | (defmethod scanner-restore-place ((scanner charbuf-scanner) place) | |
324 | (with-slots (buf size index tail line column) scanner | |
325 | (let ((link (charbuf-scanner-place-link place))) | |
326 | (setf buf (charbuf-chain-link-buf link) | |
327 | size (charbuf-chain-link-size link) | |
328 | index (charbuf-scanner-place-index place) | |
329 | line (charbuf-scanner-place-line place) | |
330 | column (charbuf-scanner-place-column place) | |
331 | tail link)))) | |
332 | ||
333 | (defmethod scanner-release-place ((scanner charbuf-scanner) place) | |
1d8cc67a | 334 | (declare (ignore place)) |
dea4d055 MW |
335 | (with-slots (captures) scanner |
336 | (decf captures))) | |
337 | ||
338 | (defstruct (charbuf-slice | |
339 | (:constructor make-charbuf-slice | |
340 | (buf &optional (start 0) %end | |
341 | &aux (end (or %end (length buf)))))) | |
342 | (buf nil :type (or charbuf (eql :eof)) :read-only t) | |
343 | (start 0 :type (and fixnum unsigned-byte) :read-only t) | |
344 | (end 0 :type (and fixnum unsigned-byte) :read-only t)) | |
345 | ||
346 | (declaim (inline charbuf-slice-length)) | |
347 | (defun charbuf-slice-length (slice) | |
348 | (- (charbuf-slice-end slice) (charbuf-slice-start slice))) | |
349 | ||
350 | (defun concatenate-charbuf-slices (slices) | |
351 | (let* ((len (reduce #'+ slices | |
352 | :key #'charbuf-slice-length | |
353 | :initial-value 0)) | |
354 | (string (make-array len :element-type 'character)) | |
355 | (i 0)) | |
356 | (dolist (slice slices) | |
357 | (let ((buf (charbuf-slice-buf slice)) | |
358 | (end (charbuf-slice-end slice))) | |
359 | (do ((j (charbuf-slice-start slice) (1+ j))) | |
360 | ((>= j end)) | |
361 | (setf (schar string i) (schar buf j)) | |
362 | (incf i)))) | |
363 | string)) | |
364 | ||
365 | (defmethod scanner-interval | |
366 | ((scanner charbuf-scanner) place-a &optional place-b) | |
367 | (let* ((slices nil) | |
368 | (place-b (or place-b | |
369 | (with-slots (index tail) scanner | |
370 | (make-charbuf-scanner-place :link tail | |
371 | :index index)))) | |
372 | (last-link (charbuf-scanner-place-link place-b))) | |
373 | (flet ((bad () | |
3109662a | 374 | (error "Incorrect places ~S and ~S to `scanner-interval'." |
dea4d055 MW |
375 | place-a place-b))) |
376 | (do ((link (charbuf-scanner-place-link place-a) | |
377 | (charbuf-chain-link-next link)) | |
378 | (start (charbuf-scanner-place-index place-a) 0)) | |
379 | ((eq link last-link) | |
380 | (let ((end (charbuf-scanner-place-index place-b))) | |
381 | (when (< end start) | |
382 | (bad)) | |
383 | (push (make-charbuf-slice (charbuf-chain-link-buf link) | |
384 | start end) | |
385 | slices) | |
386 | (concatenate-charbuf-slices (nreverse slices)))) | |
387 | (when (null link) (bad)) | |
388 | (push (make-charbuf-slice (charbuf-chain-link-buf link) | |
389 | start | |
390 | (charbuf-chain-link-size link)) | |
391 | slices))))) | |
392 | ||
393 | ;;;-------------------------------------------------------------------------- | |
394 | ;;; Specialized streams. | |
395 | ||
396 | (export 'charbuf-scanner-stream) | |
397 | (defclass charbuf-scanner-stream (character-scanner-stream) | |
398 | ((scanner :initarg :scanner :type charbuf-scanner))) | |
399 | ||
bf090e02 MW |
400 | (defmethod make-scanner-stream ((scanner charbuf-scanner)) |
401 | (make-instance 'charbuf-scanner-stream :scanner scanner)) | |
402 | ||
dea4d055 | 403 | (defmethod stream-read-sequence |
1d8cc67a MW |
404 | ((stream charbuf-scanner-stream) (seq string) |
405 | #+clisp &key #-clisp &optional (start 0) end) | |
dea4d055 MW |
406 | (with-slots (scanner) stream |
407 | (unless end (setf end (length seq))) | |
408 | (let ((i start) (n (- end start))) | |
409 | (labels ((copy (i buf start end) | |
410 | (do ((j i (1+ j)) | |
411 | (k start (1+ k))) | |
412 | ((>= k end)) | |
413 | (setf (char seq j) (schar buf k)))) | |
414 | (snarf (buf start end) | |
415 | (let ((m (- end start))) | |
416 | (cond ((< m n) | |
417 | (copy i buf start end) (decf n m) (incf i m) | |
418 | (values nil 0)) | |
419 | (t | |
420 | (copy i buf start (+ start n)) (incf i n) | |
421 | (values t n)))))) | |
422 | (charbuf-scanner-map scanner #'snarf) | |
423 | i)))) | |
424 | ||
425 | (defmethod stream-read-line ((stream charbuf-scanner-stream)) | |
426 | (with-slots (scanner) stream | |
427 | (let ((slices nil)) | |
428 | (flet ((snarf (buf start end) | |
429 | (let ((pos (position #\newline buf :start start :end end))) | |
430 | (push (make-charbuf-slice buf start (or pos end)) slices) | |
431 | (if pos | |
432 | (values (concatenate-charbuf-slices (nreverse slices)) | |
433 | (1+ pos)) | |
434 | (values nil 0)))) | |
435 | (fail () | |
436 | (values (concatenate-charbuf-slices (nreverse slices)) t))) | |
437 | (charbuf-scanner-map scanner #'snarf #'fail))))) | |
438 | ||
439 | ;;;----- That's all, folks -------------------------------------------------- |