Commit | Line | Data |
---|---|---|
dea4d055 MW |
1 | ;;; -*-lisp-*- |
2 | ;;; | |
3 | ;;; Efficient buffering character scanner | |
4 | ;;; | |
5 | ;;; (c) 2009 Straylight/Edgeware | |
6 | ;;; | |
7 | ||
8 | ;;;----- Licensing notice --------------------------------------------------- | |
9 | ;;; | |
10 | ;;; This file is part of the Sensble Object Design, an object system for C. | |
11 | ;;; | |
12 | ;;; SOD is free software; you can redistribute it and/or modify | |
13 | ;;; it under the terms of the GNU General Public License as published by | |
14 | ;;; the Free Software Foundation; either version 2 of the License, or | |
15 | ;;; (at your option) any later version. | |
16 | ;;; | |
17 | ;;; SOD is distributed in the hope that it will be useful, | |
18 | ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of | |
19 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
20 | ;;; GNU General Public License for more details. | |
21 | ;;; | |
22 | ;;; You should have received a copy of the GNU General Public License | |
23 | ;;; along with SOD; if not, write to the Free Software Foundation, | |
24 | ;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | |
25 | ||
26 | (cl:in-package #:sod-parser) | |
27 | ||
28 | ;;;-------------------------------------------------------------------------- | |
29 | ;;; Infrastructure types. | |
30 | ||
31 | (defconstant charbuf-size 4096 | |
32 | "Number of characters in a character buffer.") | |
33 | ||
34 | (deftype charbuf () | |
35 | "Type of character buffers." | |
36 | `(simple-string ,charbuf-size)) | |
37 | ||
38 | (deftype charbuf-index () | |
39 | "Type of indices into character buffers." | |
40 | `(integer 0 ,charbuf-size)) | |
41 | ||
42 | (declaim (inline make-charbuf)) | |
43 | (defun make-charbuf () | |
44 | "Return a fresh uninitialized character buffer." | |
45 | (make-array charbuf-size :element-type 'character)) | |
46 | ||
47 | (defstruct charbuf-chain-link | |
48 | "A link in the charbuf scanner's buffer chain. | |
49 | ||
50 | Usually the scanner doesn't bother maintaining a buffer chain; but if | |
51 | we've rewound to a captured place then we need to be able to retrace our | |
52 | steps on to later buffers. | |
53 | ||
54 | It turns out to be easier to have an explicit link to the next structure | |
55 | in the chain than to maintain a spine of cons cells, so we do that; the | |
56 | only other things we need are the buffer itself and its length, which | |
57 | might be shorter than `charbuf-size', e.g., if we hit end-of-file." | |
58 | (next nil :type (or charbuf-chain-link null)) | |
59 | (buf nil :type (or charbuf (member nil :eof)) :read-only t) | |
60 | (size 0 :type charbuf-index :read-only t)) | |
61 | ||
62 | (export 'charbuf-scanner-place-p) | |
63 | (defstruct charbuf-scanner-place | |
64 | "A captured place we can return to later. | |
65 | ||
66 | We remember the buffer-chain link, so that we can retrace our steps up to | |
67 | the present. We also need the index at which we continue reading | |
68 | characters; and the line and column numbers to resume from." | |
69 | (link nil :type charbuf-chain-link :read-only t) | |
70 | (index 0 :type charbuf-index :read-only t) | |
71 | (line 0 :type fixnum :read-only t) | |
72 | (column 0 :type fixnum :read-only t)) | |
73 | ||
74 | ;;;-------------------------------------------------------------------------- | |
75 | ;;; Main class. | |
76 | ||
77 | (export 'charbuf-scanner) | |
78 | (defclass charbuf-scanner (character-scanner) | |
79 | ((stream :initarg :stream :type stream) | |
80 | (buf :initform nil :type (or charbuf (member nil :eof))) | |
81 | (size :initform 0 :type (integer 0 #.charbuf-size)) | |
82 | (index :initform 0 :type (integer 0 #.charbuf-size)) | |
83 | (captures :initform 0 :type (and fixnum unsigned-byte)) | |
84 | (tail :initform nil :type (or charbuf-chain-link null)) | |
85 | (unread :initform nil :type (or charbuf-chain-link nil)) | |
86 | (filename :initarg :filename :type (or string null) | |
87 | :reader scanner-filename) | |
bf090e02 | 88 | (line :initarg :line :initform 1 :type fixnum :reader scanner-line) |
dea4d055 MW |
89 | (column :initarg :column :initform 0 :type fixnum :reader scanner-column)) |
90 | (:documentation | |
91 | "An efficient rewindable scanner for character streams. | |
92 | ||
93 | The scanner should be used via the parser protocol. The following notes | |
94 | describe the class's slots and the invariants maintained by the class. | |
95 | ||
96 | The scanner reads characters from STREAM. It reads in chunks, | |
97 | `charbuf-size' characters at a time, into freshly allocated arrays. At | |
98 | the beginning of time, BUF is nil; and SIZE is 0, indicating that a new | |
99 | buffer needs to be read in; this anomalous situation is remedied during | |
100 | instance initialization. At all times thereafter: | |
101 | ||
102 | * If SIZE > 0 then BUF is a `charbuf' containing characters. | |
103 | ||
104 | * (<= 0 INDEX SIZE charbuf-size). | |
105 | ||
106 | When the current buffer is finished with, another one is fetched. If | |
107 | we've rewound the scanner to a captured place, then there'll be a chain of | |
108 | buffers starting at TAIL (which corresponds to the current buffer); and we | |
109 | should use its NEXT buffer when we've finished this one. | |
110 | ||
111 | If there is no next buffer then we should acquire a new one and fill it | |
112 | from the input stream. If there is an outstanding captured place then we | |
113 | must also create a buffer chain entry for this new buffer and link it onto | |
114 | the chain. If there aren't outstanding captures then we don't need to | |
115 | bother with any of that -- earlier places certainly can't be captured and | |
116 | a capture of the current position can allocate its own buffer chain | |
117 | entry. | |
118 | ||
119 | Which leaves us with the need to determine whether there are outstanding | |
120 | captures. We simply maintain a counter, and rely on the client releasing | |
121 | captured places properly when he's finished. In practice, this is usually | |
122 | done using the `peek' parser macro so there isn't a problem.")) | |
123 | ||
124 | ;;;-------------------------------------------------------------------------- | |
125 | ;;; Utilities. | |
126 | ||
127 | (defgeneric charbuf-scanner-fetch (scanner) | |
128 | (:documentation | |
129 | "Refill the scanner buffer. | |
130 | ||
131 | This is an internal method, which is really only a method so that the | |
132 | compiler will optimize slot references. | |
133 | ||
134 | Replace the current buffer with the next one, either from the buffer chain | |
135 | (if we're currently rewound) or with a new buffer from the stream.")) | |
136 | ||
137 | (defmethod charbuf-scanner-fetch ((scanner charbuf-scanner)) | |
138 | (with-slots (stream buf size index tail captures) scanner | |
139 | (loop | |
140 | (acond | |
141 | ||
142 | ;; If we've hit the end of the line, stop. | |
143 | ((eq buf :eof) | |
144 | (return nil)) | |
145 | ||
146 | ;; If there's another buffer, we should check it out. | |
147 | ((and tail (charbuf-chain-link-next tail)) | |
148 | (setf tail it | |
149 | buf (charbuf-chain-link-buf it) | |
150 | size (charbuf-chain-link-size it) | |
151 | index 0)) | |
152 | ||
153 | ;; No joy: try reading more stuff from the input stream. | |
154 | (t | |
155 | (let* ((new (make-charbuf)) | |
156 | (n (read-sequence new stream :start 0 :end charbuf-size))) | |
157 | ||
158 | ;; If there's nothing coming in then store a magical marker. | |
159 | (when (zerop n) (setf new :eof)) | |
160 | ||
161 | ;; If there's someone watching, link a new entry onto the chain. | |
162 | ;; There must, under these circumstances, be a `tail'. | |
163 | (if (plusp captures) | |
164 | (let ((next (make-charbuf-chain-link :buf new :size n))) | |
165 | (setf (charbuf-chain-link-next tail) next | |
166 | tail next)) | |
167 | (setf tail nil)) | |
168 | ||
169 | ;; Store the new state. | |
170 | (setf buf new | |
171 | size n | |
172 | index 0)))) | |
173 | ||
174 | ;; If there's stuff in the current buffer, we're done. | |
175 | (when (< index size) | |
176 | (return t))))) | |
177 | ||
178 | (export 'charbuf-scanner-map) | |
179 | (defgeneric charbuf-scanner-map (scanner func &optional fail) | |
180 | (:documentation | |
181 | "Read characters from the SCANNER's raw buffers. | |
182 | ||
183 | This is intended to be an efficient and versatile interface for reading | |
184 | characters from a scanner in bulk. The FUNC is invoked repeatedly with | |
185 | three arguments: a simple string BUF and two nonnegative fixnums START and | |
186 | END, indicating that the subsequence of BUF between START (inclusive) and | |
187 | END (exclusive) should be processed. The FUNC returns two values: a | |
188 | generalized boolean DONEP and a nonnegative fixnum USED. If DONEP is | |
189 | false then USED is ignored: the function has consumed the entire buffer | |
190 | and wishes to read more. If DONEP is true then the condition (<= START | |
191 | USED END) must hold; the FUNC has consumed the buffer as far as USED | |
192 | (exclusive) and has completed successfully; the values DONEP and `t' are | |
3109662a | 193 | returned as the result of `charbuf-scanner-map'. |
dea4d055 MW |
194 | |
195 | If end-of-file is encountered before FUNC completes successfully then FAIL | |
3109662a | 196 | is called with no arguments, and `charbuf-scanner-map' returns whatever |
dea4d055 MW |
197 | FAIL returns. |
198 | ||
199 | Observe that, if FAIL returns a second value of nil, then | |
200 | `charbuf-scanner-map' is usable as a parser expression.")) | |
201 | ||
202 | (defmethod charbuf-scanner-map | |
203 | ((scanner charbuf-scanner) func &optional fail) | |
204 | (with-slots (buf index size) scanner | |
205 | (flet ((offer (buf start end) | |
206 | ||
207 | ;; Pass the buffer to the function, and see what it thought. | |
208 | (multiple-value-bind (donep used) (funcall func buf start end) | |
209 | ||
210 | ;; Update the position as far as the function read. | |
211 | (with-slots (line column) scanner | |
212 | (let ((l line) (c column) (limit (if donep used end))) | |
213 | (do ((i start (1+ i))) | |
214 | ((>= i limit)) | |
215 | (setf (values l c) | |
216 | (update-position (char buf i) l c))) | |
217 | (setf line l column c))) | |
218 | ||
219 | ;; If the function is finished then update our state and | |
220 | ;; return. | |
221 | (when donep | |
222 | (setf index used) | |
223 | (when (>= index size) | |
224 | (charbuf-scanner-fetch scanner)) | |
225 | (return-from charbuf-scanner-map (values donep t)))))) | |
226 | ||
227 | ;; If there's anything in the current buffer, offer it to the function. | |
228 | (when (< index size) | |
229 | (offer buf index size)) | |
230 | ||
231 | ;; Repeatedly fetch new buffers and offer them to the function. | |
232 | ;; Because the buffers are fresh, we know that we must process them | |
233 | ;; from the beginning. Note that `offer' will exit if FUNC has | |
234 | ;; finished, so we don't need to worry about that. | |
235 | (loop | |
236 | (unless (charbuf-scanner-fetch scanner) | |
237 | (return (if fail (funcall fail) (values nil nil)))) | |
238 | (offer buf 0 size))))) | |
239 | ||
240 | ;;;-------------------------------------------------------------------------- | |
241 | ;;; Initialization. | |
242 | ||
243 | (defmethod shared-initialize :after | |
244 | ((scanner charbuf-scanner) slot-names &key) | |
245 | ||
246 | ;; Grab the filename from the underlying stream if we don't have a better | |
247 | ;; guess. | |
248 | (default-slot (scanner 'filename slot-names) | |
249 | (with-slots (stream) scanner | |
250 | (aif (stream-pathname stream) (namestring it) nil))) | |
251 | ||
252 | ;; Get ready with the first character. | |
253 | (charbuf-scanner-fetch scanner)) | |
254 | ||
255 | ;;;-------------------------------------------------------------------------- | |
256 | ;;; Scanner protocol implementation. | |
257 | ||
258 | (defmethod scanner-at-eof-p ((scanner charbuf-scanner)) | |
259 | (with-slots (buf) scanner | |
260 | (eq buf :eof))) | |
261 | ||
262 | (defmethod scanner-current-char ((scanner charbuf-scanner)) | |
263 | (with-slots (buf index) scanner | |
264 | (schar buf index))) | |
265 | ||
266 | (defmethod scanner-step ((scanner charbuf-scanner)) | |
267 | (with-slots (buf size index line column) scanner | |
268 | ||
269 | ;; If there's a current character then update the position from it. When | |
270 | ;; is there a current character? When the index is valid. | |
271 | (when (< index size) | |
272 | (setf (values line column) | |
273 | (update-position (schar buf index) line column))) | |
274 | ||
275 | ;; Now move the position on. If there's still a character left then we | |
276 | ;; win; otherwise fetch another buffer. | |
277 | (or (< (incf index) size) | |
278 | (charbuf-scanner-fetch scanner)))) | |
279 | ||
280 | (defmethod scanner-unread ((scanner charbuf-scanner) char) | |
281 | (with-slots (buf index size unread tail line column) scanner | |
282 | (cond | |
283 | ||
284 | ;; First, let's rewind the buffer index. This isn't going to work if | |
285 | ;; the index is already zero. (Note that this implies that INDEX is | |
286 | ;; zero in the remaining cases.) | |
287 | ((plusp index) | |
288 | (decf index)) | |
289 | ||
290 | ;; Plan B. Maybe we've been here before, in which case we'll have left | |
291 | ;; the appropriate state kicking about already. Note that, according | |
292 | ;; to the `unread' rules, the character must be the same as last time, | |
293 | ;; so we can just reuse the whole thing unchanged. Also, note that | |
294 | ;; the NEXT field in UNREAD is not nil due to the way that we construct | |
295 | ;; the link below. | |
296 | ((and unread (eql (charbuf-chain-link-next unread) tail)) | |
297 | (setf tail unread size 1 | |
298 | buf (charbuf-chain-link-buf unread))) | |
299 | ||
300 | ;; Nope, we've not been here, at least not recently. We'll concoct a | |
301 | ;; new buffer and put the necessary stuff in it. Store it away for | |
302 | ;; later so that repeated read/unread oscillations at this position | |
303 | ;; don't end up consing enormous arrays too much. | |
304 | (t | |
305 | (let* ((next (or tail (make-charbuf-chain-link :buf buf :size size))) | |
306 | (fake (make-charbuf)) | |
307 | (this (make-charbuf-chain-link :buf fake :size 1 :next next))) | |
308 | (setf (schar fake 0) char buf fake size 1 | |
309 | tail this unread this)))) | |
310 | ||
311 | ;; That's that sorted; now we have to fiddle the position. | |
312 | (setf (values line column) (backtrack-position char line column)))) | |
313 | ||
314 | (defmethod scanner-capture-place ((scanner charbuf-scanner)) | |
315 | (with-slots (buf size index captures tail line column) scanner | |
316 | (incf captures) | |
317 | (unless tail | |
318 | (setf tail (make-charbuf-chain-link :buf buf :size size))) | |
319 | (make-charbuf-scanner-place :link tail :index index | |
320 | :line line :column column))) | |
321 | ||
322 | (defmethod scanner-restore-place ((scanner charbuf-scanner) place) | |
323 | (with-slots (buf size index tail line column) scanner | |
324 | (let ((link (charbuf-scanner-place-link place))) | |
325 | (setf buf (charbuf-chain-link-buf link) | |
326 | size (charbuf-chain-link-size link) | |
327 | index (charbuf-scanner-place-index place) | |
328 | line (charbuf-scanner-place-line place) | |
329 | column (charbuf-scanner-place-column place) | |
330 | tail link)))) | |
331 | ||
332 | (defmethod scanner-release-place ((scanner charbuf-scanner) place) | |
333 | (with-slots (captures) scanner | |
334 | (decf captures))) | |
335 | ||
336 | (defstruct (charbuf-slice | |
337 | (:constructor make-charbuf-slice | |
338 | (buf &optional (start 0) %end | |
339 | &aux (end (or %end (length buf)))))) | |
340 | (buf nil :type (or charbuf (eql :eof)) :read-only t) | |
341 | (start 0 :type (and fixnum unsigned-byte) :read-only t) | |
342 | (end 0 :type (and fixnum unsigned-byte) :read-only t)) | |
343 | ||
344 | (declaim (inline charbuf-slice-length)) | |
345 | (defun charbuf-slice-length (slice) | |
346 | (- (charbuf-slice-end slice) (charbuf-slice-start slice))) | |
347 | ||
348 | (defun concatenate-charbuf-slices (slices) | |
349 | (let* ((len (reduce #'+ slices | |
350 | :key #'charbuf-slice-length | |
351 | :initial-value 0)) | |
352 | (string (make-array len :element-type 'character)) | |
353 | (i 0)) | |
354 | (dolist (slice slices) | |
355 | (let ((buf (charbuf-slice-buf slice)) | |
356 | (end (charbuf-slice-end slice))) | |
357 | (do ((j (charbuf-slice-start slice) (1+ j))) | |
358 | ((>= j end)) | |
359 | (setf (schar string i) (schar buf j)) | |
360 | (incf i)))) | |
361 | string)) | |
362 | ||
363 | (defmethod scanner-interval | |
364 | ((scanner charbuf-scanner) place-a &optional place-b) | |
365 | (let* ((slices nil) | |
366 | (place-b (or place-b | |
367 | (with-slots (index tail) scanner | |
368 | (make-charbuf-scanner-place :link tail | |
369 | :index index)))) | |
370 | (last-link (charbuf-scanner-place-link place-b))) | |
371 | (flet ((bad () | |
3109662a | 372 | (error "Incorrect places ~S and ~S to `scanner-interval'." |
dea4d055 MW |
373 | place-a place-b))) |
374 | (do ((link (charbuf-scanner-place-link place-a) | |
375 | (charbuf-chain-link-next link)) | |
376 | (start (charbuf-scanner-place-index place-a) 0)) | |
377 | ((eq link last-link) | |
378 | (let ((end (charbuf-scanner-place-index place-b))) | |
379 | (when (< end start) | |
380 | (bad)) | |
381 | (push (make-charbuf-slice (charbuf-chain-link-buf link) | |
382 | start end) | |
383 | slices) | |
384 | (concatenate-charbuf-slices (nreverse slices)))) | |
385 | (when (null link) (bad)) | |
386 | (push (make-charbuf-slice (charbuf-chain-link-buf link) | |
387 | start | |
388 | (charbuf-chain-link-size link)) | |
389 | slices))))) | |
390 | ||
391 | ;;;-------------------------------------------------------------------------- | |
392 | ;;; Specialized streams. | |
393 | ||
394 | (export 'charbuf-scanner-stream) | |
395 | (defclass charbuf-scanner-stream (character-scanner-stream) | |
396 | ((scanner :initarg :scanner :type charbuf-scanner))) | |
397 | ||
bf090e02 MW |
398 | (defmethod make-scanner-stream ((scanner charbuf-scanner)) |
399 | (make-instance 'charbuf-scanner-stream :scanner scanner)) | |
400 | ||
dea4d055 MW |
401 | (defmethod stream-read-sequence |
402 | ((stream charbuf-scanner-stream) (seq string) &optional (start 0) end) | |
403 | (with-slots (scanner) stream | |
404 | (unless end (setf end (length seq))) | |
405 | (let ((i start) (n (- end start))) | |
406 | (labels ((copy (i buf start end) | |
407 | (do ((j i (1+ j)) | |
408 | (k start (1+ k))) | |
409 | ((>= k end)) | |
410 | (setf (char seq j) (schar buf k)))) | |
411 | (snarf (buf start end) | |
412 | (let ((m (- end start))) | |
413 | (cond ((< m n) | |
414 | (copy i buf start end) (decf n m) (incf i m) | |
415 | (values nil 0)) | |
416 | (t | |
417 | (copy i buf start (+ start n)) (incf i n) | |
418 | (values t n)))))) | |
419 | (charbuf-scanner-map scanner #'snarf) | |
420 | i)))) | |
421 | ||
422 | (defmethod stream-read-line ((stream charbuf-scanner-stream)) | |
423 | (with-slots (scanner) stream | |
424 | (let ((slices nil)) | |
425 | (flet ((snarf (buf start end) | |
426 | (let ((pos (position #\newline buf :start start :end end))) | |
427 | (push (make-charbuf-slice buf start (or pos end)) slices) | |
428 | (if pos | |
429 | (values (concatenate-charbuf-slices (nreverse slices)) | |
430 | (1+ pos)) | |
431 | (values nil 0)))) | |
432 | (fail () | |
433 | (values (concatenate-charbuf-slices (nreverse slices)) t))) | |
434 | (charbuf-scanner-map scanner #'snarf #'fail))))) | |
435 | ||
436 | ;;;----- That's all, folks -------------------------------------------------- |