More WIP.
[sod] / src / parser / scanner-charbuf-impl.lisp
CommitLineData
dea4d055
MW
1;;; -*-lisp-*-
2;;;
3;;; Efficient buffering character scanner
4;;;
5;;; (c) 2009 Straylight/Edgeware
6;;;
7
8;;;----- Licensing notice ---------------------------------------------------
9;;;
10;;; This file is part of the Sensble Object Design, an object system for C.
11;;;
12;;; SOD is free software; you can redistribute it and/or modify
13;;; it under the terms of the GNU General Public License as published by
14;;; the Free Software Foundation; either version 2 of the License, or
15;;; (at your option) any later version.
16;;;
17;;; SOD is distributed in the hope that it will be useful,
18;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;;; GNU General Public License for more details.
21;;;
22;;; You should have received a copy of the GNU General Public License
23;;; along with SOD; if not, write to the Free Software Foundation,
24;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25
26(cl:in-package #:sod-parser)
27
28;;;--------------------------------------------------------------------------
29;;; Infrastructure types.
30
1d8cc67a
MW
31(eval-when (:compile-toplevel :load-toplevel :execute)
32 (defconstant charbuf-size 4096
33 "Number of characters in a character buffer."))
dea4d055
MW
34
35(deftype charbuf ()
36 "Type of character buffers."
37 `(simple-string ,charbuf-size))
38
39(deftype charbuf-index ()
40 "Type of indices into character buffers."
41 `(integer 0 ,charbuf-size))
42
43(declaim (inline make-charbuf))
44(defun make-charbuf ()
45 "Return a fresh uninitialized character buffer."
46 (make-array charbuf-size :element-type 'character))
47
48(defstruct charbuf-chain-link
49 "A link in the charbuf scanner's buffer chain.
50
51 Usually the scanner doesn't bother maintaining a buffer chain; but if
52 we've rewound to a captured place then we need to be able to retrace our
53 steps on to later buffers.
54
55 It turns out to be easier to have an explicit link to the next structure
56 in the chain than to maintain a spine of cons cells, so we do that; the
57 only other things we need are the buffer itself and its length, which
58 might be shorter than `charbuf-size', e.g., if we hit end-of-file."
59 (next nil :type (or charbuf-chain-link null))
60 (buf nil :type (or charbuf (member nil :eof)) :read-only t)
61 (size 0 :type charbuf-index :read-only t))
62
63(export 'charbuf-scanner-place-p)
64(defstruct charbuf-scanner-place
65 "A captured place we can return to later.
66
67 We remember the buffer-chain link, so that we can retrace our steps up to
68 the present. We also need the index at which we continue reading
69 characters; and the line and column numbers to resume from."
70 (link nil :type charbuf-chain-link :read-only t)
71 (index 0 :type charbuf-index :read-only t)
72 (line 0 :type fixnum :read-only t)
73 (column 0 :type fixnum :read-only t))
74
75;;;--------------------------------------------------------------------------
76;;; Main class.
77
78(export 'charbuf-scanner)
79(defclass charbuf-scanner (character-scanner)
80 ((stream :initarg :stream :type stream)
81 (buf :initform nil :type (or charbuf (member nil :eof)))
82 (size :initform 0 :type (integer 0 #.charbuf-size))
83 (index :initform 0 :type (integer 0 #.charbuf-size))
84 (captures :initform 0 :type (and fixnum unsigned-byte))
85 (tail :initform nil :type (or charbuf-chain-link null))
86 (unread :initform nil :type (or charbuf-chain-link nil))
87 (filename :initarg :filename :type (or string null)
88 :reader scanner-filename)
bf090e02 89 (line :initarg :line :initform 1 :type fixnum :reader scanner-line)
dea4d055
MW
90 (column :initarg :column :initform 0 :type fixnum :reader scanner-column))
91 (:documentation
92 "An efficient rewindable scanner for character streams.
93
94 The scanner should be used via the parser protocol. The following notes
95 describe the class's slots and the invariants maintained by the class.
96
97 The scanner reads characters from STREAM. It reads in chunks,
98 `charbuf-size' characters at a time, into freshly allocated arrays. At
99 the beginning of time, BUF is nil; and SIZE is 0, indicating that a new
100 buffer needs to be read in; this anomalous situation is remedied during
101 instance initialization. At all times thereafter:
102
103 * If SIZE > 0 then BUF is a `charbuf' containing characters.
104
105 * (<= 0 INDEX SIZE charbuf-size).
106
107 When the current buffer is finished with, another one is fetched. If
108 we've rewound the scanner to a captured place, then there'll be a chain of
109 buffers starting at TAIL (which corresponds to the current buffer); and we
110 should use its NEXT buffer when we've finished this one.
111
112 If there is no next buffer then we should acquire a new one and fill it
113 from the input stream. If there is an outstanding captured place then we
114 must also create a buffer chain entry for this new buffer and link it onto
115 the chain. If there aren't outstanding captures then we don't need to
116 bother with any of that -- earlier places certainly can't be captured and
117 a capture of the current position can allocate its own buffer chain
118 entry.
119
120 Which leaves us with the need to determine whether there are outstanding
121 captures. We simply maintain a counter, and rely on the client releasing
122 captured places properly when he's finished. In practice, this is usually
123 done using the `peek' parser macro so there isn't a problem."))
124
125;;;--------------------------------------------------------------------------
126;;; Utilities.
127
128(defgeneric charbuf-scanner-fetch (scanner)
129 (:documentation
130 "Refill the scanner buffer.
131
132 This is an internal method, which is really only a method so that the
133 compiler will optimize slot references.
134
135 Replace the current buffer with the next one, either from the buffer chain
136 (if we're currently rewound) or with a new buffer from the stream."))
137
138(defmethod charbuf-scanner-fetch ((scanner charbuf-scanner))
139 (with-slots (stream buf size index tail captures) scanner
140 (loop
141 (acond
142
143 ;; If we've hit the end of the line, stop.
144 ((eq buf :eof)
145 (return nil))
146
147 ;; If there's another buffer, we should check it out.
148 ((and tail (charbuf-chain-link-next tail))
149 (setf tail it
150 buf (charbuf-chain-link-buf it)
151 size (charbuf-chain-link-size it)
152 index 0))
153
154 ;; No joy: try reading more stuff from the input stream.
155 (t
156 (let* ((new (make-charbuf))
157 (n (read-sequence new stream :start 0 :end charbuf-size)))
158
159 ;; If there's nothing coming in then store a magical marker.
160 (when (zerop n) (setf new :eof))
161
162 ;; If there's someone watching, link a new entry onto the chain.
163 ;; There must, under these circumstances, be a `tail'.
164 (if (plusp captures)
165 (let ((next (make-charbuf-chain-link :buf new :size n)))
166 (setf (charbuf-chain-link-next tail) next
167 tail next))
168 (setf tail nil))
169
170 ;; Store the new state.
171 (setf buf new
172 size n
173 index 0))))
174
175 ;; If there's stuff in the current buffer, we're done.
176 (when (< index size)
177 (return t)))))
178
179(export 'charbuf-scanner-map)
180(defgeneric charbuf-scanner-map (scanner func &optional fail)
181 (:documentation
182 "Read characters from the SCANNER's raw buffers.
183
184 This is intended to be an efficient and versatile interface for reading
185 characters from a scanner in bulk. The FUNC is invoked repeatedly with
186 three arguments: a simple string BUF and two nonnegative fixnums START and
187 END, indicating that the subsequence of BUF between START (inclusive) and
188 END (exclusive) should be processed. The FUNC returns two values: a
189 generalized boolean DONEP and a nonnegative fixnum USED. If DONEP is
190 false then USED is ignored: the function has consumed the entire buffer
191 and wishes to read more. If DONEP is true then the condition (<= START
192 USED END) must hold; the FUNC has consumed the buffer as far as USED
193 (exclusive) and has completed successfully; the values DONEP and `t' are
3109662a 194 returned as the result of `charbuf-scanner-map'.
dea4d055
MW
195
196 If end-of-file is encountered before FUNC completes successfully then FAIL
3109662a 197 is called with no arguments, and `charbuf-scanner-map' returns whatever
dea4d055
MW
198 FAIL returns.
199
200 Observe that, if FAIL returns a second value of nil, then
201 `charbuf-scanner-map' is usable as a parser expression."))
202
203(defmethod charbuf-scanner-map
204 ((scanner charbuf-scanner) func &optional fail)
205 (with-slots (buf index size) scanner
206 (flet ((offer (buf start end)
207
208 ;; Pass the buffer to the function, and see what it thought.
209 (multiple-value-bind (donep used) (funcall func buf start end)
210
211 ;; Update the position as far as the function read.
212 (with-slots (line column) scanner
213 (let ((l line) (c column) (limit (if donep used end)))
214 (do ((i start (1+ i)))
215 ((>= i limit))
216 (setf (values l c)
217 (update-position (char buf i) l c)))
218 (setf line l column c)))
219
220 ;; If the function is finished then update our state and
221 ;; return.
222 (when donep
223 (setf index used)
224 (when (>= index size)
225 (charbuf-scanner-fetch scanner))
226 (return-from charbuf-scanner-map (values donep t))))))
227
228 ;; If there's anything in the current buffer, offer it to the function.
229 (when (< index size)
230 (offer buf index size))
231
232 ;; Repeatedly fetch new buffers and offer them to the function.
233 ;; Because the buffers are fresh, we know that we must process them
234 ;; from the beginning. Note that `offer' will exit if FUNC has
235 ;; finished, so we don't need to worry about that.
236 (loop
237 (unless (charbuf-scanner-fetch scanner)
238 (return (if fail (funcall fail) (values nil nil))))
239 (offer buf 0 size)))))
240
241;;;--------------------------------------------------------------------------
242;;; Initialization.
243
244(defmethod shared-initialize :after
245 ((scanner charbuf-scanner) slot-names &key)
246
247 ;; Grab the filename from the underlying stream if we don't have a better
248 ;; guess.
249 (default-slot (scanner 'filename slot-names)
250 (with-slots (stream) scanner
251 (aif (stream-pathname stream) (namestring it) nil)))
252
253 ;; Get ready with the first character.
254 (charbuf-scanner-fetch scanner))
255
256;;;--------------------------------------------------------------------------
257;;; Scanner protocol implementation.
258
259(defmethod scanner-at-eof-p ((scanner charbuf-scanner))
260 (with-slots (buf) scanner
261 (eq buf :eof)))
262
263(defmethod scanner-current-char ((scanner charbuf-scanner))
264 (with-slots (buf index) scanner
265 (schar buf index)))
266
267(defmethod scanner-step ((scanner charbuf-scanner))
268 (with-slots (buf size index line column) scanner
269
270 ;; If there's a current character then update the position from it. When
271 ;; is there a current character? When the index is valid.
272 (when (< index size)
273 (setf (values line column)
274 (update-position (schar buf index) line column)))
275
276 ;; Now move the position on. If there's still a character left then we
277 ;; win; otherwise fetch another buffer.
278 (or (< (incf index) size)
279 (charbuf-scanner-fetch scanner))))
280
281(defmethod scanner-unread ((scanner charbuf-scanner) char)
282 (with-slots (buf index size unread tail line column) scanner
283 (cond
284
285 ;; First, let's rewind the buffer index. This isn't going to work if
286 ;; the index is already zero. (Note that this implies that INDEX is
287 ;; zero in the remaining cases.)
288 ((plusp index)
289 (decf index))
290
291 ;; Plan B. Maybe we've been here before, in which case we'll have left
292 ;; the appropriate state kicking about already. Note that, according
293 ;; to the `unread' rules, the character must be the same as last time,
294 ;; so we can just reuse the whole thing unchanged. Also, note that
295 ;; the NEXT field in UNREAD is not nil due to the way that we construct
296 ;; the link below.
297 ((and unread (eql (charbuf-chain-link-next unread) tail))
298 (setf tail unread size 1
299 buf (charbuf-chain-link-buf unread)))
300
301 ;; Nope, we've not been here, at least not recently. We'll concoct a
302 ;; new buffer and put the necessary stuff in it. Store it away for
303 ;; later so that repeated read/unread oscillations at this position
304 ;; don't end up consing enormous arrays too much.
305 (t
306 (let* ((next (or tail (make-charbuf-chain-link :buf buf :size size)))
307 (fake (make-charbuf))
308 (this (make-charbuf-chain-link :buf fake :size 1 :next next)))
309 (setf (schar fake 0) char buf fake size 1
310 tail this unread this))))
311
312 ;; That's that sorted; now we have to fiddle the position.
313 (setf (values line column) (backtrack-position char line column))))
314
315(defmethod scanner-capture-place ((scanner charbuf-scanner))
316 (with-slots (buf size index captures tail line column) scanner
317 (incf captures)
318 (unless tail
319 (setf tail (make-charbuf-chain-link :buf buf :size size)))
320 (make-charbuf-scanner-place :link tail :index index
321 :line line :column column)))
322
323(defmethod scanner-restore-place ((scanner charbuf-scanner) place)
324 (with-slots (buf size index tail line column) scanner
325 (let ((link (charbuf-scanner-place-link place)))
326 (setf buf (charbuf-chain-link-buf link)
327 size (charbuf-chain-link-size link)
328 index (charbuf-scanner-place-index place)
329 line (charbuf-scanner-place-line place)
330 column (charbuf-scanner-place-column place)
331 tail link))))
332
333(defmethod scanner-release-place ((scanner charbuf-scanner) place)
1d8cc67a 334 (declare (ignore place))
dea4d055
MW
335 (with-slots (captures) scanner
336 (decf captures)))
337
338(defstruct (charbuf-slice
339 (:constructor make-charbuf-slice
340 (buf &optional (start 0) %end
341 &aux (end (or %end (length buf))))))
342 (buf nil :type (or charbuf (eql :eof)) :read-only t)
343 (start 0 :type (and fixnum unsigned-byte) :read-only t)
344 (end 0 :type (and fixnum unsigned-byte) :read-only t))
345
346(declaim (inline charbuf-slice-length))
347(defun charbuf-slice-length (slice)
348 (- (charbuf-slice-end slice) (charbuf-slice-start slice)))
349
350(defun concatenate-charbuf-slices (slices)
351 (let* ((len (reduce #'+ slices
352 :key #'charbuf-slice-length
353 :initial-value 0))
354 (string (make-array len :element-type 'character))
355 (i 0))
356 (dolist (slice slices)
357 (let ((buf (charbuf-slice-buf slice))
358 (end (charbuf-slice-end slice)))
359 (do ((j (charbuf-slice-start slice) (1+ j)))
360 ((>= j end))
361 (setf (schar string i) (schar buf j))
362 (incf i))))
363 string))
364
365(defmethod scanner-interval
366 ((scanner charbuf-scanner) place-a &optional place-b)
367 (let* ((slices nil)
368 (place-b (or place-b
369 (with-slots (index tail) scanner
370 (make-charbuf-scanner-place :link tail
371 :index index))))
372 (last-link (charbuf-scanner-place-link place-b)))
373 (flet ((bad ()
3109662a 374 (error "Incorrect places ~S and ~S to `scanner-interval'."
dea4d055
MW
375 place-a place-b)))
376 (do ((link (charbuf-scanner-place-link place-a)
377 (charbuf-chain-link-next link))
378 (start (charbuf-scanner-place-index place-a) 0))
379 ((eq link last-link)
380 (let ((end (charbuf-scanner-place-index place-b)))
381 (when (< end start)
382 (bad))
383 (push (make-charbuf-slice (charbuf-chain-link-buf link)
384 start end)
385 slices)
386 (concatenate-charbuf-slices (nreverse slices))))
387 (when (null link) (bad))
388 (push (make-charbuf-slice (charbuf-chain-link-buf link)
389 start
390 (charbuf-chain-link-size link))
391 slices)))))
392
393;;;--------------------------------------------------------------------------
394;;; Specialized streams.
395
396(export 'charbuf-scanner-stream)
397(defclass charbuf-scanner-stream (character-scanner-stream)
398 ((scanner :initarg :scanner :type charbuf-scanner)))
399
bf090e02
MW
400(defmethod make-scanner-stream ((scanner charbuf-scanner))
401 (make-instance 'charbuf-scanner-stream :scanner scanner))
402
dea4d055 403(defmethod stream-read-sequence
1d8cc67a
MW
404 ((stream charbuf-scanner-stream) (seq string)
405 #+clisp &key #-clisp &optional (start 0) end)
dea4d055
MW
406 (with-slots (scanner) stream
407 (unless end (setf end (length seq)))
408 (let ((i start) (n (- end start)))
409 (labels ((copy (i buf start end)
410 (do ((j i (1+ j))
411 (k start (1+ k)))
412 ((>= k end))
413 (setf (char seq j) (schar buf k))))
414 (snarf (buf start end)
415 (let ((m (- end start)))
416 (cond ((< m n)
417 (copy i buf start end) (decf n m) (incf i m)
418 (values nil 0))
419 (t
420 (copy i buf start (+ start n)) (incf i n)
421 (values t n))))))
422 (charbuf-scanner-map scanner #'snarf)
423 i))))
424
425(defmethod stream-read-line ((stream charbuf-scanner-stream))
426 (with-slots (scanner) stream
427 (let ((slices nil))
428 (flet ((snarf (buf start end)
429 (let ((pos (position #\newline buf :start start :end end)))
430 (push (make-charbuf-slice buf start (or pos end)) slices)
431 (if pos
432 (values (concatenate-charbuf-slices (nreverse slices))
433 (1+ pos))
434 (values nil 0))))
435 (fail ()
436 (values (concatenate-charbuf-slices (nreverse slices)) t)))
437 (charbuf-scanner-map scanner #'snarf #'fail)))))
438
439;;;----- That's all, folks --------------------------------------------------