doc/: Sort out the manual structure. Write stuff.
[sod] / doc / parsing.tex
1 %%% -*-latex-*-
2 %%%
3 %%% Description of the parsing machinery
4 %%%
5 %%% (c) 2015 Straylight/Edgeware
6 %%%
7
8 %%%----- Licensing notice ---------------------------------------------------
9 %%%
10 %%% This file is part of the Sensble Object Design, an object system for C.
11 %%%
12 %%% SOD is free software; you can redistribute it and/or modify
13 %%% it under the terms of the GNU General Public License as published by
14 %%% the Free Software Foundation; either version 2 of the License, or
15 %%% (at your option) any later version.
16 %%%
17 %%% SOD is distributed in the hope that it will be useful,
18 %%% but WITHOUT ANY WARRANTY; without even the implied warranty of
19 %%% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 %%% GNU General Public License for more details.
21 %%%
22 %%% You should have received a copy of the GNU General Public License
23 %%% along with SOD; if not, write to the Free Software Foundation,
24 %%% Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25
26 \chapter{Parsing} \label{ch:parsing}
27
28 %%%--------------------------------------------------------------------------
29 \section{The parser protocol} \label{sec:parsing.proto}
30
31 For the purpose of Sod's parsing library, \emph{parsing} is the process of
32 reading a sequence of input items, in order, and computing an output value.
33
34 A \emph{parser} is an expression which consumes zero or more input items and
35 returns three values: a \emph{result}, a \emph{success flag}, and a
36 \emph{consumed flag}. The two flags are (generalized) booleans. If the
37 success flag is non-nil, then the parser is said to have \emph{succeeded},
38 and the result is the parser's output. If the success flag is nil then the
39 parser is said to have \emph{failed}, and the result is a list of
40 \emph{indicators}. Finally, the consumed flag is non-nil if the parser
41 consumed any input items.
42
43 %%%--------------------------------------------------------------------------
44 \section{File locations}
45
46 %%%--------------------------------------------------------------------------
47 \section{Scanners} \label{sec:parsing.scanner}
48
49 A \emph{scanner} is an object which keeps track of a parser's progress as it
50 works through its input. There's no common base class for scanners: a
51 scanner is simply any object which implements the scanner protocol described
52 here.
53
54 A scanner maintains a sequence of items to read. It can step forwards
55 through the items, one at a time, until it reaches the end (if, indeed, the
56 sequence is finite, which it needn't be). Until that point, there is a
57 current item, though there's no protocol for accessing it at this level
58 because the nature of the items is left unspecified.
59
60 Some scanners support an additional \emph{place-capture} protocol which
61 allows rewinding the scanner to an earlier point in the input so that it can
62 be scanned again.
63
64 \subsection{Basic scanner protocol} \label{sec:parsing.scanner.basic}
65
66 The basic protocol supports stepping the scanner forward through its input
67 sequence, and detecting the end of the sequence.
68
69 \begin{describe}{gf}{scanner-step @<scanner>}
70 Advance the @<scanner> to the next item, which becomes current.
71
72 It is an error to step the scanner if the scanner is at end-of-file.
73 \end{describe}
74
75 \begin{describe}{gf}{scanner-at-eof-p @<scanner> @> @<generalized-boolean>}
76 Return non-nil if the scanner is at end-of-file, i.e., there are no more
77 items to read.
78
79 If nil is returned, there is a current item, and it is safe to step the
80 scanner again; otherwise, it is an error to query the current item or to
81 step the scanner.
82 \end{describe}
83
84 \subsection{Place-capture scanner protocol} \label{sec:parsing.scanner.place}
85
86 The place-capture protocol allows rewinding to an earlier point in the
87 sequence. Not all scanners support the place-capture protocol.
88
89 To rewind a scanner to a particular point, that point must be \emph{captured}
90 as a \emph{place} when it's current -- so you must know in advance that this
91 is an interesting place that's worth capturing. The type of place returned
92 depends on the type of scanner. Given a captured place, the scanner can be
93 rewound to the position held in it.
94
95 Depending on how the scanner works, holding onto a captured place might
96 consume a lot of memory or case poor performance. For example, if the
97 scanner is reading from an input stream, having a captured place means that
98 data from that point on must be buffered in case the program needs to rewind
99 the scanner and read that data again. Therefore it's possible to
100 \emph{release} a place when it turns out not to be needed any more.
101
102 \begin{describe}{gf}{scanner-capture-place @<scanner> @> @<place>}
103 Capture the @<scanner>'s current position as a place, and return the place.
104 \end{describe}
105
106 \begin{describe}{gf}{scanner-restore-place @<scanner> @<place>}
107 Rewind the @<scanner> to the state it was in when @<place> was captured.
108 In particular, the item that was current when the @<place> was captured
109 becomes current again.
110
111 It is an error to restore a @<place> that has been released, or if the
112 @<place> wasn't captured from the @<scanner>.
113 \end{describe}
114
115 \begin{describe}{gf}{scanner-release-place @<scanner> @<place>}
116 Release the @<place>, to avoid having to maintaining the ability to restore
117 it after it's not needed any more..
118
119 It is an error if the @<place> wasn't captured from the @<scanner>.
120 \end{describe}
121
122 \begin{describe}{mac}
123 {with-scanner-place (@<place> @<scanner>) @<body-form>^* @> @<value>^*}
124 Capture the @<scanner>'s current position as a place, evaluate the
125 @<body-form>s as an implicit progn with the variable @<place> bound to the captured
126 place. When control leaves the @<body-form>s, the place is released. The return
127 values are the values of the final @<body-form>.
128 \end{describe}
129
130 \subsection{Scanner file-location protocol} \label{sec:parsing.scanner.floc}
131
132 Some scanners participate in the file-location protocol (\xref{sec:floc}).
133 They implement a method on @|file-location| which collects the necessary
134 information using scanner-specific functions described here.
135
136 \begin{describe}{fun}{scanner-file-location @<scanner> @> @<file-location>}
137 Return a @|file-location| object describing the current position of the
138 @<scanner>.
139
140 This calls the @|scanner-filename|, @|scanner-line| and @|scanner-column|
141 generic functions on the scanner, and uses these to fill in an appropriate
142 @|file-location|.
143
144 Since there are default methods on these generic functions, it is not an
145 error to call @|scanner-file-location| on any kind of value, but it might
146 not be very useful. This function exists to do the work of appropriately
147 specialized methods on @|file-location|.
148 \end{describe}
149
150 \begin{describe}{gf}{scanner-filename @<scanner> @> @<string>}
151 Return the name of the file the scanner is currently processing, as a
152 string, or nil if the filename is not known.
153 \end{describe}
154
155 \begin{describe}{meth}{scanner-filename (@<scanner> t) @> @<string>}
156 Returns nil.
157 \end{describe}
158
159 \begin{describe}{gf}{scanner-line @<scanner> @> @<integer>}
160 Return the line number of the @<scanner>'s current position, as an integer,
161 or nil if the line number is not known.
162 \end{describe}
163
164 \begin{describe}{meth}{scanner-line (@<scanner> t) @> @<integer>}
165 Returns nil.
166 \end{describe}
167
168 \begin{describe}{gf}{scanner-column @<scanner> @> @<integer>}
169 Return the column number of the @<scanner>'s current position, as an
170 integer, or nil if the column number is not known.
171 \end{describe}
172
173 \begin{describe}{meth}{scanner-column (@<scanner> t) @> @<integer>}
174 Returns nil.
175 \end{describe}
176
177 \subsection{Character scanners} \label{sec:parsing.scanner.char}
178
179 Character scanners are scanners which read sequences of characters.
180
181 \begin{describe}{cls}{character-scanner () \&key}
182 Base class for character scanners. This provides some very basic
183 functionality.
184
185 Not all character scanners are subclasses of @|character-scanner|.
186 \end{describe}
187
188 \begin{describe}{gf}{scanner-current-char @<scanner> @> @<character>}
189 Returns the current character.
190 \end{describe}
191
192 \begin{describe}{gf}{scanner-unread @<scanner> @<character>}
193 Rewind the @<scanner> by one step. The @<chararacter> must be the previous
194 current character, and becomes the current character again. It is an error
195 if: the @<scanner> has reached end-of-file; the @<scanner> is never been
196 stepped; or @<character> was not the previous current character.
197 \end{describe}
198
199 \begin{describe}{gf}
200 {scanner-interval @<scanner> @<place-a> \&optional @<place-b>
201 @> @<string>}
202 Return the characters in the @<scanner>'s input from @<place-a> up to (but
203 not including) @<place-b>.
204
205 The characters are returned as a string. If @<place-b> is omitted, return
206 the characters up to (but not including) the current position. It is an
207 error if @<place-b> precedes @<place-a> or they are from different
208 scanners.
209
210 This function is a character-scanner-specific extension to the
211 place-capture protocol; not all character scanners implement the
212 place-capture protocol, and some that do may not implement this function.
213 \end{describe}
214
215 \subsubsection{Stream access to character scanners}
216 Sometimes it can be useful to apply the standard Lisp character input
217 operations to the sequence of characters held by a character scanner.
218
219 \begin{describe}{gf}{make-scanner-stream @<scanner> @> @<stream>}
220 Returns a fresh input @|stream| object which fetches input characters from
221 the character scanner object @<scanner>. Reading characters from the
222 stream steps the scanner. The stream will reach end-of-file when the
223 scanner reports end-of-file. If the scanner implements the file-location
224 protocol then reading from the stream will change the file location in an
225 appropriate manner.
226
227 This is mostly useful for applying standard Lisp stream functions, most
228 particularly the @|read| function, in the middle of a parsing operation.
229 \end{describe}
230
231 \begin{describe}{cls}{character-scanner-stream (stream) \&key :scanner}
232 A Common Lisp input @|stream| object which works using the character
233 scanner protocol. Any @<scanner> which implements the base scanner and
234 character scanner protocols is suitable. See @|make-scanner-stream|.
235 \end{describe}
236
237 \subsection{String scanners} \label{sec:parsing.scanner.string}
238
239 A \emph{string scanner} is a simple kind of character scanner which reads
240 input from a string object. String scanners implement the character scanner
241 and place-capture protocols.
242
243 \begin{describe}{cls}{string-scanner}
244 The class of string scanners. The @|string-scanner| class is not a
245 subclass of @|character-scanner|.
246 \end{describe}
247
248 \begin{describe}{fun}{string-scanner-p @<value> @> @<generalized-boolean>}
249 Return non-nil if @<value> is a @|string-scanner| object; otherwise return
250 nil.
251 \end{describe}
252
253 \begin{describe}{fun}
254 {make-string-scanner @<string> \&key :start :end @> @<string-scanner>}
255 Construct and return a fresh @|string-scanner| object. The new scanner
256 will read characters from @<string>, starting at index @<start> (which
257 defaults to zero), and continuing until it reaches index @<end> (defaults
258 to the end of the @<string>).
259 \end{describe}
260
261 \subsection{Character buffer scanners} \label{sec:parsing.scanner.charbuf}
262
263 A \emph{character buffer scanner}, or \emph{charbuf scanner} for short, is an
264 efficient scanner for reading characters from an input stream. Charbuf
265 scanners implements the basic scanner, character buffer, place-capture, and
266 file-location protocols.
267
268 \begin{describe}{cls}
269 {charbuf-scanner (character-scanner)
270 \&key :stream :filename :line :column}
271 The class of charbuf scanners. The scanner will read characters from
272 @<stream>. Charbuf scanners implement the file-location protocol: the
273 initial location is set from the given @<filename>, @<line> and @<column>;
274 the scanner will update the location as it reads its input.
275 \end{describe}
276
277 \begin{describe}{cls}{charbuf-scanner-place}
278 The class of place objects captured by a charbuf scanner.
279 \end{describe}
280
281 \begin{describe}{fun}
282 {charbuf-scanner-place-p @<value> @> @<generalized-boolean>}
283 Type predicate for charbuf scanner places: returns non-nil if @<value> is a
284 place captured by a charbuf scanner, and nil otherwise.
285 \end{describe}
286
287 \begin{describe}{gf}
288 {charbuf-scanner-map @<scanner> @<func> \&optional @<fail>
289 \nlret @<result> @<successp> @<consumedp>}
290 Read characters from the @<scanner>'s buffers.
291
292 This is intended to be an efficient and versatile interface for reading
293 characters from a scanner in bulk. The function @<func> is invoked
294 repeatedly, as if by
295 \begin{prog}
296 (multiple-value-bind (@<donep> @<used>) \\ \ind\ind
297 (funcall @<func> @<buf> @<start> @<end>) \- \\
298 \textrm\ldots)
299 \end{prog}
300 The argument @<buf> is a simple string; @<start> and @<end> are two
301 nonnegative fixnums, indicating that the subsequence of @<buf> between
302 @<start> (inclusive) and @<end> (exclusive) should be processed. If
303 @<func>'s return value @<donep> is nil then @<used> is ignored: the
304 function has consumed the entire buffer and wishes to read more. If
305 @<donep> is non-nil, then it must be a fixnum such that $@<start> \le
306 @<used> \le @<end>$: the function has consumed the buffer as far as @<used>
307 (exclusive) and has completed successfully.
308
309 If end-of-file is encountered before @<func> completes successfully then it
310 fails: the @<fail> function is called with no arguments, and is expected to
311 return two values. If omitted, @<fail> defaults to
312 \begin{prog}
313 (lambda () \\ \ind
314 (values nil nil))%
315 \end{prog}
316
317 The @|charbuf-scanner-map| function returns three values. The first value
318 is the non-nil @<donep> value returned by @<func> if @|charbuf-scanner-map|
319 succeeded, or the first value returned by @<fail>; the second value is @|t|
320 on success, or the second value returned by @<fail>; the third value is
321 non-nil if @<func> consumed any input, i.e., it returned with @<donep> nil
322 at least once, or with $@<used> > @<start>$.
323 \end{describe}
324
325 \subsection{Token scanners} \label{sec:parsing.scanner.token}
326
327 \begin{describe}{cls}
328 {token-scanner () \&key :filename (:line 1) (:column 0)}
329 \end{describe}
330
331 \begin{describe}{gf}{token-type @<scanner> @> @<type>}
332 \end{describe}
333
334 \begin{describe}{gf}{token-value @<scanner> @> @<value>}
335 \end{describe}
336
337 \begin{describe}{gf}{scanner-token @<scanner> @> @<type> @<value>}
338 \end{describe}
339
340 \begin{describe}{ty}{token-scanner-place}
341 \end{describe}
342
343 \begin{describe}{fun}
344 {token-scanner-place-p @<value> @> @<generalized-boolean>}
345 \end{describe}
346
347 \subsection{List scanners}
348
349 \begin{describe}{ty}{list-scanner}
350 \end{describe}
351
352 \begin{describe}{fun}{list-scanner-p @<value> @> @<generalized-boolean>}
353 \end{describe}
354
355 \begin{describe}{fun}{make-list-scanner @<list> @> @<list-scanner>}
356 \end{describe}
357
358 %%%--------------------------------------------------------------------------
359 \section{Parsing macros}
360
361 %%%--------------------------------------------------------------------------
362 \section{Lexical analyser}
363
364 %%%----- That's all, folks --------------------------------------------------
365
366 %%% Local variables:
367 %%% mode: LaTeX
368 %%% TeX-master: "sod.tex"
369 %%% TeX-PDF-mode: t
370 %%% End: