lib/sod.[ch]: The runtime library is LGPL.
[sod] / pre-reorg / lex.lisp
CommitLineData
abdf50aa
MW
1;;; -*-lisp-*-
2;;;
3;;; Lexical analysis of a vaguely C-like language
4;;;
5;;; (c) 2009 Straylight/Edgeware
6;;;
7
8;;;----- Licensing notice ---------------------------------------------------
9;;;
10;;; This file is part of the Simple Object Definition system.
11;;;
12;;; SOD is free software; you can redistribute it and/or modify
13;;; it under the terms of the GNU General Public License as published by
14;;; the Free Software Foundation; either version 2 of the License, or
15;;; (at your option) any later version.
16;;;
17;;; SOD is distributed in the hope that it will be useful,
18;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;;; GNU General Public License for more details.
21;;;
22;;; You should have received a copy of the GNU General Public License
23;;; along with SOD; if not, write to the Free Software Foundation,
24;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25
26(cl:in-package #:sod)
27
28;;;--------------------------------------------------------------------------
29;;; Basic lexical analyser infrastructure.
30
31;; Class definition.
32
33(defclass lexer ()
77027cca
MW
34 ((stream :initarg :stream :type stream :reader lexer-stream)
35 (char :initform nil :type (or character null) :reader lexer-char)
36 (pushback-chars :initform nil :type list)
37 (token-type :initform nil :accessor token-type)
38 (token-value :initform nil :accessor token-value)
39 (pushback-tokens :initform nil :type list))
abdf50aa
MW
40 (:documentation
41 "Base class for lexical analysers.
42
43 The lexer reads characters from STREAM, which, for best results, wants to
44 be a POSITION-AWARE-INPUT-STREAM.
45
46 The lexer provides one-character lookahead by default: the current
47 lookahead character is available to subclasses in the slot CHAR. Before
48 beginning lexical analysis, the lookahead character needs to be
49 established with NEXT-CHAR. If one-character lookahead is insufficient,
50 the analyser can push back an arbitrary number of characters using
51 PUSHBACK-CHAR.
52
53 The NEXT-TOKEN function scans and returns the next token from the STREAM,
54 and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing one-token
55 lookahead. A parser using the lexical analyser can push back tokens using
56 PUSHBACK-TOKENS.
57
58 For convenience, the lexer implements a FILE-LOCATION method (delegated to
59 the underlying stream)."))
60
61;; Lexer protocol.
62
63(defgeneric scan-token (lexer)
64 (:documentation
65 "Internal function for scanning tokens from an input stream.
66
67 Implementing a method on this function is the main responsibility of LEXER
68 subclasses; it is called by the user-facing NEXT-TOKEN function.
69
70 The method should consume characters (using NEXT-CHAR) as necessary, and
71 return two values: a token type and token value. These will be stored in
72 the corresponding slots in the lexer object in order to provide the user
73 with one-token lookahead."))
74
75(defgeneric next-token (lexer)
76 (:documentation
77 "Scan a token from an input stream.
78
79 This function scans a token from an input stream. Two values are
80 returned: a `token type' and a `token value'. These are opaque to the
81 LEXER base class, but the intent is that the token type be significant to
82 determining the syntax of the input, while the token value carries any
83 additional information about the token's semantic content. The token type
84 and token value are also made available for lookahead via accessors
85 TOKEN-TYPE and TOKEN-NAME on the LEXER object.
86
87 If tokens have been pushed back (see PUSHBACK-TOKEN) then they are
88 returned one by one instead of scanning the stream.")
89
90 (:method ((lexer lexer))
91 (with-slots (pushback-tokens token-type token-value) lexer
92 (setf (values token-type token-value)
93 (if pushback-tokens
94 (let ((pushback (pop pushback-tokens)))
95 (values (car pushback) (cdr pushback)))
96 (scan-token lexer))))))
97
98(defgeneric pushback-token (lexer token-type &optional token-value)
99 (:documentation
100 "Push a token back into the lexer.
101
102 Make the given TOKEN-TYPE and TOKEN-VALUE be the current lookahead token.
103 The previous lookahead token is pushed down, and will be made available
104 agan once this new token is consumed by NEXT-TOKEN. The FILE-LOCATION is
105 not affected by pushing tokens back. The TOKEN-TYPE and TOKEN-VALUE be
106 anything at all: for instance, they need not be values which can actually
107 be returned by NEXT-TOKEN.")
108
109 (:method ((lexer lexer) new-token-type &optional new-token-value)
110 (with-slots (pushback-tokens token-type token-value) lexer
111 (push (cons token-type token-value) pushback-tokens)
112 (setf token-type new-token-type
113 token-value new-token-value))))
114
115(defgeneric next-char (lexer)
116 (:documentation
117 "Fetch the next character from the LEXER's input stream.
118
119 Read a character from the input stream, and store it in the LEXER's CHAR
120 slot. The character stored is returned. If characters have been pushed
121 back then pushed-back characters are used instead of the input stream.
122
123 (This function is primarily intended for the use of lexer subclasses.)")
124
125 (:method ((lexer lexer))
126 (with-slots (stream char pushback-chars) lexer
127 (setf char (if pushback-chars
128 (pop pushback-chars)
129 (read-char stream nil))))))
130
131(defgeneric pushback-char (lexer char)
132 (:documentation
133 "Push the CHAR back into the lexer.
134
135 Make CHAR be the current lookahead character (stored in the LEXER's CHAR
136 slot). The previous lookahead character is pushed down, and will be made
137 available again once this character is consumed by NEXT-CHAR.
138
139 (This function is primarily intended for the use of lexer subclasses.)")
140
141 (:method ((lexer lexer) new-char)
142 (with-slots (char pushback-chars) lexer
143 (push char pushback-chars)
144 (setf char new-char))))
145
146(defgeneric fixup-stream* (lexer thunk)
147 (:documentation
148 "Helper function for WITH-LEXER-STREAM.
149
150 This function does the main work for WITH-LEXER-STREAM. The THUNK is
151 invoked on a single argument, the LEXER's underlying STREAM.")
152
153 (:method ((lexer lexer) thunk)
154 (with-slots (stream char pushback-chars) lexer
155 (when pushback-chars
156 (error "Lexer has pushed-back characters."))
157 (unread-char char stream)
158 (unwind-protect
159 (funcall thunk stream)
160 (setf char (read-char stream nil))))))
161
162(defmacro with-lexer-stream ((streamvar lexer) &body body)
163 "Evaluate BODY with STREAMVAR bound to the LEXER's input stream.
164
165 The STREAM is fixed up so that the next character read (e.g., using
166 READ-CHAR) will be the lexer's current lookahead character. Once the BODY
167 completes, the next character in the stream is read and set as the
168 lookahead character. It is an error if the lexer has pushed-back
169 characters (since these can't be pushed back into the input stream
170 properly)."
171
172 `(fixup-stream* ,lexer
173 (lambda (,streamvar)
174 ,@body)))
175
176(defmethod file-location ((lexer lexer))
177 (with-slots (stream) lexer
178 (file-location stream)))
179
180(defgeneric skip-spaces (lexer)
181 (:documentation
dea4d055 182 "Skip over whitespace characters in the LEXER."))
abdf50aa
MW
183
184;;;--------------------------------------------------------------------------
185;;; Lexer utilities.
186
abdf50aa
MW
187;;;--------------------------------------------------------------------------
188;;; Our main lexer.
189
190(defun make-keyword-table (&rest keywords)
191 "Construct a keyword table for the lexical analyser.
192
193 The KEYWORDS arguments are individual keywords, either as strings or as
194 (WORD . VALUE) pairs. A string argument is equivalent to a pair listing
195 the string itself as WORD and the corresponding keyword symbol (forced to
196 uppercase) as the VALUE."
197
198 (let ((table (make-hash-table :test #'equal)))
199 (dolist (item keywords)
200 (multiple-value-bind (word keyword)
201 (if (consp item)
202 (values (car item) (cdr item))
203 (values item (intern (string-upcase item) :keyword)))
204 (setf (gethash word table) keyword)))
205 table))
206
207(defparameter *sod-keywords*
208 (make-keyword-table
209
abdf50aa
MW
210 ;; Words with a meaning to C's type system.
211 "char" "int" "float" "void"
212 "long" "short" "signed" "unsigned" "double"
213 "const" "volatile" "restrict"
214 "struct" "union" "enum"))
215
216(defclass sod-lexer (lexer)
ddee4bb1 217 ()
abdf50aa
MW
218 (:documentation
219 "Lexical analyser for the SOD lanuage.
220
221 See the LEXER class for the gory details about the lexer protocol."))
222
223(defun format-token (token-type &optional token-value)
224 (when (typep token-type 'lexer)
225 (let ((lexer token-type))
226 (setf token-type (token-type lexer)
227 token-value (token-value lexer))))
228 (etypecase token-type
229 ((eql :eof) "<end-of-file>")
230 ((eql :string) "<string-literal>")
231 ((eql :char) "<character-literal>")
232 ((eql :id) (format nil "<identifier~@[ `~A'~]>" token-value))
233 (keyword (format nil "`~(~A~)'" token-type))
234 (character (format nil "~:[<~:C>~;`~C'~]"
235 (and (graphic-char-p token-type)
236 (char/= token-type #\space))
237 token-type))))
238
239(defmethod scan-token ((lexer sod-lexer))
240 (with-slots (stream char keywords) lexer
241 (prog ((ch char))
242
243 consider
244 (cond
245
246 ;; End-of-file brings its own peculiar joy.
247 ((null ch) (return (values :eof t)))
248
249 ;; Ignore whitespace and continue around for more.
250 ((whitespace-char-p ch) (go scan))
251
252 ;; Strings.
253 ((or (char= ch #\") (char= ch #\'))
1f1d88f5 254 (with-default-error-location ((file-location lexer))
abdf50aa
MW
255 (let* ((quote ch)
256 (string
257 (with-output-to-string (out)
258 (loop
259 (flet ((getch ()
260 (setf ch (next-char lexer))
261 (when (null ch)
1f1d88f5 262 (cerror*
abdf50aa
MW
263 "Unexpected end of file in string/character constant")
264 (return))))
265 (getch)
266 (cond ((char= ch quote) (return))
267 ((char= ch #\\) (getch)))
268 (write-char ch out))))))
269 (setf ch (next-char lexer))
270 (ecase quote
271 (#\" (return (values :string string)))
272 (#\' (case (length string)
273 (0 (cerror* "Empty character constant")
274 (return (values :char #\?)))
275 (1 (return (values :char (char string 0))))
276 (t (cerror*
277 "Multiple characters in character constant")
278 (return (values :char (char string 0))))))))))
279
280 ;; Pick out identifiers and keywords.
281 ((or (alpha-char-p ch) (char= ch #\_))
282
283 ;; Scan a sequence of alphanumerics and underscores. We could
284 ;; allow more interesting identifiers, but it would damage our C
285 ;; lexical compatibility.
286 (let ((id (with-output-to-string (out)
287 (loop
288 (write-char ch out)
289 (setf ch (next-char lexer))
290 (when (or (null ch)
291 (not (or (alphanumericp ch)
292 (char= ch #\_))))
293 (return))))))
294
ddee4bb1
MW
295 ;; Done.
296 (return (values :id id))))
abdf50aa
MW
297
298 ;; Pick out numbers. Currently only integers, but we support
299 ;; multiple bases.
300 ((digit-char-p ch)
301
302 ;; Sort out the prefix. If we're looking at `0b', `0o' or `0x'
303 ;; (maybe uppercase) then we've got a funny radix to deal with.
304 ;; Otherwise, a leading zero signifies octal (daft, I know), else
305 ;; we're left with decimal.
306 (multiple-value-bind (radix skip-char)
307 (if (char/= ch #\0)
308 (values 10 nil)
309 (case (and (setf ch (next-char lexer))
310 (char-downcase ch))
311 (#\b (values 2 t))
312 (#\o (values 8 t))
313 (#\x (values 16 t))
314 (t (values 8 nil))))
315
316 ;; If we last munched an interesting letter, we need to skip over
317 ;; it. That's what the SKIP-CHAR flag is for.
1f1d88f5
MW
318 ;;
319 ;; Danger, Will Robinson! If we're' just about to eat a radix
320 ;; letter, then the next thing must be a digit. For example,
321 ;; `0xfatenning' parses as a hex number followed by an identifier
322 ;; `0xfa ttening', but `0xturning' is an octal number followed
323 ;; by an identifier `0 xturning'.
abdf50aa 324 (when skip-char
1f1d88f5
MW
325 (let ((peek (next-char lexer)))
326 (unless (digit-char-p peek radix)
327 (pushback-char lexer ch)
328 (return-from scan-token (values :integer 0)))
329 (setf ch peek)))
abdf50aa
MW
330
331 ;; Scan an integer. While there are digits, feed them into the
332 ;; accumulator.
333 (do ((accum 0 (+ (* accum radix) digit))
334 (digit (and ch (digit-char-p ch radix))
335 (and ch (digit-char-p ch radix))))
336 ((null digit) (return-from scan-token
337 (values :integer accum)))
338 (setf ch (next-char lexer)))))
339
340 ;; A slash might be the start of a comment.
341 ((char= ch #\/)
342 (setf ch (next-char lexer))
343 (case ch
344
345 ;; Comment up to the end of the line.
346 (#\/
347 (loop
348 (setf ch (next-char lexer))
349 (when (or (null ch) (char= ch #\newline))
350 (go scan))))
351
352 ;; Comment up to the next `*/'.
353 (#\*
354 (tagbody
355 top
356 (case (setf ch (next-char lexer))
357 (#\* (go star))
358 ((nil) (go done))
359 (t (go top)))
360 star
361 (case (setf ch (next-char lexer))
362 (#\* (go star))
363 (#\/ (setf ch (next-char lexer))
364 (go done))
365 ((nil) (go done))
366 (t (go top)))
367 done)
368 (go consider))
369
370 ;; False alarm. (The next character is already set up.)
371 (t
372 (return (values #\/ t)))))
373
374 ;; A dot: might be `...'. Tread carefully! We need more lookahead
375 ;; than is good for us.
376 ((char= ch #\.)
377 (setf ch (next-char lexer))
378 (cond ((eql ch #\.)
379 (setf ch (next-char lexer))
380 (cond ((eql ch #\.) (return (values :ellpisis nil)))
381 (t (pushback-char lexer #\.)
382 (return (values #\. t)))))
383 (t
384 (return (values #\. t)))))
385
386 ;; Anything else is a lone delimiter.
387 (t
388 (return (multiple-value-prog1
389 (values ch t)
390 (next-char lexer)))))
391
392 scan
393 ;; Scan a new character and try again.
394 (setf ch (next-char lexer))
395 (go consider))))
396
397;;;--------------------------------------------------------------------------
398;;; C fragments.
399
abdf50aa
MW
400(defun scan-c-fragment (lexer end-chars)
401 "Snarfs a sequence of C tokens with balanced brackets.
402
403 Reads and consumes characters from the LEXER's stream, and returns them as
404 a string. The string will contain whole C tokens, up as far as an
405 occurrence of one of the END-CHARS (a list) which (a) is not within a
406 string or character literal or comment, and (b) appears at the outer level
407 of nesting of brackets (whether round, curly or square -- again counting
408 only brackets which aren't themselves within string/character literals or
409 comments. The final END-CHAR is not consumed.
410
411 An error is signalled if either the stream ends before an occurrence of
412 one of the END-CHARS, or if mismatching brackets are encountered. No
413 other attempt is made to ensure that the characters read are in fact a
414 valid C fragment.
415
416 Both original /*...*/ and new //... comments are recognized. Trigraphs
417 and digraphs are currently not recognized."
418
419 (let ((output (make-string-output-stream))
420 (ch (lexer-char lexer))
421 (start-floc (file-location lexer))
422 (delim nil)
423 (stack nil))
424
425 ;; Main loop. At the top of this loop, we've already read a
426 ;; character into CH. This is usually read at the end of processing
427 ;; the individual character, though sometimes (following `/', for
428 ;; example) it's read speculatively because we need one-character
429 ;; lookahead.
430 (block loop
431 (labels ((getch ()
432 "Read the next character into CH; complain if we hit EOF."
433 (unless (setf ch (next-char lexer))
434 (cerror*-with-location start-floc
435 "Unexpected end-of-file in C fragment")
436 (return-from loop))
437 ch)
438 (putch ()
439 "Write the character to the output buffer."
440 (write-char ch output))
441 (push-delim (d)
442 "Push a closing delimiter onto the stack."
443 (push delim stack)
444 (setf delim d)
445 (getch)))
446
447 ;; Hack: if the first character is a newline, discard it. Otherwise
448 ;; (a) the output fragment will look funny, and (b) the location
449 ;; information will be wrong.
450 (when (eql ch #\newline)
451 (getch))
452
453 ;; And fetch characters.
454 (loop
455
456 ;; Here we're outside any string or character literal, though we
457 ;; may be nested within brackets. So, if there's no delimiter, and
458 ;; we've found the end character, we're done.
459 (when (and (null delim) (member ch end-chars))
460 (return))
461
462 ;; Otherwise take a copy of the character, and work out what to do
463 ;; next.
464 (putch)
465 (case ch
466
467 ;; Starting a literal. Continue until we find a matching
468 ;; character not preceded by a `\'.
469 ((#\" #\')
470 (let ((quote ch))
471 (loop
472 (getch)
473 (putch)
474 (when (eql ch quote)
475 (return))
476 (when (eql ch #\\)
477 (getch)
478 (putch)))
479 (getch)))
480
481 ;; Various kinds of opening bracket. Stash the current
482 ;; delimiter, and note that we're looking for a new one.
483 (#\( (push-delim #\)))
484 (#\[ (push-delim #\]))
485 (#\{ (push-delim #\}))
486
487 ;; Various kinds of closing bracket. If it matches the current
488 ;; delimeter then unstack the next one along. Otherwise
489 ;; something's gone wrong: C syntax doesn't allow unmatched
490 ;; brackets.
491 ((#\) #\] #\})
492 (if (eql ch delim)
493 (setf delim (pop stack))
494 (cerror* "Unmatched `~C'." ch))
495 (getch))
496
497 ;; A slash. Maybe a comment next. But maybe not...
498 (#\/
499
500 ;; Examine the next character to find out how to proceed.
501 (getch)
502 (case ch
503
504 ;; A second slash -- eat until the end of the line.
505 (#\/
506 (putch)
507 (loop
508 (getch)
509 (putch)
510 (when (eql ch #\newline)
511 (return)))
512 (getch))
513
514 ;; A star -- eat until we find a star-slash. Since the star
515 ;; might be preceded by another star, we use a little state
516 ;; machine.
517 (#\*
518 (putch)
519 (tagbody
520
521 main
522 ;; Main state. If we read a star, switch to star state;
523 ;; otherwise eat the character and try again.
524 (getch)
525 (putch)
526 (case ch
527 (#\* (go star))
528 (t (go main)))
529
530 star
531 ;; Star state. If we read a slash, we're done; if we read
532 ;; another star, stay in star state; otherwise go back to
533 ;; main.
534 (getch)
535 (putch)
536 (case ch
537 (#\* (go star))
538 (#\/ (go done))
539 (t (go main)))
540
541 done
542 (getch)))))
543
544 ;; Something else. Eat it and continue.
545 (t (getch)))))
546
3be8c2bf
MW
547 (let* ((string (get-output-stream-string output))
548 (end (position-if (lambda (char)
549 (or (char= char #\newline)
550 (not (whitespace-char-p char))))
551 string
552 :from-end t))
553 (trimmed (if end
554 (subseq string 0 (1+ end))
555 "")))
556
557 ;; Return the fragment we've collected.
558 (make-instance 'c-fragment
559 :location start-floc
560 :text trimmed)))))
abdf50aa
MW
561
562(defun c-fragment-reader (stream char arg)
563 "Reader for C-fragment syntax #{ ... stuff ... }."
564 (declare (ignore char arg))
565 (let ((lexer (make-instance 'sod-lexer
566 :stream stream)))
567 (next-char lexer)
568 (scan-c-fragment lexer '(#\}))))
569
1f1d88f5
MW
570#+interactive
571(set-dispatch-macro-character #\# #\{ 'c-fragment-reader)
572
abdf50aa
MW
573;;;--------------------------------------------------------------------------
574;;; Testing cruft.
575
576#+test
577(with-input-from-string (in "
578{ foo } 'x' /?/***/!
579123 0432 0b010123 0xc0ffee __burp_32 class
1f1d88f5
MW
580
5810xturning 0xfattening
abdf50aa
MW
582...
583
584class integer : integral_domain {
585 something here;
586}
587
588")
589 (let* ((stream (make-instance 'position-aware-input-stream
590 :stream in
591 :file #p"magic"))
592 (lexer (make-instance 'sod-lexer
593 :stream stream
594 :keywords *sod-keywords*))
595 (list nil))
596 (next-char lexer)
597 (loop
598 (multiple-value-bind (tokty tokval) (next-token lexer)
599 (push (list tokty tokval) list)
600 (when (eql tokty :eof)
601 (return))))
602 (nreverse list)))
603
604;;;----- That's all, folks --------------------------------------------------