Early work-in-progress.
[sod] / lex.lisp
1 ;;; -*-lisp-*-
2 ;;;
3 ;;; Lexical analysis of a vaguely C-like language
4 ;;;
5 ;;; (c) 2009 Straylight/Edgeware
6 ;;;
7
8 ;;;----- Licensing notice ---------------------------------------------------
9 ;;;
10 ;;; This file is part of the Simple Object Definition system.
11 ;;;
12 ;;; SOD is free software; you can redistribute it and/or modify
13 ;;; it under the terms of the GNU General Public License as published by
14 ;;; the Free Software Foundation; either version 2 of the License, or
15 ;;; (at your option) any later version.
16 ;;;
17 ;;; SOD is distributed in the hope that it will be useful,
18 ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19 ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 ;;; GNU General Public License for more details.
21 ;;;
22 ;;; You should have received a copy of the GNU General Public License
23 ;;; along with SOD; if not, write to the Free Software Foundation,
24 ;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25
26 (cl:in-package #:sod)
27
28 ;;;--------------------------------------------------------------------------
29 ;;; Basic lexical analyser infrastructure.
30
31 ;; Class definition.
32
33 (defclass lexer ()
34 ((stream :initarg :stream
35 :type stream
36 :reader lexer-stream)
37 (char :initform nil
38 :type (or character null)
39 :reader lexer-char)
40 (pushback-chars :initform nil
41 :type list)
42 (token-type :initform nil
43 :accessor token-type)
44 (token-value :initform nil
45 :accessor token-value)
46 (pushback-tokens :initform nil
47 :type list))
48 (:documentation
49 "Base class for lexical analysers.
50
51 The lexer reads characters from STREAM, which, for best results, wants to
52 be a POSITION-AWARE-INPUT-STREAM.
53
54 The lexer provides one-character lookahead by default: the current
55 lookahead character is available to subclasses in the slot CHAR. Before
56 beginning lexical analysis, the lookahead character needs to be
57 established with NEXT-CHAR. If one-character lookahead is insufficient,
58 the analyser can push back an arbitrary number of characters using
59 PUSHBACK-CHAR.
60
61 The NEXT-TOKEN function scans and returns the next token from the STREAM,
62 and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing one-token
63 lookahead. A parser using the lexical analyser can push back tokens using
64 PUSHBACK-TOKENS.
65
66 For convenience, the lexer implements a FILE-LOCATION method (delegated to
67 the underlying stream)."))
68
69 ;; Lexer protocol.
70
71 (defgeneric scan-token (lexer)
72 (:documentation
73 "Internal function for scanning tokens from an input stream.
74
75 Implementing a method on this function is the main responsibility of LEXER
76 subclasses; it is called by the user-facing NEXT-TOKEN function.
77
78 The method should consume characters (using NEXT-CHAR) as necessary, and
79 return two values: a token type and token value. These will be stored in
80 the corresponding slots in the lexer object in order to provide the user
81 with one-token lookahead."))
82
83 (defgeneric next-token (lexer)
84 (:documentation
85 "Scan a token from an input stream.
86
87 This function scans a token from an input stream. Two values are
88 returned: a `token type' and a `token value'. These are opaque to the
89 LEXER base class, but the intent is that the token type be significant to
90 determining the syntax of the input, while the token value carries any
91 additional information about the token's semantic content. The token type
92 and token value are also made available for lookahead via accessors
93 TOKEN-TYPE and TOKEN-NAME on the LEXER object.
94
95 If tokens have been pushed back (see PUSHBACK-TOKEN) then they are
96 returned one by one instead of scanning the stream.")
97
98 (:method ((lexer lexer))
99 (with-slots (pushback-tokens token-type token-value) lexer
100 (setf (values token-type token-value)
101 (if pushback-tokens
102 (let ((pushback (pop pushback-tokens)))
103 (values (car pushback) (cdr pushback)))
104 (scan-token lexer))))))
105
106 (defgeneric pushback-token (lexer token-type &optional token-value)
107 (:documentation
108 "Push a token back into the lexer.
109
110 Make the given TOKEN-TYPE and TOKEN-VALUE be the current lookahead token.
111 The previous lookahead token is pushed down, and will be made available
112 agan once this new token is consumed by NEXT-TOKEN. The FILE-LOCATION is
113 not affected by pushing tokens back. The TOKEN-TYPE and TOKEN-VALUE be
114 anything at all: for instance, they need not be values which can actually
115 be returned by NEXT-TOKEN.")
116
117 (:method ((lexer lexer) new-token-type &optional new-token-value)
118 (with-slots (pushback-tokens token-type token-value) lexer
119 (push (cons token-type token-value) pushback-tokens)
120 (setf token-type new-token-type
121 token-value new-token-value))))
122
123 (defgeneric next-char (lexer)
124 (:documentation
125 "Fetch the next character from the LEXER's input stream.
126
127 Read a character from the input stream, and store it in the LEXER's CHAR
128 slot. The character stored is returned. If characters have been pushed
129 back then pushed-back characters are used instead of the input stream.
130
131 (This function is primarily intended for the use of lexer subclasses.)")
132
133 (:method ((lexer lexer))
134 (with-slots (stream char pushback-chars) lexer
135 (setf char (if pushback-chars
136 (pop pushback-chars)
137 (read-char stream nil))))))
138
139 (defgeneric pushback-char (lexer char)
140 (:documentation
141 "Push the CHAR back into the lexer.
142
143 Make CHAR be the current lookahead character (stored in the LEXER's CHAR
144 slot). The previous lookahead character is pushed down, and will be made
145 available again once this character is consumed by NEXT-CHAR.
146
147 (This function is primarily intended for the use of lexer subclasses.)")
148
149 (:method ((lexer lexer) new-char)
150 (with-slots (char pushback-chars) lexer
151 (push char pushback-chars)
152 (setf char new-char))))
153
154 (defgeneric fixup-stream* (lexer thunk)
155 (:documentation
156 "Helper function for WITH-LEXER-STREAM.
157
158 This function does the main work for WITH-LEXER-STREAM. The THUNK is
159 invoked on a single argument, the LEXER's underlying STREAM.")
160
161 (:method ((lexer lexer) thunk)
162 (with-slots (stream char pushback-chars) lexer
163 (when pushback-chars
164 (error "Lexer has pushed-back characters."))
165 (unread-char char stream)
166 (unwind-protect
167 (funcall thunk stream)
168 (setf char (read-char stream nil))))))
169
170 (defmacro with-lexer-stream ((streamvar lexer) &body body)
171 "Evaluate BODY with STREAMVAR bound to the LEXER's input stream.
172
173 The STREAM is fixed up so that the next character read (e.g., using
174 READ-CHAR) will be the lexer's current lookahead character. Once the BODY
175 completes, the next character in the stream is read and set as the
176 lookahead character. It is an error if the lexer has pushed-back
177 characters (since these can't be pushed back into the input stream
178 properly)."
179
180 `(fixup-stream* ,lexer
181 (lambda (,streamvar)
182 ,@body)))
183
184 (defmethod file-location ((lexer lexer))
185 (with-slots (stream) lexer
186 (file-location stream)))
187
188 (defgeneric skip-spaces (lexer)
189 (:documentation
190 "Skip over whitespace characters in the LEXER.")
191 (:method ((lexer lexer))
192 (do ((ch (lexer-char lexer) (next-char lexer)))
193 ((not (whitespace-char-p ch))))))
194
195 ;;;--------------------------------------------------------------------------
196 ;;; Lexer utilities.
197
198 (defun require-token
199 (lexer wanted-token-type &key (errorp t) (consumep t) default)
200 (with-slots (token-type token-value) lexer
201 (cond ((eql token-type wanted-token-type)
202 (prog1 token-value
203 (when consumep (next-token lexer))))
204 (errorp
205 (cerror* "Expected ~A but found ~A"
206 (format-token wanted-token-type)
207 (format-token token-type token-value))
208 default)
209 (t
210 default))))
211
212 ;;;--------------------------------------------------------------------------
213 ;;; Our main lexer.
214
215 (defun make-keyword-table (&rest keywords)
216 "Construct a keyword table for the lexical analyser.
217
218 The KEYWORDS arguments are individual keywords, either as strings or as
219 (WORD . VALUE) pairs. A string argument is equivalent to a pair listing
220 the string itself as WORD and the corresponding keyword symbol (forced to
221 uppercase) as the VALUE."
222
223 (let ((table (make-hash-table :test #'equal)))
224 (dolist (item keywords)
225 (multiple-value-bind (word keyword)
226 (if (consp item)
227 (values (car item) (cdr item))
228 (values item (intern (string-upcase item) :keyword)))
229 (setf (gethash word table) keyword)))
230 table))
231
232 (defparameter *sod-keywords*
233 (make-keyword-table
234
235 ;; Words with important meanings to us.
236 "class"
237 "import" "load" "lisp" "typename"
238 "source" "header"
239
240 ;; Words with a meaning to C's type system.
241 "char" "int" "float" "void"
242 "long" "short" "signed" "unsigned" "double"
243 "const" "volatile" "restrict"
244 "struct" "union" "enum"))
245
246 (defclass sod-lexer (lexer)
247 ((keywords :initarg :keywords
248 :initform *sod-keywords*
249 :type hash-table
250 :reader lexer-keywords))
251 (:documentation
252 "Lexical analyser for the SOD lanuage.
253
254 See the LEXER class for the gory details about the lexer protocol."))
255
256 (defun format-token (token-type &optional token-value)
257 (when (typep token-type 'lexer)
258 (let ((lexer token-type))
259 (setf token-type (token-type lexer)
260 token-value (token-value lexer))))
261 (etypecase token-type
262 ((eql :eof) "<end-of-file>")
263 ((eql :string) "<string-literal>")
264 ((eql :char) "<character-literal>")
265 ((eql :id) (format nil "<identifier~@[ `~A'~]>" token-value))
266 (keyword (format nil "`~(~A~)'" token-type))
267 (character (format nil "~:[<~:C>~;`~C'~]"
268 (and (graphic-char-p token-type)
269 (char/= token-type #\space))
270 token-type))))
271
272 (defmethod scan-token ((lexer sod-lexer))
273 (with-slots (stream char keywords) lexer
274 (prog ((ch char))
275
276 consider
277 (cond
278
279 ;; End-of-file brings its own peculiar joy.
280 ((null ch) (return (values :eof t)))
281
282 ;; Ignore whitespace and continue around for more.
283 ((whitespace-char-p ch) (go scan))
284
285 ;; Strings.
286 ((or (char= ch #\") (char= ch #\'))
287 (with-default-error-location (file-location lexer)
288 (let* ((quote ch)
289 (string
290 (with-output-to-string (out)
291 (loop
292 (flet ((getch ()
293 (setf ch (next-char lexer))
294 (when (null ch)
295 (cerror* floc
296 "Unexpected end of file in string/character constant")
297 (return))))
298 (getch)
299 (cond ((char= ch quote) (return))
300 ((char= ch #\\) (getch)))
301 (write-char ch out))))))
302 (setf ch (next-char lexer))
303 (ecase quote
304 (#\" (return (values :string string)))
305 (#\' (case (length string)
306 (0 (cerror* "Empty character constant")
307 (return (values :char #\?)))
308 (1 (return (values :char (char string 0))))
309 (t (cerror*
310 "Multiple characters in character constant")
311 (return (values :char (char string 0))))))))))
312
313 ;; Pick out identifiers and keywords.
314 ((or (alpha-char-p ch) (char= ch #\_))
315
316 ;; Scan a sequence of alphanumerics and underscores. We could
317 ;; allow more interesting identifiers, but it would damage our C
318 ;; lexical compatibility.
319 (let ((id (with-output-to-string (out)
320 (loop
321 (write-char ch out)
322 (setf ch (next-char lexer))
323 (when (or (null ch)
324 (not (or (alphanumericp ch)
325 (char= ch #\_))))
326 (return))))))
327
328 ;; Check to see whether we match any keywords.
329 (multiple-value-bind (keyword foundp) (gethash id keywords)
330 (return (values (if foundp keyword :id) id)))))
331
332 ;; Pick out numbers. Currently only integers, but we support
333 ;; multiple bases.
334 ((digit-char-p ch)
335
336 ;; Sort out the prefix. If we're looking at `0b', `0o' or `0x'
337 ;; (maybe uppercase) then we've got a funny radix to deal with.
338 ;; Otherwise, a leading zero signifies octal (daft, I know), else
339 ;; we're left with decimal.
340 (multiple-value-bind (radix skip-char)
341 (if (char/= ch #\0)
342 (values 10 nil)
343 (case (and (setf ch (next-char lexer))
344 (char-downcase ch))
345 (#\b (values 2 t))
346 (#\o (values 8 t))
347 (#\x (values 16 t))
348 (t (values 8 nil))))
349
350 ;; If we last munched an interesting letter, we need to skip over
351 ;; it. That's what the SKIP-CHAR flag is for.
352 (when skip-char
353 (setf ch (next-char lexer)))
354
355 ;; Scan an integer. While there are digits, feed them into the
356 ;; accumulator.
357 (do ((accum 0 (+ (* accum radix) digit))
358 (digit (and ch (digit-char-p ch radix))
359 (and ch (digit-char-p ch radix))))
360 ((null digit) (return-from scan-token
361 (values :integer accum)))
362 (setf ch (next-char lexer)))))
363
364 ;; A slash might be the start of a comment.
365 ((char= ch #\/)
366 (setf ch (next-char lexer))
367 (case ch
368
369 ;; Comment up to the end of the line.
370 (#\/
371 (loop
372 (setf ch (next-char lexer))
373 (when (or (null ch) (char= ch #\newline))
374 (go scan))))
375
376 ;; Comment up to the next `*/'.
377 (#\*
378 (tagbody
379 top
380 (case (setf ch (next-char lexer))
381 (#\* (go star))
382 ((nil) (go done))
383 (t (go top)))
384 star
385 (case (setf ch (next-char lexer))
386 (#\* (go star))
387 (#\/ (setf ch (next-char lexer))
388 (go done))
389 ((nil) (go done))
390 (t (go top)))
391 done)
392 (go consider))
393
394 ;; False alarm. (The next character is already set up.)
395 (t
396 (return (values #\/ t)))))
397
398 ;; A dot: might be `...'. Tread carefully! We need more lookahead
399 ;; than is good for us.
400 ((char= ch #\.)
401 (setf ch (next-char lexer))
402 (cond ((eql ch #\.)
403 (setf ch (next-char lexer))
404 (cond ((eql ch #\.) (return (values :ellpisis nil)))
405 (t (pushback-char lexer #\.)
406 (return (values #\. t)))))
407 (t
408 (return (values #\. t)))))
409
410 ;; Anything else is a lone delimiter.
411 (t
412 (return (multiple-value-prog1
413 (values ch t)
414 (next-char lexer)))))
415
416 scan
417 ;; Scan a new character and try again.
418 (setf ch (next-char lexer))
419 (go consider))))
420
421 ;;;--------------------------------------------------------------------------
422 ;;; C fragments.
423
424 (defclass c-fragment ()
425 ((location :initarg :location
426 :type file-location
427 :accessor c-fragment-location)
428 (text :initarg :text
429 :type string
430 :accessor c-fragment-text))
431 (:documentation
432 "Represents a fragment of C code to be written to an output file.
433
434 A C fragment is aware of its original location, and will bear proper #line
435 markers when written out."))
436
437 (defgeneric write-fragment (fragment stream)
438 (:documentation
439 "Writes a fragment to the output stream, marking its source properly.")
440
441 (:method ((fragment c-fragment) stream)
442 (with-slots (location text) fragment
443 (format stream "~&#line ~D ~S~%~A~&"
444 (file-location-line location)
445 (namestring (file-location-pathname location))
446 text)
447 (format stream "#line ~D ~S~%"
448 (1+ (position-aware-stream-line stream))
449 (namestring (stream-pathname stream))))))
450
451 (defun scan-c-fragment (lexer end-chars)
452 "Snarfs a sequence of C tokens with balanced brackets.
453
454 Reads and consumes characters from the LEXER's stream, and returns them as
455 a string. The string will contain whole C tokens, up as far as an
456 occurrence of one of the END-CHARS (a list) which (a) is not within a
457 string or character literal or comment, and (b) appears at the outer level
458 of nesting of brackets (whether round, curly or square -- again counting
459 only brackets which aren't themselves within string/character literals or
460 comments. The final END-CHAR is not consumed.
461
462 An error is signalled if either the stream ends before an occurrence of
463 one of the END-CHARS, or if mismatching brackets are encountered. No
464 other attempt is made to ensure that the characters read are in fact a
465 valid C fragment.
466
467 Both original /*...*/ and new //... comments are recognized. Trigraphs
468 and digraphs are currently not recognized."
469
470 (let ((output (make-string-output-stream))
471 (ch (lexer-char lexer))
472 (start-floc (file-location lexer))
473 (delim nil)
474 (stack nil))
475
476 ;; Main loop. At the top of this loop, we've already read a
477 ;; character into CH. This is usually read at the end of processing
478 ;; the individual character, though sometimes (following `/', for
479 ;; example) it's read speculatively because we need one-character
480 ;; lookahead.
481 (block loop
482 (labels ((getch ()
483 "Read the next character into CH; complain if we hit EOF."
484 (unless (setf ch (next-char lexer))
485 (cerror*-with-location start-floc
486 "Unexpected end-of-file in C fragment")
487 (return-from loop))
488 ch)
489 (putch ()
490 "Write the character to the output buffer."
491 (write-char ch output))
492 (push-delim (d)
493 "Push a closing delimiter onto the stack."
494 (push delim stack)
495 (setf delim d)
496 (getch)))
497
498 ;; Hack: if the first character is a newline, discard it. Otherwise
499 ;; (a) the output fragment will look funny, and (b) the location
500 ;; information will be wrong.
501 (when (eql ch #\newline)
502 (getch))
503
504 ;; And fetch characters.
505 (loop
506
507 ;; Here we're outside any string or character literal, though we
508 ;; may be nested within brackets. So, if there's no delimiter, and
509 ;; we've found the end character, we're done.
510 (when (and (null delim) (member ch end-chars))
511 (return))
512
513 ;; Otherwise take a copy of the character, and work out what to do
514 ;; next.
515 (putch)
516 (case ch
517
518 ;; Starting a literal. Continue until we find a matching
519 ;; character not preceded by a `\'.
520 ((#\" #\')
521 (let ((quote ch))
522 (loop
523 (getch)
524 (putch)
525 (when (eql ch quote)
526 (return))
527 (when (eql ch #\\)
528 (getch)
529 (putch)))
530 (getch)))
531
532 ;; Various kinds of opening bracket. Stash the current
533 ;; delimiter, and note that we're looking for a new one.
534 (#\( (push-delim #\)))
535 (#\[ (push-delim #\]))
536 (#\{ (push-delim #\}))
537
538 ;; Various kinds of closing bracket. If it matches the current
539 ;; delimeter then unstack the next one along. Otherwise
540 ;; something's gone wrong: C syntax doesn't allow unmatched
541 ;; brackets.
542 ((#\) #\] #\})
543 (if (eql ch delim)
544 (setf delim (pop stack))
545 (cerror* "Unmatched `~C'." ch))
546 (getch))
547
548 ;; A slash. Maybe a comment next. But maybe not...
549 (#\/
550
551 ;; Examine the next character to find out how to proceed.
552 (getch)
553 (case ch
554
555 ;; A second slash -- eat until the end of the line.
556 (#\/
557 (putch)
558 (loop
559 (getch)
560 (putch)
561 (when (eql ch #\newline)
562 (return)))
563 (getch))
564
565 ;; A star -- eat until we find a star-slash. Since the star
566 ;; might be preceded by another star, we use a little state
567 ;; machine.
568 (#\*
569 (putch)
570 (tagbody
571
572 main
573 ;; Main state. If we read a star, switch to star state;
574 ;; otherwise eat the character and try again.
575 (getch)
576 (putch)
577 (case ch
578 (#\* (go star))
579 (t (go main)))
580
581 star
582 ;; Star state. If we read a slash, we're done; if we read
583 ;; another star, stay in star state; otherwise go back to
584 ;; main.
585 (getch)
586 (putch)
587 (case ch
588 (#\* (go star))
589 (#\/ (go done))
590 (t (go main)))
591
592 done
593 (getch)))))
594
595 ;; Something else. Eat it and continue.
596 (t (getch)))))
597
598 ;; Return the fragment we've collected.
599 (make-instance 'c-fragment
600 :location floc
601 :text (get-output-stream-string output)))))
602
603 (defun c-fragment-reader (stream char arg)
604 "Reader for C-fragment syntax #{ ... stuff ... }."
605 (declare (ignore char arg))
606 (let ((lexer (make-instance 'sod-lexer
607 :stream stream)))
608 (next-char lexer)
609 (scan-c-fragment lexer '(#\}))))
610
611 ;;;--------------------------------------------------------------------------
612 ;;; Testing cruft.
613
614 #+test
615 (with-input-from-string (in "
616 { foo } 'x' /?/***/!
617 123 0432 0b010123 0xc0ffee __burp_32 class
618 ...
619
620 class integer : integral_domain {
621 something here;
622 }
623
624 ")
625 (let* ((stream (make-instance 'position-aware-input-stream
626 :stream in
627 :file #p"magic"))
628 (lexer (make-instance 'sod-lexer
629 :stream stream
630 :keywords *sod-keywords*))
631 (list nil))
632 (next-char lexer)
633 (loop
634 (multiple-value-bind (tokty tokval) (next-token lexer)
635 (push (list tokty tokval) list)
636 (when (eql tokty :eof)
637 (return))))
638 (nreverse list)))
639
640 ;;;----- That's all, folks --------------------------------------------------