Fix formatting badness.
[sod] / src / lexer-impl.lisp
CommitLineData
dea4d055
MW
1;;; -*-lisp-*-
2;;;
3;;; Implementation of lexical analysis protocol.
4;;;
5;;; (c) 2009 Straylight/Edgeware
6;;;
7
8;;;----- Licensing notice ---------------------------------------------------
9;;;
10;;; This file is part of the Sensble Object Design, an object system for C.
11;;;
12;;; SOD is free software; you can redistribute it and/or modify
13;;; it under the terms of the GNU General Public License as published by
14;;; the Free Software Foundation; either version 2 of the License, or
15;;; (at your option) any later version.
16;;;
17;;; SOD is distributed in the hope that it will be useful,
18;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
19;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20;;; GNU General Public License for more details.
21;;;
22;;; You should have received a copy of the GNU General Public License
23;;; along with SOD; if not, write to the Free Software Foundation,
24;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25
26(cl:in-package #:sod)
27
28;;;--------------------------------------------------------------------------
29;;; Basic lexical analyser.
30
31(defstruct (pushed-token
32 (:constructor make-pushed-token (type value location)))
33 "A token that has been pushed back into a lexer for later processing."
34 type value location)
35
36;;; Class definition.
37
38(export 'basic-lexer)
39(defclass basic-lexer ()
40 ((stream :initarg :stream :type stream :reader lexer-stream)
41 (char :initform nil :type (or character null) :reader lexer-char)
42 (pushback-chars :initform nil :type list)
43 (token-type :initform nil :accessor token-type)
44 (token-value :initform nil :accessor token-value)
45 (location :initform nil :reader file-location)
46 (pushback-tokens :initform nil :type list))
47 (:documentation
48 "Base class for lexical analysers.
49
50 The lexer reads characters from STREAM, which, for best results, wants to
3109662a 51 be a `position-aware-input-stream'.
dea4d055
MW
52
53 The lexer provides one-character lookahead by default: the current
54 lookahead character is available to subclasses in the slot CHAR. Before
55 beginning lexical analysis, the lookahead character needs to be
3109662a 56 established with `next-char'. If one-character lookahead is insufficient,
dea4d055 57 the analyser can push back an arbitrary number of characters using
3109662a 58 `pushback-char'.
dea4d055 59
3109662a
MW
60 The `next-token' function scans and returns the next token from the
61 STREAM, and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing
62 one-token lookahead. A parser using the lexical analyser can push back
63 tokens using `pushback-tokens'.
dea4d055 64
3109662a
MW
65 For convenience, the lexer implements a `file-location' method (delegated
66 to the underlying stream)."))
dea4d055
MW
67
68;;; Reading and pushing back characters.
69
70(defmethod next-char ((lexer basic-lexer))
71 (with-slots (stream char pushback-chars) lexer
72 (setf char (if pushback-chars
73 (pop pushback-chars)
74 (read-char stream nil)))))
75
76(defmethod pushback-char ((lexer basic-lexer) new-char)
77 (with-slots (char pushback-chars) lexer
78 (push char pushback-chars)
79 (setf char new-char)))
80
81(defmethod fixup-stream* ((lexer basic-lexer) thunk)
82 (with-slots (stream char pushback-chars) lexer
83 (when pushback-chars
84 (error "Lexer has pushed-back characters."))
85 (when (slot-boundp lexer 'char)
86 (unread-char char stream))
87 (unwind-protect
88 (funcall thunk stream)
89 (setf char (read-char stream nil)))))
90
91;;; Reading and pushing back tokens.
92
93(defmethod next-token :around ((lexer basic-lexer))
94 (unless (slot-boundp lexer 'char)
95 (next-char lexer)))
96
97(defmethod next-token ((lexer basic-lexer))
98 (with-slots (pushback-tokens token-type token-value location) lexer
99 (setf (values token-type token-value)
100 (if pushback-tokens
101 (let ((pushback (pop pushback-tokens)))
102 (setf location (pushed-token-location pushback))
103 (values (pushed-token-type pushback)
104 (pushed-token-value pushback)))
105 (scan-token lexer)))))
106
107(defmethod scan-token :around ((lexer basic-lexer))
108 (with-default-error-location (lexer)
109 (call-next-method)))
110
111(defmethod pushback-token ((lexer basic-lexer) new-token-type
112 &optional new-token-value new-location)
113 (with-slots (pushback-tokens token-type token-value location) lexer
114 (push (make-pushed-token token-type token-value location)
115 pushback-tokens)
116 (when new-location (setf location new-location))
117 (setf token-type new-token-type
118 token-value new-token-value)))
119
120;;; Utilities.
121
122(defmethod skip-spaces ((lexer basic-lexer))
123 (do ((ch (lexer-char lexer) (next-char lexer)))
124 ((not (whitespace-char-p ch)) ch)))
125
126;;;--------------------------------------------------------------------------
127;;; Our main lexer.
128
129(export 'sod-lexer)
130(defclass sod-lexer (basic-lexer)
131 ()
132 (:documentation
133 "Lexical analyser for the SOD lanuage.
134
3109662a 135 See the `lexer' class for the gory details about the lexer protocol."))
dea4d055
MW
136
137(defmethod scan-token ((lexer sod-lexer))
138 (with-slots (stream char keywords location) lexer
139 (prog (ch)
140
141 consider
142
143 ;; Stash the position of this token so that we can report it later.
144 (setf ch (skip-spaces lexer)
145 location (file-location stream))
146
147 ;; Now work out what it is that we're dealing with.
148 (cond
149
150 ;; End-of-file brings its own peculiar joy.
151 ((null ch) (return (values :eof t)))
152
153 ;; Strings.
154 ((or (char= ch #\") (char= ch #\'))
155 (let* ((quote ch)
156 (string
157 (with-output-to-string (out)
158 (loop
159 (flet ((getch ()
160 (setf ch (next-char lexer))
161 (when (null ch)
162 (cerror* "Unexpected end of file in ~
163 ~:[string~;character~] constant"
164 (char= quote #\'))
165 (return))))
166 (getch)
167 (cond ((char= ch quote) (return))
168 ((char= ch #\\) (getch)))
169 (write-char ch out))))))
170 (setf ch (next-char lexer))
171 (ecase quote
172 (#\" (return (values :string string)))
173 (#\' (case (length string)
174 (0 (cerror* "Empty character constant")
175 (return (values :char #\?)))
176 (1 (return (values :char (char string 0))))
177 (t (cerror* "Multiple characters in character constant")
178 (return (values :char (char string 0)))))))))
179
180 ;; Pick out identifiers and keywords.
181 ((or (alpha-char-p ch) (char= ch #\_))
182
183 ;; Scan a sequence of alphanumerics and underscores. We could
184 ;; allow more interesting identifiers, but it would damage our C
185 ;; lexical compatibility.
186 (let ((id (with-output-to-string (out)
187 (loop
188 (write-char ch out)
189 (setf ch (next-char lexer))
190 (when (or (null ch)
191 (not (or (alphanumericp ch)
192 (char= ch #\_))))
193 (return))))))
194
195 ;; Done.
196 (return (values :id id))))
197
198 ;; Pick out numbers. Currently only integers, but we support
199 ;; multiple bases.
200 ((digit-char-p ch)
201
202 ;; Sort out the prefix. If we're looking at `0b', `0o' or `0x'
203 ;; (maybe uppercase) then we've got a funny radix to deal with.
204 ;; Otherwise, a leading zero signifies octal (daft, I know), else
205 ;; we're left with decimal.
206 (multiple-value-bind (radix skip-char)
207 (if (char/= ch #\0)
208 (values 10 nil)
209 (case (and (setf ch (next-char lexer))
210 (char-downcase ch))
211 (#\b (values 2 t))
212 (#\o (values 8 t))
213 (#\x (values 16 t))
214 (t (values 8 nil))))
215
216 ;; If we last munched an interesting letter, we need to skip over
217 ;; it. That's what the SKIP-CHAR flag is for.
218 ;;
219 ;; Danger, Will Robinson! If we're just about to eat a radix
220 ;; letter, then the next thing must be a digit. For example,
221 ;; `0xfatenning' parses as a hex number followed by an identifier
222 ;; `0xfa ttening', but `0xturning' is an octal number followed by
223 ;; an identifier `0 xturning'.
224 (when skip-char
225 (let ((peek (next-char lexer)))
226 (unless (digit-char-p peek radix)
227 (pushback-char lexer ch)
228 (return-from scan-token (values :integer 0)))
229 (setf ch peek)))
230
231 ;; Scan an integer. While there are digits, feed them into the
232 ;; accumulator.
233 (do ((accum 0 (+ (* accum radix) digit))
234 (digit (and ch (digit-char-p ch radix))
235 (and ch (digit-char-p ch radix))))
236 ((null digit) (return-from scan-token
237 (values :integer accum)))
238 (setf ch (next-char lexer)))))
239
240 ;; A slash might be the start of a comment.
241 ((char= ch #\/)
242 (setf ch (next-char lexer))
243 (case ch
244
245 ;; Comment up to the end of the line.
246 (#\/
247 (loop
248 (setf ch (next-char lexer))
249 (when (or (null ch) (char= ch #\newline))
250 (go scan))))
251
252 ;; Comment up to the next `*/'.
253 (#\*
254 (tagbody
255 top
256 (case (setf ch (next-char lexer))
257 (#\* (go star))
258 ((nil) (go done))
259 (t (go top)))
260 star
261 (case (setf ch (next-char lexer))
262 (#\* (go star))
263 (#\/ (setf ch (next-char lexer))
264 (go done))
265 ((nil) (go done))
266 (t (go top)))
267 done)
268 (go consider))
269
270 ;; False alarm. (The next character is already set up.)
271 (t
272 (return (values #\/ t)))))
273
274 ;; A dot: might be `...'. Tread carefully! We need more lookahead
275 ;; than is good for us.
276 ((char= ch #\.)
277 (setf ch (next-char lexer))
278 (cond ((eql ch #\.)
279 (setf ch (next-char lexer))
280 (cond ((eql ch #\.) (return (values :ellipsis nil)))
281 (t (pushback-char lexer #\.)
282 (return (values #\. t)))))
283 (t
284 (return (values #\. t)))))
285
286 ;; Anything else is a lone delimiter.
287 (t
288 (return (multiple-value-prog1
289 (values ch t)
290 (next-char lexer)))))
291
292 scan
293 ;; Scan a new character and try again.
294 (setf ch (next-char lexer))
295 (go consider))))
296
297;;;----- That's all, folks --------------------------------------------------