| 1 | ;;; -*-lisp-*- |
| 2 | ;;; |
| 3 | ;;; Implementation of lexical analysis protocol. |
| 4 | ;;; |
| 5 | ;;; (c) 2009 Straylight/Edgeware |
| 6 | ;;; |
| 7 | |
| 8 | ;;;----- Licensing notice --------------------------------------------------- |
| 9 | ;;; |
| 10 | ;;; This file is part of the Sensble Object Design, an object system for C. |
| 11 | ;;; |
| 12 | ;;; SOD is free software; you can redistribute it and/or modify |
| 13 | ;;; it under the terms of the GNU General Public License as published by |
| 14 | ;;; the Free Software Foundation; either version 2 of the License, or |
| 15 | ;;; (at your option) any later version. |
| 16 | ;;; |
| 17 | ;;; SOD is distributed in the hope that it will be useful, |
| 18 | ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 19 | ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 20 | ;;; GNU General Public License for more details. |
| 21 | ;;; |
| 22 | ;;; You should have received a copy of the GNU General Public License |
| 23 | ;;; along with SOD; if not, write to the Free Software Foundation, |
| 24 | ;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
| 25 | |
| 26 | (cl:in-package #:sod) |
| 27 | |
| 28 | ;;;-------------------------------------------------------------------------- |
| 29 | ;;; Basic lexical analyser. |
| 30 | |
| 31 | (defstruct (pushed-token |
| 32 | (:constructor make-pushed-token (type value location))) |
| 33 | "A token that has been pushed back into a lexer for later processing." |
| 34 | type value location) |
| 35 | |
| 36 | ;;; Class definition. |
| 37 | |
| 38 | (export 'basic-lexer) |
| 39 | (defclass basic-lexer () |
| 40 | ((stream :initarg :stream :type stream :reader lexer-stream) |
| 41 | (char :initform nil :type (or character null) :reader lexer-char) |
| 42 | (pushback-chars :initform nil :type list) |
| 43 | (token-type :initform nil :accessor token-type) |
| 44 | (token-value :initform nil :accessor token-value) |
| 45 | (location :initform nil :reader file-location) |
| 46 | (pushback-tokens :initform nil :type list)) |
| 47 | (:documentation |
| 48 | "Base class for lexical analysers. |
| 49 | |
| 50 | The lexer reads characters from STREAM, which, for best results, wants to |
| 51 | be a POSITION-AWARE-INPUT-STREAM. |
| 52 | |
| 53 | The lexer provides one-character lookahead by default: the current |
| 54 | lookahead character is available to subclasses in the slot CHAR. Before |
| 55 | beginning lexical analysis, the lookahead character needs to be |
| 56 | established with NEXT-CHAR. If one-character lookahead is insufficient, |
| 57 | the analyser can push back an arbitrary number of characters using |
| 58 | PUSHBACK-CHAR. |
| 59 | |
| 60 | The NEXT-TOKEN function scans and returns the next token from the STREAM, |
| 61 | and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing one-token |
| 62 | lookahead. A parser using the lexical analyser can push back tokens using |
| 63 | PUSHBACK-TOKENS. |
| 64 | |
| 65 | For convenience, the lexer implements a FILE-LOCATION method (delegated to |
| 66 | the underlying stream).")) |
| 67 | |
| 68 | ;;; Reading and pushing back characters. |
| 69 | |
| 70 | (defmethod next-char ((lexer basic-lexer)) |
| 71 | (with-slots (stream char pushback-chars) lexer |
| 72 | (setf char (if pushback-chars |
| 73 | (pop pushback-chars) |
| 74 | (read-char stream nil))))) |
| 75 | |
| 76 | (defmethod pushback-char ((lexer basic-lexer) new-char) |
| 77 | (with-slots (char pushback-chars) lexer |
| 78 | (push char pushback-chars) |
| 79 | (setf char new-char))) |
| 80 | |
| 81 | (defmethod fixup-stream* ((lexer basic-lexer) thunk) |
| 82 | (with-slots (stream char pushback-chars) lexer |
| 83 | (when pushback-chars |
| 84 | (error "Lexer has pushed-back characters.")) |
| 85 | (when (slot-boundp lexer 'char) |
| 86 | (unread-char char stream)) |
| 87 | (unwind-protect |
| 88 | (funcall thunk stream) |
| 89 | (setf char (read-char stream nil))))) |
| 90 | |
| 91 | ;;; Reading and pushing back tokens. |
| 92 | |
| 93 | (defmethod next-token :around ((lexer basic-lexer)) |
| 94 | (unless (slot-boundp lexer 'char) |
| 95 | (next-char lexer))) |
| 96 | |
| 97 | (defmethod next-token ((lexer basic-lexer)) |
| 98 | (with-slots (pushback-tokens token-type token-value location) lexer |
| 99 | (setf (values token-type token-value) |
| 100 | (if pushback-tokens |
| 101 | (let ((pushback (pop pushback-tokens))) |
| 102 | (setf location (pushed-token-location pushback)) |
| 103 | (values (pushed-token-type pushback) |
| 104 | (pushed-token-value pushback))) |
| 105 | (scan-token lexer))))) |
| 106 | |
| 107 | (defmethod scan-token :around ((lexer basic-lexer)) |
| 108 | (with-default-error-location (lexer) |
| 109 | (call-next-method))) |
| 110 | |
| 111 | (defmethod pushback-token ((lexer basic-lexer) new-token-type |
| 112 | &optional new-token-value new-location) |
| 113 | (with-slots (pushback-tokens token-type token-value location) lexer |
| 114 | (push (make-pushed-token token-type token-value location) |
| 115 | pushback-tokens) |
| 116 | (when new-location (setf location new-location)) |
| 117 | (setf token-type new-token-type |
| 118 | token-value new-token-value))) |
| 119 | |
| 120 | ;;; Utilities. |
| 121 | |
| 122 | (defmethod skip-spaces ((lexer basic-lexer)) |
| 123 | (do ((ch (lexer-char lexer) (next-char lexer))) |
| 124 | ((not (whitespace-char-p ch)) ch))) |
| 125 | |
| 126 | ;;;-------------------------------------------------------------------------- |
| 127 | ;;; Our main lexer. |
| 128 | |
| 129 | (export 'sod-lexer) |
| 130 | (defclass sod-lexer (basic-lexer) |
| 131 | () |
| 132 | (:documentation |
| 133 | "Lexical analyser for the SOD lanuage. |
| 134 | |
| 135 | See the LEXER class for the gory details about the lexer protocol.")) |
| 136 | |
| 137 | (defmethod scan-token ((lexer sod-lexer)) |
| 138 | (with-slots (stream char keywords location) lexer |
| 139 | (prog (ch) |
| 140 | |
| 141 | consider |
| 142 | |
| 143 | ;; Stash the position of this token so that we can report it later. |
| 144 | (setf ch (skip-spaces lexer) |
| 145 | location (file-location stream)) |
| 146 | |
| 147 | ;; Now work out what it is that we're dealing with. |
| 148 | (cond |
| 149 | |
| 150 | ;; End-of-file brings its own peculiar joy. |
| 151 | ((null ch) (return (values :eof t))) |
| 152 | |
| 153 | ;; Strings. |
| 154 | ((or (char= ch #\") (char= ch #\')) |
| 155 | (let* ((quote ch) |
| 156 | (string |
| 157 | (with-output-to-string (out) |
| 158 | (loop |
| 159 | (flet ((getch () |
| 160 | (setf ch (next-char lexer)) |
| 161 | (when (null ch) |
| 162 | (cerror* "Unexpected end of file in ~ |
| 163 | ~:[string~;character~] constant" |
| 164 | (char= quote #\')) |
| 165 | (return)))) |
| 166 | (getch) |
| 167 | (cond ((char= ch quote) (return)) |
| 168 | ((char= ch #\\) (getch))) |
| 169 | (write-char ch out)))))) |
| 170 | (setf ch (next-char lexer)) |
| 171 | (ecase quote |
| 172 | (#\" (return (values :string string))) |
| 173 | (#\' (case (length string) |
| 174 | (0 (cerror* "Empty character constant") |
| 175 | (return (values :char #\?))) |
| 176 | (1 (return (values :char (char string 0)))) |
| 177 | (t (cerror* "Multiple characters in character constant") |
| 178 | (return (values :char (char string 0))))))))) |
| 179 | |
| 180 | ;; Pick out identifiers and keywords. |
| 181 | ((or (alpha-char-p ch) (char= ch #\_)) |
| 182 | |
| 183 | ;; Scan a sequence of alphanumerics and underscores. We could |
| 184 | ;; allow more interesting identifiers, but it would damage our C |
| 185 | ;; lexical compatibility. |
| 186 | (let ((id (with-output-to-string (out) |
| 187 | (loop |
| 188 | (write-char ch out) |
| 189 | (setf ch (next-char lexer)) |
| 190 | (when (or (null ch) |
| 191 | (not (or (alphanumericp ch) |
| 192 | (char= ch #\_)))) |
| 193 | (return)))))) |
| 194 | |
| 195 | ;; Done. |
| 196 | (return (values :id id)))) |
| 197 | |
| 198 | ;; Pick out numbers. Currently only integers, but we support |
| 199 | ;; multiple bases. |
| 200 | ((digit-char-p ch) |
| 201 | |
| 202 | ;; Sort out the prefix. If we're looking at `0b', `0o' or `0x' |
| 203 | ;; (maybe uppercase) then we've got a funny radix to deal with. |
| 204 | ;; Otherwise, a leading zero signifies octal (daft, I know), else |
| 205 | ;; we're left with decimal. |
| 206 | (multiple-value-bind (radix skip-char) |
| 207 | (if (char/= ch #\0) |
| 208 | (values 10 nil) |
| 209 | (case (and (setf ch (next-char lexer)) |
| 210 | (char-downcase ch)) |
| 211 | (#\b (values 2 t)) |
| 212 | (#\o (values 8 t)) |
| 213 | (#\x (values 16 t)) |
| 214 | (t (values 8 nil)))) |
| 215 | |
| 216 | ;; If we last munched an interesting letter, we need to skip over |
| 217 | ;; it. That's what the SKIP-CHAR flag is for. |
| 218 | ;; |
| 219 | ;; Danger, Will Robinson! If we're just about to eat a radix |
| 220 | ;; letter, then the next thing must be a digit. For example, |
| 221 | ;; `0xfatenning' parses as a hex number followed by an identifier |
| 222 | ;; `0xfa ttening', but `0xturning' is an octal number followed by |
| 223 | ;; an identifier `0 xturning'. |
| 224 | (when skip-char |
| 225 | (let ((peek (next-char lexer))) |
| 226 | (unless (digit-char-p peek radix) |
| 227 | (pushback-char lexer ch) |
| 228 | (return-from scan-token (values :integer 0))) |
| 229 | (setf ch peek))) |
| 230 | |
| 231 | ;; Scan an integer. While there are digits, feed them into the |
| 232 | ;; accumulator. |
| 233 | (do ((accum 0 (+ (* accum radix) digit)) |
| 234 | (digit (and ch (digit-char-p ch radix)) |
| 235 | (and ch (digit-char-p ch radix)))) |
| 236 | ((null digit) (return-from scan-token |
| 237 | (values :integer accum))) |
| 238 | (setf ch (next-char lexer))))) |
| 239 | |
| 240 | ;; A slash might be the start of a comment. |
| 241 | ((char= ch #\/) |
| 242 | (setf ch (next-char lexer)) |
| 243 | (case ch |
| 244 | |
| 245 | ;; Comment up to the end of the line. |
| 246 | (#\/ |
| 247 | (loop |
| 248 | (setf ch (next-char lexer)) |
| 249 | (when (or (null ch) (char= ch #\newline)) |
| 250 | (go scan)))) |
| 251 | |
| 252 | ;; Comment up to the next `*/'. |
| 253 | (#\* |
| 254 | (tagbody |
| 255 | top |
| 256 | (case (setf ch (next-char lexer)) |
| 257 | (#\* (go star)) |
| 258 | ((nil) (go done)) |
| 259 | (t (go top))) |
| 260 | star |
| 261 | (case (setf ch (next-char lexer)) |
| 262 | (#\* (go star)) |
| 263 | (#\/ (setf ch (next-char lexer)) |
| 264 | (go done)) |
| 265 | ((nil) (go done)) |
| 266 | (t (go top))) |
| 267 | done) |
| 268 | (go consider)) |
| 269 | |
| 270 | ;; False alarm. (The next character is already set up.) |
| 271 | (t |
| 272 | (return (values #\/ t))))) |
| 273 | |
| 274 | ;; A dot: might be `...'. Tread carefully! We need more lookahead |
| 275 | ;; than is good for us. |
| 276 | ((char= ch #\.) |
| 277 | (setf ch (next-char lexer)) |
| 278 | (cond ((eql ch #\.) |
| 279 | (setf ch (next-char lexer)) |
| 280 | (cond ((eql ch #\.) (return (values :ellipsis nil))) |
| 281 | (t (pushback-char lexer #\.) |
| 282 | (return (values #\. t))))) |
| 283 | (t |
| 284 | (return (values #\. t))))) |
| 285 | |
| 286 | ;; Anything else is a lone delimiter. |
| 287 | (t |
| 288 | (return (multiple-value-prog1 |
| 289 | (values ch t) |
| 290 | (next-char lexer))))) |
| 291 | |
| 292 | scan |
| 293 | ;; Scan a new character and try again. |
| 294 | (setf ch (next-char lexer)) |
| 295 | (go consider)))) |
| 296 | |
| 297 | ;;;----- That's all, folks -------------------------------------------------- |