--- /dev/null
+;;; -*-lisp-*-
+;;;
+;;; Implementation of lexical analysis protocol.
+;;;
+;;; (c) 2009 Straylight/Edgeware
+;;;
+
+;;;----- Licensing notice ---------------------------------------------------
+;;;
+;;; This file is part of the Sensble Object Design, an object system for C.
+;;;
+;;; SOD is free software; you can redistribute it and/or modify
+;;; it under the terms of the GNU General Public License as published by
+;;; the Free Software Foundation; either version 2 of the License, or
+;;; (at your option) any later version.
+;;;
+;;; SOD is distributed in the hope that it will be useful,
+;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;;; GNU General Public License for more details.
+;;;
+;;; You should have received a copy of the GNU General Public License
+;;; along with SOD; if not, write to the Free Software Foundation,
+;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+
+(cl:in-package #:sod)
+
+;;;--------------------------------------------------------------------------
+;;; Basic lexical analyser.
+
+(defstruct (pushed-token
+ (:constructor make-pushed-token (type value location)))
+ "A token that has been pushed back into a lexer for later processing."
+ type value location)
+
+;;; Class definition.
+
+(export 'basic-lexer)
+(defclass basic-lexer ()
+ ((stream :initarg :stream :type stream :reader lexer-stream)
+ (char :initform nil :type (or character null) :reader lexer-char)
+ (pushback-chars :initform nil :type list)
+ (token-type :initform nil :accessor token-type)
+ (token-value :initform nil :accessor token-value)
+ (location :initform nil :reader file-location)
+ (pushback-tokens :initform nil :type list))
+ (:documentation
+ "Base class for lexical analysers.
+
+ The lexer reads characters from STREAM, which, for best results, wants to
+ be a POSITION-AWARE-INPUT-STREAM.
+
+ The lexer provides one-character lookahead by default: the current
+ lookahead character is available to subclasses in the slot CHAR. Before
+ beginning lexical analysis, the lookahead character needs to be
+ established with NEXT-CHAR. If one-character lookahead is insufficient,
+ the analyser can push back an arbitrary number of characters using
+ PUSHBACK-CHAR.
+
+ The NEXT-TOKEN function scans and returns the next token from the STREAM,
+ and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing one-token
+ lookahead. A parser using the lexical analyser can push back tokens using
+ PUSHBACK-TOKENS.
+
+ For convenience, the lexer implements a FILE-LOCATION method (delegated to
+ the underlying stream)."))
+
+;;; Reading and pushing back characters.
+
+(defmethod next-char ((lexer basic-lexer))
+ (with-slots (stream char pushback-chars) lexer
+ (setf char (if pushback-chars
+ (pop pushback-chars)
+ (read-char stream nil)))))
+
+(defmethod pushback-char ((lexer basic-lexer) new-char)
+ (with-slots (char pushback-chars) lexer
+ (push char pushback-chars)
+ (setf char new-char)))
+
+(defmethod fixup-stream* ((lexer basic-lexer) thunk)
+ (with-slots (stream char pushback-chars) lexer
+ (when pushback-chars
+ (error "Lexer has pushed-back characters."))
+ (when (slot-boundp lexer 'char)
+ (unread-char char stream))
+ (unwind-protect
+ (funcall thunk stream)
+ (setf char (read-char stream nil)))))
+
+;;; Reading and pushing back tokens.
+
+(defmethod next-token :around ((lexer basic-lexer))
+ (unless (slot-boundp lexer 'char)
+ (next-char lexer)))
+
+(defmethod next-token ((lexer basic-lexer))
+ (with-slots (pushback-tokens token-type token-value location) lexer
+ (setf (values token-type token-value)
+ (if pushback-tokens
+ (let ((pushback (pop pushback-tokens)))
+ (setf location (pushed-token-location pushback))
+ (values (pushed-token-type pushback)
+ (pushed-token-value pushback)))
+ (scan-token lexer)))))
+
+(defmethod scan-token :around ((lexer basic-lexer))
+ (with-default-error-location (lexer)
+ (call-next-method)))
+
+(defmethod pushback-token ((lexer basic-lexer) new-token-type
+ &optional new-token-value new-location)
+ (with-slots (pushback-tokens token-type token-value location) lexer
+ (push (make-pushed-token token-type token-value location)
+ pushback-tokens)
+ (when new-location (setf location new-location))
+ (setf token-type new-token-type
+ token-value new-token-value)))
+
+;;; Utilities.
+
+(defmethod skip-spaces ((lexer basic-lexer))
+ (do ((ch (lexer-char lexer) (next-char lexer)))
+ ((not (whitespace-char-p ch)) ch)))
+
+;;;--------------------------------------------------------------------------
+;;; Our main lexer.
+
+(export 'sod-lexer)
+(defclass sod-lexer (basic-lexer)
+ ()
+ (:documentation
+ "Lexical analyser for the SOD lanuage.
+
+ See the LEXER class for the gory details about the lexer protocol."))
+
+(defmethod scan-token ((lexer sod-lexer))
+ (with-slots (stream char keywords location) lexer
+ (prog (ch)
+
+ consider
+
+ ;; Stash the position of this token so that we can report it later.
+ (setf ch (skip-spaces lexer)
+ location (file-location stream))
+
+ ;; Now work out what it is that we're dealing with.
+ (cond
+
+ ;; End-of-file brings its own peculiar joy.
+ ((null ch) (return (values :eof t)))
+
+ ;; Strings.
+ ((or (char= ch #\") (char= ch #\'))
+ (let* ((quote ch)
+ (string
+ (with-output-to-string (out)
+ (loop
+ (flet ((getch ()
+ (setf ch (next-char lexer))
+ (when (null ch)
+ (cerror* "Unexpected end of file in ~
+ ~:[string~;character~] constant"
+ (char= quote #\'))
+ (return))))
+ (getch)
+ (cond ((char= ch quote) (return))
+ ((char= ch #\\) (getch)))
+ (write-char ch out))))))
+ (setf ch (next-char lexer))
+ (ecase quote
+ (#\" (return (values :string string)))
+ (#\' (case (length string)
+ (0 (cerror* "Empty character constant")
+ (return (values :char #\?)))
+ (1 (return (values :char (char string 0))))
+ (t (cerror* "Multiple characters in character constant")
+ (return (values :char (char string 0)))))))))
+
+ ;; Pick out identifiers and keywords.
+ ((or (alpha-char-p ch) (char= ch #\_))
+
+ ;; Scan a sequence of alphanumerics and underscores. We could
+ ;; allow more interesting identifiers, but it would damage our C
+ ;; lexical compatibility.
+ (let ((id (with-output-to-string (out)
+ (loop
+ (write-char ch out)
+ (setf ch (next-char lexer))
+ (when (or (null ch)
+ (not (or (alphanumericp ch)
+ (char= ch #\_))))
+ (return))))))
+
+ ;; Done.
+ (return (values :id id))))
+
+ ;; Pick out numbers. Currently only integers, but we support
+ ;; multiple bases.
+ ((digit-char-p ch)
+
+ ;; Sort out the prefix. If we're looking at `0b', `0o' or `0x'
+ ;; (maybe uppercase) then we've got a funny radix to deal with.
+ ;; Otherwise, a leading zero signifies octal (daft, I know), else
+ ;; we're left with decimal.
+ (multiple-value-bind (radix skip-char)
+ (if (char/= ch #\0)
+ (values 10 nil)
+ (case (and (setf ch (next-char lexer))
+ (char-downcase ch))
+ (#\b (values 2 t))
+ (#\o (values 8 t))
+ (#\x (values 16 t))
+ (t (values 8 nil))))
+
+ ;; If we last munched an interesting letter, we need to skip over
+ ;; it. That's what the SKIP-CHAR flag is for.
+ ;;
+ ;; Danger, Will Robinson! If we're just about to eat a radix
+ ;; letter, then the next thing must be a digit. For example,
+ ;; `0xfatenning' parses as a hex number followed by an identifier
+ ;; `0xfa ttening', but `0xturning' is an octal number followed by
+ ;; an identifier `0 xturning'.
+ (when skip-char
+ (let ((peek (next-char lexer)))
+ (unless (digit-char-p peek radix)
+ (pushback-char lexer ch)
+ (return-from scan-token (values :integer 0)))
+ (setf ch peek)))
+
+ ;; Scan an integer. While there are digits, feed them into the
+ ;; accumulator.
+ (do ((accum 0 (+ (* accum radix) digit))
+ (digit (and ch (digit-char-p ch radix))
+ (and ch (digit-char-p ch radix))))
+ ((null digit) (return-from scan-token
+ (values :integer accum)))
+ (setf ch (next-char lexer)))))
+
+ ;; A slash might be the start of a comment.
+ ((char= ch #\/)
+ (setf ch (next-char lexer))
+ (case ch
+
+ ;; Comment up to the end of the line.
+ (#\/
+ (loop
+ (setf ch (next-char lexer))
+ (when (or (null ch) (char= ch #\newline))
+ (go scan))))
+
+ ;; Comment up to the next `*/'.
+ (#\*
+ (tagbody
+ top
+ (case (setf ch (next-char lexer))
+ (#\* (go star))
+ ((nil) (go done))
+ (t (go top)))
+ star
+ (case (setf ch (next-char lexer))
+ (#\* (go star))
+ (#\/ (setf ch (next-char lexer))
+ (go done))
+ ((nil) (go done))
+ (t (go top)))
+ done)
+ (go consider))
+
+ ;; False alarm. (The next character is already set up.)
+ (t
+ (return (values #\/ t)))))
+
+ ;; A dot: might be `...'. Tread carefully! We need more lookahead
+ ;; than is good for us.
+ ((char= ch #\.)
+ (setf ch (next-char lexer))
+ (cond ((eql ch #\.)
+ (setf ch (next-char lexer))
+ (cond ((eql ch #\.) (return (values :ellipsis nil)))
+ (t (pushback-char lexer #\.)
+ (return (values #\. t)))))
+ (t
+ (return (values #\. t)))))
+
+ ;; Anything else is a lone delimiter.
+ (t
+ (return (multiple-value-prog1
+ (values ch t)
+ (next-char lexer)))))
+
+ scan
+ ;; Scan a new character and try again.
+ (setf ch (next-char lexer))
+ (go consider))))
+
+;;;----- That's all, folks --------------------------------------------------