;;; -*-lisp-*- ;;; ;;; Implementation of lexical analysis protocol. ;;; ;;; (c) 2009 Straylight/Edgeware ;;; ;;;----- Licensing notice --------------------------------------------------- ;;; ;;; This file is part of the Sensble Object Design, an object system for C. ;;; ;;; SOD is free software; you can redistribute it and/or modify ;;; it under the terms of the GNU General Public License as published by ;;; the Free Software Foundation; either version 2 of the License, or ;;; (at your option) any later version. ;;; ;;; SOD is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;;; GNU General Public License for more details. ;;; ;;; You should have received a copy of the GNU General Public License ;;; along with SOD; if not, write to the Free Software Foundation, ;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. (cl:in-package #:sod) ;;;-------------------------------------------------------------------------- ;;; Basic lexical analyser. (defstruct (pushed-token (:constructor make-pushed-token (type value location))) "A token that has been pushed back into a lexer for later processing." type value location) ;;; Class definition. (export 'basic-lexer) (defclass basic-lexer () ((stream :initarg :stream :type stream :reader lexer-stream) (char :initform nil :type (or character null) :reader lexer-char) (pushback-chars :initform nil :type list) (token-type :initform nil :accessor token-type) (token-value :initform nil :accessor token-value) (location :initform nil :reader file-location) (pushback-tokens :initform nil :type list)) (:documentation "Base class for lexical analysers. The lexer reads characters from STREAM, which, for best results, wants to be a POSITION-AWARE-INPUT-STREAM. The lexer provides one-character lookahead by default: the current lookahead character is available to subclasses in the slot CHAR. Before beginning lexical analysis, the lookahead character needs to be established with NEXT-CHAR. If one-character lookahead is insufficient, the analyser can push back an arbitrary number of characters using PUSHBACK-CHAR. The NEXT-TOKEN function scans and returns the next token from the STREAM, and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing one-token lookahead. A parser using the lexical analyser can push back tokens using PUSHBACK-TOKENS. For convenience, the lexer implements a FILE-LOCATION method (delegated to the underlying stream).")) ;;; Reading and pushing back characters. (defmethod next-char ((lexer basic-lexer)) (with-slots (stream char pushback-chars) lexer (setf char (if pushback-chars (pop pushback-chars) (read-char stream nil))))) (defmethod pushback-char ((lexer basic-lexer) new-char) (with-slots (char pushback-chars) lexer (push char pushback-chars) (setf char new-char))) (defmethod fixup-stream* ((lexer basic-lexer) thunk) (with-slots (stream char pushback-chars) lexer (when pushback-chars (error "Lexer has pushed-back characters.")) (when (slot-boundp lexer 'char) (unread-char char stream)) (unwind-protect (funcall thunk stream) (setf char (read-char stream nil))))) ;;; Reading and pushing back tokens. (defmethod next-token :around ((lexer basic-lexer)) (unless (slot-boundp lexer 'char) (next-char lexer))) (defmethod next-token ((lexer basic-lexer)) (with-slots (pushback-tokens token-type token-value location) lexer (setf (values token-type token-value) (if pushback-tokens (let ((pushback (pop pushback-tokens))) (setf location (pushed-token-location pushback)) (values (pushed-token-type pushback) (pushed-token-value pushback))) (scan-token lexer))))) (defmethod scan-token :around ((lexer basic-lexer)) (with-default-error-location (lexer) (call-next-method))) (defmethod pushback-token ((lexer basic-lexer) new-token-type &optional new-token-value new-location) (with-slots (pushback-tokens token-type token-value location) lexer (push (make-pushed-token token-type token-value location) pushback-tokens) (when new-location (setf location new-location)) (setf token-type new-token-type token-value new-token-value))) ;;; Utilities. (defmethod skip-spaces ((lexer basic-lexer)) (do ((ch (lexer-char lexer) (next-char lexer))) ((not (whitespace-char-p ch)) ch))) ;;;-------------------------------------------------------------------------- ;;; Our main lexer. (export 'sod-lexer) (defclass sod-lexer (basic-lexer) () (:documentation "Lexical analyser for the SOD lanuage. See the LEXER class for the gory details about the lexer protocol.")) (defmethod scan-token ((lexer sod-lexer)) (with-slots (stream char keywords location) lexer (prog (ch) consider ;; Stash the position of this token so that we can report it later. (setf ch (skip-spaces lexer) location (file-location stream)) ;; Now work out what it is that we're dealing with. (cond ;; End-of-file brings its own peculiar joy. ((null ch) (return (values :eof t))) ;; Strings. ((or (char= ch #\") (char= ch #\')) (let* ((quote ch) (string (with-output-to-string (out) (loop (flet ((getch () (setf ch (next-char lexer)) (when (null ch) (cerror* "Unexpected end of file in ~ ~:[string~;character~] constant" (char= quote #\')) (return)))) (getch) (cond ((char= ch quote) (return)) ((char= ch #\\) (getch))) (write-char ch out)))))) (setf ch (next-char lexer)) (ecase quote (#\" (return (values :string string))) (#\' (case (length string) (0 (cerror* "Empty character constant") (return (values :char #\?))) (1 (return (values :char (char string 0)))) (t (cerror* "Multiple characters in character constant") (return (values :char (char string 0))))))))) ;; Pick out identifiers and keywords. ((or (alpha-char-p ch) (char= ch #\_)) ;; Scan a sequence of alphanumerics and underscores. We could ;; allow more interesting identifiers, but it would damage our C ;; lexical compatibility. (let ((id (with-output-to-string (out) (loop (write-char ch out) (setf ch (next-char lexer)) (when (or (null ch) (not (or (alphanumericp ch) (char= ch #\_)))) (return)))))) ;; Done. (return (values :id id)))) ;; Pick out numbers. Currently only integers, but we support ;; multiple bases. ((digit-char-p ch) ;; Sort out the prefix. If we're looking at `0b', `0o' or `0x' ;; (maybe uppercase) then we've got a funny radix to deal with. ;; Otherwise, a leading zero signifies octal (daft, I know), else ;; we're left with decimal. (multiple-value-bind (radix skip-char) (if (char/= ch #\0) (values 10 nil) (case (and (setf ch (next-char lexer)) (char-downcase ch)) (#\b (values 2 t)) (#\o (values 8 t)) (#\x (values 16 t)) (t (values 8 nil)))) ;; If we last munched an interesting letter, we need to skip over ;; it. That's what the SKIP-CHAR flag is for. ;; ;; Danger, Will Robinson! If we're just about to eat a radix ;; letter, then the next thing must be a digit. For example, ;; `0xfatenning' parses as a hex number followed by an identifier ;; `0xfa ttening', but `0xturning' is an octal number followed by ;; an identifier `0 xturning'. (when skip-char (let ((peek (next-char lexer))) (unless (digit-char-p peek radix) (pushback-char lexer ch) (return-from scan-token (values :integer 0))) (setf ch peek))) ;; Scan an integer. While there are digits, feed them into the ;; accumulator. (do ((accum 0 (+ (* accum radix) digit)) (digit (and ch (digit-char-p ch radix)) (and ch (digit-char-p ch radix)))) ((null digit) (return-from scan-token (values :integer accum))) (setf ch (next-char lexer))))) ;; A slash might be the start of a comment. ((char= ch #\/) (setf ch (next-char lexer)) (case ch ;; Comment up to the end of the line. (#\/ (loop (setf ch (next-char lexer)) (when (or (null ch) (char= ch #\newline)) (go scan)))) ;; Comment up to the next `*/'. (#\* (tagbody top (case (setf ch (next-char lexer)) (#\* (go star)) ((nil) (go done)) (t (go top))) star (case (setf ch (next-char lexer)) (#\* (go star)) (#\/ (setf ch (next-char lexer)) (go done)) ((nil) (go done)) (t (go top))) done) (go consider)) ;; False alarm. (The next character is already set up.) (t (return (values #\/ t))))) ;; A dot: might be `...'. Tread carefully! We need more lookahead ;; than is good for us. ((char= ch #\.) (setf ch (next-char lexer)) (cond ((eql ch #\.) (setf ch (next-char lexer)) (cond ((eql ch #\.) (return (values :ellipsis nil))) (t (pushback-char lexer #\.) (return (values #\. t))))) (t (return (values #\. t))))) ;; Anything else is a lone delimiter. (t (return (multiple-value-prog1 (values ch t) (next-char lexer))))) scan ;; Scan a new character and try again. (setf ch (next-char lexer)) (go consider)))) ;;;----- That's all, folks --------------------------------------------------