;;; -*-lisp-*- ;;; ;;; Lexical analysis of a vaguely C-like language ;;; ;;; (c) 2009 Straylight/Edgeware ;;; ;;;----- Licensing notice --------------------------------------------------- ;;; ;;; This file is part of the Simple Object Definition system. ;;; ;;; SOD is free software; you can redistribute it and/or modify ;;; it under the terms of the GNU General Public License as published by ;;; the Free Software Foundation; either version 2 of the License, or ;;; (at your option) any later version. ;;; ;;; SOD is distributed in the hope that it will be useful, ;;; but WITHOUT ANY WARRANTY; without even the implied warranty of ;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ;;; GNU General Public License for more details. ;;; ;;; You should have received a copy of the GNU General Public License ;;; along with SOD; if not, write to the Free Software Foundation, ;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. (cl:in-package #:sod) ;;;-------------------------------------------------------------------------- ;;; Basic lexical analyser infrastructure. ;; Class definition. (defclass lexer () ((stream :initarg :stream :type stream :reader lexer-stream) (char :initform nil :type (or character null) :reader lexer-char) (pushback-chars :initform nil :type list) (token-type :initform nil :accessor token-type) (token-value :initform nil :accessor token-value) (pushback-tokens :initform nil :type list)) (:documentation "Base class for lexical analysers. The lexer reads characters from STREAM, which, for best results, wants to be a POSITION-AWARE-INPUT-STREAM. The lexer provides one-character lookahead by default: the current lookahead character is available to subclasses in the slot CHAR. Before beginning lexical analysis, the lookahead character needs to be established with NEXT-CHAR. If one-character lookahead is insufficient, the analyser can push back an arbitrary number of characters using PUSHBACK-CHAR. The NEXT-TOKEN function scans and returns the next token from the STREAM, and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing one-token lookahead. A parser using the lexical analyser can push back tokens using PUSHBACK-TOKENS. For convenience, the lexer implements a FILE-LOCATION method (delegated to the underlying stream).")) ;; Lexer protocol. (defgeneric scan-token (lexer) (:documentation "Internal function for scanning tokens from an input stream. Implementing a method on this function is the main responsibility of LEXER subclasses; it is called by the user-facing NEXT-TOKEN function. The method should consume characters (using NEXT-CHAR) as necessary, and return two values: a token type and token value. These will be stored in the corresponding slots in the lexer object in order to provide the user with one-token lookahead.")) (defgeneric next-token (lexer) (:documentation "Scan a token from an input stream. This function scans a token from an input stream. Two values are returned: a `token type' and a `token value'. These are opaque to the LEXER base class, but the intent is that the token type be significant to determining the syntax of the input, while the token value carries any additional information about the token's semantic content. The token type and token value are also made available for lookahead via accessors TOKEN-TYPE and TOKEN-NAME on the LEXER object. If tokens have been pushed back (see PUSHBACK-TOKEN) then they are returned one by one instead of scanning the stream.") (:method ((lexer lexer)) (with-slots (pushback-tokens token-type token-value) lexer (setf (values token-type token-value) (if pushback-tokens (let ((pushback (pop pushback-tokens))) (values (car pushback) (cdr pushback))) (scan-token lexer)))))) (defgeneric pushback-token (lexer token-type &optional token-value) (:documentation "Push a token back into the lexer. Make the given TOKEN-TYPE and TOKEN-VALUE be the current lookahead token. The previous lookahead token is pushed down, and will be made available agan once this new token is consumed by NEXT-TOKEN. The FILE-LOCATION is not affected by pushing tokens back. The TOKEN-TYPE and TOKEN-VALUE be anything at all: for instance, they need not be values which can actually be returned by NEXT-TOKEN.") (:method ((lexer lexer) new-token-type &optional new-token-value) (with-slots (pushback-tokens token-type token-value) lexer (push (cons token-type token-value) pushback-tokens) (setf token-type new-token-type token-value new-token-value)))) (defgeneric next-char (lexer) (:documentation "Fetch the next character from the LEXER's input stream. Read a character from the input stream, and store it in the LEXER's CHAR slot. The character stored is returned. If characters have been pushed back then pushed-back characters are used instead of the input stream. (This function is primarily intended for the use of lexer subclasses.)") (:method ((lexer lexer)) (with-slots (stream char pushback-chars) lexer (setf char (if pushback-chars (pop pushback-chars) (read-char stream nil)))))) (defgeneric pushback-char (lexer char) (:documentation "Push the CHAR back into the lexer. Make CHAR be the current lookahead character (stored in the LEXER's CHAR slot). The previous lookahead character is pushed down, and will be made available again once this character is consumed by NEXT-CHAR. (This function is primarily intended for the use of lexer subclasses.)") (:method ((lexer lexer) new-char) (with-slots (char pushback-chars) lexer (push char pushback-chars) (setf char new-char)))) (defgeneric fixup-stream* (lexer thunk) (:documentation "Helper function for WITH-LEXER-STREAM. This function does the main work for WITH-LEXER-STREAM. The THUNK is invoked on a single argument, the LEXER's underlying STREAM.") (:method ((lexer lexer) thunk) (with-slots (stream char pushback-chars) lexer (when pushback-chars (error "Lexer has pushed-back characters.")) (unread-char char stream) (unwind-protect (funcall thunk stream) (setf char (read-char stream nil)))))) (defmacro with-lexer-stream ((streamvar lexer) &body body) "Evaluate BODY with STREAMVAR bound to the LEXER's input stream. The STREAM is fixed up so that the next character read (e.g., using READ-CHAR) will be the lexer's current lookahead character. Once the BODY completes, the next character in the stream is read and set as the lookahead character. It is an error if the lexer has pushed-back characters (since these can't be pushed back into the input stream properly)." `(fixup-stream* ,lexer (lambda (,streamvar) ,@body))) (defmethod file-location ((lexer lexer)) (with-slots (stream) lexer (file-location stream))) (defgeneric skip-spaces (lexer) (:documentation "Skip over whitespace characters in the LEXER.") (:method ((lexer lexer)) (do ((ch (lexer-char lexer) (next-char lexer))) ((not (whitespace-char-p ch)))))) ;;;-------------------------------------------------------------------------- ;;; Lexer utilities. (defun require-token (lexer wanted-token-type &key (errorp t) (consumep t) default) (with-slots (token-type token-value) lexer (cond ((eql token-type wanted-token-type) (prog1 token-value (when consumep (next-token lexer)))) (errorp (cerror* "Expected ~A but found ~A" (format-token wanted-token-type) (format-token token-type token-value)) default) (t default)))) ;;;-------------------------------------------------------------------------- ;;; Our main lexer. (defun make-keyword-table (&rest keywords) "Construct a keyword table for the lexical analyser. The KEYWORDS arguments are individual keywords, either as strings or as (WORD . VALUE) pairs. A string argument is equivalent to a pair listing the string itself as WORD and the corresponding keyword symbol (forced to uppercase) as the VALUE." (let ((table (make-hash-table :test #'equal))) (dolist (item keywords) (multiple-value-bind (word keyword) (if (consp item) (values (car item) (cdr item)) (values item (intern (string-upcase item) :keyword))) (setf (gethash word table) keyword))) table)) (defparameter *sod-keywords* (make-keyword-table ;; Words with a meaning to C's type system. "char" "int" "float" "void" "long" "short" "signed" "unsigned" "double" "const" "volatile" "restrict" "struct" "union" "enum")) (defclass sod-lexer (lexer) () (:documentation "Lexical analyser for the SOD lanuage. See the LEXER class for the gory details about the lexer protocol.")) (defun format-token (token-type &optional token-value) (when (typep token-type 'lexer) (let ((lexer token-type)) (setf token-type (token-type lexer) token-value (token-value lexer)))) (etypecase token-type ((eql :eof) "") ((eql :string) "") ((eql :char) "") ((eql :id) (format nil "" token-value)) (keyword (format nil "`~(~A~)'" token-type)) (character (format nil "~:[<~:C>~;`~C'~]" (and (graphic-char-p token-type) (char/= token-type #\space)) token-type)))) (defmethod scan-token ((lexer sod-lexer)) (with-slots (stream char keywords) lexer (prog ((ch char)) consider (cond ;; End-of-file brings its own peculiar joy. ((null ch) (return (values :eof t))) ;; Ignore whitespace and continue around for more. ((whitespace-char-p ch) (go scan)) ;; Strings. ((or (char= ch #\") (char= ch #\')) (with-default-error-location ((file-location lexer)) (let* ((quote ch) (string (with-output-to-string (out) (loop (flet ((getch () (setf ch (next-char lexer)) (when (null ch) (cerror* "Unexpected end of file in string/character constant") (return)))) (getch) (cond ((char= ch quote) (return)) ((char= ch #\\) (getch))) (write-char ch out)))))) (setf ch (next-char lexer)) (ecase quote (#\" (return (values :string string))) (#\' (case (length string) (0 (cerror* "Empty character constant") (return (values :char #\?))) (1 (return (values :char (char string 0)))) (t (cerror* "Multiple characters in character constant") (return (values :char (char string 0)))))))))) ;; Pick out identifiers and keywords. ((or (alpha-char-p ch) (char= ch #\_)) ;; Scan a sequence of alphanumerics and underscores. We could ;; allow more interesting identifiers, but it would damage our C ;; lexical compatibility. (let ((id (with-output-to-string (out) (loop (write-char ch out) (setf ch (next-char lexer)) (when (or (null ch) (not (or (alphanumericp ch) (char= ch #\_)))) (return)))))) ;; Done. (return (values :id id)))) ;; Pick out numbers. Currently only integers, but we support ;; multiple bases. ((digit-char-p ch) ;; Sort out the prefix. If we're looking at `0b', `0o' or `0x' ;; (maybe uppercase) then we've got a funny radix to deal with. ;; Otherwise, a leading zero signifies octal (daft, I know), else ;; we're left with decimal. (multiple-value-bind (radix skip-char) (if (char/= ch #\0) (values 10 nil) (case (and (setf ch (next-char lexer)) (char-downcase ch)) (#\b (values 2 t)) (#\o (values 8 t)) (#\x (values 16 t)) (t (values 8 nil)))) ;; If we last munched an interesting letter, we need to skip over ;; it. That's what the SKIP-CHAR flag is for. ;; ;; Danger, Will Robinson! If we're' just about to eat a radix ;; letter, then the next thing must be a digit. For example, ;; `0xfatenning' parses as a hex number followed by an identifier ;; `0xfa ttening', but `0xturning' is an octal number followed ;; by an identifier `0 xturning'. (when skip-char (let ((peek (next-char lexer))) (unless (digit-char-p peek radix) (pushback-char lexer ch) (return-from scan-token (values :integer 0))) (setf ch peek))) ;; Scan an integer. While there are digits, feed them into the ;; accumulator. (do ((accum 0 (+ (* accum radix) digit)) (digit (and ch (digit-char-p ch radix)) (and ch (digit-char-p ch radix)))) ((null digit) (return-from scan-token (values :integer accum))) (setf ch (next-char lexer))))) ;; A slash might be the start of a comment. ((char= ch #\/) (setf ch (next-char lexer)) (case ch ;; Comment up to the end of the line. (#\/ (loop (setf ch (next-char lexer)) (when (or (null ch) (char= ch #\newline)) (go scan)))) ;; Comment up to the next `*/'. (#\* (tagbody top (case (setf ch (next-char lexer)) (#\* (go star)) ((nil) (go done)) (t (go top))) star (case (setf ch (next-char lexer)) (#\* (go star)) (#\/ (setf ch (next-char lexer)) (go done)) ((nil) (go done)) (t (go top))) done) (go consider)) ;; False alarm. (The next character is already set up.) (t (return (values #\/ t))))) ;; A dot: might be `...'. Tread carefully! We need more lookahead ;; than is good for us. ((char= ch #\.) (setf ch (next-char lexer)) (cond ((eql ch #\.) (setf ch (next-char lexer)) (cond ((eql ch #\.) (return (values :ellpisis nil))) (t (pushback-char lexer #\.) (return (values #\. t))))) (t (return (values #\. t))))) ;; Anything else is a lone delimiter. (t (return (multiple-value-prog1 (values ch t) (next-char lexer))))) scan ;; Scan a new character and try again. (setf ch (next-char lexer)) (go consider)))) ;;;-------------------------------------------------------------------------- ;;; C fragments. (defclass c-fragment () ((location :initarg :location :type file-location :accessor c-fragment-location) (text :initarg :text :type string :accessor c-fragment-text)) (:documentation "Represents a fragment of C code to be written to an output file. A C fragment is aware of its original location, and will bear proper #line markers when written out.")) (defun output-c-excursion (stream location thunk) "Invoke THUNK surrounding it by writing #line markers to STREAM. The first marker describes LOCATION; the second refers to the actual output position in STREAM. If LOCATION doesn't provide a line number then no markers are output after all. If the output stream isn't position-aware then no final marker is output." (let* ((location (file-location location)) (line (file-location-line location)) (pathname (file-location-pathname location)) (namestring (and pathname (namestring pathname)))) (cond (line (format stream "~&#line ~D~@[ ~S~]~%" line namestring) (funcall thunk) (when (typep stream 'position-aware-stream) (fresh-line stream) (format stream "~&#line ~D ~S~%" (1+ (position-aware-stream-line stream)) (namestring (stream-pathname stream))))) (t (funcall thunk))))) (defmethod print-object ((fragment c-fragment) stream) (let ((text (c-fragment-text fragment)) (location (c-fragment-location fragment))) (if *print-escape* (print-unreadable-object (fragment stream :type t) (when location (format stream "~A " location)) (cond ((< (length text) 40) (prin1 text stream) stream) (t (prin1 (subseq text 0 40) stream) (write-string "..." stream)))) (output-c-excursion stream location (lambda () (write-string text stream)))))) (defmethod make-load-form ((fragment c-fragment) &optional environment) (make-load-form-saving-slots fragment :environment environment)) (defun scan-c-fragment (lexer end-chars) "Snarfs a sequence of C tokens with balanced brackets. Reads and consumes characters from the LEXER's stream, and returns them as a string. The string will contain whole C tokens, up as far as an occurrence of one of the END-CHARS (a list) which (a) is not within a string or character literal or comment, and (b) appears at the outer level of nesting of brackets (whether round, curly or square -- again counting only brackets which aren't themselves within string/character literals or comments. The final END-CHAR is not consumed. An error is signalled if either the stream ends before an occurrence of one of the END-CHARS, or if mismatching brackets are encountered. No other attempt is made to ensure that the characters read are in fact a valid C fragment. Both original /*...*/ and new //... comments are recognized. Trigraphs and digraphs are currently not recognized." (let ((output (make-string-output-stream)) (ch (lexer-char lexer)) (start-floc (file-location lexer)) (delim nil) (stack nil)) ;; Main loop. At the top of this loop, we've already read a ;; character into CH. This is usually read at the end of processing ;; the individual character, though sometimes (following `/', for ;; example) it's read speculatively because we need one-character ;; lookahead. (block loop (labels ((getch () "Read the next character into CH; complain if we hit EOF." (unless (setf ch (next-char lexer)) (cerror*-with-location start-floc "Unexpected end-of-file in C fragment") (return-from loop)) ch) (putch () "Write the character to the output buffer." (write-char ch output)) (push-delim (d) "Push a closing delimiter onto the stack." (push delim stack) (setf delim d) (getch))) ;; Hack: if the first character is a newline, discard it. Otherwise ;; (a) the output fragment will look funny, and (b) the location ;; information will be wrong. (when (eql ch #\newline) (getch)) ;; And fetch characters. (loop ;; Here we're outside any string or character literal, though we ;; may be nested within brackets. So, if there's no delimiter, and ;; we've found the end character, we're done. (when (and (null delim) (member ch end-chars)) (return)) ;; Otherwise take a copy of the character, and work out what to do ;; next. (putch) (case ch ;; Starting a literal. Continue until we find a matching ;; character not preceded by a `\'. ((#\" #\') (let ((quote ch)) (loop (getch) (putch) (when (eql ch quote) (return)) (when (eql ch #\\) (getch) (putch))) (getch))) ;; Various kinds of opening bracket. Stash the current ;; delimiter, and note that we're looking for a new one. (#\( (push-delim #\))) (#\[ (push-delim #\])) (#\{ (push-delim #\})) ;; Various kinds of closing bracket. If it matches the current ;; delimeter then unstack the next one along. Otherwise ;; something's gone wrong: C syntax doesn't allow unmatched ;; brackets. ((#\) #\] #\}) (if (eql ch delim) (setf delim (pop stack)) (cerror* "Unmatched `~C'." ch)) (getch)) ;; A slash. Maybe a comment next. But maybe not... (#\/ ;; Examine the next character to find out how to proceed. (getch) (case ch ;; A second slash -- eat until the end of the line. (#\/ (putch) (loop (getch) (putch) (when (eql ch #\newline) (return))) (getch)) ;; A star -- eat until we find a star-slash. Since the star ;; might be preceded by another star, we use a little state ;; machine. (#\* (putch) (tagbody main ;; Main state. If we read a star, switch to star state; ;; otherwise eat the character and try again. (getch) (putch) (case ch (#\* (go star)) (t (go main))) star ;; Star state. If we read a slash, we're done; if we read ;; another star, stay in star state; otherwise go back to ;; main. (getch) (putch) (case ch (#\* (go star)) (#\/ (go done)) (t (go main))) done (getch))))) ;; Something else. Eat it and continue. (t (getch))))) (let* ((string (get-output-stream-string output)) (end (position-if (lambda (char) (or (char= char #\newline) (not (whitespace-char-p char)))) string :from-end t)) (trimmed (if end (subseq string 0 (1+ end)) ""))) ;; Return the fragment we've collected. (make-instance 'c-fragment :location start-floc :text trimmed))))) (defun c-fragment-reader (stream char arg) "Reader for C-fragment syntax #{ ... stuff ... }." (declare (ignore char arg)) (let ((lexer (make-instance 'sod-lexer :stream stream))) (next-char lexer) (scan-c-fragment lexer '(#\})))) #+interactive (set-dispatch-macro-character #\# #\{ 'c-fragment-reader) ;;;-------------------------------------------------------------------------- ;;; Testing cruft. #+test (with-input-from-string (in " { foo } 'x' /?/***/! 123 0432 0b010123 0xc0ffee __burp_32 class 0xturning 0xfattening ... class integer : integral_domain { something here; } ") (let* ((stream (make-instance 'position-aware-input-stream :stream in :file #p"magic")) (lexer (make-instance 'sod-lexer :stream stream :keywords *sod-keywords*)) (list nil)) (next-char lexer) (loop (multiple-value-bind (tokty tokval) (next-token lexer) (push (list tokty tokval) list) (when (eql tokty :eof) (return)))) (nreverse list))) ;;;----- That's all, folks --------------------------------------------------