X-Git-Url: https://git.distorted.org.uk/~mdw/sod/blobdiff_plain/3109662aca9c06495ac22c5c58b46e1c036aca5c..c6b4ed992d81518f240509e6ab212d8fe705485a:/src/lexer-proto.lisp diff --git a/src/lexer-proto.lisp b/src/lexer-proto.lisp index 8e0c889..a237a92 100644 --- a/src/lexer-proto.lisp +++ b/src/lexer-proto.lisp @@ -7,7 +7,7 @@ ;;;----- Licensing notice --------------------------------------------------- ;;; -;;; This file is part of the Sensble Object Design, an object system for C. +;;; This file is part of the Sensible Object Design, an object system for C. ;;; ;;; SOD is free software; you can redistribute it and/or modify ;;; it under the terms of the GNU General Public License as published by @@ -26,178 +26,181 @@ (cl:in-package #:sod) ;;;-------------------------------------------------------------------------- -;;; Accessors. +;;; Class definition. -(export 'lexer-char) -(defgeneric lexer-char (lexer) +(export 'sod-token-scanner) +(defclass sod-token-scanner (token-scanner) + ((char-scanner :initarg :char-scanner :reader token-scanner-char-scanner)) (:documentation - "Return the current lookahead character from the LEXER. + "A token scanner for SOD input files. - When the lexer is first created, there is no lookahead character: you must - `prime the pump' by calling `next-char'. The lexer represents - encountering the end of its input stream by setting the lookahead - character to nil. At this point it is still possible to push back - characters.")) + Not a lot here, apart from a character scanner to read from and the + standard token scanner infrastructure.")) ;;;-------------------------------------------------------------------------- -;;; Formatting tokens. - -(defgeneric format-token (token-type &optional token-value) - (:documentation - "Return a string describing a token with the specified type and value.") - (:method ((token-type (eql :eof)) &optional token-value) - (declare (ignore token-value)) - "") - (:method ((token-type (eql :string)) &optional token-value) - (declare (ignore token-value)) - "") - (:method ((token-type (eql :char)) &optional token-value) - (declare (ignore token-value)) - "") - (:method ((token-type (eql :id)) &optional token-value) - (format nil "" token-value)) - (:method ((token-type symbol) &optional token-value) - (declare (ignore token-value)) - (check-type token-type keyword) - (format nil "`~(~A~)'" token-type)) - (:method ((token-type character) &optional token-value) - (declare (ignore token-value)) - (format nil "~:[<~:C>~;`~C'~]" - (and (graphic-char-p token-type) - (char/= token-type #\space)) - token-type))) - -;;;-------------------------------------------------------------------------- -;;; Reading and pushing back characters. - -(export 'next-char) -(defgeneric next-char (lexer) - (:documentation - "Fetch the next character from the LEXER's input stream. - - Read a character from the input stream, and store it in the LEXER's CHAR - slot. The character stored is returned. If characters have been pushed - back then pushed-back characters are used instead of the input stream. If - there are no more characters to be read then the lookahead character is - nil. Returns the new lookahead character. - - (This function is primarily intended for the use of lexer subclasses.)")) - -(export 'pushback-char) -(defgeneric pushback-char (lexer char) - (:documentation - "Push the CHAR back into the lexer. - - Make CHAR be the current lookahead character (stored in the LEXER's CHAR - slot). The previous lookahead character is pushed down, and will be made - available again once this character is consumed by NEXT-CHAR. - - (This function is primarily intended for the use of lexer subclasses.)")) - -(defgeneric fixup-stream* (lexer thunk) - (:documentation - "Helper function for `with-lexer-stream'. - - This function does the main work for `with-lexer-stream'. The THUNK is - invoked on a single argument, the LEXER's underlying STREAM.")) - -(export 'with-lexer-stream) -(defmacro with-lexer-stream ((streamvar lexer) &body body) - "Evaluate BODY with STREAMVAR bound to the LEXER's input stream. - - The STREAM is fixed up so that the next character read (e.g., using - `read-char') will be the lexer's current lookahead character. Once the - BODY completes, the next character in the stream is read and set as the - lookahead character. It is an error if the lexer has pushed-back - characters (since these can't be pushed back into the input stream - properly)." - - `(fixup-stream* ,lexer (lambda (,streamvar) ,@body))) +;;; Indicators and error messages. + +(defvar *indicator-map* (make-hash-table) + "Hash table mapping indicator objects to human-readable descriptions.") + +(export 'define-indicator) +(defun define-indicator (indicator description) + "Associate an INDICATOR with its textual DESCRIPTION. + + Updates the the `*indicator-map*'." + (setf (gethash indicator *indicator-map*) description) + indicator) + +(export 'syntax-error) +(define-condition syntax-error (parser-error base-syntax-error) + ((found :type cons)) + (:report (lambda (error stream) + (labels ((show-token (type value) + (if (characterp type) (show-char type) + (case type + (:id (format nil "" + value)) + (:int "") + (:string "") + (:char "") + (:eof "") + (:ellipsis "`...'") + (:shl "`<<'") + (:shr "`>>'") + (:eq "`=='") + (:ne "`!='") + (:le "`<='") + (:ge "`>='") + (:and "`&&'") + (:or "`||'") + (t (format nil "" type value))))) + (show-expected (thing) + (acond ((gethash thing *indicator-map*) it) + ((atom thing) (show-token thing nil)) + ((eq (car thing) :id) + (format nil "`~A'" (cadr thing))) + (t (format nil "" thing))))) + (report-parser-error error stream + #'show-expected + (lambda (found) + (show-token (car found) + (cdr found)))))))) +(defun syntax-error (scanner expected &key (continuep t) location) + "Signal a (maybe) continuable syntax error." + (funcall (if continuep #'cerror*-with-location #'error-with-location) + (or location scanner) 'syntax-error + :expected expected + :found (cons (token-type scanner) (token-value scanner)))) + +(export 'lexer-error) +(define-condition lexer-error (parser-error base-lexer-error) + ((found :type (or character nil))) + (:report (lambda (error stream) + (flet ((show-expected (exp) + (typecase exp + (character (show-char exp)) + (string (format nil "`~A'" exp)) + ((cons (eql :digit) *) + (format nil "" (cadr exp))) + ((eql :eof) "") + ((eql :any) "") + (t (format nil "" exp))))) + (report-parser-error error stream + #'show-expected #'show-char))))) +(defun lexer-error (char-scanner expected &key location) + "Signal a continuable lexical error." + (cerror*-with-location (or location char-scanner) 'lexer-error + :expected expected + :found (and (not (scanner-at-eof-p char-scanner)) + (scanner-current-char char-scanner)))) + +(export 'skip-until) +(defparse skip-until (:context (context token-scanner-context) + (&key (keep-end nil keep-end-p)) + &rest token-types) + "Discard tokens until we find one listed in TOKEN-TYPES. + + Each of the TOKEN-TYPES is an expression which evaluates to either a + two-item list (TYPE VALUE), or a singleton TYPE; the latter is equivalent + to a list (TYPE t). Such a pair matches a token with the corresponding + TYPE and VALUE, except that a VALUE of `t' matches any token value. + + If KEEP-END is true then retain the found token for later; otherwise + discard it. KEEP-END defaults to true if multiple TOKEN-TYPES are given; + otherwise false. If end-of-file is encountered then the indicator list is + simply the list of TOKEN-TYPES; otherwise the result is `nil'." + `(%skip-until ,(parser-scanner context) + (list ,@token-types) + :keep-end ,(if keep-end-p keep-end + (> (length token-types) 1)))) + +(export 'error) +(defparse error (:context (context token-scanner-context) + (&key ignore-unconsumed force-progress) + sub &optional (recover t) &body body) + "Try to parse SUB; if it fails then report an error, and parse RECOVER. + + This is the main way to recover from errors and continue parsing. Even + then, it's not especially brilliant. + + If the SUB parser succeeds then just propagate its result: it's like we + were never here. Otherwise, try to recover in a sensible way so we can + continue parsing. The details of this recovery are subject to change, but + the final action is generally to invoke the RECOVER parser and return its + result. + + If IGNORE-UNCONSUMED evaluates non-nil, then just propagate a failure of + SUB if it didn't consume input. (This makes it suitable for use where the + parser containing `error' might be optional.)" + `(parse-error-recover ,(parser-scanner context) + (parser () ,sub) + (parser () ,recover) + :ignore-unconsumed ,ignore-unconsumed + :force-progress ,force-progress + :action ,(and body `(lambda () ,@body)))) + +(export 'must) +(defparse must (:context (context token-scanner-context) + sub &optional default) + "Try to parse SUB; if it fails, report an error, and return DEFAULT. + + This parser can't actually fail." + `(parse (error () ,sub (t ,default)))) ;;;-------------------------------------------------------------------------- -;;; Reading and pushing back tokens. - -(export 'scan-token) -(defgeneric scan-token (lexer) - (:documentation - "Internal protocol for scanning tokens from an input stream. - - Implementing a method on this function is the main responsibility of LEXER - subclasses; it is called by the user-facing `next-token' function. - - The method should consume characters (using `next-char') as necessary, and - return two values: a token type and token value. These will be stored in - the corresponding slots in the lexer object in order to provide the user - with one-token lookahead.")) - -(export 'next-token) -(defgeneric next-token (lexer) - (:documentation - "Scan a token from an input stream. - - This function scans a token from an input stream. Two values are - returned: a `token type' and a `token value'. These are opaque to the - LEXER base class, but the intent is that the token type be significant to - determining the syntax of the input, while the token value carries any - additional information about the token's semantic content. The token type - and token value are also made available for lookahead via accessors - TOKEN-TYPE and TOKEN-VALUE on the `lexer' object. - - The new lookahead token type and value are returned as two separate - values. - - If tokens have been pushed back (see `pushback-token') then they are - returned one by one instead of scanning the stream.")) - -(export 'pushback-token) -(defgeneric pushback-token (lexer token-type &optional token-value location) - (:documentation - "Push a token back into the lexer. - - Make the given TOKEN-TYPE and TOKEN-VALUE be the current lookahead token. - The previous lookahead token is pushed down, and will be made available - agan once this new token is consumed by NEXT-TOKEN. If LOCATION is - non-nil then `file-location' is saved and replaced by LOCATION. The - TOKEN-TYPE and TOKEN-VALUE can be anything at all: for instance, they need - not be values which can actually be returned by NEXT-TOKEN.")) - -;;;-------------------------------------------------------------------------- -;;; Utilities. - -(export 'skip-spaces) -(defgeneric skip-spaces (lexer) - (:documentation - "Skip over whitespace characters in the LEXER. - - There must be a lookahead character; when the function returns, the - lookahead character will be a non-whitespace character or nil if there - were no non-whitespace characters remaining. Returns the new lookahead - character.")) - -(export 'require-token) -(defun require-token - (lexer wanted-token-type &key (errorp t) (consumep t) default) - "Require a particular token to appear. - - If the LEXER's current lookahead token has type `wanted-token-type' then - consume it (using `next-token') and return its value. Otherwise, if the - token doesn't have the requested type then signal a continuable error - describing the situation and return DEFAULT (which defaults to nil). - - If ERRORP is false then no error is signalled; this is useful for - consuming or checking for optional punctuation. If CONSUMEP is false then - a matching token is not consumed; non-matching tokens are never consumed." - - (with-slots (token-type token-value) lexer - (cond ((eql token-type wanted-token-type) - (prog1 token-value - (when consumep (next-token lexer)))) - (errorp - (cerror* "Expected ~A but found ~A" - (format-token wanted-token-type) - (format-token token-type token-value)) - default) - (t - default)))) +;;; Lexical analysis utilities. + +(export 'scan-comment) +(defun scan-comment (char-scanner) + "Scan a comment (either `/* ... */' or `// ...') from CHAR-SCANNER. + + The result isn't interesting." + (with-parser-context (character-scanner-context :scanner char-scanner) + (let ((start (file-location char-scanner))) + (parse (or (and "/*" + (lisp (let ((state nil)) + (loop (cond ((scanner-at-eof-p char-scanner) + (lexer-error char-scanner + (list "*/")) + (info-with-location + start "Comment started here") + (return (values nil t t))) + ((char= (scanner-current-char + char-scanner) + #\*) + (setf state '*) + (scanner-step char-scanner)) + ((and (eq state '*) + (char= (scanner-current-char + char-scanner) + #\/)) + (scanner-step char-scanner) + (return (values nil t t))) + (t + (setf state nil) + (scanner-step char-scanner))))))) + (and "//" + (skip-many () (not #\newline)) + (? #\newline))))))) ;;;----- That's all, folks --------------------------------------------------