X-Git-Url: https://git.distorted.org.uk/~mdw/sod/blobdiff_plain/3109662aca9c06495ac22c5c58b46e1c036aca5c..2ba6e0bde273dd3fd6c1288f53edf7b96f6442d1:/src/lexer-proto.lisp diff --git a/src/lexer-proto.lisp b/src/lexer-proto.lisp index 8e0c889..d2181e1 100644 --- a/src/lexer-proto.lisp +++ b/src/lexer-proto.lisp @@ -26,178 +26,125 @@ (cl:in-package #:sod) ;;;-------------------------------------------------------------------------- -;;; Accessors. +;;; Class definition. -(export 'lexer-char) -(defgeneric lexer-char (lexer) +(export 'sod-token-scanner) +(defclass sod-token-scanner (token-scanner) + ((char-scanner :initarg :char-scanner :reader token-scanner-char-scanner)) (:documentation - "Return the current lookahead character from the LEXER. + "A token scanner for SOD input files. - When the lexer is first created, there is no lookahead character: you must - `prime the pump' by calling `next-char'. The lexer represents - encountering the end of its input stream by setting the lookahead - character to nil. At this point it is still possible to push back - characters.")) + Not a lot here, apart from a character scanner to read from and the + standard token scanner infrastructure.")) ;;;-------------------------------------------------------------------------- -;;; Formatting tokens. - -(defgeneric format-token (token-type &optional token-value) - (:documentation - "Return a string describing a token with the specified type and value.") - (:method ((token-type (eql :eof)) &optional token-value) - (declare (ignore token-value)) - "") - (:method ((token-type (eql :string)) &optional token-value) - (declare (ignore token-value)) - "") - (:method ((token-type (eql :char)) &optional token-value) - (declare (ignore token-value)) - "") - (:method ((token-type (eql :id)) &optional token-value) - (format nil "" token-value)) - (:method ((token-type symbol) &optional token-value) - (declare (ignore token-value)) - (check-type token-type keyword) - (format nil "`~(~A~)'" token-type)) - (:method ((token-type character) &optional token-value) - (declare (ignore token-value)) - (format nil "~:[<~:C>~;`~C'~]" - (and (graphic-char-p token-type) - (char/= token-type #\space)) - token-type))) - -;;;-------------------------------------------------------------------------- -;;; Reading and pushing back characters. - -(export 'next-char) -(defgeneric next-char (lexer) - (:documentation - "Fetch the next character from the LEXER's input stream. - - Read a character from the input stream, and store it in the LEXER's CHAR - slot. The character stored is returned. If characters have been pushed - back then pushed-back characters are used instead of the input stream. If - there are no more characters to be read then the lookahead character is - nil. Returns the new lookahead character. - - (This function is primarily intended for the use of lexer subclasses.)")) - -(export 'pushback-char) -(defgeneric pushback-char (lexer char) - (:documentation - "Push the CHAR back into the lexer. - - Make CHAR be the current lookahead character (stored in the LEXER's CHAR - slot). The previous lookahead character is pushed down, and will be made - available again once this character is consumed by NEXT-CHAR. - - (This function is primarily intended for the use of lexer subclasses.)")) - -(defgeneric fixup-stream* (lexer thunk) - (:documentation - "Helper function for `with-lexer-stream'. - - This function does the main work for `with-lexer-stream'. The THUNK is - invoked on a single argument, the LEXER's underlying STREAM.")) - -(export 'with-lexer-stream) -(defmacro with-lexer-stream ((streamvar lexer) &body body) - "Evaluate BODY with STREAMVAR bound to the LEXER's input stream. - - The STREAM is fixed up so that the next character read (e.g., using - `read-char') will be the lexer's current lookahead character. Once the - BODY completes, the next character in the stream is read and set as the - lookahead character. It is an error if the lexer has pushed-back - characters (since these can't be pushed back into the input stream - properly)." - - `(fixup-stream* ,lexer (lambda (,streamvar) ,@body))) +;;; Indicators and error messages. + +(defvar *indicator-map* (make-hash-table) + "Hash table mapping indicator objects to human-readable descriptions.") + +(export 'define-indicator) +(defun define-indicator (indicator description) + "Associate an INDICATOR with its textual DESCRIPTION. + + Updates the the `*indicator-map*'." + (setf (gethash indicator *indicator-map*) description) + indicator) + +(export 'syntax-error) +(defun syntax-error (scanner expected &key (continuep t)) + "Signal a (maybe) continuable syntax error." + (labels ((show-token (type value) + (if (characterp type) + (format nil "~/sod::show-char/" type) + (case type + (:id (format nil "" value)) + (:int "") + (:string "") + (:char "") + (:eof "") + (:ellipsis "`...'") + (t (format nil "" type value))))) + (show-expected (thing) + (acond ((gethash thing *indicator-map*) it) + ((atom thing) (show-token thing nil)) + ((eq (car thing) :id) + (format nil "`~A'" (cadr thing))) + (t (format nil "" thing))))) + (funcall (if continuep #'cerror* #'error) + "Syntax error: ~ + expected ~{~#[~;~A~;~A or ~A~:;~A, ~]~} ~ + but found ~A" + (mapcar #'show-expected expected) + (show-token (token-type scanner) (token-value scanner))))) + +(export 'lexer-error) +(defun lexer-error (char-scanner expected consumedp) + "Signal a continuable lexical error." + (cerror* "Lexical error: ~ + expected ~{~#[~;~A~;~A or ~A~;:~A, ~]~} ~ + but found ~/sod::show-char/~ + ~@[ at ~A~]" + (mapcar (lambda (exp) + (typecase exp + (character (format nil "~/sod::show-char/" exp)) + (string (format nil "`~A'" exp)) + ((cons (eql :digit) *) (format nil "" + (cadr exp))) + ((eql :eof) "") + ((eql :any) "") + (t (format nil "" exp)))) + expected) + (and (not (scanner-at-eof-p char-scanner)) + (scanner-current-char char-scanner)) + (and consumedp (file-location char-scanner)))) + +(defparse skip-until (:context (context token-scanner-context) + (&key (keep-end nil keep-end-p)) + &rest token-types) + "Discard tokens until we find one listed in TOKEN-TYPES. + + If KEEP-END is true then retain the found token for later; otherwise + discard it. KEEP-END defaults to true if multiple TOKEN-TYPES are given; + otherwise false. If end-of-file is encountered then the indicator list is + simply the list of TOKEN-TYPES; otherwise the result is `nil'." + `(skip-until ,(parser-scanner context) + (list ,@token-types) + :keep-end ,(if keep-end-p keep-end + (> (length token-types) 1)))) + +(defparse error (:context (context token-scanner-context) + (&key) sub &optional (recover t)) + "Try to parse SUB; if it fails then report an error, and parse RECOVER. + + This is the main way to recover from errors and continue parsing. Even + then, it's not especially brilliant. + + If the SUB parser succeeds then just propagate its result: it's like we + were never here. Otherwise, try to recover in a sensible way so we can + continue parsing. The details of this recovery are subject to change, but + the final action is generally to invoke the RECOVER parser and return its + result." + `(parse-error-recover ,(parser-scanner context) + (parser () ,sub) + (parser () ,recover))) ;;;-------------------------------------------------------------------------- -;;; Reading and pushing back tokens. - -(export 'scan-token) -(defgeneric scan-token (lexer) - (:documentation - "Internal protocol for scanning tokens from an input stream. - - Implementing a method on this function is the main responsibility of LEXER - subclasses; it is called by the user-facing `next-token' function. - - The method should consume characters (using `next-char') as necessary, and - return two values: a token type and token value. These will be stored in - the corresponding slots in the lexer object in order to provide the user - with one-token lookahead.")) - -(export 'next-token) -(defgeneric next-token (lexer) - (:documentation - "Scan a token from an input stream. - - This function scans a token from an input stream. Two values are - returned: a `token type' and a `token value'. These are opaque to the - LEXER base class, but the intent is that the token type be significant to - determining the syntax of the input, while the token value carries any - additional information about the token's semantic content. The token type - and token value are also made available for lookahead via accessors - TOKEN-TYPE and TOKEN-VALUE on the `lexer' object. - - The new lookahead token type and value are returned as two separate - values. - - If tokens have been pushed back (see `pushback-token') then they are - returned one by one instead of scanning the stream.")) - -(export 'pushback-token) -(defgeneric pushback-token (lexer token-type &optional token-value location) - (:documentation - "Push a token back into the lexer. - - Make the given TOKEN-TYPE and TOKEN-VALUE be the current lookahead token. - The previous lookahead token is pushed down, and will be made available - agan once this new token is consumed by NEXT-TOKEN. If LOCATION is - non-nil then `file-location' is saved and replaced by LOCATION. The - TOKEN-TYPE and TOKEN-VALUE can be anything at all: for instance, they need - not be values which can actually be returned by NEXT-TOKEN.")) - -;;;-------------------------------------------------------------------------- -;;; Utilities. - -(export 'skip-spaces) -(defgeneric skip-spaces (lexer) - (:documentation - "Skip over whitespace characters in the LEXER. - - There must be a lookahead character; when the function returns, the - lookahead character will be a non-whitespace character or nil if there - were no non-whitespace characters remaining. Returns the new lookahead - character.")) - -(export 'require-token) -(defun require-token - (lexer wanted-token-type &key (errorp t) (consumep t) default) - "Require a particular token to appear. - - If the LEXER's current lookahead token has type `wanted-token-type' then - consume it (using `next-token') and return its value. Otherwise, if the - token doesn't have the requested type then signal a continuable error - describing the situation and return DEFAULT (which defaults to nil). - - If ERRORP is false then no error is signalled; this is useful for - consuming or checking for optional punctuation. If CONSUMEP is false then - a matching token is not consumed; non-matching tokens are never consumed." - - (with-slots (token-type token-value) lexer - (cond ((eql token-type wanted-token-type) - (prog1 token-value - (when consumep (next-token lexer)))) - (errorp - (cerror* "Expected ~A but found ~A" - (format-token wanted-token-type) - (format-token token-type token-value)) - default) - (t - default)))) +;;; Lexical analysis utilities. + +(defun scan-comment (char-scanner) + "Scan a comment (either `/* ... */' or `// ...') from CHAR-SCANNER. + + The result isn't interesting." + (with-parser-context (character-scanner-context :scanner char-scanner) + (parse (or (and "/*" + (and (skip-many () + (and (skip-many () (not #\*)) + (label "*/" (skip-many (:min 1) #\*))) + (not #\/)) + #\/)) + (and "//" + (skip-many () (not #\newline)) + (? #\newline)))))) ;;;----- That's all, folks --------------------------------------------------