X-Git-Url: https://git.distorted.org.uk/~mdw/sod/blobdiff_plain/aa14a4cddcb96b681d5c19a2ec8bad382f43b264..12949379840101e2d65883f29c5e8f0f6de49e9c:/src/fragment-parse.lisp

diff --git a/src/fragment-parse.lisp b/src/fragment-parse.lisp
index 5f58885..fcaa92e 100644
--- a/src/fragment-parse.lisp
+++ b/src/fragment-parse.lisp
@@ -7,7 +7,7 @@
 
 ;;;----- Licensing notice ---------------------------------------------------
 ;;;
-;;; This file is part of the Sensble Object Design, an object system for C.
+;;; This file is part of the Sensible Object Design, an object system for C.
 ;;;
 ;;; SOD is free software; you can redistribute it and/or modify
 ;;; it under the terms of the GNU General Public License as published by
@@ -32,15 +32,17 @@
 (defun scan-c-fragment (scanner end-chars)
   "Parse a C fragment from the SCANNER.
 
-   SCANNER must be a `sod-token-scanner' instance.
+   SCANNER must be a `sod-token-scanner' instance.  The END-CHARS are a
+   sequence of characters, any of which delimits the fragment.  The
+   delimiting character is left current in the scanner.
 
    The parsing process is a simple approximation to C lexical analysis.  It
    takes into account comments (both C and C++ style), string and character
    literals."
 
-  (let ((char-scanner (token-scanner-char-scanner scanner))
-	(delim nil)
-	(stack nil))
+  (let* ((char-scanner (token-scanner-char-scanner scanner))
+	 (delim-match nil) (delim-found nil) (delim-loc nil)
+	 (stack nil) (start nil) (tokstart nil) (eofwhine t))
     (with-parser-context (character-scanner-context :scanner char-scanner)
 
       ;; Hack.  If the first character is a newline then discard it
@@ -49,11 +51,20 @@
       (parse #\newline)
 
       ;; This seems the easiest way of gathering stuff.
+      (setf start (file-location char-scanner))
       (with-scanner-place (place char-scanner)
 
-	(flet ((push-delim (d)
-		 (push delim stack)
-		 (setf delim d))
+	(flet ((push-delim (found match)
+		 (push (list delim-found delim-match delim-loc) stack)
+		 (setf delim-found found
+		       delim-match match
+		       delim-loc tokstart))
+
+	       (pop-delim ()
+		 (destructuring-bind (found match loc) (pop stack)
+		   (setf delim-found found
+			 delim-match match
+			 delim-loc loc)))
 
 	       (result ()
 		 (let* ((output (scanner-interval char-scanner place))
@@ -69,38 +80,68 @@
 
 	  ;; March through characters until we reach the end.
 	  (loop
+	    (setf tokstart (file-location char-scanner))
 	    (cond-parse (:consumedp cp :expected exp)
 
 	      ;; Whitespace and comments are universally dull.
 	      ((satisfies whitespace-char-p) (parse :whitespace))
 	      ((scan-comment char-scanner))
 
-	      ;; See if we've reached the end.  There's a small trick here: I
-	      ;; capture the result in the `if-char' consequent to ensure
-	      ;; that we don't include the delimiter.
-	      ((if-char () (and (null delim) (member it end-chars))
-		 (values (result) t t)
-		 (values end-chars nil nil))
+	      ;; See if we've reached the end.  We must leave the delimiter
+	      ;; in the scanner, so `if-char' and its various friends aren't
+	      ;; appropriate.
+	      ((lisp (if (and (null delim-match)
+			      (not (scanner-at-eof-p char-scanner))
+			      (member (scanner-current-char char-scanner)
+				      end-chars))
+			 (values (result) t t)
+			 (values end-chars nil nil)))
 	       (return (values it t t)))
 	      (:eof
-	       (lexer-error char-scanner '(:any) cp)
+	       (when eofwhine
+		 (lexer-error char-scanner nil))
+	       (loop
+		 (unless delim-found (return))
+		 (info-with-location delim-loc
+				     "Unmatched `~C' found here" delim-found)
+		 (pop-delim))
+	       (info-with-location start "C fragment started here")
 	       (return (values (result) t t)))
 
 	      ;; Opening and closing brackets.  Opening brackets push things
-	      ;; onto a stack; closing brackets pop things off again.
-	      (#\( (push-delim #\)))
-	      (#\[ (push-delim #\]))
-	      (#\{ (push-delim #\}))
-	      ((or #\) #\] #\})
-	       (if (eql it delim)
-		   (setf delim (pop stack))
-		   (cerror* "Unmatched `~C.'." it)))
+	      ;; onto a stack; closing brackets pop things off again.  Pop a
+	      ;; bracket even if it doesn't match, to encourage progress
+	      ;; towards finding an end-delimiter.
+	      (#\( (push-delim #\( #\)))
+	      (#\[ (push-delim #\[ #\]))
+	      (#\{ (push-delim #\{ #\}))
+	      ((lisp (let ((char (scanner-current-char char-scanner)))
+		       (case char
+			 ((#\) #\] #\})
+			  (unless (eql char delim-match)
+			    (lexer-error char-scanner
+					 (and delim-match
+					      (list delim-match)))
+			    (when delim-loc
+			      (info-with-location
+			       delim-loc
+			       "Mismatched `~C' found here" delim-found)))
+			  (scanner-step char-scanner)
+			  (when delim-match (pop-delim))
+			  (values char t t))
+			 (t
+			  (values '(#\) #\] #\}) nil nil))))))
 
 	      ;; String and character literals.
 	      ((seq ((quote (or #\" #\'))
 		     (nil (skip-many ()
-			      (or (and #\\ :any) (not quote))))
-		     (nil (char quote)))))
+			    (or (and #\\ :any) (not quote))))
+		     (nil (or (char quote)
+			      (seq (:eof)
+				(lexer-error char-scanner (list quote))
+				(info-with-location tokstart
+						    "Literal started here")
+				(setf eofwhine nil)))))))
 
 	      ;; Anything else.
 	      (:any)
@@ -108,24 +149,39 @@
 	      ;; This really shouldn't be able to happen.
 	      (t
 	       (assert cp)
-	       (lexer-error char-scanner exp cp)))))))))
+	       (when (scanner-at-eof-p char-scanner)
+		 (setf eofwhine nil))
+	       (lexer-error char-scanner exp)))))))))
 
 (export 'parse-delimited-fragment)
-(defun parse-delimited-fragment (scanner begin end)
+(defun parse-delimited-fragment (scanner begin end &key keep-end)
   "Parse a C fragment delimited by BEGIN and END.
 
-   The BEGIN and END arguments are characters.  (Currently, BEGIN can be any
-  token type, but you probably shouldn't rely on this.)"
+   The BEGIN and END arguments are the start and end delimiters.  BEGIN can
+   be any token type, but is usually a delimiter character; it may also be t
+   to mean `don't care' -- but there must be an initial token of some kind
+   for annoying technical reasons.  END may be either a character or a list
+   of characters.  If KEEP-END is true, the trailing delimiter is left in the
+   token scanner so that it's available for further parsing decisions: this
+   is probably what you want if END is a list."
 
   ;; This is decidedly nasty.  The basic problem is that `scan-c-fragment'
   ;; works at the character level rather than at the lexical level, and if we
-  ;; commit to the `[' too early then `scanner-step' will eat the first few
-  ;; characters of the fragment -- and then the rest of the parse will get
-  ;; horrifically confused.
-
-  (if (eql (token-type scanner) begin)
-      (multiple-value-prog1 (values (scan-c-fragment scanner (list end)) t t)
-	(scanner-step scanner))
+  ;; commit to the BEGIN character too early then `scanner-step' will eat the
+  ;; first few characters of the fragment -- and then the rest of the parse
+  ;; will get horrifically confused.
+
+  (if (if (eq begin t)
+	  (not (scanner-at-eof-p scanner))
+	  (eql (token-type scanner) begin))
+      (multiple-value-prog1
+	  (values (scan-c-fragment scanner
+				   (if (listp end) end
+				       (list end)))
+		  t
+		  t)
+	(scanner-step scanner)
+	(unless keep-end (scanner-step scanner)))
       (values (list begin) nil nil)))
 
 ;;;----- That's all, folks --------------------------------------------------