mdw@git.distorted.org.uk Git - sod/blame_incremental - src/impl-lexer.lisp

;;; -*-lisp-*-
;;;
;;; Implementation of lexical analysis protocol.
;;;
;;; (c) 2009 Straylight/Edgeware
;;;

;;;----- Licensing notice ---------------------------------------------------
;;;
;;; This file is part of the Sensble Object Design, an object system for C.
;;;
;;; SOD is free software; you can redistribute it and/or modify
;;; it under the terms of the GNU General Public License as published by
;;; the Free Software Foundation; either version 2 of the License, or
;;; (at your option) any later version.
;;;
;;; SOD is distributed in the hope that it will be useful,
;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;; GNU General Public License for more details.
;;;
;;; You should have received a copy of the GNU General Public License
;;; along with SOD; if not, write to the Free Software Foundation,
;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.

(cl:in-package #:sod)

;;;--------------------------------------------------------------------------
;;; Basic lexical analyser.

(defstruct (pushed-token
	     (:constructor make-pushed-token (type value location)))
  "A token that has been pushed back into a lexer for later processing."
  type value location)

;;; Class definition.

(export 'basic-lexer)
(defclass basic-lexer ()
  ((stream :initarg :stream :type stream :reader lexer-stream)
   (char :initform nil :type (or character null) :reader lexer-char)
   (pushback-chars :initform nil :type list)
   (token-type :initform nil :accessor token-type)
   (token-value :initform nil :accessor token-value)
   (location :initform nil :reader file-location)
   (pushback-tokens :initform nil :type list))
  (:documentation
   "Base class for lexical analysers.

   The lexer reads characters from STREAM, which, for best results, wants to
   be a POSITION-AWARE-INPUT-STREAM.

   The lexer provides one-character lookahead by default: the current
   lookahead character is available to subclasses in the slot CHAR.  Before
   beginning lexical analysis, the lookahead character needs to be
   established with NEXT-CHAR.  If one-character lookahead is insufficient,
   the analyser can push back an arbitrary number of characters using
   PUSHBACK-CHAR.

   The NEXT-TOKEN function scans and returns the next token from the STREAM,
   and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing one-token
   lookahead.  A parser using the lexical analyser can push back tokens using
   PUSHBACK-TOKENS.

   For convenience, the lexer implements a FILE-LOCATION method (delegated to
   the underlying stream)."))

;;; Reading and pushing back characters.

(defmethod next-char ((lexer basic-lexer))
  (with-slots (stream char pushback-chars) lexer
    (setf char (if pushback-chars
		   (pop pushback-chars)
		   (read-char stream nil)))))

(defmethod pushback-char ((lexer basic-lexer) new-char)
  (with-slots (char pushback-chars) lexer
    (push char pushback-chars)
    (setf char new-char)))

(defmethod fixup-stream* ((lexer basic-lexer) thunk)
  (with-slots (stream char pushback-chars) lexer
    (when pushback-chars
      (error "Lexer has pushed-back characters."))
    (when (slot-boundp lexer 'char)
      (unread-char char stream))
    (unwind-protect
	 (funcall thunk stream)
      (setf char (read-char stream nil)))))

;;; Reading and pushing back tokens.

(defmethod next-token :around ((lexer basic-lexer))
  (unless (slot-boundp lexer 'char)
    (next-char lexer)))

(defmethod next-token ((lexer basic-lexer))
  (with-slots (pushback-tokens token-type token-value location) lexer
    (setf (values token-type token-value)
	  (if pushback-tokens
	      (let ((pushback (pop pushback-tokens)))
		(setf location (pushed-token-location pushback))
		(values (pushed-token-type pushback)
			(pushed-token-value pushback)))
	      (scan-token lexer)))))

(defmethod scan-token :around ((lexer basic-lexer))
  (with-default-error-location (lexer)
    (call-next-method)))

(defmethod pushback-token ((lexer basic-lexer) new-token-type
			   &optional new-token-value new-location)
  (with-slots (pushback-tokens token-type token-value location) lexer
    (push (make-pushed-token token-type token-value location)
	  pushback-tokens)
    (when new-location (setf location new-location))
    (setf token-type new-token-type
	  token-value new-token-value)))

;;; Utilities.

(defmethod skip-spaces ((lexer basic-lexer))
  (do ((ch (lexer-char lexer) (next-char lexer)))
      ((not (whitespace-char-p ch)) ch)))

;;;--------------------------------------------------------------------------
;;; Our main lexer.

(export 'sod-lexer)
(defclass sod-lexer (basic-lexer)
  ()
  (:documentation
   "Lexical analyser for the SOD lanuage.

   See the LEXER class for the gory details about the lexer protocol."))

(defmethod scan-token ((lexer sod-lexer))
  (with-slots (stream char keywords location) lexer
    (prog (ch)

     consider

       ;; Stash the position of this token so that we can report it later.
       (setf ch (skip-spaces lexer)
	     location (file-location stream))

       ;; Now work out what it is that we're dealing with.
       (cond

	 ;; End-of-file brings its own peculiar joy.
	 ((null ch) (return (values :eof t)))

	 ;; Strings.
	 ((or (char= ch #\") (char= ch #\'))
	  (let* ((quote ch)
		 (string
		  (with-output-to-string (out)
		    (loop
		      (flet ((getch ()
			       (setf ch (next-char lexer))
			       (when (null ch)
				 (cerror* "Unexpected end of file in ~
					   ~:[string~;character~] constant"
					  (char= quote #\'))
				 (return))))
			(getch)
			(cond ((char= ch quote) (return))
			      ((char= ch #\\) (getch)))
			(write-char ch out))))))
	    (setf ch (next-char lexer))
	    (ecase quote
	      (#\" (return (values :string string)))
	      (#\' (case (length string)
		     (0 (cerror* "Empty character constant")
			(return (values :char #\?)))
		     (1 (return (values :char (char string 0))))
		     (t (cerror* "Multiple characters in character constant")
			(return (values :char (char string 0)))))))))

	 ;; Pick out identifiers and keywords.
	 ((or (alpha-char-p ch) (char= ch #\_))

	  ;; Scan a sequence of alphanumerics and underscores.  We could
	  ;; allow more interesting identifiers, but it would damage our C
	  ;; lexical compatibility.
	  (let ((id (with-output-to-string (out)
		      (loop
			(write-char ch out)
			(setf ch (next-char lexer))
			(when (or (null ch)
				  (not (or (alphanumericp ch)
					   (char= ch #\_))))
			  (return))))))

	    ;; Done.
	    (return (values :id id))))

	 ;; Pick out numbers.  Currently only integers, but we support
	 ;; multiple bases.
	 ((digit-char-p ch)

	  ;; Sort out the prefix.  If we're looking at `0b', `0o' or `0x'
	  ;; (maybe uppercase) then we've got a funny radix to deal with.
	  ;; Otherwise, a leading zero signifies octal (daft, I know), else
	  ;; we're left with decimal.
	  (multiple-value-bind (radix skip-char)
	      (if (char/= ch #\0)
		  (values 10 nil)
		  (case (and (setf ch (next-char lexer))
			     (char-downcase ch))
		    (#\b (values 2 t))
		    (#\o (values 8 t))
		    (#\x (values 16 t))
		    (t (values 8 nil))))

	    ;; If we last munched an interesting letter, we need to skip over
	    ;; it.  That's what the SKIP-CHAR flag is for.
	    ;;
	    ;; Danger, Will Robinson!  If we're just about to eat a radix
	    ;; letter, then the next thing must be a digit.  For example,
	    ;; `0xfatenning' parses as a hex number followed by an identifier
	    ;; `0xfa ttening', but `0xturning' is an octal number followed by
	    ;; an identifier `0 xturning'.
	    (when skip-char
	      (let ((peek (next-char lexer)))
		(unless (digit-char-p peek radix)
		  (pushback-char lexer ch)
		  (return-from scan-token (values :integer 0)))
		(setf ch peek)))

	    ;; Scan an integer.  While there are digits, feed them into the
	    ;; accumulator.
	    (do ((accum 0 (+ (* accum radix) digit))
		 (digit (and ch (digit-char-p ch radix))
			(and ch (digit-char-p ch radix))))
		((null digit) (return-from scan-token
				(values :integer accum)))
	      (setf ch (next-char lexer)))))

	 ;; A slash might be the start of a comment.
	 ((char= ch #\/)
	  (setf ch (next-char lexer))
	  (case ch

	    ;; Comment up to the end of the line.
	    (#\/
	     (loop
	       (setf ch (next-char lexer))
	       (when (or (null ch) (char= ch #\newline))
		 (go scan))))

	    ;; Comment up to the next `*/'.
	    (#\*
	     (tagbody
	      top
		(case (setf ch (next-char lexer))
		  (#\* (go star))
		  ((nil) (go done))
		  (t (go top)))
	      star
		(case (setf ch (next-char lexer))
		  (#\* (go star))
		  (#\/ (setf ch (next-char lexer))
		       (go done))
		  ((nil) (go done))
		  (t (go top)))
	      done)
	     (go consider))

	    ;; False alarm.  (The next character is already set up.)
	    (t
	     (return (values #\/ t)))))

	 ;; A dot: might be `...'.  Tread carefully!  We need more lookahead
	 ;; than is good for us.
	 ((char= ch #\.)
	  (setf ch (next-char lexer))
	  (cond ((eql ch #\.)
		 (setf ch (next-char lexer))
		 (cond ((eql ch #\.) (return (values :ellipsis nil)))
		       (t (pushback-char lexer #\.)
			  (return (values #\. t)))))
		(t
		 (return (values #\. t)))))

	 ;; Anything else is a lone delimiter.
	 (t
	  (return (multiple-value-prog1
		      (values ch t)
		    (next-char lexer)))))

     scan
       ;; Scan a new character and try again.
       (setf ch (next-char lexer))
       (go consider))))

;;;----- That's all, folks --------------------------------------------------
Commit	Line	Data
	1	;;; --lisp--
	2	;;;
	3	;;; Implementation of lexical analysis protocol.
	4	;;;
	5	;;; (c) 2009 Straylight/Edgeware
	6	;;;
	7
	8	;;;----- Licensing notice ---------------------------------------------------
	9	;;;
	10	;;; This file is part of the Sensble Object Design, an object system for C.
	11	;;;
	12	;;; SOD is free software; you can redistribute it and/or modify
	13	;;; it under the terms of the GNU General Public License as published by
	14	;;; the Free Software Foundation; either version 2 of the License, or
	15	;;; (at your option) any later version.
	16	;;;
	17	;;; SOD is distributed in the hope that it will be useful,
	18	;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
	19	;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	20	;;; GNU General Public License for more details.
	21	;;;
	22	;;; You should have received a copy of the GNU General Public License
	23	;;; along with SOD; if not, write to the Free Software Foundation,
	24	;;; Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
	25
	26	(cl:in-package #:sod)
	27
	28	;;;--------------------------------------------------------------------------
	29	;;; Basic lexical analyser.
	30
	31	(defstruct (pushed-token
	32	(:constructor make-pushed-token (type value location)))
	33	"A token that has been pushed back into a lexer for later processing."
	34	type value location)
	35
	36	;;; Class definition.
	37
	38	(export 'basic-lexer)
	39	(defclass basic-lexer ()
	40	((stream :initarg :stream :type stream :reader lexer-stream)
	41	(char :initform nil :type (or character null) :reader lexer-char)
	42	(pushback-chars :initform nil :type list)
	43	(token-type :initform nil :accessor token-type)
	44	(token-value :initform nil :accessor token-value)
	45	(location :initform nil :reader file-location)
	46	(pushback-tokens :initform nil :type list))
	47	(:documentation
	48	"Base class for lexical analysers.
	49
	50	The lexer reads characters from STREAM, which, for best results, wants to
	51	be a POSITION-AWARE-INPUT-STREAM.
	52
	53	The lexer provides one-character lookahead by default: the current
	54	lookahead character is available to subclasses in the slot CHAR. Before
	55	beginning lexical analysis, the lookahead character needs to be
	56	established with NEXT-CHAR. If one-character lookahead is insufficient,
	57	the analyser can push back an arbitrary number of characters using
	58	PUSHBACK-CHAR.
	59
	60	The NEXT-TOKEN function scans and returns the next token from the STREAM,
	61	and makes it available as TOKEN-TYPE and TOKEN-VALUE, providing one-token
	62	lookahead. A parser using the lexical analyser can push back tokens using
	63	PUSHBACK-TOKENS.
	64
	65	For convenience, the lexer implements a FILE-LOCATION method (delegated to
	66	the underlying stream)."))
	67
	68	;;; Reading and pushing back characters.
	69
	70	(defmethod next-char ((lexer basic-lexer))
	71	(with-slots (stream char pushback-chars) lexer
	72	(setf char (if pushback-chars
	73	(pop pushback-chars)
	74	(read-char stream nil)))))
	75
	76	(defmethod pushback-char ((lexer basic-lexer) new-char)
	77	(with-slots (char pushback-chars) lexer
	78	(push char pushback-chars)
	79	(setf char new-char)))
	80
	81	(defmethod fixup-stream* ((lexer basic-lexer) thunk)
	82	(with-slots (stream char pushback-chars) lexer
	83	(when pushback-chars
	84	(error "Lexer has pushed-back characters."))
	85	(when (slot-boundp lexer 'char)
	86	(unread-char char stream))
	87	(unwind-protect
	88	(funcall thunk stream)
	89	(setf char (read-char stream nil)))))
	90
	91	;;; Reading and pushing back tokens.
	92
	93	(defmethod next-token :around ((lexer basic-lexer))
	94	(unless (slot-boundp lexer 'char)
	95	(next-char lexer)))
	96
	97	(defmethod next-token ((lexer basic-lexer))
	98	(with-slots (pushback-tokens token-type token-value location) lexer
	99	(setf (values token-type token-value)
	100	(if pushback-tokens
	101	(let ((pushback (pop pushback-tokens)))
	102	(setf location (pushed-token-location pushback))
	103	(values (pushed-token-type pushback)
	104	(pushed-token-value pushback)))
	105	(scan-token lexer)))))
	106
	107	(defmethod scan-token :around ((lexer basic-lexer))
	108	(with-default-error-location (lexer)
	109	(call-next-method)))
	110
	111	(defmethod pushback-token ((lexer basic-lexer) new-token-type
	112	&optional new-token-value new-location)
	113	(with-slots (pushback-tokens token-type token-value location) lexer
	114	(push (make-pushed-token token-type token-value location)
	115	pushback-tokens)
	116	(when new-location (setf location new-location))
	117	(setf token-type new-token-type
	118	token-value new-token-value)))
	119
	120	;;; Utilities.
	121
	122	(defmethod skip-spaces ((lexer basic-lexer))
	123	(do ((ch (lexer-char lexer) (next-char lexer)))
	124	((not (whitespace-char-p ch)) ch)))
	125
	126	;;;--------------------------------------------------------------------------
	127	;;; Our main lexer.
	128
	129	(export 'sod-lexer)
	130	(defclass sod-lexer (basic-lexer)
	131	()
	132	(:documentation
	133	"Lexical analyser for the SOD lanuage.
	134
	135	See the LEXER class for the gory details about the lexer protocol."))
	136
	137	(defmethod scan-token ((lexer sod-lexer))
	138	(with-slots (stream char keywords location) lexer
	139	(prog (ch)
	140
	141	consider
	142
	143	;; Stash the position of this token so that we can report it later.
	144	(setf ch (skip-spaces lexer)
	145	location (file-location stream))
	146
	147	;; Now work out what it is that we're dealing with.
	148	(cond
	149
	150	;; End-of-file brings its own peculiar joy.
	151	((null ch) (return (values :eof t)))
	152
	153	;; Strings.
	154	((or (char= ch #\") (char= ch #\'))
	155	(let* ((quote ch)
	156	(string
	157	(with-output-to-string (out)
	158	(loop
	159	(flet ((getch ()
	160	(setf ch (next-char lexer))
	161	(when (null ch)
	162	(cerror* "Unexpected end of file in ~
	163	~:[string~;character~] constant"
	164	(char= quote #\'))
	165	(return))))
	166	(getch)
	167	(cond ((char= ch quote) (return))
	168	((char= ch #\\) (getch)))
	169	(write-char ch out))))))
	170	(setf ch (next-char lexer))
	171	(ecase quote
	172	(#\" (return (values :string string)))
	173	(#\' (case (length string)
	174	(0 (cerror* "Empty character constant")
	175	(return (values :char #\?)))
	176	(1 (return (values :char (char string 0))))
	177	(t (cerror* "Multiple characters in character constant")
	178	(return (values :char (char string 0)))))))))
	179
	180	;; Pick out identifiers and keywords.
	181	((or (alpha-char-p ch) (char= ch #\_))
	182
	183	;; Scan a sequence of alphanumerics and underscores. We could
	184	;; allow more interesting identifiers, but it would damage our C
	185	;; lexical compatibility.
	186	(let ((id (with-output-to-string (out)
	187	(loop
	188	(write-char ch out)
	189	(setf ch (next-char lexer))
	190	(when (or (null ch)
	191	(not (or (alphanumericp ch)
	192	(char= ch #\_))))
	193	(return))))))
	194
	195	;; Done.
	196	(return (values :id id))))
	197
	198	;; Pick out numbers. Currently only integers, but we support
	199	;; multiple bases.
	200	((digit-char-p ch)
	201
	202	;; Sort out the prefix. If we're looking at `0b', `0o' or `0x'
	203	;; (maybe uppercase) then we've got a funny radix to deal with.
	204	;; Otherwise, a leading zero signifies octal (daft, I know), else
	205	;; we're left with decimal.
	206	(multiple-value-bind (radix skip-char)
	207	(if (char/= ch #\0)
	208	(values 10 nil)
	209	(case (and (setf ch (next-char lexer))
	210	(char-downcase ch))
	211	(#\b (values 2 t))
	212	(#\o (values 8 t))
	213	(#\x (values 16 t))
	214	(t (values 8 nil))))
	215
	216	;; If we last munched an interesting letter, we need to skip over
	217	;; it. That's what the SKIP-CHAR flag is for.
	218	;;
	219	;; Danger, Will Robinson! If we're just about to eat a radix
	220	;; letter, then the next thing must be a digit. For example,
	221	;; `0xfatenning' parses as a hex number followed by an identifier
	222	;; `0xfa ttening', but `0xturning' is an octal number followed by
	223	;; an identifier `0 xturning'.
	224	(when skip-char
	225	(let ((peek (next-char lexer)))
	226	(unless (digit-char-p peek radix)
	227	(pushback-char lexer ch)
	228	(return-from scan-token (values :integer 0)))
	229	(setf ch peek)))
	230
	231	;; Scan an integer. While there are digits, feed them into the
	232	;; accumulator.
	233	(do ((accum 0 (+ (* accum radix) digit))
	234	(digit (and ch (digit-char-p ch radix))
	235	(and ch (digit-char-p ch radix))))
	236	((null digit) (return-from scan-token
	237	(values :integer accum)))
	238	(setf ch (next-char lexer)))))
	239
	240	;; A slash might be the start of a comment.
	241	((char= ch #\/)
	242	(setf ch (next-char lexer))
	243	(case ch
	244
	245	;; Comment up to the end of the line.
	246	(#\/
	247	(loop
	248	(setf ch (next-char lexer))
	249	(when (or (null ch) (char= ch #\newline))
	250	(go scan))))
	251
	252	;; Comment up to the next `*/'.
	253	(#\*
	254	(tagbody
	255	top
	256	(case (setf ch (next-char lexer))
	257	(#\* (go star))
	258	((nil) (go done))
	259	(t (go top)))
	260	star
	261	(case (setf ch (next-char lexer))
	262	(#\* (go star))
	263	(#\/ (setf ch (next-char lexer))
	264	(go done))
	265	((nil) (go done))
	266	(t (go top)))
	267	done)
	268	(go consider))
	269
	270	;; False alarm. (The next character is already set up.)
	271	(t
	272	(return (values #\/ t)))))
	273
	274	;; A dot: might be `...'. Tread carefully! We need more lookahead
	275	;; than is good for us.
	276	((char= ch #\.)
	277	(setf ch (next-char lexer))
	278	(cond ((eql ch #\.)
	279	(setf ch (next-char lexer))
	280	(cond ((eql ch #\.) (return (values :ellipsis nil)))
	281	(t (pushback-char lexer #\.)
	282	(return (values #\. t)))))
	283	(t
	284	(return (values #\. t)))))
	285
	286	;; Anything else is a lone delimiter.
	287	(t
	288	(return (multiple-value-prog1
	289	(values ch t)
	290	(next-char lexer)))))
	291
	292	scan
	293	;; Scan a new character and try again.
	294	(setf ch (next-char lexer))
	295	(go consider))))
	296
	297	;;;----- That's all, folks --------------------------------------------------