doc/syntax.tex: Put the `arbitrary code execution' warning in a box.
[sod] / doc / syntax.tex
CommitLineData
1f7d590d
MW
1%%% -*-latex-*-
2%%%
3%%% Module syntax
4%%%
5%%% (c) 2015 Straylight/Edgeware
6%%%
7
8%%%----- Licensing notice ---------------------------------------------------
9%%%
10%%% This file is part of the Sensible Object Design, an object system for C.
11%%%
12%%% SOD is free software; you can redistribute it and/or modify
13%%% it under the terms of the GNU General Public License as published by
14%%% the Free Software Foundation; either version 2 of the License, or
15%%% (at your option) any later version.
16%%%
17%%% SOD is distributed in the hope that it will be useful,
18%%% but WITHOUT ANY WARRANTY; without even the implied warranty of
19%%% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20%%% GNU General Public License for more details.
21%%%
22%%% You should have received a copy of the GNU General Public License
23%%% along with SOD; if not, write to the Free Software Foundation,
24%%% Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25
26\chapter{Module syntax} \label{ch:syntax}
27
28%%%--------------------------------------------------------------------------
29
30Fortunately, Sod is syntactically quite simple. I've used a little slightly
31unusual notation in order to make the presentation easier to read. For any
32nonterminal $x$:
33\begin{itemize}
34\item $\epsilon$ denotes the empty nonterminal:
35 \begin{quote}
36 $\epsilon$ ::=
37 \end{quote}
38\item @[$x$@] means an optional $x$:
39 \begin{quote}
40 \syntax{@[$x$@] ::= $\epsilon$ @! $x$}
41 \end{quote}
42\item $x^*$ means a sequence of zero or more $x$s:
43 \begin{quote}
44 \syntax{$x^*$ ::= $\epsilon$ @! $x^*$ $x$}
45 \end{quote}
46\item $x^+$ means a sequence of one or more $x$s:
47 \begin{quote}
48 \syntax{$x^+$ ::= $x$ $x^*$}
49 \end{quote}
50\item $x$@<-list> means a sequence of one or more $x$s separated
51 by commas:
52 \begin{quote}
53 \syntax{$x$<-list> ::= $x$ @! $x$<-list> "," $x$}
54 \end{quote}
55\end{itemize}
56
57\subsection{Lexical syntax}
58\label{sec:syntax.lex}
59
60Whitespace and comments are discarded. The remaining characters are
61collected into tokens according to the following syntax.
62
63\begin{grammar}
64<token> ::= <identifier>
65\alt <string-literal>
66\alt <char-literal>
67\alt <integer-literal>
68\alt <punctuation>
69\end{grammar}
70
71This syntax is slightly ambiguous, and is disambiguated by the \emph{maximal
72munch} rule: at each stage we take the longest sequence of characters which
73could be a token.
74
75\subsubsection{Identifiers} \label{sec:syntax.lex.id}
76
77\begin{grammar}
78<identifier> ::= <id-start-char> @<id-body-char>^*
79
80<id-start-char> ::= <alpha-char> | "_"
81
82<id-body-char> ::= <id-start-char> @! <digit-char>
83
84<alpha-char> ::= "A" | "B" | \dots\ | "Z"
85\alt "a" | "b" | \dots\ | "z"
86\alt <extended-alpha-char>
87
88<digit-char> ::= "0" | <nonzero-digit-char>
89
90<nonzero-digit-char> ::= "1" | "2" $| \cdots |$ "9"
91\end{grammar}
92
93The precise definition of @<alpha-char> is left to the function
94\textsf{alpha-char-p} in the hosting Lisp system. For portability,
95programmers are encouraged to limit themselves to the standard ASCII letters.
96
97There are no reserved words at the lexical level, but the higher-level syntax
98recognizes certain identifiers as \emph{keywords} in some contexts. There is
99also an ambiguity (inherited from C) in the declaration syntax which is
100settled by distinguishing type names from other identifiers at a lexical
101level.
102
103\subsubsection{String and character literals} \label{sec:syntax.lex.string}
104
105\begin{grammar}
106<string-literal> ::= "\"" @<string-literal-char>^* "\""
107
108<char-literal> ::= "'" <char-literal-char> "'"
109
110<string-literal-char> ::= any character other than "\\" or "\""
111\alt "\\" <char>
112
113<char-literal-char> ::= any character other than "\\" or "'"
114\alt "\\" <char>
115
116<char> ::= any single character
117\end{grammar}
118
119The syntax for string and character literals differs from~C. In particular,
120escape sequences such as @`\textbackslash n' are not recognized. The use
121of string and character literals in Sod, outside of C~fragments, is limited,
122and the simple syntax seems adequate. For the sake of future compatibility,
123the use of character sequences which resemble C escape sequences is
124discouraged.
125
126\subsubsection{Integer literals} \label{sec:syntax.lex.int}
127
128\begin{grammar}
129<integer-literal> ::= <decimal-integer>
130\alt <binary-integer>
131\alt <octal-integer>
132\alt <hex-integer>
133
134<decimal-integer> ::= <nonzero-digit-char> @<digit-char>^*
135
136<binary-integer> ::= "0" @("b"|"B"@) @<binary-digit-char>^+
137
138<binary-digit-char> ::= "0" | "1"
139
140<octal-integer> ::= "0" @["o"|"O"@] @<octal-digit-char>^+
141
142<octal-digit-char> ::= "0" | "1" $| \cdots |$ "7"
143
144<hex-integer> ::= "0" @("x"|"X"@) @<hex-digit-char>^+
145
146<hex-digit-char> ::= <digit-char>
147\alt "A" | "B" | "C" | "D" | "E" | "F"
148\alt "a" | "b" | "c" | "d" | "e" | "f"
149\end{grammar}
150
151Sod understands only integers, not floating-point numbers; its integer syntax
152goes slightly beyond C in allowing a @`0o' prefix for octal and @`0b' for
153binary. However, length and signedness indicators are not permitted.
154
155\subsubsection{Punctuation} \label{sec:syntax.lex.punct}
156
157\begin{grammar}
158<punctuation> ::= any nonalphanumeric character other than "_", "\"" or "'"
159\end{grammar}
160
161\subsubsection{Comments} \label{sec:lex-comment}
162
163\begin{grammar}
164<comment> ::= <block-comment>
165\alt <line-comment>
166
167<block-comment> ::=
168 "/*"
169 @<not-star>^* @(@<star>^+ <not-star-or-slash> @<not-star>^*@)^*
170 @<star>^*
171 "*/"
172
173<star> ::= "*"
174
175<not-star> ::= any character other than "*"
176
177<not-star-or-slash> ::= any character other than "*" or "/"
178
179<line-comment> ::= "//" @<not-newline>^* <newline>
180
181<newline> ::= a newline character
182
183<not-newline> ::= any character other than newline
184\end{grammar}
185
186Comments are exactly as in C99: both traditional block comments `\texttt{/*}
187\dots\ \texttt{*/}' and \Cplusplus-style `\texttt{//} \dots' comments are
188permitted and ignored.
189
190\subsection{Special nonterminals}
191\label{sec:special-nonterminals}
192
193Aside from the lexical syntax presented above (\xref{sec:syntax.lex}),
194two special nonterminals occur in the module syntax.
195
196\subsubsection{S-expressions} \label{sec:syntax-sexp}
197
198\begin{grammar}
199<s-expression> ::= an S-expression, as parsed by the Lisp reader
200\end{grammar}
201
202When an S-expression is expected, the Sod parser simply calls the host Lisp
203system's \textsf{read} function. Sod modules are permitted to modify the
204read table to extend the S-expression syntax.
205
206S-expressions are self-delimiting, so no end-marker is needed.
207
208\subsubsection{C fragments} \label{sec:syntax.lex.cfrag}
209
210\begin{grammar}
211<c-fragment> ::= a sequence of C tokens, with matching brackets
212\end{grammar}
213
214Sequences of C code are simply stored and written to the output unchanged
215during translation. They are read using a simple scanner which nonetheless
216understands C comments and string and character literals.
217
218A C fragment is terminated by one of a small number of delimiter characters
219determined by the immediately surrounding context -- usually a closing brace
220or bracket. The first such delimiter character which is not enclosed in
221brackets, braces or parentheses ends the fragment.
222
223\subsection{Module syntax} \label{sec:syntax-module}
224
225\begin{grammar}
226<module> ::= @<definition>^*
227
228<definition> ::= <import-definition>
229\alt <load-definition>
230\alt <lisp-definition>
231\alt <code-definition>
232\alt <typename-definition>
233\alt <class-definition>
234\end{grammar}
235
236A module is the top-level syntactic item. A module consists of a sequence of
237definitions.
238
239\subsection{Simple definitions} \label{sec:syntax.defs}
240
241\subsubsection{Importing modules} \label{sec:syntax.defs.import}
242
243\begin{grammar}
244<import-definition> ::= "import" <string> ";"
245\end{grammar}
246
247The module named @<string> is processed and its definitions made available.
248
249A search is made for a module source file as follows.
250\begin{itemize}
251\item The module name @<string> is converted into a filename by appending
252 @`.sod', if it has no extension already.\footnote{%
253 Technically, what happens is \textsf{(merge-pathnames name (make-pathname
254 :type "SOD" :case :common))}, so exactly what this means varies
255 according to the host system.} %
256\item The file is looked for relative to the directory containing the
257 importing module.
258\item If that fails, then the file is looked for in each directory on the
259 module search path in turn.
260\item If the file still isn't found, an error is reported and the import
261 fails.
262\end{itemize}
263At this point, if the file has previously been imported, nothing further
264happens.\footnote{%
265 This check is done using \textsf{truename}, so it should see through simple
266 tricks like symbolic links. However, it may be confused by fancy things
267 like bind mounts and so on.} %
268
269Recursive imports, either direct or indirect, are an error.
270
271\subsubsection{Loading extensions} \label{sec:syntax.defs.load}
272
273\begin{grammar}
274<load-definition> ::= "load" <string> ";"
275\end{grammar}
276
277The Lisp file named @<string> is loaded and evaluated.
278
279A search is made for a Lisp source file as follows.
280\begin{itemize}
281\item The name @<string> is converted into a filename by appending @`.lisp',
282 if it has no extension already.\footnote{%
283 Technically, what happens is \textsf{(merge-pathnames name (make-pathname
284 :type "LISP" :case :common))}, so exactly what this means varies
285 according to the host system.} %
286\item A search is then made in the same manner as for module imports
287 (\xref{sec:syntax.defs.import}).
288\end{itemize}
289If the file is found, it is loaded using the host Lisp's \textsf{load}
290function.
291
292Note that Sod doesn't attempt to compile Lisp files, or even to look for
293existing compiled files. The right way to package a substantial extension to
294the Sod translator is to provide the extension as a standard ASDF system (or
295similar) and leave a dropping @"foo-extension.lisp" in the module path saying
296something like
297\begin{quote}
298 \textsf{(asdf:load-system :foo-extension)}
299\end{quote}
300which will arrange for the extension to be compiled if necessary.
301
302(This approach means that the language doesn't need to depend on any
303particular system definition facility. It's bad enough already that it
304depends on Common Lisp.)
305
306\subsubsection{Lisp escapes} \label{sec:syntax.defs.lisp}
307
308\begin{grammar}
309<lisp-definition> ::= "lisp" <s-expression> ";"
310\end{grammar}
311
312The @<s-expression> is evaluated immediately. It can do anything it likes.
313
eae50115
MW
314\begin{boxy}[Warning!]
315 This means that hostile Sod modules are a security hazard. Lisp code can
316 read and write files, start other programs, and make network connections.
317 Don't install Sod modules from sources that you don't trust.\footnote{%
318 Presumably you were going to run the corresponding code at some point, so
319 this isn't as unusually scary as it sounds. But please be careful.} %
320\end{boxy}
1f7d590d
MW
321
322\subsubsection{Declaring type names} \label{sec:syntax.defs.typename}
323
324\begin{grammar}
325<typename-definition> ::=
326 "typename" <identifier-list> ";"
327\end{grammar}
328
329Each @<identifier> is declared as naming a C type. This is important because
330the C type syntax -- which Sod uses -- is ambiguous, and disambiguation is
331done by distinguishing type names from other identifiers.
332
333Don't declare class names using @"typename"; use @"class" forward
334declarations instead.
335
336\subsection{Literal code} \label{sec:syntax-code}
337
338\begin{grammar}
339<code-definition> ::=
340 "code" <identifier> ":" <identifier> @[<constraints>@]
341 "{" <c-fragment> "}"
342
343<constraints> ::= "[" <constraint-list> "]"
344
345<constraint> ::= @<identifier>^+
346\end{grammar}
347
348The @<c-fragment> will be output unchanged to one of the output files.
349
350The first @<identifier> is the symbolic name of an output file. Predefined
351output file names are @"c" and @"h", which are the implementation code and
352header file respectively; other output files can be defined by extensions.
353
354The second @<identifier> provides a name for the output item. Several C
355fragments can have the same name: they will be concatenated together in the
356order in which they were encountered.
357
358The @<constraints> provide a means for specifying where in the output file
359the output item should appear. (Note the two kinds of square brackets shown
360in the syntax: square brackets must appear around the constraints if they are
361present, but the constraints may be omitted.) Each comma-separated @<constraint>
362is a sequence of identifiers naming output items, and indicates that the
363output items must appear in the order given -- though the translator is free
364to insert additional items in between them. (The particular output items
365needn't be defined already -- indeed, they needn't be defined ever.)
366
367There is a predefined output item @"includes" in both the @"c" and @"h"
368output files which is a suitable place for inserting @"\#include"
369preprocessor directives in order to declare types and functions for use
370elsewhere in the generated output files.
371
372\subsection{Property sets} \label{sec:syntax.propset}
373
374\begin{grammar}
375<properties> ::= "[" <property-list> "]"
376
377<property> ::= <identifier> "=" <expression>
378\end{grammar}
379
380Property sets are a means for associating miscellaneous information with
381classes and related items. By using property sets, additional information
382can be passed to extensions without the need to introduce idiosyncratic
383syntax.
384
385A property has a name, given as an @<identifier>, and a value computed by
386evaluating an @<expression>. The value can be one of a number of types,
387though the only operators currently defined act on integer values only.
388
389\subsubsection{The expression evaluator} \label{sec:syntax.propset.expr}
390
391\begin{grammar}
392<expression> ::= <term> | <expression> "+" <term> | <expression> "-" <term>
393
394<term> ::= <factor> | <term> "*" <factor> | <term> "/" <factor>
395
396<factor> ::= <primary> | "+" <factor> | "-" <factor>
397
398<primary> ::=
399 <integer-literal> | <string-literal> | <char-literal> | <identifier>
400\alt "?" <s-expression>
401\alt "(" <expression> ")"
402\end{grammar}
403
404The arithmetic expression syntax is simple and standard; there are currently
405no bitwise, logical, or comparison operators.
406
407A @<primary> expression may be a literal or an identifier. Note that
408identifiers stand for themselves: they \emph{do not} denote values. For more
409fancy expressions, the syntax
410\begin{quote}
411 @"?" @<s-expression>
412\end{quote}
413causes the @<s-expression> to be evaluated using the Lisp \textsf{eval}
414function.
415%%% FIXME crossref to extension docs
416
417\subsection{C types} \label{sec:syntax.c-types}
418
419Sod's syntax for C types closely mirrors the standard C syntax. A C type has
420two parts: a sequence of @<declaration-specifier>s and a @<declarator>. In
421Sod, a type must contain at least one @<declaration-specifier> (i.e.,
422`implicit @"int"' is forbidden), and storage-class specifiers are not
423recognized.
424
425\subsubsection{Declaration specifiers} \label{sec:syntax.c-types.declspec}
426
427\begin{grammar}
428<declaration-specifier> ::= <type-name>
429\alt "struct" <identifier> | "union" <identifier> | "enum" <identifier>
430\alt "void" | "char" | "int" | "float" | "double"
431\alt "short" | "long"
432\alt "signed" | "unsigned"
433\alt <qualifier>
434
435<qualifier> ::= "const" | "volatile" | "restrict"
436
437<type-name> ::= <identifier>
438\end{grammar}
439
440A @<type-name> is an identifier which has been declared as being a type name,
441using the @"typename" or @"class" definitions.
442
443Declaration specifiers may appear in any order. However, not all
444combinations are permitted. A sequence of declaration specifiers must consist
445of zero or more @<qualifier>s, and one of the following, up to reordering.
446\begin{itemize}
447\item @<type-name>
448\item @"struct" @<identifier>, @"union" @<identifier>, @"enum" @<identifier>
449\item @"void"
450\item @"char", @"unsigned char", @"signed char"
451\item @"short", @"unsigned short", @"signed short"
452\item @"short int", @"unsigned short int", @"signed short int"
453\item @"int", @"unsigned int", @"signed int", @"unsigned", @"signed"
454\item @"long", @"unsigned long", @"signed long"
455\item @"long int", @"unsigned long int", @"signed long int"
456\item @"long long", @"unsigned long long", @"signed long long"
457\item @"long long int", @"unsigned long long int", @"signed long long int"
458\item @"float", @"double", @"long double"
459\end{itemize}
460All of these have their usual C meanings.
461
462\subsubsection{Declarators} \label{sec:syntax.c-types.declarator}
463
464\begin{grammar}
465<declarator>$[k]$ ::= @<pointer>^* <primary-declarator>$[k]$
466
467<primary-declarator>$[k]$ ::= $k$
468\alt "(" <primary-declarator>$[k]$ ")"
469\alt <primary-declarator>$[k]$ @<declarator-suffix>^*
470
471<pointer> ::= "*" @<qualifier>^*
472
473<declarator-suffix> ::= "[" <c-fragment> "]"
474\alt "(" <arguments> ")"
475
476<arguments> ::= $\epsilon$ | "..."
477\alt <argument-list> @["," "..."@]
478
479<argument> ::= @<declaration-specifier>^+ <argument-declarator>
480
481<argument-declarator> ::= <declarator>@[<identifier> @! $\epsilon$@]
482
483<simple-declarator> ::= <declarator>@[<identifier>@]
484
485<dotted-name> ::= <identifier> "." <identifier>
486
487<dotted-declarator> ::= <declarator>@[<dotted-name>@]
488\end{grammar}
489
490The declarator syntax is taken from C, but with some differences.
491\begin{itemize}
492\item Array dimensions are uninterpreted @<c-fragments>, terminated by a
493 closing square bracket. This allows array dimensions to contain arbitrary
494 constant expressions.
495\item A declarator may have either a single @<identifier> at its centre or a
496 pair of @<identifier>s separated by a @`.'; this is used to refer to
497 slots or messages defined in superclasses.
498\end{itemize}
499The remaining differences are (I hope) a matter of presentation rather than
500substance.
501
502\subsection{Defining classes} \label{sec:syntax.class}
503
504\begin{grammar}
505<class-definition> ::= <class-forward-declaration>
506\alt <full-class-definition>
507\end{grammar}
508
509\subsubsection{Forward declarations} \label{sec:class.class.forward}
510
511\begin{grammar}
512<class-forward-declaration> ::= "class" <identifier> ";"
513\end{grammar}
514
515A @<class-forward-declaration> informs Sod that an @<identifier> will be used
516to name a class which is currently undefined. Forward declarations are
517necessary in order to resolve certain kinds of circularity. For example,
518\begin{listing}
519class Sub;
520
521class Super : SodObject {
522 Sub *sub;
523};
524
525class Sub : Super {
526 /* ... */
527};
528\end{listing}
529
530\subsubsection{Full class definitions} \label{sec:class.class.full}
531
532\begin{grammar}
533<full-class-definition> ::=
534 @[<properties>@]
535 "class" <identifier> ":" <identifier-list>
536 "{" @<class-item>^* "}"
537
538<class-item> ::= <slot-item> ";"
539\alt <message-item>
540\alt <method-item>
541\alt <initializer-item> ";"
542\end{grammar}
543
544A full class definition provides a complete description of a class.
545
546The first @<identifier> gives the name of the class. It is an error to
547give the name of an existing class (other than a forward-referenced class),
548or an existing type name. It is conventional to give classes `MixedCase'
549names, to distinguish them from other kinds of identifiers.
550
551The @<identifier-list> names the direct superclasses for the new class. It
552is an error if any of these @<identifier>s does not name a defined class.
553
554The @<properties> provide additional information. The standard class
555properties are as follows.
556\begin{description}
557\item[@"lisp_class"] The name of the Lisp class to use within the translator
558 to represent this class. The property value must be an identifier; the
559 default is @"sod_class". Extensions may define classes with additional
560 behaviour, and may recognize additional class properties.
561\item[@"metaclass"] The name of the Sod metaclass for this class. In the
562 generated code, a class is itself an instance of another class -- its
563 \emph{metaclass}. The metaclass defines which slots the class will have,
564 which messages it will respond to, and what its behaviour will be when it
565 receives them. The property value must be an identifier naming a defined
566 subclass of @"SodClass". The default metaclass is @"SodClass".
567 %%% FIXME xref to theory
568\item[@"nick"] A nickname for the class, to be used to distinguish it from
569 other classes in various limited contexts. The property value must be an
570 identifier; the default is constructed by forcing the class name to
571 lower-case.
572\end{description}
573
574The class body consists of a sequence of @<class-item>s enclosed in braces.
575These items are discussed in the following sections.
576
577\subsubsection{Slot items} \label{sec:sntax.class.slot}
578
579\begin{grammar}
580<slot-item> ::=
581 @[<properties>@]
582 @<declaration-specifier>^+ <init-declarator-list>
583
584<init-declarator> ::= <declarator> @["=" <initializer>@]
585\end{grammar}
586
587A @<slot-item> defines one or more slots. All instances of the class and any
588subclass will contain these slots, with the names and types given by the
589@<declaration-specifiers> and the @<declarators>. Slot declarators may not
590contain qualified identifiers.
591
592It is not possible to declare a slot with function type: such an item is
593interpreted as being a @<message-item> or @<method-item>. Pointers to
594functions are fine.
595
596An @<initializer>, if present, is treated as if a separate
597@<initializer-item> containing the slot name and initializer were present.
598For example,
599\begin{listing}
600[nick = eg]
601class Example : Super {
602 int foo = 17;
603};
604\end{listing}
605means the same as
606\begin{listing}
607[nick = eg]
608class Example : Super {
609 int foo;
610 eg.foo = 17;
611};
612\end{listing}
613
614\subsubsection{Initializer items} \label{sec:syntax.class.init}
615
616\begin{grammar}
617<initializer-item> ::= @["class"@] <slot-initializer-list>
618
619<slot-initializer> ::= <qualified-identifier> "=" <initializer>
620
621<initializer> ::= "{" <c-fragment> "}" | <c-fragment>
622\end{grammar}
623
624An @<initializer-item> provides an initial value for one or more slots. If
625prefixed by @"class", then the initial values are for class slots (i.e.,
626slots of the class object itself); otherwise they are for instance slots.
627
628The first component of the @<qualified-identifier> must be the nickname of
629one of the class's superclasses (including itself); the second must be the
630name of a slot defined in that superclass.
631
632The initializer has one of two forms.
633\begin{itemize}
634\item A @<c-fragment> enclosed in braces denotes an aggregate initializer.
635 This is suitable for initializing structure, union or array slots.
636\item A @<c-fragment> \emph{not} beginning with an open brace is a `bare'
637 initializer, and continues until the next @`,' or @`;' which is not within
638 nested brackets. Bare initializers are suitable for initializing scalar
639 slots, such as pointers or integers, and strings.
640\end{itemize}
641
642\subsubsection{Message items} \label{sec:syntax.class.message}
643
644\begin{grammar}
645<message-item> ::=
646 @[<properties>@]
647 @<declaration-specifier>^+ <declarator> @[<method-body>@]
648\end{grammar}
649
650\subsubsection{Method items} \label{sec:syntax.class.method}
651
652\begin{grammar}
653<method-item> ::=
654 @[<properties>@]
655 @<declaration-specifier>^+ <declarator> <method-body>
656
657<method-body> ::= "{" <c-fragment> "}" | "extern" ";"
658\end{grammar}
659
660
661%%%----- That's all, folks --------------------------------------------------
662
663%%% Local variables:
664%%% mode: LaTeX
665%%% TeX-master: "sod.tex"
666%%% TeX-PDF-mode: t
667%%% End: