doc/sod.sty: Fiddle with the `describe' categories.
[sod] / doc / syntax.tex
CommitLineData
1f7d590d
MW
1%%% -*-latex-*-
2%%%
3%%% Module syntax
4%%%
5%%% (c) 2015 Straylight/Edgeware
6%%%
7
8%%%----- Licensing notice ---------------------------------------------------
9%%%
10%%% This file is part of the Sensible Object Design, an object system for C.
11%%%
12%%% SOD is free software; you can redistribute it and/or modify
13%%% it under the terms of the GNU General Public License as published by
14%%% the Free Software Foundation; either version 2 of the License, or
15%%% (at your option) any later version.
16%%%
17%%% SOD is distributed in the hope that it will be useful,
18%%% but WITHOUT ANY WARRANTY; without even the implied warranty of
19%%% MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20%%% GNU General Public License for more details.
21%%%
22%%% You should have received a copy of the GNU General Public License
23%%% along with SOD; if not, write to the Free Software Foundation,
24%%% Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
25
26\chapter{Module syntax} \label{ch:syntax}
27
28%%%--------------------------------------------------------------------------
29
30Fortunately, Sod is syntactically quite simple. I've used a little slightly
31unusual notation in order to make the presentation easier to read. For any
32nonterminal $x$:
33\begin{itemize}
34\item $\epsilon$ denotes the empty nonterminal:
35 \begin{quote}
36 \syntax{$\epsilon$ ::=}
37 \end{quote}
38\item @[$x$@] means an optional $x$:
39 \begin{quote}
40 \syntax{@[$x$@] ::= $\epsilon$ @! $x$}
41 \end{quote}
42\item $x^*$ means a sequence of zero or more $x$s:
43 \begin{quote}
44 \syntax{$x^*$ ::= $\epsilon$ @! $x^*$ $x$}
45 \end{quote}
46\item $x^+$ means a sequence of one or more $x$s:
47 \begin{quote}
48 \syntax{$x^+$ ::= $x$ $x^*$}
49 \end{quote}
50\item $x$@<-list> means a sequence of one or more $x$s separated
51 by commas:
52 \begin{quote}
53 \syntax{$x$<-list> ::= $x$ @! $x$<-list> "," $x$}
54 \end{quote}
55\end{itemize}
56
57\subsection{Lexical syntax}
58\label{sec:syntax.lex}
59
60Whitespace and comments are discarded. The remaining characters are
61collected into tokens according to the following syntax.
62
63\begin{grammar}
64<token> ::= <identifier>
65\alt <string-literal>
66\alt <char-literal>
67\alt <integer-literal>
68\alt <punctuation>
69\end{grammar}
70
71This syntax is slightly ambiguous, and is disambiguated by the \emph{maximal
72munch} rule: at each stage we take the longest sequence of characters which
73could be a token.
74
75\subsubsection{Identifiers} \label{sec:syntax.lex.id}
76
77\begin{grammar}
78<identifier> ::= <id-start-char> @<id-body-char>^*
79
80<id-start-char> ::= <alpha-char> | "_"
81
82<id-body-char> ::= <id-start-char> @! <digit-char>
83
84<alpha-char> ::= "A" | "B" | \dots\ | "Z"
85\alt "a" | "b" | \dots\ | "z"
86\alt <extended-alpha-char>
87
88<digit-char> ::= "0" | <nonzero-digit-char>
89
90<nonzero-digit-char> ::= "1" | "2" $| \cdots |$ "9"
91\end{grammar}
92
93The precise definition of @<alpha-char> is left to the function
94\textsf{alpha-char-p} in the hosting Lisp system. For portability,
95programmers are encouraged to limit themselves to the standard ASCII letters.
96
97There are no reserved words at the lexical level, but the higher-level syntax
98recognizes certain identifiers as \emph{keywords} in some contexts. There is
99also an ambiguity (inherited from C) in the declaration syntax which is
100settled by distinguishing type names from other identifiers at a lexical
101level.
102
103\subsubsection{String and character literals} \label{sec:syntax.lex.string}
104
105\begin{grammar}
106<string-literal> ::= "\"" @<string-literal-char>^* "\""
107
108<char-literal> ::= "'" <char-literal-char> "'"
109
110<string-literal-char> ::= any character other than "\\" or "\""
111\alt "\\" <char>
112
113<char-literal-char> ::= any character other than "\\" or "'"
114\alt "\\" <char>
115
116<char> ::= any single character
117\end{grammar}
118
119The syntax for string and character literals differs from~C. In particular,
120escape sequences such as @`\textbackslash n' are not recognized. The use
121of string and character literals in Sod, outside of C~fragments, is limited,
122and the simple syntax seems adequate. For the sake of future compatibility,
123the use of character sequences which resemble C escape sequences is
124discouraged.
125
126\subsubsection{Integer literals} \label{sec:syntax.lex.int}
127
128\begin{grammar}
129<integer-literal> ::= <decimal-integer>
130\alt <binary-integer>
131\alt <octal-integer>
132\alt <hex-integer>
133
134<decimal-integer> ::= <nonzero-digit-char> @<digit-char>^*
135
136<binary-integer> ::= "0" @("b"|"B"@) @<binary-digit-char>^+
137
138<binary-digit-char> ::= "0" | "1"
139
140<octal-integer> ::= "0" @["o"|"O"@] @<octal-digit-char>^+
141
142<octal-digit-char> ::= "0" | "1" $| \cdots |$ "7"
143
144<hex-integer> ::= "0" @("x"|"X"@) @<hex-digit-char>^+
145
146<hex-digit-char> ::= <digit-char>
147\alt "A" | "B" | "C" | "D" | "E" | "F"
148\alt "a" | "b" | "c" | "d" | "e" | "f"
149\end{grammar}
150
151Sod understands only integers, not floating-point numbers; its integer syntax
152goes slightly beyond C in allowing a @`0o' prefix for octal and @`0b' for
153binary. However, length and signedness indicators are not permitted.
154
155\subsubsection{Punctuation} \label{sec:syntax.lex.punct}
156
157\begin{grammar}
158<punctuation> ::= any nonalphanumeric character other than "_", "\"" or "'"
159\end{grammar}
160
161\subsubsection{Comments} \label{sec:lex-comment}
162
163\begin{grammar}
164<comment> ::= <block-comment>
165\alt <line-comment>
166
167<block-comment> ::=
168 "/*"
169 @<not-star>^* @(@<star>^+ <not-star-or-slash> @<not-star>^*@)^*
170 @<star>^*
171 "*/"
172
173<star> ::= "*"
174
175<not-star> ::= any character other than "*"
176
177<not-star-or-slash> ::= any character other than "*" or "/"
178
179<line-comment> ::= "//" @<not-newline>^* <newline>
180
181<newline> ::= a newline character
182
183<not-newline> ::= any character other than newline
184\end{grammar}
185
186Comments are exactly as in C99: both traditional block comments `\texttt{/*}
187\dots\ \texttt{*/}' and \Cplusplus-style `\texttt{//} \dots' comments are
188permitted and ignored.
189
190\subsection{Special nonterminals}
191\label{sec:special-nonterminals}
192
193Aside from the lexical syntax presented above (\xref{sec:syntax.lex}),
194two special nonterminals occur in the module syntax.
195
196\subsubsection{S-expressions} \label{sec:syntax-sexp}
197
198\begin{grammar}
199<s-expression> ::= an S-expression, as parsed by the Lisp reader
200\end{grammar}
201
202When an S-expression is expected, the Sod parser simply calls the host Lisp
203system's \textsf{read} function. Sod modules are permitted to modify the
204read table to extend the S-expression syntax.
205
206S-expressions are self-delimiting, so no end-marker is needed.
207
208\subsubsection{C fragments} \label{sec:syntax.lex.cfrag}
209
210\begin{grammar}
211<c-fragment> ::= a sequence of C tokens, with matching brackets
212\end{grammar}
213
214Sequences of C code are simply stored and written to the output unchanged
215during translation. They are read using a simple scanner which nonetheless
216understands C comments and string and character literals.
217
218A C fragment is terminated by one of a small number of delimiter characters
219determined by the immediately surrounding context -- usually a closing brace
220or bracket. The first such delimiter character which is not enclosed in
221brackets, braces or parentheses ends the fragment.
222
223\subsection{Module syntax} \label{sec:syntax-module}
224
225\begin{grammar}
226<module> ::= @<definition>^*
227
228<definition> ::= <import-definition>
229\alt <load-definition>
230\alt <lisp-definition>
231\alt <code-definition>
232\alt <typename-definition>
233\alt <class-definition>
234\end{grammar}
235
236A module is the top-level syntactic item. A module consists of a sequence of
237definitions.
238
239\subsection{Simple definitions} \label{sec:syntax.defs}
240
241\subsubsection{Importing modules} \label{sec:syntax.defs.import}
242
243\begin{grammar}
244<import-definition> ::= "import" <string> ";"
245\end{grammar}
246
247The module named @<string> is processed and its definitions made available.
248
249A search is made for a module source file as follows.
250\begin{itemize}
251\item The module name @<string> is converted into a filename by appending
252 @`.sod', if it has no extension already.\footnote{%
253 Technically, what happens is \textsf{(merge-pathnames name (make-pathname
254 :type "SOD" :case :common))}, so exactly what this means varies
255 according to the host system.} %
256\item The file is looked for relative to the directory containing the
257 importing module.
258\item If that fails, then the file is looked for in each directory on the
259 module search path in turn.
260\item If the file still isn't found, an error is reported and the import
261 fails.
262\end{itemize}
263At this point, if the file has previously been imported, nothing further
264happens.\footnote{%
265 This check is done using \textsf{truename}, so it should see through simple
266 tricks like symbolic links. However, it may be confused by fancy things
267 like bind mounts and so on.} %
268
269Recursive imports, either direct or indirect, are an error.
270
271\subsubsection{Loading extensions} \label{sec:syntax.defs.load}
272
273\begin{grammar}
274<load-definition> ::= "load" <string> ";"
275\end{grammar}
276
277The Lisp file named @<string> is loaded and evaluated.
278
279A search is made for a Lisp source file as follows.
280\begin{itemize}
281\item The name @<string> is converted into a filename by appending @`.lisp',
282 if it has no extension already.\footnote{%
283 Technically, what happens is \textsf{(merge-pathnames name (make-pathname
284 :type "LISP" :case :common))}, so exactly what this means varies
285 according to the host system.} %
286\item A search is then made in the same manner as for module imports
287 (\xref{sec:syntax.defs.import}).
288\end{itemize}
289If the file is found, it is loaded using the host Lisp's \textsf{load}
290function.
291
292Note that Sod doesn't attempt to compile Lisp files, or even to look for
293existing compiled files. The right way to package a substantial extension to
294the Sod translator is to provide the extension as a standard ASDF system (or
295similar) and leave a dropping @"foo-extension.lisp" in the module path saying
296something like
297\begin{quote}
298 \textsf{(asdf:load-system :foo-extension)}
299\end{quote}
300which will arrange for the extension to be compiled if necessary.
301
302(This approach means that the language doesn't need to depend on any
303particular system definition facility. It's bad enough already that it
304depends on Common Lisp.)
305
306\subsubsection{Lisp escapes} \label{sec:syntax.defs.lisp}
307
308\begin{grammar}
309<lisp-definition> ::= "lisp" <s-expression> ";"
310\end{grammar}
311
312The @<s-expression> is evaluated immediately. It can do anything it likes.
313
314\textbf{Warning!} This means that hostile Sod modules are a security hazard.
315Lisp code can read and write files, start other programs, and make network
316connections. Don't install Sod modules from sources that you don't
317trust.\footnote{%
318 Presumably you were going to run the corresponding code at some point, so
319 this isn't as unusually scary as it sounds. But please be careful.} %
320
321\subsubsection{Declaring type names} \label{sec:syntax.defs.typename}
322
323\begin{grammar}
324<typename-definition> ::=
325 "typename" <identifier-list> ";"
326\end{grammar}
327
328Each @<identifier> is declared as naming a C type. This is important because
329the C type syntax -- which Sod uses -- is ambiguous, and disambiguation is
330done by distinguishing type names from other identifiers.
331
332Don't declare class names using @"typename"; use @"class" forward
333declarations instead.
334
335\subsection{Literal code} \label{sec:syntax-code}
336
337\begin{grammar}
338<code-definition> ::=
339 "code" <identifier> ":" <identifier> @[<constraints>@]
340 "{" <c-fragment> "}"
341
342<constraints> ::= "[" <constraint-list> "]"
343
344<constraint> ::= @<identifier>^+
345\end{grammar}
346
347The @<c-fragment> will be output unchanged to one of the output files.
348
349The first @<identifier> is the symbolic name of an output file. Predefined
350output file names are @"c" and @"h", which are the implementation code and
351header file respectively; other output files can be defined by extensions.
352
353The second @<identifier> provides a name for the output item. Several C
354fragments can have the same name: they will be concatenated together in the
355order in which they were encountered.
356
357The @<constraints> provide a means for specifying where in the output file
358the output item should appear. (Note the two kinds of square brackets shown
359in the syntax: square brackets must appear around the constraints if they are
360present, but the constraints may be omitted.) Each comma-separated @<constraint>
361is a sequence of identifiers naming output items, and indicates that the
362output items must appear in the order given -- though the translator is free
363to insert additional items in between them. (The particular output items
364needn't be defined already -- indeed, they needn't be defined ever.)
365
366There is a predefined output item @"includes" in both the @"c" and @"h"
367output files which is a suitable place for inserting @"\#include"
368preprocessor directives in order to declare types and functions for use
369elsewhere in the generated output files.
370
371\subsection{Property sets} \label{sec:syntax.propset}
372
373\begin{grammar}
374<properties> ::= "[" <property-list> "]"
375
376<property> ::= <identifier> "=" <expression>
377\end{grammar}
378
379Property sets are a means for associating miscellaneous information with
380classes and related items. By using property sets, additional information
381can be passed to extensions without the need to introduce idiosyncratic
382syntax.
383
384A property has a name, given as an @<identifier>, and a value computed by
385evaluating an @<expression>. The value can be one of a number of types,
386though the only operators currently defined act on integer values only.
387
388\subsubsection{The expression evaluator} \label{sec:syntax.propset.expr}
389
390\begin{grammar}
391<expression> ::= <term> | <expression> "+" <term> | <expression> "-" <term>
392
393<term> ::= <factor> | <term> "*" <factor> | <term> "/" <factor>
394
395<factor> ::= <primary> | "+" <factor> | "-" <factor>
396
397<primary> ::=
398 <integer-literal> | <string-literal> | <char-literal> | <identifier>
399\alt "?" <s-expression>
400\alt "(" <expression> ")"
401\end{grammar}
402
403The arithmetic expression syntax is simple and standard; there are currently
404no bitwise, logical, or comparison operators.
405
406A @<primary> expression may be a literal or an identifier. Note that
407identifiers stand for themselves: they \emph{do not} denote values. For more
408fancy expressions, the syntax
409\begin{quote}
410 @"?" @<s-expression>
411\end{quote}
412causes the @<s-expression> to be evaluated using the Lisp \textsf{eval}
413function.
414%%% FIXME crossref to extension docs
415
416\subsection{C types} \label{sec:syntax.c-types}
417
418Sod's syntax for C types closely mirrors the standard C syntax. A C type has
419two parts: a sequence of @<declaration-specifier>s and a @<declarator>. In
420Sod, a type must contain at least one @<declaration-specifier> (i.e.,
421`implicit @"int"' is forbidden), and storage-class specifiers are not
422recognized.
423
424\subsubsection{Declaration specifiers} \label{sec:syntax.c-types.declspec}
425
426\begin{grammar}
427<declaration-specifier> ::= <type-name>
428\alt "struct" <identifier> | "union" <identifier> | "enum" <identifier>
429\alt "void" | "char" | "int" | "float" | "double"
430\alt "short" | "long"
431\alt "signed" | "unsigned"
432\alt <qualifier>
433
434<qualifier> ::= "const" | "volatile" | "restrict"
435
436<type-name> ::= <identifier>
437\end{grammar}
438
439A @<type-name> is an identifier which has been declared as being a type name,
440using the @"typename" or @"class" definitions.
441
442Declaration specifiers may appear in any order. However, not all
443combinations are permitted. A sequence of declaration specifiers must consist
444of zero or more @<qualifier>s, and one of the following, up to reordering.
445\begin{itemize}
446\item @<type-name>
447\item @"struct" @<identifier>, @"union" @<identifier>, @"enum" @<identifier>
448\item @"void"
449\item @"char", @"unsigned char", @"signed char"
450\item @"short", @"unsigned short", @"signed short"
451\item @"short int", @"unsigned short int", @"signed short int"
452\item @"int", @"unsigned int", @"signed int", @"unsigned", @"signed"
453\item @"long", @"unsigned long", @"signed long"
454\item @"long int", @"unsigned long int", @"signed long int"
455\item @"long long", @"unsigned long long", @"signed long long"
456\item @"long long int", @"unsigned long long int", @"signed long long int"
457\item @"float", @"double", @"long double"
458\end{itemize}
459All of these have their usual C meanings.
460
461\subsubsection{Declarators} \label{sec:syntax.c-types.declarator}
462
463\begin{grammar}
464<declarator>$[k]$ ::= @<pointer>^* <primary-declarator>$[k]$
465
466<primary-declarator>$[k]$ ::= $k$
467\alt "(" <primary-declarator>$[k]$ ")"
468\alt <primary-declarator>$[k]$ @<declarator-suffix>^*
469
470<pointer> ::= "*" @<qualifier>^*
471
472<declarator-suffix> ::= "[" <c-fragment> "]"
473\alt "(" <arguments> ")"
474
475<arguments> ::= $\epsilon$ | "..."
476\alt <argument-list> @["," "..."@]
477
478<argument> ::= @<declaration-specifier>^+ <argument-declarator>
479
480<argument-declarator> ::= <declarator>@[<identifier> @! $\epsilon$@]
481
482<simple-declarator> ::= <declarator>@[<identifier>@]
483
484<dotted-name> ::= <identifier> "." <identifier>
485
486<dotted-declarator> ::= <declarator>@[<dotted-name>@]
487\end{grammar}
488
489The declarator syntax is taken from C, but with some differences.
490\begin{itemize}
491\item Array dimensions are uninterpreted @<c-fragment>s, terminated by a
492 closing square bracket. This allows array dimensions to contain arbitrary
493 constant expressions.
494\item A declarator may have either a single @<identifier> at its centre or a
495 pair of @<identifier>s separated by a @`.'; this is used to refer to
496 slots or messages defined in superclasses.
497\end{itemize}
498The remaining differences are (I hope) a matter of presentation rather than
499substance.
500
501\subsection{Defining classes} \label{sec:syntax.class}
502
503\begin{grammar}
504<class-definition> ::= <class-forward-declaration>
505\alt <full-class-definition>
506\end{grammar}
507
508\subsubsection{Forward declarations} \label{sec:class.class.forward}
509
510\begin{grammar}
511<class-forward-declaration> ::= "class" <identifier> ";"
512\end{grammar}
513
514A @<class-forward-declaration> informs Sod that an @<identifier> will be used
515to name a class which is currently undefined. Forward declarations are
516necessary in order to resolve certain kinds of circularity. For example,
517\begin{listing}
518class Sub;
519
520class Super : SodObject {
521 Sub *sub;
522};
523
524class Sub : Super {
525 /* ... */
526};
527\end{listing}
528
529\subsubsection{Full class definitions} \label{sec:class.class.full}
530
531\begin{grammar}
532<full-class-definition> ::=
533 @[<properties>@]
534 "class" <identifier> ":" <identifier-list>
535 "{" @<class-item>^* "}"
536
537<class-item> ::= <slot-item> ";"
538\alt <message-item>
539\alt <method-item>
540\alt <initializer-item> ";"
541\end{grammar}
542
543A full class definition provides a complete description of a class.
544
545The first @<identifier> gives the name of the class. It is an error to
546give the name of an existing class (other than a forward-referenced class),
547or an existing type name. It is conventional to give classes `MixedCase'
548names, to distinguish them from other kinds of identifiers.
549
550The @<identifier-list> names the direct superclasses for the new class. It
551is an error if any of these @<identifier>s does not name a defined class.
552
553The @<properties> provide additional information. The standard class
554properties are as follows.
555\begin{description}
556\item[@"lisp_class"] The name of the Lisp class to use within the translator
557 to represent this class. The property value must be an identifier; the
558 default is @"sod_class". Extensions may define classes with additional
559 behaviour, and may recognize additional class properties.
560\item[@"metaclass"] The name of the Sod metaclass for this class. In the
561 generated code, a class is itself an instance of another class -- its
562 \emph{metaclass}. The metaclass defines which slots the class will have,
563 which messages it will respond to, and what its behaviour will be when it
564 receives them. The property value must be an identifier naming a defined
565 subclass of @"SodClass". The default metaclass is @"SodClass".
566 %%% FIXME xref to theory
567\item[@"nick"] A nickname for the class, to be used to distinguish it from
568 other classes in various limited contexts. The property value must be an
569 identifier; the default is constructed by forcing the class name to
570 lower-case.
571\end{description}
572
573The class body consists of a sequence of @<class-item>s enclosed in braces.
574These items are discussed in the following sections.
575
576\subsubsection{Slot items} \label{sec:syntax.class.slot}
577
578\begin{grammar}
579<slot-item> ::=
580 @[<properties>@]
581 @<declaration-specifier>^+ <init-declarator-list>
582
583<init-declarator> ::= <declarator> @["=" <initializer>@]
584\end{grammar}
585
586A @<slot-item> defines one or more slots. All instances of the class and any
587subclass will contain these slots, with the names and types given by the
588@<declaration-specifiers> and the @<declarators>. Slot declarators may not
589contain qualified identifiers.
590
591It is not possible to declare a slot with function type: such an item is
592interpreted as being a @<message-item> or @<method-item>. Pointers to
593functions are fine.
594
595An @<initializer>, if present, is treated as if a separate
596@<initializer-item> containing the slot name and initializer were present.
597For example,
598\begin{listing}
599[nick = eg]
600class Example : Super {
601 int foo = 17;
602};
603\end{listing}
604means the same as
605\begin{listing}
606[nick = eg]
607class Example : Super {
608 int foo;
609 eg.foo = 17;
610};
611\end{listing}
612
613\subsubsection{Initializer items} \label{sec:syntax.class.init}
614
615\begin{grammar}
616<initializer-item> ::= @["class"@] <slot-initializer-list>
617
618<slot-initializer> ::= <qualified-identifier> "=" <initializer>
619
620<initializer> ::= "{" <c-fragment> "}" | <c-fragment>
621\end{grammar}
622
623An @<initializer-item> provides an initial value for one or more slots. If
624prefixed by @"class", then the initial values are for class slots (i.e.,
625slots of the class object itself); otherwise they are for instance slots.
626
627The first component of the @<qualified-identifier> must be the nickname of
628one of the class's superclasses (including itself); the second must be the
629name of a slot defined in that superclass.
630
631The initializer has one of two forms.
632\begin{itemize}
633\item A @<c-fragment> enclosed in braces denotes an aggregate initializer.
634 This is suitable for initializing structure, union or array slots.
635\item A @<c-fragment> \emph{not} beginning with an open brace is a `bare'
636 initializer, and continues until the next @`,' or @`;' which is not within
637 nested brackets. Bare initializers are suitable for initializing scalar
638 slots, such as pointers or integers, and strings.
639\end{itemize}
640
641\subsubsection{Message items} \label{sec:syntax.class.message}
642
643\begin{grammar}
644<message-item> ::=
645 @[<properties>@]
646 @<declaration-specifier>^+ <declarator> @[<method-body>@]
647\end{grammar}
648
649\subsubsection{Method items} \label{sec:syntax.class.method}
650
651\begin{grammar}
652<method-item> ::=
653 @[<properties>@]
654 @<declaration-specifier>^+ <declarator> <method-body>
655
656<method-body> ::= "{" <c-fragment> "}" | "extern" ";"
657\end{grammar}
658
659
660%%%----- That's all, folks --------------------------------------------------
661
662%%% Local variables:
663%%% mode: LaTeX
664%%% TeX-master: "sod.tex"
665%%% TeX-PDF-mode: t
666%%% End: