%%% [sod] / doc / sod.tex

\documentclass[noarticle]{strayman}

\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
\usepackage[palatino, helvetica, courier, maths=cmr]{mdwfonts}
\usepackage{tikz}
\usepackage{syntax}
\usepackage{sverb}
\usepackage{mdwtab}
\usepackage[mdwmargin]{mdwthm}
\usepackage{amssymb}
\usepackage{footnote}
\usepackage{at}
\usepackage{mdwref}

\title{A Sensible Object Design for C}
\author{Mark Wooding}

%%% Make `@' a letter so that the internal (`@'-named) macros below can be
%%% defined and used.
\makeatletter

%%% Show (effectively) unlimited lines of context in error messages.
\errorcontextlines999

%%% Material placed to the left and right of a nonterminal: switch to italic
%%% on the left, and nothing on the right.  (Presumably these are hooks read
%%% by the `syntax' package -- confirm against its documentation.)
\def\syntleft{\normalfont\itshape}
\let\syntright\empty

%%% The font used for code throughout this preamble: sans-serif.
\let\codeface\sffamily

%%% As for nonterminals, but (presumably) for unquoted literals: switch to
%%% the code face on the left, and nothing on the right.
\def\ulitleft{\normalfont\codeface}
\let\ulitright\empty

%%% Make \listingsize do nothing.  (Presumably this is the size hook used by
%%% the `listing' environment -- confirm against the `sverb' documentation.)
\let\listingsize\relax

%%% Use the `varepsilon' glyph wherever \epsilon is written.
\let\epsilon\varepsilon

%%% Short `@'-commands for writing syntax descriptions, defined with \atdef
%%% (from the `at' package).  Nearly all of them finish by calling \@scripts
%%% so that a following `_' or `^' script is picked up.
%%%
%%%   @<nt>       nonterminal `nt', via \synt
%%%   @"text"     literal text, via \lit*
%%%   @`text'     literal text, via \lit
%%%   @|text|     `text' in sans-serif
%%%   @[ @]       square brackets, via \dbl@maybe (see below)
%%%   @{ @}       braces, in maths mode
%%%   @( @)       parentheses, in maths mode
%%%   @!          vertical bar, in maths mode
%%%   @to         a long right arrow with a quad of space on either side
\atdef <#1>{\synt{#1}\@scripts}
\atdef "#1"{\lit*{#1}\@scripts}
\atdef `#1'{\lit{#1}\@scripts}
\atdef |#1|{\textsf{#1}\@scripts}
%%% \dbl@maybe CHAR: remember CHAR in \@tempa, peek at the following token
%%% (saved in \@ch), and hand over to \dbl@maybe@i.  That macro enters maths
%%% mode if necessary; if the following token is CHAR again, it typesets
%%% `CHAR \! CHAR' (a closed-up doubled bracket) and uses \@firstoftwo to
%%% discard the second CHAR from the input; otherwise it typesets a single
%%% CHAR.  Either way it then continues with \@scripts.  The \expandafter
%%% chains close the conditional before \@scripts starts looking ahead.
\def\dbl@maybe#1{\let\@tempa#1\futurelet\@ch\dbl@maybe@i}
\def\dbl@maybe@i{\m@maybe\ifx\@ch\@tempa\@tempa\!\@tempa%
  \expandafter\@firstoftwo\expandafter\@scripts%
  \else\@tempa\expandafter\@scripts\fi}
\atdef [{\dbl@maybe[}
\atdef ]{\dbl@maybe]}
\atdef {{\m@maybe\{\@scripts}
\atdef }{\m@maybe\}\@scripts}
\atdef ({\m@maybe(\@scripts}
\atdef ){\m@maybe)\@scripts}
\atdef !{\m@maybe|\@scripts}
\atdef to{\leavevmode\unskip\quad\m@maybe\longrightarrow\m@maybe@end\quad}
%%% \m@maybe: if not already in maths mode, open inline maths and make
%%% \m@maybe@end a `$' so that the matching call closes it again.  If we
%%% were already in maths mode, nothing is done and \m@maybe@end keeps its
%%% current meaning (initially \relax).
\let\m@maybe@end\relax
\def\m@maybe{\ifmmode\else$\let\m@maybe@end$\fi}
%%% \@scripts: peek at the next token (saved in \@ch) and let \@scripts@i
%%% decide whether it begins a sub- or superscript.
\def\@scripts{\futurelet\@ch\@scripts@i}

%%% Notation for chains.  \chain{X}{Y} sets a sans-serif `ch' with subscript
%%% X, followed by Y in parentheses; \chainhead and \chaintail do the same
%%% with `hd' and `tl'.  These use \mathsf and `_', so they are for use in
%%% maths mode.
\def\chain#1#2{\mathsf{ch}_{#1}(#2)}
\def\chainhead#1#2{\mathsf{hd}_{#1}(#2)}
\def\chaintail#1#2{\mathsf{tl}_{#1}(#2)}

%%% Make \implies a synonym for \Rightarrow.
\let\implies\Rightarrow

%%% `@; text \\': set `; text' in italics, keeping the line-ending `\\'.
%%% (Presumably intended for comments within grammars -- confirm at the use
%%% sites.)
\atdef ;#1\\{\normalfont\itshape;#1\\}
%%% Wrap the start of the `grammar' environment so that, within it, \textbar
%%% is a maths-mode vertical bar in an hbox.  The original start-of-
%%% environment command is kept as \@@grammar and called afterwards.
\let\@@grammar\grammar
\def\grammar{\def\textbar{\hbox{$|$}}\@@grammar}

%%% \@scripts@i: examine the token \@ch saved by \@scripts.  The nested
%%% \ifx tests leave either `1' or `0' for the outer \if1 to compare
%%% against: if \@ch is an underscore or a `^' then \@scripts@ii is called,
%%% and otherwise \m@maybe@end is called (closing any maths which \m@maybe
%%% opened).
%%%
%%% The \lccode/\lowercase wrapper rewrites the `~' in the definition as an
%%% underscore while keeping the category code of `~' (active).  Presumably
%%% this is needed because `_' is an active character in the document body
%%% -- TODO confirm which package arranges that.
\begingroup\lccode`\~=`\_\lowercase{\endgroup
\def\@scripts@i{\if1\ifx\@ch~1\else\ifx\@ch^1\else0\fi\fi%
  \expandafter\@scripts@ii\else\expandafter\m@maybe@end\fi}}
%%% \@scripts@ii SCRIPT-CHAR ARG: enter maths mode if necessary, typeset the
%%% script `SCRIPT-CHAR{ARG}', and then look for a further script.
\def\@scripts@ii#1#2{\m@maybe#1{#2}\@scripts}

%%% Language names: \Cplusplus sets `C++' with the plus signs moved left by
%%% \p@; \Csharp sets `C#'.
%%% \man{NAME}{SECTION}: a manual-page reference, with NAME in bold followed
%%% by SECTION in parentheses.
\def\Cplusplus{C\kern-\p@++}
\def\Csharp{C\#}
\def\man#1#2{\textbf{#1}(#2)}

%%% The `prog' environment: program text set in the code face, inside
%%% `quote' and `tabbing' environments, with \obeylines in force.
%%%
%%% The backslash which ends the \lccode line below combines with the end of
%%% that line to name the end-of-line character, so nothing (not even a
%%% comment) may follow it on that line.  The \lowercase then turns each `~'
%%% in the definitions into an active end-of-line character.
\begingroup\lccode`\~=`\
\lowercase{
\endgroup
%%% On entry: save the current meaning of `\\' in \old@nl; then, once
%%% `tabbing' has started, globally make each (active) line end behave as
%%% the current `\\', and globally make `\\' itself print a backslash.
\def\prog{%
  \codeface%
  \quote%
  \let\old@nl\\%
  \obeylines%
  \tabbing%
  \global\let~\\%
  \global\let\\\textbackslash%
}
%%% On exit: finish the `tabbing', globally restore `\\' from \old@nl, and
%%% close the `quote'.
\def\endprog{%
  \endtabbing%
  \global\let\\\old@nl%
  \endquote%
}}

%%% \begin{boxy}[TITLE] ... \end{boxy}
%%%
%%% Set the body in a minipage surrounded by rules, with 1ex of padding on
%%% every side and a \medskip above and below.  The minipage width is
%%% \linewidth less 1.2pt (presumably an allowance for the rules) and less
%%% the 2ex of horizontal padding.  If a TITLE is given, it starts the body,
%%% set in bold using \headfam and followed by a quad.
%%%
%%% The default for the optional argument is the sentinel \q@; the \ifx test
%%% compares a macro holding the argument against \q@ to detect that no
%%% TITLE was supplied.
\newenvironment{boxy}[1][\q@]{%
  \dimen@\linewidth\advance\dimen@-1.2pt\advance\dimen@-2ex%
  \medskip%
  \vbox\bgroup\hrule\hbox\bgroup\vrule%
  \vbox\bgroup\vskip1ex\hbox\bgroup\hskip1ex\minipage\dimen@%
  \def\@temp{#1}\ifx\@temp\q@\else\leavevmode{\headfam\bfseries#1\quad}\fi%
}{%
  \endminipage\hskip1ex\egroup\vskip1ex\egroup%
  \vrule\egroup\hrule\egroup%
  \medskip%
}

%%% Categories for the `describe' environment.
%%%
%%% \definedescribecategory{CAT}{NAME}: record NAME as the printed name of
%%% category CAT, in the control sequence named `cat!CAT'.
%%%
%%% \describecategoryname{CAT}: produce the printed name recorded for CAT,
%%% or CAT itself if no name has been recorded.  This works by assignment
%%% (it overwrites \@tempa), so it is not purely expandable.
\def\definedescribecategory#1#2{\@namedef{cat!#1}{#2}}
\def\describecategoryname#1{%
  \expandafter\let\expandafter\@tempa\csname cat!#1\endcsname%
  \ifx\@tempa\relax#1\else\@tempa\fi}
%%% The predefined categories.
\definedescribecategory{fun}{function}
\definedescribecategory{gf}{generic function}
\definedescribecategory{var}{variable}
\definedescribecategory{const}{constant}
\definedescribecategory{meth}{primary method}
\definedescribecategory{ar-meth}{around-method}
\definedescribecategory{be-meth}{before-method}
\definedescribecategory{af-meth}{after-method}
\definedescribecategory{cls}{class}
\definedescribecategory{ty}{type}
\definedescribecategory{mac}{macro}

%%% A sentinel: a macro whose expansion is itself.  It is used as the
%%% `no optional argument' default, and as an argument delimiter.
\def\q@{\q@}
%%% \begin{describe}[LABEL]{CAT}{SYNOPSIS} ... \end{describe}
%%%
%%% Describe a programming-interface item.  The heading line shows SYNOPSIS
%%% (set in the code face, within a `tabbing' environment) on the left and
%%% the printed name of category CAT, bold and in square brackets, on the
%%% right; the body is set as the single item of a list with no right
%%% margin.
%%%
%%% A label `CAT:NAME' is defined, where NAME is LABEL if that is given, or
%%% otherwise the first word of SYNOPSIS (everything up to the first space).
%%% The label is also written to the terminal and log with \message.
\newenvironment{describe}[3][\q@]{%
  \normalfont%
  \par\goodbreak%
  \vspace{\bigskipamount}%
  \setbox\z@\hbox{\bfseries[\describecategoryname{#2}]}%
  \dimen@\linewidth\advance\dimen@-\wd\z@%
  % \@temp NAME REST\q@: define the label from NAME and discard REST.
  \def\@temp##1 ##2\q@{\message{#2:##1}\label{#2:##1}}%
  % Both calls must end with the \q@ delimiter which \@temp scans for;
  % otherwise an explicit LABEL would cause a runaway argument.
  \def\@tempa{#1}\ifx\@tempa\q@\@temp#3 \q@\else\@temp{#1} \q@\fi%
  \edef\@temp{{\the\linewidth}{@{}p{\the\dimen@}%
      @{\extracolsep{\fill}}l@{\extracolsep{0pt}}}}%
  \noindent\csname tabular*\expandafter\endcsname\@temp%
  \tabbing\codeface#3\endtabbing&\unhbox\z@\\\endtabular%
%  \@afterheading%
  \list{}{\rightmargin\z@}\item%
}{%
  \endlist%
}

%%% \push: for use on a line of a `tabbing' environment.  Set a tab stop one
%%% quad to the right, move the left margin to it for the following lines
%%% (\+), and discard the line itself (\kill).
\def\push{\quad\=\+\kill}

\begin{document}

\maketitle

\include{sod-tut}

%%%--------------------------------------------------------------------------
\chapter{Internals}

\section{Generated names}

The generated names for functions and objects related to a class are
constructed systematically so as not to interfere with each other.  The rules
on class, slot and message naming exist so as to ensure that the generated
names don't collide with each other.

The following notation is used in this section.
\begin{description}
\item[@<class>] The full name of the `focus' class: the one for which we are
  generating names.
\item[@<super-nick>] The nickname of a superclass.
\item[@<head-nick>] The nickname of the chain-head class of the chain
  in question.
\end{description}

\subsection{Instance layout}

%%%--------------------------------------------------------------------------
\section{Syntax}
\label{sec:syntax}

Fortunately, Sod is syntactically quite simple.  I've used some slightly
unusual notation in order to make the presentation easier to read.  For any
nonterminal $x$:
\begin{itemize}
\item $\epsilon$ denotes the empty nonterminal:
  \begin{quote}
    $\epsilon$ ::=
  \end{quote}
\item @[$x$@] means an optional $x$:
  \begin{quote}
    \syntax{@[$x$@] ::= $\epsilon$ @! $x$}
  \end{quote}
\item $x^*$ means a sequence of zero or more $x$s:
  \begin{quote}
    \syntax{$x^*$ ::= $\epsilon$ @! $x^*$ $x$}
  \end{quote}
\item $x^+$ means a sequence of one or more $x$s:
  \begin{quote}
    \syntax{$x^+$ ::= $x$ $x^*$}
  \end{quote}
\item $x$@<-list> means a sequence of one or more $x$s separated
  by commas:
  \begin{quote}
    \syntax{$x$<-list> ::= $x$ @! $x$<-list> "," $x$}
  \end{quote}
\end{itemize}

\subsection{Lexical syntax}
\label{sec:syntax.lex}

Whitespace and comments are discarded.  The remaining characters are
collected into tokens according to the following syntax.

\begin{grammar}
<token> ::= <identifier>
\alt <string-literal>
\alt <char-literal>
\alt <integer-literal>
\alt <punctuation>
\end{grammar}

This syntax is slightly ambiguous, and is disambiguated by the \emph{maximal
munch} rule: at each stage we take the longest sequence of characters which
could be a token.

\subsubsection{Identifiers} \label{sec:syntax.lex.id}

\begin{grammar}
<identifier> ::= <id-start-char> @<id-body-char>^*

<id-start-char> ::= <alpha-char> | "_"

<id-body-char> ::= <id-start-char> @! <digit-char>

<alpha-char> ::= "A" | "B" | \dots\ | "Z"
\alt "a" | "b" | \dots\ | "z"
\alt <extended-alpha-char>

<digit-char> ::= "0" | <nonzero-digit-char>

<nonzero-digit-char> ::= "1" | "2" $| \cdots |$ "9"
\end{grammar}

The precise definition of @<alpha-char> is left to the function
\textsf{alpha-char-p} in the hosting Lisp system.  For portability,
programmers are encouraged to limit themselves to the standard ASCII letters.

There are no reserved words at the lexical level, but the higher-level syntax
recognizes certain identifiers as \emph{keywords} in some contexts.  There is
also an ambiguity (inherited from C) in the declaration syntax which is
settled by distinguishing type names from other identifiers at a lexical
level.

\subsubsection{String and character literals} \label{sec:syntax.lex.string}

\begin{grammar}
<string-literal> ::= "\"" @<string-literal-char>^* "\""

<char-literal> ::= "'" <char-literal-char> "'"

<string-literal-char> ::= any character other than "\\" or "\""
\alt "\\" <char>

<char-literal-char> ::= any character other than "\\" or "'"
\alt "\\" <char>

<char> ::= any single character
\end{grammar}

The syntax for string and character literals differs from~C.  In particular,
escape sequences such as @`\textbackslash n' are not recognized.  The use
of string and character literals in Sod, outside of C~fragments, is limited,
and the simple syntax seems adequate.  For the sake of future compatibility,
the use of character sequences which resemble C escape sequences is
discouraged.

\subsubsection{Integer literals} \label{sec:syntax.lex.int}

\begin{grammar}
<integer-literal> ::= <decimal-integer>
\alt <binary-integer>
\alt <octal-integer>
\alt <hex-integer>

<decimal-integer> ::= <nonzero-digit-char> @<digit-char>^*

<binary-integer> ::= "0" @("b"|"B"@) @<binary-digit-char>^+

<binary-digit-char> ::= "0" | "1"

<octal-integer> ::= "0" @["o"|"O"@] @<octal-digit-char>^+

<octal-digit-char> ::= "0" | "1" $| \cdots |$ "7"

<hex-integer> ::= "0" @("x"|"X"@) @<hex-digit-char>^+

<hex-digit-char> ::= <digit-char>
\alt "A" | "B" | "C" | "D" | "E" | "F"
\alt "a" | "b" | "c" | "d" | "e" | "f"
\end{grammar}

Sod understands only integers, not floating-point numbers; its integer syntax
goes slightly beyond C in allowing a @`0o' prefix for octal and @`0b' for
binary.  However, length and signedness indicators are not permitted.

\subsubsection{Punctuation} \label{sec:syntax.lex.punct}

\begin{grammar}
<punctuation> ::= any nonalphanumeric character other than "_", "\"" or "'"
\end{grammar}

\subsubsection{Comments} \label{sec:lex-comment}

\begin{grammar}
<comment> ::= <block-comment>
\alt <line-comment>

<block-comment> ::=
  "/*"
  @<not-star>^* @(@<star>^+ <not-star-or-slash> @<not-star>^*@)^*
  @<star>^*
  "*/"

<star> ::= "*"

<not-star> ::= any character other than "*"

<not-star-or-slash> ::= any character other than "*" or  "/"

<line-comment> ::= "//" @<not-newline>^* <newline>

<newline> ::= a newline character

<not-newline> ::= any character other than newline
\end{grammar}

Comments are exactly as in C99: both traditional block comments `\texttt{/*}
\dots\ \texttt{*/}' and \Cplusplus-style `\texttt{//} \dots' comments are
permitted and ignored.

\subsection{Special nonterminals}
\label{sec:special-nonterminals}

Aside from the lexical syntax presented above (\xref{sec:syntax.lex}),
two special nonterminals occur in the module syntax.

\subsubsection{S-expressions} \label{sec:syntax-sexp}

\begin{grammar}
<s-expression> ::= an S-expression, as parsed by the Lisp reader
\end{grammar}

When an S-expression is expected, the Sod parser simply calls the host Lisp
system's \textsf{read} function.  Sod modules are permitted to modify the
read table to extend the S-expression syntax.

S-expressions are self-delimiting, so no end-marker is needed.

\subsubsection{C fragments} \label{sec:syntax.lex.cfrag}

\begin{grammar}
<c-fragment> ::= a sequence of C tokens, with matching brackets
\end{grammar}

Sequences of C code are simply stored and written to the output unchanged
during translation.  They are read using a simple scanner which nonetheless
understands C comments and string and character literals.

A C fragment is terminated by one of a small number of delimiter characters
determined by the immediately surrounding context -- usually a closing brace
or bracket.  The first such delimiter character which is not enclosed in
brackets, braces or parentheses ends the fragment.

\subsection{Module syntax} \label{sec:syntax-module}

\begin{grammar}
<module> ::= @<definition>^*

<definition> ::= <import-definition>
\alt <load-definition>
\alt <lisp-definition>
\alt <code-definition>
\alt <typename-definition>
\alt <class-definition>
\end{grammar}

A module is the top-level syntactic item.  A module consists of a sequence of
definitions.

\subsection{Simple definitions} \label{sec:syntax.defs}

\subsubsection{Importing modules} \label{sec:syntax.defs.import}

\begin{grammar}
<import-definition> ::= "import" <string> ";"
\end{grammar}

The module named @<string> is processed and its definitions made available.

A search is made for a module source file as follows.
\begin{itemize}
\item The module name @<string> is converted into a filename by appending
  @`.sod', if it has no extension already.\footnote{%
    Technically, what happens is \textsf{(merge-pathnames name (make-pathname
    :type "SOD" :case :common))}, so exactly what this means varies
    according to the host system.} %
\item The file is looked for relative to the directory containing the
  importing module.
\item If that fails, then the file is looked for in each directory on the
  module search path in turn.
\item If the file still isn't found, an error is reported and the import
  fails.
\end{itemize}
At this point, if the file has previously been imported, nothing further
happens.\footnote{%
  This check is done using \textsf{truename}, so it should see through simple
  tricks like symbolic links.  However, it may be confused by fancy things
  like bind mounts and so on.} %

Recursive imports, either direct or indirect, are an error.

\subsubsection{Loading extensions} \label{sec:syntax.defs.load}

\begin{grammar}
<load-definition> ::= "load" <string> ";"
\end{grammar}

The Lisp file named @<string> is loaded and evaluated.

A search is made for a Lisp source file as follows.
\begin{itemize}
\item The name @<string> is converted into a filename by appending @`.lisp',
  if it has no extension already.\footnote{%
    Technically, what happens is \textsf{(merge-pathnames name (make-pathname
    :type "LISP" :case :common))}, so exactly what this means varies
    according to the host system.} %
\item A search is then made in the same manner as for module imports
  (\xref{sec:syntax.defs.import}).
\end{itemize}
If the file is found, it is loaded using the host Lisp's \textsf{load}
function.

Note that Sod doesn't attempt to compile Lisp files, or even to look for
existing compiled files.  The right way to package a substantial extension to
the Sod translator is to provide the extension as a standard ASDF system (or
similar) and leave a dropping @"foo-extension.lisp" in the module path saying
something like
\begin{quote}
  \textsf{(asdf:load-system :foo-extension)}
\end{quote}
which will arrange for the extension to be compiled if necessary.

(This approach means that the language doesn't need to depend on any
particular system definition facility.  It's bad enough already that it
depends on Common Lisp.)

\subsubsection{Lisp escapes} \label{sec:syntax.defs.lisp}

\begin{grammar}
<lisp-definition> ::= "lisp" <s-expression> ";"
\end{grammar}

The @<s-expression> is evaluated immediately.  It can do anything it likes.

\textbf{Warning!}  This means that hostile Sod modules are a security hazard.
Lisp code can read and write files, start other programs, and make network
connections.  Don't install Sod modules from sources that you don't
trust.\footnote{%
  Presumably you were going to run the corresponding code at some point, so
  this isn't as unusually scary as it sounds.  But please be careful.} %

\subsubsection{Declaring type names} \label{sec:syntax.defs.typename}

\begin{grammar}
<typename-definition> ::=
  "typename" <identifier-list> ";"
\end{grammar}

Each @<identifier> is declared as naming a C type.  This is important because
the C type syntax -- which Sod uses -- is ambiguous, and disambiguation is
done by distinguishing type names from other identifiers.

Don't declare class names using @"typename"; use @"class" forward
declarations instead.

\subsection{Literal code} \label{sec:syntax-code}

\begin{grammar}
<code-definition> ::=
  "code" <identifier> ":" <identifier> @[<constraints>@]
  "{" <c-fragment> "}"

<constraints> ::= "[" <constraint-list> "]"

<constraint> ::= @<identifier>^+
\end{grammar}

The @<c-fragment> will be output unchanged to one of the output files.

The first @<identifier> is the symbolic name of an output file.  Predefined
output file names are @"c" and @"h", which are the implementation code and
header file respectively; other output files can be defined by extensions.

The second @<identifier> provides a name for the output item.  Several C
fragments can have the same name: they will be concatenated together in the
order in which they were encountered.

The @<constraints> provide a means for specifying where in the output file
the output item should appear.  (Note the two kinds of square brackets shown
in the syntax: literal square brackets must appear around the constraints if
they are present, but the constraints may be omitted entirely.)  Each
comma-separated @<constraint>
is a sequence of identifiers naming output items, and indicates that the
output items must appear in the order given -- though the translator is free
to insert additional items in between them.  (The particular output items
needn't be defined already -- indeed, they needn't be defined ever.)

There is a predefined output item @"includes" in both the @"c" and @"h"
output files which is a suitable place for inserting @"\#include"
preprocessor directives in order to declare types and functions for use
elsewhere in the generated output files.

\subsection{Property sets} \label{sec:syntax.propset}

\begin{grammar}
<properties> ::= "[" <property-list> "]"

<property> ::= <identifier> "=" <expression>
\end{grammar}

Property sets are a means for associating miscellaneous information with
classes and related items.  By using property sets, additional information
can be passed to extensions without the need to introduce idiosyncratic
syntax.

A property has a name, given as an @<identifier>, and a value computed by
evaluating an @<expression>.  The value can be one of a number of types,
though the only operators currently defined act on integer values only.

\subsubsection{The expression evaluator} \label{sec:syntax.propset.expr}

\begin{grammar}
<expression> ::= <term> | <expression> "+" <term> | <expression> "-" <term>

<term> ::= <factor> | <term> "*" <factor> | <term> "/" <factor>

<factor> ::= <primary> | "+" <factor> | "-" <factor>

<primary> ::=
     <integer-literal> | <string-literal> | <char-literal> | <identifier>
\alt "?" <s-expression>
\alt "(" <expression> ")"
\end{grammar}

The arithmetic expression syntax is simple and standard; there are currently
no bitwise, logical, or comparison operators.

A @<primary> expression may be a literal or an identifier.  Note that
identifiers stand for themselves: they \emph{do not} denote values.  For more
fancy expressions, the syntax
\begin{quote}
  @"?" @<s-expression>
\end{quote}
causes the @<s-expression> to be evaluated using the Lisp \textsf{eval}
function.
%%% FIXME crossref to extension docs

\subsection{C types} \label{sec:syntax.c-types}

Sod's syntax for C types closely mirrors the standard C syntax.  A C type has
two parts: a sequence of @<declaration-specifier>s and a @<declarator>.  In
Sod, a type must contain at least one @<declaration-specifier> (i.e.,
`implicit @"int"' is forbidden), and storage-class specifiers are not
recognized.

\subsubsection{Declaration specifiers} \label{sec:syntax.c-types.declspec}

\begin{grammar}
<declaration-specifier> ::= <type-name>
\alt "struct" <identifier> | "union" <identifier> | "enum" <identifier>
\alt "void" | "char" | "int" | "float" | "double"
\alt "short" | "long"
\alt "signed" | "unsigned"
\alt <qualifier>

<qualifier> ::= "const" | "volatile" | "restrict"

<type-name> ::= <identifier>
\end{grammar}

A @<type-name> is an identifier which has been declared as being a type name,
using the @"typename" or @"class" definitions.

Declaration specifiers may appear in any order.  However, not all
combinations are permitted.  A sequence of declaration specifiers must
consist of zero or more @<qualifier>s, and one of the following, up to
reordering.
\begin{itemize}
\item @<type-name>
\item @"struct" @<identifier>, @"union" @<identifier>, @"enum" @<identifier>
\item @"void"
\item @"char", @"unsigned char", @"signed char"
\item @"short", @"unsigned short", @"signed short"
\item @"short int", @"unsigned short int", @"signed short int"
\item @"int", @"unsigned int", @"signed int", @"unsigned", @"signed"
\item @"long", @"unsigned long", @"signed long"
\item @"long int", @"unsigned long int", @"signed long int"
\item @"long long", @"unsigned long long", @"signed long long"
\item @"long long int", @"unsigned long long int", @"signed long long int"
\item @"float", @"double", @"long double"
\end{itemize}
All of these have their usual C meanings.

\subsubsection{Declarators} \label{sec:syntax.c-types.declarator}

\begin{grammar}
<declarator>$[k]$ ::= @<pointer>^* <primary-declarator>$[k]$

<primary-declarator>$[k]$ ::= $k$
\alt "(" <primary-declarator>$[k]$ ")"
\alt <primary-declarator>$[k]$ @<declarator-suffix>^*

<pointer> ::= "*" @<qualifier>^*

<declarator-suffix> ::= "[" <c-fragment> "]"
\alt "(" <arguments> ")"

<arguments> ::= $\epsilon$ | "..."
\alt <argument-list> @["," "..."@]

<argument> ::= @<declaration-specifier>^+ <argument-declarator>

<argument-declarator> ::= <declarator>@[<identifier> @! $\epsilon$@]

<simple-declarator> ::= <declarator>@[<identifier>@]

<dotted-name> ::= <identifier> "." <identifier>

<dotted-declarator> ::= <declarator>@[<dotted-name>@]
\end{grammar}

The declarator syntax is taken from C, but with some differences.
\begin{itemize}
\item Array dimensions are uninterpreted @<c-fragment>s, terminated by a
  closing square bracket.  This allows array dimensions to contain arbitrary
  constant expressions.
\item A declarator may have either a single @<identifier> at its centre or a
  pair of @<identifier>s separated by a @`.'; this is used to refer to
  slots or messages defined in superclasses.
\end{itemize}
The remaining differences are (I hope) a matter of presentation rather than
substance.

\subsection{Defining classes} \label{sec:syntax.class}

\begin{grammar}
<class-definition> ::= <class-forward-declaration>
\alt <full-class-definition>
\end{grammar}

\subsubsection{Forward declarations} \label{sec:class.class.forward}

\begin{grammar}
<class-forward-declaration> ::= "class" <identifier> ";"
\end{grammar}

A @<class-forward-declaration> informs Sod that an @<identifier> will be used
to name a class which is currently undefined.  Forward declarations are
necessary in order to resolve certain kinds of circularity.  For example,
\begin{listing}
class Sub;

class Super : SodObject {
  Sub *sub;
};

class Sub : Super {
  /* ... */
};
\end{listing}

\subsubsection{Full class definitions} \label{sec:class.class.full}

\begin{grammar}
<full-class-definition> ::=
  @[<properties>@]
  "class" <identifier> ":" <identifier-list>
  "{" @<class-item>^* "}"

<class-item> ::= <slot-item> ";"
\alt <message-item>
\alt <method-item>
\alt  <initializer-item> ";"
\end{grammar}

A full class definition provides a complete description of a class.

The first @<identifier> gives the name of the class.  It is an error to
give the name of an existing class (other than a forward-referenced class),
or an existing type name.  It is conventional to give classes `MixedCase'
names, to distinguish them from other kinds of identifiers.

The @<identifier-list> names the direct superclasses for the new class.  It
is an error if any of these @<identifier>s does not name a defined class.

The @<properties> provide additional information.  The standard class
properties are as follows.
\begin{description}
\item[@"lisp_class"] The name of the Lisp class to use within the translator
  to represent this class.  The property value must be an identifier; the
  default is @"sod_class".  Extensions may define classes with additional
  behaviour, and may recognize additional class properties.
\item[@"metaclass"] The name of the Sod metaclass for this class.  In the
  generated code, a class is itself an instance of another class -- its
  \emph{metaclass}.  The metaclass defines which slots the class will have,
  which messages it will respond to, and what its behaviour will be when it
  receives them.  The property value must be an identifier naming a defined
  subclass of @"SodClass".  The default metaclass is @"SodClass".
  %%% FIXME xref to theory
\item[@"nick"] A nickname for the class, to be used to distinguish it from
  other classes in various limited contexts.  The property value must be an
  identifier; the default is constructed by forcing the class name to
  lower-case.
\end{description}

The class body consists of a sequence of @<class-item>s enclosed in braces.
These items are discussed in the following sections.

\subsubsection{Slot items} \label{sec:sntax.class.slot}

\begin{grammar}
<slot-item> ::=
  @[<properties>@]
  @<declaration-specifier>^+ <init-declarator-list>

<init-declarator> ::= <declarator> @["=" <initializer>@]
\end{grammar}

A @<slot-item> defines one or more slots.  All instances of the class and any
subclass will contain these slots, with the names and types given by the
@<declaration-specifier>s and the @<declarator>s.  Slot declarators may not
contain qualified identifiers.

It is not possible to declare a slot with function type: such an item is
interpreted as being a @<message-item> or @<method-item>.  Pointers to
functions are fine.

An @<initializer>, if present, is treated as if a separate
@<initializer-item> containing the slot name and initializer were present.
For example,
\begin{listing}
[nick = eg]
class Example : Super {
  int foo = 17;
};
\end{listing}
means the same as
\begin{listing}
[nick = eg]
class Example : Super {
  int foo;
  eg.foo = 17;
};
\end{listing}

\subsubsection{Initializer items} \label{sec:syntax.class.init}

\begin{grammar}
<initializer-item> ::= @["class"@] <slot-initializer-list>

<slot-initializer> ::= <qualified-identifier> "=" <initializer>

<initializer> ::= "{" <c-fragment> "}" | <c-fragment>
\end{grammar}

An @<initializer-item> provides an initial value for one or more slots.  If
prefixed by @"class", then the initial values are for class slots (i.e.,
slots of the class object itself); otherwise they are for instance slots.

The first component of the @<qualified-identifier> must be the nickname of
one of the class's superclasses (including itself); the second must be the
name of a slot defined in that superclass.

The initializer has one of two forms.
\begin{itemize}
\item A @<c-fragment> enclosed in braces denotes an aggregate initializer.
  This is suitable for initializing structure, union or array slots.
\item A @<c-fragment> \emph{not} beginning with an open brace is a `bare'
  initializer, and continues until the next @`,' or @`;' which is not within
  nested brackets.  Bare initializers are suitable for initializing scalar
  slots, such as pointers or integers, and strings.
\end{itemize}

\subsubsection{Message items} \label{sec:syntax.class.message}

\begin{grammar}
<message-item> ::=
  @[<properties>@]
  @<declaration-specifier>^+ <declarator> @[<method-body>@]
\end{grammar}

\subsubsection{Method items} \label{sec:syntax.class.method}

\begin{grammar}
<method-item> ::=
  @[<properties>@]
  @<declaration-specifier>^+ <declarator> <method-body>

<method-body> ::= "{" <c-fragment> "}" | "extern" ";"
\end{grammar}

%%%--------------------------------------------------------------------------
\section{Class objects}

\begin{listing}
/* A class object is the `obj' ichain structure of SodClass. */
typedef struct SodClass__ichain_obj SodClass;

/* Description of one chain, as held in a class's `chains' vector. */
struct sod_chain {
  size_t n_classes;                     /* Number of classes in chain */
  const SodClass *const *classes;       /* Vector of classes, head first */
  size_t off_ichain;                    /* Offset of ichain from instance base */
  const struct sod_vtable *vt;          /* Vtable pointer for chain */
  size_t ichainsz;                      /* Size of the ichain structure */
};

/* The vtable members reachable through a `struct sod_instance'. */
struct sod_vtable {
  SodClass *_class;                     /* Pointer to instance's class */
  size_t _base;                         /* Offset to instance base */
};

/* The slots of a class object (the `cls' member of SodClass). */
struct SodClass__islots {

  /* Basic information */
  const char *name;                     /* The class's name as a string */
  const char *nick;                     /* The nickname as a string */

  /* Instance allocation and initialization */
  size_t instsz;                        /* Instance layout size in bytes */
  void *(*imprint)(void *);             /* Stamp instance with vtable ptrs */
  void *(*init)(void *);                /* Initialize instance */

  /* Superclass structure */
  size_t n_supers;                      /* Number of direct superclasses */
  const SodClass *const *supers;        /* Vector of direct superclasses */
  size_t n_cpl;                         /* Length of class precedence list */
  const SodClass *const *cpl;           /* Vector for class precedence list */

  /* Chain structure */
  const SodClass *link;                 /* Link to next class in chain */
  const SodClass *head;                 /* Pointer to head of chain */
  size_t level;                         /* Index of class in its chain */
  size_t n_chains;                      /* Number of superclass chains */
  const struct sod_chain *chains;       /* Vector of chain structures */

  /* Layout */
  size_t off_islots;                    /* Offset of islots from ichain base */
  size_t islotsz;                       /* Size of instance slots */
};

/* A class object: a vtable pointer followed by the class slots. */
struct SodClass__ichain_obj {
  const struct SodClass__vt_obj *_vt;
  struct SodClass__islots cls;
};

/* Generic view of an instance pointer: only the vtable pointer is known. */
struct sod_instance {
  struct sod_vtable *_vt;
};
\end{listing}

\begin{listing}
/* Convert the instance pointer obj into a pointer to the same instance's
 * ichain for class cls.  Returns null if the instance's class has no chain
 * headed by cls's chain head, or if cls is not found at its own level in
 * that chain.
 */
void *sod_convert(const SodClass *cls, const void *obj)
{
  const struct sod_instance *inst = obj;
  const SodClass *real = inst->_vt->_class;
  const struct sod_chain *chain;
  size_t i, index;

  for (i = 0; i < real->cls.n_chains; i++) {
    chain = &real->cls.chains[i];
    if (chain->classes[0] == cls->cls.head) {
      index = cls->cls.level;
      if (index < chain->n_classes && chain->classes[index] == cls)
        /* Step back to the instance base, then on to the wanted ichain. */
        return ((char *)obj - inst->_vt->_base + chain->off_ichain);
      else
        return (0);
    }
  }
  return (0);
}
\end{listing}

%%%--------------------------------------------------------------------------
\section{Classes}
\label{sec:class}

\subsection{Classes and superclasses} \label{sec:class.defs}

A @<full-class-definition> must list one or more existing classes to be the
\emph{direct superclasses} for the new class being defined.  We make the
following definitions.
\begin{itemize}
\item The \emph{superclasses} of a class consist of the class itself together
  with the superclasses of its direct superclasses.
\item The \emph{proper superclasses} of a class are its superclasses other
  than itself.
\item If $C$ is a (proper) superclass of $D$ then $D$ is a (\emph{proper})
  \emph{subclass} of $C$.
\end{itemize}
The predefined class @|SodObject| has no direct superclasses; it is unique in
this respect.  All classes are subclasses of @|SodObject|.

\subsection{The class precedence list} \label{sec:class.cpl}

Let $C$ be a class.  The superclasses of $C$ form a directed graph, with an
edge from each class to each of its direct superclasses.  This is the
\emph{superclass graph of $C$}.

In order to resolve inheritance of items, we define a \emph{class precedence
  list} (or CPL) for each class, which imposes a total order on that class's
superclasses.  The default algorithm for computing the CPL is the \emph{C3}
algorithm \cite{fixme-c3}, though extensions may implement other algorithms.

The default algorithm works as follows.  Let $C$ be the class whose CPL we
are to compute.  Let $X$ and $Y$ be two of $C$'s superclasses.
\begin{itemize}
\item $C$ must appear first in the CPL.
\item If $X$ appears before $Y$ in the CPL of one of $C$'s direct
  superclasses, then $X$ appears before $Y$ in $C$'s CPL.
\item If the above rules don't suffice to order $X$ and $Y$, then whichever
  of $X$ and $Y$ has a subclass which appears further left in the list of
  $C$'s direct superclasses will appear earlier in the CPL.
\end{itemize}
This last rule is sufficient to disambiguate because if both $X$ and $Y$ are
superclasses of the same direct superclass of $C$ then that direct
superclass's CPL will order $X$ and $Y$.

We say that \emph{$X$ is more specific than $Y$ as a superclass of $C$} if
$X$ is earlier than $Y$ in $C$'s class precedence list.  If $C$ is clear from
context then we omit it, saying simply that $X$ is more specific than $Y$.

\subsection{Instances and metaclasses} \label{sec:class.meta}

A class defines the structure and behaviour of its \emph{instances}: run-time
objects created (possibly) dynamically.  An instance is an instance of only
one class, though structurally it may be used in place of an instance of any
of that class's superclasses.  It is possible, with care, to change the class
of an instance at run-time.

Classes are themselves represented as instances -- called \emph{class
  objects} -- in the running program.  Being instances, they have a class,
called the \emph{metaclass}.  The metaclass defines the structure and
behaviour of the class object.

The predefined class @|SodClass| is the default metaclass for new classes.
@|SodClass| has @|SodObject| as its only direct superclass.  @|SodClass| is
its own metaclass.

To make matters more complicated, Sod has \emph{two} distinct metalevels: as
well as the runtime metalevel, as discussed above, there's a compile-time
metalevel hosted in the Sod translator.  Since Sod is written in Common Lisp,
a Sod class's compile-time metaclass is a CLOS class.  The usual compile-time
metaclass is @|sod-class|.  The compile-time metalevel is the subject of
\xref{ch:api}.

\subsection{Items and inheritance} \label{sec:class.inherit}

A class definition also declares \emph{slots}, \emph{messages},
\emph{initializers} and \emph{methods} -- collectively referred to as
\emph{items}.  In addition to the items declared in the class definition --
the class's \emph{direct items} -- a class also \emph{inherits} items from
its superclasses.

The precise rules for item inheritance vary according to the kinds of items
involved.

Some object systems have a notion of `repeated inheritance': if there are
multiple paths in the superclass graph from a class to one of its
superclasses then items defined in that superclass may appear duplicated in
the subclass.  Sod does not have this notion.

\subsubsection{Slots} \label{sec:class.inherit.slots}
A \emph{slot} is a unit of state.  In other object systems, slots may be
called `fields', `member variables', or `instance variables'.

A slot has a \emph{name} and a \emph{type}.  The name serves only to
distinguish the slot from other direct slots defined by the same class.  A
class inherits all of its proper superclasses' slots.  Slots inherited from
superclasses do not conflict with each other or with direct slots, even if
they have the same names.

At run-time, each instance of the class holds a separate value for each slot,
whether direct or inherited.  Changing the value of an instance's slot
doesn't affect other instances.

\subsubsection{Initializers} \label{sec:class.inherit.init}
Mumble.

\subsubsection{Messages} \label{sec:class.inherit.messages}
A \emph{message} is the stimulus for behaviour.  In Sod, a class must define,
statically, the name and format of the messages it is able to receive and the
values it will return in reply.  In this respect, a message is similar to
`abstract member functions' or `interface member functions' in other object
systems.

Like slots, a message has a \emph{name} and a \emph{type}.  Again, the name
serves only to distinguish the message from other direct messages defined by
the same class.  Messages inherited from superclasses do not conflict with
each other or with direct messages, even if they have the same name.

At run-time, one sends a message to an instance by invoking a function
obtained from the instance's \emph{vtable}; see \xref{sec:fixme-vtable}.

\subsubsection{Methods} \label{sec:class.inherit.methods}
A \emph{method} is a unit of behaviour.  In other object systems, methods may
be called `member functions'.

A method is associated with a message.  When a message is received by an
instance, all of the methods associated with that message on the instance's
class or any of its superclasses are \emph{applicable}.  The details of how
the applicable methods are invoked are described fully in
\xref{sec:fixme-method-combination}.

\subsection{Chains and instance layout} \label{sec:class.layout}

C is a rather low-level language, and in particular it exposes details of the
way data is laid out in memory.  Since an instance of a class~$C$ should be
(at least in principle) usable anywhere an instance of some superclass $B
\succeq C$ is expected, this implies that an instance of the subclass $C$
needs to contain within it a complete instance of each superclass $B$, laid
out according to the rules of instances of $B$, so that if we have (the
address of) an instance of $C$, we can easily construct a pointer to a thing
which looks like an instance of $B$ contained within it.

Specifically, the information we need to retain for an instance of a
class~$C$ is:
\begin{itemize}
\item the values of each of the slots defined by $C$, including those defined
  by superclasses;
\item information which will let us convert a pointer to $C$ into a pointer
  to any superclass $B \succeq C$;
\item information which will let us call the appropriate effective method for
  each message defined by $C$, including those defined by superclasses; and
\item some additional meta-level information, such as how to find the class
  object for $C$ given (the address of) one of its instances.
\end{itemize}

Observe that, while each distinct instance must clearly have its own storage
for slots, all instances of $C$ can share a single copy of the remaining
information.  The individual instance only needs to keep a pointer to this
shared table, which, inspired by the similar structure in many \Cplusplus\
ABIs, is called a \emph{vtable}.

The easiest approach would be to decide that instances of $C$ are exactly
like instances of $B$, only with extra space at the end for the extra slots
which $C$ defines over and above those already existing in $B$.  Conversion
is then trivial: a pointer to an instance of $C$ can be converted to a
pointer to an instance of some superclass $B$ simply by casting.  Even though
the root class @|SodObject| doesn't have any slots at all, its instances will
still need a vtable so that you can find its class object: the address of the
vtable therefore needs to be at the very start of the instance structure.
Again, the vtable for a class would have the vtable for each of its
superclasses as a prefix, with new items added afterwards.

This appealing approach works well for an object system which only permits
single inheritance of both state and behaviour.  Alas, it breaks down when
multiple inheritance is allowed: $C$ can be a subclass of both $B$ and $B'$,
even though $B$ is not a subclass of $B'$, nor \emph{vice versa}; so, in
general, $B$'s instance structure will not be a prefix of $B'$'s, nor will
$B'$'s be a prefix of $B$'s, and therefore $C$ cannot have both $B$ and $B'$
as a prefix.

A (non-root) class may -- though need not -- have a distinguished \emph{link}
superclass, which need not be a direct superclass.  Furthermore, each
class~$C$ must satisfy the \emph{chain condition}: for any superclass $A$ of
$C$, there can be at most one other superclass of $C$ whose link superclass
is $A$.\footnote{%
  That is, it's permitted for two classes $B$ and $B'$ to have the same link
  superclass $A$, but $B$ and $B'$ can't then both be superclasses of the
  same class $C$.} %
Therefore, the links partition the superclasses of~$C$ into nice linear
\emph{chains}, such that each superclass is a member of exactly one chain.
If a class~$B$ has a link superclass~$A$, then $B$'s \emph{level} is one more
than that of $A$; otherwise $B$ is called a \emph{chain head} and its level
is zero.  If the classes in a chain are written in a list, chain head first,
then the level of each class gives its index in the list.

Chains therefore allow us to recover some of the linearity properties which
made layout simple in the case of single inheritance.  The instance structure
for a class $C$ contains a substructure for each of $C$'s superclass chains;
a pointer to an object of class $C$ actually points to the substructure for
the chain containing $C$.  The order of these substructures is unimportant
for now.\footnote{%
  The chains appear in the order in which their most specific classes appear
  in $C$'s class precedence list.  This guarantees that the chain containing
  $C$ itself appears first, so that a pointer to $C$'s instance structure is
  actually a pointer to $C$'s chain substructure.  Apart from that, it's a
  simple, stable, but basically arbitrary choice which can't be changed
  without breaking the ABI.} %
The substructure for each chain begins with a pointer to a vtable, followed
by a structure for each superclass in the chain containing the slots defined
by that superclass, with the chain head (least specific class) first.

Suppose we have a pointer to (static) type $C$, and want to convert it into a
pointer to some superclass $B$ of $C$ -- an \emph{upcast}.\footnote{%
  In the more general case, we have a pointer to static type $C$, which
  actually points to an object of some subclass $D$ of $C$, and want to
  convert it into a pointer to type $B$.  Such a conversion is called a
  \emph{downcast} if $B$ is a subclass of $C$, or a \emph{cross-cast}
  otherwise.  Downcasts and cross-casts require complicated run-time
  checking, and will fail unless $B$ is a superclass of $D$.} %
If $B$ is in the same chain as $C$ -- an \emph{in-chain upcast} -- then the
pointer value is already correct and it's only necessary to cast it
appropriately.  Otherwise -- a \emph{cross-chain upcast} -- the pointer needs
to be adjusted to point to a different chain substructure.  Since the lengths
and relative positions of the chain substructures vary between classes, the
adjustments are stored in the vtable.  Cross-chain upcasts are therefore a
bit slower than in-chain upcasts.

Each chain has its own separate vtable, because much of the metadata stored
in the vtable is specific to a particular chain.  For example:
\begin{itemize}
\item offsets to other chains' substructures will vary depending on which
  chain we start from; and
\item entry points to methods
  %% FIXME: this item was left unfinished in the draft.
\end{itemize}

%%%--------------------------------------------------------------------------
\chapter{The Lisp programming interface} \label{ch:api}

%% output for `h' files
%%
%% prologue
%% guard start
%% typedefs start
%% typedefs
%% typedefs end
%% includes start
%% includes
%% includes end
%% classes start
%% CLASS banner
%% CLASS islots start
%% CLASS islots slots
%% CLASS islots end
%% CLASS vtmsgs start
%% CLASS vtmsgs CLASS start
%% CLASS vtmsgs CLASS slots
%% CLASS vtmsgs CLASS end
%% CLASS vtmsgs end
%% CLASS vtables start
%% CLASS vtables CHAIN-HEAD start
%% CLASS vtables CHAIN-HEAD slots
%% CLASS vtables CHAIN-HEAD end
%% CLASS vtables end
%% CLASS vtable-externs
%% CLASS vtable-externs-after
%% CLASS methods start
%% CLASS methods
%% CLASS methods end
%% CLASS ichains start
%% CLASS ichains CHAIN-HEAD start
%% CLASS ichains CHAIN-HEAD slots
%% CLASS ichains CHAIN-HEAD end
%% CLASS ichains end
%% CLASS ilayout start
%% CLASS ilayout slots
%% CLASS ilayout end
%% CLASS conversions
%% CLASS object
%% classes end
%% guard end
%% epilogue

%% output for `c' files
%%
%% prologue
%% includes start
%% includes
%% includes end
%% classes start
%% CLASS banner
%% CLASS direct-methods start
%% CLASS direct-methods METHOD start
%% CLASS direct-methods METHOD body
%% CLASS direct-methods METHOD end
%% CLASS direct-methods end
%% CLASS effective-methods
%% CLASS vtables start
%% CLASS vtables CHAIN-HEAD start
%% CLASS vtables CHAIN-HEAD class-pointer METACLASS
%% CLASS vtables CHAIN-HEAD base-offset
%% CLASS vtables CHAIN-HEAD chain-offset TARGET-HEAD
%% CLASS vtables CHAIN-HEAD vtmsgs CLASS start
%% CLASS vtables CHAIN-HEAD vtmsgs CLASS slots
%% CLASS vtables CHAIN-HEAD vtmsgs CLASS end
%% CLASS vtables CHAIN-HEAD end
%% CLASS vtables end
%% CLASS object prepare
%% CLASS object start
%% CLASS object CHAIN-HEAD ichain start
%% CLASS object SUPER slots start
%% CLASS object SUPER slots
%% CLASS object SUPER vtable
%% CLASS object SUPER slots end
%% CLASS object CHAIN-HEAD ichain end
%% CLASS object end
%% classes end
%% epilogue

%%%--------------------------------------------------------------------------

\include{sod-backg}
\include{sod-protocol}

\end{document}
\f
%%% Local variables:
%%% mode: LaTeX
%%% TeX-PDF-mode: t
%%% End: