static void unget(input *in, int c, filepos *pos) {
if (in->npushback >= in->pushbacksize) {
in->pushbacksize = in->npushback + 16;
- in->pushback = resize(in->pushback, in->pushbacksize);
+ in->pushback = sresize(in->pushback, in->pushbacksize, pushback);
}
in->pushback[in->npushback].chr = c;
in->pushback[in->npushback].pos = *pos; /* structure copy */
}
static void macrodef(tree234 *macros, wchar_t *name, wchar_t *text,
filepos fpos) {
- macro *m = mknew(macro);
+ macro *m = snew(macro);
m->name = name;
m->text = text;
if (add234(macros, m) != m) {
m.name = name;
gotit = find234(macros, &m, NULL);
if (gotit) {
- macrostack *expansion = mknew(macrostack);
+ macrostack *expansion = snew(macrostack);
expansion->next = in->stack;
expansion->text = gotit->text;
expansion->pos = *pos; /* structure copy */
freetree234(macros);
}
+/*
+ * Apply a single configuration paragraph to the input state.
+ *
+ * `cfg' must be a para_Config paragraph (asserted). The only directive
+ * acted on here is "input-charset": when cfg->keyword matches it
+ * according to ustricmp(), the value obtained via uadv(cfg->keyword)
+ * (presumably the keyword field following the directive name -- confirm
+ * against uadv) is converted by charset_from_ustr() and stored in
+ * in->charset. Every other directive leaves `in' untouched.
+ */
+static void input_configure(input *in, paragraph *cfg) {
+    int is_charset_directive;
+
+    assert(cfg->type == para_Config);
+
+    is_charset_directive = (ustricmp(cfg->keyword, L"input-charset") == 0);
+    if (!is_charset_directive)
+        return;
+
+    in->charset = charset_from_ustr(&cfg->fpos, uadv(cfg->keyword));
+}
+
/*
* Can return EOF
*/
-static int get(input *in, filepos *pos) {
+static int get(input *in, filepos *pos, rdstringc *rsc) {
int pushbackpt = in->stack ? in->stack->npushback : 0;
if (in->npushback > pushbackpt) {
--in->npushback;
}
else if (in->stack) {
wchar_t c = in->stack->text[in->stack->ptr];
+ if (pos)
+ *pos = in->stack->pos;
if (in->stack->text[++in->stack->ptr] == L'\0') {
macrostack *tmp = in->stack;
in->stack = tmp->next;
return c;
}
else if (in->currfp) {
- int c = getc(in->currfp);
- if (c == EOF) {
- fclose(in->currfp);
- in->currfp = NULL;
- }
- /* Track line numbers, for error reporting */
- if (pos)
- *pos = in->pos;
- if (in->reportcols) {
- switch (c) {
- case '\t':
- in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
- break;
- case '\n':
- in->pos.col = 1;
- in->pos.line++;
- break;
- default:
- in->pos.col++;
- break;
+ while (in->wcpos >= in->nwc) {
+
+ int c = getc(in->currfp);
+
+ if (c == EOF) {
+ fclose(in->currfp);
+ in->currfp = NULL;
+ return EOF;
+ }
+
+ if (rsc)
+ rdaddc(rsc, c);
+
+ /* Track line numbers, for error reporting */
+ if (pos)
+ *pos = in->pos;
+ if (in->reportcols) {
+ switch (c) {
+ case '\t':
+ in->pos.col = 1 + (in->pos.col + TAB_STOP-1) % TAB_STOP;
+ break;
+ case '\n':
+ in->pos.col = 1;
+ in->pos.line++;
+ break;
+ default:
+ in->pos.col++;
+ break;
+ }
+ } else {
+ in->pos.col = -1;
+ if (c == '\n')
+ in->pos.line++;
+ }
+
+ /*
+ * Do input character set translation, so that we return
+ * Unicode.
+ */
+ {
+ char buf[1];
+ char const *p;
+ int inlen;
+
+ buf[0] = (char)c;
+ p = buf;
+ inlen = 1;
+
+ in->nwc = charset_to_unicode(&p, &inlen,
+ in->wc, lenof(in->wc),
+ in->charset, &in->csstate,
+ NULL, 0);
+ assert(p == buf+1 && inlen == 0);
+
+ in->wcpos = 0;
}
- } else {
- in->pos.col = -1;
- if (c == '\n')
- in->pos.line++;
}
- /* FIXME: do input charmap translation. We should be returning
- * Unicode here. */
- return c;
+
+ return in->wc[in->wcpos++];
+
} else
return EOF;
}
int type;
int cmd, aux;
wchar_t *text;
+ char *origtext;
filepos pos;
};
enum {
c_c, /* code */
c_cfg, /* configuration directive */
c_copyright, /* copyright statement */
+ c_cq, /* quoted code (sugar for \q{\cw{x}}) */
c_cw, /* weak code */
c_date, /* document processing date */
c_dd, /* description list: description */
{"c", c_c}, /* code */
{"cfg", c_cfg}, /* configuration directive */
{"copyright", c_copyright}, /* copyright statement */
+ {"cq", c_cq}, /* quoted code (sugar for \q{\cw{x}}) */
{"cw", c_cw}, /* weak code */
{"date", c_date}, /* document processing date */
{"dd", c_dd}, /* description list: description */
token get_token(input *in) {
int c;
int nls;
+ int prevpos;
token ret;
rdstring rs = { 0, 0, NULL };
+ rdstringc rsc = { 0, 0, NULL };
filepos cpos;
ret.text = NULL; /* default */
- c = get(in, &cpos);
+ ret.origtext = NULL; /* default */
+ if (in->pushback_chars) {
+ rdaddsc(&rsc, in->pushback_chars);
+ sfree(in->pushback_chars);
+ in->pushback_chars = NULL;
+ }
+ c = get(in, &cpos, &rsc);
ret.pos = cpos;
if (iswhite(c)) { /* tok_white or tok_eop */
nls = 0;
+ prevpos = 0;
do {
if (isnl(c))
nls++;
- } while ((c = get(in, &cpos)) != EOF && iswhite(c));
+ prevpos = rsc.pos;
+ } while ((c = get(in, &cpos, &rsc)) != EOF && iswhite(c));
if (c == EOF) {
ret.type = tok_eof;
+ sfree(rsc.text);
return ret;
}
+ if (rsc.text) {
+ in->pushback_chars = dupstr(rsc.text + prevpos);
+ sfree(rsc.text);
+ }
unget(in, c, &cpos);
ret.type = (nls > 1 ? tok_eop : tok_white);
return ret;
} else if (c == EOF) { /* tok_eof */
ret.type = tok_eof;
+ sfree(rsc.text);
return ret;
} else if (c == '\\') { /* tok_cmd */
- c = get(in, &cpos);
+ rsc.pos = prevpos = 0;
+ c = get(in, &cpos, &rsc);
if (c == '-' || c == '\\' || c == '_' ||
c == '#' || c == '{' || c == '}' || c == '.') {
/* single-char command */
rdadd(&rs, c);
+ prevpos = rsc.pos;
} else if (c == 'u') {
int len = 0;
do {
rdadd(&rs, c);
len++;
- c = get(in, &cpos);
+ prevpos = rsc.pos;
+ c = get(in, &cpos, &rsc);
} while (ishex(c) && len < 5);
unget(in, c, &cpos);
} else if (iscmd(c)) {
do {
rdadd(&rs, c);
- c = get(in, &cpos);
+ prevpos = rsc.pos;
+ c = get(in, &cpos, &rsc);
} while (iscmd(c));
unget(in, c, &cpos);
}
*/
ret.type = tok_cmd;
ret.text = ustrdup(rs.text);
+ if (rsc.text) {
+ in->pushback_chars = dupstr(rsc.text + prevpos);
+ rsc.text[prevpos] = '\0';
+ ret.origtext = dupstr(rsc.text);
+ } else {
+ ret.origtext = dupstr("");
+ }
match_kw(&ret);
sfree(rs.text);
+ sfree(rsc.text);
return ret;
} else if (c == '{') { /* tok_lbrace */
ret.type = tok_lbrace;
+ sfree(rsc.text);
return ret;
} else if (c == '}') { /* tok_rbrace */
ret.type = tok_rbrace;
+ sfree(rsc.text);
return ret;
} else { /* tok_word */
/*
* a hyphen.
*/
ret.aux = FALSE; /* assumed for now */
+ prevpos = 0;
while (1) {
if (iswhite(c) || c=='{' || c=='}' || c=='\\' || c==EOF) {
/* Put back the character that caused termination */
} else {
rdadd(&rs, c);
if (c == '-') {
+ prevpos = rsc.pos;
ret.aux = TRUE;
break; /* hyphen terminates word */
}
}
- c = get(in, &cpos);
+ prevpos = rsc.pos;
+ c = get(in, &cpos, &rsc);
}
ret.type = tok_word;
ret.text = ustrdup(rs.text);
+ if (rsc.text) {
+ in->pushback_chars = dupstr(rsc.text + prevpos);
+ rsc.text[prevpos] = '\0';
+ ret.origtext = dupstr(rsc.text);
+ } else {
+ ret.origtext = dupstr("");
+ }
sfree(rs.text);
+ sfree(rsc.text);
return ret;
}
}
int c;
filepos cpos;
- c = get(in, &cpos);
+ c = get(in, &cpos, NULL);
unget(in, c, &cpos);
return (c == '{');
}
filepos cpos;
ret.type = tok_word;
- c = get(in, &cpos); /* expect (and discard) one space */
+ ret.origtext = NULL;
+ c = get(in, &cpos, NULL); /* expect (and discard) one space */
ret.pos = cpos;
if (c == ' ') {
- c = get(in, &cpos);
+ c = get(in, &cpos, NULL);
ret.pos = cpos;
}
while (!isnl(c) && c != EOF) {
int c2 = c;
- c = get(in, &cpos);
+ c = get(in, &cpos, NULL);
/* Discard \r just before \n. */
if (c2 != 13 || !isnl(c))
rdadd(&rs, c2);
word *mnewword;
if (!hptrptr)
return NULL;
- mnewword = mknew(word);
+ mnewword = snew(word);
*mnewword = newword; /* structure copy */
mnewword->next = NULL;
**hptrptr = mnewword;
* Adds a new paragraph to a linked list
*/
static paragraph *addpara(paragraph newpara, paragraph ***hptrptr) {
- paragraph *mnewpara = mknew(paragraph);
+ paragraph *mnewpara = snew(paragraph);
*mnewpara = newpara; /* structure copy */
mnewpara->next = NULL;
**hptrptr = mnewpara;
* Destructor before token is reassigned; should catch most memory
* leaks
*/
-#define dtor(t) ( sfree(t.text) )
+/* The argument is parenthesised so that dtor() also works when handed
+ * an expression such as *tokptr; note that it is evaluated twice. */
+#define dtor(t) ( sfree((t).text), sfree((t).origtext) )
/*
* Reads a single file (ie until get() returns EOF)
*/
-static void read_file(paragraph ***ret, input *in, indexdata *idx) {
+static void read_file(paragraph ***ret, input *in, indexdata *idx,
+ tree234 *macros) {
token t;
paragraph par;
word wd, **whptr, **idximplicit;
- tree234 *macros;
wchar_t utext[2], *wdtext;
int style, spcstyle;
int already;
stack_style = 2, /* \e, \c, \cw */
stack_idx = 4, /* \I, \i, \ii */
stack_hyper = 8, /* \W */
- stack_quote = 16, /* \q */
+ stack_quote = 16 /* \q */
} type;
word **whptr; /* to restore from \u alternatives */
word **idximplicit; /* to restore from \u alternatives */
+ filepos fpos;
+ int in_code;
} *sitem;
stack parsestk;
struct crossparaitem {
wchar_t uchr;
t.text = NULL;
- macros = newtree234(macrocmp);
+ t.origtext = NULL;
already = FALSE;
crossparastk = stk_new();
int start_cmd = c__invalid;
par.words = NULL;
par.keyword = NULL;
+ par.origkeyword = NULL;
whptr = &par.words;
/*
* nested lists, code paras etc). Hence, the previous
* paragraph must be of a list type.
*/
- sitem = mknew(struct crossparaitem);
+ sitem = snew(struct crossparaitem);
stop = (struct crossparaitem *)stk_top(crossparastk);
if (stop)
*sitem = *stop;
* block-quoted (typically they will be indented a
* bit).
*/
- sitem = mknew(struct crossparaitem);
+ sitem = snew(struct crossparaitem);
stop = (struct crossparaitem *)stk_top(crossparastk);
if (stop)
*sitem = *stop;
continue;
}
+ while (t.type == tok_cmd &&
+ macrolookup(macros, in, t.text, &t.pos)) {
+ dtor(t), t = get_token(in);
+ }
+
/*
* This token begins a paragraph. See if it's one of the
* special commands that define a paragraph type.
if (needkw > 0) {
rdstring rs = { 0, 0, NULL };
+ rdstringc rsc = { 0, 0, NULL };
int nkeys = 0;
filepos fp;
/* Get keywords. */
dtor(t), t = get_token(in);
fp = t.pos;
- while (t.type == tok_lbrace) {
+ while (t.type == tok_lbrace ||
+ (t.type == tok_white && (needkw & 24))) {
+ /*
+ * In paragraph types which can't accept any
+ * body text (such as \cfg), we are lenient
+ * about whitespace between keywords. This is
+ * important for \cfg in particular since it
+ * can often have many keywords which are long
+ * pieces of text, so it's useful to permit the
+ * user to wrap the line between them.
+ */
+ if (t.type == tok_white) {
+ dtor(t), t = get_token(in); /* eat the space */
+ continue;
+ }
/* This is a keyword. */
nkeys++;
/* FIXME: there will be bugs if anyone specifies an
t.type == tok_word ||
t.type == tok_white ||
(t.type == tok_cmd && t.cmd == c__nbsp) ||
- (t.type == tok_cmd && t.cmd == c__escaped)) {
+ (t.type == tok_cmd && t.cmd == c__escaped) ||
+ (t.type == tok_cmd && t.cmd == c_u)) {
if (t.type == tok_white ||
- (t.type == tok_cmd && t.cmd == c__nbsp))
+ (t.type == tok_cmd && t.cmd == c__nbsp)) {
rdadd(&rs, ' ');
- else
+ rdaddc(&rsc, ' ');
+ } else if (t.type == tok_cmd && t.cmd == c_u) {
+ rdadd(&rs, t.aux);
+ rdaddc(&rsc, '\\');
+ rdaddsc(&rsc, t.origtext);
+ } else {
rdadds(&rs, t.text);
+ rdaddsc(&rsc, t.origtext);
+ }
}
if (t.type != tok_rbrace) {
error(err_kwunclosed, &t.pos);
continue;
}
rdadd(&rs, 0); /* add string terminator */
+ rdaddc(&rsc, 0); /* add string terminator */
dtor(t), t = get_token(in); /* eat right brace */
}
- rdadd(&rs, 0); /* add string terminator */
+ rdadd(&rs, 0); /* add string terminator */
+ rdaddc(&rsc, 0); /* add string terminator */
/* See whether we have the right number of keywords. */
if ((needkw & 48) && nkeys > 0)
}
par.keyword = rdtrim(&rs);
+ par.origkeyword = rdtrimc(&rsc);
/* Move to EOP in case of needkw==8 or 16 (no body) */
if (needkw & 24) {
already = TRUE;/* inhibit get_token at top of loop */
prev_para_type = par.type;
addpara(par, ret);
+
+ if (par.type == para_Config) {
+ input_configure(in, &par);
+ }
continue; /* next paragraph */
}
}
* Mid-paragraph commands:
*
* \K \k
- * \c \cw
+ * \c \cw \cq
* \e
* \i \ii
* \I
+ * \q
* \u
* \W
* \date
case tok_lbrace:
error(err_unexbrace, &t.pos);
/* Error recovery: push nop */
- sitem = mknew(struct stack_item);
+ sitem = snew(struct stack_item);
sitem->type = stack_nop;
+ sitem->fpos = t.pos;
stk_push(parsestk, sitem);
break;
case tok_rbrace:
}
indexing = FALSE;
rdadd(&indexstr, L'\0');
- index_merge(idx, FALSE, indexstr.text, idxwordlist);
+ index_merge(idx, FALSE, indexstr.text,
+ idxwordlist, &sitem->fpos);
sfree(indexstr.text);
}
if (sitem->type & stack_hyper) {
}
break;
case c_q:
+ case c_cq:
+ type = t.cmd;
dtor(t), t = get_token(in);
if (t.type != tok_lbrace) {
error(err_explbr, &t.pos);
} else {
- wd.text = NULL;
- wd.type = toquotestyle(style);
- wd.alt = NULL;
- wd.aux = quote_Open;
- wd.fpos = t.pos;
- wd.breaks = FALSE;
- if (!indexing || index_visible)
- addword(wd, &whptr);
- if (indexing) {
- rdadd(&indexstr, L'"');
- addword(wd, &idximplicit);
+ /*
+ * Enforce that \q may not be used anywhere
+ * within \c. (It shouldn't be necessary
+ * since the whole point of \c should be
+ * that the user wants to exercise exact
+ * control over the glyphs used, and
+ * forbidding it has the useful effect of
+ * relieving some backends of having to
+ * make difficult decisions.)
+ */
+ int stype;
+
+ if (style != word_Code && style != word_WeakCode) {
+ wd.text = NULL;
+ wd.type = toquotestyle(style);
+ wd.alt = NULL;
+ wd.aux = quote_Open;
+ wd.fpos = t.pos;
+ wd.breaks = FALSE;
+ if (!indexing || index_visible)
+ addword(wd, &whptr);
+ if (indexing) {
+ rdadd(&indexstr, L'"');
+ addword(wd, &idximplicit);
+ }
+ stype = stack_quote;
+ } else {
+ error(err_codequote, &t.pos);
+ stype = stack_nop;
}
- sitem = mknew(struct stack_item);
- sitem->type = stack_quote;
+ sitem = snew(struct stack_item);
+ sitem->fpos = t.pos;
+ sitem->type = stype;
+ if (type == c_cq) {
+ if (style != word_Normal) {
+ error(err_nestedstyles, &t.pos);
+ } else {
+ style = word_WeakCode;
+ spcstyle = tospacestyle(style);
+ sitem->type |= stack_style;
+ }
+ }
stk_push(parsestk, sitem);
}
break;
* delimiting the text marked by the link.
*/
dtor(t), t = get_token(in);
+ sitem = snew(struct stack_item);
+ sitem->fpos = wd.fpos;
+ sitem->type = stack_hyper;
+ /*
+ * Special cases: \W{}\i, \W{}\ii
+ */
+ if (t.type == tok_cmd &&
+ (t.cmd == c_i || t.cmd == c_ii)) {
+ if (indexing) {
+ error(err_nestedindex, &t.pos);
+ } else {
+ /* Add an index-reference word with no
+ * text as yet */
+ wd.type = word_IndexRef;
+ wd.text = NULL;
+ wd.alt = NULL;
+ wd.aux = 0;
+ wd.breaks = FALSE;
+ indexword = addword(wd, &whptr);
+ /* Set up a rdstring to read the
+ * index text */
+ indexstr = nullrs;
+ /* Flags so that we do the Right
+ * Things with text */
+ index_visible = (type != c_I);
+ index_downcase = (type == c_ii);
+ indexing = TRUE;
+ idxwordlist = NULL;
+ idximplicit = &idxwordlist;
+
+ sitem->type |= stack_idx;
+ }
+ dtor(t), t = get_token(in);
+ }
/*
* Special cases: \W{}\c, \W{}\e, \W{}\cw
*/
- sitem = mknew(struct stack_item);
- sitem->type = stack_hyper;
if (t.type == tok_cmd &&
(t.cmd == c_e || t.cmd == c_c || t.cmd == c_cw)) {
if (style != word_Normal)
error(err_nestedstyles, &t.pos);
/* Error recovery: eat lbrace, push nop. */
dtor(t), t = get_token(in);
- sitem = mknew(struct stack_item);
+ sitem = snew(struct stack_item);
+ sitem->fpos = t.pos;
sitem->type = stack_nop;
stk_push(parsestk, sitem);
}
type == c_cw ? word_WeakCode :
word_Emph);
spcstyle = tospacestyle(style);
- sitem = mknew(struct stack_item);
+ sitem = snew(struct stack_item);
+ sitem->fpos = t.pos;
sitem->type = stack_style;
stk_push(parsestk, sitem);
}
error(err_nestedindex, &t.pos);
/* Error recovery: eat lbrace, push nop. */
dtor(t), t = get_token(in);
- sitem = mknew(struct stack_item);
+ sitem = snew(struct stack_item);
+ sitem->fpos = t.pos;
sitem->type = stack_nop;
stk_push(parsestk, sitem);
}
- sitem = mknew(struct stack_item);
+ sitem = snew(struct stack_item);
+ sitem->fpos = t.pos;
sitem->type = stack_idx;
dtor(t), t = get_token(in);
/*
* sidetrack from the main thread of the
* paragraph.
*/
- sitem = mknew(struct stack_item);
+ sitem = snew(struct stack_item);
+ sitem->fpos = t.pos;
sitem->type = stack_ualt;
sitem->whptr = whptr;
sitem->idximplicit = idximplicit;
}
stk_free(parsestk);
prev_para_type = par.type;
- addpara(par, ret);
+ /*
+ * Before we add the paragraph to the output list, we
+ * should check that there was any text in it at all; there
+ * might not be if (for example) the paragraph contained
+ * nothing but an unrecognised command sequence, and if we
+ * put an empty paragraph on the list it may confuse the
+ * back ends later on.
+ */
+ if (par.words) {
+ addpara(par, ret);
+ }
if (t.type == tok_eof)
already = TRUE;
}
* this cleanup doesn't happen.
*/
dtor(t);
- macrocleanup(macros);
stk_free(crossparastk);
}
+/*
+ * Signatures used by read_input() to recognise input files that are
+ * not Halibut source. For each entry:
+ *   magic   - byte sequence expected at the very start of the file
+ *   nmagic  - number of bytes of `magic' to compare (stated explicitly
+ *             because one signature contains NUL bytes)
+ *   binary  - TRUE if the file must be read through a binary-mode
+ *             stream (PFB and sfnt fonts), FALSE for textual formats
+ *   reader  - function called instead of read_file() on a match
+ * The table is only ever read, hence static const.
+ */
+static const struct {
+    char const *magic;
+    size_t nmagic;
+    int binary;
+    void (*reader)(input *);
+} magics[] = {
+    { "%!FontType1-", 12, FALSE, &read_pfa_file },
+    { "%!PS-AdobeFont-", 15, FALSE, &read_pfa_file },
+    { "\x80\x01", 2, TRUE, &read_pfb_file },
+    { "StartFontMetrics", 16, FALSE, &read_afm_file },
+    { "\x00\x01\x00\x00", 4, TRUE, &read_sfnt_file },
+    { "true", 4, TRUE, &read_sfnt_file },
+};
+
+/*
+ * Read every file named in in->filenames, starting at in->currindex,
+ * and return the head of the resulting paragraph list.
+ *
+ * Each file has its first bytes compared against the magics[] table.
+ * On a match the corresponding reader is called with `in'; otherwise
+ * the file is handed to read_file(), which appends paragraphs through
+ * `hptr'. A single macro table is created up front, shared by all
+ * files passed to read_file(), and cleaned up before returning.
+ *
+ * Files that cannot be opened are skipped without any report here.
+ * NOTE(review): consider reporting open failures to the user.
+ */
paragraph *read_input(input *in, indexdata *idx) {
paragraph *head = NULL;
paragraph **hptr = &head;
+    tree234 *macros;
+    char mag[16];
+    size_t len, i;
+    int binary;
+    void (*reader)(input *);
+
+    macros = newtree234(macrocmp);
while (in->currindex < in->nfiles) {
- in->currfp = fopen(in->filenames[in->currindex], "r");
+        /*
+         * Open in binary mode to look for magic numbers: a text-mode
+         * stream may translate line endings or stop at a 0x1A byte on
+         * some platforms, which would corrupt binary font data.
+         */
+        in->currfp = fopen(in->filenames[in->currindex], "rb");
if (in->currfp) {
setpos(in, in->filenames[in->currindex]);
- read_file(&hptr, in, idx);
+            in->charset = in->defcharset;
+            in->csstate = charset_init_state;
+            in->wcpos = in->nwc = 0;
+            in->pushback_chars = NULL;
+            reader = NULL;
+            binary = FALSE;            /* Halibut source is text */
+            len = fread(mag, 1, sizeof(mag), in->currfp);
+            for (i = 0; i < lenof(magics); i++) {
+                if (len >= magics[i].nmagic &&
+                    memcmp(mag, magics[i].magic, magics[i].nmagic) == 0) {
+                    reader = magics[i].reader;
+                    binary = magics[i].binary;
+                    break;
+                }
+            }
+            if (binary) {
+                rewind(in->currfp);
+            } else {
+                /*
+                 * Textual input: switch the stream to text mode. On
+                 * failure freopen() closes the stream and returns
+                 * NULL, in which case the file is skipped just like
+                 * one that could not be opened at all.
+                 */
+                in->currfp = freopen(in->filenames[in->currindex], "r",
+                                     in->currfp);
+            }
+            if (in->currfp) {
+                if (reader == NULL)
+                    read_file(&hptr, in, idx, macros);
+                else
+                    (*reader)(in);
+            }
}
in->currindex++;
}
+    macrocleanup(macros);
+
return head;
}