2 * This file is part of DisOrder
3 * Copyright (C) 2007 Richard Kettlewell
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
21 * @brief Noddy HTML parser
39 /** @brief Entity table type */
45 /** @brief Known entities
47 * We only support the entities that turn up in the HTML files we
48 * actually care about.
50 * Keep in alphabetical order.
52 static const struct entity entities
[] = {
58 /** @brief Skip whitespace */
59 static const char *skipwhite(const char *input
) {
60 while(*input
&& isspace((unsigned char)*input
))
65 /** @brief Parse an entity */
66 static const char *parse_entity(const char *input
,
68 input
= skipwhite(input
);
70 input
= skipwhite(input
+ 1);
72 *entityp
= strtoul(skipwhite(input
+ 1), (char **)&input
, 16);
74 *entityp
= strtoul(input
, (char **)&input
, 10);
76 struct dynstr name
[1];
80 while(isalnum((unsigned char)*input
))
81 dynstr_append(name
, tolower((unsigned char)*input
++));
82 dynstr_terminate(name
);
83 if((n
= TABLE_FIND(entities
, struct entity
, name
, name
->vec
)) < 0) {
84 error(0, "unknown entity '%s'", name
->vec
);
87 *entityp
= entities
[n
].value
;
89 input
= skipwhite(input
);
95 /** @brief Parse one character or entity and append it to a @ref dynstr */
96 static const char *parse_one(const char *input
, struct dynstr
*d
) {
99 input
= parse_entity(input
+ 1, &c
);
100 if(one_ucs42utf8(c
, d
))
101 dynstr_append(d
, '?'); /* U+FFFD might be a better choice */
103 dynstr_append(d
, *input
++);
107 /** @brief Too-stupid-to-live HTML parser
108 * @param callbacks Parser callbacks
109 * @param input HTML document
110 * @param u User data pointer
111 * @return 0 on success, -1 on error
113 int html_parse(const struct html_parser_callbacks
*callbacks
,
116 struct dynstr text
[1];
121 struct dynstr tag
[1];
124 /* flush collected text */
126 dynstr_terminate(text
);
127 callbacks
->text(text
->vec
, u
);
131 input
= skipwhite(input
+ 1);
132 /* see if it's an open or close tag */
134 input
= skipwhite(input
+ 1);
137 attrs
= hash_new(sizeof(char *));
139 while(isalnum((unsigned char)*input
))
140 dynstr_append(tag
, tolower((unsigned char)*input
++));
141 dynstr_terminate(tag
);
142 input
= skipwhite(input
);
144 /* gather attributes */
145 while(*input
&& *input
!= '>') {
146 struct dynstr name
[1], value
[1];
151 while(isalnum((unsigned char)*input
))
152 dynstr_append(name
, tolower((unsigned char)*input
++));
153 dynstr_terminate(name
);
154 input
= skipwhite(input
);
156 /* attribute value */
157 input
= skipwhite(input
+ 1);
158 if(*input
== '"' || *input
== '\'') {
160 const int q
= *input
++;
161 while(*input
&& *input
!= q
)
162 input
= parse_one(input
, value
);
167 while(*input
&& *input
!= '>' && !isspace((unsigned char)*input
))
168 input
= parse_one(input
, value
);
170 dynstr_terminate(value
);
172 /* stash the value */
173 hash_add(attrs
, name
->vec
, value
->vec
, HASH_INSERT_OR_REPLACE
);
174 input
= skipwhite(input
);
178 error(0, "unterminated tag %s", tag
->vec
);
183 callbacks
->open(tag
->vec
, attrs
, u
);
185 callbacks
->close(tag
->vec
, u
);
187 input
= parse_one(input
, text
);
189 /* flush any trailing text */
191 dynstr_terminate(text
);
192 callbacks
->text(text
->vec
, u
);