Use memcpy helpers and FILLZERO
[secnet] / netlink.c
1 /* User-kernel network link */
2
3 /* See RFCs 791, 792, 1123 and 1812 */
4
5 /* The netlink device is actually a router. Tunnels are unnumbered
6 point-to-point lines (RFC1812 section 2.2.7); the router has a
7 single address (the 'router-id'). */
8
9 /* This is where we currently have the anti-spoofing paranoia - before
10 sending a packet to the kernel we check that the tunnel it came
11 over could reasonably have produced it. */
12
13
14 /* Points to note from RFC1812 (which may require changes in this
15 file):
16
17 3.3.4 Maximum Transmission Unit - MTU
18
19 The MTU of each logical interface MUST be configurable within the
20 range of legal MTUs for the interface.
21
22 Many Link Layer protocols define a maximum frame size that may be
23 sent. In such cases, a router MUST NOT allow an MTU to be set which
24 would allow sending of frames larger than those allowed by the Link
25 Layer protocol. However, a router SHOULD be willing to receive a
26 packet as large as the maximum frame size even if that is larger than
27 the MTU.
28
29 4.2.1 A router SHOULD count datagrams discarded.
30
31 4.2.2.1 Source route options - we probably should implement processing
32 of source routes, even though mostly the security policy will prevent
33 their use.
34
35 5.3.13.4 Source Route Options
36
37 A router MUST implement support for source route options in forwarded
38 packets. A router MAY implement a configuration option that, when
39 enabled, causes all source-routed packets to be discarded. However,
40 such an option MUST NOT be enabled by default.
41
42 5.3.13.5 Record Route Option
43
44 Routers MUST support the Record Route option in forwarded packets.
45
46 A router MAY provide a configuration option that, if enabled, will
47 cause the router to ignore (i.e., pass through unchanged) Record
48 Route options in forwarded packets. If provided, such an option MUST
49 default to enabling the record-route. This option should not affect
50 the processing of Record Route options in datagrams received by the
51 router itself (in particular, Record Route options in ICMP echo
52 requests will still be processed according to Section [4.3.3.6]).
53
54 5.3.13.6 Timestamp Option
55
56 Routers MUST support the timestamp option in forwarded packets. A
57 timestamp value MUST follow the rules given [INTRO:2].
58
59 If the flags field = 3 (timestamp and prespecified address), the
60 router MUST add its timestamp if the next prespecified address
61 matches any of the router's IP addresses. It is not necessary that
62 the prespecified address be either the address of the interface on
63 which the packet arrived or the address of the interface over which
64 it will be sent.
65
66
67 4.2.2.7 Fragmentation: RFC 791 Section 3.2
68
69 Fragmentation, as described in [INTERNET:1], MUST be supported by a
70 router.
71
72 4.2.2.8 Reassembly: RFC 791 Section 3.2
73
74 As specified in the corresponding section of [INTRO:2], a router MUST
75 support reassembly of datagrams that it delivers to itself.
76
77 4.2.2.9 Time to Live: RFC 791 Section 3.2
78
79 Note in particular that a router MUST NOT check the TTL of a packet
80 except when forwarding it.
81
82 A router MUST NOT discard a datagram just because it was received
83 with TTL equal to zero or one; if it is to the router and otherwise
84 valid, the router MUST attempt to receive it.
85
86 On messages the router originates, the IP layer MUST provide a means
87 for the transport layer to set the TTL field of every datagram that
88 is sent. When a fixed TTL value is used, it MUST be configurable.
89
90
91 8.1 The Simple Network Management Protocol - SNMP
92 8.1.1 SNMP Protocol Elements
93
94 Routers MUST be manageable by SNMP [MGT:3]. The SNMP MUST operate
95 using UDP/IP as its transport and network protocols.
96
97
98 */
99
100 #include <string.h>
101 #include <assert.h>
102 #include <limits.h>
103 #include "secnet.h"
104 #include "util.h"
105 #include "ipaddr.h"
106 #include "netlink.h"
107 #include "process.h"
108
/* MDEBUG(fmt, ...): per-packet trace output.  It is a Message() call at
 * M_DEBUG level when NETLINK_DEBUG is defined; otherwise it is a no-op
 * expression and its arguments are not evaluated. */
#ifndef NETLINK_DEBUG
#define MDEBUG(...) ((void)0)
#else /* NETLINK_DEBUG */
#define MDEBUG(...) Message(M_DEBUG, __VA_ARGS__)
#endif /* NETLINK_DEBUG */
114
/* ICMP message types used in this file */
#define ICMP_TYPE_ECHO_REPLY 0
#define ICMP_TYPE_UNREACHABLE 3
#define ICMP_TYPE_ECHO_REQUEST 8
#define ICMP_TYPE_TIME_EXCEEDED 11

/* Codes for ICMP_TYPE_UNREACHABLE */
#define ICMP_CODE_NET_UNREACHABLE 0
#define ICMP_CODE_PROTOCOL_UNREACHABLE 2
#define ICMP_CODE_FRAGMENTATION_REQUIRED 4
#define ICMP_CODE_NET_PROHIBITED 13

/* Codes for ICMP_TYPE_TIME_EXCEEDED */
#define ICMP_CODE_TTL_EXCEEDED 0
127
/*
 * Generic IP checksum routine (the Internet checksum of RFC 1071).
 *
 * iph:   start of the data to be summed
 * count: number of bytes to sum
 *
 * The data is summed as a sequence of big-endian 16-bit words.  If
 * count is odd, the last byte is taken as the high-order byte of a
 * final word whose low-order byte is zero.  The one's complement of
 * the folded sum is returned in network byte order, so the result can
 * be stored straight into a header field.  Summing data that already
 * contains its correct checksum gives 0.  A count of zero or less sums
 * nothing and gives 0xffff.
 */
static inline uint16_t ip_csum(const uint8_t *iph,int32_t count)
{
    uint32_t sum=0;
    uint8_t wire[2];
    uint16_t result;

    /* Build each word from its two bytes rather than loading a
       uint16_t through a cast pointer: the data need not be aligned. */
    while (count>1) {
        sum+=((uint32_t)iph[0]<<8)|iph[1];
        iph+=2;
        count-=2;
    }
    /* Odd trailing byte: padded on the right with a zero octet */
    if (count>0)
        sum+=(uint32_t)iph[0]<<8;
    /* Fold the carries back in (one's complement addition) */
    while (sum>>16)
        sum=(sum&0xffff)+(sum>>16);
    sum=~sum&0xffff;
    /* Lay the result out most significant byte first */
    wire[0]=(uint8_t)(sum>>8);
    wire[1]=(uint8_t)(sum&0xff);
    memcpy(&result,wire,sizeof(result));
    return result;
}
144
/*
 * ip_fast_csum(iph, ihl): checksum of an IP header that is ihl 32-bit
 * words long.  Two implementations are provided: a portable one built
 * on ip_csum(), and hand-written assembler used when i386 is defined.
 */
#ifndef i386
static inline uint16_t ip_fast_csum(const uint8_t *iph, int32_t ihl)
{
    int32_t header_bytes;

    /* ihl*4 must not overflow */
    assert(ihl < INT_MAX/4);
    header_bytes=ihl*4;
    return ip_csum(iph,header_bytes);
}
#else /* i386 */
/*
 * This is a version of ip_compute_csum() optimized for IP headers,
 * which always checksum on 4 octet boundaries.
 *
 * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
 * Arnt Gulbrandsen.
 */
static inline uint16_t ip_fast_csum(const uint8_t *iph, int32_t ihl) {
    uint32_t sum;

    __asm__ __volatile__(
    "movl (%1), %0 ;\n"
    "subl $4, %2 ;\n"
    "jbe 2f ;\n"
    "addl 4(%1), %0 ;\n"
    "adcl 8(%1), %0 ;\n"
    "adcl 12(%1), %0 ;\n"
    "1: adcl 16(%1), %0 ;\n"
    "lea 4(%1), %1 ;\n"
    "decl %2 ;\n"
    "jne 1b ;\n"
    "adcl $0, %0 ;\n"
    "movl %0, %2 ;\n"
    "shrl $16, %0 ;\n"
    "addw %w2, %w0 ;\n"
    "adcl $0, %0 ;\n"
    "notl %0 ;\n"
    "2: ;\n"
    /* The input registers loaded with iph and ihl are modified by
       the code above, so they are also listed as outputs; otherwise
       gcc would assume they still hold their original values. */
    : "=r" (sum), "=r" (iph), "=r" (ihl)
    : "1" (iph), "2" (ihl)
    : "memory");
    return sum;
}
#endif /* i386 */
189
/* Fixed part of an IPv4 header (RFC 791).  The code in this file
 * converts the multi-byte fields with ntohs/ntohl/htons/htonl, i.e.
 * they are held in network byte order. */
struct iphdr {
    /* The first octet holds version and header length; the bitfield
       order depends on the host's endianness. */
#if !defined (WORDS_BIGENDIAN)
    uint8_t ihl:4;          /* header length in 32-bit words */
    uint8_t version:4;
#else
    uint8_t version:4;
    uint8_t ihl:4;          /* header length in 32-bit words */
#endif
    uint8_t tos;
    uint16_t tot_len;       /* length of header plus payload, in bytes */
    uint16_t id;
    uint16_t frag;          /* flag bits and fragment offset, masks below */
#define IPHDR_FRAG_OFF  ((uint16_t)0x1fff)  /* offset, in units of 8 bytes */
#define IPHDR_FRAG_MORE ((uint16_t)0x2000)  /* more fragments follow */
#define IPHDR_FRAG_DONT ((uint16_t)0x4000)  /* don't fragment */
                     /* reserved 0x8000 */
    uint8_t ttl;
    uint8_t protocol;
    uint16_t check;         /* header checksum */
    uint32_t saddr;
    uint32_t daddr;
    /* The options start here. */
};
213
214 struct icmphdr {
215 struct iphdr iph;
216 uint8_t type;
217 uint8_t code;
218 uint16_t check;
219 union icmpinfofield {
220 uint32_t unused;
221 struct {
222 uint8_t pointer;
223 uint8_t unused1;
224 uint16_t unused2;
225 } pprob;
226 uint32_t gwaddr;
227 struct {
228 uint16_t id;
229 uint16_t seq;
230 } echo;
231 struct {
232 uint16_t unused;
233 uint16_t mtu;
234 } fragneeded;
235 } d;
236 };
237
238 static const union icmpinfofield icmp_noinfo;
239
/* Forward declarations: the ICMP generator below calls the delivery
 * routines, which are defined further down (and call it in turn). */
static void netlink_client_deliver(struct netlink *st,
                                   struct netlink_client *client,
                                   uint32_t source,
                                   uint32_t dest,
                                   struct buffer_if *buf);

static void netlink_host_deliver(struct netlink *st,
                                 struct netlink_client *sender,
                                 uint32_t source,
                                 uint32_t dest,
                                 struct buffer_if *buf);
248
249 static const char *sender_name(struct netlink_client *sender /* or NULL */)
250 {
251 return sender?sender->name:"(local)";
252 }
253
/* Forward declaration; defined after the ICMP helpers that use it. */
static void netlink_packet_deliver(struct netlink *st,
                                   struct netlink_client *client,
                                   struct buffer_if *buf);
257
258 /* XXX RFC1812 4.3.2.5:
259 All other ICMP error messages (Destination Unreachable,
260 Redirect, Time Exceeded, and Parameter Problem) SHOULD have their
261 precedence value set to 6 (INTERNETWORK CONTROL) or 7 (NETWORK
262 CONTROL). The IP Precedence value for these error messages MAY be
263 settable.
264 */
265 static struct icmphdr *netlink_icmp_tmpl(struct netlink *st,
266 uint32_t source, uint32_t dest,
267 uint16_t len)
268 {
269 struct icmphdr *h;
270
271 BUF_ALLOC(&st->icmp,"netlink_icmp_tmpl");
272 buffer_init(&st->icmp,calculate_max_start_pad());
273 h=buf_append(&st->icmp,sizeof(*h));
274
275 h->iph.version=4;
276 h->iph.ihl=5;
277 h->iph.tos=0;
278 h->iph.tot_len=htons(len+(h->iph.ihl*4)+8);
279 h->iph.id=0;
280 h->iph.frag=0;
281 h->iph.ttl=255; /* XXX should be configurable */
282 h->iph.protocol=1;
283 h->iph.saddr=htonl(source);
284 h->iph.daddr=htonl(dest);
285 h->iph.check=0;
286 h->iph.check=ip_fast_csum((uint8_t *)&h->iph,h->iph.ihl);
287 h->check=0;
288 h->d.unused=0;
289
290 return h;
291 }
292
293 /* Fill in the ICMP checksum field correctly */
294 static void netlink_icmp_csum(struct icmphdr *h)
295 {
296 int32_t len;
297
298 len=ntohs(h->iph.tot_len)-(4*h->iph.ihl);
299 h->check=0;
300 h->check=ip_csum(&h->type,len);
301 }
302
303 /* RFC1122:
304 * An ICMP error message MUST NOT be sent as the result of
305 * receiving:
306 *
307 * * an ICMP error message, or
308 *
309 * * a datagram destined to an IP broadcast or IP multicast
310 * address, or
311 *
312 * * a datagram sent as a link-layer broadcast, or
313 *
314 * * a non-initial fragment, or
315 *
316 * * a datagram whose source address does not define a single
317 * host -- e.g., a zero address, a loopback address, a
318 * broadcast address, a multicast address, or a Class E
319 * address.
320 */
321 static bool_t netlink_icmp_may_reply(struct buffer_if *buf)
322 {
323 struct iphdr *iph;
324 struct icmphdr *icmph;
325 uint32_t source;
326
327 if (buf->size < (int)sizeof(struct icmphdr)) return False;
328 iph=(struct iphdr *)buf->start;
329 icmph=(struct icmphdr *)buf->start;
330 if (iph->protocol==1) {
331 switch(icmph->type) {
332 /* Based on http://www.iana.org/assignments/icmp-parameters/icmp-parameters.xhtml#icmp-parameters-types
333 * as retrieved Thu, 20 Mar 2014 00:16:44 +0000.
334 * Deprecated, reserved, unassigned and experimental
335 * options are treated as not safe to reply to.
336 */
337 case 0: /* Echo Reply */
338 case 8: /* Echo */
339 case 13: /* Timestamp */
340 case 14: /* Timestamp Reply */
341 return True;
342 default:
343 return False;
344 }
345 }
346 /* How do we spot broadcast destination addresses? */
347 if (ntohs(iph->frag)&IPHDR_FRAG_OFF) return False;
348 source=ntohl(iph->saddr);
349 if (source==0) return False;
350 if ((source&0xff000000)==0x7f000000) return False;
351 /* How do we spot broadcast source addresses? */
352 if ((source&0xf0000000)==0xe0000000) return False; /* Multicast */
353 if ((source&0xf0000000)==0xf0000000) return False; /* Class E */
354 return True;
355 }
356
357 /* How much of the original IP packet do we include in its ICMP
358 response? The header plus up to 64 bits. */
359
360 /* XXX TODO RFC1812:
361 4.3.2.3 Original Message Header
362
363 Historically, every ICMP error message has included the Internet
364 header and at least the first 8 data bytes of the datagram that
365 triggered the error. This is no longer adequate, due to the use of
366 IP-in-IP tunneling and other technologies. Therefore, the ICMP
367 datagram SHOULD contain as much of the original datagram as possible
368 without the length of the ICMP datagram exceeding 576 bytes. The
369 returned IP header (and user data) MUST be identical to that which
370 was received, except that the router is not required to undo any
371 modifications to the IP header that are normally performed in
372 forwarding that were performed before the error was detected (e.g.,
373 decrementing the TTL, or updating options). Note that the
374 requirements of Section [4.3.3.5] supersede this requirement in some
375 cases (i.e., for a Parameter Problem message, if the problem is in a
376 modified field, the router must undo the modification). See Section
377 [4.3.3.5]).
378 */
379 static uint16_t netlink_icmp_reply_len(struct buffer_if *buf)
380 {
381 if (buf->size < (int)sizeof(struct iphdr)) return 0;
382 struct iphdr *iph=(struct iphdr *)buf->start;
383 uint16_t hlen,plen;
384
385 hlen=iph->ihl*4;
386 /* We include the first 8 bytes of the packet data, provided they exist */
387 hlen+=8;
388 plen=ntohs(iph->tot_len);
389 return (hlen>plen?plen:hlen);
390 }
391
392 /* client indicates where the packet we're constructing a response to
393 comes from. NULL indicates the host. */
394 static void netlink_icmp_simple(struct netlink *st,
395 struct netlink_client *origsender,
396 struct buffer_if *buf,
397 uint8_t type, uint8_t code,
398 union icmpinfofield info)
399 {
400 struct icmphdr *h;
401 uint16_t len;
402
403 if (netlink_icmp_may_reply(buf)) {
404 struct iphdr *iph=(struct iphdr *)buf->start;
405
406 uint32_t icmpdest = ntohl(iph->saddr);
407 uint32_t icmpsource;
408 const char *icmpsourcedebugprefix;
409 if (!st->ptp) {
410 icmpsource=st->secnet_address;
411 icmpsourcedebugprefix="";
412 } else if (origsender) {
413 /* was from peer, send reply as if from host */
414 icmpsource=st->local_address;
415 icmpsourcedebugprefix="L!";
416 } else {
417 /* was from host, send reply as if from peer */
418 icmpsource=st->secnet_address; /* actually, peer address */
419 icmpsourcedebugprefix="P!";
420 }
421 MDEBUG("%s: generating ICMP re %s[%s]->[%s]:"
422 " from %s%s type=%u code=%u\n",
423 st->name, sender_name(origsender),
424 ipaddr_to_string(ntohl(iph->saddr)),
425 ipaddr_to_string(ntohl(iph->daddr)),
426 icmpsourcedebugprefix,
427 ipaddr_to_string(icmpsource),
428 type, code);
429
430 len=netlink_icmp_reply_len(buf);
431 h=netlink_icmp_tmpl(st,icmpsource,icmpdest,len);
432 h->type=type; h->code=code; h->d=info;
433 BUF_ADD_BYTES(append,&st->icmp,buf->start,len);
434 netlink_icmp_csum(h);
435
436 if (!st->ptp) {
437 netlink_packet_deliver(st,NULL,&st->icmp);
438 } else if (origsender) {
439 netlink_client_deliver(st,origsender,icmpsource,icmpdest,&st->icmp);
440 } else {
441 netlink_host_deliver(st,NULL,icmpsource,icmpdest,&st->icmp);
442 }
443 BUF_ASSERT_FREE(&st->icmp);
444 }
445 }
446
447 /*
448 * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the
449 * checksum.
450 * RFC1812: 4.2.2.5 MUST discard messages containing invalid checksums.
451 *
452 * Is the datagram acceptable?
453 *
454 * 1. Length at least the size of an ip header
455 * 2. Version of 4
456 * 3. Checksums correctly.
457 * 4. Doesn't have a bogus length
458 */
459 static bool_t netlink_check(struct netlink *st, struct buffer_if *buf,
460 char *errmsgbuf, int errmsgbuflen)
461 {
462 #define BAD(...) do{ \
463 snprintf(errmsgbuf,errmsgbuflen,__VA_ARGS__); \
464 return False; \
465 }while(0)
466
467 if (buf->size < (int)sizeof(struct iphdr)) BAD("len %"PRIu32"",buf->size);
468 struct iphdr *iph=(struct iphdr *)buf->start;
469 int32_t len;
470
471 if (iph->ihl < 5) BAD("ihl %u",iph->ihl);
472 if (iph->version != 4) BAD("version %u",iph->version);
473 if (buf->size < iph->ihl*4) BAD("size %"PRId32"<%u*4",buf->size,iph->ihl);
474 if (ip_fast_csum((uint8_t *)iph, iph->ihl)!=0) BAD("csum");
475 len=ntohs(iph->tot_len);
476 /* There should be no padding */
477 if (buf->size!=len) BAD("len %"PRId32"!=%"PRId32,buf->size,len);
478 if (len<(iph->ihl<<2)) BAD("len %"PRId32"<(%u<<2)",len,iph->ihl);
479 /* XXX check that there's no source route specified */
480 return True;
481
482 #undef BAD
483 }
484
485 static const char *fragment_filter_header(uint8_t *base, long *hlp)
486 {
487 const int fixedhl = sizeof(struct iphdr);
488 long hl = *hlp;
489 const uint8_t *ipend = base + hl;
490 uint8_t *op = base + fixedhl;
491 const uint8_t *ip = op;
492
493 while (ip < ipend) {
494 uint8_t opt = ip[0];
495 int remain = ipend - ip;
496 if (opt == 0x00) /* End of Options List */ break;
497 if (opt == 0x01) /* No Operation */ continue;
498 if (remain < 2) return "IPv4 options truncated at length";
499 int optlen = ip[1];
500 if (remain < optlen) return "IPv4 options truncated in option";
501 if (opt & 0x80) /* copy */ {
502 memmove(op, ip, optlen);
503 op += optlen;
504 }
505 ip += optlen;
506 }
507 while ((hl = (op - base)) & 0x3)
508 *op++ = 0x00 /* End of Option List */;
509 ((struct iphdr*)base)->ihl = hl >> 2;
510 *hlp = hl;
511
512 return 0;
513 }
514
515 /* Fragment or send ICMP Fragmentation Needed */
516 static void netlink_maybe_fragment(struct netlink *st,
517 struct netlink_client *sender,
518 netlink_deliver_fn *deliver,
519 void *deliver_dst,
520 const char *delivery_name,
521 int32_t mtu,
522 uint32_t source, uint32_t dest,
523 struct buffer_if *buf)
524 {
525 struct iphdr *iph=(struct iphdr*)buf->start;
526 long hl = iph->ihl*4;
527 const char *ssource = ipaddr_to_string(source);
528
529 if (buf->size <= mtu) {
530 deliver(deliver_dst, buf);
531 return;
532 }
533
534 MDEBUG("%s: fragmenting %s->%s org.size=%"PRId32"\n",
535 st->name, ssource, delivery_name, buf->size);
536
537 #define BADFRAG(m, ...) \
538 Message(M_WARNING, \
539 "%s: fragmenting packet from source %s" \
540 " for transmission via %s: " m "\n", \
541 st->name, ssource, delivery_name, \
542 ## __VA_ARGS__);
543
544 unsigned orig_frag = ntohs(iph->frag);
545
546 if (orig_frag&IPHDR_FRAG_DONT) {
547 union icmpinfofield info =
548 { .fragneeded = { .unused = 0, .mtu = htons(mtu) } };
549 netlink_icmp_simple(st,sender,buf,
550 ICMP_TYPE_UNREACHABLE,
551 ICMP_CODE_FRAGMENTATION_REQUIRED,
552 info);
553 BUF_FREE(buf);
554 return;
555 }
556 if (mtu < hl + 8) {
557 BADFRAG("mtu %"PRId32" too small", mtu);
558 BUF_FREE(buf);
559 return;
560 }
561
562 /* we (ab)use the icmp buffer to stash the original packet */
563 struct buffer_if *orig = &st->icmp;
564 BUF_ALLOC(orig,"netlink_client_deliver fragment orig");
565 buffer_copy(orig,buf);
566 BUF_FREE(buf);
567
568 const uint8_t *startindata = orig->start + hl;
569 const uint8_t *indata = startindata;
570 const uint8_t *endindata = orig->start + orig->size;
571 _Bool filtered = 0;
572
573 for (;;) {
574 /* compute our fragment offset */
575 long dataoffset = indata - startindata
576 + (orig_frag & IPHDR_FRAG_OFF)*8;
577 assert(!(dataoffset & 7));
578 if (dataoffset > IPHDR_FRAG_OFF*8) {
579 BADFRAG("ultimate fragment offset out of range");
580 break;
581 }
582
583 BUF_ALLOC(buf,"netlink_client_deliver fragment frag");
584 buffer_init(buf,calculate_max_start_pad());
585
586 /* copy header (possibly filtered); will adjust in a bit */
587 struct iphdr *fragh = buf_append(buf, hl);
588 memcpy(fragh, orig->start, hl);
589
590 /* decide how much payload to copy and copy it */
591 long avail = mtu - hl;
592 long remain = endindata - indata;
593 long use = avail < remain ? (avail & ~(long)7) : remain;
594 BUF_ADD_BYTES(append, buf, indata, use);
595 indata += use;
596
597 _Bool last_frag = indata >= endindata;
598
599 /* adjust the header */
600 fragh->tot_len = htons(buf->size);
601 fragh->frag =
602 htons((orig_frag & ~IPHDR_FRAG_OFF) |
603 (last_frag ? 0 : IPHDR_FRAG_MORE) |
604 (dataoffset >> 3));
605 fragh->check = 0;
606 fragh->check = ip_fast_csum((const void*)fragh, fragh->ihl);
607
608 /* actually send it */
609 deliver(deliver_dst, buf);
610 if (last_frag)
611 break;
612
613 /* after copying the header for the first frag,
614 * we filter the header for the remaining frags */
615 if (!filtered++) {
616 const char *bad = fragment_filter_header(orig->start, &hl);
617 if (bad) { BADFRAG("%s", bad); break; }
618 }
619 }
620
621 BUF_FREE(orig);
622
623 #undef BADFRAG
624 }
625
626 /* Deliver a packet _to_ client; used after we have decided
627 * what to do with it (and just to check that the client has
628 * actually registered a delivery function with us). */
629 static void netlink_client_deliver(struct netlink *st,
630 struct netlink_client *client,
631 uint32_t source, uint32_t dest,
632 struct buffer_if *buf)
633 {
634 if (!client->deliver) {
635 string_t s,d;
636 s=ipaddr_to_string(source);
637 d=ipaddr_to_string(dest);
638 Message(M_ERR,"%s: dropping %s->%s, client not registered\n",
639 st->name,s,d);
640 BUF_FREE(buf);
641 return;
642 }
643 netlink_maybe_fragment(st,NULL, client->deliver,client->dst,client->name,
644 client->mtu, source,dest,buf);
645 client->outcount++;
646 }
647
648 /* Deliver a packet to the host; used after we have decided that that
649 * is what to do with it. */
650 static void netlink_host_deliver(struct netlink *st,
651 struct netlink_client *sender,
652 uint32_t source, uint32_t dest,
653 struct buffer_if *buf)
654 {
655 netlink_maybe_fragment(st,sender, st->deliver_to_host,st->dst,"(host)",
656 st->mtu, source,dest,buf);
657 st->outcount++;
658 }
659
660 /* Deliver a packet. "sender"==NULL for packets from the host and packets
661 generated internally in secnet. */
662 static void netlink_packet_deliver(struct netlink *st,
663 struct netlink_client *sender,
664 struct buffer_if *buf)
665 {
666 if (buf->size < (int)sizeof(struct iphdr)) {
667 Message(M_ERR,"%s: trying to deliver a too-short packet"
668 " from %s!\n",st->name, sender_name(sender));
669 BUF_FREE(buf);
670 return;
671 }
672
673 struct iphdr *iph=(struct iphdr *)buf->start;
674 uint32_t dest=ntohl(iph->daddr);
675 uint32_t source=ntohl(iph->saddr);
676 uint32_t best_quality;
677 bool_t allow_route=False;
678 bool_t found_allowed=False;
679 int best_match;
680 int i;
681
682 BUF_ASSERT_USED(buf);
683
684 if (dest==st->secnet_address) {
685 Message(M_ERR,"%s: trying to deliver a packet to myself!\n",st->name);
686 BUF_FREE(buf);
687 return;
688 }
689
690 /* Packets from the host (sender==NULL) may always be routed. Packets
691 from clients with the allow_route option will also be routed. */
692 if (!sender || (sender && (sender->options & OPT_ALLOWROUTE)))
693 allow_route=True;
694
695 /* If !allow_route, we check the routing table anyway, and if
696 there's a suitable route with OPT_ALLOWROUTE set we use it. If
697 there's a suitable route, but none with OPT_ALLOWROUTE set then
698 we generate ICMP 'communication with destination network
699 administratively prohibited'. */
700
701 best_quality=0;
702 best_match=-1;
703 for (i=0; i<st->n_clients; i++) {
704 if (st->routes[i]->up &&
705 ipset_contains_addr(st->routes[i]->networks,dest)) {
706 /* It's an available route to the correct destination. But is
707 it better than the one we already have? */
708
709 /* If we have already found an allowed route then we don't
710 bother looking at routes we're not allowed to use. If
711 we don't yet have an allowed route we'll consider any. */
712 if (!allow_route && found_allowed) {
713 if (!(st->routes[i]->options&OPT_ALLOWROUTE)) continue;
714 }
715
716 if (st->routes[i]->link_quality>best_quality
717 || best_quality==0) {
718 best_quality=st->routes[i]->link_quality;
719 best_match=i;
720 if (st->routes[i]->options&OPT_ALLOWROUTE)
721 found_allowed=True;
722 /* If quality isn't perfect we may wish to
723 consider kicking the tunnel with a 0-length
724 packet to prompt it to perform a key setup.
725 Then it'll eventually decide it's up or
726 down. */
727 /* If quality is perfect and we're allowed to use the
728 route we don't need to search any more. */
729 if (best_quality>=MAXIMUM_LINK_QUALITY &&
730 (allow_route || found_allowed)) break;
731 }
732 }
733 }
734 if (best_match==-1) {
735 /* The packet's not going down a tunnel. It might (ought to)
736 be for the host. */
737 if (ipset_contains_addr(st->networks,dest)) {
738 netlink_host_deliver(st,sender,source,dest,buf);
739 BUF_ASSERT_FREE(buf);
740 } else {
741 string_t s,d;
742 s=ipaddr_to_string(source);
743 d=ipaddr_to_string(dest);
744 Message(M_DEBUG,"%s: don't know where to deliver packet "
745 "(s=%s, d=%s)\n", st->name, s, d);
746 netlink_icmp_simple(st,sender,buf,ICMP_TYPE_UNREACHABLE,
747 ICMP_CODE_NET_UNREACHABLE, icmp_noinfo);
748 BUF_FREE(buf);
749 }
750 } else {
751 if (!allow_route &&
752 !(st->routes[best_match]->options&OPT_ALLOWROUTE)) {
753 string_t s,d;
754 s=ipaddr_to_string(source);
755 d=ipaddr_to_string(dest);
756 /* We have a usable route but aren't allowed to use it.
757 Generate ICMP destination unreachable: communication
758 with destination network administratively prohibited */
759 Message(M_NOTICE,"%s: denied forwarding for packet (s=%s, d=%s)\n",
760 st->name,s,d);
761
762 netlink_icmp_simple(st,sender,buf,ICMP_TYPE_UNREACHABLE,
763 ICMP_CODE_NET_PROHIBITED, icmp_noinfo);
764 BUF_FREE(buf);
765 } else {
766 if (best_quality>0) {
767 netlink_client_deliver(st,st->routes[best_match],
768 source,dest,buf);
769 BUF_ASSERT_FREE(buf);
770 } else {
771 /* Generate ICMP destination unreachable */
772 netlink_icmp_simple(st,sender,buf,
773 ICMP_TYPE_UNREACHABLE,
774 ICMP_CODE_NET_UNREACHABLE,
775 icmp_noinfo);
776 BUF_FREE(buf);
777 }
778 }
779 }
780 BUF_ASSERT_FREE(buf);
781 }
782
783 static void netlink_packet_forward(struct netlink *st,
784 struct netlink_client *sender,
785 struct buffer_if *buf)
786 {
787 if (buf->size < (int)sizeof(struct iphdr)) return;
788 struct iphdr *iph=(struct iphdr *)buf->start;
789
790 BUF_ASSERT_USED(buf);
791
792 /* Packet has already been checked */
793 if (iph->ttl<=1) {
794 /* Generate ICMP time exceeded */
795 netlink_icmp_simple(st,sender,buf,ICMP_TYPE_TIME_EXCEEDED,
796 ICMP_CODE_TTL_EXCEEDED,icmp_noinfo);
797 BUF_FREE(buf);
798 return;
799 }
800 iph->ttl--;
801 iph->check=0;
802 iph->check=ip_fast_csum((uint8_t *)iph,iph->ihl);
803
804 netlink_packet_deliver(st,sender,buf);
805 BUF_ASSERT_FREE(buf);
806 }
807
808 /* Deal with packets addressed explicitly to us */
809 static void netlink_packet_local(struct netlink *st,
810 struct netlink_client *sender,
811 struct buffer_if *buf)
812 {
813 struct icmphdr *h;
814
815 st->localcount++;
816
817 if (buf->size < (int)sizeof(struct icmphdr)) {
818 Message(M_WARNING,"%s: short packet addressed to secnet; "
819 "ignoring it\n",st->name);
820 BUF_FREE(buf);
821 return;
822 }
823 h=(struct icmphdr *)buf->start;
824
825 unsigned fraginfo = ntohs(h->iph.frag);
826 if ((fraginfo&(IPHDR_FRAG_OFF|IPHDR_FRAG_MORE))!=0) {
827 if (!(fraginfo & IPHDR_FRAG_OFF))
828 /* report only for first fragment */
829 Message(M_WARNING,"%s: fragmented packet addressed to secnet; "
830 "ignoring it\n",st->name);
831 BUF_FREE(buf);
832 return;
833 }
834
835 if (h->iph.protocol==1) {
836 /* It's ICMP */
837 if (h->type==ICMP_TYPE_ECHO_REQUEST && h->code==0) {
838 /* ICMP echo-request. Special case: we re-use the buffer
839 to construct the reply. */
840 h->type=ICMP_TYPE_ECHO_REPLY;
841 h->iph.daddr=h->iph.saddr;
842 h->iph.saddr=htonl(st->secnet_address);
843 h->iph.ttl=255;
844 h->iph.check=0;
845 h->iph.check=ip_fast_csum((uint8_t *)h,h->iph.ihl);
846 netlink_icmp_csum(h);
847 netlink_packet_deliver(st,NULL,buf);
848 return;
849 }
850 Message(M_WARNING,"%s: unknown incoming ICMP\n",st->name);
851 } else {
852 /* Send ICMP protocol unreachable */
853 netlink_icmp_simple(st,sender,buf,ICMP_TYPE_UNREACHABLE,
854 ICMP_CODE_PROTOCOL_UNREACHABLE,icmp_noinfo);
855 BUF_FREE(buf);
856 return;
857 }
858
859 BUF_FREE(buf);
860 }
861
862 /* If cid==NULL packet is from host, otherwise cid specifies which tunnel
863 it came from. */
864 static void netlink_incoming(struct netlink *st, struct netlink_client *sender,
865 struct buffer_if *buf)
866 {
867 uint32_t source,dest;
868 struct iphdr *iph;
869 char errmsgbuf[50];
870 const char *sourcedesc=sender?sender->name:"host";
871
872 BUF_ASSERT_USED(buf);
873
874 if (!netlink_check(st,buf,errmsgbuf,sizeof(errmsgbuf))) {
875 Message(M_WARNING,"%s: bad IP packet from %s: %s\n",
876 st->name,sourcedesc,
877 errmsgbuf);
878 BUF_FREE(buf);
879 return;
880 }
881 assert(buf->size >= (int)sizeof(struct iphdr));
882 iph=(struct iphdr *)buf->start;
883
884 source=ntohl(iph->saddr);
885 dest=ntohl(iph->daddr);
886
887 /* Check source. If we don't like the source, there's no point
888 generating ICMP because we won't know how to get it to the
889 source of the packet. */
890 if (sender) {
891 /* Check that the packet source is appropriate for the tunnel
892 it came down */
893 if (!ipset_contains_addr(sender->networks,source)) {
894 string_t s,d;
895 s=ipaddr_to_string(source);
896 d=ipaddr_to_string(dest);
897 Message(M_WARNING,"%s: packet from tunnel %s with bad "
898 "source address (s=%s,d=%s)\n",st->name,sender->name,s,d);
899 BUF_FREE(buf);
900 return;
901 }
902 } else {
903 /* Check that the packet originates in our configured local
904 network, and hasn't been forwarded from elsewhere or
905 generated with the wrong source address */
906 if (!ipset_contains_addr(st->networks,source)) {
907 string_t s,d;
908 s=ipaddr_to_string(source);
909 d=ipaddr_to_string(dest);
910 Message(M_WARNING,"%s: outgoing packet with bad source address "
911 "(s=%s,d=%s)\n",st->name,s,d);
912 BUF_FREE(buf);
913 return;
914 }
915 }
916
917 /* If this is a point-to-point device we don't examine the
918 destination address at all; we blindly send it down our
919 one-and-only registered tunnel, or to the host, depending on
920 where it came from. It's up to external software to check
921 address validity and generate ICMP, etc. */
922 if (st->ptp) {
923 if (sender) {
924 netlink_host_deliver(st,sender,source,dest,buf);
925 } else {
926 netlink_client_deliver(st,st->clients,source,dest,buf);
927 }
928 BUF_ASSERT_FREE(buf);
929 return;
930 }
931
932 /* st->secnet_address needs checking before matching destination
933 addresses */
934 if (dest==st->secnet_address) {
935 netlink_packet_local(st,sender,buf);
936 BUF_ASSERT_FREE(buf);
937 return;
938 }
939 netlink_packet_forward(st,sender,buf);
940 BUF_ASSERT_FREE(buf);
941 }
942
943 static void netlink_inst_incoming(void *sst, struct buffer_if *buf)
944 {
945 struct netlink_client *c=sst;
946 struct netlink *st=c->nst;
947
948 netlink_incoming(st,c,buf);
949 }
950
951 static void netlink_dev_incoming(void *sst, struct buffer_if *buf)
952 {
953 struct netlink *st=sst;
954
955 netlink_incoming(st,NULL,buf);
956 }
957
958 static void netlink_set_quality(void *sst, uint32_t quality)
959 {
960 struct netlink_client *c=sst;
961 struct netlink *st=c->nst;
962
963 c->link_quality=quality;
964 c->up=(c->link_quality==LINK_QUALITY_DOWN)?False:True;
965 if (c->options&OPT_SOFTROUTE) {
966 st->set_routes(st->dst,c);
967 }
968 }
969
970 static void netlink_output_subnets(struct netlink *st, uint32_t loglevel,
971 struct subnet_list *snets)
972 {
973 int32_t i;
974 string_t net;
975
976 for (i=0; i<snets->entries; i++) {
977 net=subnet_to_string(snets->list[i]);
978 Message(loglevel,"%s ",net);
979 free(net);
980 }
981 }
982
983 static void netlink_dump_routes(struct netlink *st, bool_t requested)
984 {
985 int i;
986 string_t net;
987 uint32_t c=M_INFO;
988
989 if (requested) c=M_WARNING;
990 if (st->ptp) {
991 net=ipaddr_to_string(st->secnet_address);
992 Message(c,"%s: point-to-point (remote end is %s); routes: ",
993 st->name, net);
994 netlink_output_subnets(st,c,st->clients->subnets);
995 Message(c,"\n");
996 } else {
997 Message(c,"%s: routing table:\n",st->name);
998 for (i=0; i<st->n_clients; i++) {
999 netlink_output_subnets(st,c,st->routes[i]->subnets);
1000 Message(c,"-> tunnel %s (%s,mtu %d,%s routes,%s,"
1001 "quality %d,use %d,pri %lu)\n",
1002 st->routes[i]->name,
1003 st->routes[i]->up?"up":"down",
1004 st->routes[i]->mtu,
1005 st->routes[i]->options&OPT_SOFTROUTE?"soft":"hard",
1006 st->routes[i]->options&OPT_ALLOWROUTE?"free":"restricted",
1007 st->routes[i]->link_quality,
1008 st->routes[i]->outcount,
1009 (unsigned long)st->routes[i]->priority);
1010 }
1011 net=ipaddr_to_string(st->secnet_address);
1012 Message(c,"%s/32 -> netlink \"%s\" (use %d)\n",
1013 net,st->name,st->localcount);
1014 for (i=0; i<st->subnets->entries; i++) {
1015 net=subnet_to_string(st->subnets->list[i]);
1016 Message(c,"%s ",net);
1017 free(net);
1018 }
1019 if (i>0)
1020 Message(c,"-> host (use %d)\n",st->outcount);
1021 }
1022 }
1023
1024 /* ap is a pointer to a member of the routes array */
1025 static int netlink_compare_client_priority(const void *ap, const void *bp)
1026 {
1027 const struct netlink_client *const*a=ap;
1028 const struct netlink_client *const*b=bp;
1029
1030 if ((*a)->priority==(*b)->priority) return 0;
1031 if ((*a)->priority<(*b)->priority) return 1;
1032 return -1;
1033 }
1034
1035 static void netlink_phase_hook(void *sst, uint32_t new_phase)
1036 {
1037 struct netlink *st=sst;
1038 struct netlink_client *c;
1039 int32_t i;
1040
1041 /* All the networks serviced by the various tunnels should now
1042 * have been registered. We build a routing table by sorting the
1043 * clients by priority. */
1044 st->routes=safe_malloc_ary(sizeof(*st->routes),st->n_clients,
1045 "netlink_phase_hook");
1046 /* Fill the table */
1047 i=0;
1048 for (c=st->clients; c; c=c->next) {
1049 assert(i<INT_MAX);
1050 st->routes[i++]=c;
1051 }
1052 /* Sort the table in descending order of priority */
1053 qsort(st->routes,st->n_clients,sizeof(*st->routes),
1054 netlink_compare_client_priority);
1055
1056 netlink_dump_routes(st,False);
1057 }
1058
1059 static void netlink_signal_handler(void *sst, int signum)
1060 {
1061 struct netlink *st=sst;
1062 Message(M_INFO,"%s: route dump requested by SIGUSR1\n",st->name);
1063 netlink_dump_routes(st,True);
1064 }
1065
1066 static void netlink_inst_set_mtu(void *sst, int32_t new_mtu)
1067 {
1068 struct netlink_client *c=sst;
1069
1070 c->mtu=new_mtu;
1071 }
1072
1073 static void netlink_inst_reg(void *sst, netlink_deliver_fn *deliver,
1074 void *dst, uint32_t *localmtu_r)
1075 {
1076 struct netlink_client *c=sst;
1077 struct netlink *st=c->nst;
1078
1079 c->deliver=deliver;
1080 c->dst=dst;
1081
1082 if (localmtu_r)
1083 *localmtu_r=st->mtu;
1084 }
1085
1086 static struct flagstr netlink_option_table[]={
1087 { "soft", OPT_SOFTROUTE },
1088 { "allow-route", OPT_ALLOWROUTE },
1089 { NULL, 0}
1090 };
1091 /* This is the routine that gets called when the closure that's
1092 returned by an invocation of a netlink device closure (eg. tun,
1093 userv-ipif) is invoked. It's used to create routes and pass in
1094 information about them; the closure it returns is used by site
1095 code. */
1096 static closure_t *netlink_inst_create(struct netlink *st,
1097 struct cloc loc, dict_t *dict)
1098 {
1099 struct netlink_client *c;
1100 string_t name;
1101 struct ipset *networks;
1102 uint32_t options,priority;
1103 int32_t mtu;
1104 list_t *l;
1105
1106 name=dict_read_string(dict, "name", True, st->name, loc);
1107
1108 l=dict_lookup(dict,"routes");
1109 if (!l)
1110 cfgfatal(loc,st->name,"required parameter \"routes\" not found\n");
1111 networks=string_list_to_ipset(l,loc,st->name,"routes");
1112 options=string_list_to_word(dict_lookup(dict,"options"),
1113 netlink_option_table,st->name);
1114
1115 priority=dict_read_number(dict,"priority",False,st->name,loc,0);
1116 mtu=dict_read_number(dict,"mtu",False,st->name,loc,0);
1117
1118 if ((options&OPT_SOFTROUTE) && !st->set_routes) {
1119 cfgfatal(loc,st->name,"this netlink device does not support "
1120 "soft routes.\n");
1121 return NULL;
1122 }
1123
1124 if (options&OPT_SOFTROUTE) {
1125 /* XXX for now we assume that soft routes require root privilege;
1126 this may not always be true. The device driver can tell us. */
1127 require_root_privileges=True;
1128 require_root_privileges_explanation="netlink: soft routes";
1129 if (st->ptp) {
1130 cfgfatal(loc,st->name,"point-to-point netlinks do not support "
1131 "soft routes.\n");
1132 return NULL;
1133 }
1134 }
1135
1136 /* Check that nets are a subset of st->remote_networks;
1137 refuse to register if they are not. */
1138 if (!ipset_is_subset(st->remote_networks,networks)) {
1139 cfgfatal(loc,st->name,"routes are not allowed\n");
1140 return NULL;
1141 }
1142
1143 c=safe_malloc(sizeof(*c),"netlink_inst_create");
1144 c->cl.description=name;
1145 c->cl.type=CL_NETLINK;
1146 c->cl.apply=NULL;
1147 c->cl.interface=&c->ops;
1148 c->ops.st=c;
1149 c->ops.reg=netlink_inst_reg;
1150 c->ops.deliver=netlink_inst_incoming;
1151 c->ops.set_quality=netlink_set_quality;
1152 c->ops.set_mtu=netlink_inst_set_mtu;
1153 c->nst=st;
1154
1155 c->networks=networks;
1156 c->subnets=ipset_to_subnet_list(networks);
1157 c->priority=priority;
1158 c->deliver=NULL;
1159 c->dst=NULL;
1160 c->name=name;
1161 c->link_quality=LINK_QUALITY_UNUSED;
1162 c->mtu=mtu?mtu:st->mtu;
1163 c->options=options;
1164 c->outcount=0;
1165 c->up=False;
1166 c->kup=False;
1167 c->next=st->clients;
1168 st->clients=c;
1169 assert(st->n_clients < INT_MAX);
1170 st->n_clients++;
1171
1172 return &c->cl;
1173 }
1174
1175 static list_t *netlink_inst_apply(closure_t *self, struct cloc loc,
1176 dict_t *context, list_t *args)
1177 {
1178 struct netlink *st=self->interface;
1179
1180 dict_t *dict;
1181 item_t *item;
1182 closure_t *cl;
1183
1184 item=list_elem(args,0);
1185 if (!item || item->type!=t_dict) {
1186 cfgfatal(loc,st->name,"must have a dictionary argument\n");
1187 }
1188 dict=item->data.dict;
1189
1190 cl=netlink_inst_create(st,loc,dict);
1191
1192 return new_closure(cl);
1193 }
1194
1195 netlink_deliver_fn *netlink_init(struct netlink *st,
1196 void *dst, struct cloc loc,
1197 dict_t *dict, cstring_t description,
1198 netlink_route_fn *set_routes,
1199 netlink_deliver_fn *to_host)
1200 {
1201 item_t *sa, *ptpa;
1202 list_t *l;
1203
1204 st->dst=dst;
1205 st->cl.description=description;
1206 st->cl.type=CL_PURE;
1207 st->cl.apply=netlink_inst_apply;
1208 st->cl.interface=st;
1209 st->clients=NULL;
1210 st->routes=NULL;
1211 st->n_clients=0;
1212 st->set_routes=set_routes;
1213 st->deliver_to_host=to_host;
1214
1215 st->name=dict_read_string(dict,"name",False,description,loc);
1216 if (!st->name) st->name=description;
1217 l=dict_lookup(dict,"networks");
1218 if (l)
1219 st->networks=string_list_to_ipset(l,loc,st->name,"networks");
1220 else {
1221 struct ipset *empty;
1222 empty=ipset_new();
1223 st->networks=ipset_complement(empty);
1224 ipset_free(empty);
1225 }
1226 l=dict_lookup(dict,"remote-networks");
1227 if (l) {
1228 st->remote_networks=string_list_to_ipset(l,loc,st->name,
1229 "remote-networks");
1230 } else {
1231 struct ipset *empty;
1232 empty=ipset_new();
1233 st->remote_networks=ipset_complement(empty);
1234 ipset_free(empty);
1235 }
1236 st->local_address=string_item_to_ipaddr(
1237 dict_find_item(dict,"local-address", True, "netlink", loc),"netlink");
1238
1239 sa=dict_find_item(dict,"secnet-address",False,"netlink",loc);
1240 ptpa=dict_find_item(dict,"ptp-address",False,"netlink",loc);
1241 if (sa && ptpa) {
1242 cfgfatal(loc,st->name,"you may not specify secnet-address and "
1243 "ptp-address in the same netlink device\n");
1244 }
1245 if (!(sa || ptpa)) {
1246 cfgfatal(loc,st->name,"you must specify secnet-address or "
1247 "ptp-address for this netlink device\n");
1248 }
1249 if (sa) {
1250 st->secnet_address=string_item_to_ipaddr(sa,"netlink");
1251 st->ptp=False;
1252 } else {
1253 st->secnet_address=string_item_to_ipaddr(ptpa,"netlink");
1254 st->ptp=True;
1255 }
1256 /* To be strictly correct we could subtract secnet_address from
1257 networks here. It shouldn't make any practical difference,
1258 though, and will make the route dump look complicated... */
1259 st->subnets=ipset_to_subnet_list(st->networks);
1260 st->mtu=dict_read_number(dict, "mtu", False, "netlink", loc, DEFAULT_MTU);
1261 buffer_new(&st->icmp,MAX(ICMP_BUFSIZE,st->mtu));
1262 st->outcount=0;
1263 st->localcount=0;
1264
1265 add_hook(PHASE_SETUP,netlink_phase_hook,st);
1266 request_signal_notification(SIGUSR1, netlink_signal_handler, st);
1267
1268 /* If we're point-to-point then we return a CL_NETLINK directly,
1269 rather than a CL_NETLINK_OLD or pure closure (depending on
1270 compatibility). This CL_NETLINK is for our one and only
1271 client. Our cl.apply function is NULL. */
1272 if (st->ptp) {
1273 closure_t *cl;
1274 cl=netlink_inst_create(st,loc,dict);
1275 st->cl=*cl;
1276 }
1277 return netlink_dev_incoming;
1278 }
1279
1280 /* No connection to the kernel at all... */
1281
1282 struct null {
1283 struct netlink nl;
1284 };
1285
1286 static bool_t null_set_route(void *sst, struct netlink_client *routes)
1287 {
1288 struct null *st=sst;
1289
1290 if (routes->up!=routes->kup) {
1291 Message(M_INFO,"%s: setting routes for tunnel %s to state %s\n",
1292 st->nl.name,routes->name,
1293 routes->up?"up":"down");
1294 routes->kup=routes->up;
1295 return True;
1296 }
1297 return False;
1298 }
1299
/* Deliver-to-host function for the null netlink: does nothing with
   the buffer it is given. */
static void null_deliver(void *sst, struct buffer_if *buf)
{
    /* Intentionally empty. */
}
1304
1305 static list_t *null_apply(closure_t *self, struct cloc loc, dict_t *context,
1306 list_t *args)
1307 {
1308 struct null *st;
1309 item_t *item;
1310 dict_t *dict;
1311
1312 st=safe_malloc(sizeof(*st),"null_apply");
1313
1314 item=list_elem(args,0);
1315 if (!item || item->type!=t_dict)
1316 cfgfatal(loc,"null-netlink","parameter must be a dictionary\n");
1317
1318 dict=item->data.dict;
1319
1320 netlink_init(&st->nl,st,loc,dict,"null-netlink",null_set_route,
1321 null_deliver);
1322
1323 return new_closure(&st->nl.cl);
1324 }
1325
1326 void netlink_module(dict_t *dict)
1327 {
1328 add_closure(dict,"null-netlink",null_apply);
1329 }