41968e4dc6ffc1c56f44cdb21d6c8b89623b1aea
[secnet] / netlink.c
1 /* User-kernel network link */
2
3 /* See RFCs 791, 792, 1123 and 1812 */
4
5 /* The netlink device is actually a router. Tunnels are unnumbered
6 point-to-point lines (RFC1812 section 2.2.7); the router has a
7 single address (the 'router-id'). */
8
9 /* This is where we currently have the anti-spoofing paranoia - before
10 sending a packet to the kernel we check that the tunnel it came
11 over could reasonably have produced it. */
12
13
14 /* Points to note from RFC1812 (which may require changes in this
15 file):
16
17 3.3.4 Maximum Transmission Unit - MTU
18
19 The MTU of each logical interface MUST be configurable within the
20 range of legal MTUs for the interface.
21
22 Many Link Layer protocols define a maximum frame size that may be
23 sent. In such cases, a router MUST NOT allow an MTU to be set which
24 would allow sending of frames larger than those allowed by the Link
25 Layer protocol. However, a router SHOULD be willing to receive a
26 packet as large as the maximum frame size even if that is larger than
27 the MTU.
28
29 4.2.1 A router SHOULD count datagrams discarded.
30
31 4.2.2.1 Source route options - we probably should implement processing
32 of source routes, even though mostly the security policy will prevent
33 their use.
34
35 5.3.13.4 Source Route Options
36
37 A router MUST implement support for source route options in forwarded
38 packets. A router MAY implement a configuration option that, when
39 enabled, causes all source-routed packets to be discarded. However,
40 such an option MUST NOT be enabled by default.
41
42 5.3.13.5 Record Route Option
43
44 Routers MUST support the Record Route option in forwarded packets.
45
46 A router MAY provide a configuration option that, if enabled, will
47 cause the router to ignore (i.e., pass through unchanged) Record
48 Route options in forwarded packets. If provided, such an option MUST
49 default to enabling the record-route. This option should not affect
50 the processing of Record Route options in datagrams received by the
51 router itself (in particular, Record Route options in ICMP echo
52 requests will still be processed according to Section [4.3.3.6]).
53
54 5.3.13.6 Timestamp Option
55
56 Routers MUST support the timestamp option in forwarded packets. A
57 timestamp value MUST follow the rules given [INTRO:2].
58
59 If the flags field = 3 (timestamp and prespecified address), the
60 router MUST add its timestamp if the next prespecified address
61 matches any of the router's IP addresses. It is not necessary that
62 the prespecified address be either the address of the interface on
63 which the packet arrived or the address of the interface over which
64 it will be sent.
65
66
67 4.2.2.7 Fragmentation: RFC 791 Section 3.2
68
69 Fragmentation, as described in [INTERNET:1], MUST be supported by a
70 router.
71
72 4.2.2.8 Reassembly: RFC 791 Section 3.2
73
74 As specified in the corresponding section of [INTRO:2], a router MUST
75 support reassembly of datagrams that it delivers to itself.
76
77 4.2.2.9 Time to Live: RFC 791 Section 3.2
78
79 Note in particular that a router MUST NOT check the TTL of a packet
80 except when forwarding it.
81
82 A router MUST NOT discard a datagram just because it was received
83 with TTL equal to zero or one; if it is to the router and otherwise
84 valid, the router MUST attempt to receive it.
85
86 On messages the router originates, the IP layer MUST provide a means
87 for the transport layer to set the TTL field of every datagram that
88 is sent. When a fixed TTL value is used, it MUST be configurable.
89
90
91 8.1 The Simple Network Management Protocol - SNMP
92 8.1.1 SNMP Protocol Elements
93
94 Routers MUST be manageable by SNMP [MGT:3]. The SNMP MUST operate
95 using UDP/IP as its transport and network protocols.
96
97
98 */
99
100 #include <string.h>
101 #include <assert.h>
102 #include <limits.h>
103 #include "secnet.h"
104 #include "util.h"
105 #include "ipaddr.h"
106 #include "netlink.h"
107 #include "process.h"
108
/* MDEBUG(fmt, ...): when the file is built with NETLINK_DEBUG this passes
   its arguments to Message() at level M_DEBUG; otherwise it expands to a
   no-op expression and its arguments are not evaluated. */
#ifndef NETLINK_DEBUG
#define MDEBUG(...) ((void)0)
#else /* NETLINK_DEBUG */
#define MDEBUG(...) Message(M_DEBUG, __VA_ARGS__)
#endif /* NETLINK_DEBUG */
114
/* ICMP message types, each followed by the codes this file uses with it. */

/* Type 0: echo reply (sent in answer to an echo request). */
#define ICMP_TYPE_ECHO_REPLY 0

/* Type 3: destination unreachable, and its codes. */
#define ICMP_TYPE_UNREACHABLE 3
#define ICMP_CODE_NET_UNREACHABLE 0
#define ICMP_CODE_PROTOCOL_UNREACHABLE 2
#define ICMP_CODE_FRAGMENTATION_REQUIRED 4
#define ICMP_CODE_NET_PROHIBITED 13

/* Type 8: echo request. */
#define ICMP_TYPE_ECHO_REQUEST 8

/* Type 11: time exceeded, and its code for TTL expiry in transit. */
#define ICMP_TYPE_TIME_EXCEEDED 11
#define ICMP_CODE_TTL_EXCEEDED 0
127
/* Generic IP checksum routine (the Internet checksum of RFC 1071).
 *
 * iph:   first octet of the data to be summed; no alignment is required.
 * count: number of octets to sum; nothing is summed if it is <= 0.
 *
 * The data is summed as big-endian 16-bit words with end-around carry.
 * If count is odd the final octet is treated as the high-order half of
 * a word whose low-order half is zero.
 *
 * Returns the one's complement of that sum as a uint16_t holding the
 * value in network byte order, i.e. ready to be stored directly into a
 * checksum field.  Summing data that already contains a correct
 * checksum therefore returns 0. */
static inline uint16_t ip_csum(const uint8_t *iph,int32_t count)
{
    uint32_t sum=0;
    uint16_t folded,result;
    uint8_t wire[2];

    /* Build each word from its two octets instead of dereferencing a
       cast uint16_t pointer: the data may start at an odd address. */
    while (count>1) {
        sum+=((uint32_t)iph[0]<<8)|iph[1];
        iph+=2;
        count-=2;
    }
    /* Odd trailing octet: pad on the right with a zero octet. */
    if (count>0)
        sum+=(uint32_t)iph[0]<<8;
    /* Fold the carries back in until the sum fits in 16 bits. */
    while (sum>>16)
        sum=(sum&0xffff)+(sum>>16);
    folded=(uint16_t)~sum;
    /* Lay the result out most-significant octet first (network order). */
    wire[0]=(uint8_t)(folded>>8);
    wire[1]=(uint8_t)(folded&0xff);
    memcpy(&result,wire,sizeof(result));
    return result;
}
144
#ifdef i386
/*
 * This is a version of ip_compute_csum() optimized for IP headers,
 * which always checksum on 4 octet boundaries.
 *
 * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
 * Arnt Gulbrandsen.
 *
 * iph: start of the IP header.
 * ihl: header length in 32-bit words.
 */
static inline uint16_t ip_fast_csum(const uint8_t *iph, int32_t ihl) {
    uint32_t sum;

    __asm__ __volatile__(
            "movl (%1), %0 ;\n"
            "subl $4, %2 ;\n"
            "jbe 2f ;\n"
            "addl 4(%1), %0 ;\n"
            "adcl 8(%1), %0 ;\n"
            "adcl 12(%1), %0 ;\n"
            "1: adcl 16(%1), %0 ;\n"
            "lea 4(%1), %1 ;\n"
            "decl %2 ;\n"
            "jne 1b ;\n"
            "adcl $0, %0 ;\n"
            "movl %0, %2 ;\n"
            "shrl $16, %0 ;\n"
            "addw %w2, %w0 ;\n"
            "adcl $0, %0 ;\n"
            "notl %0 ;\n"
            "2: ;\n"
            /* Since the input registers which are loaded with iph and ihl
               are modified, we must also specify them as outputs, or gcc
               will assume they contain their original values. */
            : "=r" (sum), "=r" (iph), "=r" (ihl)
            : "1" (iph), "2" (ihl)
            : "memory");
    return sum;
}
#else
/*
 * Portable version: checksum an IP header of ihl 32-bit words by handing
 * the equivalent octet count to ip_csum().
 *
 * iph is const-qualified, matching the i386 variant above, so that
 * callers holding a pointer-to-const header can use either build.
 */
static inline uint16_t ip_fast_csum(const uint8_t *iph, int32_t ihl)
{
    /* guard the multiplication below against signed overflow */
    assert(ihl < INT_MAX/4);
    return ip_csum(iph,ihl*4);
}
#endif
189
/* The fixed part of an IPv4 header, laid out as it appears in a packet
   buffer.  The code in this file converts the multi-octet fields with
   ntohs/ntohl/htons/htonl when reading and writing them. */
struct iphdr {
    /* First octet: IP version and header length (in 32-bit words).
       Which bitfield is declared first depends on WORDS_BIGENDIAN. */
#if !defined (WORDS_BIGENDIAN)
    uint8_t ihl:4;
    uint8_t version:4;
#else
    uint8_t version:4;
    uint8_t ihl:4;
#endif
    uint8_t tos;
    uint16_t tot_len;   /* length of the whole datagram, header included */
    uint16_t id;
    uint16_t frag;      /* flags and fragment offset; masks follow */
#define IPHDR_FRAG_OFF ((uint16_t)0x1fff)
#define IPHDR_FRAG_MORE ((uint16_t)0x2000)
#define IPHDR_FRAG_DONT ((uint16_t)0x4000)
    /* reserved 0x8000 */
    uint8_t ttl;
    uint8_t protocol;
    uint16_t check;     /* header checksum */
    uint32_t saddr;
    uint32_t daddr;
    /* The options start here. */
};
213
214 struct icmphdr {
215 struct iphdr iph;
216 uint8_t type;
217 uint8_t code;
218 uint16_t check;
219 union icmpinfofield {
220 uint32_t unused;
221 struct {
222 uint8_t pointer;
223 uint8_t unused1;
224 uint16_t unused2;
225 } pprob;
226 uint32_t gwaddr;
227 struct {
228 uint16_t id;
229 uint16_t seq;
230 } echo;
231 struct {
232 uint16_t unused;
233 uint16_t mtu;
234 } fragneeded;
235 } d;
236 };
237
238 static const union icmpinfofield icmp_noinfo;
239
240 static void netlink_packet_deliver(struct netlink *st,
241 struct netlink_client *client,
242 struct buffer_if *buf);
243
244 /* XXX RFC1812 4.3.2.5:
245 All other ICMP error messages (Destination Unreachable,
246 Redirect, Time Exceeded, and Parameter Problem) SHOULD have their
247 precedence value set to 6 (INTERNETWORK CONTROL) or 7 (NETWORK
248 CONTROL). The IP Precedence value for these error messages MAY be
249 settable.
250 */
251 static struct icmphdr *netlink_icmp_tmpl(struct netlink *st,
252 uint32_t dest,uint16_t len)
253 {
254 struct icmphdr *h;
255
256 BUF_ALLOC(&st->icmp,"netlink_icmp_tmpl");
257 buffer_init(&st->icmp,calculate_max_start_pad());
258 h=buf_append(&st->icmp,sizeof(*h));
259
260 h->iph.version=4;
261 h->iph.ihl=5;
262 h->iph.tos=0;
263 h->iph.tot_len=htons(len+(h->iph.ihl*4)+8);
264 h->iph.id=0;
265 h->iph.frag=0;
266 h->iph.ttl=255; /* XXX should be configurable */
267 h->iph.protocol=1;
268 h->iph.saddr=htonl(st->secnet_address);
269 h->iph.daddr=htonl(dest);
270 h->iph.check=0;
271 h->iph.check=ip_fast_csum((uint8_t *)&h->iph,h->iph.ihl);
272 h->check=0;
273 h->d.unused=0;
274
275 return h;
276 }
277
278 /* Fill in the ICMP checksum field correctly */
279 static void netlink_icmp_csum(struct icmphdr *h)
280 {
281 int32_t len;
282
283 len=ntohs(h->iph.tot_len)-(4*h->iph.ihl);
284 h->check=0;
285 h->check=ip_csum(&h->type,len);
286 }
287
288 /* RFC1122:
289 * An ICMP error message MUST NOT be sent as the result of
290 * receiving:
291 *
292 * * an ICMP error message, or
293 *
294 * * a datagram destined to an IP broadcast or IP multicast
295 * address, or
296 *
297 * * a datagram sent as a link-layer broadcast, or
298 *
299 * * a non-initial fragment, or
300 *
301 * * a datagram whose source address does not define a single
302 * host -- e.g., a zero address, a loopback address, a
303 * broadcast address, a multicast address, or a Class E
304 * address.
305 */
306 static bool_t netlink_icmp_may_reply(struct buffer_if *buf)
307 {
308 struct iphdr *iph;
309 struct icmphdr *icmph;
310 uint32_t source;
311
312 if (buf->size < (int)sizeof(struct icmphdr)) return False;
313 iph=(struct iphdr *)buf->start;
314 icmph=(struct icmphdr *)buf->start;
315 if (iph->protocol==1) {
316 switch(icmph->type) {
317 /* Based on http://www.iana.org/assignments/icmp-parameters/icmp-parameters.xhtml#icmp-parameters-types
318 * as retrieved Thu, 20 Mar 2014 00:16:44 +0000.
319 * Deprecated, reserved, unassigned and experimental
320 * options are treated as not safe to reply to.
321 */
322 case 0: /* Echo Reply */
323 case 8: /* Echo */
324 case 13: /* Timestamp */
325 case 14: /* Timestamp Reply */
326 return True;
327 default:
328 return False;
329 }
330 }
331 /* How do we spot broadcast destination addresses? */
332 if (ntohs(iph->frag)&IPHDR_FRAG_OFF) return False;
333 source=ntohl(iph->saddr);
334 if (source==0) return False;
335 if ((source&0xff000000)==0x7f000000) return False;
336 /* How do we spot broadcast source addresses? */
337 if ((source&0xf0000000)==0xe0000000) return False; /* Multicast */
338 if ((source&0xf0000000)==0xf0000000) return False; /* Class E */
339 return True;
340 }
341
342 /* How much of the original IP packet do we include in its ICMP
343 response? The header plus up to 64 bits. */
344
345 /* XXX TODO RFC1812:
346 4.3.2.3 Original Message Header
347
348 Historically, every ICMP error message has included the Internet
349 header and at least the first 8 data bytes of the datagram that
350 triggered the error. This is no longer adequate, due to the use of
351 IP-in-IP tunneling and other technologies. Therefore, the ICMP
352 datagram SHOULD contain as much of the original datagram as possible
353 without the length of the ICMP datagram exceeding 576 bytes. The
354 returned IP header (and user data) MUST be identical to that which
355 was received, except that the router is not required to undo any
356 modifications to the IP header that are normally performed in
357 forwarding that were performed before the error was detected (e.g.,
358 decrementing the TTL, or updating options). Note that the
359 requirements of Section [4.3.3.5] supersede this requirement in some
360 cases (i.e., for a Parameter Problem message, if the problem is in a
361 modified field, the router must undo the modification). See Section
362 [4.3.3.5]).
363 */
364 static uint16_t netlink_icmp_reply_len(struct buffer_if *buf)
365 {
366 if (buf->size < (int)sizeof(struct iphdr)) return 0;
367 struct iphdr *iph=(struct iphdr *)buf->start;
368 uint16_t hlen,plen;
369
370 hlen=iph->ihl*4;
371 /* We include the first 8 bytes of the packet data, provided they exist */
372 hlen+=8;
373 plen=ntohs(iph->tot_len);
374 return (hlen>plen?plen:hlen);
375 }
376
377 /* client indicates where the packet we're constructing a response to
378 comes from. NULL indicates the host. */
379 static void netlink_icmp_simple(struct netlink *st, struct buffer_if *buf,
380 uint8_t type, uint8_t code,
381 union icmpinfofield info)
382 {
383 struct icmphdr *h;
384 uint16_t len;
385
386 if (netlink_icmp_may_reply(buf)) {
387 struct iphdr *iph=(struct iphdr *)buf->start;
388 len=netlink_icmp_reply_len(buf);
389 h=netlink_icmp_tmpl(st,ntohl(iph->saddr),len);
390 h->type=type; h->code=code; h->d=info;
391 memcpy(buf_append(&st->icmp,len),buf->start,len);
392 netlink_icmp_csum(h);
393 netlink_packet_deliver(st,NULL,&st->icmp);
394 BUF_ASSERT_FREE(&st->icmp);
395 }
396 }
397
398 /*
399 * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the
400 * checksum.
401 * RFC1812: 4.2.2.5 MUST discard messages containing invalid checksums.
402 *
403 * Is the datagram acceptable?
404 *
405 * 1. Length at least the size of an ip header
406 * 2. Version of 4
407 * 3. Checksums correctly.
408 * 4. Doesn't have a bogus length
409 */
410 static bool_t netlink_check(struct netlink *st, struct buffer_if *buf,
411 char *errmsgbuf, int errmsgbuflen)
412 {
413 #define BAD(...) do{ \
414 snprintf(errmsgbuf,errmsgbuflen,__VA_ARGS__); \
415 return False; \
416 }while(0)
417
418 if (buf->size < (int)sizeof(struct iphdr)) BAD("len %"PRIu32"",buf->size);
419 struct iphdr *iph=(struct iphdr *)buf->start;
420 int32_t len;
421
422 if (iph->ihl < 5) BAD("ihl %u",iph->ihl);
423 if (iph->version != 4) BAD("version %u",iph->version);
424 if (buf->size < iph->ihl*4) BAD("size %"PRId32"<%u*4",buf->size,iph->ihl);
425 if (ip_fast_csum((uint8_t *)iph, iph->ihl)!=0) BAD("csum");
426 len=ntohs(iph->tot_len);
427 /* There should be no padding */
428 if (buf->size!=len) BAD("len %"PRId32"!=%"PRId32,buf->size,len);
429 if (len<(iph->ihl<<2)) BAD("len %"PRId32"<(%u<<2)",len,iph->ihl);
430 /* XXX check that there's no source route specified */
431 return True;
432
433 #undef BAD
434 }
435
/* Rewrite an IPv4 header in place so that it is suitable for the second
 * and subsequent fragments of a datagram: only options whose "copied"
 * flag (top bit of the option type) is set are kept.
 *
 * base: start of the IP header; modified in place.
 * hlp:  in: current header length in octets (fixed part plus options);
 *       out: the new, possibly shorter, header length.
 *
 * Kept options are packed directly after the fixed header and the
 * result is padded with End of Option List octets to a multiple of 4.
 * The header-length nibble of the first header octet is updated to
 * match; it is written as a byte operation, so no assumption is made
 * about structure layout or the alignment of base.
 *
 * Returns 0 on success, or a static string describing why the options
 * could not be parsed.  On failure *hlp is not updated, but options
 * already processed may have been moved. */
static const char *fragment_filter_header(uint8_t *base, long *hlp)
{
    const int fixedhl = 20; /* fixed IPv4 header; == sizeof(struct iphdr) */
    long hl = *hlp;
    const uint8_t *ipend = base + hl;
    uint8_t *op = base + fixedhl;
    const uint8_t *ip = op;

    while (ip < ipend) {
        uint8_t opt = ip[0];
        long remain = ipend - ip;
        if (opt == 0x00) /* End of Options List */ break;
        if (opt == 0x01) /* No Operation */ {
            /* single-octet option: step over it, or we never advance */
            ip++;
            continue;
        }
        if (remain < 2) return "IPv4 options truncated at length";
        int optlen = ip[1];
        /* the length counts the type and length octets themselves, so
           anything below 2 is malformed and would stop us advancing */
        if (optlen < 2) return "IPv4 option length too small";
        if (remain < optlen) return "IPv4 options truncated in option";
        if (opt & 0x80) /* copy */ {
            /* op never runs ahead of ip, but the ranges can overlap */
            memmove(op, ip, optlen);
            op += optlen;
        }
        ip += optlen;
    }
    while ((hl = (op - base)) & 0x3)
        *op++ = 0x00 /* End of Option List */;
    /* low nibble of the first octet is the header length in words */
    base[0] = (uint8_t)((base[0] & 0xf0) | (hl >> 2));
    *hlp = hl;

    return 0;
}
465
466 /* Fragment or send ICMP Fragmentation Needed */
467 static void netlink_maybe_fragment(struct netlink *st,
468 netlink_deliver_fn *deliver,
469 void *deliver_dst,
470 const char *delivery_name,
471 int32_t mtu,
472 uint32_t source, uint32_t dest,
473 struct buffer_if *buf)
474 {
475 struct iphdr *iph=(struct iphdr*)buf->start;
476 long hl = iph->ihl*4;
477 const char *ssource = ipaddr_to_string(source);
478
479 if (buf->size <= mtu) {
480 deliver(deliver_dst, buf);
481 return;
482 }
483
484 MDEBUG("%s: fragmenting %s->%s org.size=%"PRId32"\n",
485 st->name, ssource, delivery_name, buf->size);
486
487 #define BADFRAG(m, ...) \
488 Message(M_WARNING, \
489 "%s: fragmenting packet from source %s" \
490 " for transmission via %s: " m "\n", \
491 st->name, ssource, delivery_name, \
492 ## __VA_ARGS__);
493
494 unsigned orig_frag = ntohs(iph->frag);
495
496 if (orig_frag&IPHDR_FRAG_DONT) {
497 union icmpinfofield info =
498 { .fragneeded = { .unused = 0, .mtu = htons(mtu) } };
499 netlink_icmp_simple(st,buf,
500 ICMP_TYPE_UNREACHABLE,
501 ICMP_CODE_FRAGMENTATION_REQUIRED,
502 info);
503 BUF_FREE(buf);
504 return;
505 }
506 if (mtu < hl + 8) {
507 BADFRAG("mtu %"PRId32" too small", mtu);
508 BUF_FREE(buf);
509 return;
510 }
511
512 /* we (ab)use the icmp buffer to stash the original packet */
513 struct buffer_if *orig = &st->icmp;
514 BUF_ALLOC(orig,"netlink_client_deliver fragment orig");
515 buffer_copy(orig,buf);
516 BUF_FREE(buf);
517
518 const uint8_t *startindata = orig->start + hl;
519 const uint8_t *indata = startindata;
520 const uint8_t *endindata = orig->start + orig->size;
521 _Bool filtered = 0;
522
523 for (;;) {
524 /* compute our fragment offset */
525 long dataoffset = indata - startindata
526 + (orig_frag & IPHDR_FRAG_OFF)*8;
527 assert(!(dataoffset & 7));
528 if (dataoffset > IPHDR_FRAG_OFF*8) {
529 BADFRAG("ultimate fragment offset out of range");
530 break;
531 }
532
533 BUF_ALLOC(buf,"netlink_client_deliver fragment frag");
534 buffer_init(buf,calculate_max_start_pad());
535
536 /* copy header (possibly filtered); will adjust in a bit */
537 struct iphdr *fragh = buf_append(buf, hl);
538 memcpy(fragh, orig->start, hl);
539
540 /* decide how much payload to copy and copy it */
541 long avail = mtu - hl;
542 long remain = endindata - indata;
543 long use = avail < remain ? (avail & ~(long)7) : remain;
544 memcpy(buf_append(buf, use), indata, use);
545 indata += use;
546
547 _Bool last_frag = indata >= endindata;
548
549 /* adjust the header */
550 fragh->tot_len = htons(buf->size);
551 fragh->frag =
552 htons((orig_frag & ~IPHDR_FRAG_OFF) |
553 (last_frag ? 0 : IPHDR_FRAG_MORE) |
554 (dataoffset >> 3));
555 fragh->check = 0;
556 fragh->check = ip_fast_csum((const void*)fragh, fragh->ihl);
557
558 /* actually send it */
559 deliver(deliver_dst, buf);
560 if (last_frag)
561 break;
562
563 /* after copying the header for the first frag,
564 * we filter the header for the remaining frags */
565 if (!filtered++) {
566 const char *bad = fragment_filter_header(orig->start, &hl);
567 if (bad) { BADFRAG("%s", bad); break; }
568 }
569 }
570
571 BUF_FREE(orig);
572
573 #undef BADFRAG
574 }
575
576 /* Deliver a packet _to_ client; used after we have decided
577 * what to do with it (and just to check that the client has
578 * actually registered a delivery function with us). */
579 static void netlink_client_deliver(struct netlink *st,
580 struct netlink_client *client,
581 uint32_t source, uint32_t dest,
582 struct buffer_if *buf)
583 {
584 if (!client->deliver) {
585 string_t s,d;
586 s=ipaddr_to_string(source);
587 d=ipaddr_to_string(dest);
588 Message(M_ERR,"%s: dropping %s->%s, client not registered\n",
589 st->name,s,d);
590 free(s); free(d);
591 BUF_FREE(buf);
592 return;
593 }
594 netlink_maybe_fragment(st, client->deliver,client->dst,client->name,
595 client->mtu, source,dest,buf);
596 client->outcount++;
597 }
598
599 /* Deliver a packet to the host; used after we have decided that that
600 * is what to do with it. */
601 static void netlink_host_deliver(struct netlink *st,
602 uint32_t source, uint32_t dest,
603 struct buffer_if *buf)
604 {
605 netlink_maybe_fragment(st, st->deliver_to_host,st->dst,"(host)",
606 st->mtu, source,dest,buf);
607 st->outcount++;
608 }
609
610 /* Deliver a packet. "sender"==NULL for packets from the host and packets
611 generated internally in secnet. */
612 static void netlink_packet_deliver(struct netlink *st,
613 struct netlink_client *sender,
614 struct buffer_if *buf)
615 {
616 if (buf->size < (int)sizeof(struct iphdr)) {
617 Message(M_ERR,"%s: trying to deliver a too-short packet"
618 " from %s!\n",st->name, sender?sender->name:"(local)");
619 BUF_FREE(buf);
620 return;
621 }
622
623 struct iphdr *iph=(struct iphdr *)buf->start;
624 uint32_t dest=ntohl(iph->daddr);
625 uint32_t source=ntohl(iph->saddr);
626 uint32_t best_quality;
627 bool_t allow_route=False;
628 bool_t found_allowed=False;
629 int best_match;
630 int i;
631
632 BUF_ASSERT_USED(buf);
633
634 if (dest==st->secnet_address) {
635 Message(M_ERR,"%s: trying to deliver a packet to myself!\n",st->name);
636 BUF_FREE(buf);
637 return;
638 }
639
640 /* Packets from the host (sender==NULL) may always be routed. Packets
641 from clients with the allow_route option will also be routed. */
642 if (!sender || (sender && (sender->options & OPT_ALLOWROUTE)))
643 allow_route=True;
644
645 /* If !allow_route, we check the routing table anyway, and if
646 there's a suitable route with OPT_ALLOWROUTE set we use it. If
647 there's a suitable route, but none with OPT_ALLOWROUTE set then
648 we generate ICMP 'communication with destination network
649 administratively prohibited'. */
650
651 best_quality=0;
652 best_match=-1;
653 for (i=0; i<st->n_clients; i++) {
654 if (st->routes[i]->up &&
655 ipset_contains_addr(st->routes[i]->networks,dest)) {
656 /* It's an available route to the correct destination. But is
657 it better than the one we already have? */
658
659 /* If we have already found an allowed route then we don't
660 bother looking at routes we're not allowed to use. If
661 we don't yet have an allowed route we'll consider any. */
662 if (!allow_route && found_allowed) {
663 if (!(st->routes[i]->options&OPT_ALLOWROUTE)) continue;
664 }
665
666 if (st->routes[i]->link_quality>best_quality
667 || best_quality==0) {
668 best_quality=st->routes[i]->link_quality;
669 best_match=i;
670 if (st->routes[i]->options&OPT_ALLOWROUTE)
671 found_allowed=True;
672 /* If quality isn't perfect we may wish to
673 consider kicking the tunnel with a 0-length
674 packet to prompt it to perform a key setup.
675 Then it'll eventually decide it's up or
676 down. */
677 /* If quality is perfect and we're allowed to use the
678 route we don't need to search any more. */
679 if (best_quality>=MAXIMUM_LINK_QUALITY &&
680 (allow_route || found_allowed)) break;
681 }
682 }
683 }
684 if (best_match==-1) {
685 /* The packet's not going down a tunnel. It might (ought to)
686 be for the host. */
687 if (ipset_contains_addr(st->networks,dest)) {
688 netlink_host_deliver(st,source,dest,buf);
689 BUF_ASSERT_FREE(buf);
690 } else {
691 string_t s,d;
692 s=ipaddr_to_string(source);
693 d=ipaddr_to_string(dest);
694 Message(M_DEBUG,"%s: don't know where to deliver packet "
695 "(s=%s, d=%s)\n", st->name, s, d);
696 free(s); free(d);
697 netlink_icmp_simple(st,buf,ICMP_TYPE_UNREACHABLE,
698 ICMP_CODE_NET_UNREACHABLE, icmp_noinfo);
699 BUF_FREE(buf);
700 }
701 } else {
702 if (!allow_route &&
703 !(st->routes[best_match]->options&OPT_ALLOWROUTE)) {
704 string_t s,d;
705 s=ipaddr_to_string(source);
706 d=ipaddr_to_string(dest);
707 /* We have a usable route but aren't allowed to use it.
708 Generate ICMP destination unreachable: communication
709 with destination network administratively prohibited */
710 Message(M_NOTICE,"%s: denied forwarding for packet (s=%s, d=%s)\n",
711 st->name,s,d);
712 free(s); free(d);
713
714 netlink_icmp_simple(st,buf,ICMP_TYPE_UNREACHABLE,
715 ICMP_CODE_NET_PROHIBITED, icmp_noinfo);
716 BUF_FREE(buf);
717 } else {
718 if (best_quality>0) {
719 netlink_client_deliver(st,st->routes[best_match],
720 source,dest,buf);
721 BUF_ASSERT_FREE(buf);
722 } else {
723 /* Generate ICMP destination unreachable */
724 netlink_icmp_simple(st,buf,
725 ICMP_TYPE_UNREACHABLE,
726 ICMP_CODE_NET_UNREACHABLE,
727 icmp_noinfo);
728 BUF_FREE(buf);
729 }
730 }
731 }
732 BUF_ASSERT_FREE(buf);
733 }
734
735 static void netlink_packet_forward(struct netlink *st,
736 struct netlink_client *sender,
737 struct buffer_if *buf)
738 {
739 if (buf->size < (int)sizeof(struct iphdr)) return;
740 struct iphdr *iph=(struct iphdr *)buf->start;
741
742 BUF_ASSERT_USED(buf);
743
744 /* Packet has already been checked */
745 if (iph->ttl<=1) {
746 /* Generate ICMP time exceeded */
747 netlink_icmp_simple(st,buf,ICMP_TYPE_TIME_EXCEEDED,
748 ICMP_CODE_TTL_EXCEEDED,icmp_noinfo);
749 BUF_FREE(buf);
750 return;
751 }
752 iph->ttl--;
753 iph->check=0;
754 iph->check=ip_fast_csum((uint8_t *)iph,iph->ihl);
755
756 netlink_packet_deliver(st,sender,buf);
757 BUF_ASSERT_FREE(buf);
758 }
759
760 /* Deal with packets addressed explicitly to us */
761 static void netlink_packet_local(struct netlink *st,
762 struct netlink_client *sender,
763 struct buffer_if *buf)
764 {
765 struct icmphdr *h;
766
767 st->localcount++;
768
769 if (buf->size < (int)sizeof(struct icmphdr)) {
770 Message(M_WARNING,"%s: short packet addressed to secnet; "
771 "ignoring it\n",st->name);
772 BUF_FREE(buf);
773 return;
774 }
775 h=(struct icmphdr *)buf->start;
776
777 unsigned fraginfo = ntohs(h->iph.frag);
778 if ((fraginfo&(IPHDR_FRAG_OFF|IPHDR_FRAG_MORE))!=0) {
779 if (!(fraginfo & IPHDR_FRAG_OFF))
780 /* report only for first fragment */
781 Message(M_WARNING,"%s: fragmented packet addressed to secnet; "
782 "ignoring it\n",st->name);
783 BUF_FREE(buf);
784 return;
785 }
786
787 if (h->iph.protocol==1) {
788 /* It's ICMP */
789 if (h->type==ICMP_TYPE_ECHO_REQUEST && h->code==0) {
790 /* ICMP echo-request. Special case: we re-use the buffer
791 to construct the reply. */
792 h->type=ICMP_TYPE_ECHO_REPLY;
793 h->iph.daddr=h->iph.saddr;
794 h->iph.saddr=htonl(st->secnet_address);
795 h->iph.ttl=255;
796 h->iph.check=0;
797 h->iph.check=ip_fast_csum((uint8_t *)h,h->iph.ihl);
798 netlink_icmp_csum(h);
799 netlink_packet_deliver(st,NULL,buf);
800 return;
801 }
802 Message(M_WARNING,"%s: unknown incoming ICMP\n",st->name);
803 } else {
804 /* Send ICMP protocol unreachable */
805 netlink_icmp_simple(st,buf,ICMP_TYPE_UNREACHABLE,
806 ICMP_CODE_PROTOCOL_UNREACHABLE,icmp_noinfo);
807 BUF_FREE(buf);
808 return;
809 }
810
811 BUF_FREE(buf);
812 }
813
814 /* If cid==NULL packet is from host, otherwise cid specifies which tunnel
815 it came from. */
816 static void netlink_incoming(struct netlink *st, struct netlink_client *sender,
817 struct buffer_if *buf)
818 {
819 uint32_t source,dest;
820 struct iphdr *iph;
821 char errmsgbuf[50];
822 const char *sourcedesc=sender?sender->name:"host";
823
824 BUF_ASSERT_USED(buf);
825
826 if (!netlink_check(st,buf,errmsgbuf,sizeof(errmsgbuf))) {
827 Message(M_WARNING,"%s: bad IP packet from %s: %s\n",
828 st->name,sourcedesc,
829 errmsgbuf);
830 BUF_FREE(buf);
831 return;
832 }
833 assert(buf->size >= (int)sizeof(struct iphdr));
834 iph=(struct iphdr *)buf->start;
835
836 source=ntohl(iph->saddr);
837 dest=ntohl(iph->daddr);
838
839 /* Check source. If we don't like the source, there's no point
840 generating ICMP because we won't know how to get it to the
841 source of the packet. */
842 if (sender) {
843 /* Check that the packet source is appropriate for the tunnel
844 it came down */
845 if (!ipset_contains_addr(sender->networks,source)) {
846 string_t s,d;
847 s=ipaddr_to_string(source);
848 d=ipaddr_to_string(dest);
849 Message(M_WARNING,"%s: packet from tunnel %s with bad "
850 "source address (s=%s,d=%s)\n",st->name,sender->name,s,d);
851 free(s); free(d);
852 BUF_FREE(buf);
853 return;
854 }
855 } else {
856 /* Check that the packet originates in our configured local
857 network, and hasn't been forwarded from elsewhere or
858 generated with the wrong source address */
859 if (!ipset_contains_addr(st->networks,source)) {
860 string_t s,d;
861 s=ipaddr_to_string(source);
862 d=ipaddr_to_string(dest);
863 Message(M_WARNING,"%s: outgoing packet with bad source address "
864 "(s=%s,d=%s)\n",st->name,s,d);
865 free(s); free(d);
866 BUF_FREE(buf);
867 return;
868 }
869 }
870
871 /* If this is a point-to-point device we don't examine the
872 destination address at all; we blindly send it down our
873 one-and-only registered tunnel, or to the host, depending on
874 where it came from. It's up to external software to check
875 address validity and generate ICMP, etc. */
876 if (st->ptp) {
877 if (sender) {
878 netlink_host_deliver(st,source,dest,buf);
879 } else {
880 netlink_client_deliver(st,st->clients,source,dest,buf);
881 }
882 BUF_ASSERT_FREE(buf);
883 return;
884 }
885
886 /* st->secnet_address needs checking before matching destination
887 addresses */
888 if (dest==st->secnet_address) {
889 netlink_packet_local(st,sender,buf);
890 BUF_ASSERT_FREE(buf);
891 return;
892 }
893 netlink_packet_forward(st,sender,buf);
894 BUF_ASSERT_FREE(buf);
895 }
896
897 static void netlink_inst_incoming(void *sst, struct buffer_if *buf)
898 {
899 struct netlink_client *c=sst;
900 struct netlink *st=c->nst;
901
902 netlink_incoming(st,c,buf);
903 }
904
/* Packet-arrival callback for the netlink itself: sst is the struct
   netlink.  Passes the packet to netlink_incoming() with a NULL sender,
   which that function treats as "from the host". */
static void netlink_dev_incoming(void *sst, struct buffer_if *buf)
{
    netlink_incoming((struct netlink *)sst,NULL,buf);
}
911
912 static void netlink_set_quality(void *sst, uint32_t quality)
913 {
914 struct netlink_client *c=sst;
915 struct netlink *st=c->nst;
916
917 c->link_quality=quality;
918 c->up=(c->link_quality==LINK_QUALITY_DOWN)?False:True;
919 if (c->options&OPT_SOFTROUTE) {
920 st->set_routes(st->dst,c);
921 }
922 }
923
924 static void netlink_output_subnets(struct netlink *st, uint32_t loglevel,
925 struct subnet_list *snets)
926 {
927 int32_t i;
928 string_t net;
929
930 for (i=0; i<snets->entries; i++) {
931 net=subnet_to_string(snets->list[i]);
932 Message(loglevel,"%s ",net);
933 free(net);
934 }
935 }
936
937 static void netlink_dump_routes(struct netlink *st, bool_t requested)
938 {
939 int i;
940 string_t net;
941 uint32_t c=M_INFO;
942
943 if (requested) c=M_WARNING;
944 if (st->ptp) {
945 net=ipaddr_to_string(st->secnet_address);
946 Message(c,"%s: point-to-point (remote end is %s); routes: ",
947 st->name, net);
948 free(net);
949 netlink_output_subnets(st,c,st->clients->subnets);
950 Message(c,"\n");
951 } else {
952 Message(c,"%s: routing table:\n",st->name);
953 for (i=0; i<st->n_clients; i++) {
954 netlink_output_subnets(st,c,st->routes[i]->subnets);
955 Message(c,"-> tunnel %s (%s,mtu %d,%s routes,%s,"
956 "quality %d,use %d,pri %lu)\n",
957 st->routes[i]->name,
958 st->routes[i]->up?"up":"down",
959 st->routes[i]->mtu,
960 st->routes[i]->options&OPT_SOFTROUTE?"soft":"hard",
961 st->routes[i]->options&OPT_ALLOWROUTE?"free":"restricted",
962 st->routes[i]->link_quality,
963 st->routes[i]->outcount,
964 (unsigned long)st->routes[i]->priority);
965 }
966 net=ipaddr_to_string(st->secnet_address);
967 Message(c,"%s/32 -> netlink \"%s\" (use %d)\n",
968 net,st->name,st->localcount);
969 free(net);
970 for (i=0; i<st->subnets->entries; i++) {
971 net=subnet_to_string(st->subnets->list[i]);
972 Message(c,"%s ",net);
973 free(net);
974 }
975 if (i>0)
976 Message(c,"-> host (use %d)\n",st->outcount);
977 }
978 }
979
980 /* ap is a pointer to a member of the routes array */
981 static int netlink_compare_client_priority(const void *ap, const void *bp)
982 {
983 const struct netlink_client *const*a=ap;
984 const struct netlink_client *const*b=bp;
985
986 if ((*a)->priority==(*b)->priority) return 0;
987 if ((*a)->priority<(*b)->priority) return 1;
988 return -1;
989 }
990
991 static void netlink_phase_hook(void *sst, uint32_t new_phase)
992 {
993 struct netlink *st=sst;
994 struct netlink_client *c;
995 int32_t i;
996
997 /* All the networks serviced by the various tunnels should now
998 * have been registered. We build a routing table by sorting the
999 * clients by priority. */
1000 st->routes=safe_malloc_ary(sizeof(*st->routes),st->n_clients,
1001 "netlink_phase_hook");
1002 /* Fill the table */
1003 i=0;
1004 for (c=st->clients; c; c=c->next) {
1005 assert(i<INT_MAX);
1006 st->routes[i++]=c;
1007 }
1008 /* Sort the table in descending order of priority */
1009 qsort(st->routes,st->n_clients,sizeof(*st->routes),
1010 netlink_compare_client_priority);
1011
1012 netlink_dump_routes(st,False);
1013 }
1014
1015 static void netlink_signal_handler(void *sst, int signum)
1016 {
1017 struct netlink *st=sst;
1018 Message(M_INFO,"%s: route dump requested by SIGUSR1\n",st->name);
1019 netlink_dump_routes(st,True);
1020 }
1021
1022 static void netlink_inst_set_mtu(void *sst, int32_t new_mtu)
1023 {
1024 struct netlink_client *c=sst;
1025
1026 c->mtu=new_mtu;
1027 }
1028
1029 static void netlink_inst_reg(void *sst, netlink_deliver_fn *deliver,
1030 void *dst, uint32_t *localmtu_r)
1031 {
1032 struct netlink_client *c=sst;
1033 struct netlink *st=c->nst;
1034
1035 c->deliver=deliver;
1036 c->dst=dst;
1037
1038 if (localmtu_r)
1039 *localmtu_r=st->mtu;
1040 }
1041
1042 static struct flagstr netlink_option_table[]={
1043 { "soft", OPT_SOFTROUTE },
1044 { "allow-route", OPT_ALLOWROUTE },
1045 { NULL, 0}
1046 };
1047 /* This is the routine that gets called when the closure that's
1048 returned by an invocation of a netlink device closure (eg. tun,
1049 userv-ipif) is invoked. It's used to create routes and pass in
1050 information about them; the closure it returns is used by site
1051 code. */
1052 static closure_t *netlink_inst_create(struct netlink *st,
1053 struct cloc loc, dict_t *dict)
1054 {
1055 struct netlink_client *c;
1056 string_t name;
1057 struct ipset *networks;
1058 uint32_t options,priority;
1059 int32_t mtu;
1060 list_t *l;
1061
1062 name=dict_read_string(dict, "name", True, st->name, loc);
1063
1064 l=dict_lookup(dict,"routes");
1065 if (!l)
1066 cfgfatal(loc,st->name,"required parameter \"routes\" not found\n");
1067 networks=string_list_to_ipset(l,loc,st->name,"routes");
1068 options=string_list_to_word(dict_lookup(dict,"options"),
1069 netlink_option_table,st->name);
1070
1071 priority=dict_read_number(dict,"priority",False,st->name,loc,0);
1072 mtu=dict_read_number(dict,"mtu",False,st->name,loc,0);
1073
1074 if ((options&OPT_SOFTROUTE) && !st->set_routes) {
1075 cfgfatal(loc,st->name,"this netlink device does not support "
1076 "soft routes.\n");
1077 return NULL;
1078 }
1079
1080 if (options&OPT_SOFTROUTE) {
1081 /* XXX for now we assume that soft routes require root privilege;
1082 this may not always be true. The device driver can tell us. */
1083 require_root_privileges=True;
1084 require_root_privileges_explanation="netlink: soft routes";
1085 if (st->ptp) {
1086 cfgfatal(loc,st->name,"point-to-point netlinks do not support "
1087 "soft routes.\n");
1088 return NULL;
1089 }
1090 }
1091
1092 /* Check that nets are a subset of st->remote_networks;
1093 refuse to register if they are not. */
1094 if (!ipset_is_subset(st->remote_networks,networks)) {
1095 cfgfatal(loc,st->name,"routes are not allowed\n");
1096 return NULL;
1097 }
1098
1099 c=safe_malloc(sizeof(*c),"netlink_inst_create");
1100 c->cl.description=name;
1101 c->cl.type=CL_NETLINK;
1102 c->cl.apply=NULL;
1103 c->cl.interface=&c->ops;
1104 c->ops.st=c;
1105 c->ops.reg=netlink_inst_reg;
1106 c->ops.deliver=netlink_inst_incoming;
1107 c->ops.set_quality=netlink_set_quality;
1108 c->ops.set_mtu=netlink_inst_set_mtu;
1109 c->nst=st;
1110
1111 c->networks=networks;
1112 c->subnets=ipset_to_subnet_list(networks);
1113 c->priority=priority;
1114 c->deliver=NULL;
1115 c->dst=NULL;
1116 c->name=name;
1117 c->link_quality=LINK_QUALITY_UNUSED;
1118 c->mtu=mtu?mtu:st->mtu;
1119 c->options=options;
1120 c->outcount=0;
1121 c->up=False;
1122 c->kup=False;
1123 c->next=st->clients;
1124 st->clients=c;
1125 assert(st->n_clients < INT_MAX);
1126 st->n_clients++;
1127
1128 return &c->cl;
1129 }
1130
1131 static list_t *netlink_inst_apply(closure_t *self, struct cloc loc,
1132 dict_t *context, list_t *args)
1133 {
1134 struct netlink *st=self->interface;
1135
1136 dict_t *dict;
1137 item_t *item;
1138 closure_t *cl;
1139
1140 item=list_elem(args,0);
1141 if (!item || item->type!=t_dict) {
1142 cfgfatal(loc,st->name,"must have a dictionary argument\n");
1143 }
1144 dict=item->data.dict;
1145
1146 cl=netlink_inst_create(st,loc,dict);
1147
1148 return new_closure(cl);
1149 }
1150
1151 netlink_deliver_fn *netlink_init(struct netlink *st,
1152 void *dst, struct cloc loc,
1153 dict_t *dict, cstring_t description,
1154 netlink_route_fn *set_routes,
1155 netlink_deliver_fn *to_host)
1156 {
1157 item_t *sa, *ptpa;
1158 list_t *l;
1159
1160 st->dst=dst;
1161 st->cl.description=description;
1162 st->cl.type=CL_PURE;
1163 st->cl.apply=netlink_inst_apply;
1164 st->cl.interface=st;
1165 st->clients=NULL;
1166 st->routes=NULL;
1167 st->n_clients=0;
1168 st->set_routes=set_routes;
1169 st->deliver_to_host=to_host;
1170
1171 st->name=dict_read_string(dict,"name",False,description,loc);
1172 if (!st->name) st->name=description;
1173 l=dict_lookup(dict,"networks");
1174 if (l)
1175 st->networks=string_list_to_ipset(l,loc,st->name,"networks");
1176 else {
1177 struct ipset *empty;
1178 empty=ipset_new();
1179 st->networks=ipset_complement(empty);
1180 ipset_free(empty);
1181 }
1182 l=dict_lookup(dict,"remote-networks");
1183 if (l) {
1184 st->remote_networks=string_list_to_ipset(l,loc,st->name,
1185 "remote-networks");
1186 } else {
1187 struct ipset *empty;
1188 empty=ipset_new();
1189 st->remote_networks=ipset_complement(empty);
1190 ipset_free(empty);
1191 }
1192
1193 sa=dict_find_item(dict,"secnet-address",False,"netlink",loc);
1194 ptpa=dict_find_item(dict,"ptp-address",False,"netlink",loc);
1195 if (sa && ptpa) {
1196 cfgfatal(loc,st->name,"you may not specify secnet-address and "
1197 "ptp-address in the same netlink device\n");
1198 }
1199 if (!(sa || ptpa)) {
1200 cfgfatal(loc,st->name,"you must specify secnet-address or "
1201 "ptp-address for this netlink device\n");
1202 }
1203 if (sa) {
1204 st->secnet_address=string_item_to_ipaddr(sa,"netlink");
1205 st->ptp=False;
1206 } else {
1207 st->secnet_address=string_item_to_ipaddr(ptpa,"netlink");
1208 st->ptp=True;
1209 }
1210 /* To be strictly correct we could subtract secnet_address from
1211 networks here. It shouldn't make any practical difference,
1212 though, and will make the route dump look complicated... */
1213 st->subnets=ipset_to_subnet_list(st->networks);
1214 st->mtu=dict_read_number(dict, "mtu", False, "netlink", loc, DEFAULT_MTU);
1215 buffer_new(&st->icmp,MAX(ICMP_BUFSIZE,st->mtu));
1216 st->outcount=0;
1217 st->localcount=0;
1218
1219 add_hook(PHASE_SETUP,netlink_phase_hook,st);
1220 request_signal_notification(SIGUSR1, netlink_signal_handler, st);
1221
1222 /* If we're point-to-point then we return a CL_NETLINK directly,
1223 rather than a CL_NETLINK_OLD or pure closure (depending on
1224 compatibility). This CL_NETLINK is for our one and only
1225 client. Our cl.apply function is NULL. */
1226 if (st->ptp) {
1227 closure_t *cl;
1228 cl=netlink_inst_create(st,loc,dict);
1229 st->cl=*cl;
1230 }
1231 return netlink_dev_incoming;
1232 }
1233
1234 /* No connection to the kernel at all... */
1235
1236 struct null {
1237 struct netlink nl;
1238 };
1239
1240 static bool_t null_set_route(void *sst, struct netlink_client *routes)
1241 {
1242 struct null *st=sst;
1243
1244 if (routes->up!=routes->kup) {
1245 Message(M_INFO,"%s: setting routes for tunnel %s to state %s\n",
1246 st->nl.name,routes->name,
1247 routes->up?"up":"down");
1248 routes->kup=routes->up;
1249 return True;
1250 }
1251 return False;
1252 }
1253
/* Deliver-to-host function of the null netlink: does nothing at all
   with the buffer it is given. */
static void null_deliver(void *sst, struct buffer_if *buf)
{
    /* Intentionally empty */
}
1258
1259 static list_t *null_apply(closure_t *self, struct cloc loc, dict_t *context,
1260 list_t *args)
1261 {
1262 struct null *st;
1263 item_t *item;
1264 dict_t *dict;
1265
1266 st=safe_malloc(sizeof(*st),"null_apply");
1267
1268 item=list_elem(args,0);
1269 if (!item || item->type!=t_dict)
1270 cfgfatal(loc,"null-netlink","parameter must be a dictionary\n");
1271
1272 dict=item->data.dict;
1273
1274 netlink_init(&st->nl,st,loc,dict,"null-netlink",null_set_route,
1275 null_deliver);
1276
1277 return new_closure(&st->nl.cl);
1278 }
1279
1280 void netlink_module(dict_t *dict)
1281 {
1282 add_closure(dict,"null-netlink",null_apply);
1283 }