+/* Initial accumulator value to pass to `ipcksum' on the first call. */
+#define IPCK_INIT 0xffff
+
+/* Compute an IP checksum over some data.
+ *
+ * The N bytes at BUF are summed as big-endian 16-bit words using
+ * ones'-complement arithmetic; a trailing odd byte is taken as the
+ * high-order byte of a final word.  This is a restartable interface:
+ * initialize A to `IPCK_INIT' for the first call, and pass the previous
+ * result back in as A to continue over further data.  Because of the way
+ * the odd byte is padded, only the last chunk may have odd length.
+ *
+ * Returns the complemented 16-bit sum, except that a folded sum of 0xffff
+ * is returned as 0xffff rather than as zero.
+ */
+static unsigned ipcksum(const void *buf, size_t n, unsigned a)
+{
+  unsigned long aa = a ^ 0xffff;	/* undo the previous complement */
+  const unsigned char *p = buf;
+
+  /* Count down the remaining length rather than comparing against `end - 1':
+   * the latter forms a pointer before the start of the buffer when N is
+   * zero.
+   */
+  for (; n >= 2; n -= 2, p += 2) aa += LOAD16_B(p);
+  if (n) aa += (unsigned)(*p) << 8;
+
+  /* Fold the carries back in until the sum fits in 16 bits. */
+  do aa = (aa & 0xffff) + (aa >> 16); while (aa >= 0x10000);
+  return (aa == 0xffff ? aa : aa ^ 0xffff);
+}
+
+/* TCP/UDP pseudoheader structure (IPv4).  This is checksummed as raw
+ * bytes, so the order of the members matters.
+ */
+struct phdr {
+  struct in_addr ph_src;		/* source address */
+  struct in_addr ph_dst;		/* destination address */
+  uint8_t ph_z;				/* always written as zero */
+  uint8_t ph_p;				/* protocol number */
+  uint16_t ph_len;			/* length, network byte order */
+};
+/* IPv6 flavour of the TCP/UDP pseudoheader.  This is checksummed as raw
+ * bytes, so the order of the members matters.
+ */
+struct phdr6 {
+  struct in6_addr ph6_src;		/* source address */
+  struct in6_addr ph6_dst;		/* destination address */
+  uint32_t ph6_len;			/* upper-layer packet length */
+  uint8_t ph6_z0;			/* three bytes written as zero */
+  uint8_t ph6_z1;
+  uint8_t ph6_z2;
+  uint8_t ph6_nxt;			/* next-header (protocol) number */
+};
+
+/* State for the raw-socket prober; filled in by `raw_setup'. */
+struct raw_state {
+  union addr me;			/* local address, port zeroed */
+  union addr a;				/* remote address, port zeroed */
+  int sk;				/* caller's UDP socket */
+  int rawicmp;				/* raw socket read for ICMP errors */
+  int rawudp;				/* raw socket probes are sent on */
+  uint16_t srcport;			/* local port, as found in `me' */
+  uint16_t dstport;			/* remote port, as found in `a' */
+  unsigned q;				/* current probe sequence number */
+};
+
+/* Set up the raw-socket prober.
+ *
+ * STV points to the `struct raw_state' to fill in; SK is the caller's UDP
+ * socket, whose local address is read with getsockname(2); PP carries the
+ * remote address.  Returns an upper bound on the MTU, found by looking at
+ * the network interfaces, or -1 with `errno' set if something went wrong.
+ * (A failure to install the ICMPv6 filter is fatal and doesn't return.)
+ */
+static int raw_setup(void *stv, int sk, const struct param *pp)
+{
+  struct raw_state *st = stv;
+  socklen_t sz;
+  int i, mtu = -1, err;
+  struct ifaddrs *ifa, *ifaa, *ifap;
+  struct ifreq ifr;
+  struct icmp6_filter f6;
+
+  /* Check that the address is OK, and that we have the necessary raw
+   * sockets.
+   *
+   * For IPv6, also set the filter so we don't get too many useless wakeups.
+   */
+  switch (pp->a.sa.sa_family) {
+    case AF_INET:
+      if (rawerr) { errno = rawerr; goto fail_0; }
+      st->rawicmp = rawicmp; st->rawudp = rawudp; st->sk = sk;
+      /* IPv4 filtering is available on Linux but isn't portable. */
+      break;
+    case AF_INET6:
+      if (rawerr6) { errno = rawerr6; goto fail_0; }
+      st->rawicmp = rawicmp6; st->rawudp = rawudp6; st->sk = sk;
+      ICMP6_FILTER_SETBLOCKALL(&f6);
+      ICMP6_FILTER_SETPASS(ICMP6_PACKET_TOO_BIG, &f6);
+      ICMP6_FILTER_SETPASS(ICMP6_DST_UNREACH, &f6);
+      if (setsockopt(st->rawicmp, IPPROTO_ICMPV6, ICMP6_FILTER,
+		     &f6, sizeof(f6))) {
+	die(EXIT_FAILURE, "failed to set icmpv6 filter: %s",
+	    strerror(errno));
+      }
+      break;
+    default:
+      errno = EPFNOSUPPORT; goto fail_0;
+  }
+
+  /* Initialize the sequence number. */
+  st->q = rand() & 0xffff;
+
+  /* Snaffle the local and remote address and port number. */
+  st->a = pp->a;
+  sz = sizeof(st->me);
+  if (getsockname(sk, &st->me.sa, &sz))
+    goto fail_0;
+
+  /* Only now do some fiddling because Linux doesn't like port numbers in
+   * IPv6 raw destination addresses...
+   */
+  switch (pp->a.sa.sa_family) {
+    case AF_INET:
+      st->srcport = st->me.sin.sin_port; st->me.sin.sin_port = 0;
+      st->dstport = st->a.sin.sin_port; st->a.sin.sin_port = 0;
+
+      /* There isn't a portable way to force the DF flag onto a packet
+       * through UDP, or even through raw IP, unless we write the entire IP
+       * header ourselves.  This is somewhat annoying, especially since we
+       * have an uphill struggle keeping track of which systems randomly
+       * expect which header fields to be presented in host byte order.  Oh,
+       * well.
+       *
+       * This is an IPv4-level option, so apply it only here, and to the
+       * socket we're actually going to send on: an IPv6 probe has no
+       * business poking at (or depending on) the IPv4 raw socket.
+       */
+      i = 1;
+      if (setsockopt(st->rawudp, IPPROTO_IP, IP_HDRINCL, &i, sizeof(i)))
+	goto fail_0;
+      break;
+    case AF_INET6:
+      st->srcport = st->me.sin6.sin6_port; st->me.sin6.sin6_port = 0;
+      st->dstport = st->a.sin6.sin6_port; st->a.sin6.sin6_port = 0;
+      break;
+    default:
+      abort();
+  }
+
+  /* Find an upper bound on the MTU.  Do two passes over the interface
+   * list.  If we can find matches for our local address then use the
+   * highest one of those; otherwise do a second pass and simply take the
+   * highest MTU of any network interface.
+   *
+   * In either pass, skip interfaces which are down, have no address, have
+   * an address of the wrong family, or whose name won't fit in an `ifreq'.
+   * In the second pass, also skip an entry naming the same interface as
+   * the one we just queried.
+   */
+  if (getifaddrs(&ifaa)) goto fail_0;
+  for (i = 0; i < 2; i++) {
+    for (ifap = 0, ifa = ifaa; ifa; ifa = ifa->ifa_next) {
+      if (!(ifa->ifa_flags & IFF_UP) || !ifa->ifa_addr ||
+	  ifa->ifa_addr->sa_family != st->me.sa.sa_family ||
+	  (i == 0 &&
+	   !addreq((union addr *)ifa->ifa_addr, &st->me, 0)) ||
+	  (i == 1 && ifap && strcmp(ifap->ifa_name, ifa->ifa_name) == 0) ||
+	  strlen(ifa->ifa_name) >= sizeof(ifr.ifr_name))
+	continue;
+      ifap = ifa;
+      strcpy(ifr.ifr_name, ifa->ifa_name); /* length checked above */
+      if (ioctl(sk, SIOCGIFMTU, &ifr)) goto fail_1;
+      if (mtu < ifr.ifr_mtu) mtu = ifr.ifr_mtu;
+    }
+    if (mtu > 0) break;
+  }
+  if (mtu < 0) { errno = ENOTCONN; goto fail_1; }
+  freeifaddrs(ifaa);
+
+  /* Done. */
+  return (mtu);
+
+fail_1:
+  /* Don't let the cleanup trample on the error code we want to report. */
+  err = errno; freeifaddrs(ifaa); errno = err;
+fail_0:
+  return (-1);
+}
+
+/* Tear down the raw-socket prober state at STV.  There's nothing to do. */
+static void raw_finish(void *stv)
+{
+  /* Nothing to release. */
+}
+
+/* Register the descriptors the raw-socket prober wants to read from: the
+ * caller's UDP socket and the raw ICMP socket.  The `ADDFD' macro is handed
+ * each descriptor in turn (presumably updating MAXFD and FD_IN; its
+ * definition is elsewhere).
+ */
+static void raw_selprep(void *stv, int *maxfd, fd_set *fd_in)
+{
+  struct raw_state *state = stv;
+
+  ADDFD(state->sk);
+  ADDFD(state->rawicmp);
+}
+
+/* Send a probe packet of total size MTU (IP header included) using the
+ * raw-socket state at STV.
+ *
+ * The packet consists of a hand-built IP (or IPv6) header, a UDP header
+ * carrying the ports recorded by `raw_setup', and payload copied from the
+ * global `buf' (which must hold at least MTU bytes less the headers --
+ * TODO confirm against its definition).  The sequence number is advanced
+ * with `STEP' and stored in the IPv4 identification field or in the IPv6
+ * flow word, so that ICMP errors can be matched to the current probe.
+ *
+ * Returns `RC_OK' if the packet was sent, `RC_LOWER' if the send failed
+ * with EMSGSIZE, or `RC_FAIL' (with `errno' set) otherwise, including
+ * when MTU is too small to hold the headers or too large for the buffer.
+ */
+static int raw_xmit(void *stv, int mtu)
+{
+  struct raw_state *st = stv;
+  unsigned char b[65536], *p;
+  struct ip *ip;
+  struct ip6_hdr *ip6;
+  struct udphdr *udp;
+  struct phdr ph;
+  struct phdr6 ph6;
+  unsigned ck;
+
+  switch (st->a.sa.sa_family) {
+
+    case AF_INET:
+
+      /* Reject sizes which can't hold the headers (the payload length
+       * below would go negative) or which won't fit in the 16-bit IP
+       * length field.
+       */
+      if (mtu < (int)(sizeof(*ip) + sizeof(*udp)) || mtu > 0xffff)
+	{ errno = EINVAL; goto fail_0; }
+
+      /* Build the IP header.  The header checksum is left zero here. */
+      ip = (struct ip *)b;
+      ip->ip_v = 4;
+      ip->ip_hl = sizeof(*ip)/4;
+      ip->ip_tos = IPTOS_RELIABILITY;
+      ip->ip_len = sane_htons(mtu);
+      STEP(st->q); ip->ip_id = htons(st->q);
+      ip->ip_off = sane_htons(0 | IP_DF);
+      ip->ip_ttl = 64;
+      ip->ip_p = IPPROTO_UDP;
+      ip->ip_sum = 0;
+      ip->ip_src = st->me.sin.sin_addr;
+      ip->ip_dst = st->a.sin.sin_addr;
+
+      /* Build a UDP packet in the output buffer. */
+      udp = (struct udphdr *)(ip + 1);
+      udp->uh_sport = st->srcport;
+      udp->uh_dport = st->dstport;
+      udp->uh_ulen = htons(mtu - sizeof(*ip));
+      udp->uh_sum = 0;
+
+      /* Copy the payload. */
+      p = (unsigned char *)(udp + 1);
+      memcpy(p, buf, mtu - (p - b));
+
+      /* Calculate the UDP checksum: pseudoheader first, then the UDP
+       * header and payload.
+       */
+      ph.ph_src = ip->ip_src;
+      ph.ph_dst = ip->ip_dst;
+      ph.ph_z = 0;
+      ph.ph_p = IPPROTO_UDP;
+      ph.ph_len = udp->uh_ulen;
+      ck = IPCK_INIT;
+      ck = ipcksum(&ph, sizeof(ph), ck);
+      ck = ipcksum(udp, mtu - sizeof(*ip), ck);
+      udp->uh_sum = htons(ck);
+
+      break;
+
+    case AF_INET6:
+
+      /* Reject sizes which can't hold the headers or which would overrun
+       * the output buffer.
+       */
+      if (mtu < (int)(sizeof(*ip6) + sizeof(*udp)) || mtu > (int)sizeof(b))
+	{ errno = EINVAL; goto fail_0; }
+
+      /* Build the IP header. */
+      ip6 = (struct ip6_hdr *)b;
+      STEP(st->q); ip6->ip6_flow = htonl(0x60000000 | st->q);
+      ip6->ip6_plen = htons(mtu - sizeof(*ip6));
+      ip6->ip6_nxt = IPPROTO_UDP;
+      ip6->ip6_hlim = 64;
+      ip6->ip6_src = st->me.sin6.sin6_addr;
+      ip6->ip6_dst = st->a.sin6.sin6_addr;
+
+      /* Build a UDP packet in the output buffer. */
+      udp = (struct udphdr *)(ip6 + 1);
+      udp->uh_sport = st->srcport;
+      udp->uh_dport = st->dstport;
+      udp->uh_ulen = htons(mtu - sizeof(*ip6));
+      udp->uh_sum = 0;
+
+      /* Copy the payload. */
+      p = (unsigned char *)(udp + 1);
+      memcpy(p, buf, mtu - (p - b));
+
+      /* Calculate the UDP checksum.  The pseudoheader length is a 32-bit
+       * field, so convert it as one rather than copying the 16-bit UDP
+       * length across.
+       */
+      ph6.ph6_src = ip6->ip6_src;
+      ph6.ph6_dst = ip6->ip6_dst;
+      ph6.ph6_len = htonl(mtu - sizeof(*ip6));
+      ph6.ph6_z0 = ph6.ph6_z1 = ph6.ph6_z2 = 0;
+      ph6.ph6_nxt = IPPROTO_UDP;
+      ck = IPCK_INIT;
+      ck = ipcksum(&ph6, sizeof(ph6), ck);
+      ck = ipcksum(udp, mtu - sizeof(*ip6), ck);
+      udp->uh_sum = htons(ck);
+
+      break;
+
+    default:
+      abort();
+  }
+
+  /* Send the whole thing off.  If we're too big for the interface then we
+   * might need to trim immediately.
+   */
+  if (sendto(st->rawudp, b, mtu, 0, &st->a.sa, addrsz(&st->a)) < 0) {
+    if (errno == EMSGSIZE) return (RC_LOWER);
+    else goto fail_0;
+  }
+
+  /* Done. */
+  return (RC_OK);
+
+fail_0:
+  return (RC_FAIL);
+}
+
+/* Process input for the raw-socket prober once select(2) reports activity
+ * in FD_IN.
+ *
+ * If the raw ICMP socket is readable, one message is read and checked to
+ * see whether it's an error about the current probe: the embedded packet
+ * must be UDP between our addresses and ports, carry the current sequence
+ * number, and have a payload accepted by `mypacketp'.  On IPv4 the message
+ * is parsed starting from its outer IP header; on IPv6 it's parsed starting
+ * from the ICMPv6 header.  A matching `port unreachable' (or, on IPv6,
+ * `administratively prohibited') yields `RC_HIGHER'; a matching
+ * `fragmentation needed'/`packet too big' yields the reported MTU, or
+ * `RC_LOWER' if none was given.  Anything else is ignored.
+ *
+ * Otherwise, if the UDP socket is readable, a reply accepted by
+ * `mypacketp' yields `RC_HIGHER'.  Returns `RC_OK' if nothing interesting
+ * happened, or `RC_FAIL' if reading the ICMP socket failed.
+ */
+static int raw_selproc(void *stv, fd_set *fd_in, struct probestate *ps)
+{
+  struct raw_state *st = stv;
+  unsigned char b[65536];
+  struct ip *ip;
+  struct ip6_hdr *ip6;
+  struct icmp *icmp;
+  struct icmp6_hdr *icmp6;
+  struct udphdr *udp;
+  const unsigned char *payload;
+  size_t len, hl;
+  uint32_t mtu6;
+  ssize_t n;
+
+  /* An ICMP packet: see what's inside. */
+  if (FD_ISSET(st->rawicmp, fd_in)) {
+    if ((n = read(st->rawicmp, b, sizeof(b))) < 0) goto fail_0;
+    len = n;
+
+    switch (st->me.sa.sa_family) {
+
+      case AF_INET:
+
+	/* Outer IP header.  Its length comes from the header-length field,
+	 * counted in 32-bit words.
+	 */
+	ip = (struct ip *)b;
+	if (len < sizeof(*ip)) goto skip_icmp;
+	hl = 4*ip->ip_hl;
+	if (hl < sizeof(*ip) || len < hl ||
+	    ip->ip_v != 4 || ip->ip_p != IPPROTO_ICMP)
+	  goto skip_icmp;
+	len -= hl;
+
+	/* ICMP header. */
+	icmp = (struct icmp *)(b + hl);
+	if (len < sizeof(*icmp) || icmp->icmp_type != ICMP_UNREACH)
+	  goto skip_icmp;
+	len -= offsetof(struct icmp, icmp_ip);
+
+	/* Embedded IP header of the offending packet: must be ours. */
+	ip = &icmp->icmp_ip;
+	if (len < sizeof(*ip) ||
+	    ip->ip_p != IPPROTO_UDP || ip->ip_hl != sizeof(*ip)/4 ||
+	    ip->ip_id != htons(st->q) ||
+	    ip->ip_src.s_addr != st->me.sin.sin_addr.s_addr ||
+	    ip->ip_dst.s_addr != st->a.sin.sin_addr.s_addr)
+	  goto skip_icmp;
+	len -= sizeof(*ip);
+
+	/* Embedded UDP header. */
+	udp = (struct udphdr *)(ip + 1);
+	if (len < sizeof(*udp) || udp->uh_sport != st->srcport ||
+	    udp->uh_dport != st->dstport)
+	  goto skip_icmp;
+	len -= sizeof(*udp);
+
+	payload = (const unsigned char *)(udp + 1);
+	if (!mypacketp(ps, payload, len)) goto skip_icmp;
+
+	if (icmp->icmp_code == ICMP_UNREACH_PORT) return (RC_HIGHER);
+	else if (icmp->icmp_code != ICMP_UNREACH_NEEDFRAG) goto skip_icmp;
+	else if (icmp->icmp_nextmtu) return (ntohs(icmp->icmp_nextmtu));
+	else return (RC_LOWER);
+
+      case AF_INET6:
+
+	/* ICMPv6 header. */
+	icmp6 = (struct icmp6_hdr *)b;
+	if (len < sizeof(*icmp6) ||
+	    (icmp6->icmp6_type != ICMP6_PACKET_TOO_BIG &&
+	     icmp6->icmp6_type != ICMP6_DST_UNREACH))
+	  goto skip_icmp;
+	len -= sizeof(*icmp6);
+
+	/* Embedded IPv6 header of the offending packet: must be ours. */
+	ip6 = (struct ip6_hdr *)(icmp6 + 1);
+	if (len < sizeof(*ip6) || ip6->ip6_nxt != IPPROTO_UDP ||
+	    memcmp(ip6->ip6_src.s6_addr,
+		   st->me.sin6.sin6_addr.s6_addr, 16) ||
+	    memcmp(ip6->ip6_dst.s6_addr,
+		   st->a.sin6.sin6_addr.s6_addr, 16) ||
+	    (ntohl(ip6->ip6_flow)&0xffff) != st->q)
+	  goto skip_icmp;
+	len -= sizeof(*ip6);
+
+	/* Embedded UDP header. */
+	udp = (struct udphdr *)(ip6 + 1);
+	if (len < sizeof(*udp) || udp->uh_sport != st->srcport ||
+	    udp->uh_dport != st->dstport)
+	  goto skip_icmp;
+	len -= sizeof(*udp);
+
+	payload = (const unsigned char *)(udp + 1);
+	if (!mypacketp(ps, payload, len)) goto skip_icmp;
+
+	if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG) {
+	  /* The MTU field is 32 bits wide.  Ignore values which won't fit
+	   * in our (non-negative) return value; treat a missing value the
+	   * same way as the IPv4 case does.
+	   */
+	  mtu6 = ntohl(icmp6->icmp6_mtu);
+	  if (!mtu6) return (RC_LOWER);
+	  else if (mtu6 > 0x7fffffff) goto skip_icmp;
+	  else return ((int)mtu6);
+	} else switch (icmp6->icmp6_code) {
+	  case ICMP6_DST_UNREACH_ADMIN:
+	  case ICMP6_DST_UNREACH_NOPORT:
+	    return (RC_HIGHER);
+	  default:
+	    goto skip_icmp;
+	}
+
+      default:
+	abort();
+    }
+  }
+
+skip_icmp:;
+
+  /* If we got a reply to the current probe then we're good.  If we got an
+   * error, or the packet's sequence number is wrong, then ignore it.
+   */
+  if (FD_ISSET(st->sk, fd_in)) {
+    if ((n = read(st->sk, b, sizeof(b))) < 0) return (RC_OK);
+    else if (mypacketp(ps, b, n)) return (RC_HIGHER);
+    else return (RC_OK);
+  }
+
+  return (RC_OK);
+
+fail_0:
+  return (RC_FAIL);
+}
+
+/* Operations table for the raw-socket prober.  The initializer is
+ * positional; judging by the values it supplies, in order: the method name,
+ * the previous table in the chain (the current `OPS_CHAIN'), the size of the
+ * state structure, and then the setup, finish, select-preparation, transmit
+ * and select-processing functions.  (The layout of `struct probe_ops' is
+ * declared elsewhere -- confirm there.)
+ */
+static const struct probe_ops raw_ops = {
+ "raw", OPS_CHAIN, sizeof(struct raw_state),
+ raw_setup, raw_finish,
+ raw_selprep, raw_xmit, raw_selproc
+};
+
+/* Point `OPS_CHAIN' at this table, so that the next table defined links
+ * back to it.
+ */
+#undef OPS_CHAIN
+#define OPS_CHAIN &raw_ops
+
+/*----- Doing the job on Linux --------------------------------------------*/
+
+#if defined(linux)
+
+/* Fallback for systems whose headers don't define `IP_MTU': supply the
+ * numeric option value by hand (NOTE(review): 14 is the value in Linux's
+ * <linux/in.h> -- confirm if this is ever built elsewhere).
+ */
+#ifndef IP_MTU
+# define IP_MTU 14 /* Blech! */
+#endif
+
+/* State for the Linux-specific prober; filled in by `linux_setup'. */
+struct linux_state {
+  int sol;				/* socket option level */
+  int so_mtu_discover;			/* option: MTU discovery mode */
+  int so_mtu;				/* option: read current MTU */
+  int sk;				/* caller's UDP socket */
+  size_t hdrlen;			/* header bytes not in the payload */
+};
+
+/* Set up the Linux-specific prober.
+ *
+ * STV points to the `struct linux_state' to fill in; SK is the caller's UDP
+ * socket; PP carries the remote address, whose family selects the socket
+ * option level, option names and header overhead (28 bytes for IPv4, 48
+ * for IPv6).  Sets the socket's MTU-discovery mode and returns the kernel's
+ * current MTU for the socket, or -1 with `errno' set on failure.
+ */
+static int linux_setup(void *stv, int sk, const struct param *pp)
+{
+  struct linux_state *state = stv;
+  int mode = IP_PMTUDISC_PROBE;
+  int mtu;
+  socklen_t len = sizeof(mtu);
+
+  /* Check that the address is OK, and pick the options to match. */
+  if (pp->a.sa.sa_family == AF_INET) {
+    state->sol = IPPROTO_IP;
+    state->so_mtu_discover = IP_MTU_DISCOVER;
+    state->so_mtu = IP_MTU;
+    state->hdrlen = 28;
+  } else if (pp->a.sa.sa_family == AF_INET6) {
+    state->sol = IPPROTO_IPV6;
+    state->so_mtu_discover = IPV6_MTU_DISCOVER;
+    state->so_mtu = IPV6_MTU;
+    state->hdrlen = 48;
+  } else {
+    errno = EPFNOSUPPORT;
+    return (-1);
+  }
+
+  /* Snaffle the UDP socket. */
+  state->sk = sk;
+
+  /* Turn on kernel path-MTU discovery and force DF on, and then read the
+   * initial MTU guess back so that we can report it.  (NOTE(review): the
+   * IPv4 mode constant is passed for both address families.)
+   */
+  if (setsockopt(sk, state->sol, state->so_mtu_discover,
+		 &mode, sizeof(mode)) ||
+      getsockopt(sk, state->sol, state->so_mtu, &mtu, &len))
+    return (-1);
+
+  /* Done. */
+  return (mtu);
+}
+
+/* Tear down the Linux prober state at STV.  There's nothing to do. */
+static void linux_finish(void *stv)
+{
+  /* Nothing to release. */
+}
+
+/* Register the descriptor the Linux prober wants to read from: just the
+ * caller's UDP socket.  The `ADDFD' macro is handed the descriptor
+ * (presumably updating MAXFD and FD_IN; its definition is elsewhere).
+ */
+static void linux_selprep(void *stv, int *maxfd, fd_set *fd_in)
+{
+  struct linux_state *state = stv;
+
+  ADDFD(state->sk);
+}
+
+/* Send a probe of total size MTU through the UDP socket: the payload,
+ * taken from the global `buf', is MTU less the header overhead recorded at
+ * setup time.  Returns `RC_OK' if the write succeeded, `RC_LOWER' if it
+ * failed with EMSGSIZE, and `RC_FAIL' for any other error.
+ */
+static int linux_xmit(void *stv, int mtu)
+{
+  struct linux_state *state = stv;
+  ssize_t rc;
+
+  /* Write the packet. */
+  rc = write(state->sk, buf, mtu - state->hdrlen);
+  if (rc >= 0) return (RC_OK);
+  return (errno == EMSGSIZE ? RC_LOWER : RC_FAIL);
+}
+
+/* Process input for the Linux prober once select(2) reports activity in
+ * FD_IN.
+ *
+ * Returns `RC_HIGHER' if the UDP socket produced a reply accepted by
+ * `mypacketp', or failed with ECONNREFUSED or EHOSTUNREACH; otherwise
+ * returns the kernel's current idea of the MTU for the socket, or
+ * `RC_FAIL' if that can't be read.  Returns `RC_OK' if the socket wasn't
+ * readable.
+ */
+static int linux_selproc(void *stv, fd_set *fd_in, struct probestate *ps)
+{
+  struct linux_state *st = stv;
+  int mtu;
+  socklen_t sz;
+  ssize_t n;
+  unsigned char b[65536];
+
+  /* Read an answer.  If it looks like the right kind of error then report a
+   * success.  This is potentially wrong, since we can't tell whether an
+   * error was delayed from an earlier probe.  However, we never return
+   * RC_LOWER from this method, so the packet sizes ought to be monotonically
+   * decreasing and this won't cause trouble.  Otherwise update from the
+   * kernel's idea of the right MTU.
+   *
+   * Read into the local buffer -- the one we then hand to `mypacketp' --
+   * and not into the global `buf', which holds the outgoing probe payload.
+   */
+  if (FD_ISSET(st->sk, fd_in)) {
+    n = read(st->sk, b, sizeof(b));
+    if (n >= 0 ?
+	mypacketp(ps, b, n) :
+	errno == ECONNREFUSED || errno == EHOSTUNREACH)
+      return (RC_HIGHER);
+    sz = sizeof(mtu);
+    if (getsockopt(st->sk, st->sol, st->so_mtu, &mtu, &sz))
+      return (RC_FAIL);
+    return (mtu);
+  }
+  return (RC_OK);
+}
+
+/* Operations table for the Linux-specific prober.  The initializer is
+ * positional; judging by the values it supplies, in order: the method name,
+ * the previous table in the chain (the current `OPS_CHAIN'), the size of the
+ * state structure, and then the setup, finish, select-preparation, transmit
+ * and select-processing functions.  (The layout of `struct probe_ops' is
+ * declared elsewhere -- confirm there.)
+ */
+static const struct probe_ops linux_ops = {
+ "linux", OPS_CHAIN, sizeof(struct linux_state),
+ linux_setup, linux_finish,
+ linux_selprep, linux_xmit, linux_selproc
+};
+
+/* Point `OPS_CHAIN' at this table, so that anything defined later links
+ * back to it.
+ */
+#undef OPS_CHAIN
+#define OPS_CHAIN &linux_ops