X-Git-Url: https://git.distorted.org.uk/~mdw/tripe/blobdiff_plain/b7e1e18b57c30555f65a2548e0f1f186200c0ea8..HEAD:/pathmtu/pathmtu.c diff --git a/pathmtu/pathmtu.c b/pathmtu/pathmtu.c index acc88378..9c5c8294 100644 --- a/pathmtu/pathmtu.c +++ b/pathmtu/pathmtu.c @@ -9,26 +9,27 @@ * * This file is part of Trivial IP Encryption (TrIPE). * - * TrIPE is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * TrIPE is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 3 of the License, or (at your + * option) any later version. * - * TrIPE is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * TrIPE is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. * * You should have received a copy of the GNU General Public License - * along with TrIPE; if not, write to the Free Software Foundation, - * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * along with TrIPE. If not, see <https://www.gnu.org/licenses/>. 
*/ /*----- Header files ------------------------------------------------------*/ #include "config.h" +#include #include +#include #include #include #include @@ -43,6 +44,21 @@ #include #include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_GETIFADDRS +# include +# include +# include +#endif + +#include +#include #include #include #include @@ -54,8 +70,14 @@ static unsigned char buf[65536]; +#define POLY 0x1002d + /*----- Utility functions -------------------------------------------------*/ +/* Step a value according to a simple LFSR. */ +#define STEP(q) \ + do (q) = ((q) & 0x8000) ? ((q) << 1) ^ POLY : ((q) << 1); while (0) + /* Fill buffer with a constant but pseudorandom string. Uses a simple * LFSR. */ @@ -64,77 +86,848 @@ static void fillbuffer(unsigned char *p, size_t sz) unsigned int y = 0xbc20; const unsigned char *l = p + sz; int i; -#define POLY 0x002d while (p < l) { *p++ = y & 0xff; - for (i = 0; i < 8; i++) { - if (!(y & 0x8000)) y <<= 1; - else y = (y << 1) ^ POLY; - } + for (i = 0; i < 8; i++) STEP(y); } } -/*----- Doing the actual job ----------------------------------------------*/ +/* Convert a string to floating point. */ +static double s2f(const char *s, const char *what) +{ + double f; + char *q; + + errno = 0; + f = strtod(s, &q); + if (errno || *q) die(EXIT_FAILURE, "bad %s", what); + return (f); +} -#if defined(linux) +/* Convert a floating-point value into a struct timeval. */ +static void f2tv(struct timeval *tv, double t) + { tv->tv_sec = t; tv->tv_usec = (t - tv->tv_sec)*MILLION; } -#ifndef IP_MTU -# define IP_MTU 14 /* Blech! */ -#endif +union addr { + struct sockaddr sa; + struct sockaddr_in sin; + struct sockaddr_in6 sin6; +}; -static int pathmtu(struct sockaddr_in *sin, double to) +/* Check whether an address family is even slightly supported. 
*/ +static int addrfamok(int af) { - int sk; + switch (af) { + case AF_INET: case AF_INET6: return (1); + default: return (0); + } +} + +/* Return the size of a socket address. */ +static size_t addrsz(const union addr *a) +{ + switch (a->sa.sa_family) { + case AF_INET: return (sizeof(a->sin)); + case AF_INET6: return (sizeof(a->sin6)); + default: abort(); + } +} + +/*----- Main algorithm skeleton -------------------------------------------*/ + +struct param { + unsigned f; /* Various flags */ +#define F_VERBOSE 1u /* Give a running commentary */ + double retx; /* Initial retransmit interval */ + double regr; /* Retransmit growth factor */ + double timeout; /* Retransmission timeout */ + int seqoff; /* Offset to write sequence number */ + const struct probe_ops *pops; /* Probe algorithm description */ + union addr a; /* Destination address */ +}; + +struct probestate { + const struct param *pp; + unsigned q; +}; + +struct probe_ops { + const char *name; + const struct probe_ops *next; + size_t statesz; + int (*setup)(void *, int, const struct param *); + void (*finish)(void *); + void (*selprep)(void *, int *, fd_set *); + int (*xmit)(void *, int); + int (*selproc)(void *, fd_set *, struct probestate *); +}; + +#define OPS_CHAIN 0 + +enum { + RC_FAIL = -99, + RC_OK = 0, + RC_LOWER = -1, + RC_HIGHER = -2, + RC_NOREPLY = -3 + /* or a positive MTU upper-bound */ +}; + +/* Add a file descriptor FD to the set `fd_in', updating `*maxfd'. */ +#define ADDFD(fd) \ + do { FD_SET(fd, fd_in); if (*maxfd < fd) *maxfd = fd; } while (0) + +/* Check whether a buffer contains a packet from our current probe. */ +static int mypacketp(struct probestate *ps, + const unsigned char *p, size_t sz) +{ + const struct param *pp = ps->pp; + + return (sz >= pp->seqoff + 2 && LOAD16(p + pp->seqoff) == ps->q); +} + +/* See whether MTU is an acceptable MTU value. Return an appropriate + * RC_... code or a new suggested MTU. 
+ */ +static int probe(struct probestate *ps, void *st, int mtu) +{ + const struct param *pp = ps->pp; fd_set fd_in; - int mtu; - int i; - size_t sz; - struct timeval tv; - - tv.tv_sec = to; tv.tv_usec = (to - tv.tv_sec) * 1000000; - if ((sk = socket(PF_INET, SOCK_DGRAM, 0)) < 0) goto fail_0; - i = IP_PMTUDISC_DO; - if (setsockopt(sk, SOL_IP, IP_MTU_DISCOVER, &i, sizeof(i))) - goto fail_1; - if (connect(sk, (struct sockaddr *)sin, sizeof(*sin))) goto fail_1; + struct timeval tv, now, when, done; + double timer = pp->retx; + int rc, maxfd; + + /* Set up the first retransmit and give-up timers. */ + gettimeofday(&now, 0); + f2tv(&tv, pp->timeout); TV_ADD(&done, &now, &tv); + f2tv(&tv, timer); TV_ADD(&when, &now, &tv); + if (TV_CMP(&when, >, &done)) when = done; + + /* Send the initial probe. */ + if (pp->f & F_VERBOSE) + moan("sending probe of size %d (seq = %04x)", mtu, ps->q); + STEP(ps->q); + STORE16(buf + pp->seqoff, ps->q); + if ((rc = pp->pops->xmit(st, mtu)) != RC_OK) return (rc); + for (;;) { - sz = sizeof(mtu); - if (getsockopt(sk, SOL_IP, IP_MTU, &mtu, &sz)) goto fail_1; - if (write(sk, buf, mtu - 28) < 0) goto fail_1; - FD_SET(sk, &fd_in); - if (select(sk + 1, &fd_in, 0, 0, &tv) < 0) goto fail_1; - if (!FD_ISSET(sk, &fd_in)) break; - if (read(sk, &i, 1) >= 0 || - errno == ECONNREFUSED || errno == EHOSTUNREACH) - break; - if (errno != EMSGSIZE) goto fail_1; + + /* Wait for something interesting to happen. */ + maxfd = 0; FD_ZERO(&fd_in); + pp->pops->selprep(st, &maxfd, &fd_in); + TV_SUB(&tv, &when, &now); + if (select(maxfd + 1, &fd_in, 0, 0, &tv) < 0) return (RC_FAIL); + gettimeofday(&now, 0); + + /* See whether the probe method has any answers for us. */ + if ((rc = pp->pops->selproc(st, &fd_in, ps)) != RC_OK) return (rc); + + /* If we've waited too long, give up. If we should retransmit, do + * that. 
+ */ + if (TV_CMP(&now, >, &done)) + return (RC_NOREPLY); + else if (TV_CMP(&now, >, &when)) { + if (pp->f & F_VERBOSE) moan("re-sending probe of size %d", mtu); + if ((rc = pp->pops->xmit(st, mtu)) != RC_OK) return (rc); + do { + timer *= pp->regr; f2tv(&tv, timer); TV_ADD(&when, &when, &tv); + } while (TV_CMP(&when, <, &now)); + if (TV_CMP(&when, >, &done)) when = done; + } + } +} + +/* Discover the path MTU to the destination address. Returns the MTU found, or -1 on failure. */ +static int pathmtu(const struct param *pp) +{ + int sk; + int mtu, lo, hi; + int rc, droppy = -1; + void *st; + struct probestate ps; + + /* Build and connect a UDP socket. We'll need this to know the local port + * number to use if nothing else. Set other stuff up. + */ + if ((sk = socket(pp->a.sa.sa_family, SOCK_DGRAM, IPPROTO_UDP)) < 0) + goto fail_0; + if (connect(sk, &pp->a.sa, addrsz(&pp->a))) goto fail_1; + st = xmalloc(pp->pops->statesz); + if ((mtu = pp->pops->setup(st, sk, pp)) < 0) goto fail_2; + ps.pp = pp; ps.q = rand() & 0xffff; + switch (pp->a.sa.sa_family) { + case AF_INET: lo = 576; break; + case AF_INET6: lo = 1280; break; + default: abort(); + } + hi = mtu; + if (hi < lo) { errno = EMSGSIZE; goto fail_3; } /* setup succeeded, so finish the probe state and release st and sk */ + + /* And now we do a thing which is sort of like a binary search, except that + * we also take explicit clues as establishing a new upper bound, and we + * try to hug that initially. + */ + for (;;) { + assert(lo <= mtu && mtu <= hi); + if (pp->f & F_VERBOSE) moan("probe: %d <= %d <= %d", lo, mtu, hi); + rc = probe(&ps, st, mtu); + switch (rc) { + + case RC_FAIL: + if (pp->f & F_VERBOSE) moan("probe failed"); + goto fail_3; + + case RC_NOREPLY: + /* If we've not seen a dropped packet before then we don't know what + * this means yet -- in particular, we don't know which bit of the + * network is swallowing packets. Send a minimum-size probe. If + * that doesn't come back then assume that the remote host is + * swallowing our packets. 
If it does, then we assume that dropped + * packets are a result of ICMP fragmentation-needed reports being + * lost or suppressed. + */ + if (pp->f & F_VERBOSE) moan("gave up: black hole detected"); + if (droppy == -1) { + if (pp->f & F_VERBOSE) moan("sending minimum-size probe"); + switch (probe(&ps, st, lo)) { + case RC_FAIL: + goto fail_3; + case RC_NOREPLY: + if (pp->f & F_VERBOSE) { + moan("no reply from min-size probe: " + "assume black hole at target"); + } + droppy = 1; + break; + case RC_HIGHER: + if (pp->f & F_VERBOSE) { + moan("reply from min-size probe OK: " + "assume black hole in network"); + } + droppy = 0; + break; + default: + if (pp->f & F_VERBOSE) + moan("unexpected return code from probe"); + errno = ENOTCONN; + goto fail_3; + } + } + + if (droppy) goto higher; else goto lower; + + case RC_HIGHER: + higher: + if (droppy == -1) { + if (pp->f & F_VERBOSE) + moan("probe returned: remote host is not a black hole"); + droppy = 0; + } + if (mtu == hi) { + if (pp->f & F_VERBOSE) moan("probe returned: found correct MTU"); + goto done; + } + lo = mtu; + + /* Now we must make a new guess, between lo and hi. We know that lo + * is good; but we're not so sure about hi here. We know that hi > + * lo, so this will find an approximate midpoint, greater than lo and + * no more than hi. + */ + if (pp->f & F_VERBOSE) moan("probe returned: guessing higher"); + mtu += (hi - lo + 1)/2; + break; + + case RC_LOWER: + lower: + /* If this didn't work, and we're already at the bottom of our + * possible range, then something has gone horribly wrong. + */ + assert(lo < mtu); + hi = mtu - 1; + if (lo == hi) { + if (pp->f & F_VERBOSE) moan("error returned: found correct MTU"); + mtu = lo; + goto done; + } + + /* We must make a new guess, between lo and hi. We're probably + * fairly sure that lo will succeed, since either it's the minimum + * MTU or we've tested it already; but we're not quite sure about hi, + * so we want to aim high. 
+ */ + if (pp->f & F_VERBOSE) moan("error returned: guessing lower"); + mtu -= (hi - lo + 1)/2; + break; + + default: + if (pp->f & F_VERBOSE) moan("error returned with new MTU estimate"); + mtu = hi = rc; + break; + } } + +done: + /* Clean up and return our result. */ + pp->pops->finish(st); + xfree(st); close(sk); return (mtu); +fail_3: + pp->pops->finish(st); +fail_2: + xfree(st); fail_1: close(sk); fail_0: return (-1); } +/*----- Doing it the hard way ---------------------------------------------*/ + +#ifdef HAVE_GETIFADDRS + +#if defined(linux) || defined(__OpenBSD__) +# define IPHDR_SANE +#endif + +#ifdef IPHDR_SANE +# define sane_htons htons +# define sane_htonl htonl #else +# define sane_htons +# define sane_htonl +#endif + +static int rawicmp = -1, rawudp = -1, rawerr = 0; +static int rawicmp6 = -1, rawudp6 = -1, rawerr6 = 0; + +#define IPCK_INIT 0xffff + +/* Compare two addresses. Maybe compare the port numbers too. */ +#define AEF_PORT 1u +static int addreq(const union addr *a, const union addr *b, unsigned f) +{ + switch (a->sa.sa_family) { + case AF_INET: + return (a->sin.sin_addr.s_addr == b->sin.sin_addr.s_addr && + (!(f&AEF_PORT) || a->sin.sin_port == b->sin.sin_port)); + case AF_INET6: + return (!memcmp(a->sin6.sin6_addr.s6_addr, + b->sin6.sin6_addr.s6_addr, 16) && + (!(f&AEF_PORT) || a->sin6.sin6_port == b->sin6.sin6_port)); + default: + abort(); + } +} + +/* Compute an IP checksum over some data. This is a restartable interface: + * initialize A to `IPCK_INIT' for the first call. + */ +static unsigned ipcksum(const void *buf, size_t n, unsigned a) +{ + unsigned long aa = a ^ 0xffff; + const unsigned char *p = buf, *l = p + n; + + while (p < l - 1) { aa += LOAD16_B(p); p += 2; } + if (p < l) { aa += (unsigned)(*p) << 8; } + do aa = (aa & 0xffff) + (aa >> 16); while (aa >= 0x10000); + return (aa == 0xffff ? aa : aa ^ 0xffff); +} + +/* TCP/UDP pseudoheader structure. 
*/ +struct phdr { + struct in_addr ph_src, ph_dst; + uint8_t ph_z, ph_p; + uint16_t ph_len; +}; +struct phdr6 { + struct in6_addr ph6_src, ph6_dst; + uint32_t ph6_len; + uint8_t ph6_z0, ph6_z1, ph6_z2, ph6_nxt; +}; + +struct raw_state { + union addr me, a; + int sk, rawicmp, rawudp; + uint16_t srcport, dstport; + unsigned q; +}; + +static int raw_setup(void *stv, int sk, const struct param *pp) +{ + struct raw_state *st = stv; + socklen_t sz; + int i, mtu = -1; + struct ifaddrs *ifa, *ifaa, *ifap; + struct ifreq ifr; + struct icmp6_filter f6; + + /* Check that the address is OK, and that we have the necessary raw + * sockets. + * + * For IPv6, also set the filter so we don't get too many useless wakeups. + */ + switch (pp->a.sa.sa_family) { + case AF_INET: + if (rawerr) { errno = rawerr; goto fail_0; } + st->rawicmp = rawicmp; st->rawudp = rawudp; st->sk = sk; + /* IPv4 filtering is available on Linux but isn't portable. */ + break; + case AF_INET6: + if (rawerr6) { errno = rawerr6; goto fail_0; } + st->rawicmp = rawicmp6; st->rawudp = rawudp6; st->sk = sk; + ICMP6_FILTER_SETBLOCKALL(&f6); + ICMP6_FILTER_SETPASS(ICMP6_PACKET_TOO_BIG, &f6); + ICMP6_FILTER_SETPASS(ICMP6_DST_UNREACH, &f6); + if (setsockopt(st->rawicmp, IPPROTO_ICMPV6, ICMP6_FILTER, + &f6, sizeof(f6))) { + die(EXIT_FAILURE, "failed to set icmpv6 filter: %s", + strerror(errno)); + } + break; + default: + errno = EPFNOSUPPORT; goto fail_0; + } + + /* Initialize the sequence number. */ + st->q = rand() & 0xffff; + + /* Snaffle the local and remote address and port number. */ + st->a = pp->a; + sz = sizeof(st->me); + if (getsockname(sk, &st->me.sa, &sz)) + goto fail_0; + + /* Only now do some fiddling because Linux doesn't like port numbers in + * IPv6 raw destination addresses... 
+ */ + switch (pp->a.sa.sa_family) { + case AF_INET: + st->srcport = st->me.sin.sin_port; st->me.sin.sin_port = 0; + st->dstport = st->a.sin.sin_port; st->a.sin.sin_port = 0; + break; + case AF_INET6: + st->srcport = st->me.sin6.sin6_port; st->me.sin6.sin6_port = 0; + st->dstport = st->a.sin6.sin6_port; st->a.sin6.sin6_port = 0; + break; + default: + abort(); + } + + /* There isn't a portable way to force the DF flag onto a packet through + * UDP, or even through raw IP, unless we write the entire IP header + * ourselves. This is somewhat annoying, especially since we have an + * uphill struggle keeping track of which systems randomly expect which + * header fields to be presented in host byte order. Oh, well. IP_HDRINCL is an IPv4-level option, so only set it for AF_INET, and on the raw socket this probe will actually send on. + */ + i = 1; + if (pp->a.sa.sa_family == AF_INET && setsockopt(st->rawudp, IPPROTO_IP, IP_HDRINCL, &i, sizeof(i))) goto fail_0; + + /* Find an upper bound on the MTU. Do two passes over the interface + * list. If we can find matches for our local address then use the + * highest one of those; otherwise do a second pass and simply take the + * highest MTU of any network interface. + */ + if (getifaddrs(&ifaa)) goto fail_0; + for (i = 0; i < 2; i++) { + for (ifap = 0, ifa = ifaa; ifa; ifa = ifa->ifa_next) { + if (!(ifa->ifa_flags & IFF_UP) || !ifa->ifa_addr || + ifa->ifa_addr->sa_family != st->me.sa.sa_family || + (i == 0 && + !addreq((union addr *)ifa->ifa_addr, &st->me, 0)) || + (i == 1 && ifap && strcmp(ifap->ifa_name, ifa->ifa_name) == 0) || + strlen(ifa->ifa_name) >= sizeof(ifr.ifr_name)) + continue; + ifap = ifa; + strcpy(ifr.ifr_name, ifa->ifa_name); + if (ioctl(sk, SIOCGIFMTU, &ifr)) goto fail_1; + if (mtu < ifr.ifr_mtu) mtu = ifr.ifr_mtu; + } + if (mtu > 0) break; + } + if (mtu < 0) { errno = ENOTCONN; goto fail_1; } + freeifaddrs(ifaa); + + /* Done. 
*/ + return (mtu); + +fail_1: + freeifaddrs(ifaa); +fail_0: + return (-1); +} + +static void raw_finish(void *stv) { ; } + +static void raw_selprep(void *stv, int *maxfd, fd_set *fd_in) + { struct raw_state *st = stv; ADDFD(st->sk); ADDFD(st->rawicmp); } + +static int raw_xmit(void *stv, int mtu) +{ + struct raw_state *st = stv; + unsigned char b[65536], *p; + struct ip *ip; + struct ip6_hdr *ip6; + struct udphdr *udp; + struct phdr ph; + struct phdr6 ph6; + unsigned ck; + + switch (st->a.sa.sa_family) { + + case AF_INET: + + /* Build the IP header. */ + ip = (struct ip *)b; + ip->ip_v = 4; + ip->ip_hl = sizeof(*ip)/4; + ip->ip_tos = IPTOS_RELIABILITY; + ip->ip_len = sane_htons(mtu); + STEP(st->q); ip->ip_id = htons(st->q); + ip->ip_off = sane_htons(0 | IP_DF); + ip->ip_ttl = 64; + ip->ip_p = IPPROTO_UDP; + ip->ip_sum = 0; + ip->ip_src = st->me.sin.sin_addr; + ip->ip_dst = st->a.sin.sin_addr; + + /* Build a UDP packet in the output buffer. */ + udp = (struct udphdr *)(ip + 1); + udp->uh_sport = st->srcport; + udp->uh_dport = st->dstport; + udp->uh_ulen = htons(mtu - sizeof(*ip)); + udp->uh_sum = 0; + + /* Copy the payload. */ + p = (unsigned char *)(udp + 1); + memcpy(p, buf, mtu - (p - b)); + + /* Calculate the UDP checksum. */ + ph.ph_src = ip->ip_src; + ph.ph_dst = ip->ip_dst; + ph.ph_z = 0; + ph.ph_p = IPPROTO_UDP; + ph.ph_len = udp->uh_ulen; + ck = IPCK_INIT; + ck = ipcksum(&ph, sizeof(ph), ck); + ck = ipcksum(udp, mtu - sizeof(*ip), ck); + udp->uh_sum = htons(ck); + + break; + + case AF_INET6: + + /* Build the IP header. */ + ip6 = (struct ip6_hdr *)b; + STEP(st->q); ip6->ip6_flow = htonl(0x60000000 | st->q); + ip6->ip6_plen = htons(mtu - sizeof(*ip6)); + ip6->ip6_nxt = IPPROTO_UDP; + ip6->ip6_hlim = 64; + ip6->ip6_src = st->me.sin6.sin6_addr; + ip6->ip6_dst = st->a.sin6.sin6_addr; + + /* Build a UDP packet in the output buffer. 
*/ + udp = (struct udphdr *)(ip6 + 1); + udp->uh_sport = st->srcport; + udp->uh_dport = st->dstport; + udp->uh_ulen = htons(mtu - sizeof(*ip6)); + udp->uh_sum = 0; + + /* Copy the payload. */ + p = (unsigned char *)(udp + 1); + memcpy(p, buf, mtu - (p - b)); + + /* Calculate the UDP checksum. */ + ph6.ph6_src = ip6->ip6_src; + ph6.ph6_dst = ip6->ip6_dst; + ph6.ph6_len = udp->uh_ulen; + ph6.ph6_z0 = ph6.ph6_z1 = ph6.ph6_z2 = 0; + ph6.ph6_nxt = IPPROTO_UDP; + ck = IPCK_INIT; + ck = ipcksum(&ph6, sizeof(ph6), ck); + ck = ipcksum(udp, mtu - sizeof(*ip6), ck); + udp->uh_sum = htons(ck); + + break; + + default: + abort(); + } + + /* Send the whole thing off. If we're too big for the interface then we + * might need to trim immediately. + */ + if (sendto(st->rawudp, b, mtu, 0, &st->a.sa, addrsz(&st->a)) < 0) { + if (errno == EMSGSIZE) return (RC_LOWER); + else goto fail_0; + } + + /* Done. */ + return (RC_OK); + +fail_0: + return (RC_FAIL); +} + +static int raw_selproc(void *stv, fd_set *fd_in, struct probestate *ps) +{ + struct raw_state *st = stv; + unsigned char b[65536]; + struct ip *ip; + struct ip6_hdr *ip6; + struct icmp *icmp; + struct icmp6_hdr *icmp6; + struct udphdr *udp; + const unsigned char *payload; + ssize_t n; + + /* An ICMP packet: see what's inside. 
*/ + if (FD_ISSET(st->rawicmp, fd_in)) { + if ((n = read(st->rawicmp, b, sizeof(b))) < 0) goto fail_0; + + switch (st->me.sa.sa_family) { + + case AF_INET: + + ip = (struct ip *)b; + if (n < sizeof(*ip) || n < 4*ip->ip_hl || + ip->ip_v != 4 || ip->ip_p != IPPROTO_ICMP) + goto skip_icmp; + n -= 4*ip->ip_hl; /* strip the actual outer header length (ip_hl words), not sizeof(int) */ + + icmp = (struct icmp *)(b + 4*ip->ip_hl); + if (n < sizeof(*icmp) || icmp->icmp_type != ICMP_UNREACH) + goto skip_icmp; + n -= offsetof(struct icmp, icmp_ip); + + ip = &icmp->icmp_ip; + if (n < sizeof(*ip) || + ip->ip_p != IPPROTO_UDP || ip->ip_hl != sizeof(*ip)/4 || + ip->ip_id != htons(st->q) || + ip->ip_src.s_addr != st->me.sin.sin_addr.s_addr || + ip->ip_dst.s_addr != st->a.sin.sin_addr.s_addr) + goto skip_icmp; + n -= sizeof(*ip); + + udp = (struct udphdr *)(ip + 1); + if (n < sizeof(*udp) || udp->uh_sport != st->srcport || + udp->uh_dport != st->dstport) + goto skip_icmp; + n -= sizeof(*udp); + + payload = (const unsigned char *)(udp + 1); + if (!mypacketp(ps, payload, n)) goto skip_icmp; + + if (icmp->icmp_code == ICMP_UNREACH_PORT) return (RC_HIGHER); + else if (icmp->icmp_code != ICMP_UNREACH_NEEDFRAG) goto skip_icmp; + else if (icmp->icmp_nextmtu) return (ntohs(icmp->icmp_nextmtu)); + else return (RC_LOWER); + + break; + + case AF_INET6: + icmp6 = (struct icmp6_hdr *)b; + if (n < sizeof(*icmp6) || + (icmp6->icmp6_type != ICMP6_PACKET_TOO_BIG && + icmp6->icmp6_type != ICMP6_DST_UNREACH)) + goto skip_icmp; + n -= sizeof(*icmp6); + + ip6 = (struct ip6_hdr *)(icmp6 + 1); + if (n < sizeof(*ip6) || ip6->ip6_nxt != IPPROTO_UDP || + memcmp(ip6->ip6_src.s6_addr, + st->me.sin6.sin6_addr.s6_addr, 16) || + memcmp(ip6->ip6_dst.s6_addr, + st->a.sin6.sin6_addr.s6_addr, 16) || + (ntohl(ip6->ip6_flow)&0xffff) != st->q) + goto skip_icmp; + n -= sizeof(*ip6); + + udp = (struct udphdr *)(ip6 + 1); + if (n < sizeof(*udp) || udp->uh_sport != st->srcport || + udp->uh_dport != st->dstport) + goto skip_icmp; + n -= sizeof(*udp); + + payload = (const unsigned 
char *)(udp + 1); + if (!mypacketp(ps, payload, n)) goto skip_icmp; + + if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG) + return (ntohl(icmp6->icmp6_mtu)); /* the MTU field is 32 bits wide in network order; ntohs would truncate it */ + else switch (icmp6->icmp6_code) { + case ICMP6_DST_UNREACH_ADMIN: + case ICMP6_DST_UNREACH_NOPORT: + return (RC_HIGHER); + default: + goto skip_icmp; + } + break; + + default: + abort(); + } + } + +skip_icmp:; + + /* If we got a reply to the current probe then we're good. If we got an + * error, or the packet's sequence number is wrong, then ignore it. + */ + if (FD_ISSET(st->sk, fd_in)) { + if ((n = read(st->sk, b, sizeof(b))) < 0) return (RC_OK); + else if (mypacketp(ps, b, n)) return (RC_HIGHER); + else return (RC_OK); + } + + return (RC_OK); + +fail_0: + return (RC_FAIL); +} + +static const struct probe_ops raw_ops = { + "raw", OPS_CHAIN, sizeof(struct raw_state), + raw_setup, raw_finish, + raw_selprep, raw_xmit, raw_selproc +}; + +#undef OPS_CHAIN +#define OPS_CHAIN &raw_ops + +#endif + +/*----- Doing the job on Linux --------------------------------------------*/ + +#if defined(linux) + +#ifndef IP_MTU +# define IP_MTU 14 /* Blech! */ +#endif + +struct linux_state { + int sol, so_mtu_discover, so_mtu; + int sk; + size_t hdrlen; +}; + +static int linux_setup(void *stv, int sk, const struct param *pp) +{ + struct linux_state *st = stv; + int i, mtu; + socklen_t sz; + + /* Check that the address is OK. */ + switch (pp->a.sa.sa_family) { + case AF_INET: + st->sol = IPPROTO_IP; + st->so_mtu_discover = IP_MTU_DISCOVER; + st->so_mtu = IP_MTU; + st->hdrlen = 28; + break; + case AF_INET6: + st->sol = IPPROTO_IPV6; + st->so_mtu_discover = IPV6_MTU_DISCOVER; + st->so_mtu = IPV6_MTU; + st->hdrlen = 48; + break; + default: + errno = EPFNOSUPPORT; + return (-1); + } + + /* Snaffle the UDP socket. */ + st->sk = sk; + + /* Turn on kernel path-MTU discovery and force DF on. 
*/ + i = IP_PMTUDISC_PROBE; + if (setsockopt(st->sk, st->sol, st->so_mtu_discover, &i, sizeof(i))) + return (-1); + + /* Read the initial MTU guess back and report it. */ + sz = sizeof(mtu); + if (getsockopt(st->sk, st->sol, st->so_mtu, &mtu, &sz)) + return (-1); + + /* Done. */ + return (mtu); +} + +static void linux_finish(void *stv) { ; } -# error "path MTU discovery not implemented" +static void linux_selprep(void *stv, int *maxfd, fd_set *fd_in) + { struct linux_state *st = stv; ADDFD(st->sk); } + +static int linux_xmit(void *stv, int mtu) +{ + struct linux_state *st = stv; + + /* Write the packet. */ + if (write(st->sk, buf, mtu - st->hdrlen) >= 0) return (RC_OK); + else if (errno == EMSGSIZE) return (RC_LOWER); + else return (RC_FAIL); +} + +static int linux_selproc(void *stv, fd_set *fd_in, struct probestate *ps) +{ + struct linux_state *st = stv; + int mtu; + socklen_t sz; + ssize_t n; + unsigned char b[65536]; + + /* Read an answer. If it looks like the right kind of error then report a + * success. This is potentially wrong, since we can't tell whether an + * error was delayed from an earlier probe. However, we never return + * RC_LOWER from this method, so the packet sizes ought to be monotonically + * decreasing and this won't cause trouble. Otherwise update from the + * kernel's idea of the right MTU. + */ + if (FD_ISSET(st->sk, fd_in)) { + n = read(st->sk, b, sizeof(b)); /* read into the local scratch buffer that is checked next; `buf' holds the outgoing probe payload */ + if (n >= 0 ? 
+ mypacketp(ps, b, n) : + errno == ECONNREFUSED || errno == EHOSTUNREACH) + return (RC_HIGHER); + sz = sizeof(mtu); + if (getsockopt(st->sk, st->sol, st->so_mtu, &mtu, &sz)) + return (RC_FAIL); + return (mtu); + } + return (RC_OK); +} + +static const struct probe_ops linux_ops = { + "linux", OPS_CHAIN, sizeof(struct linux_state), + linux_setup, linux_finish, + linux_selprep, linux_xmit, linux_selproc +}; + +#undef OPS_CHAIN +#define OPS_CHAIN &linux_ops #endif /*----- Help options ------------------------------------------------------*/ +static const struct probe_ops *probe_ops = OPS_CHAIN; + static void version(FILE *fp) { pquis(fp, "$, TrIPE version " VERSION "\n"); } static void usage(FILE *fp) - { pquis(fp, "Usage: $ [-t TIMEOUT] [-H HEADER] HOST [PORT]\n"); } +{ + pquis(fp, "Usage: $ [-46v] [-H HEADER] [-m METHOD]\n\ + [-r SECS] [-g FACTOR] [-t SECS] HOST [PORT]\n"); +} static void help(FILE *fp) { + const struct probe_ops *ops; + version(fp); fputc('\n', fp); usage(fp); @@ -143,51 +936,79 @@ static void help(FILE *fp) Options in full:\n\ \n\ -h, --help Show this help text.\n\ --v, --version Show version number.\n\ +-V, --version Show version number.\n\ -u, --usage Show brief usage message.\n\ \n\ --t, --timeout=TIMEOUT Time to wait for reply, in seconds.\n\ +-4, --ipv4 Restrict to IPv4 only.\n\ +-6, --ipv6 Restrict to IPv6 only.\n\ +-g, --growth=FACTOR Growth factor for retransmit interval.\n\ +-m, --method=METHOD Use METHOD to probe for MTU.\n\ +-r, --retransmit=SECS Retransmit if no reply after SECS.\n\ +-t, --timeout=SECS Give up expecting a reply after SECS.\n\ +-v, --verbose Write a running commentary to stderr.\n\ -H, --header=HEX Packet header, in hexadecimal.\n\ +\n\ +Probe methods:\n\ ", fp); + for (ops = probe_ops; ops; ops = ops->next) + fprintf(fp, "\t%s\n", ops->name); /* list the methods on the stream the caller asked for, like the rest of the help text */ } /*----- Main code ---------------------------------------------------------*/ int main(int argc, char *argv[]) { - struct sockaddr_in sin; + struct param pp = { 0, 0.333, 3.0, 8.0, 0, 
OPS_CHAIN }; hex_ctx hc; dstr d = DSTR_INIT; size_t sz; - int i; - unsigned long u; - char *q; - struct hostent *h; - struct servent *s; - double to = 5.0; + int i, err; + struct addrinfo aihint = { 0 }, *ailist, *ai; + const char *host, *svc = "7"; unsigned f = 0; #define f_bogus 1u +#ifdef HAVE_GETIFADDRS + if ((rawicmp = socket(PF_INET, SOCK_RAW, IPPROTO_ICMP)) < 0 || + (rawudp = socket(PF_INET, SOCK_RAW, IPPROTO_UDP)) < 0) + rawerr = errno; + if ((rawicmp6 = socket(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6)) < 0 || + (rawudp6 = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW)) < 0) + rawerr6 = errno; +#endif + if (setuid(getuid())) + abort(); + ego(argv[0]); fillbuffer(buf, sizeof(buf)); - sin.sin_port = htons(7); + + aihint.ai_family = AF_UNSPEC; + aihint.ai_protocol = IPPROTO_UDP; + aihint.ai_socktype = SOCK_DGRAM; + aihint.ai_flags = AI_ADDRCONFIG; for (;;) { static const struct option opts[] = { { "help", 0, 0, 'h' }, - { "version", 0, 0, 'v' }, + { "version", 0, 0, 'V' }, { "usage", 0, 0, 'u' }, + { "ipv4", 0, 0, '4' }, + { "ipv6", 0, 0, '6' }, { "header", OPTF_ARGREQ, 0, 'H' }, + { "growth", OPTF_ARGREQ, 0, 'g' }, + { "method", OPTF_ARGREQ, 0, 'm' }, + { "retransmit", OPTF_ARGREQ, 0, 'r' }, { "timeout", OPTF_ARGREQ, 0, 't' }, + { "verbose", 0, 0, 'v' }, { 0, 0, 0, 0 } }; - i = mdwopt(argc, argv, "hvu" "H:", opts, 0, 0, 0); + i = mdwopt(argc, argv, "hVu" "46H:g:m:r:t:v", opts, 0, 0, 0); if (i < 0) break; switch (i) { case 'h': help(stdout); exit(0); - case 'v': version(stdout); exit(0); + case 'V': version(stdout); exit(0); case 'u': usage(stdout); exit(0); case 'H': @@ -195,16 +1016,26 @@ int main(int argc, char *argv[]) hex_init(&hc); hex_decode(&hc, optarg, strlen(optarg), &d); hex_decode(&hc, 0, 0, &d); - sz = d.len < sizeof(buf) ? d.len : sizeof(buf); + sz = d.len < 532 ? 
d.len : 532; memcpy(buf, d.buf, sz); + pp.seqoff = sz; break; - case 't': - errno = 0; - to = strtod(optarg, &q); - if (errno || *q) die(EXIT_FAILURE, "bad timeout"); + case '4': aihint.ai_family = AF_INET; break; + case '6': aihint.ai_family = AF_INET6; break; + case 'g': pp.regr = s2f(optarg, "retransmit growth factor"); break; + case 'r': pp.retx = s2f(optarg, "retransmit interval"); break; + case 't': pp.timeout = s2f(optarg, "timeout"); break; + + case 'm': + for (pp.pops = OPS_CHAIN; pp.pops; pp.pops = pp.pops->next) + if (strcmp(pp.pops->name, optarg) == 0) goto found_alg; + die(EXIT_FAILURE, "unknown probe algorithm `%s'", optarg); + found_alg: break; + case 'v': pp.f |= F_VERBOSE; break; + default: f |= f_bogus; break; @@ -216,26 +1047,18 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } - if ((h = gethostbyname(*argv)) == 0) - die(EXIT_FAILURE, "unknown host `%s': %s", *argv, hstrerror(h_errno)); - if (h->h_addrtype != AF_INET) - die(EXIT_FAILURE, "unsupported address family for host `%s'", *argv); - memcpy(&sin.sin_addr, h->h_addr, sizeof(struct in_addr)); - argv++; argc--; - - if (*argv) { - errno = 0; - u = strtoul(*argv, &q, 0); - if (!errno && !*q) - sin.sin_port = htons(u); - else if ((s = getservbyname(*argv, "udp")) == 0) - die(EXIT_FAILURE, "unknown UDP service `%s'", *argv); - else - sin.sin_port = s->s_port; + host = argv[0]; + if (argv[1]) svc = argv[1]; + if ((err = getaddrinfo(host, svc, &aihint, &ailist)) != 0) { + die(EXIT_FAILURE, "unknown host `%s' or service `%s': %s", + host, svc, gai_strerror(err)); } + for (ai = ailist; ai && !addrfamok(ai->ai_family); ai = ai->ai_next); + if (!ai) die(EXIT_FAILURE, "no supported address families for `%s'", host); + assert(ai->ai_addrlen <= sizeof(pp.a)); + memcpy(&pp.a, ai->ai_addr, ai->ai_addrlen); - sin.sin_family = AF_INET; - i = pathmtu(&sin, to); + i = pathmtu(&pp); if (i < 0) die(EXIT_FAILURE, "failed to discover MTU: %s", strerror(errno)); printf("%d\n", i);