3 * Report MTU on path to specified host
5 * (c) 2008 Straylight/Edgeware
8 /*----- Licensing notice --------------------------------------------------*
10 * This file is part of Trivial IP Encryption (TrIPE).
12 * TrIPE is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or
15 * (at your option) any later version.
17 * TrIPE is distributed in the hope that it will be useful,
18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 * GNU General Public License for more details.
22 * You should have received a copy of the GNU General Public License
23 * along with TrIPE; if not, write to the Free Software Foundation,
24 * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
27 /*----- Header files ------------------------------------------------------*/
42 #include <sys/types.h>
46 #include <sys/socket.h>
47 #include <netinet/in.h>
48 #include <arpa/inet.h>
51 #include <netinet/in_systm.h>
52 #include <netinet/ip.h>
53 #include <netinet/ip_icmp.h>
54 #include <netinet/udp.h>
58 #include <sys/ioctl.h>
60 #include <mLib/alloc.h>
61 #include <mLib/bits.h>
62 #include <mLib/dstr.h>
64 #include <mLib/mdwopt.h>
65 #include <mLib/quis.h>
66 #include <mLib/report.h>
69 /*----- Static variables --------------------------------------------------*/
71 static unsigned char buf
[65536];
75 /*----- Utility functions -------------------------------------------------*/
77 /* Step a value according to a simple LFSR. */
79 do (q) = ((q) & 0x8000) ? ((q) << 1) ^ POLY : ((q) << 1); while (0)
81 /* Fill buffer with a constant but pseudorandom string. Uses a simple
84 static void fillbuffer(unsigned char *p
, size_t sz
)
86 unsigned int y
= 0xbc20;
87 const unsigned char *l
= p
+ sz
;
92 for (i
= 0; i
< 8; i
++) STEP(y
);
96 /* Convert a string to floating point. */
97 static double s2f(const char *s
, const char *what
)
104 if (errno
|| *q
) die(EXIT_FAILURE
, "bad %s", what
);
108 /* Convert a floating-point value into a struct timeval. */
109 static void f2tv(struct timeval
*tv
, double t
)
110 { tv
->tv_sec
= t
; tv
->tv_usec
= (t
- tv
->tv_sec
)*MILLION
; }
112 /*----- Main algorithm skeleton -------------------------------------------*/
115 unsigned f
; /* Various flags */
116 #define F_VERBOSE 1u /* Give a running commentary */
117 double retx
; /* Initial retransmit interval */
118 double regr
; /* Retransmit growth factor */
119 double timeout
; /* Retransmission timeout */
120 int seqoff
; /* Offset to write sequence number */
121 const struct probe_ops
*pops
; /* Probe algorithm description */
122 struct sockaddr_in sin
; /* Destination address */
126 const struct param
*pp
;
132 const struct probe_ops
*next
;
134 int (*setup
)(void *, int, const struct param
*);
135 void (*finish
)(void *);
136 void (*selprep
)(void *, int *, fd_set
*);
137 int (*xmit
)(void *, int);
138 int (*selproc
)(void *, fd_set
*, struct probestate
*);
149 /* or a positive MTU upper-bound */
152 /* Add a file descriptor FD to the set `fd_in', updating `*maxfd'. */
154 do { FD_SET(fd, fd_in); if (*maxfd < fd) *maxfd = fd; } while (0)
156 /* Check whether a buffer contains a packet from our current probe. */
157 static int mypacketp(struct probestate
*ps
,
158 const unsigned char *p
, size_t sz
)
160 const struct param
*pp
= ps
->pp
;
162 return (sz
>= pp
->seqoff
+ 2 && LOAD16(p
+ pp
->seqoff
) == ps
->q
);
165 /* See whether MTU is an acceptable MTU value. Return an appropriate
166 * RC_... code or a new suggested MTU.
168 static int probe(struct probestate
*ps
, void *st
, int mtu
)
170 const struct param
*pp
= ps
->pp
;
172 struct timeval tv
, now
, when
, done
;
173 double timer
= pp
->retx
;
176 /* Set up the first retransmit and give-up timers. */
177 gettimeofday(&now
, 0);
178 f2tv(&tv
, pp
->timeout
); TV_ADD(&done
, &now
, &tv
);
179 f2tv(&tv
, timer
); TV_ADD(&when
, &now
, &tv
);
180 if (TV_CMP(&when
, >, &done
)) when
= done
;
182 /* Send the initial probe. */
183 if (pp
->f
& F_VERBOSE
)
184 moan("sending probe of size %d (seq = %04x)", mtu
, ps
->q
);
186 STORE16(buf
+ pp
->seqoff
, ps
->q
);
187 if ((rc
= pp
->pops
->xmit(st
, mtu
)) != RC_OK
) return (rc
);
191 /* Wait for something interesting to happen. */
192 maxfd
= 0; FD_ZERO(&fd_in
);
193 pp
->pops
->selprep(st
, &maxfd
, &fd_in
);
194 TV_SUB(&tv
, &when
, &now
);
195 if (select(maxfd
+ 1, &fd_in
, 0, 0, &tv
) < 0) return (RC_FAIL
);
196 gettimeofday(&now
, 0);
198 /* See whether the probe method has any answers for us. */
199 if ((rc
= pp
->pops
->selproc(st
, &fd_in
, ps
)) != RC_OK
) return (rc
);
201 /* If we've waited too long, give up. If we should retransmit, do
204 if (TV_CMP(&now
, >, &done
))
206 else if (TV_CMP(&now
, >, &when
)) {
207 if (pp
->f
& F_VERBOSE
) moan("re-sending probe of size %d", mtu
);
208 if ((rc
= pp
->pops
->xmit(st
, mtu
)) != RC_OK
) return (rc
);
210 timer
*= pp
->regr
; f2tv(&tv
, timer
); TV_ADD(&when
, &when
, &tv
);
211 } while (TV_CMP(&when
, <, &now
));
212 if (TV_CMP(&when
, >, &done
)) when
= done
;
217 /* Discover the path MTU to the destination address. */
218 static int pathmtu(const struct param
*pp
)
224 struct probestate ps
;
226 /* Build and connect a UDP socket. We'll need this to know the local port
227 * number to use if nothing else. Set other stuff up.
229 if ((sk
= socket(PF_INET
, SOCK_DGRAM
, 0)) < 0) goto fail_0
;
230 if (connect(sk
, (struct sockaddr
*)&pp
->sin
, sizeof(pp
->sin
))) goto fail_1
;
231 st
= xmalloc(pp
->pops
->statesz
);
232 if ((mtu
= pp
->pops
->setup(st
, sk
, pp
)) < 0) goto fail_2
;
233 ps
.pp
= pp
; ps
.q
= rand() & 0xffff;
236 /* And now we do a thing which is sort of like a binary search, except that
237 * we also take explicit clues as establishing a new upper bound, and we
238 * try to hug that initially.
241 rc
= probe(&ps
, st
, mtu
);
245 if (pp
->f
& F_VERBOSE
) moan("probe failed");
249 /* If we've not seen a dropped packet before then we don't know what
250 * this means yet -- in particular, we don't know which bit of the
251 * network is swallowing packets. Send a minimum-size probe. If
252 * that doesn't come back then assume that the remote host is
253 * swallowing our packets. If it does, then we assume that dropped
254 * packets are a result of ICMP fragmentation-needed reports being
255 * lost or suppressed.
257 if (pp
->f
& F_VERBOSE
) moan("gave up: black hole detected");
259 if (pp
->f
& F_VERBOSE
) moan("sending minimum-size probe");
260 switch (probe(&ps
, st
, lo
)) {
264 if (pp
->f
& F_VERBOSE
) {
265 moan("no reply from min-size probe: "
266 "assume black hole at target");
271 if (pp
->f
& F_VERBOSE
) {
272 moan("reply from min-size probe OK: "
273 "assume black hole in network");
278 if (pp
->f
& F_VERBOSE
)
279 moan("unexpected return code from probe");
285 if (droppy
) goto higher
; else goto lower
;
290 if (pp
->f
& F_VERBOSE
)
291 moan("probe returned: remote host is not a black hole");
295 if (pp
->f
& F_VERBOSE
) moan("probe returned: found correct MTU");
298 if (pp
->f
& F_VERBOSE
) moan("probe returned: guessing higher");
300 mtu
+= (hi
- lo
+ 1)/2;
306 if (pp
->f
& F_VERBOSE
) moan("error returned: found correct MTU");
309 if (pp
->f
& F_VERBOSE
) moan("error returned: guessing lower");
311 mtu
-= (hi
- lo
+ 1)/2;
315 if (pp
->f
& F_VERBOSE
) moan("error returned with new MTU estimate");
322 /* Clean up and return our result. */
323 pp
->pops
->finish(st
);
329 pp
->pops
->finish(st
);
338 /*----- Doing it the hard way ---------------------------------------------*/
340 #if defined(linux) || defined(__OpenBSD__)
345 # define sane_htons htons
346 # define sane_htonl htonl
352 static int rawicmp
= -1, rawudp
= -1, rawerr
= 0;
354 #define IPCK_INIT 0xffff
356 /* Compute an IP checksum over some data. This is a restartable interface:
357 * initialize A to `IPCK_INIT' for the first call.
359 static unsigned ipcksum(const void *buf
, size_t n
, unsigned a
)
361 unsigned long aa
= a
^ 0xffff;
362 const unsigned char *p
= buf
, *l
= p
+ n
;
364 while (p
< l
- 1) { aa
+= LOAD16_B(p
); p
+= 2; }
365 if (p
< l
) { aa
+= (unsigned)(*p
) << 8; }
366 do aa
= (aa
& 0xffff) + (aa
>> 16); while (aa
>= 0x10000);
367 return (aa
== 0xffff ? aa
: aa
^ 0xffff);
370 /* TCP/UDP pseudoheader structure. */
372 struct in_addr ph_src
, ph_dst
;
378 struct sockaddr_in me
, sin
;
379 int sk
, rawicmp
, rawudp
;
383 static int raw_setup(void *stv
, int sk
, const struct param
*pp
)
385 struct raw_state
*st
= stv
;
388 struct ifaddrs
*ifa
, *ifaa
, *ifap
;
391 /* If we couldn't acquire raw sockets, we fail here. */
392 if (rawerr
) { errno
= rawerr
; goto fail_0
; }
393 st
->rawicmp
= rawicmp
; st
->rawudp
= rawudp
; st
->sk
= sk
;
395 /* Initialize the sequence number. */
396 st
->q
= rand() & 0xffff;
398 /* Snaffle the local and remote address and port number. */
401 if (getsockname(sk
, (struct sockaddr
*)&st
->me
, &sz
))
404 /* There isn't a portable way to force the DF flag onto a packet through
405 * UDP, or even through raw IP, unless we write the entire IP header
406 * ourselves. This is somewhat annoying, especially since we have an
407 * uphill struggle keeping track of which systems randomly expect which
408 * header fields to be presented in host byte order. Oh, well.
411 if (setsockopt(rawudp
, IPPROTO_IP
, IP_HDRINCL
, &i
, sizeof(i
))) goto fail_0
;
413 /* Find an upper bound on the MTU. Do two passes over the interface
414 * list. If we can find matches for our local address then use the
415 * highest one of those; otherwise do a second pass and simply take the
416 * highest MTU of any network interface.
418 if (getifaddrs(&ifaa
)) goto fail_0
;
419 for (i
= 0; i
< 2; i
++) {
420 for (ifap
= 0, ifa
= ifaa
; ifa
; ifa
= ifa
->ifa_next
) {
421 if (!(ifa
->ifa_flags
& IFF_UP
) || !ifa
->ifa_addr
||
422 ifa
->ifa_addr
->sa_family
!= AF_INET
||
424 ((struct sockaddr_in
*)ifa
->ifa_addr
)->sin_addr
.s_addr
!=
425 st
->me
.sin_addr
.s_addr
) ||
426 (i
== 1 && ifap
&& strcmp(ifap
->ifa_name
, ifa
->ifa_name
) == 0) ||
427 strlen(ifa
->ifa_name
) >= sizeof(ifr
.ifr_name
))
430 strcpy(ifr
.ifr_name
, ifa
->ifa_name
);
431 if (ioctl(sk
, SIOCGIFMTU
, &ifr
)) goto fail_1
;
432 if (mtu
< ifr
.ifr_mtu
) mtu
= ifr
.ifr_mtu
;
436 if (mtu
< 0) { errno
= ENOTCONN
; goto fail_1
; }
448 static void raw_finish(void *stv
) { ; }
450 static void raw_selprep(void *stv
, int *maxfd
, fd_set
*fd_in
)
451 { struct raw_state
*st
= stv
; ADDFD(st
->sk
); ADDFD(st
->rawicmp
); }
453 static int raw_xmit(void *stv
, int mtu
)
455 struct raw_state
*st
= stv
;
456 unsigned char b
[65536], *p
;
462 /* Build the IP header. */
465 ip
->ip_hl
= sizeof(*ip
)/4;
466 ip
->ip_tos
= IPTOS_RELIABILITY
;
467 ip
->ip_len
= sane_htons(mtu
);
468 STEP(st
->q
); ip
->ip_id
= htons(st
->q
);
469 ip
->ip_off
= sane_htons(0 | IP_DF
);
471 ip
->ip_p
= IPPROTO_UDP
;
473 ip
->ip_src
= st
->me
.sin_addr
;
474 ip
->ip_dst
= st
->sin
.sin_addr
;
476 /* Build a UDP packet in the output buffer. */
477 udp
= (struct udphdr
*)(ip
+ 1);
478 udp
->uh_sport
= st
->me
.sin_port
;
479 udp
->uh_dport
= st
->sin
.sin_port
;
480 udp
->uh_ulen
= htons(mtu
- sizeof(*ip
));
483 /* Copy the payload. */
484 p
= (unsigned char *)(udp
+ 1);
485 memcpy(p
, buf
, mtu
- (p
- b
));
487 /* Calculate the UDP checksum. */
488 ph
.ph_src
= ip
->ip_src
;
489 ph
.ph_dst
= ip
->ip_dst
;
491 ph
.ph_p
= IPPROTO_UDP
;
492 ph
.ph_len
= udp
->uh_ulen
;
494 ck
= ipcksum(&ph
, sizeof(ph
), ck
);
495 ck
= ipcksum(udp
, mtu
- sizeof(*ip
), ck
);
496 udp
->uh_sum
= htons(ck
);
498 /* Send the whole thing off. If we're too big for the interface then we
499 * might need to trim immediately.
501 if (sendto(st
->rawudp
, b
, mtu
, 0,
502 (struct sockaddr
*)&st
->sin
, sizeof(st
->sin
)) < 0) {
503 if (errno
== EMSGSIZE
) return (RC_LOWER
);
514 static int raw_selproc(void *stv
, fd_set
*fd_in
, struct probestate
*ps
)
516 struct raw_state
*st
= stv
;
517 unsigned char b
[65536];
523 /* An ICMP packet: see what's inside. */
524 if (FD_ISSET(st
->rawicmp
, fd_in
)) {
525 if ((n
= read(st
->rawicmp
, b
, sizeof(b
))) < 0) goto fail_0
;
528 if (n
< sizeof(*ip
) || n
< sizeof(4*ip
->ip_hl
) ||
529 ip
->ip_v
!= 4 || ip
->ip_p
!= IPPROTO_ICMP
)
531 n
-= sizeof(4*ip
->ip_hl
);
533 icmp
= (struct icmp
*)(b
+ 4*ip
->ip_hl
);
534 if (n
< sizeof(*icmp
) || icmp
->icmp_type
!= ICMP_UNREACH
)
536 n
-= offsetof(struct icmp
, icmp_ip
);
539 if (n
< sizeof(*ip
) ||
540 ip
->ip_p
!= IPPROTO_UDP
|| ip
->ip_hl
!= sizeof(*ip
)/4 ||
541 ip
->ip_id
!= htons(st
->q
) ||
542 ip
->ip_src
.s_addr
!= st
->me
.sin_addr
.s_addr
||
543 ip
->ip_dst
.s_addr
!= st
->sin
.sin_addr
.s_addr
)
547 udp
= (struct udphdr
*)(ip
+ 1);
548 if (n
< sizeof(udp
) || udp
->uh_sport
!= st
->me
.sin_port
||
549 udp
->uh_dport
!= st
->sin
.sin_port
)
553 if (icmp
->icmp_code
== ICMP_UNREACH_PORT
) return (RC_HIGHER
);
554 else if (icmp
->icmp_code
!= ICMP_UNREACH_NEEDFRAG
) goto skip_icmp
;
555 else if (icmp
->icmp_nextmtu
) return (htons(icmp
->icmp_nextmtu
));
556 else return (RC_LOWER
);
560 /* If we got a reply to the current probe then we're good. If we got an
561 * error, or the packet's sequence number is wrong, then ignore it.
563 if (FD_ISSET(st
->sk
, fd_in
)) {
564 if ((n
= read(st
->sk
, b
, sizeof(b
))) < 0) return (RC_OK
);
565 else if (mypacketp(ps
, b
, n
)) return (RC_HIGHER
);
575 static const struct probe_ops raw_ops
= {
576 "raw", OPS_CHAIN
, sizeof(struct raw_state
),
577 raw_setup
, raw_finish
,
578 raw_selprep
, raw_xmit
, raw_selproc
582 #define OPS_CHAIN &raw_ops
584 /*----- Doing the job on Linux --------------------------------------------*/
589 # define IP_MTU 14 /* Blech! */
596 static int linux_setup(void *stv
, int sk
, const struct param
*pp
)
598 struct linux_state
*st
= stv
;
602 /* Snaffle the UDP socket. */
605 /* Turn on kernel path-MTU discovery and force DF on. */
607 if (setsockopt(st
->sk
, IPPROTO_IP
, IP_MTU_DISCOVER
, &i
, sizeof(i
)))
610 /* Read the initial MTU guess back and report it. */
612 if (getsockopt(st
->sk
, IPPROTO_IP
, IP_MTU
, &mtu
, &sz
))
619 static void linux_finish(void *stv
) { ; }
621 static void linux_selprep(void *stv
, int *maxfd
, fd_set
*fd_in
)
622 { struct linux_state
*st
= stv
; ADDFD(st
->sk
); }
624 static int linux_xmit(void *stv
, int mtu
)
626 struct linux_state
*st
= stv
;
628 /* Write the packet. */
629 if (write(st
->sk
, buf
, mtu
- 28) >= 0) return (RC_OK
);
630 else if (errno
== EMSGSIZE
) return (RC_LOWER
);
631 else return (RC_FAIL
);
634 static int linux_selproc(void *stv
, fd_set
*fd_in
, struct probestate
*ps
)
636 struct linux_state
*st
= stv
;
640 unsigned char b
[65536];
642 /* Read an answer. If it looks like the right kind of error then report a
643 * success. This is potentially wrong, since we can't tell whether an
644 * error was delayed from an earlier probe. However, we never return
645 * RC_LOWER from this method, so the packet sizes ought to be monotonically
646 * decreasing and this won't cause trouble. Otherwise update from the
647 * kernel's idea of the right MTU.
649 if (FD_ISSET(st
->sk
, fd_in
)) {
650 n
= read(st
->sk
, &buf
, sizeof(buf
));
652 mypacketp(ps
, b
, n
) :
653 errno
== ECONNREFUSED
|| errno
== EHOSTUNREACH
)
656 if (getsockopt(st
->sk
, IPPROTO_IP
, IP_MTU
, &mtu
, &sz
))
663 static const struct probe_ops linux_ops
= {
664 "linux", OPS_CHAIN
, sizeof(struct linux_state
),
665 linux_setup
, linux_finish
,
666 linux_selprep
, linux_xmit
, linux_selproc
670 #define OPS_CHAIN &linux_ops
674 /*----- Help options ------------------------------------------------------*/
676 static const struct probe_ops
*probe_ops
= OPS_CHAIN
;
678 static void version(FILE *fp
)
679 { pquis(fp
, "$, TrIPE version " VERSION
"\n"); }
681 static void usage(FILE *fp
)
683 pquis(fp
, "Usage: $ [-H HEADER] [-m METHOD]\n\
684 [-r SECS] [-g FACTOR] [-t SECS] HOST [PORT]\n");
687 static void help(FILE *fp
)
689 const struct probe_ops
*ops
;
698 -h, --help Show this help text.\n\
699 -v, --version Show version number.\n\
700 -u, --usage Show brief usage message.\n\
702 -g, --growth=FACTOR Growth factor for retransmit interval.\n\
703 -m, --method=METHOD Use METHOD to probe for MTU.\n\
704 -r, --retransmit=SECS Retransmit if no reply after SEC.\n\
705 -t, --timeout=SECS Give up expecting a reply after SECS.\n\
706 -H, --header=HEX Packet header, in hexadecimal.\n\
710 for (ops
= probe_ops
; ops
; ops
= ops
->next
)
711 printf("\t%s\n", ops
->name
);
714 /*----- Main code ---------------------------------------------------------*/
716 int main(int argc
, char *argv
[])
718 struct param pp
= { 0, 0.333, 3.0, 8.0, 0, OPS_CHAIN
};
731 if ((rawicmp
= socket(PF_INET
, SOCK_RAW
, IPPROTO_ICMP
)) < 0 ||
732 (rawudp
= socket(PF_INET
, SOCK_RAW
, IPPROTO_UDP
)) < 0)
734 if (setuid(getuid()))
738 fillbuffer(buf
, sizeof(buf
));
739 pp
.sin
.sin_port
= htons(7);
742 static const struct option opts
[] = {
743 { "help", 0, 0, 'h' },
744 { "version", 0, 0, 'V' },
745 { "usage", 0, 0, 'u' },
746 { "header", OPTF_ARGREQ
, 0, 'H' },
747 { "growth", OPTF_ARGREQ
, 0, 'g' },
748 { "method", OPTF_ARGREQ
, 0, 'm' },
749 { "retransmit", OPTF_ARGREQ
, 0, 'r' },
750 { "timeout", OPTF_ARGREQ
, 0, 't' },
751 { "verbose", 0, 0, 'v' },
755 i
= mdwopt(argc
, argv
, "hVu" "H:g:m:r:t:v", opts
, 0, 0, 0);
758 case 'h': help(stdout
); exit(0);
759 case 'V': version(stdout
); exit(0);
760 case 'u': usage(stdout
); exit(0);
765 hex_decode(&hc
, optarg
, strlen(optarg
), &d
);
766 hex_decode(&hc
, 0, 0, &d
);
767 sz
= d
.len
< 532 ? d
.len
: 532;
768 memcpy(buf
, d
.buf
, sz
);
772 case 'g': pp
.regr
= s2f(optarg
, "retransmit growth factor"); break;
773 case 'r': pp
.retx
= s2f(optarg
, "retransmit interval"); break;
774 case 't': pp
.timeout
= s2f(optarg
, "timeout"); break;
777 for (pp
.pops
= OPS_CHAIN
; pp
.pops
; pp
.pops
= pp
.pops
->next
)
778 if (strcmp(pp
.pops
->name
, optarg
) == 0) goto found_alg
;
779 die(EXIT_FAILURE
, "unknown probe algorithm `%s'", optarg
);
783 case 'v': pp
.f
|= F_VERBOSE
; break;
790 argv
+= optind
; argc
-= optind
;
791 if ((f
& f_bogus
) || 1 > argc
|| argc
> 2) {
796 if ((h
= gethostbyname(*argv
)) == 0)
797 die(EXIT_FAILURE
, "unknown host `%s': %s", *argv
, hstrerror(h_errno
));
798 if (h
->h_addrtype
!= AF_INET
)
799 die(EXIT_FAILURE
, "unsupported address family for host `%s'", *argv
);
800 memcpy(&pp
.sin
.sin_addr
, h
->h_addr
, sizeof(struct in_addr
));
805 u
= strtoul(*argv
, &q
, 0);
807 pp
.sin
.sin_port
= htons(u
);
808 else if ((s
= getservbyname(*argv
, "udp")) == 0)
809 die(EXIT_FAILURE
, "unknown UDP service `%s'", *argv
);
811 pp
.sin
.sin_port
= s
->s_port
;
814 pp
.sin
.sin_family
= AF_INET
;
817 die(EXIT_FAILURE
, "failed to discover MTU: %s", strerror(errno
));
819 if (ferror(stdout
) || fflush(stdout
) || fclose(stdout
))
820 die(EXIT_FAILURE
, "failed to write result: %s", strerror(errno
));
824 /*----- That's all, folks -------------------------------------------------*/