/* $Id: netpath.C,v 1.14 2005/10/19 23:52:26 dm Exp $ */

/*
 *
 * Copyright (C) 2003 David Mazieres (dm@uun.org)
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
 * USA
 *
 */

#include "async.h"
#include "qhash.h"
#include "aios.h"
#include "rawnet.h"

#define NETPATH_VERBOSE 0

/* Register a client callback with socket object s.  The client
 * inserts itself into s->cbtab on construction and removes itself on
 * destruction. */
inline
icmpsock::icmpclnt::icmpclnt (ref<icmpsock> s, in_addr a, icmpsock::cb_t c)
  : cb (c), is (s), addr (a)
{
  is->cbtab.insert (this);
}

inline
icmpsock::icmpclnt::~icmpclnt ()
{
  is->cbtab.remove (this);
}

/* Read callback for icmpfd.  Drains the socket, parses every packet
 * with icmp_parse, and for each ICMP message that embeds a UDP header
 * invokes all clients found in cbtab under the embedded IP header's
 * destination address. */
void
icmpsock::rcb ()
{
  for (;;) {
    sockaddr_in sin;
    socklen_t sinlen = sizeof (sin);
    bzero (&sin, sizeof (sin));
    sin.sin_family = AF_INET;

    inpkt pkt;
    bzero (&pkt, sizeof (pkt));
    int n = recvfrom (icmpfd, &pkt, sizeof (pkt), 0,
                      (sockaddr *) &sin, &sinlen);
    if (n <= 0) {
      /* EAGAIN just means the (non-blocking) socket is drained. */
      if (n < 0 && errno != EAGAIN)
        warn ("recvfrom ICMP socket: %m\n");
      return;
    }

    icmp_info ii;
    if (!icmp_parse (&ii, &pkt, n) || !ii.udphp)
      continue;

    /* Fetch the successor before running the callback: the callback
     * can end a traceroute (traceroute::finish does `delete this'),
     * which presumably also destroys cp. */
    for (icmpclnt *cp = cbtab[ii.iphp->ip_dst], *ncp; cp; cp = ncp) {
      ncp = cbtab.nextkeq (cp);
      (*cp->cb) (&ii);
    }
  }
}

/* Close whichever of icmpfd, udpfd and ipfd are open and reset them
 * to -1.  The read callback on icmpfd is cleared before closing. */
void
icmpsock::closefds ()
{
  if (icmpfd >= 0) {
    fdcb (icmpfd, selread, NULL);
    close (icmpfd);
    icmpfd = -1;
  }
  if (udpfd >= 0) {
    close (udpfd);
    udpfd = -1;
  }
  if (ipfd >= 0) {
    close (ipfd);
    ipfd = -1;
  }
}

/* Parse a received packet of `size' bytes into *infop.
 *
 * Returns false if the packet is too short or is not ICMP.  Otherwise
 * returns true with pkthdrp, icmpp, type and code filled in; iphp is
 * set only when the packet is long enough to embed an IP header plus
 * 8 bytes, and udphp only when that embedded header is sane and
 * carries UDP.  Fields that could not be determined are left zero. */
bool
icmpsock::icmp_parse (icmpsock::icmp_info *infop, inpkt *inp, int size)
{
  bzero (infop, sizeof (*infop));
  if (size < int (sizeof (inp->iph) + 8)
      /* || size < int (ntohs (inp->iph.ip_len)) XXX - kernel byte-swaps? */
      || inp->iph.ip_p != IPPROTO_ICMP)
    return false;

  int hlen = inp->iph.ip_hl << 2;
  /* Reject a bogus header length, exactly as is done for the embedded
   * header below; otherwise icmpp would point inside the IP header. */
  if (hlen < int (sizeof (inp->iph)))
    return false;
  if (size < int (hlen + sizeof (icmp)))
    return false;

  infop->pkthdrp = &inp->iph;
  infop->icmpp = reinterpret_cast<icmp *> (&inp->data[hlen]);
  infop->type = infop->icmpp->icmp_type;
  infop->code = infop->icmpp->icmp_code;

  /* 8 bytes of ICMP header, an embedded IP header, then 8 more bytes
   * (enough for a UDP header). */
  if (size < int (hlen + 8 + sizeof (struct ip) + 8))
    return true;
  infop->iphp = &infop->icmpp->icmp_ip;

  int dhlen = infop->iphp->ip_hl << 2;
  if (dhlen < int (sizeof (struct ip))
      || size < hlen + 8 + dhlen + 8
      || infop->iphp->ip_p != IPPROTO_UDP)
    return true;
  infop->udphp = reinterpret_cast<udphdr *> (&inp->data[hlen + 8 + dhlen]);
  return true;
}

/* Create udpfd from the port/address in fromaddr and read the
 * resulting local name back into fromaddr.  If the source address is
 * still INADDR_ANY afterwards, substitute the first non-loopback
 * address reported by myipaddrs, or loopback if there is none. */
void
icmpsock::portalloc ()
{
  assert (udpfd == -1);
  udpfd = inetsocket (SOCK_DGRAM, ntohs (fromaddr.sin_port),
                      ntohl (fromaddr.sin_addr.s_addr));
  if (udpfd < 0) {
    if (errno != EADDRINUSE)
      warn ("socket: %m\n");
  }
  else {
    close_on_exec (udpfd);
    socklen_t sinlen = sizeof (fromaddr);
    getsockname (udpfd, (sockaddr *) &fromaddr, &sinlen);
  }

  if (fromaddr.sin_addr.s_addr == htonl (INADDR_ANY)) {
    vec<in_addr> av;
    myipaddrs (&av);
    while (!av.empty () && av[0].s_addr == htonl (INADDR_LOOPBACK))
      av.pop_front ();
    if (!av.empty ())
      fromaddr.sin_addr = av[0];
    else
      /* Falling back to loopback allows testing on disconnected
       * machines. */
      fromaddr.sin_addr.s_addr = htonl (INADDR_LOOPBACK);
  }
}

/* (Re)initialize the sockets: a raw IP socket with IP_HDRINCL for
 * sending (ipfd) and a raw ICMP socket for receiving (icmpfd), whose
 * read callback is rcb.  Returns false, after a warning, if any step
 * fails; descriptors opened so far stay open until the next
 * closefds. */
bool
icmpsock::init (const sockaddr_in *fa)
{
  closefds ();
  fromaddr = *fa;
  portalloc ();

  ipfd = socket (AF_INET, SOCK_RAW, IPPROTO_RAW);
  if (ipfd < 0) {
    warn ("RAW IP socket: %m\n");
    return false;
  }
  close_on_exec (ipfd);

  int data = 576;
#if 0
  if (setsockopt (ipfd, SOL_SOCKET, SO_SNDBUF,
                  (char *) &data, sizeof(data)) < 0) {
    warn("SO_SNDBUF: %m\n");
    return false;
  }
#endif
  data = 1;
  if (setsockopt (ipfd, IPPROTO_IP, IP_HDRINCL,
                  (char *) &data, sizeof(data)) < 0) {
    warn("IP_HDRINCL: %m\n");
    return false;
  }
  make_async (ipfd);

  icmpfd = socket (AF_INET, SOCK_RAW, IPPROTO_ICMP);
  if (icmpfd < 0) {
    warn ("ICMP socket: %m\n");
    return false;
  }
  close_on_exec (icmpfd);
  make_async (icmpfd);
  fdcb (icmpfd, selread, wrap (this, &icmpsock::rcb));

  return true;
}

#define IPPROTO_ICMP_UDP 117
#define IPPROTO_ICMP_TCP 106

#if PLANET_LAB
/* Variant of init that uses a raw UDP socket and a raw socket of
 * protocol IPPROTO_ICMP_UDP, both bound to fromaddr.  udpfd is closed
 * again after portalloc has picked the port. */
bool
icmpsock::init_plab (const sockaddr_in *fa)
{
  int one = 1;

  closefds ();
  fromaddr = *fa;
  portalloc ();
  if (udpfd >= 0) {
    close (udpfd);
    udpfd = -1;
  }

  ipfd = socket (AF_INET, SOCK_RAW, IPPROTO_UDP);
  if (ipfd < 0) {
    warn ("RAW IP socket: %m\n");
    return false;
  }
  close_on_exec (ipfd);
  if (bind (ipfd, (sockaddr *) &fromaddr, sizeof (fromaddr)) < 0) {
    warn ("bind of RAW/UDP socket: %m\n");
    return false;
  }
  if (setsockopt (ipfd, IPPROTO_IP, IP_HDRINCL,
                  (char *) &one, sizeof(one)) < 0) {
    warn("IP_HDRINCL (RAW UDP): %m\n");
    return false;
  }
  make_async (ipfd);

  icmpfd = socket (AF_INET, SOCK_RAW, IPPROTO_ICMP_UDP);
  if (icmpfd < 0) {
    warn ("ICMP socket: %m\n");
    return false;
  }
  close_on_exec (icmpfd);
  if (bind (icmpfd, (sockaddr *) &fromaddr, sizeof (fromaddr)) < 0) {
    warn ("bind of ICMP/UDP socket: %m\n");
    return false;
  }
  /* XXX - this is done in planet lab traceroute code -- is it needed? */
  /* NOTE(review): this sets IP_HDRINCL on ipfd a second time although
   * the message names the ICMP/UDP socket; icmpfd may have been
   * intended.  Left unchanged pending confirmation on PlanetLab. */
  if (setsockopt (ipfd, IPPROTO_IP, IP_HDRINCL,
                  (char *) &one, sizeof(one)) < 0) {
    warn("IP_HDRINCL (ICMP/UDP): %m\n");
    return false;
  }
  make_async (icmpfd);
  fdcb (icmpfd, selread, wrap (this, &icmpsock::rcb));

  return true;
}
#endif /* PLANET_LAB */

/* Build and send one UDP probe through the raw socket ipfd.
 *
 *   to       - destination address and port
 *   ttl      - IP time-to-live
 *   datasize - number of UDP payload bytes (all zero, except as below)
 *   id       - IP identification field (host byte order)
 *   sum      - if nonzero, the value the UDP checksum field must
 *              carry; the first 16-bit payload word is then chosen so
 *              the checksum still verifies (needs datasize >= 2)
 *   fromp    - optional source address/port overriding fromaddr
 *
 * A failed or short send only produces a warning. */
void
icmpsock::sendpkt (const sockaddr_in *to, u_int ttl, u_int16_t datasize,
                   u_int16_t id, u_int16_t sum, const sockaddr_in *fromp)
{
  int pktlen = xoffsetof (outpkt, payload[datasize]);
  outpkt pkt;
  /* Everything below writes pktlen bytes of pkt; refuse a datasize
   * that does not fit rather than overrunning the stack. */
  assert (pktlen <= int (sizeof (pkt)));
  bzero (&pkt, pktlen);

  if (fromp && fromp->sin_addr.s_addr != htonl (INADDR_ANY))
    pkt.iph.ip_src = fromp->sin_addr;
  else
    pkt.iph.ip_src = fromaddr.sin_addr;
  pkt.iph.ip_dst = to->sin_addr;
  pkt.iph.ip_off = htons (0);
  pkt.iph.ip_hl = sizeof (pkt.iph) >> 2;
  pkt.iph.ip_p = IPPROTO_UDP;
  pkt.iph.ip_v = 4;
  pkt.iph.ip_ttl = ttl;
  pkt.iph.ip_len = htons (pktlen);
  pkt.iph.ip_id = htons (id);

  if (fromp && fromp->sin_port != htons (0))
    pkt.udph.uh_sport = fromp->sin_port;
  else
    pkt.udph.uh_sport = fromaddr.sin_port;
  pkt.udph.uh_dport = to->sin_port;
  pkt.udph.uh_ulen = htons (pktlen - sizeof (pkt.iph));

  if (sum) {
    assert (datasize >= 2);
    assert (pkt.iph.ip_src.s_addr != htonl (INADDR_ANY));
    /* Pin the checksum field; the payload word is solved for below. */
    pkt.udph.uh_sum = ntohs (sum);
  }

  /* Checksum over the pseudo-header (protocol, UDP length, source and
   * destination address -- the latter two are the 8 bytes starting at
   * ip_src) followed by the UDP header and payload.  A result of zero
   * is replaced by 0xffff. */
  u_int32_t usum = ntohs (pkt.iph.ip_p) + pkt.udph.uh_ulen;
  usum = cksum (&pkt.iph.ip_src, 8, usum);
  usum = ~cksum (&pkt.udph, ntohs (pkt.udph.uh_ulen), usum) & 0xffff;
  if (!usum)
    usum = 0xffff;
  if (sum)
    /* First 16-bit word after the UDP header, i.e. start of payload. */
    *reinterpret_cast<u_int16_t *> (&pkt.udph + 1) = usum;
  else
    pkt.udph.uh_sum = usum;

#if 0
  struct ph {
    in_addr s, d;
    u_int8_t z, p;
    u_int16_t l, sp, dp, l2, uc;
  };
  ph h = { pkt.iph.ip_src, pkt.iph.ip_dst, 0, pkt.iph.ip_p,
           pkt.udph.uh_ulen, pkt.udph.uh_sport, pkt.udph.uh_dport,
           pkt.udph.uh_ulen, pkt.udph.uh_sum };
  u_int16_t s2 = ~cksum (&h, sizeof (h));
  warn ("ttl %d, sum 0x%x\n", ttl, ntohs (pkt.udph.uh_sum));
  if (ttl == 64) {
    warn << "IP header " << hexdump (&pkt.iph, sizeof (pkt.iph)) << "\n";
    warn << "UDP header " << hexdump (&pkt.udph, sizeof (pkt.udph)) << "\n";
    warn << "pseudo-header " << hexdump (&h, sizeof (h)) << "\n";
  }
#endif

  /* Some kernels want ip_len in host byte order on IP_HDRINCL
   * sockets.  Remember across calls which form worked. */
  static bool ip_hdrincl_ok, ip_hdrincl_swapped;
  if (ip_hdrincl_swapped) {
    /* Yuck...  Might be required by FreeBSD */
    pkt.iph.ip_len = ntohs (pkt.iph.ip_len);
  }

  errno = 0;
  int n = sendto (ipfd, &pkt, pktlen, 0, (sockaddr *) to, sizeof (*to));
  if (n < 0 && errno == EINVAL && !ip_hdrincl_swapped && !ip_hdrincl_ok) {
    /* Never succeeded yet: retry once with ip_len byte-swapped. */
    pkt.iph.ip_len = ntohs (pkt.iph.ip_len);
    n = sendto (ipfd, &pkt, pktlen, 0, (sockaddr *) to, sizeof (*to));
    if (n >= 0) {
      warn ("kernel seems to swap byte order of ip_len... yuck\n");
      ip_hdrincl_swapped = true;
    }
  }
  if (n >= 0)
    ip_hdrincl_ok = true;
  if (n != pktlen)
    warn ("RAW IP sendto %d/%d: %m\n", n, pktlen);
}

/* Start tracing the path to *d.
 *
 *   ss    - socket object used to send probes and receive replies
 *   d     - destination; a zero port selects destination-port mode
 *           (the hop number is encoded in the UDP destination port)
 *   nhops - number of hops requested (clamped to maxhops); the sign
 *           selects which end of the path finish reports
 *   c     - first completion callback
 *   srcp  - optional source address/port for the probes
 *
 * The first batch of probes is sent from the constructor. */
traceroute::traceroute (ref<icmpsock> ss, const sockaddr_in *d, int nhops,
                        cb_t c, const sockaddr_in *srcp)
  : dest (*d), use_dstport (false), hops_req (nhops), hops_max (0),
    hops_total (-1), hops_found (0), xmit_count (0),
    ic (ss->setcb (d->sin_addr, wrap (this, &traceroute::rcb))),
    ntmo (0), tmo_lastfound (0), tmo (NULL)
{
  cbvec.push_back (c);
  xmit_ttls.zsetsize (maxhops + 1);
  oxmit_ttls.zsetsize (maxhops + 1);
  ids.setsize (maxhops + 1);
  bzero (ids.base (), ids.size () * sizeof (ids[0]));

  use_src = srcp;
  if (use_src)
    src = *srcp;

  if (hops_req > maxhops)
    hops_req = maxhops;
  if (dest.sin_port == htons (0)) {
    dest.sin_port = htons (33435);
    use_dstport = true;
  }
  xmit ();
}

/* Cancel the pending timeout, if any. */
traceroute::~traceroute ()
{
  if (tmo)
    timecb_remove (tmo);
}

/* Send one probe with the given TTL, marking it outstanding in
 * xmit_ttls/xmit_count if it was not already.  A nonzero random IP id
 * is chosen per TTL on first use and reused afterwards.  In
 * destination-port mode the TTL is encoded in the UDP port;
 * otherwise it is carried in the UDP checksum field and, modulo 16,
 * in the payload length. */
void
traceroute::probe (u_int8_t ttl)
{
  if (!xmit_ttls[ttl]) {
    xmit_ttls[ttl] = true;
    xmit_count++;
  }
  /* Zero means "no id assigned yet", so never pick it. */
  while (!ids[ttl])
    ids[ttl] = arandom ();

  if (use_dstport) {
    dest.sin_port = htons (baseport + ttl);
    ic->is->sendpkt (&dest, ttl, 0, ids[ttl], 0, use_src ? &src : NULL);
  }
  else
    ic->is->sendpkt (&dest, ttl, 2 + (ttl & 0xf), ids[ttl], ttl,
                     use_src ? &src : NULL);
#if NETPATH_VERBOSE
  verbose.fmt ("%d -> probe % 2d ->\n", xmit_count, ttl);
#endif /* NETPATH_VERBOSE */
}

/* Decide whether to probe `ttl' in the current round.  Hops already
 * known and TTLs with a probe outstanding this round are skipped; at
 * priority 0, TTLs probed in the previous round (oxmit_ttls) are
 * skipped as well. */
inline bool
traceroute::shouldprobe (int prio, int ttl)
{
  if (ttl < int (hops.size ()) && hops[ttl].s_addr != htonl (INADDR_ANY))
    return false;
  if (prio == 0 && oxmit_ttls[ttl])
    return false;
  return !xmit_ttls[ttl];
}

/* Timer callback: no (further) reply arrived within the current
 * timeout.  Gives up after the fifth timeout, or earlier under the
 * firewall heuristic below; otherwise starts a new round of probes. */
void
traceroute::timeout ()
{
  tmo = NULL;
#if NETPATH_VERBOSE
  verbose.fmt ("=== TIMEOUT %d ===\n", ntmo + 1);
#endif /* NETPATH_VERBOSE */
  assert (ntmo < 5);
  if (ntmo++ > 3) {
    finish ();
    return;
  }
  if (ntmo > 1 && hops_found == tmo_lastfound
      && hops_max > 0 && hops_total == -1) {
    /* If there's a firewall, we'll never get a port unreachable
     * message, and hence never know hops_total.  In such cases, we
     * can only terminate through timeouts, and so want to timeout
     * more quickly.  We use the heuristic that if we've probed five
     * hops out and not found anything, we timeout quickly. */
    int highprobe = maxhops;
    if (hops_req > 0 && hops_req < highprobe)
      highprobe = hops_req;
    highprobe = min (highprobe, hops_max + maxprobes);
    bool fastquit = true;
    for (int i = hops_max + 1; i <= highprobe; i++)
      if (!xmit_ttls[i])
        fastquit = false;
    if (fastquit) {
      finish ();
      return;
    }
  }

  /* Remember what this round probed, then start a fresh round. */
  tmo_lastfound = hops_found;
  oxmit_ttls = xmit_ttls;
  xmit_ttls.setrange (0, xmit_ttls.size (), 0);
  xmit_count = 0;
  xmit ();
}

/* Timeout for a round, given the number of timeouts so far:
 * 100 ms, then 150 ms, then 250 ms from there on. */
inline void
tmoval (timespec *ts, int ntmo)
{
  ts->tv_sec = 0;
  ts->tv_nsec = 0;
  switch (ntmo) {
  case 0:
    ts->tv_nsec = 100000000;
    break;
  case 1:
    ts->tv_nsec = 150000000;
    break;
  default:
    ts->tv_nsec = 250000000;
    break;
  }
}

/* Probe TTLs around `start' until maxprobes probes are outstanding:
 * first downwards from start to low, then upwards from start to high
 * (high == 0 disables the upward scan and the clamping of start).
 * Two passes are made, see shouldprobe for the priorities. */
void
traceroute::proberange (int start, int low, int high)
{
  assert (start >= 1);
  assert (low >= 1);
  assert (high <= maxhops);
  assert (hops_total == -1 || high <= hops_total);

  if (high && start > high)
    start = high;
  for (int prio = 0; xmit_count < maxprobes && prio <= 1; prio++) {
    for (int ttl = start; ttl >= low && xmit_count < maxprobes; ttl--)
      if (shouldprobe (prio, ttl))
        probe (ttl);
    for (int ttl = start; ttl <= high && xmit_count < maxprobes; ttl++)
      if (shouldprobe (prio, ttl))
        probe (ttl);
  }
}

/* Send whatever probes are still useful given what is known so far
 * and make sure a timeout is pending.  If nothing is left to probe,
 * finish the trace (which deletes this object). */
void
traceroute::xmit ()
{
  if (hops_req > 0 && hops_total > 0)
    proberange (min (hops_req, hops_total), 1, 0);
  else if (hops_req > 0)
    proberange (hops_req, 1, 0);
  else if (hops_total > 0) {
    /* hops_req <= 0 here; a negative value asks for the last
     * -hops_req hops only. */
    if (!hops_req || hops_total + hops_req < 0)
      proberange (hops_total, 1, 0);
    else
      proberange (hops_total, hops_total + hops_req + 1, 0);
  }
  else if (hops_max) {
    if (!hops_req || hops_max + hops_req < 0)
      proberange (hops_max + maxprobes/2, 1, maxhops);
    else
      proberange (hops_max + maxprobes/2, hops_max + hops_req + 1, maxhops);
  }
  else if (ntmo)
    proberange (maxprobes, 1, maxhops);
  else {
    /* Very first round with nothing known: sample a few distances. */
    probe (maxhops);
    if (maxhops > 16)
      probe (16);
    if (maxhops > 10)
      probe (10);
  }

  if (!xmit_count) {
    finish ();
    return;
  }

  if (!tmo) {
    timespec ts;
    tmoval (&ts, ntmo);
    tmo = delaycb (ts.tv_sec, ts.tv_nsec, wrap (this, &traceroute::timeout));
  }
}

/* Record that hop number `hopno' answered from `addr'.  `last' means
 * the reply came from the destination itself, which fixes hops_total.
 * Ends by calling xmit, which may finish the trace and delete this
 * object. */
void
traceroute::getpkt (int hopno, in_addr addr, bool last)
{
#if NETPATH_VERBOSE
  verbose.fmt ("%d <- getpkt % 2d%s <- ", xmit_count, hopno,
               last ? "*" : " ") << inet_ntoa (addr) << "\n";
#endif /* NETPATH_VERBOSE */

  if (hopno < 1 || hopno > maxhops
      || (hops_total > 0 && hopno > hops_total))
    return;

  if (xmit_count > 0 && xmit_ttls[hopno]) {
    xmit_count--;
    xmit_ttls[hopno] = false;
  }

  if (hopno > hops_max) {
    if (last) {
      hops_total = hopno;
      /* Probes beyond the destination no longer count as outstanding. */
      xmit_count = 0;
      for (int i = 0; i < hops_total; i++)
        if (xmit_ttls[i])
          xmit_count++;
    }
    hops_max = hopno;
    /* An all-zero address means "hop unknown" (see shouldprobe and the
     * test below), so clear the entries just added explicitly instead
     * of relying on setsize to initialize them -- the constructor does
     * the same for ids. */
    size_t osize = hops.size ();
    hops.setsize (hopno + 1);
    if (hops.size () > osize)
      bzero (hops.base () + osize,
             (hops.size () - osize) * sizeof (hops[0]));
  }
  if (!hops[hopno].s_addr) {
    hops[hopno].s_addr = addr.s_addr;
    hops_found++;
  }

  xmit ();
}

/* Callback for ICMP messages whose embedded packet was addressed to
 * our destination.  Recovers the probe's TTL from the embedded UDP
 * header, checks the embedded IP id against the one sent for that
 * TTL, and passes time-exceeded and unreachable replies to getpkt. */
void
traceroute::rcb (icmpsock::icmp_info *ii)
{
  // int hopno = ntohs (ii->udphp->uh_ulen) - sizeof (struct udphdr);
  int hopno;
  if (use_dstport)
    hopno = ntohs (ii->udphp->uh_dport) - baseport;
  else
    hopno = ntohs (ii->udphp->uh_sum);

  /* FreeBSD for some reason zeroes out the checksum of a returned UDP
   * packet inside an ICMP packet.  Thus, we search for the packet
   * (from the end, since this is most likely to be the end node). */
  if (!use_dstport && !hopno && ii->iphp->ip_id) {
    /* probe sends 2 + (ttl & 0xf) payload bytes, so the UDP length
     * gives the TTL modulo 16.  Compute in int so that a short length
     * yields a negative value rather than a wrapped size_t. */
    hopno = int (ntohs (ii->udphp->uh_ulen))
      - int (sizeof (struct udphdr)) - 2;
    if (hopno < 0 || hopno > 0xf) {
      warn ("bad traceroute UDP len %d\n", ntohs (ii->udphp->uh_ulen));
      return;
    }
    /* Start at the largest candidate <= maxhops and step down by 16
     * until the IP id matches. */
    hopno += ((maxhops + 0xf) & ~0xf);
    while (hopno > maxhops)
      hopno -= 0x10;
    u_int16_t id = ntohs (ii->iphp->ip_id);
    while (hopno > 0 && ids[hopno] != id)
      hopno -= 0x10;
    if (hopno <= 0) {
      warn ("bad traceroute IP id for %s\n", inet_ntoa (dest.sin_addr));
      return;
    }
  }

#if 0
  warn ("(%d/%d) %d %s (found %d)\n", ii->type, ii->code, hopno,
        inet_ntoa (ii->pkthdrp->ip_src), hops_found);
#endif

  if (hopno < 1 || hopno > maxhops) {
    warn ("bad traceroute hopno %d for %s\n", hopno,
          inet_ntoa (dest.sin_addr));
    return;
  }
  if (ntohs (ii->iphp->ip_id) != ids[hopno]) {
    warn ("bad traceroute IP id for %s (hop %d)\n",
          inet_ntoa (dest.sin_addr), hopno);
    return;
  }

  if (ii->type == ICMP_UNREACH) {
    /* The embedded header shows the TTL the probe had left on
     * arrival; the difference to the TTL sent is the distance. */
    hopno = hopno - ii->iphp->ip_ttl + 1;
    if (hopno < 1 || hopno > maxhops) {
      warn ("bad traceroute unreach for %s (hopno %d)\n",
            inet_ntoa (dest.sin_addr), hopno);
      return;
    }
    getpkt (hopno, ii->pkthdrp->ip_src, true);
  }
  else if (ii->type == ICMP_TIMXCEED && ii->code == ICMP_TIMXCEED_INTRANS)
    getpkt (hopno, ii->pkthdrp->ip_src, false);
}

/* Report failure (-1, NULL, -1) to every callback and delete this
 * object. */
void
traceroute::fail ()
{
  while (!cbvec.empty ())
    (*cbvec.pop_front ()) (-1, NULL, -1);
  delete this;
}

/* Report the result to every callback and delete this object.  The
 * callbacks get hops_total plus the slice of hops selected by
 * hops_req: the first hops_req hops if positive, the last -hops_req
 * if negative, everything from hop 1 otherwise.  If no hop was found
 * at all this is a failure. */
void
traceroute::finish ()
{
#if NETPATH_VERBOSE
  warnx << verbose;
#endif /* NETPATH_VERBOSE */

  int first = 1;
  int last = hops_total > 0 ? hops_total : hops_max;
  if (last <= 0) {
    fail ();
    return;
  }

  if (hops_req > 0)
    last = min (hops_req, last);
  if (hops_req < 0)
    first = max (1, last + hops_req + 1);

  while (!cbvec.empty ())
    (*cbvec.pop_front ()) (hops_total, hops.base () + first,
                           last - first + 1);
  delete this;
}

static ifchgcb_t *rebind;
static ptr<icmpsock> is;

/* Drop the shared socket object so that the next netpath call
 * creates a new one.  Registered with ifchgcb by netpath. */
void
netpath_reset ()
{
  is = NULL;
}

/* Start a traceroute to *destp and return a handle for
 * netpath_cancel/netpath_addcb.  The shared raw-socket object is
 * created on first use.  If it cannot be initialized, cb is invoked
 * with (-1, NULL, -1) and NULL is returned. */
traceroute *
netpath (const sockaddr_in *destp, int hops, netpathcb_t cb,
         const sockaddr_in *srcp)
{
  if (!is) {
    sockaddr_in source;
    bzero (&source, sizeof (source));
    source.sin_family = AF_INET;
    is = New refcounted<icmpsock>;
    if (!is->init (&source)) {
      bool ok = false;
#if PLANET_LAB
      /* init_plab only exists in PLANET_LAB builds. */
      if (is->init_plab (&source)) {
        warn ("using planet-lab protected raw sockets\n");
        ok = true;
      }
#endif /* PLANET_LAB */
      if (!ok) {
        warn ("could not initialize RAW sockets for netpath\n");
        is = NULL;
        (*cb) (-1, NULL, -1);
        return NULL;
      }
    }
    if (!rebind)
      rebind = ifchgcb (wrap (netpath_reset));
  }
  return New traceroute (is, destp, hops, cb, srcp);
}

/* Abort a traceroute; its callbacks are not invoked. */
void
netpath_cancel (traceroute *trp)
{
  delete trp;
}

/* Attach an additional completion callback to a running traceroute. */
void
netpath_addcb (traceroute *trp, netpathcb_t cb)
{
  trp->addcb (cb);
}

static int respending;

/* Completion callback of the test driver: print the destination and
 * the hops received, then exit once no results are pending. */
static void
result (str name, in_addr addr, int nhops, in_addr *av, int n)
{
  aout << strbuf ("dest %s (%s)\n", name.cstr (), inet_ntoa (addr));
  if (nhops > 0)
    aout << strbuf ("%d hops total\n", nhops);
  if (n > 0) {
    for (int i = 0; i < n; i++) {
      if (nhops <= 0)
        aout << strbuf ("% 2d ==> %s\n", i + 1, inet_ntoa (av[i]));
      else
        /* Number the hops so that the last one printed is nhops. */
        aout << strbuf ("% 2d ==> %s\n", nhops + i - n + 1,
                        inet_ntoa (av[i]));
    }
  }
  else
    aout << "failed\n";
  if (--respending <= 0)
    exit (0);
}

static void netpath_usage () __attribute__ ((noreturn));
static void
netpath_usage ()
{
  warnx << "usage: " << progname << " --netpath host [nhops]\n";
  exit (1);
}

/* Command-line test driver.  After the adjustment below argv[1] is
 * the host name and the optional argv[2] the hop count.  Resolves the
 * host and starts one traceroute whose outcome is printed by
 * result. */
void
netpath_test (int argc, char **argv)
{
  /* Step back one argument so that the usual argv[1]/argv[2]
   * indexing applies (presumably the caller has already consumed the
   * option that selected this test -- confirm against the caller). */
  argc++;
  argv--;

  if (argc != 2 && argc != 3)
    netpath_usage ();

  int nhops = 0;
  if (argc == 3)
    nhops = atoi (argv[2]);

  struct hostent *hp = gethostbyname (argv[1]);
  if (!hp)
    fatal << argv[1] << ": no such host\n";

  sockaddr_in to;
  bzero (&to, sizeof (to));
  to.sin_family = AF_INET;
  to.sin_addr = *(in_addr *) hp->h_addr;

  respending = 1;
  netpath (&to, nhops, wrap (result, argv[1], to.sin_addr));
}