/* $Id: netpath.C,v 1.14 2005/10/19 23:52:26 dm Exp $ */
/*
*
* Copyright (C) 2003 David Mazieres (dm@uun.org)
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
* USA
*
*/
#include "async.h"
#include "qhash.h"
#include "aios.h"
#include "rawnet.h"
#define NETPATH_VERBOSE 0
/* Build a client record holding callback c, a reference to the
 * owning socket s and the address a, then insert the record into the
 * socket's cbtab (presumably keyed by addr -- icmpsock::rcb looks
 * clients up there by an IP address). */
inline
icmpsock::icmpclnt::icmpclnt (ref<icmpsock> s, in_addr a, icmpsock::cb_t c)
: cb (c), is (s), addr (a)
{
is->cbtab.insert (this);
}
/* Unregister this client from the owning socket's cbtab, undoing the
 * insert performed by the constructor. */
inline
icmpsock::icmpclnt::~icmpclnt ()
{
is->cbtab.remove (this);
}
void
icmpsock::rcb ()
{
for (;;) {
sockaddr_in sin;
socklen_t sinlen = sizeof (sin);
bzero (&sin, sizeof (sin));
sin.sin_family = AF_INET;
inpkt pkt;
bzero (&pkt, sizeof (pkt));
int n = recvfrom (icmpfd, &pkt, sizeof (pkt), 0,
(sockaddr *) &sin, &sinlen);
if (n <= 0) {
if (n < 0 && errno != EAGAIN)
warn ("recvfrom ICMP socket: %m\n");
return;
}
icmp_info ii;
if (!icmp_parse (&ii, &pkt, n) || !ii.udphp)
continue;
for (icmpclnt *cp = cbtab[ii.iphp->ip_dst], *ncp; cp; cp = ncp) {
ncp = cbtab.nextkeq (cp);
(*cp->cb) (&ii);
}
}
}
/* Close whichever of icmpfd, udpfd and ipfd are open and reset them
 * to -1.  The read callback on icmpfd is cleared before that
 * descriptor is closed. */
void
icmpsock::closefds ()
{
  if (icmpfd >= 0) {
    fdcb (icmpfd, selread, NULL);
    close (icmpfd);
    icmpfd = -1;
  }

  /* The remaining descriptors have no callbacks attached here. */
  int *const plainfds[] = { &udpfd, &ipfd };
  for (size_t i = 0; i < sizeof (plainfds) / sizeof (plainfds[0]); i++) {
    int &fd = *plainfds[i];
    if (fd < 0)
      continue;
    close (fd);
    fd = -1;
  }
}
/* Decode the size-byte packet at inp into *infop.
 *
 * Returns false (with *infop zeroed) when the packet is too short or
 * is not ICMP.  Otherwise returns true with pkthdrp, icmpp, type and
 * code filled in.  iphp is set only when the packet is long enough to
 * hold a quoted IP header plus 8 bytes, and udphp only when that
 * quoted header is well-formed and carries UDP. */
bool
icmpsock::icmp_parse (icmpsock::icmp_info *infop, inpkt *inp, int size)
{
  bzero (infop, sizeof (*infop));

  if (size < int (sizeof (inp->iph) + 8))
    return false;
  /* XXX - size is not compared with ntohs (inp->iph.ip_len) because
   * the kernel may byte-swap that field. */
  if (inp->iph.ip_p != IPPROTO_ICMP)
    return false;

  const int hlen = inp->iph.ip_hl << 2;
  if (size < int (hlen + sizeof (icmp)))
    return false;

  icmp *const icp = reinterpret_cast<icmp *> (&inp->data[hlen]);
  infop->pkthdrp = &inp->iph;
  infop->icmpp = icp;
  infop->type = icp->icmp_type;
  infop->code = icp->icmp_code;

  /* Too short to quote an IP header and 8 more bytes: done. */
  if (size < int (hlen + 8 + sizeof (struct ip) + 8))
    return true;

  infop->iphp = &icp->icmp_ip;
  const int dhlen = infop->iphp->ip_hl << 2;
  const bool quotes_udp = dhlen >= int (sizeof (struct ip))
    && size >= hlen + 8 + dhlen + 8
    && infop->iphp->ip_p == IPPROTO_UDP;
  if (quotes_udp)
    infop->udphp = reinterpret_cast<udphdr *> (&inp->data[hlen + 8 + dhlen]);
  return true;
}
void
icmpsock::portalloc ()
{
assert (udpfd == -1);
udpfd = inetsocket (SOCK_DGRAM, ntohs (fromaddr.sin_port),
ntohl (fromaddr.sin_addr.s_addr));
if (udpfd < 0) {
if (errno != EADDRINUSE)
warn ("socket: %m\n");
}
else {
close_on_exec (udpfd);
socklen_t sinlen = sizeof (fromaddr);
getsockname (udpfd, (sockaddr *) &fromaddr, &sinlen);
}
if (fromaddr.sin_addr.s_addr == htonl (INADDR_ANY)) {
vec<in_addr> av;
myipaddrs (&av);
while (!av.empty () && av[0].s_addr == htonl (INADDR_LOOPBACK))
av.pop_front ();
if (!av.empty ())
fromaddr.sin_addr = av[0];
else // What the hell, allows testing on disconnected machines
fromaddr.sin_addr.s_addr = htonl (INADDR_LOOPBACK);
}
}
/* (Re)initialize the socket set using *fa as the source address.
 * Closes any existing descriptors, calls portalloc, then opens a raw
 * IP socket with IP_HDRINCL for sending and a raw ICMP socket for
 * receiving; icmpsock::rcb is installed as the read callback on the
 * latter.  Returns false (after a warning) as soon as a step fails;
 * descriptors opened so far are left for closefds to release. */
bool
icmpsock::init (const sockaddr_in *fa)
{
  closefds ();
  fromaddr = *fa;
  portalloc ();

  if ((ipfd = socket (AF_INET, SOCK_RAW, IPPROTO_RAW)) < 0) {
    warn ("RAW IP socket: %m\n");
    return false;
  }
  close_on_exec (ipfd);

  int optval = 576;
#if 0
  if (setsockopt (ipfd, SOL_SOCKET, SO_SNDBUF,
                  (char *) &optval, sizeof (optval)) < 0) {
    warn("SO_SNDBUF: %m\n");
    return false;
  }
#endif
  optval = 1;
  const int hdrincl = setsockopt (ipfd, IPPROTO_IP, IP_HDRINCL,
                                  (char *) &optval, sizeof (optval));
  if (hdrincl < 0) {
    warn("IP_HDRINCL: %m\n");
    return false;
  }
  make_async (ipfd);

  if ((icmpfd = socket (AF_INET, SOCK_RAW, IPPROTO_ICMP)) < 0) {
    warn ("ICMP socket: %m\n");
    return false;
  }
  close_on_exec (icmpfd);
  make_async (icmpfd);
  fdcb (icmpfd, selread, wrap (this, &icmpsock::rcb));
  return true;
}
#define IPPROTO_ICMP_UDP 117
#define IPPROTO_ICMP_TCP 106
#if PLANET_LAB
/* Variant of init for PlanetLab-style restricted raw sockets.  The
 * UDP socket obtained by portalloc is closed again, and the code then
 * opens and binds to fromaddr a raw UDP socket (ipfd, with
 * IP_HDRINCL) and an IPPROTO_ICMP_UDP socket (icmpfd).  Returns false
 * after a warning on the first failing step. */
bool
icmpsock::init_plab (const sockaddr_in *fa)
{
  int on = 1;

  closefds ();
  fromaddr = *fa;
  portalloc ();
  if (udpfd >= 0) {
    close (udpfd);
    udpfd = -1;
  }

  if ((ipfd = socket (AF_INET, SOCK_RAW, IPPROTO_UDP)) < 0) {
    warn ("RAW IP socket: %m\n");
    return false;
  }
  close_on_exec (ipfd);
  if (bind (ipfd, (sockaddr *) &fromaddr, sizeof (fromaddr)) < 0) {
    warn ("bind of RAW/UDP socket: %m\n");
    return false;
  }
  if (setsockopt (ipfd, IPPROTO_IP, IP_HDRINCL,
                  (char *) &on, sizeof (on)) < 0) {
    warn("IP_HDRINCL (RAW UDP): %m\n");
    return false;
  }
  make_async (ipfd);

  if ((icmpfd = socket (AF_INET, SOCK_RAW, IPPROTO_ICMP_UDP)) < 0) {
    warn ("ICMP socket: %m\n");
    return false;
  }
  close_on_exec (icmpfd);
  if (bind (icmpfd, (sockaddr *) &fromaddr, sizeof (fromaddr)) < 0) {
    warn ("bind of ICMP/UDP socket: %m\n");
    return false;
  }
  /* XXX - this is done in planet lab traceroute code -- is it needed? */
  /* NOTE(review): this repeats the option on ipfd, although the
   * warning text suggests icmpfd was intended -- confirm before
   * changing. */
  if (setsockopt (ipfd, IPPROTO_IP, IP_HDRINCL,
                  (char *) &on, sizeof (on)) < 0) {
    warn("IP_HDRINCL (ICMP/UDP): %m\n");
    return false;
  }
  make_async (icmpfd);
  fdcb (icmpfd, selread, wrap (this, &icmpsock::rcb));
  return true;
}
#endif /* PLANET_LAB */
void
icmpsock::sendpkt (const sockaddr_in *to, u_int ttl,
u_int16_t datasize, u_int16_t id, u_int16_t sum,
const sockaddr_in *fromp)
{
int pktlen = xoffsetof (outpkt, payload[datasize]);
outpkt pkt;
bzero (&pkt, pktlen);
if (fromp && fromp->sin_addr.s_addr != htonl (INADDR_ANY))
pkt.iph.ip_src = fromp->sin_addr;
else
pkt.iph.ip_src = fromaddr.sin_addr;
pkt.iph.ip_dst = to->sin_addr;
pkt.iph.ip_off = htons (0);
pkt.iph.ip_hl = sizeof (pkt.iph) >> 2;
pkt.iph.ip_p = IPPROTO_UDP;
pkt.iph.ip_v = 4;
pkt.iph.ip_ttl = ttl;
pkt.iph.ip_len = htons (pktlen);
pkt.iph.ip_id = htons (id);
if (fromp && fromp->sin_port != htons (0))
pkt.udph.uh_sport = fromp->sin_port;
else
pkt.udph.uh_sport = fromaddr.sin_port;
pkt.udph.uh_dport = to->sin_port;
pkt.udph.uh_ulen = htons (pktlen - sizeof (pkt.iph));
if (sum) {
assert (datasize >= 2);
assert (pkt.iph.ip_src.s_addr != htonl (INADDR_ANY));
pkt.udph.uh_sum = ntohs (sum);
u_int32_t usum = ntohs (pkt.iph.ip_p) + pkt.udph.uh_ulen;
usum = cksum (&pkt.iph.ip_src, 8, usum);
usum = ~cksum (&pkt.udph, ntohs (pkt.udph.uh_ulen), usum) & 0xffff;
if (!usum)
usum = 0xffff;
*reinterpret_cast<u_int16_t *> (&pkt.udph + 1) = usum;
}
else {
u_int32_t usum = ntohs (pkt.iph.ip_p) + pkt.udph.uh_ulen;
usum = cksum (&pkt.iph.ip_src, 8, usum);
usum = ~cksum (&pkt.udph, ntohs (pkt.udph.uh_ulen), usum) & 0xffff;
if (!usum)
usum = 0xffff;
pkt.udph.uh_sum = usum;
}
#if 0
struct ph {
in_addr s, d;
u_int8_t z, p;
u_int16_t l, sp, dp, l2, uc;
};
ph h = { pkt.iph.ip_src, pkt.iph.ip_dst, 0, pkt.iph.ip_p,
pkt.udph.uh_ulen, pkt.udph.uh_sport, pkt.udph.uh_dport,
pkt.udph.uh_ulen, pkt.udph.uh_sum
};
u_int16_t s2 = ~cksum (&h, sizeof (h));
warn ("ttl %d, sum 0x%x\n", ttl, ntohs (pkt.udph.uh_sum));
if (ttl == 64) {
warn << "IP header " << hexdump (&pkt.iph, sizeof (pkt.iph)) << "\n";
warn << "UDP header " << hexdump (&pkt.udph, sizeof (pkt.udph)) << "\n";
warn << "pseudo-header " << hexdump (&h, sizeof (h)) << "\n";
}
#endif
static bool ip_hdrincl_ok, ip_hdrincl_swapped;
if (ip_hdrincl_swapped) {
/* Yuck... Might be required by FreeBSD */
pkt.iph.ip_len = ntohs (pkt.iph.ip_len);
}
errno = 0;
int n = sendto (ipfd, &pkt, pktlen, 0, (sockaddr *) to, sizeof (*to));
if (n < 0 && errno == EINVAL && !ip_hdrincl_swapped && !ip_hdrincl_ok) {
pkt.iph.ip_len = ntohs (pkt.iph.ip_len);
n = sendto (ipfd, &pkt, pktlen, 0, (sockaddr *) to, sizeof (*to));
if (n >= 0) {
warn ("kernel seems to swap byte order of ip_len... yuck\n");
ip_hdrincl_swapped = true;
}
}
if (n >= 0)
ip_hdrincl_ok = true;
if (n != pktlen)
warn ("RAW IP sendto %d/%d: %m\n", n, pktlen);
}
/* Start tracing the route to *d over socket ss.
 *
 *   nhops - requested hop count, clamped to maxhops; the sign and
 *           zero cases are interpreted by xmit and finish
 *   c     - first completion callback
 *   srcp  - optional source address handed to every probe
 *
 * A zero destination port selects destination-port mode
 * (use_dstport): the port starts at 33435 and probe then varies it
 * with the TTL.  The ICMP callback is registered for the destination
 * address and the first probes are sent before the constructor
 * returns. */
traceroute::traceroute (ref<icmpsock> ss, const sockaddr_in *d,
                        int nhops, cb_t c, const sockaddr_in *srcp)
  : dest (*d), use_dstport (false), hops_req (nhops), hops_max (0),
    hops_total (-1), hops_found (0), xmit_count (0),
    ic (ss->setcb (d->sin_addr, wrap (this, &traceroute::rcb))),
    ntmo (0), tmo_lastfound (0), tmo (NULL)
{
  /* One slot per TTL 0..maxhops; everything starts out cleared. */
  xmit_ttls.zsetsize (maxhops + 1);
  oxmit_ttls.zsetsize (maxhops + 1);
  ids.setsize (maxhops + 1);
  bzero (ids.base (), ids.size () * sizeof (ids[0]));

  cbvec.push_back (c);

  use_src = srcp;
  if (use_src)
    src = *srcp;

  if (hops_req > maxhops)
    hops_req = maxhops;

  if (dest.sin_port == htons (0)) {
    dest.sin_port = htons (33435);
    use_dstport = true;
  }

  xmit ();
}
/* Cancel the pending timeout callback, if one is scheduled, so that
 * traceroute::timeout is not invoked on a destroyed object. */
traceroute::~traceroute ()
{
if (tmo)
timecb_remove (tmo);
}
/* Send one probe with the given TTL.  The TTL is marked outstanding
 * (counted once in xmit_count) and assigned a nonzero random IP id
 * the first time it is probed.  In destination-port mode the hop is
 * encoded in the destination port; otherwise it is encoded in the
 * requested checksum value and in the payload length
 * 2 + (ttl & 0xf). */
void
traceroute::probe (u_int8_t ttl)
{
  if (!xmit_ttls[ttl]) {
    xmit_ttls[ttl] = true;
    xmit_count++;
  }

  /* Zero is reserved to mean "no id chosen yet". */
  while (!ids[ttl])
    ids[ttl] = arandom ();

  u_int16_t datasize = 0;
  u_int16_t sum = 0;
  if (use_dstport)
    dest.sin_port = htons (baseport + ttl);
  else {
    datasize = 2 + (ttl & 0xf);
    sum = ttl;
  }
  ic->is->sendpkt (&dest, ttl, datasize, ids[ttl], sum,
                   use_src ? &src : NULL);

#if NETPATH_VERBOSE
  verbose.fmt ("%d -> probe % 2d ->\n", xmit_count, ttl);
#endif /* NETPATH_VERBOSE */
}
/* Decide whether ttl still needs a probe in the current round.
 * Returns false when an address is already recorded for the hop, when
 * a probe for it is outstanding, or -- at priority 0 only -- when it
 * was already probed in the previous round (oxmit_ttls). */
inline bool
traceroute::shouldprobe (int prio, int ttl)
{
  const bool answered = ttl < int (hops.size ())
    && hops[ttl].s_addr != htonl (INADDR_ANY);
  if (answered)
    return false;

  const bool tried_last_round = prio == 0 && oxmit_ttls[ttl];
  if (tried_last_round)
    return false;

  return !xmit_ttls[ttl];
}
/* Timer callback: the current round of probes went unanswered for
 * too long.  Gives up after the fifth timeout, or earlier when the
 * firewall heuristic below fires; otherwise remembers which TTLs were
 * tried, clears the outstanding set and starts another round. */
void
traceroute::timeout ()
{
  tmo = NULL;
#if NETPATH_VERBOSE
  verbose.fmt ("=== TIMEOUT %d ===\n", ntmo + 1);
#endif /* NETPATH_VERBOSE */
  assert (ntmo < 5);

  const int previous = ntmo;
  ntmo = previous + 1;
  if (previous > 3) {
    finish ();
    return;
  }

  const bool stalled = ntmo > 1 && hops_found == tmo_lastfound
    && hops_max > 0 && hops_total == -1;
  if (stalled) {
    /* If there's a firewall, we'll never get a port unreachable
     * message, and hence never know hops_total.  In such cases, we
     * can only terminate through timeouts, and so want to timeout
     * more quickly.  Heuristic: if every TTL up to maxprobes hops
     * beyond the furthest answering hop (bounded by maxhops and by a
     * positive hops_req) has already been probed, quit now. */
    int highprobe = maxhops;
    if (hops_req > 0 && hops_req < highprobe)
      highprobe = hops_req;
    highprobe = min (highprobe, hops_max + maxprobes);

    bool fastquit = true;
    for (int i = hops_max + 1; i <= highprobe; i++)
      if (!xmit_ttls[i]) {
        fastquit = false;
        break;
      }
    if (fastquit) {
      finish ();
      return;
    }
  }

  tmo_lastfound = hops_found;
  oxmit_ttls = xmit_ttls;
  xmit_ttls.setrange (0, xmit_ttls.size (), 0);
  xmit_count = 0;
  xmit ();
}
/* Fill *ts with the delay to wait after ntmo earlier timeouts:
 * 100ms before the first, 150ms before the second, and 250ms for any
 * other value of ntmo. */
inline void
tmoval (timespec *ts, int ntmo)
{
  long nsec;
  if (ntmo == 0)
    nsec = 100000000;
  else if (ntmo == 1)
    nsec = 150000000;
  else
    nsec = 250000000;

  ts->tv_sec = 0;
  ts->tv_nsec = nsec;
}
/* Probe TTLs around start until maxprobes probes are outstanding.
 * Each pass walks downwards from start to low and then upwards from
 * start to high (high == 0 disables the upward walk; a nonzero high
 * also caps start).  Pass 0 skips TTLs already tried in the previous
 * round; pass 1 retries them. */
void
traceroute::proberange (int start, int low, int high)
{
  assert (start >= 1);
  assert (low >= 1);
  assert (high <= maxhops);
  assert (hops_total == -1 || high <= hops_total);

  if (high && start > high)
    start = high;

  for (int prio = 0; prio <= 1; prio++) {
    if (xmit_count >= maxprobes)
      break;

    int ttl = start;
    while (ttl >= low && xmit_count < maxprobes) {
      if (shouldprobe (prio, ttl))
        probe (ttl);
      ttl--;
    }

    ttl = start;
    while (ttl <= high && xmit_count < maxprobes) {
      if (shouldprobe (prio, ttl))
        probe (ttl);
      ttl++;
    }
  }
}
/* Send the next batch of probes, chosen from what is known so far:
 *
 *   hops_req > 0    - probe from the requested hop (capped by a known
 *                     path length) down to hop 1;
 *   hops_total > 0  - path length known: probe from the last hop
 *                     down, stopping |hops_req| hops short of the end
 *                     when hops_req is negative and fits;
 *   hops_max != 0   - something answered: probe around
 *                     hops_max + maxprobes/2, up to maxhops;
 *   ntmo != 0       - nothing answered yet but a timeout occurred:
 *                     probe around maxprobes;
 *   otherwise       - very first round: probe maxhops, 16 and 10.
 *
 * Finishes the traceroute when nothing is left outstanding; otherwise
 * makes sure a timeout is scheduled. */
void
traceroute::xmit ()
{
  if (hops_req > 0)
    proberange (hops_total > 0 ? min (hops_req, hops_total) : hops_req,
                1, 0);
  else if (hops_total > 0) {
    int low = 1;
    if (hops_req && hops_total + hops_req >= 0)
      low = hops_total + hops_req + 1;
    proberange (hops_total, low, 0);
  }
  else if (hops_max) {
    int low = 1;
    if (hops_req && hops_max + hops_req >= 0)
      low = hops_max + hops_req + 1;
    proberange (hops_max + maxprobes/2, low, maxhops);
  }
  else if (ntmo)
    proberange (maxprobes, 1, maxhops);
  else {
    probe (maxhops);
    if (maxhops > 16)
      probe (16);
    if (maxhops > 10)
      probe (10);
  }

  if (!xmit_count) {
    finish ();
    return;
  }

  if (tmo)
    return;
  timespec ts;
  tmoval (&ts, ntmo);
  tmo = delaycb (ts.tv_sec, ts.tv_nsec, wrap (this, &traceroute::timeout));
}
void
traceroute::getpkt (int hopno, in_addr addr, bool last)
{
#if NETPATH_VERBOSE
verbose.fmt ("%d <- getpkt % 2d%s <- ", xmit_count,
hopno, last ? "*" : " ") << inet_ntoa (addr) << "\n";
#endif /* NETPATH_VERBOSE */
if (hopno < 1 || hopno > maxhops || (hops_total > 0 && hopno > hops_total))
return;
if (xmit_count > 0 && xmit_ttls[hopno]) {
xmit_count--;
xmit_ttls[hopno] = false;
}
if (hopno > hops_max) {
if (last) {
hops_total = hopno;
xmit_count = 0;
for (int i = 0; i < hops_total; i++)
if (xmit_ttls[i])
xmit_count++;
}
hops_max = hopno;
hops.setsize (hopno + 1);
}
if (!hops[hopno].s_addr) {
hops[hopno].s_addr = addr.s_addr;
hops_found++;
}
xmit ();
}
/* ICMP callback: ii describes an ICMP message quoting one of our UDP
 * probes.  Recovers the TTL the probe was sent with, checks it
 * against the IP id recorded for that TTL, and passes the result to
 * getpkt: ICMP_UNREACH as the end of the path, time-exceeded-in-
 * transit as an intermediate hop.  Anything else is ignored. */
void
traceroute::rcb (icmpsock::icmp_info *ii)
{
  // int hopno = ntohs (ii->udphp->uh_ulen) - sizeof (struct udphdr);
  int hopno;
  if (use_dstport)
    hopno = ntohs (ii->udphp->uh_dport) - baseport;
  else
    hopno = ntohs (ii->udphp->uh_sum);

  /* FreeBSD for some reason zeroes out the checksum of a returned UDP
   * packet inside an ICMP packet.  Thus, we search for the packet
   * (from the end, since this is most likely to be the end node). */
  if (!use_dstport && !hopno && ii->iphp->ip_id) {
    /* probe () sends 2 + (ttl & 0xf) payload bytes, so the UDP length
     * yields the low four bits of the TTL (0..15).  Use int
     * arithmetic so that a short length becomes negative instead of
     * wrapping through size_t. */
    hopno = int (ntohs (ii->udphp->uh_ulen))
      - int (sizeof (struct udphdr)) - 2;
    if (hopno < 0 || hopno > 0xf) {
      warn ("bad traceroute UDP len %d\n", ntohs (ii->udphp->uh_ulen));
      return;
    }
    /* Start at the largest TTL <= maxhops with these low bits and
     * step down 16 at a time until the recorded IP id matches. */
    hopno += ((maxhops + 0xf) & ~0xf);
    while (hopno > maxhops)
      hopno -= 0x10;
    u_int16_t id = ntohs (ii->iphp->ip_id);
    while (hopno > 0 && ids[hopno] != id)
      hopno -= 0x10;
    if (hopno <= 0) {
      warn ("bad traceroute IP id for %s\n", inet_ntoa (dest.sin_addr));
      return;
    }
  }

#if 0
  warn ("(%d/%d) %d %s (found %d)\n", ii->type, ii->code, hopno,
        inet_ntoa (ii->pkthdrp->ip_src), hops_found);
#endif

  if (hopno < 1 || hopno > maxhops) {
    warn ("bad traceroute hopno %d for %s\n", hopno,
          inet_ntoa (dest.sin_addr));
    return;
  }
  if (ntohs (ii->iphp->ip_id) != ids[hopno]) {
    warn ("bad traceroute IP id for %s (hop %d)\n",
          inet_ntoa (dest.sin_addr), hopno);
    return;
  }

  if (ii->type == ICMP_UNREACH) {
    /* The quoted header carries the TTL the probe had left on
     * arrival; subtracting it from the TTL sent gives the real
     * distance of the node that answered. */
    hopno = hopno - ii->iphp->ip_ttl + 1;
    if (hopno < 1 || hopno > maxhops) {
      warn ("bad traceroute unreach for %s (hopno %d)\n",
            inet_ntoa (dest.sin_addr), hopno);
      return;
    }
    getpkt (hopno, ii->pkthdrp->ip_src, true);
  }
  else if (ii->type == ICMP_TIMXCEED && ii->code == ICMP_TIMXCEED_INTRANS)
    getpkt (hopno, ii->pkthdrp->ip_src, false);
}
void
traceroute::fail ()
{
while (!cbvec.empty ())
(*cbvec.pop_front ()) (-1, NULL, -1);
delete this;
}
void
traceroute::finish ()
{
#if NETPATH_VERBOSE
warnx << verbose;
#endif /* NETPATH_VERBOSE */
int first = 1;
int last = hops_total > 0 ? hops_total : hops_max;
if (last <= 0) {
fail ();
return;
}
if (hops_req > 0)
last = min (hops_req, last);
if (hops_req < 0)
first = max (1, last + hops_req + 1);
while (!cbvec.empty ())
(*cbvec.pop_front ()) (hops_total, hops.base () + first, last - first + 1);
delete this;
}
/* Handle returned by ifchgcb when netpath registers netpath_reset;
 * also serves as the "already registered" flag. */
static ifchgcb_t *rebind;
/* Socket set shared by all traceroutes started through netpath;
 * created on demand. */
static ptr<icmpsock> is;
/* Drop the shared socket reference so that the next netpath call
 * builds and initializes a fresh icmpsock. */
void
netpath_reset ()
{
is = NULL;
}
/* Start a traceroute to *destp and return it.
 *
 *   hops - requested hop count, passed to the traceroute constructor
 *   cb   - completion callback
 *   srcp - optional source address for the probes
 *
 * The shared icmpsock is created on first use, trying init and then
 * init_plab.  If both fail, cb is invoked with (-1, NULL, -1) and
 * NULL is returned.  netpath_reset is registered once through
 * ifchgcb. */
traceroute *
netpath (const sockaddr_in *destp, int hops, netpathcb_t cb,
         const sockaddr_in *srcp)
{
  if (!is) {
    sockaddr_in anyaddr;
    bzero (&anyaddr, sizeof (anyaddr));
    anyaddr.sin_family = AF_INET;

    is = New refcounted<icmpsock>;
    const bool plain_ok = is->init (&anyaddr);
    if (!plain_ok) {
      const bool plab_ok = is->init_plab (&anyaddr);
      if (!plab_ok) {
        warn ("could not initialize RAW sockets for netpath\n");
        is = NULL;
        (*cb) (-1, NULL, -1);
        return NULL;
      }
      warn ("using planet-lab protected raw sockets\n");
    }

    if (!rebind)
      rebind = ifchgcb (wrap (netpath_reset));
  }

  return New traceroute (is, destp, hops, cb, srcp);
}
/* Abandon the traceroute trp by deleting it.  No callback is invoked
 * here; the destructor removes any pending timeout. */
void
netpath_cancel (traceroute *trp)
{
delete trp;
}
/* Attach the additional callback cb to the running traceroute trp by
 * forwarding to traceroute::addcb. */
void
netpath_addcb (traceroute *trp, netpathcb_t cb)
{
trp->addcb (cb);
}
/* Number of results the test driver is still waiting for. */
static int respending;

/* Test-driver callback: print the outcome for host name / addr.
 * nhops is the total path length when positive; av holds the n hop
 * addresses being reported.  When nhops is positive the hops are
 * numbered so that the last one printed is hop nhops, otherwise from
 * 1.  Exits the process once no results remain pending. */
static void
result (str name, in_addr addr, int nhops, in_addr *av, int n)
{
  aout << strbuf ("dest %s (%s)\n", name.cstr (), inet_ntoa (addr));
  if (nhops > 0)
    aout << strbuf ("%d hops total\n", nhops);

  if (n <= 0)
    aout << "failed\n";
  else {
    const int firsthop = nhops <= 0 ? 1 : nhops - n + 1;
    for (int i = 0; i < n; i++)
      aout << strbuf ("% 2d ==> %s\n", firsthop + i, inet_ntoa (av[i]));
  }

  if (--respending <= 0)
    exit (0);
}
/* Print the command-line synopsis for the --netpath test mode and
 * exit with status 1.  Never returns. */
static void netpath_usage () __attribute__ ((noreturn));
static void
netpath_usage ()
{
warnx << "usage: " << progname << " --netpath host [nhops]\n";
exit (1);
}
/* Entry point for the --netpath test mode.  argv is stepped back by
 * one element (and argc raised to match) before being interpreted, so
 * the caller is expected to pass the arguments that follow the
 * option -- TODO confirm against the caller.  After that adjustment
 * argv[1] is the host to trace and the optional argv[2] the hop
 * count.  Resolves the host, starts one traceroute and leaves the
 * printing and the exit to result. */
void
netpath_test (int argc, char **argv)
{
  ++argc;
  --argv;

  const bool nargs_ok = argc == 2 || argc == 3;
  if (!nargs_ok)
    netpath_usage ();

  int nhops = 0;
  if (argc == 3)
    nhops = atoi (argv[2]);

  struct hostent *hp = gethostbyname (argv[1]);
  if (!hp)
    fatal << argv[1] << ": no such host\n";

  sockaddr_in to;
  bzero (&to, sizeof (to));
  to.sin_family = AF_INET;
  to.sin_addr = *(in_addr *) hp->h_addr;

  respending = 1;
  netpath (&to, nhops, wrap (result, argv[1], to.sin_addr));
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */