/* $Id: surblhost.c 154 2007-04-09 15:10:13Z cslsublevel3org $
 *
 * Copyright (C) 2006, 2007 Christian Stigen Larsen <csl@sublevel3.org>
 * Distributed under the GNU General Public License v2.
 *
 * http://csl.sublevel3.org
 *
 */

#if HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if HAVE_NETDB_H
#include <netdb.h>
#endif

#if HAVE_SYS_TYPES_H
#include <sys/types.h>
#endif

#if HAVE_SYS_SOCKET_H
#include <sys/socket.h>
#endif

#if HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif

#if HAVE_ARPA_INET_H
#include <arpa/inet.h>
#endif

#if HAVE_ARPA_NAMESER_H
#include <arpa/nameser.h>
#endif

#if HAVE_RESOLV_H
#include <resolv.h>
#endif

#if HAVE_NAMESER8_COMPAT_H
#include <nameser8_compat.h>
#endif

/*
 * Run-time configuration for the whole program.
 *
 * Initialized by set_defaults(), filled in from the command line by
 * parse_options(), and read everywhere through `global_options'.
 */
static struct options_ {
	const char* surbl; // DNS suffix appended to every looked-up name; "" means append nothing
	const char* tlds_file; // file named by --tlds= / --tlds-add=, or NULL
	const char* wlist_file; // file named by --whitelist= / --whitelist-add=, or NULL
	const char** ptlds; // NULL-terminated two-level TLD list in use (built-in or read from tlds_file)
	const char** pwhitelist; // NULL-terminated whitelist in use (built-in or read from wlist_file)
	int check_whitelist; // --check-whitelist: look up the whitelist entries themselves
	int hosts; // number of command-line arguments that do not start with '-'
	int print_tlds; // --print-tlds
	int print_whitelist; // --print-whitelist
	int quiet; // -q / --quiet
	int read_stdin; // a lone "-" argument: read hostnames from standard input
	int recursive_strip; // -r / --recursive: also look up each parent domain
	int run_test; // --test: look up the fixed test name and exit
	int skip_whitelist; // -f / --skip-whitelist: do not consult the whitelist
	int strip_subdomains; // -s / --strip-sub: reduce the host to its base domain first
	int tlds_add; // --tlds-add=: consult the built-in TLD list in addition to the file
	int verbose; // verbosity level; -v adds 1, -vv adds 2, forced to 0 by --quiet
	int wlist_add; // --whitelist-add=: consult the built-in whitelist in addition to the file
} global_options;

// Identification strings printed by help() and by -V / --version.
// PACKAGE_STRING is not defined in this file (presumably it comes from config.h).
static const char version[] = PACKAGE_STRING;
static const char copyright[] = "Copyright (C) 2006, 2007 Christian Stigen Larsen <csl@sublevel3.org>";
static const char license[] = "Distributed under the GNU General Public License v2";

// h_errno is declared here but not referenced anywhere else in this file.
// The two lists below are NULL-terminated string arrays defined in other
// translation units; check_options() sorts both of them in place.
extern int h_errno;
extern const char* whitelist[]; // whitelist.c
extern const char* two_level_tlds[]; // two-level-tlds.c

// Forward declaration: check_hostname() calls this before its definition.
static int is_twolevel_tld(const char* host, const char** tldlist);

static void help()
{
	fprintf(stderr, "%s\n%s\n%s", version, copyright, license);
	fputs("\n\n"
"USAGE:  surblhost [option(s)]  [hostname(s)]\n"
"\n"
"Checks if given hostnames are blocklisted by surbl.org,\n"
"meaning that spam mail have links to the given site.\n"
"\n"
"OPTIONS\n"
"  -                      read hostnames from standard input, one per line\n"
"      --check-whitelist  checks if the whitelist is indeed ok\n"
"  -h, --help             print help\n"
"      --no-surbl         do not add .multi.surbl.org to checked hostnames\n"
"      --print-tlds       print current two-level tld list\n"
"      --print-whitelist  print current whitelist\n"
"  -q, --quiet            run silently\n"
"  -r, --recursive        check all subdomains, e.g. foo.bar.com and bar.com\n"
"  -f, --skip-whitelist   ignore whitelist (force check)\n"
"  -s, --strip-sub        strip subdomains, e.g. abc.def.com -> def.com\n"
"      --surbl=...        use another dns suffix than multi.surbl.org\n"
"      --test             check permanent test point, should be blocklisted\n"
"      --tlds-add=...     as --tlds=... but adds entries to precompiled list\n"
"      --tlds=...         read two-level tld from file, one entry per line\n"
"  -v, --verbose          verbose output\n"
"  -vv                    more verbose output\n"
"  -V  --version          print program version\n"
"      --whitelist-add=... as --whitelist but adds entries to precompiled list\n"
"      --whitelist=...    read whitelist from given file, one entry per line\n"
"\n"
"NOTES\n"
"   You should only check the base site name and TLD, i.e., if you want to\n"
"   investigate http://some.place.com, then you will just submit `place.com'\n"
"   to surblhost.  For more information, see http://www.surbl.org\n"
"\n"
"EXAMPLES\n"
"   surblhost -v somesite.com\n"
"   surblhost --test --verbose\n"
"   surblhost evilsite1.com evilsite2.com goodside1.com\n"
"   surblhost foo.com -q && echo whitelisted site || echo blocklisted site\n"
"\n"
"RETURN VALUES\n"
"   0 = no sites blocklisted\n"
"   1 = error\n"
"   2 = one or more site blocklisted\n"
"\n"
"Report bugs to <" PACKAGE_BUGREPORT ">\n"
, stderr);
}

/*
 * Print the verdict for one SURBL answer and report whether it is a hit.
 *
 * `code' is the last octet of an address returned for the looked-up name
 * (see lookup()); it is treated as a bit mask naming the lists that
 * contain the host.  Any value >= 2 counts as "blocklisted".
 *
 * Output goes to stdout and always ends with a newline.  With
 * global_options.verbose == 1 the short list tags are appended, with
 * verbose > 1 the long list names.
 *
 * Returns 1 if blocklisted, 0 otherwise.
 */
static int check_blocklist(const int code)
{
	// bit -> list name, in the order the names are printed
	static const struct {
		int bit;
		const char* long_name;
		const char* short_name;
	} lists[] = {
		{ 64, "prolocation/jwspamspy ", "[jp]" },
		{ 32, "abusebutler ",           "[ab]" },
		{ 16, "outblaze ",              "[ob]" },
		{  8, "phishing ",              "[ph]" },
		{  2, "spamcop ",               "[sc]" },
		{  4, "w.stearns ",             "[ws]" },
	};

	const int v = global_options.verbose;
	const int listed = code >= 2;
	size_t i;

	// fputs instead of printf: the text is not a format string
	if ( listed )
		fputs(v? "is blocklisted by " : "is blocklisted", stdout);

	if ( v ) {
		for ( i = 0; i < sizeof(lists)/sizeof(lists[0]); ++i ) {
			if ( code & lists[i].bit )
				fputs(v>1? lists[i].long_name : lists[i].short_name, stdout);
		}
	}

	putchar('\n');
	return listed;
}

/*
 * Returns 1 if `s' consists of exactly four dot-separated decimal
 * numbers in the range 0..255, and 0 otherwise.
 *
 * The whole string must be consumed, so a hostname that merely starts
 * like an address (e.g. "1.2.3.4.com") is not mistaken for one.
 * Numbers are parsed with sscanf's %d conversion.
 */
inline
static int is_ipaddr(const char* s)
{
	int octet[4];
	int used = 0; // number of characters consumed by sscanf (via %n)
	int i;

	if ( sscanf(s, "%d.%d.%d.%d%n",
		&octet[0], &octet[1], &octet[2], &octet[3], &used) != 4 )
		return 0;

	// reject trailing text after the fourth number
	if ( s[used] != '\0' )
		return 0;

	for ( i = 0; i < 4; ++i ) {
		if ( octet[i] < 0 || octet[i] > 255 )
			return 0;
	}

	return 1;
}

/*
 * Warn on stderr when `s' contains more dots than a base domain should:
 * more than two for hosts under a two-level TLD, more than one otherwise.
 *
 * Nothing is checked when no surbl suffix is in use, when `s' looks like
 * an IP address, or when recursive stripping is enabled.
 */
static void check_hostname(const char* s)
{
	const char* p;
	int allowed;
	int seen = 0;

	if ( *global_options.surbl == 0 )
		return;

	if ( is_ipaddr(s) )
		return;

	if ( global_options.recursive_strip )
		return;

	allowed = 1;
	if ( is_twolevel_tld(s, global_options.ptlds) )
		allowed = 2;

	for ( p = s; *p != '\0'; ++p ) {

		if ( *p != '.' )
			continue;

		++seen;

		if ( seen > allowed ) {
			fprintf(stderr, "warning: hostname contains subdomains: %s (try option -r)\n", s);
			return;
		}
	}
}

// Counts the dot-separated parts of `s': one more than the number of
// dots, or 0 for the empty string.
// Example: num_domains("a.b.c") == 3, num_domains("a.b") == 2
static int num_domains(const char* s)
{
	const char* p;
	int parts;

	if ( *s == '\0' )
		return 0;

	parts = 1;

	for ( p = s; *p != '\0'; ++p ) {
		if ( *p == '.' )
			parts += 1;
	}

	return parts;
}

/*
 * Returns a pointer into `orig' at the start of its last
 * `output_domains' dot-separated parts, e.g.
 *
 *   remove_subdomains("a.b.c", 2) -> "b.c"
 *   remove_subdomains("a.b.c", 1) -> "c"
 *
 * If `orig' has no more than `output_domains' parts it is returned
 * unchanged.  For output_domains <= 0 the result is the empty string at
 * the end of `orig' (previously 0 yielded a pointer past the terminator).
 *
 * No memory is allocated; the result shares storage with `orig'.
 */
static const char* remove_subdomains(const char* orig, const int output_domains)
{
	const char* const end = orig + strlen(orig);
	const char* p;
	int dots = 0;

	if ( output_domains <= 0 )
		return end;

	// scan backwards; the first character is never treated as a separator
	for ( p = end; p > orig; --p ) {
		if ( *p == '.' && ++dots == output_domains )
			return p + 1;
	}

	return orig;
}

/*
 * This function is taken from the freshclam Clam Antivirus
 * package, Copyright (C) 2004 Tomasz Kojm <tkojm@clamav.net>,
 * and released under the GPL v2.
 *
 * Queries the DNS TXT record of `domain' and returns the first
 * character-string of the first answer as a newly allocated,
 * NUL-terminated string that the caller must free().
 *
 * Returns NULL on any failure (a diagnostic is printed to stdout), or
 * when the program was built without ENABLE_DNSTXT / HAS_WORKING_RESOLV.
 * `*ttl' receives the record's TTL; it may already have been written
 * when a later check fails and NULL is returned.
 */
static char* txtquery(const char* domain, unsigned int* ttl)
{
#if ENABLE_DNSTXT && HAS_WORKING_RESOLV
	static int res_initialized = 0;
	unsigned char answer[PACKETSZ], *pt, *eom;
	char *txt;
	char host[128];
	int len, explen, cttl, size, txtlen, type;

	if ( !res_initialized ) {

		if ( res_init() < 0 ) {
			printf("res_init failed\n");
			return NULL;
		}

		res_initialized = 1;
	}

	memset(answer, 0, sizeof(answer));

	if ( (len = res_query(domain, ns_c_in, ns_t_txt, answer, PACKETSZ)) < 0 ) {
		printf("can't query %s\n", domain);
		return NULL;
	}

	// never trust a reported length larger than our buffer
	if ( len > PACKETSZ )
		len = PACKETSZ;

	if ( len < (int)sizeof(HEADER) ) {
		printf("broken DNS reply.\n");
		return NULL;
	}

	eom = answer + len;
	pt = answer + sizeof(HEADER);

	// question section: name, type, class
	if ( (explen = dn_expand(answer, eom, pt, host, sizeof(host))) < 0 ) {
		printf("dn_expand failed\n");
		return NULL;
	}

	pt += explen;

	if ( eom - pt < 2*INT16SZ ) {
		printf("broken DNS reply.\n");
		return NULL;
	}

	GETSHORT(type, pt);
	if ( type != T_TXT ) {
		printf("broken DNS reply.\n");
		return NULL;
	}

	pt += INT16SZ; // class

	// first answer: name, type, class, ttl, rdlength, rdata
	if ( (explen = dn_expand(answer, eom, pt, host, sizeof(host))) < 0 ) {
		printf("second dn_expand failed\n");
		return NULL;
	}

	pt += explen;

	// type + class + rdlength, the ttl, and the TXT length octet must be present
	if ( eom - pt < 3*INT16SZ + INT32SZ + 1 ) {
		printf("broken DNS reply.\n");
		return NULL;
	}

	GETSHORT(type, pt);
	if ( type != T_TXT ) {
		printf("not a TXT record\n");
		return NULL;
	}

	pt += INT16SZ; // class

	GETLONG(cttl, pt);
	*ttl = cttl;
	GETSHORT(size, pt);
	txtlen = *pt; // length octet of the first character-string

	if ( txtlen >= size || !txtlen ) {
		printf("broken TXT record (txtlen = %d, size = %d)\n", txtlen, size);
		return NULL;
	}

	pt++;

	// the announced text must lie inside the received packet
	if ( eom - pt < txtlen ) {
		printf("broken TXT record (txtlen = %d, size = %d)\n", txtlen, size);
		return NULL;
	}

	if ( !(txt = malloc(txtlen + 1)) )
		return NULL;

	memcpy(txt, pt, txtlen);
	txt[txtlen] = 0;

	return txt;
#else
	// DNS TXT support is not compiled in
	(void)domain;
	(void)ttl;
	return NULL;
#endif
}

/*
 * Comparison callback for qsort()/bsearch() on arrays of C strings:
 * both arguments point at `const char*' elements, which are compared
 * with strcmp().
 */
static int compmi(const void *m1, const void *m2)
{
	const char* const* lhs = m1;
	const char* const* rhs = m2;

	return strcmp(*lhs, *rhs);
}

/*
 * Returns the number of entries in the NULL-terminated string list `p',
 * not counting the terminating NULL.
 */
static size_t listsize(const char** p)
{
	const char** cursor = p;

	while ( *cursor != NULL )
		++cursor;

	return (size_t)(cursor - p);
}

/*
 * Returns 1 if `host' is in `wlist'
 * Returns 0 if not found
 *
 * `wlist' is a NULL-terminated list that must already be sorted with
 * compmi(), because it is searched with bsearch().
 *
 * Leading subdomains are stripped one at a time and every resulting
 * name with at least two parts is tried, example:
 *
 * www.google.com -> www.google.com, google.com
 * evil.spam.com  -> evil.spam.com, spam.com
 *
 * Names are only shortened from the left; a single-part name is
 * never looked up.
 *
 */
static int in_whitelist(const char* host, const char** wlist)
{
	const int entries = listsize(wlist);
	int parts = num_domains(host);

	while ( parts > 1 ) {
		const char* candidate = remove_subdomains(host, parts);

		if ( bsearch(&candidate, wlist, entries, sizeof(const char*), compmi) )
			return 1;

		--parts;
	}

	return 0;
}

/*
 * Returns 1 if the last two parts of `host' (e.g. "co.uk") appear in
 * `tldlist', a NULL-terminated list sorted with compmi().
 *
 * When global_options.tlds_add is set and `tldlist' is not the built-in
 * two_level_tlds list, the built-in list is searched as well.
 * Returns 0 otherwise.
 */
static int is_twolevel_tld(const char* host, const char** tldlist)
{
	const int entries = listsize(tldlist);

	if ( num_domains(host) >= 2 ) {
		const char* suffix = remove_subdomains(host, 2);
		const void* hit = bsearch(&suffix, tldlist, entries,
			sizeof(const char*), compmi);

		if ( hit != NULL )
			return 1;
	}

	// no match in this list; nothing else to try unless --tlds-add is active
	if ( !global_options.tlds_add )
		return 0;

	if ( tldlist == two_level_tlds )
		return 0;

	return is_twolevel_tld(host, two_level_tlds);
}

/*
 * Prints every entry of the NULL-terminated list `wlist' on its own
 * line on stdout.
 */
inline
static void print_list(const char** wlist)
{
	const char** entry;

	for ( entry = wlist; *entry != NULL; ++entry )
		puts(*entry);
}

/*
 * Looks up one host in the blocklist and prints the result on stdout.
 *
 * Steps:
 *  - unless skip_whitelist is set, a whitelisted host is reported as
 *    such and 0 is returned without any DNS query;
 *  - check_hostname() may print a warning about subdomains;
 *  - the query name is built: a dotted-quad IP address is reversed,
 *    anything else is used as is, and "." + global_options.surbl is
 *    appended when that suffix is non-empty;
 *  - every address returned by gethostbyname() is decoded with
 *    check_blocklist().
 *
 * Returns the number of returned addresses that count as blocklisted
 * (0 if the name does not resolve).  Exits with status 1 if memory for
 * the query name cannot be allocated.
 */
static int lookup(const char* host)
{
	char *query;
	size_t query_size;
	int adr[4];
	int used = 0; // characters consumed by sscanf (via %n)
	int is_ip;
	struct hostent *answer;
	char **addr;
	int hits = 0;

	if ( !global_options.skip_whitelist ) {
		int listed = in_whitelist(host, global_options.pwhitelist);

		// --whitelist-add: consult the precompiled whitelist as well
		if ( global_options.wlist_add )
			listed |= in_whitelist(host, whitelist);

		if ( listed ) {
			fprintf(stdout, "%s is whitelisted\n", host);
			return 0;
		}
	}

	check_hostname(host);

	// room for host + '.' + suffix + terminator
	query_size = strlen(host) + strlen(global_options.surbl) + 2;

	if ( !(query = malloc(query_size)) ) {
		fputs("error: not enough memory for lookup name\n", stderr);
		exit(1);
	}

	// only a complete, in-range dotted quad is an address; something
	// like "1.2.3.4.com" is an ordinary hostname and must stay intact
	is_ip = sscanf(host, "%d.%d.%d.%d%n",
			&adr[0], &adr[1], &adr[2], &adr[3], &used) == 4
		&& host[used] == '\0'
		&& adr[0] >= 0 && adr[0] <= 255
		&& adr[1] >= 0 && adr[1] <= 255
		&& adr[2] >= 0 && adr[2] <= 255
		&& adr[3] >= 0 && adr[3] <= 255;

	if ( is_ip ) {
		// reverse numeric addresses
		snprintf(query, query_size, "%d.%d.%d.%d", adr[3], adr[2], adr[1], adr[0]);
	} else
		snprintf(query, query_size, "%s", host);

	if ( *global_options.surbl ) {
		const size_t len = strlen(query);
		snprintf(query + len, query_size - len, ".%s", global_options.surbl);
	}

	fprintf(stdout, "%s ", global_options.verbose? query : host);

	answer = gethostbyname(query);

	if ( !answer ) { // TODO: check for explicit NXDOMAIN
		fputs("is not blocklisted\n", stdout);
	} else {
		for ( addr = answer->h_addr_list; *addr != NULL; ++addr ) {
			int bl;

			#if HAVE_INET_NTOA
			if ( global_options.verbose > 1 ) { // print ip address
				struct in_addr ia;
				memcpy(&ia, *addr, sizeof(ia));
				fprintf(stdout, "%s ", inet_ntoa(ia));
			}
			#endif

			// the last octet carries the list bits; read it as unsigned
			// so values above 127 do not turn negative
			bl = check_blocklist( (unsigned char) (*addr)[3] );

			#if ENABLE_DNSTXT
			if ( bl>0 && global_options.verbose>1 ) {
				unsigned int ttl;
				char *txt = txtquery(query, &ttl);

				if ( txt ) {
					fprintf(stdout, "%s txt record: %s\n", query, txt);
					free(txt);
				}
			}
			#endif

			hits += bl;
		}
	}

	free(query);
	return hits;
}

// Performs lookup() on hostname `s' with all parameters in effect:
//  - default: one lookup, on the base domain if strip_subdomains is set
//    (three parts under a two-level TLD, two parts otherwise)
//  - recursive_strip: one lookup for `s' and for each parent domain,
//    stopping before the bare TLD is reached
// returns 0 if not in blocklist, or number of hits in blocklist
static int full_lookup(const char* s)
{
	int parts;
	int lowest;
	int total = 0;

	if ( !global_options.recursive_strip ) {

		if ( !global_options.strip_subdomains )
			return lookup(s);

		if ( is_twolevel_tld(s, global_options.ptlds) )
			return lookup( remove_subdomains(s, 3) );

		return lookup( remove_subdomains(s, 2) );
	}

	parts = num_domains(s);
	lowest = is_twolevel_tld(s, global_options.ptlds) ? 2 : 1;

	while ( parts > lowest ) {
		total += lookup( remove_subdomains(s, parts) );
		--parts;
	}

	return total;
}

/*
 * Strips one trailing line ending ("\n" or "\r\n") from `s' in place
 * and returns `s'.
 *
 * An empty string is returned untouched; the length check keeps the
 * function from reading or writing the byte before the buffer.
 */
inline
static char* remove_newline(char* s)
{
	size_t len = strlen(s);

	if ( len > 0 && s[len-1] == '\n' ) {
		s[--len] = '\0';

		// drop the carriage return of a CRLF line ending as well
		if ( len > 0 && s[len-1] == '\r' )
			s[--len] = '\0';
	}

	return s;
}

/*
 * Reads `file' into a newly allocated, NULL-terminated list of strings,
 * one entry per line with the line ending removed.
 *
 * Returns NULL if the file cannot be opened.  An empty file yields an
 * empty list (only the NULL terminator), not NULL.  Running out of
 * memory prints an error and exits with status 1.
 *
 * Lines are read through a 512-byte buffer, so a longer line ends up
 * split into several entries.  The list is not sorted here, and neither
 * the list nor its strings are ever freed by this program.
 */
static const char** read_list(const char* file)
{
	int lsize = 100; // current capacity, not counting the NULL terminator
	const char **plist;
	FILE *f;
	int count = 0;
	char buf[512];

	if ( (f = fopen(file, "r")) == NULL )
		return NULL;

	if ( !(plist = malloc( sizeof(*plist) * (lsize+1) )) ) {
		fputs("error: not enough memory to hold list\n", stderr);
		exit(1);
	}

	plist[0] = NULL;

	while ( fgets(buf, sizeof(buf), f) ) {
		char *s;

		remove_newline(buf);

		if ( !(s = malloc(strlen(buf) + 1)) ) {
			fputs("error: not enough memory to hold list\n", stderr);
			exit(1);
		}

		strcpy(s, buf);

		plist[count] = s;
		plist[count+1] = NULL;

		// grow before the next entry would no longer fit
		if ( ++count >= lsize ) {
			const char **grown;

			lsize *= 2;

			if ( !(grown = realloc( plist, sizeof(*plist) * (lsize+1) )) ) {
				fputs("error: not enough memory to grow list\n", stderr);
				exit(1);
			}

			plist = grown;
		}
	}

	fclose(f);

	if ( global_options.verbose ) {
		fprintf(stderr, "%s %d lines from %s\n",
			(count? "read" : "warning: read"), count, file);
	}

	return plist;
}

/*
 * Resets every field of `*p' to its default: the multi.surbl.org
 * suffix, the built-in whitelist and two-level TLD list, no list
 * files, and all flags and counters cleared.
 */
static void set_defaults(struct options_ *p)
{
	// strings and lists
	p->surbl = "multi.surbl.org";
	p->wlist_file = NULL;
	p->tlds_file = NULL;
	p->pwhitelist = whitelist; // hardcoded
	p->ptlds = two_level_tlds; // hardcoded

	// flags and counters
	p->check_whitelist = 0;
	p->hosts = 0;
	p->print_tlds = 0;
	p->print_whitelist = 0;
	p->quiet = 0;
	p->read_stdin = 0;
	p->recursive_strip = 0;
	p->run_test = 0;
	p->skip_whitelist = 0;
	p->strip_subdomains = 0;
	p->tlds_add = 0;
	p->verbose = 0;
	p->wlist_add = 0;
}

// Returns the text following `prefix' if `arg' starts with it, else NULL.
static const char* option_value(const char* arg, const char* prefix)
{
	const size_t len = strlen(prefix);

	return strncmp(arg, prefix, len) == 0 ? arg + len : NULL;
}

/*
 * Fills `*p' from the command line.
 *
 * Arguments not starting with '-' are only counted (p->hosts); main()
 * picks them up again later.  A lone "-" selects reading from stdin.
 * -h/--help and -V/--version print to stderr and exit with status 0;
 * an unknown option prints an error plus the help text and exits with
 * status 1.
 */
static void parse_options(int argc, char** argv, struct options_ *p)
{
	int n;

	for ( n = 1; n < argc; ++n ) {
		const char* arg = argv[n];
		const char* value;

		if ( arg[0] != '-' ) {
			p->hosts += 1;

		} else if ( arg[1] == '\0' ) {
			p->read_stdin = 1;

		} else if ( (value = option_value(arg, "--surbl=")) != NULL ) {
			p->surbl = value;

		} else if ( strcmp(arg, "-f") == 0 || strcmp(arg, "--skip-whitelist") == 0 ) {
			p->skip_whitelist = 1;

		} else if ( strcmp(arg, "--print-tlds") == 0 ) {
			p->print_tlds = 1;

		} else if ( strcmp(arg, "--print-whitelist") == 0 ) {
			p->print_whitelist = 1;

		} else if ( (value = option_value(arg, "--whitelist=")) != NULL ) {
			p->wlist_file = value;

		} else if ( (value = option_value(arg, "--whitelist-add=")) != NULL ) {
			p->wlist_file = value;
			p->wlist_add = 1;

		} else if ( (value = option_value(arg, "--tlds=")) != NULL ) {
			p->tlds_file = value;

		} else if ( (value = option_value(arg, "--tlds-add=")) != NULL ) {
			p->tlds_file = value;
			p->tlds_add = 1;

		} else if ( strcmp(arg, "--no-surbl") == 0 ) {
			p->surbl = "";

		} else if ( strcmp(arg, "--check-whitelist") == 0 ) {
			p->check_whitelist = 1;

		} else if ( strcmp(arg, "-h") == 0 || strcmp(arg, "--help") == 0 ) {
			help();
			exit(0);

		} else if ( strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0 ) {
			fprintf(stderr, "%s\n%s\n%s\n", version, copyright, license);
			exit(0);

		} else if ( strcmp(arg, "-r") == 0 || strcmp(arg, "--recursive") == 0 ) {
			p->recursive_strip = 1;

		} else if ( strcmp(arg, "-s") == 0 || strcmp(arg, "--strip-sub") == 0 ) {
			p->strip_subdomains = 1;

		} else if ( strcmp(arg, "-v") == 0 || strcmp(arg, "--verbose") == 0 ) {
			p->verbose += 1;

		} else if ( strcmp(arg, "-vv") == 0 ) {
			p->verbose += 2;

		} else if ( strcmp(arg, "-q") == 0 || strcmp(arg, "--quiet") == 0 ) {
			p->quiet = 1;

		} else if ( strcmp(arg, "--test") == 0 ) {
			p->run_test = 1;

		} else {
			fprintf(stderr, "error: Unknown option %s\n\n", arg);
			help();
			exit(1);
		}
	}
}

/*
 * Acts on the parsed options before any host is looked up:
 *
 *  - --quiet silences stdout and forces verbose to 0;
 *  - the built-in lists and any lists read from files are sorted so
 *    that they can be searched with bsearch();
 *  - --print-tlds / --print-whitelist print their list and exit(0);
 *  - --test looks up the fixed test name and exits with 2 (listed) or 0;
 *  - with nothing to do (no hosts, no stdin, no --check-whitelist) an
 *    error and the help text are printed and the program exits with 1.
 *
 * A list file that cannot be read is a fatal error (exit status 1).
 */
static void check_options(struct options_* p)
{
	if ( p->quiet ) {
		p->verbose = 0;

		// Output code keeps writing to stdout, and using a stream after
		// fclose() is undefined, so redirect it to the null device
		// instead of closing it.  (Assumes a POSIX-style /dev/null.)
		if ( freopen("/dev/null", "w", stdout) == NULL ) {
			fputs("error: could not redirect standard output\n", stderr);
			exit(1);
		}
	}

	// sort hardcoded lists
	qsort(whitelist, listsize(whitelist), sizeof(char*), compmi);
	qsort(two_level_tlds, listsize(two_level_tlds), sizeof(char*), compmi);

	if ( p->wlist_file ) {
		if ( (p->pwhitelist = read_list(p->wlist_file)) == NULL ) {
			fprintf(stderr, "error: could not read whitelist from %s\n", p->wlist_file);
			exit(1);
		}

		// sort for bsearch
		qsort(p->pwhitelist, listsize(p->pwhitelist), sizeof(char*), compmi);

		// The lists are deliberately never freed: the process is short-lived
		// and the OS reclaims all of its memory at exit faster than we could.
	}

	if ( p->tlds_file ) {
		if ( (p->ptlds = read_list(p->tlds_file)) == NULL ) {
			fprintf(stderr, "error: could not read two-level TLD list from %s\n", p->tlds_file);
			exit(1);
		}

		// sort for bsearch
		qsort(p->ptlds, listsize(p->ptlds), sizeof(char*), compmi);
	}

	if ( p->print_tlds ) {
		fputs("two-level TLDs:\n", stderr);
		print_list(p->ptlds);

		// --tlds-add: the built-in list is in effect too
		if ( p->tlds_add && p->ptlds!=two_level_tlds )
			print_list(two_level_tlds);
	}

	if ( p->print_whitelist ) {
		fputs("whitelist:\n", stderr);
		print_list(p->pwhitelist);

		// --whitelist-add: the built-in list is in effect too
		if ( p->wlist_add && p->pwhitelist!=whitelist )
			print_list(whitelist);
	}

	if ( p->print_whitelist || p->print_tlds )
		exit(0);

	if ( p->run_test ) {
		// the test name is already fully qualified, so append no suffix
		p->surbl = "";
		exit( lookup("test.sc.surbl.org.sc.surbl.org") > 0 ? 2 : 0 );
	}

	if ( p->hosts==0 && !p->read_stdin && !p->check_whitelist ) {
		fputs("error: no host(s) specified\n\n", stderr);
		help();
		exit(1);
	}
}

// Looks up every entry of the NULL-terminated list `wlist' in the
// blocklist, with the whitelist bypassed and recursive stripping forced
// on for the duration of the check; both settings are restored afterwards.
// returns number of hostnames in whitelist that are actually blocklisted
static int check_whitelist(const char** wlist)
{
	const int saved_skip = global_options.skip_whitelist;
	const int saved_recursive = global_options.recursive_strip;
	const char** entry;
	int hits = 0;

	// modify some parameters for our check:
	global_options.skip_whitelist = 1;
	global_options.recursive_strip = 1;

	for ( entry = wlist; *entry != NULL; ++entry )
		hits += full_lookup(*entry);

	global_options.skip_whitelist = saved_skip;
	global_options.recursive_strip = saved_recursive;

	return hits;
}

/*
 * Program entry point.
 *
 * Parses and checks the options, then either verifies the whitelist
 * (--check-whitelist) or looks up every hostname given on stdin ("-")
 * or on the command line.
 *
 * Exit status: 0 if nothing was blocklisted, 2 if at least one lookup
 * hit the blocklist; the option-handling functions exit with 1 on error.
 */
int main(int argc, char** argv)
{
	struct options_ *p = &global_options;
	int blockhits = 0;
	int n;

	set_defaults(p);
	parse_options(argc, argv, p);
	check_options(p);

	if ( p->verbose && *p->surbl ) {
		const char* mode = p->recursive_strip? " recursively" : "";
		fprintf(stderr, "checking against %s%s\n", p->surbl, mode);
	}

	if ( p->check_whitelist ) {
		int hits;

		fputs("checking if whitelist is indeed ok (this should be done rarely)\n\n", stderr);

		hits = check_whitelist(global_options.pwhitelist);

		if ( global_options.wlist_add )
			hits += check_whitelist(whitelist);

		if ( hits ) {
			fprintf(stderr, "\nsummary: the whitelist is not ok -- found %d blocked hosts\n", hits);
			return 2;
		}

		fputs("\nsummary: the whitelist is ok\n", stderr);
		return 0;
	}

	if ( p->read_stdin ) {
		char line[2048];

		// one hostname per input line
		while ( fgets(line, sizeof(line), stdin) != NULL )
			blockhits += full_lookup( remove_newline(line) );

	} else {
		// every argument that is not an option is a hostname
		for ( n = 1; n < argc; ++n ) {
			if ( argv[n][0] == '-' )
				continue;

			blockhits += full_lookup(argv[n]);
		}
	}

	if ( blockhits && p->verbose )
		fputs("\nSee http://www.surbl.org/lists.html for more information on the blocklists\n", stdout);

	return blockhits? 2 : 0;
}


/* syntax highlighted by Code2HTML, v. 0.9.1 */