/*
 * Copyright 1998-2000 Ben Smithurst <ben@smithurst.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * remove duplicates from a mailbox (uses Message-ID: only)
 * input: STDIN, output: STDOUT
 */

static const char rcsid[] =
	"$BCPS: src/mailutils/de-dupe.c,v 1.52 2003/01/19 19:18:25 ben Exp $";

#include "misc.h"

#define REALLOC(var, svar, add) do {				\
	svar += (add);						\
	if ((var = realloc(var, svar * sizeof *(var))) == NULL)	\
		err(1, "realloc");				\
} while (0)

typedef struct _idtree {
	int left, right;
	int id;
} IDTREE;

typedef struct {
	char *buf;
	int len, size;
} MESSAGE;

IDTREE *msgid_alloc(int);
int de_dupe(FILE *, FILE *);
int de_dupe_maildir(int, char *, FILE *);
int id_done(char *, int);
static void parse_line(char *, int *, int *);
void add_line(char *);
void print_message(FILE *);
void reset_message(void);
void usage(void);

IDTREE *msgids = NULL;
MESSAGE message = { NULL, 0, 0 };
char *idtext;
int pflag = 0;
size_t i_used = 0, i_alloc = 0, t_used = 0, t_alloc = 0;
static char msgid[1024];

int
main(int argc, char *argv[]) {
	int ch;

	while ((ch = getopt(argc, argv, "p")) != -1)
		switch (ch) {
		case 'p':
			pflag = 1;
			break;
		default:
			usage();
		}

	argv += optind;
	argc -= optind;

	signal(SIGTERM, signal_handler);
	signal(SIGINT, signal_handler);

	return (process(argv, de_dupe, de_dupe_maildir) ? 0 : 1);
}

void
tree_free(void) {
	size_t i;

	i_used = t_used = 0;

	for (i = 0; i < i_alloc; i++)
		msgids[i].left = msgids[i].right = msgids[i].id = -1;
}

int
de_dupe_maildir(int reset, char *file, FILE *in_fp) {
	static int header, done;
	char *line;
	size_t len;
	int retval = RET_NOCHANGE;

	if (reset & !pflag)
		tree_free();
	header = 1;
	done = 0;
	msgid[0] = '\0';

	while ((line = gl_getline(in_fp)) != NULL) {
		len = strlen(line);
		if (sig_count > 0) {
			warnx("de_dupe: signal received, returning");
			return (RET_LOCALERROR);
		}

		chop(line, len);

		parse_line(line, &header, &done);
		if (done) {
			retval = RET_DELETE;
			break;
		}
	}

	gl_destroy(in_fp);

	return (retval);
}

int
de_dupe(FILE *in_fp, FILE *out_fp) {
	char *line;
	int header, done;
	int retval;
	size_t len;

	/* initialize all variables */
	header = done = 0;
	msgid[0] = '\0';
	retval = RET_NOCHANGE;

	/* reset the message ID tree, unless -p given */
	if (!pflag)
		tree_free();

	while ((line = gl_getline(in_fp)) != NULL) {
		len = strlen(line);
		if (sig_count > 0) {
			warnx("de_dupe: signal received, returning");
			return (RET_LOCALERROR);
		}

		/* Terminate the string */
		len--;
		if (line[len] != '\n') {
			warnx("de_dupe: no trailing \\n");
			return (RET_LOCALERROR);
		}
		line[len] = '\0';

		/* reset things at start of new message */
		if (is_from(line, 1)) {
			if (done)
				retval = RET_CHANGE;
			else
				print_message(out_fp);
			reset_message();
			msgid[0] = '\0';
			done = 0;
			header = 1;
			add_line(line);
			continue;
		}

		/*
		 * check early for messages already done; we can skip
		 * the rest.
		 */
		if (done)
			continue;

		parse_line(line, &header, &done);

		add_line(line);
	}

	gl_destroy(in_fp);

	if (done)
		retval = RET_CHANGE;
	else
		print_message(out_fp);
	reset_message();

	return (retval);
}

/* Parses current line, looking for message id */
static void
parse_line(char *line, int *header, int *done) {
	char *p;

	if (*line == '\0') {
		*header = 0;
		return;
	}

	if (!*header || msgid[0] != '\0' || strncasecmp(line, "message-id:", 11) != 0)
		return;

	p = line + 11;
	while (isspace(*p)) p++;
	if (*p == '<') {
		char *end;
		int len;

		p++;
		end = strchr(p, '>');

		if (end != NULL) {
			*end = '\0';
			len = end - p;
		} else {
			warnx("no trailing '>' in "
			  "message-id <%s", p);
			len = strlen(p);
		}

		/*
		 * Don't waste time with strncpy() -- it
		 * zero fills the string.
		 */
		if (len >= sizeof msgid) {
			warnx("message-id <%s> "
			  "too long, truncated", p);
			p[sizeof msgid - 1] = '\0';
		}

		strcpy(msgid, p);

		if (end != NULL)
			*end = '>';
	}

	*done = id_done(msgid, 0);
}

/*
 * Add a line to the current message.
 */
void
add_line(char *p) {
	size_t len;

	len = strlen(p);
	if (message.len + len >= message.size)
		REALLOC(message.buf, message.size, len + 10240);

	if (len > 0)
		memcpy(&message.buf[message.len], p, len);
	message.buf[message.len + len] = '\n';
	message.len += len + 1;
}

/* Print out the current message */
void
print_message(FILE *fp) {

	if (message.len > 0)
		fwrite(message.buf, 1, message.len, fp);
}

/*
 * clear all nodes of a message list.
 * Don't free(), since they will be reused for the next message.
 */
void
reset_message(void) {

	message.len = 0;
}

/*
 * Mark an id as done, and return 0/1 for whether it was already done
 * before the change.
 */
int
id_done(char *id, int offset) {
	int diff;
	int len;
	IDTREE *new, *start;

	if (id[0] == '\0')
		return (0);

	len = strlen(id);

	/* See if the top node doesn't exist */
	if (msgids == NULL || msgids->id == -1) {
		new = msgid_alloc(len);
		strcpy(&idtext[new->id], id);
		return (0);
	}

	start = &msgids[offset];
	assert(start->left != offset && start->right != offset);

	diff = strcmp(id, &idtext[start->id]);

	if (diff == 0)
		return (1);

	/* see if there's a node on the right side */
	if (diff < 0) {
		if (start->left >= 0)
			return (id_done(id, start->left));
	} else {
		if (start->right >= 0)
			return (id_done(id, start->right));
	}

	/* create the new node */
	new = msgid_alloc(len);
	strcpy(&idtext[new->id], id);

	/* attach to the right side */
	start = &msgids[offset];
	assert(new - msgids != offset);
	if (diff < 0)
		start->left = new - msgids;
	else
		start->right = new - msgids;

	return (0);
}

/* Allocate a new IDTREE node with the specified length.  */
IDTREE *
msgid_alloc(int min) {
	IDTREE *new;

	if (i_used >= i_alloc)
		REALLOC(msgids, i_alloc, 128);

	if (t_used + min >= t_alloc)
		REALLOC(idtext, t_alloc, min + 10240);

	new = &msgids[i_used];
	new->left = new->right = -1;
	new->id = t_used;

	i_used++;
	t_used += min + 1;

	return (new);
}

void
usage(void) {
	fprintf(stderr, "usage: de-dupe [-p] [mailbox ...]\n");
	exit(EX_USAGE);
}


syntax highlighted by Code2HTML, v. 0.9.1