/*
 *
 * Copyright (C) 2007 François Pesce : francois.pesce (at) gmail (dot) com
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * 	http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <pcre.h>

#include <stdio.h>		/* fgetgrent */
#include <sys/types.h>		/* fgetgrent */
#include <grp.h>		/* fgetgrent */

#include <apr_file_info.h>
#include <apr_file_io.h>
#include <apr_getopt.h>
#include <napr_hash.h>
#include <apr_strings.h>
#include <apr_user.h>

#include "config.h"

#if HAVE_PUZZLE
#include <puzzle.h>
#endif

#if HAVE_ARCHIVE
#include <archive.h>
#include <archive_entry.h>
#endif

#include "checksum.h"
#include "debug.h"
#include "ft_file.h"
#include "napr_heap.h"

#define is_option_set(mask, option)  ((mask & option) == option)

#define set_option(mask, option, on)     \
    do {                                 \
        if (on)                          \
            *mask |= option;             \
        else                             \
            *mask &= ~option;            \
    } while (0)


#define OPTION_ICASE 0x0001
#define OPTION_FSYML 0x0002
#define OPTION_RECSD 0x0004
#define OPTION_VERBO 0x0008
#define OPTION_OPMEM 0x0010
#define OPTION_REGEX 0x0020
#define OPTION_SIZED 0x0040

#if HAVE_PUZZLE
#define OPTION_PUZZL 0x0080
#endif

#if HAVE_ARCHIVE
#define OPTION_UNTAR 0x0100
#endif

typedef struct ft_file_t
{
    apr_off_t size;
    char *path;
#if HAVE_ARCHIVE
    char *subpath;
#endif
#if HAVE_PUZZLE
    PuzzleCvec cvec;
    int cvec_ok:1;
#endif
    int prioritized:1;
} ft_file_t;

typedef struct ft_chksum_t
{
    apr_uint32_t val_array[HASHSTATE];	/* 256 bits (using Bob Jenkins http://www.burtleburtle.net/bob/c/checksum.c) */
    ft_file_t *file;
} ft_chksum_t;

typedef struct ft_fsize_t
{
    apr_off_t val;
    ft_chksum_t *chksum_array;
    apr_uint32_t nb_files;
    apr_uint32_t nb_checksumed;
} ft_fsize_t;

typedef struct ft_gid_t
{
    gid_t val;
} ft_gid_t;

typedef struct ft_conf_t
{
    apr_off_t minsize;
    apr_off_t excess_size;	/* Switch off mmap behavior */
    apr_pool_t *pool;		/* Always needed somewhere ;) */
    napr_heap_t *heap;		/* Will holds the files */
    napr_hash_t *sizes;		/* will holds the sizes hashed with http://www.burtleburtle.net/bob/hash/integer.html */
    napr_hash_t *gids;		/* will holds the gids hashed with http://www.burtleburtle.net/bob/hash/integer.html */
    napr_hash_t *ig_files;
    pcre *ig_regex;
    pcre *wl_regex;
    pcre *ar_regex;		/* archive regex */
    char *p_path;		/* priority path */
    char *username;
    apr_size_t p_path_len;
    apr_uid_t userid;
    apr_gid_t groupid;
    unsigned short int mask;
    char sep;
} ft_conf_t;

static int ft_file_cmp(const void *param1, const void *param2)
{
    const ft_file_t *file1 = param1;
    const ft_file_t *file2 = param2;

    if (file1->size < file2->size)
	return -1;
    else if (file2->size < file1->size)
	return 1;

    return 0;
}

static const void *ft_fsize_get_key(const void *opaque)
{
    const ft_fsize_t *fsize = opaque;

    return &(fsize->val);
}

static const void *ft_gids_get_key(const void *opaque)
{
    const ft_gid_t *gid = opaque;

    return &(gid->val);
}


static apr_size_t get_one(const void *opaque)
{
    return 1;
}

static int apr_uint32_key_cmp(const void *key1, const void *key2, apr_size_t len)
{
    apr_uint32_t i1 = *(apr_uint32_t *) key1;
    apr_uint32_t i2 = *(apr_uint32_t *) key2;

    return (i1 == i2) ? 0 : 1;
}

/* http://www.burtleburtle.net/bob/hash/integer.html */
static apr_uint32_t apr_uint32_key_hash(const void *key, apr_size_t klen)
{
    apr_uint32_t i = *(apr_uint32_t *) key;

    i = (i + 0x7ed55d16) + (i << 12);
    i = (i ^ 0xc761c23c) ^ (i >> 19);
    i = (i + 0x165667b1) + (i << 5);
    i = (i + 0xd3a2646c) ^ (i << 9);
    i = (i + 0xfd7046c5) + (i << 3);
    i = (i ^ 0xb55a4f09) ^ (i >> 16);

    return i;
}

static void ft_hash_add_ignore_list(napr_hash_t *hash, const char *file_list)
{
    const char *filename, *end;
    apr_uint32_t hash_value;
    apr_pool_t *pool;
    char *tmp;

    pool = napr_hash_pool_get(hash);
    filename = file_list;
    do {
	end = strchr(filename, 'c');
	if (NULL != end) {
	    tmp = apr_pstrndup(pool, filename, end - filename);
	}
	else {
	    tmp = apr_pstrdup(pool, filename);
	}
	napr_hash_search(hash, tmp, strlen(tmp), &hash_value);
	napr_hash_set(hash, tmp, hash_value);

	filename = end + 1;
    } while ((NULL != end) && ('\0' != *filename));
}

/** 
 * The function used to add recursively or not file and dirs.
 * @param conf Configuration structure.
 * @param filename name of a file or directory to add to the list of twinchecker.
 * @param gc_pool garbage collecting pool, will be cleaned by the caller.
 * @return APR_SUCCESS if no error occured.
 */
#define MATCH_VECTOR_SIZE 210
static apr_status_t ft_conf_add_file(ft_conf_t *conf, const char *filename, apr_pool_t *gc_pool)
{
    int ovector[MATCH_VECTOR_SIZE];
    char errbuf[128];
    apr_finfo_t finfo;
    apr_dir_t *dir;
    apr_int32_t statmask =
	APR_FINFO_SIZE | APR_FINFO_TYPE | APR_FINFO_USER | APR_FINFO_GROUP | APR_FINFO_UPROT | APR_FINFO_GPROT;
    apr_size_t fname_len;
    apr_uint32_t hash_value;
    apr_status_t status;
    int rc;

    /* Step 1 : Check if it's a directory and get the size if not */
    if (!is_option_set(conf->mask, OPTION_FSYML))
	statmask |= APR_FINFO_LINK;

    if (APR_SUCCESS != (status = apr_stat(&finfo, filename, statmask, gc_pool))) {
	DEBUG_ERR("error calling apr_stat on filename %s : %s", filename, apr_strerror(status, errbuf, 128));
	return status;
    }

    /* Step 1-bis, if we don't own the right to read it, skip it */
    if (0 != conf->userid) {
	if (finfo.user == conf->userid) {
	    if (!(APR_UREAD & finfo.protection)) {
		if (is_option_set(conf->mask, OPTION_VERBO))
		    fprintf(stderr, "Skipping : [%s] (bad permission)\n", filename);
		return APR_SUCCESS;
	    }
	}
	else if (NULL != napr_hash_search(conf->gids, &finfo.group, 1, &hash_value)) {
	    if (!(APR_GREAD & finfo.protection)) {
		if (is_option_set(conf->mask, OPTION_VERBO))
		    fprintf(stderr, "Skipping : [%s] (bad permission)\n", filename);
		return APR_SUCCESS;
	    }
	}
	else if (!(APR_WREAD & finfo.protection)) {
	    if (is_option_set(conf->mask, OPTION_VERBO))
		fprintf(stderr, "Skipping : [%s] (bad permission)\n", filename);
	    return APR_SUCCESS;
	}
    }

    /* Step 2: If it is, browse it */
    if (APR_DIR == finfo.filetype) {
	if (0 != conf->userid) {
	    if (finfo.user == conf->userid) {
		if (!(APR_UEXECUTE & finfo.protection)) {
		    if (is_option_set(conf->mask, OPTION_VERBO))
			fprintf(stderr, "Skipping : [%s] (bad permission)\n", filename);
		    return APR_SUCCESS;
		}
	    }
	    else if (NULL != napr_hash_search(conf->gids, &finfo.group, 1, &hash_value)) {
		if (!(APR_GEXECUTE & finfo.protection)) {
		    if (is_option_set(conf->mask, OPTION_VERBO))
			fprintf(stderr, "Skipping : [%s] (bad permission)\n", filename);
		    return APR_SUCCESS;
		}
	    }
	    else if (!(APR_WEXECUTE & finfo.protection)) {
		if (is_option_set(conf->mask, OPTION_VERBO))
		    fprintf(stderr, "Skipping : [%s] (bad permission)\n", filename);
		return APR_SUCCESS;
	    }
	}

	if (APR_SUCCESS != (status = apr_dir_open(&dir, filename, gc_pool))) {
	    DEBUG_ERR("error calling apr_dir_open(%s): %s", filename, apr_strerror(status, errbuf, 128));
	    return status;
	}
	fname_len = strlen(filename);
	while ((APR_SUCCESS == (status = apr_dir_read(&finfo, APR_FINFO_NAME | APR_FINFO_TYPE, dir)))
	       && (NULL != finfo.name)) {
	    /* Check if it has to be ignored */
	    char *fullname;
	    apr_size_t fullname_len;

	    if (NULL != napr_hash_search(conf->ig_files, finfo.name, strlen(finfo.name), NULL))
		continue;

	    if (APR_DIR == finfo.filetype && !is_option_set(conf->mask, OPTION_RECSD))
		continue;

	    fullname = apr_pstrcat(gc_pool, filename, ('/' == filename[fname_len - 1]) ? "" : "/", finfo.name, NULL);
	    fullname_len = strlen(fullname);

	    if ((NULL != conf->ig_regex) && (APR_DIR != finfo.filetype)
		&& (0 <= (rc = pcre_exec(conf->ig_regex, NULL, fullname, fullname_len, 0, 0, ovector, MATCH_VECTOR_SIZE))))
		continue;

	    if ((NULL != conf->wl_regex) && (APR_DIR != finfo.filetype)
		&& (0 > (rc = pcre_exec(conf->wl_regex, NULL, fullname, fullname_len, 0, 0, ovector, MATCH_VECTOR_SIZE))))
		continue;

	    status = ft_conf_add_file(conf, fullname, gc_pool);

	    if (APR_SUCCESS != status) {
		DEBUG_ERR("error recursively calling ft_conf_add_file: %s", apr_strerror(status, errbuf, 128));
		return status;
	    }
	}
	if ((APR_SUCCESS != status) && (APR_ENOENT != status)) {
	    DEBUG_ERR("error calling apr_dir_read: %s", apr_strerror(status, errbuf, 128));
	    return status;
	}

	if (APR_SUCCESS != (status = apr_dir_close(dir))) {
	    DEBUG_ERR("error calling apr_dir_close: %s", apr_strerror(status, errbuf, 128));
	    return status;
	}
    }
    else if (APR_REG == finfo.filetype || ((APR_LNK == finfo.filetype) && (is_option_set(conf->mask, OPTION_FSYML)))) {
	apr_off_t finfosize;
	char *fname;
#if HAVE_ARCHIVE
	const char *subpath;
	/* XXX La */
	struct archive *a = NULL;
	struct archive_entry *entry = NULL;
	int rv;
#endif

	fname = apr_pstrdup(conf->pool, filename);
	finfosize = finfo.size;
#if HAVE_ARCHIVE
	subpath = NULL;
	fname_len = strlen(filename);
	if (is_option_set(conf->mask, OPTION_UNTAR)) {
	    if ((NULL != conf->ar_regex)
		&& (0 <= (rc = pcre_exec(conf->ar_regex, NULL, filename, fname_len, 0, 0, ovector, MATCH_VECTOR_SIZE)))) {
		a = archive_read_new();
		if (NULL == a) {
		    DEBUG_ERR("error calling archive_read_new()");
		    return APR_EGENERAL;
		}
		rv = archive_read_support_compression_all(a);
		if (0 != rv) {
		    DEBUG_ERR("error calling archive_read_support_compression_all(): %s", archive_error_string(a));
		    return APR_EGENERAL;
		}
		rv = archive_read_support_format_all(a);
		if (0 != rv) {
		    DEBUG_ERR("error calling archive_read_support_format_all(): %s", archive_error_string(a));
		    return APR_EGENERAL;
		}
		rv = archive_read_open_file(a, filename, 10240);
		if (0 != rv) {
		    DEBUG_ERR("error calling archive_read_open_file(%s): %s", filename, archive_error_string(a));
		    return APR_EGENERAL;
		}
	    }
	}

	do {
#endif
	    if (finfosize >= conf->minsize
#if HAVE_ARCHIVE
		&& ((NULL == a) || ((NULL != entry) && !(AE_IFDIR & archive_entry_filetype(entry))))
#endif
		) {
		ft_file_t *file;
		ft_fsize_t *fsize;

		file = apr_palloc(conf->pool, sizeof(struct ft_file_t));
		file->path = fname;
		file->size = finfosize;
#if HAVE_ARCHIVE
		if (subpath) {
		    file->subpath = apr_pstrdup(conf->pool, subpath);
		}
		else {
		    file->subpath = NULL;
		}
#endif
		if ((conf->p_path) && (fname_len >= conf->p_path_len)
		    && ((is_option_set(conf->mask, OPTION_ICASE) && !strncasecmp(filename, conf->p_path, conf->p_path_len))
			|| (!is_option_set(conf->mask, OPTION_ICASE) && !memcmp(filename, conf->p_path, conf->p_path_len)))) {
		    file->prioritized |= 0x1;
		}
		else {
		    file->prioritized &= 0x0;
		}
#if HAVE_PUZZLE
		file->cvec_ok &= 0x0;
#endif
		napr_heap_insert(conf->heap, file);

		if (NULL == (fsize = napr_hash_search(conf->sizes, &finfosize, 1, &hash_value))) {
		    fsize = apr_palloc(conf->pool, sizeof(struct ft_fsize_t));
		    fsize->val = finfosize;
		    fsize->chksum_array = NULL;
		    fsize->nb_checksumed = 0;
		    fsize->nb_files = 0;
		    napr_hash_set(conf->sizes, fsize, hash_value);
		}
		fsize->nb_files++;
	    }
#if HAVE_ARCHIVE
	    if (a) {
		rv = archive_read_next_header(a, &entry);
		if (ARCHIVE_EOF != rv) {
		    if (ARCHIVE_OK == rv) {
			finfosize = archive_entry_size(entry);
			subpath = archive_entry_pathname(entry);
		    }
		    else {
			/*
			 * if this is the first all to read_next_header, we may
			 * be processing a bad file, ignore it silently.
			 */
			if (NULL != subpath) {
			    DEBUG_ERR("error calling archive_read_next_header(%s): %s", fname, archive_error_string(a));
			    return APR_EGENERAL;
			}
			else {
			    break;
			}
		    }
		}
	    }
	} while (a && (ARCHIVE_EOF != rv));
	if (a)
	    archive_read_finish(a);
#endif
    }

    return APR_SUCCESS;
}

#if HAVE_ARCHIVE
static int copy_data(struct archive *ar, struct archive *aw)
{
    const void *buff;
    off_t offset;
    size_t size;
    int rv;

    for (;;) {
	rv = archive_read_data_block(ar, &buff, &size, &offset);
	if (rv == ARCHIVE_EOF) {
	    return ARCHIVE_OK;
	}
	if (rv != ARCHIVE_OK) {
	    DEBUG_ERR("error calling archive_read_data_block(): %s", archive_error_string(ar));
	    return rv;
	}
	rv = archive_write_data_block(aw, buff, size, offset);
	if (rv != ARCHIVE_OK) {
	    DEBUG_ERR("error calling archive_write_data_block(): %s", archive_error_string(aw));
	    return rv;
	}
    }
}

static char *ft_untar_file(ft_file_t *file, apr_pool_t *p)
{
    struct archive *a = NULL;
    struct archive *ext = NULL;
    struct archive_entry *entry = NULL;
    char *tmpfile = NULL;
    int rv;

    a = archive_read_new();
    if (NULL == a) {
	DEBUG_ERR("error calling archive_read_new()");
	return NULL;
    }
    rv = archive_read_support_compression_all(a);
    if (0 != rv) {
	DEBUG_ERR("error calling archive_read_support_compression_all(): %s", archive_error_string(a));
	return NULL;
    }
    rv = archive_read_support_format_all(a);
    if (0 != rv) {
	DEBUG_ERR("error calling archive_read_support_format_all(): %s", archive_error_string(a));
	return NULL;
    }
    rv = archive_read_open_file(a, file->path, 10240);
    if (0 != rv) {
	DEBUG_ERR("error calling archive_read_open_file(%s): %s", file->path, archive_error_string(a));
	return NULL;
    }

    ext = archive_write_disk_new();
    if (NULL == ext) {
	DEBUG_ERR("error calling archive_write_disk_new()");
	return NULL;
    }

    for (;;) {
	rv = archive_read_next_header(a, &entry);
	if (rv == ARCHIVE_EOF) {
	    DEBUG_ERR("subpath [%s] not found in archive [%s]", file->subpath, file->path);
	    return NULL;
	}
	if (rv != ARCHIVE_OK) {
	    DEBUG_ERR("error in archive (%s): %s", file->path, archive_error_string(a));
	    return NULL;
	}

	if (!strcmp(file->subpath, archive_entry_pathname(entry))) {
	    /*
	     * All I want is only a temporary filename, but gcc outputs me an
	     * ugly warning if I use tempnam...
	     * tmpfile = tempnam("/tmp/", "ftwin");
	     */
	    tmpfile = apr_pstrdup(p, "/tmp/ftwinXXXXXX");
	    rv = mkstemp(tmpfile);
	    if (rv < 0) {
		DEBUG_ERR("error creating tmpfile %s", tmpfile);
		return NULL;
	    }
	    close(rv);

	    archive_entry_copy_pathname(entry, tmpfile);

	    rv = archive_write_header(ext, entry);
	    if (rv == ARCHIVE_OK) {
		rv = copy_data(a, ext);
		if (rv != ARCHIVE_OK) {
		    DEBUG_ERR("error while copying data from archive (%s)", file->path);
		    apr_file_remove(tmpfile, p);
		    return NULL;
		}
	    }
	    else {
		DEBUG_ERR("error in archive (%s): %s", file->path, archive_error_string(a));
		apr_file_remove(tmpfile, p);
		return NULL;
	    }

	    break;
	}
    }

    archive_write_finish(ext);
    archive_read_finish(a);

    return tmpfile;
}

#endif

static apr_status_t ft_conf_process_sizes(ft_conf_t *conf)
{
    char errbuf[128];
    ft_file_t *file;
    ft_fsize_t *fsize;
    napr_heap_t *tmp_heap;
    apr_pool_t *gc_pool;
    apr_uint32_t hash_value;
    apr_status_t status;
    apr_size_t nb_processed, nb_files;

    if (is_option_set(conf->mask, OPTION_VERBO))
	fprintf(stderr, "Referencing files and sizes:\n");

    if (APR_SUCCESS != (status = apr_pool_create(&gc_pool, conf->pool))) {
	DEBUG_ERR("error calling apr_pool_create: %s", apr_strerror(status, errbuf, 128));
	apr_terminate();
	return -1;
    }
    nb_processed = 0;
    nb_files = napr_heap_size(conf->heap);

    tmp_heap = napr_heap_make(conf->pool, ft_file_cmp);
    while (NULL != (file = napr_heap_extract(conf->heap))) {
	if (NULL != (fsize = napr_hash_search(conf->sizes, &file->size, 1, &hash_value))) {
	    /* More than two files, we will need to checksum because :
	     * - 1 file of a size means no twin.
	     * - 2 files of a size means that anyway we must read the both, so
	     *   we will probably cmp them at that time instead of running CPU
	     *   intensive function like checksum.
	     */
	    if (1 == fsize->nb_files) {
		/* No twin possible, remove the entry */
		/*DEBUG_DBG("only one file of size %"APR_OFF_T_FMT, fsize->val); */
		napr_hash_remove(conf->sizes, fsize, hash_value);
	    }
	    else {
		if (NULL == fsize->chksum_array)
		    fsize->chksum_array = apr_palloc(conf->pool, fsize->nb_files * sizeof(struct ft_chksum_t));

		fsize->chksum_array[fsize->nb_checksumed].file = file;
		/* no multiple check, just a memcmp will be needed, don't call checksum on 0-length file too */
		if ((2 == fsize->nb_files) || (0 == fsize->val)) {
		    /*DEBUG_DBG("two files of size %"APR_OFF_T_FMT, fsize->val); */
		    memset(fsize->chksum_array[fsize->nb_checksumed].val_array, 0, HASHSTATE * sizeof(apr_int32_t));
		}
		else {
		    char *filepath;
#if HAVE_ARCHIVE
		    if (is_option_set(conf->mask, OPTION_UNTAR) && (NULL != file->subpath)) {
			filepath = ft_untar_file(file, gc_pool);
			if (NULL == filepath) {
			    DEBUG_ERR("error calling ft_untar_file");
			    apr_pool_destroy(gc_pool);
			    return APR_EGENERAL;
			}
		    }
		    else {
			filepath = file->path;
		    }
#else
		    filepath = file->path;
#endif
		    status =
			checksum_file(filepath, file->size, conf->excess_size,
				      fsize->chksum_array[fsize->nb_checksumed].val_array, gc_pool);
#if HAVE_ARCHIVE
		    if (is_option_set(conf->mask, OPTION_UNTAR) && (NULL != file->subpath))
			apr_file_remove(filepath, gc_pool);
#endif
		    /*
		     * no return status if != APR_SUCCESS , because : 
		     * Fault-check has been removed in case files disappear
		     * between collecting and comparing or special files (like
		     * device or /proc) are tried to access
		     */
		    if ((APR_SUCCESS != status) && (is_option_set(conf->mask, OPTION_VERBO)))
			fprintf(stderr, "\nskipping %s because: %s\n", file->path, apr_strerror(status, errbuf, 128));
		}
		if (APR_SUCCESS == status) {
		    fsize->nb_checksumed++;
		    napr_heap_insert(tmp_heap, file);
		}
	    }
	    if (is_option_set(conf->mask, OPTION_VERBO)) {
		fprintf(stderr, "\rProgress [%" APR_SIZE_T_FMT "/%" APR_SIZE_T_FMT "] %d%% ", nb_processed, nb_files,
			(int) ((float) nb_processed / (float) nb_files * 100.0));
		nb_processed++;
	    }
	}
	else {
	    DEBUG_ERR("inconsistency error found, no size[%" APR_OFF_T_FMT "] in hash for file %s", file->size, file->path);
	    apr_pool_destroy(gc_pool);
	    return APR_EGENERAL;
	}
    }
    if (is_option_set(conf->mask, OPTION_VERBO)) {
	fprintf(stderr, "\rProgress [%" APR_SIZE_T_FMT "/%" APR_SIZE_T_FMT "] %d%% ", nb_processed, nb_files,
		(int) ((float) nb_processed / (float) nb_files * 100.0));
	fprintf(stderr, "\n");
    }

    apr_pool_destroy(gc_pool);
    conf->heap = tmp_heap;

    return APR_SUCCESS;
}

static int chksum_cmp(const void *chksum1, const void *chksum2)
{
    const ft_chksum_t *chk1 = chksum1;
    const ft_chksum_t *chk2 = chksum2;
    int i;

    if (0 == (i = memcmp(chk1->val_array, chk2->val_array, HASHSTATE))) {
	return chk1->file->prioritized - chk2->file->prioritized;
    }
    else {
	i <<= 1;
    }

    return i;
}

#if HAVE_PUZZLE

static apr_status_t ft_conf_image_twin_report(ft_conf_t *conf)
{
    PuzzleContext context;
    double d;
    ft_file_t *file, *file_cmp;
    int i, heap_size;
    unsigned char already_printed;

    puzzle_init_context(&context);
    puzzle_set_max_width(&context, 5000);
    puzzle_set_max_height(&context, 5000);
    puzzle_set_lambdas(&context, 13);

    heap_size = napr_heap_size(conf->heap);
    for (i = 0; i < heap_size; i++) {

	file = napr_heap_get_nth(conf->heap, i);
	puzzle_init_cvec(&context, &(file->cvec));
	if (0 == puzzle_fill_cvec_from_file(&context, &(file->cvec), file->path)) {
	    file->cvec_ok |= 0x1;
	}
	else {
	    DEBUG_ERR("error calling puzzle_fill_cvec_from_file, ignoring file: %s", file->path);
	}

	if (is_option_set(conf->mask, OPTION_VERBO)) {
	    fprintf(stderr, "\rProgress [%i/%i] %d%% ", i, heap_size, (int) ((float) i / (float) heap_size * 100.0));
	}
    }
    if (is_option_set(conf->mask, OPTION_VERBO)) {
	fprintf(stderr, "\rProgress [%i/%i] %d%% ", i, heap_size, (int) ((float) i / (float) heap_size * 100.0));
	fprintf(stderr, "\n");
    }

    while (NULL != (file = napr_heap_extract(conf->heap))) {
	if (!(file->cvec_ok & 0x1))
	    continue;
	already_printed = 0;
	heap_size = napr_heap_size(conf->heap);
	for (i = 0; i < heap_size; i++) {
	    file_cmp = napr_heap_get_nth(conf->heap, i);
	    if (!(file_cmp->cvec_ok & 0x1))
		continue;

	    d = puzzle_vector_normalized_distance(&context, &(file->cvec), &(file_cmp->cvec), 0);
	    if (d < PUZZLE_CVEC_SIMILARITY_LOWER_THRESHOLD) {
		if (!already_printed) {
		    printf("%s%c", file->path, conf->sep);
		    already_printed = 1;
		}
		else {
		    printf("%c", conf->sep);
		}
		printf("%s", file_cmp->path);
	    }
	}

	if (already_printed)
	    printf("\n\n");

	puzzle_free_cvec(&context, &(file->cvec));
    }

    puzzle_free_context(&context);

    return APR_SUCCESS;
}
#endif

static apr_status_t ft_conf_twin_report(ft_conf_t *conf)
{
    char errbuf[128];
    apr_off_t old_size = -1;
    ft_file_t *file;
    ft_fsize_t *fsize;
    apr_uint32_t hash_value;
    apr_size_t i, j;
    int rv;
    apr_status_t status;
    unsigned char already_printed;

    if (is_option_set(conf->mask, OPTION_VERBO))
	fprintf(stderr, "Reporting duplicate files:\n");

    while (NULL != (file = napr_heap_extract(conf->heap))) {
	if (file->size == old_size)
	    continue;

	old_size = file->size;
	if (NULL != (fsize = napr_hash_search(conf->sizes, &file->size, 1, &hash_value))) {
	    qsort(fsize->chksum_array, fsize->nb_files, sizeof(ft_chksum_t), chksum_cmp);
	    for (i = 0; i < fsize->nb_files; i++) {
		if (NULL == fsize->chksum_array[i].file)
		    continue;
		already_printed = 0;
		for (j = i + 1; j < fsize->nb_files; j++) {
		    if (0 == memcmp(fsize->chksum_array[i].val_array, fsize->chksum_array[j].val_array, HASHSTATE)) {
			char *fpathi, *fpathj;
#if HAVE_ARCHIVE
			if (is_option_set(conf->mask, OPTION_UNTAR)) {
			    if (NULL != fsize->chksum_array[i].file->subpath) {
				fpathi = ft_untar_file(fsize->chksum_array[i].file, conf->pool);
				if (NULL == fpathi) {
				    DEBUG_ERR("error calling ft_untar_file");
				    return APR_EGENERAL;
				}
			    }
			    else {
				fpathi = fsize->chksum_array[i].file->path;
			    }
			    if (NULL != fsize->chksum_array[j].file->subpath) {
				fpathj = ft_untar_file(fsize->chksum_array[j].file, conf->pool);
				if (NULL == fpathj) {
				    DEBUG_ERR("error calling ft_untar_file");
				    return APR_EGENERAL;
				}
			    }
			    else {
				fpathj = fsize->chksum_array[j].file->path;
			    }
			}
			else {
			    fpathi = fsize->chksum_array[i].file->path;
			    fpathj = fsize->chksum_array[j].file->path;
			}
#else
			fpathi = fsize->chksum_array[i].file->path;
			fpathj = fsize->chksum_array[j].file->path;
#endif
			status = filecmp(conf->pool, fpathi, fpathj, fsize->val, conf->excess_size, &rv);
#if HAVE_ARCHIVE
			if (is_option_set(conf->mask, OPTION_UNTAR)) {
			    if (NULL != fsize->chksum_array[i].file->subpath)
				apr_file_remove(fpathi, conf->pool);
			    if (NULL != fsize->chksum_array[j].file->subpath)
				apr_file_remove(fpathj, conf->pool);
			}
#endif
			/*
			 * no return status if != APR_SUCCESS , because : 
			 * Fault-check has been removed in case files disappear
			 * between collecting and comparing or special files (like
			 * device or /proc) are tried to access
			 */
			if (APR_SUCCESS != status) {
			    if (is_option_set(conf->mask, OPTION_VERBO))
				fprintf(stderr, "\nskipping %s and %s comparison because: %s\n",
					fsize->chksum_array[i].file->path, fsize->chksum_array[j].file->path,
					apr_strerror(status, errbuf, 128));
			    rv = 1;
			}

			if (0 == rv) {
			    if (!already_printed) {
				if (is_option_set(conf->mask, OPTION_SIZED))
				    printf("size [%" APR_OFF_T_FMT "]:\n", fsize->val);
#if HAVE_ARCHIVE
				if (is_option_set(conf->mask, OPTION_UNTAR)
				    && (NULL != fsize->chksum_array[i].file->subpath))
				    printf("%s%c%s%c", fsize->chksum_array[i].file->path, (':' != conf->sep) ? ':' : '|',
					   fsize->chksum_array[i].file->subpath, conf->sep);
				else
#endif
				    printf("%s%c", fsize->chksum_array[i].file->path, conf->sep);
				already_printed = 1;
			    }
			    else {
				printf("%c", conf->sep);
			    }
#if HAVE_ARCHIVE
			    if (is_option_set(conf->mask, OPTION_UNTAR) && (NULL != fsize->chksum_array[j].file->subpath))
				printf("%s%c%s", fsize->chksum_array[j].file->path, (':' != conf->sep) ? ':' : '|',
				       fsize->chksum_array[j].file->subpath);
			    else
#endif
				printf("%s", fsize->chksum_array[j].file->path);
			    /* mark j as a twin ! */
			    fsize->chksum_array[j].file = NULL;
			    fflush(stdout);
			}
		    }
		    else {
			/* hash are ordered, so at first mismatch we check the next */
			break;
		    }
		}
		if (already_printed)
		    printf("\n\n");
	    }
	}
	else {
	    DEBUG_ERR("inconsistency error found, no size[%" APR_OFF_T_FMT "] in hash for file %s", file->size, file->path);
	    return APR_EGENERAL;
	}
    }

    return APR_SUCCESS;
}

static void version()
{
    fprintf(stdout, PACKAGE_STRING "\n");
    fprintf(stdout, "Copyright (C) 2007 François Pesce\n");
    fprintf(stdout, "Licensed under the Apache License, Version 2.0 (the \"License\");\n");
    fprintf(stdout, "you may not use this file except in compliance with the License.\n");
    fprintf(stdout, "You may obtain a copy of the License at\n");
    fprintf(stdout, "\n");
    fprintf(stdout, "\thttp://www.apache.org/licenses/LICENSE-2.0\n");
    fprintf(stdout, "\n");
    fprintf(stdout, "Unless required by applicable law or agreed to in writing, software\n");
    fprintf(stdout, "distributed under the License is distributed on an \"AS IS\" BASIS,\n");
    fprintf(stdout, "WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n");
    fprintf(stdout, "See the License for the specific language governing permissions and\n");
    fprintf(stdout, "limitations under the License.\n\n");
    fprintf(stdout, "Report bugs to " PACKAGE_BUGREPORT "\n");
}

static void usage(const char *name, const apr_getopt_option_t *opt_option)
{
    int i;

    fprintf(stdout, PACKAGE_STRING "\n");
    fprintf(stdout, "Usage: %s [OPTION]... [FILES or DIRECTORIES]...\n", name);
    fprintf(stdout, "Find identical files passed as parameter or recursively found in directories.\n");
    fprintf(stdout, "\n");
    fprintf(stdout, "Mandatory arguments to long options are mandatory for short options too.\n");
    fprintf(stdout, "\n");

    for (i = 0; NULL != opt_option[i].name; i++) {
	fprintf(stdout, "-%c,\t--%s\t%s\n", opt_option[i].optch, opt_option[i].name, opt_option[i].description);
    }
}

static apr_status_t ft_pcre_free_cleanup(void *pcre_space)
{
    pcre_free(pcre_space);
    return APR_SUCCESS;
}

static pcre *ft_pcre_compile(const char *regex, int caseless, apr_pool_t *p)
{
    const char *errptr;
    int erroffset, options = PCRE_DOLLAR_ENDONLY | PCRE_DOTALL;
    pcre *result;

    if (caseless)
	options |= PCRE_CASELESS;

    result = pcre_compile(regex, options, &errptr, &erroffset, NULL);
    if (NULL == result)
	DEBUG_ERR("can't parse %s at [%.*s] for -e / --regex-ignore-file: %s", regex, erroffset, regex, errptr);
    else
	apr_pool_cleanup_register(p, result, ft_pcre_free_cleanup, apr_pool_cleanup_null);


    return result;
}

static apr_status_t fill_gids_ht(const char *username, napr_hash_t *gids, apr_pool_t *p)
{
    gid_t list[256];
    ft_gid_t *gid;
    apr_uint32_t hash_value;
    int i, nb_gid;

    nb_gid = getgroups(sizeof(list), list);
    if (nb_gid < 0) {
	DEBUG_ERR("error calling getgroups()");
	return APR_EGENERAL;
    }
    /* 
     * According to getgroups manpage:
     * It is unspecified whether the effective group ID of the calling process
     * is included in the returned list.  (Thus, an application should also
     * call getegid(2) and add or remove the resulting value.)
     */
    if (nb_gid < sizeof(list)) {
	list[nb_gid] = getegid();
	nb_gid++;
    }

    for (i = 0; i < nb_gid; i++) {
	if (NULL == (gid = napr_hash_search(gids, &(list[i]), 1, &hash_value))) {
	    gid = apr_palloc(p, sizeof(struct ft_gid_t));
	    gid->val = list[i];
	    napr_hash_set(gids, gid, hash_value);
	}
    }

    return APR_SUCCESS;
}

int main(int argc, const char **argv)
{
    static const apr_getopt_option_t opt_option[] = {
	{"case-unsensitive", 'c', FALSE, "this option applies to regex match."},
	{"display-size", 'd', FALSE, "\tdisplay size before duplicates."},
	{"regex-ignore-file", 'e', TRUE, "filenames that match this are ignored."},
	{"follow-symlink", 'f', FALSE, "follow symbolic links."},
	{"help", 'h', FALSE, "\t\tdisplay usage."},
#if HAVE_PUZZLE
	{"image-cmp", 'I', FALSE, "\twill run ftwin in image cmp mode (using libpuzzle)."},
#endif
	{"ignore-list", 'i', TRUE, "\tcomma-separated list of file names to ignore."},
	{"minimal-length", 'm', TRUE, "minimum size of file to process."},
	{"optimize-memory", 'o', FALSE, "reduce memory usage, but increase process time."},
	{"priority-path", 'p', TRUE, "\tfile in this path are displayed first when\n\t\t\t\tduplicates are reported."},
	{"recurse-subdir", 'r', FALSE, "recurse subdirectories."},
	{"separator", 's', TRUE, "\tseparator character between twins, default: \\n."},
#if HAVE_ARCHIVE
	{"tar-cmp", 't', FALSE, "\twill process files archived in .tar default: off."},
#endif
	{"verbose", 'v', FALSE, "\tdisplay a progress bar."},
	{"version", 'V', FALSE, "\tdisplay version."},
	{"whitelist-regex-file", 'w', TRUE, "filenames that doesn't match this are ignored."},
	{"excessive-size", 'x', TRUE, "excessive size of file that switch off mmap use."},
	{NULL, 0, 0, NULL},	/* end (a.k.a. sentinel) */
    };
    char errbuf[128];
    char *regex = NULL, *wregex = NULL, *arregex = NULL;
    ft_conf_t conf;
    apr_getopt_t *os;
    apr_pool_t *pool, *gc_pool;
    apr_uint32_t hash_value;
    const char *optarg;
    int optch, i;
    apr_status_t status;

    if (APR_SUCCESS != (status = apr_initialize())) {
	DEBUG_ERR("error calling apr_initialize: %s", apr_strerror(status, errbuf, 128));
	return -1;
    }

    if (APR_SUCCESS != (status = apr_pool_create(&pool, NULL))) {
	DEBUG_ERR("error calling apr_pool_create: %s", apr_strerror(status, errbuf, 128));
	apr_terminate();
	return -1;
    }

    if (APR_SUCCESS != (status = apr_getopt_init(&os, pool, argc, argv))) {
	DEBUG_ERR("error calling apr_getopt_init: %s", apr_strerror(status, errbuf, 128));
	apr_terminate();
	return -1;
    }

    conf.pool = pool;
    conf.heap = napr_heap_make(pool, ft_file_cmp);
    conf.ig_files = napr_hash_str_make(pool, 32, 8);
    conf.sizes = napr_hash_make(pool, 4096, 8, ft_fsize_get_key, get_one, apr_uint32_key_cmp, apr_uint32_key_hash);
    conf.gids = napr_hash_make(pool, 4096, 8, ft_gids_get_key, get_one, apr_uint32_key_cmp, apr_uint32_key_hash);
    /* To avoid endless loop, ignore looping directory ;) */
    napr_hash_search(conf.ig_files, ".", 1, &hash_value);
    napr_hash_set(conf.ig_files, ".", hash_value);
    napr_hash_search(conf.ig_files, "..", 2, &hash_value);
    napr_hash_set(conf.ig_files, "..", hash_value);
    conf.ig_regex = NULL;
    conf.wl_regex = NULL;
    conf.ar_regex = NULL;
    conf.p_path = NULL;
    conf.p_path_len = 0;
    conf.minsize = 0;
    conf.sep = '\n';
    conf.excess_size = 50 * 1024 * 1024;
    conf.mask = 0x00;

    while (APR_SUCCESS == (status = apr_getopt_long(os, opt_option, &optch, &optarg))) {
	switch (optch) {
	case 'c':
	    set_option(&conf.mask, OPTION_ICASE, 1);
	    break;
	case 'd':
	    set_option(&conf.mask, OPTION_SIZED, 1);
	    break;
	case 'e':
	    regex = apr_pstrdup(pool, optarg);
	    break;
	case 'f':
	    set_option(&conf.mask, OPTION_FSYML, 1);
	    break;
	case 'h':
	    usage(argv[0], opt_option);
	    return 0;
	case 'i':
	    ft_hash_add_ignore_list(conf.ig_files, optarg);
	    break;
#if HAVE_PUZZLE
	case 'I':
	    set_option(&conf.mask, OPTION_ICASE, 1);
	    set_option(&conf.mask, OPTION_PUZZL, 1);
	    wregex = apr_pstrdup(pool, ".*\\.(gif|png|jpg)$");
	    break;
#endif
	case 'm':
	    conf.minsize = strtoul(optarg, NULL, 10);
	    if (ULONG_MAX == conf.minsize) {
		DEBUG_ERR("can't parse %s for -m / --minimal-length", optarg);
		apr_terminate();
		return -1;
	    }
	    break;
	case 'o':
	    set_option(&conf.mask, OPTION_OPMEM, 1);
	    break;
	case 'p':
	    conf.p_path = apr_pstrdup(pool, optarg);
	    conf.p_path_len = strlen(conf.p_path);
	    break;
	case 'r':
	    set_option(&conf.mask, OPTION_RECSD, 1);
	    break;
	case 's':
	    conf.sep = *optarg;
	    break;
#if HAVE_ARCHIVE
	case 't':
	    set_option(&conf.mask, OPTION_UNTAR, 1);
	    arregex = apr_pstrdup(pool, ".*(\\.tar)?\\.(gz|Z|bz2)$");
	    break;
#endif
	case 'v':
	    set_option(&conf.mask, OPTION_VERBO, 1);
	    break;
	case 'V':
	    version();
	    return 0;
	case 'w':
	    wregex = apr_pstrdup(pool, optarg);
	    break;
	case 'x':
	    conf.excess_size = strtoul(optarg, NULL, 10);
	    if (ULONG_MAX == conf.minsize) {
		DEBUG_ERR("can't parse %s for -x / --excessive-size", optarg);
		apr_terminate();
		return -1;
	    }
	    break;
	}
    }

    if (APR_SUCCESS != (status = apr_uid_current(&(conf.userid), &(conf.groupid), pool))) {
	DEBUG_ERR("error calling apr_uid_current: %s", apr_strerror(status, errbuf, 128));
	apr_terminate();
	return -1;
    }

    if (APR_SUCCESS != (status = apr_uid_name_get(&(conf.username), conf.userid, pool))) {
	DEBUG_ERR("error calling apr_uid_name_get: %s", apr_strerror(status, errbuf, 128));
	apr_terminate();
	return -1;
    }

    if (APR_SUCCESS != (status = fill_gids_ht(conf.username, conf.gids, pool))) {
	DEBUG_ERR("error calling fill_gids_ht: %s", apr_strerror(status, errbuf, 128));
	apr_terminate();
	return -1;
    }

    if (NULL != regex) {
	conf.ig_regex = ft_pcre_compile(regex, is_option_set(conf.mask, OPTION_ICASE), pool);
	if (NULL == conf.ig_regex) {
	    apr_terminate();
	    return -1;
	}
    }

    if (NULL != wregex) {
	conf.wl_regex = ft_pcre_compile(wregex, is_option_set(conf.mask, OPTION_ICASE), pool);
	if (NULL == conf.wl_regex) {
	    apr_terminate();
	    return -1;
	}
    }

    if (NULL != arregex) {
	conf.ar_regex = ft_pcre_compile(arregex, is_option_set(conf.mask, OPTION_ICASE), pool);
	if (NULL == conf.ar_regex) {
	    apr_terminate();
	    return -1;
	}
    }

    /* Step 1 : Browse the file */
    if (APR_SUCCESS != (status = apr_pool_create(&gc_pool, pool))) {
	DEBUG_ERR("error calling apr_pool_create: %s", apr_strerror(status, errbuf, 128));
	apr_terminate();
	return -1;
    }
    for (i = os->ind; i < argc; i++) {
	if (APR_SUCCESS != (status = ft_conf_add_file(&conf, argv[i], gc_pool))) {
	    DEBUG_ERR("error calling ft_conf_add_file: %s", apr_strerror(status, errbuf, 128));
	    apr_terminate();
	    return -1;
	}
    }
    apr_pool_destroy(gc_pool);

    if (0 < napr_heap_size(conf.heap)) {
#if HAVE_PUZZLE
	if (is_option_set(conf.mask, OPTION_PUZZL)) {
	    /* Step 2: Report the image twins */
	    if (APR_SUCCESS != (status = ft_conf_image_twin_report(&conf))) {
		DEBUG_ERR("error calling ft_conf_image_twin_report: %s", apr_strerror(status, errbuf, 128));
		apr_terminate();
		return status;
	    }
	}
	else {
#endif
	    /* Step 2: Process the sizes set */
	    if (APR_SUCCESS != (status = ft_conf_process_sizes(&conf))) {
		DEBUG_ERR("error calling ft_conf_process_sizes: %s", apr_strerror(status, errbuf, 128));
		apr_terminate();
		return -1;
	    }

	    /* Step 3: Report the twins */
	    if (APR_SUCCESS != (status = ft_conf_twin_report(&conf))) {
		DEBUG_ERR("error calling ft_conf_twin_report: %s", apr_strerror(status, errbuf, 128));
		apr_terminate();
		return status;
	    }
#if HAVE_PUZZLE
	}
#endif
    }
    else {
	DEBUG_ERR("Please submit at least two files...");
	usage(argv[0], opt_option);
	return -1;
    }

    apr_terminate();

    return 0;
}


syntax highlighted by Code2HTML, v. 0.9.1