/*-
* Copyright (c) 2007 B. Luevelsmeyer
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* $Id: dupfind.c,v 1.19 2007/05/03 14:34:47 bernd Exp $
*/
/* Find duplicate files; duplicates in this context are files
* that have the same contents without being hardlinked or
* softlinked to each other.
*
* All files are compared by their length, and files of equal
* length are compared by a checksum. (These checksums are stored
* in a database, so the checksum is computed only once per
* file.) If the checksums match, then a direct comparison of the
* files' contents is done.
*
 * This source uses the 'fastcrc' library by G. Adam
 * Stanislav to compute the checksums.
 *
 * I've used CRC checksums rather than MD5 checksums because they
 * are faster to compute, and because they are 32 bits wide, which
 * hopefully will make life easier for the database. Anyway, CRCs
 * are good enough; I've only seen 2 or 3 matching CRCs for
 * non-matching files while comparing several million files.
*/
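/* Output format (one tab-separated line per finding, see the
 * printf()s in gather() and addfile()):
 *   || >known file<	>duplicate<	>size<	duplicate contents (-d)
 *   == >known file<	>hardlink<	>size<	hardlink pair (-h)
 *   -> >target<	>symlink<	>size<	symbolic link (-s)
 *   o> >target<	>symlink<	>size<	symlink to a non-existent target (-s)
 *
 * Typical invocation: dupfind -d -c /some/path
 */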
#include <unistd.h>
#include <stdio.h>
#include <ftw.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <db.h>
#include <fcntl.h>
#include <curses.h>
#include <term.h>
#include <errno.h>
#include <fastcrc.h>
#include <signal.h>
#include <sys/time.h>
#include <time.h>	/* time(), used to seed rand() */
#ifdef NDEBUG
#error NDEBUG should not be defined for this source
#endif
/* Command line options */
static int symbolics, /* output symlinks */
hards, /* output hardlinks */
duplicates, /* output file copies */
debug, /* debug output for developers */
progress, /* progress output (current directory) */
counters, /* output statistic counters */
zero; /* don't output files of size 0 as duplicates */
/* Statistics counters */
static unsigned directories_visited, unreadable_directories, found_files,
nonreg_found, unstat_files, filenames_stored,
filesizes_stored, possible_hardlinks, found_hardlinks,
duplicates_considered, matching_sizes, calculated_crcs,
cached_crcs, matching_crcs_found, duplicates_found,
symlinks_found,unreadable_files,zero_files;
static unsigned long long bytes_in_dups,usec_read,bytes_read;
/* Clear line to eol if progress output is shown, preserving
* errno.
*/
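/* putc() wrapper with the int (*)(int) signature that tputs() expects. */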
static int p(int c) { return putc(c,stderr); }
static void prog_clean(void)
{
if(progress)
{
int e = errno;
tputs(clr_eol,1,p);
errno = e;
}
}
/* Two buffers for calculating CRCs and comparing files. */
/* I tried mmap()ing the files instead of reading them into
* buffers, but it only gives me 60% of the read() speed, and
* there's a 2 GB limit on mmap().
*/
#define BUFFLEN (20*1024)
static char buf1[BUFFLEN], buf2[BUFFLEN];
/* Compute a file's CRC checksum, using the fastcrc lib. Returns
* the CRC, or a random number if the file cannot be read.
*
 * If a CRC cannot be computed, we simulate one with rand() so
 * that unreadable files don't all appear to have the same
 * constant CRC, which would force useless comparisons. This
 * "wrong" CRC does no harm, as duplicates are ultimately only
 * reported if the files actually are readable and equal. For the
 * same reason, the quality of the random number generator
 * doesn't matter too much as long as the numbers don't repeat
 * too often.
*/
static unsigned get_crc(const char *path, off_t len)
{
int fd = open(path,O_RDONLY);
++calculated_crcs;
if(fd<0)
{
++unreadable_files;
prog_clean();
fprintf(stderr,"cannot open >%s<: ",path);
perror(0);
return rand();
}
else
{
unsigned crc = ecrcinit();
off_t complete = 0;
struct timeval prev,aft;
gettimeofday(&prev,0);
for(;;)
{
const ssize_t bytes=read(fd,buf1,sizeof buf1);
if(bytes<0)
{
++unreadable_files;
prog_clean();
fprintf(stderr,"error reading from >%s<: ",path);
perror(0);
close(fd);
return rand();
}
else if(bytes==0)
break;
else
{
crc = ecrcpartial(crc,buf1,bytes);
complete += bytes;
}
}
gettimeofday(&aft,0);
usec_read += ((long long)aft.tv_sec*1000000+aft.tv_usec)
- ((long long)prev.tv_sec*1000000+prev.tv_usec);
bytes_read += complete;
crc = ecrcfinish(crc);
close(fd);
if(complete!=len)
{
prog_clean();
fprintf(stderr,"file size mismatch for >%s<\n",path);
}
if(debug)
prog_clean(),printf("::%s: %#x\n",path,crc);
return crc;
}
}
/* Compare two files. Returns 0 if the files are equal, non-zero otherwise. */
static int files_compare(const char *a, const char *b, off_t len)
{
struct timeval prev,aft;
int fd_a = open(a,O_RDONLY);
int fd_b;
off_t complete = 0;
if(fd_a<0)
{
prog_clean();
fprintf(stderr,"cannot open >%s<: ",a);
perror(0);
return 1;
}
fd_b = open(b,O_RDONLY);
if(fd_b<0)
{
prog_clean();
fprintf(stderr,"cannot open >%s<: ",b);
perror(0);
close(fd_a);
return 1;
}
gettimeofday(&prev,0);
for(;;)
{
const ssize_t bytes_a = read(fd_a,buf1,sizeof buf1);
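/* If a hit EOF (bytes_a==0), still read one byte from b so we
 * can tell whether b has ended as well; if reading a failed,
 * skip reading b entirely.
 */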
const ssize_t bytes_b = bytes_a>=0 ? read(fd_b,buf2,bytes_a?bytes_a:1) : 0;
if(bytes_a<0 || bytes_b<0)
{
/* read error */
prog_clean();
fprintf(stderr,"cannot read >%s<: ",bytes_a<0?a:b);
perror(0);
close(fd_a);
close(fd_b);
return 1;
}
else if(bytes_a==0 && bytes_b==0)
{
/* both ended */
close(fd_a);
close(fd_b);
gettimeofday(&aft,0);
usec_read += ((long long)aft.tv_sec*1000000+aft.tv_usec)
- ((long long)prev.tv_sec*1000000+prev.tv_usec);
bytes_read += 2*complete;
if(complete!=len)
{
prog_clean();
fprintf(stderr,"files of wrong length: >%s< and >%s<\n",a,b);
return 1;
}
else
return 0;
}
else if(bytes_a==0 || bytes_b==0)
{
/* one file ended, the other not */
prog_clean();
fprintf(stderr,"filesize mismatch: >%s< and >%s<\n",a,b);
close(fd_a);
close(fd_b);
return 1;
}
else
{
/* read from both files */
complete += bytes_a;
/* now look if we got the same amount... */
if(bytes_a!=bytes_b)
{
prog_clean();
fprintf(stderr,"short read() from >%s<\n",bytes_a<bytes_b?a:b);
close(fd_a);
close(fd_b);
return 1;
}
if(memcmp(buf1,buf2,bytes_a))
{
/* so they differ. */
close(fd_a);
close(fd_b);
gettimeofday(&aft,0);
usec_read += ((long long)aft.tv_sec*1000000+aft.tv_usec)
- ((long long)prev.tv_sec*1000000+prev.tv_usec);
bytes_read += 2*complete;
return 1;
}
}
}
}
/* Databases */
/* Generally, a database failure will not be tolerated for any
 * database access, i.e. there are a lot of assert()s in this
 * source. The reason: the program cannot continue without the
 * databases, so it *must* be aborted if they fail; and in such a
 * case, the way of aborting doesn't matter.
 */
/* This database is used to find filenames for known device/inode
* pairs in case a matching CRC is found, or in case of
* hardlinks.
*/
static DB *name_db;
/* The key structure */
struct dev_ino_key
{
dev_t dev;
ino_t inode;
};
/* The data is simply the filename, 0-terminated */
/* Compare device/inode pairs; compare first on device, and if
* they match on inode.
*/
static int device_inode_compare(const DBT *a_, const DBT *b_)
{
const struct dev_ino_key *a = a_->data;
const struct dev_ino_key *b = b_->data;
assert(a_->size == b_->size && a_->size==sizeof *a);
if(a->dev==b->dev)
return a->inode<b->inode ? -1 : a->inode>b->inode;
else
return a->dev<b->dev ? -1 : 1;
}
/* Size-database, indexed by sizes and containing CRC and
* device/inode. This database is used to check whether there are
* matching CRCs. Of course, this database may contain many
* instances of the same key, because files of the same length
* aren't unusual.
*/
static DB *size_db;
/* The key is simply an off_t, containing the file's length. */
/* The data is a struct: */
struct size_data
{
struct dev_ino_key dik; /* device and inode of the file */
off_t size; /* length of the file */
unsigned crc; /* CRC for the file, if known */
unsigned crc_known; /* indicates whether the CRC is known */
};
/* Compare two file sizes */
static int size_compare(const DBT *a_, const DBT *b_)
{
off_t a,b;
assert(a_->size==b_->size && a_->size==sizeof a);
a=*(off_t*)(a_->data);
b=*(off_t*)(b_->data);
return a<b ? -1 : a>b;
}
/* A new filename has been read by nftw(). Check whether it's
 * some sort of copy.
 */
static void gather(dev_t device, ino_t inode, nlink_t nlinks, off_t size, const char *path)
{
/* description of this file, for entry into the databases */
struct size_data size_data;
size_data.dik.dev = device;
size_data.dik.inode = inode;
size_data.size = size;
size_data.crc_known = 0;
++found_files;
if(!size)
++zero_files;
if(debug)
prog_clean(),printf(">>%s\n",path);
/* Check whether the device/inode pair is already known (i.e.
 * this is a hardlink to a known file); skip this if the file
 * doesn't have hardlinks, because then there is nothing to
 * look for.
 *
 * Even if we don't output hardlinks, this check is still
 * necessary: a hardlink to a known file has the same contents
 * as that file, but we don't want to report it as a duplicate,
 * so we must recognize it as a hardlink first.
 */
if(nlinks>1)
{
DBT name_dbt;
DBT dev_ino_dbt;
dev_ino_dbt.data = &size_data.dik;
dev_ino_dbt.size = sizeof size_data.dik;
++possible_hardlinks;
switch(name_db->get(name_db,&dev_ino_dbt,&name_dbt,0))
{
case 0:
++found_hardlinks;
if(debug)
prog_clean(),puts("(hardlink found)");
if(hards)
prog_clean(),printf("== >%s<\t>%s<\t>%lld<\n",(char*)name_dbt.data,path,size);
/* No need to consider this file any further;
 * all further checks can be done against the
 * hardlink that's already in the database.
 */
return;
case 1: /* not found */
if(debug)
prog_clean(),puts("(no hardlink)");
break;
case -1: /* db error */
perror("name_db->get() failed");
assert(0);
return;
}
}
/* This is not a known hard link; might be a duplicate. */
/* For all files of this size in the size db, check if the
* CRC matches, and if so, compare the files directly; omit
* if duplicates are not required. Don't consider files of
* size 0 if the user wants them to be suppressed.
*/
if(duplicates && (size || !zero))
{
int res;
DBT s_key,dev_ino_crc;
off_t req_size = size; /* Don't use the parameter, it might
* be overwritten by the
* search result.
*/
s_key.data = &req_size;
s_key.size = sizeof req_size;
++duplicates_considered;
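/* R_CURSOR positions the cursor at the first record whose key
 * is greater than or equal to req_size; R_NEXT then steps
 * through the records in key order, so this loop visits every
 * stored file of the same size and stops at the first larger
 * one.
 */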
for(res=size_db->seq(size_db,&s_key,&dev_ino_crc,R_CURSOR);
!res;
res=size_db->seq(size_db,&s_key,&dev_ino_crc,R_NEXT))
{
off_t *sk = s_key.data;
assert(s_key.size == sizeof *sk);
if(*sk!=size)
/* We are done, no more size-matches, so the file
* is unique.
*/
break;
else
/* The file has the same size - this is a
* potential duplicate.
*/
{
/* So at this point in time and space, we need
* the CRCs to match.
*/
/* The filename of the old file in the databases */
DBT name_dbt = { 0,0 };
/* Device, inode and CRC of the old file - we've
* got that already.
*/
struct size_data sd;
assert(dev_ino_crc.size == sizeof sd);
sd = *(struct size_data *)dev_ino_crc.data;
++matching_sizes;
/* get CRC of current file */
if(!size_data.crc_known)
{
size_data.crc = get_crc(path,size);
size_data.crc_known = 1;
}
/* If CRC of old file unknown, get it (and update
* database).
*/
if(!sd.crc_known)
{
/* First, get the filename from the name
* database
*/
{
DBT dev_ino_dbt;
dev_ino_dbt.size = sizeof sd.dik;
dev_ino_dbt.data = &sd.dik;
assert(name_db->get(name_db,&dev_ino_dbt,&name_dbt,0)==0);
}
/* Then calculate the old file's CRC */
sd.crc = get_crc(name_dbt.data,size);
sd.crc_known = 1;
/* And stuff it into the size database so it
* won't be calculated again
*/
{
DBT key,data;
key.data = &size;
key.size = sizeof size;
data.data = &sd;
data.size = sizeof sd;
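/* R_CURSOR makes put() replace the record the preceding
 * seq() call left the cursor on, i.e. exactly the entry
 * of the old file whose CRC we just computed.
 */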
assert(size_db->put(size_db,&key,&data,R_CURSOR)==0);
}
}
else
++cached_crcs;
/* At this point, we have both CRCs. */
if(size_data.crc==sd.crc)
{
/* The files have the same checksum and hence
* are very likely to have the same contents.
* A direct comparison won't hurt the
* performance too much.
*/
/* Get name of old file from database, if not
* already done for CRC calculation.
*/
if(!name_dbt.data)
{
DBT dev_ino_dbt;
dev_ino_dbt.size = sizeof sd.dik;
dev_ino_dbt.data = &sd.dik;
assert(name_db->get(name_db,&dev_ino_dbt,&name_dbt,0)==0);
}
++matching_crcs_found;
if(debug)
prog_clean(),printf("? comparing >%s< >%s<\n",path,(char*)name_dbt.data);
if(!files_compare(path,name_dbt.data,size))
{
/* Now the files really have the same
* contents.
*/
++duplicates_found;
bytes_in_dups += size;
prog_clean();
printf("|| >%s<\t>%s<\t>%lld<\n",(char*)name_dbt.data,path,size);
/* If the current file has link count 1, it
 * cannot be relevant in the future: it has no
 * hardlinks, and future duplicates can be
 * compared against the entry that's already
 * in the database.
 */
if(nlinks==1)
return;
else
/* No need to compare to other files
* of the same size in the database;
* but since the file has more than 1
* link, it has to go into the
* database, in order to find
* hardlinks.
*/
break;
}
else
/* Strange stuff: The files have the same
* CRC and the same length, but different
* contents. Now this *may* happen, but
* it's quite unlikely.
*/
if(debug)
prog_clean(),printf("! >%s< >%s< CRC mis-match\n",path, (char*)name_dbt.data);
} /* same CRC */
} /* same size */
} /* while files of at least this size */
} /* if duplicates */
/* At this point, the file is unique or it is the first of
* several hardlinked files.
*/
/* Put it into name database; omit files with only 1 link if
* not searching for duplicates. Also omit files of size 0 if
* the user wants them suppressed, unless they have more than
* 1 link.
*/
if(nlinks>1 || (duplicates && (size || !zero)))
{
DBT name_dbt,dev_ino_dbt;
if(debug)
prog_clean(),printf("take %s\n",path);
++filenames_stored;
name_dbt.data = (void*)path;
name_dbt.size = strlen(path)+1;
dev_ino_dbt.data = &size_data.dik;
dev_ino_dbt.size = sizeof size_data.dik;
switch(name_db->put(name_db,&dev_ino_dbt,&name_dbt,R_NOOVERWRITE))
{
case -1: /* database error */
perror("name_db->put() failed");
assert(0);
return;
case 1: /* Same device/inode pair a second time,
* without hardlink; happens if you supply it
* twice on the command line. We will ignore
* it silently.
*/
if(debug)
prog_clean(),fprintf(stderr,"found >%s< twice\n",path);
return;
case 0: /* ok */
break;
}
}
/* Size db; only necessary if searching for duplicates. Omit
* files of size 0 if the user doesn't want to see them.
*/
if(duplicates && (size || !zero))
{
DBT size_dbt, dev_ino_dbt;
++filesizes_stored;
size_dbt.data = &size;
size_dbt.size = sizeof size;
dev_ino_dbt.data = &size_data;
dev_ino_dbt.size = sizeof size_data;
switch(size_db->put(size_db,&size_dbt,&dev_ino_dbt,0))
{
case -1: /* database error */
perror("size_db->put() failed");
assert(0);
return;
case 1: /* key already in the database; should be
 * impossible because duplicate keys are
 * allowed (R_DUP)
 */
assert(0);
return;
case 0: /* ok */
break;
}
}
}
/* nftw() callback */
static int addfile(const char *path, const struct stat *stat, int flag, struct FTW *ftw_info)
{
/* unused */
ftw_info = ftw_info;
switch(flag)
{
case FTW_F: /* A regular file. */
/* ignore devices etc. */
if(S_ISREG(stat->st_mode))
gather(stat->st_dev,stat->st_ino,stat->st_nlink,stat->st_size,path);
else
++nonreg_found;
return 0;
case FTW_NS: /* A file for which no stat(2)
* information was available. The
* contents of the stat structure are
* undefined.
*/
++unstat_files;
prog_clean();
fprintf(stderr,"cannot stat() >%s<\n",path);
return 0;
case FTW_DNR: /* A directory which cannot be read. The
* directory will not be descended into.
*/
++unreadable_directories;
prog_clean();
fprintf(stderr,"cannot read directory >%s<: ",path);
perror(0);
return 0;
case FTW_D: /* A directory being visited in
* pre-order.
*/
case FTW_DP: /* A directory being visited in post-order
* (nftw() only).
*/
++directories_visited;
if(progress)
{
fprintf(stderr,"%.*s",(int)(columns-1),path);
prog_clean();
putc('\r',stderr);
fflush(stderr);
}
return 0;
case FTW_SL: /* A symbolic link. */
case FTW_SLN: /* A symbolic link with a non-existent
* target (nftw() only).
*/
++symlinks_found;
if(symbolics)
{
char buf[2000];
int len = readlink(path,buf, sizeof buf);
if(len==-1)
{
prog_clean();
fprintf(stderr,"cannot read symlink >%s<: ",path);
perror(0);
}
else
prog_clean(),printf("%s >%.*s<\t>%s<\t>%lld<\n",flag==FTW_SL?"->":"o>",len,buf,path,stat->st_size);
}
return 0;
/* ftw() errors */
default:
return 1;
}
}
/* Statistics are output at program end, if desired, and on
 * SIGINFO.
 */
static void stat_dump(FILE *f)
{
prog_clean();
fprintf(f,
"%8u directories visited\n"
"%8u unreadable directories skipped\n"
"%8u regular files found\n"
"%8u files of size 0\n"
"%8u symbolic links found\n"
"%8u devices and sockets skipped\n"
"%8u files without stat information skipped\n"
"%8u unreadable files\n"
"%8u files having more than one link\n"
"%8u hardlink pairs confirmed\n"
"%8u duplicates considered\n"
"%8u cases of matching filesizes\n"
"%8u calculated CRCs\n"
"%8u times found CRCs in database\n"
"%8u pairs of matching CRCs found\n"
"%8u duplicates found\n"
"%8llu bytes in duplicates\n"
"%8u filenames in database\n"
"%8u sizes and CRCs in database\n"
"%8g bytes/sec read\n",
directories_visited,
unreadable_directories,
found_files,
zero_files,
symlinks_found,
nonreg_found,
unstat_files,
unreadable_files,
possible_hardlinks,
found_hardlinks,
duplicates_considered,
matching_sizes,
calculated_crcs,
cached_crcs,
matching_crcs_found,
duplicates_found,
bytes_in_dups,
filenames_stored,
filesizes_stored,
(double)bytes_read/(usec_read?usec_read:1)*1000000
);
}
static void siginfo(int signo)
{
const int e = errno;
signo=signo;
stat_dump(stderr);
errno = e;
}
int main(int argc, char **argv)
{
int c;
/* look where to place the databases */
char *tmp = getenv("TMPDIR");
if(!tmp) tmp = "/tmp";
setvbuf(stdout,0,_IOLBF,0);
while((c=getopt(argc,argv,"cshdv0p"))!=-1)
switch(c)
{
case 'c': counters=1; break;
case 's': symbolics=1; break;
case 'h': hards=1; break;
case 'd': duplicates=1; break;
case 'v': debug=1; break;
case '0': zero=1; break;
case 'p':
{
/* With a null errret, setupterm() would print its own message
 * and exit on failure; pass an errret so we can report the
 * error ourselves (setupterm() doesn't set errno, so perror()
 * would be misleading here).
 */
int err;
if(setupterm(0, STDERR_FILENO, &err)==OK)
progress=1;
else
{
fprintf(stderr,"cannot show progress indicator (setupterm() error %d)\n",err);
return 1;
}
}
break;
case '?':
default:
fputs("usage: dupfind [-cshdv0p] [path ...]\n",stderr);
return 2;
}
argc-=optind;
argv+=optind;
{ /* btree access method for name database */
char namedb_name[80];
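/* Positional BTREEINFO initializer: the sixth member is the
 * compare function (flags, cachesize, maxkeypage, minkeypage,
 * psize, compare, prefix, lorder in <db.h>).
 */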
BTREEINFO bi = { 0,0,0,0,0,device_inode_compare,0,0 };
int fd;
int namlen = snprintf(namedb_name,sizeof namedb_name,"%s/dupfind.names.XXXXXX",tmp);
assert(namlen>0 && namlen<(int)sizeof namedb_name);
assert((fd=mkstemp(namedb_name))>=0);
/* can't use the fd for the database... */
close(fd);
if(!(name_db = dbopen(namedb_name,O_EXLOCK|O_RDWR,0,DB_BTREE,&bi)))
{
perror("dbopen() failed");
return 3;
}
/* don't leave the file hanging around after program end */
if(!debug)
remove(namedb_name);
}
{ /* btree access method for size database */
char sizedb_name[80];
BTREEINFO bi = { R_DUP,0,0,0,0,size_compare,0,0 };
int fd;
int namlen=snprintf(sizedb_name,sizeof sizedb_name,"%s/dupfind.sizes.XXXXXX",tmp);
assert(namlen>0 && namlen<(int)sizeof sizedb_name);
assert((fd=mkstemp(sizedb_name))>=0);
close(fd);
if(!(size_db = dbopen(sizedb_name,O_EXLOCK|O_RDWR,0,DB_BTREE,&bi)))
{
perror("dbopen() failed");
return 4;
}
if(!debug)
remove(sizedb_name);
}
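/* SIGINFO (typically sent by typing ^T on a BSD terminal) dumps
 * the statistics while the scan is still running.
 */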
signal(SIGINFO, siginfo);
srand(time(0));
for(;*argv;argv++)
if(nftw(*argv,addfile,10,FTW_PHYS)==-1)
{
perror("nftw() failed");
return 5;
}
name_db->close(name_db);
size_db->close(size_db);
prog_clean();
if(counters)
stat_dump(stdout);
return 0;
}