/*- * Copyright (c) 2007 B. Luevelsmeyer * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: dupfind.c,v 1.19 2007/05/03 14:34:47 bernd Exp $ */ /* Find duplicate files; duplicates in this context are files * that have the same contents without being hardlinked or * softlinked to each other. * * All files are compared by their length, and files of equal * length are compared by a checksum. (These checksums are stored * in a database, so the checksum is computed only once per * file.) If the checksums match, then a direct comparison of the * files' contents is done. * * This source is using the 'fastcrc' library by G. Adam * Stanislav to compute the checksums. * * I've used CRC checksums rather than MD5 checksums because they * compute faster, and because they are 32 bits wide, which * hopefully will make life easier for the database. Anyway, CRCs * are good enough; I've only had 2 or 3 matching CRCs for * non-matching files while comparing several million files. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef NDEBUG #error NDEBUG should not be defined for this source #endif /* Command line options */ static int symbolics, /* output symlinks */ hards, /* output hardlinks */ duplicates, /* output file copies */ debug, /* debug output for developers */ progress, /* progress output (current directory) */ counters, /* output statistic counters */ zero; /* don't output files of size 0 as duplicates */ /* Statistics counters */ static unsigned directories_visited, unreadable_directories, found_files, nonreg_found, unstat_files, filenames_stored, filesizes_stored, possible_hardlinks, found_hardlinks, duplicates_considered, matching_sizes, calculated_crcs, cached_crcs, matching_crcs_found, duplicates_found, symlinks_found,unreadable_files,zero_files; static unsigned long long bytes_in_dups,usec_read,bytes_read; /* Clear line to eol if progress output is shown, preserving * errno. */ static int p(int c) { return putc(c,stderr); } static void prog_clean(void) { if(progress) { int e = errno; tputs(clr_eol,1,p); errno = e; } } /* Two buffers for calculating CRCs and comparing files. */ /* I tried mmap()ing the files instead of reading them into * buffers, but it only gives me 60% of the read() speed, and * there's a 2 GB limit on mmap(). */ #define BUFFLEN (20*1024) static char buf1[BUFFLEN], buf2[BUFFLEN]; /* Compute a file's CRC checksum, using the fastcrc lib. Returns * the CRC, or a random number if the file cannot be read. * * In case a CRC cannot be computed, we simulate a CRC by rand() * in order to not have all unreadable files appear to have the * same constant CRC, forcing useless comparisons. This "wrong" * CRC doesn't harm, as ultimately file duplicates will only be * found if the files actually are readable and equal. For the * same reason, the quality of the random number generator * doesn't matter too much as long as the numbers don't repeat * too often. */ static unsigned get_crc(const char *path, off_t len) { int fd = open(path,O_RDONLY); ++calculated_crcs; if(fd<0) { ++unreadable_files; prog_clean(); fprintf(stderr,"cannot open >%s<: ",path); perror(0); return rand(); } else { unsigned crc = ecrcinit(); off_t complete = 0; struct timeval prev,aft; gettimeofday(&prev,0); for(;;) { const ssize_t bytes=read(fd,buf1,sizeof buf1); if(bytes<0) { ++unreadable_files; prog_clean(); fprintf(stderr,"error reading from >%s<: ",path); perror(0); close(fd); return rand(); } else if(bytes==0) break; else { crc = ecrcpartial(crc,buf1,bytes); complete += bytes; } } gettimeofday(&aft,0); usec_read += ((long long)aft.tv_sec*1000000+aft.tv_usec) - ((long long)prev.tv_sec*1000000+prev.tv_usec); bytes_read += complete; crc = ecrcfinish(crc); close(fd); if(complete!=len) { prog_clean(); fprintf(stderr,"file size mismatch for >%s<\n",path); } if(debug) prog_clean(),printf("::%s: %#x\n",path,crc); return crc; } } /* Compare two files. Returns 0 if the files are equal, !=0 else. */ static int files_compare(const char *a, const char *b, off_t len) { struct timeval prev,aft; int fd_a = open(a,O_RDONLY); int fd_b; off_t complete = 0; if(fd_a<0) { prog_clean(); fprintf(stderr,"cannot open >%s<: ",a); perror(0); return 1; } fd_b = open(b,O_RDONLY); if(fd_b<0) { prog_clean(); fprintf(stderr,"cannot open >%s<: ",b); perror(0); close(fd_a); return 1; } gettimeofday(&prev,0); for(;;) { const ssize_t bytes_a = read(fd_a,buf1,sizeof buf1); const ssize_t bytes_b = bytes_a>=0 ? read(fd_b,buf2,bytes_a?bytes_a:1) : 0; if(bytes_a<0 || bytes_b<0) { /* read error */ prog_clean(); fprintf(stderr,"cannot read >%s<: ",bytes_a<0?a:b); perror(0); close(fd_a); close(fd_b); return 1; } else if(bytes_a==0 && bytes_b==0) { /* both ended */ close(fd_a); close(fd_b); gettimeofday(&aft,0); usec_read += ((long long)aft.tv_sec*1000000+aft.tv_usec) - ((long long)prev.tv_sec*1000000+prev.tv_usec); bytes_read += 2*complete; if(complete!=len) { prog_clean(); fprintf(stderr,"files of wrong length: >%s< and >%s<\n",a,b); return 1; } else return 0; } else if(bytes_a==0 || bytes_b==0) { /* one file ended, the other not */ prog_clean(); fprintf(stderr,"filesize mismatch: >%s< and >%s<\n",a,b); close(fd_a); close(fd_b); return 1; } else { /* read from both files */ complete += bytes_a; /* now look if we got the same amount... */ if(bytes_a!=bytes_b) { prog_clean(); fprintf(stderr,"short read() from >%s<\n",bytes_adata; const struct dev_ino_key *b = b_->data; assert(a_->size == b_->size && a_->size==sizeof *a); if(a->dev==b->dev) return a->inodeinode ? -1 : a->inode>b->inode; else return a->devdev ? -1 : 1; } /* Size-database, indexed by sizes and containing CRC and * device/inode. This database is used to check whether there are * matching CRCs. Of course, this database may contain many * instances of the same key, because files of the same length * aren't unusual. */ static DB *size_db; /* The key is simply an off_t, containing the file's length. */ /* The data is a struct: */ struct size_data { struct dev_ino_key dik; /* device and inode of the file */ off_t size; /* length of the file */ unsigned crc; /* CRC for the file, if known */ unsigned crc_known; /* indicates whether the CRC is known */ }; /* Compare two file sizes */ static int size_compare(const DBT *a_, const DBT *b_) { off_t a,b; assert(a_->size==b_->size && a_->size==sizeof a); a=*(off_t*)(a_->data); b=*(off_t*)(b_->data); return ab; } /* A new filename has been read by nftw(). Look if it's some sort * of copy. */ static void gather(dev_t device, ino_t inode, nlink_t nlinks, off_t size, const char *path) { /* description of this file, for entry into the databases */ struct size_data size_data; size_data.dik.dev = device; size_data.dik.inode = inode; size_data.size = size; size_data.crc_known = 0; ++found_files; if(!size) ++zero_files; if(debug) prog_clean(),printf(">>%s\n",path); /* Look if device/inode already known (i.e. this is a * hardlink to a known file); skip this, if the file doesn't * have hardlinks, because then we don't need to look for * them. * * Even if we don't output hardlinks, this is still * necessary, because if it's a hardlink to a known file then * it also contains the same contents, but we don't want to * report it as a duplicate, so we must find the hardlink * information in order to avoid finding the file as a * duplicate. */ if(nlinks>1) { DBT name_dbt; DBT dev_ino_dbt; dev_ino_dbt.data = &size_data.dik; dev_ino_dbt.size = sizeof size_data.dik; ++possible_hardlinks; switch(name_db->get(name_db,&dev_ino_dbt,&name_dbt,0)) { case 0: ++found_hardlinks; if(debug) prog_clean(),puts("(hardlink found)"); if(hards) prog_clean(),printf("== >%s<\t>%s<\t>%lld<\n",(char*)name_dbt.data,path,size); /* It is not necessary to consider the file any * more, because all the considering may be done * to the hardlink that's in the database * already. */ return; case 1: /* not found */ if(debug) prog_clean(),puts("(no hardlink)"); break; case -1: /* db error */ perror("name_db->get() failed"); assert(0); return; } } /* This is not a known hard link; might be a duplicate. */ /* For all files of this size in the size db, check if the * CRC matches, and if so, compare the files directly; omit * if duplicates are not required. Don't consider files of * size 0 if the user wants them to be suppressed. */ if(duplicates && (size || !zero)) { int res; DBT s_key,dev_ino_crc; off_t req_size = size; /* Don't use the parameter, it might * be overwritten by the * search result. */ s_key.data = &req_size; s_key.size = sizeof req_size; ++duplicates_considered; for(res=size_db->seq(size_db,&s_key,&dev_ino_crc,R_CURSOR); !res; res=size_db->seq(size_db,&s_key,&dev_ino_crc,R_NEXT)) { off_t *sk = s_key.data; assert(s_key.size == sizeof *sk); if(*sk!=size) /* We are done, no more size-matches, so the file * is unique. */ break; else /* The file has the same size - this is a * potential duplicate. */ { /* So at this point in time and space, we need * the CRCs to match. */ /* The filename of the old file in the databases */ DBT name_dbt = { 0,0 }; /* Device, inode and CRC of the old file - we've * got that already. */ struct size_data sd; assert(dev_ino_crc.size == sizeof sd); sd = *(struct size_data *)dev_ino_crc.data; ++matching_sizes; /* get CRC of current file */ if(!size_data.crc_known) { size_data.crc = get_crc(path,size); size_data.crc_known = 1; } /* If CRC of old file unknown, get it (and update * database). */ if(!sd.crc_known) { /* First, get the filename from the name * database */ { DBT dev_ino_dbt; dev_ino_dbt.size = sizeof sd.dik; dev_ino_dbt.data = &sd.dik; assert(name_db->get(name_db,&dev_ino_dbt,&name_dbt,0)==0); } /* Then calculate the old file's CRC */ sd.crc = get_crc(name_dbt.data,size); sd.crc_known = 1; /* And stuff it into the size database so it * won't be calculated again */ { DBT key,data; key.data = &size; key.size = sizeof size; data.data = &sd; data.size = sizeof sd; assert(size_db->put(size_db,&key,&data,R_CURSOR)==0); } } else ++cached_crcs; /* At this point, we have both CRCs. */ if(size_data.crc==sd.crc) { /* The files have the same checksum and hence * are very likely to have the same contents. * A direct comparison won't hurt the * performance too much. */ /* Get name of old file from database, if not * already done for CRC calculation. */ if(!name_dbt.data) { DBT dev_ino_dbt; dev_ino_dbt.size = sizeof sd.dik; dev_ino_dbt.data = &sd.dik; assert(name_db->get(name_db,&dev_ino_dbt,&name_dbt,0)==0); } ++matching_crcs_found; if(debug) prog_clean(),printf("? comparing >%s< >%s<\n",path,(char*)name_dbt.data); if(!files_compare(path,name_dbt.data,size)) { /* Now the files really have the same * contents. */ ++duplicates_found; bytes_in_dups += size; prog_clean(); printf("|| >%s<\t>%s<\t>%lld<\n",(char*)name_dbt.data,path,size); /* If the current file has link count 1, then * it cannot be relevant in the future, * because it won't have hardlinks and * duplicates may compare to the existing * entry. */ if(nlinks==1) return; else /* No need to compare to other files * of the same size in the database; * but since the file has more than 1 * link, it has to go into the * database, in order to find * hardlinks. */ break; } else /* Strange stuff: The files have the same * CRC and the same length, but different * contents. Now this *may* happen, but * it's quite unlikely. */ if(debug) prog_clean(),printf("! >%s< >%s< CRC mis-match\n",path, (char*)name_dbt.data); } /* same CRC */ } /* same size */ } /* while files of at least this size */ } /* if duplicates */ /* At this point, the file is unique or it is the first of * several hardlinked files. */ /* Put it into name database; omit files with only 1 link if * not searching for duplicates. Also omit files of size 0 if * the user wants them suppressed, unless they have more than * 1 link. */ if(nlinks>1 || (duplicates && (size || !zero))) { DBT name_dbt,dev_ino_dbt; if(debug) prog_clean(),printf("take %s\n",path); ++filenames_stored; name_dbt.data = (void*)path; name_dbt.size = strlen(path)+1; dev_ino_dbt.data = &size_data.dik; dev_ino_dbt.size = sizeof size_data.dik; switch(name_db->put(name_db,&dev_ino_dbt,&name_dbt,R_NOOVERWRITE)) { case -1: /* database error */ perror("name_db->put() failed"); assert(0); return; case 1: /* Same device/inode pair a second time, * without hardlink; happens if you supply it * twice on the command line. We will ignore * it silently. */ if(debug) prog_clean(),fprintf(stderr,"found >%s< twice\n",path); return; case 0: /* ok */ break; } } /* Size db; only necessary if searching for duplicates. Omit * files of size 0 if the user doesn't want to see them. */ if(duplicates && (size || !zero)) { DBT size_dbt, dev_ino_dbt; ++filesizes_stored; size_dbt.data = &size; size_dbt.size = sizeof size; dev_ino_dbt.data = &size_data; dev_ino_dbt.size = sizeof size_data; switch(size_db->put(size_db,&size_dbt,&dev_ino_dbt,0)) { case -1: /* database error */ perror("size_db->put() failed"); assert(0); return; case 1: /* error because key already in database; * should be impossible because they are * allowed */ assert(0); return; case 0: /* ok */ break; } } } /* nftw() callback */ static int addfile(const char *path, const struct stat *stat, int flag, struct FTW *ftw_info) { /* unused */ ftw_info = ftw_info; switch(flag) { case FTW_F: /* A regular file. */ /* ignore devices etc. */ if(S_ISREG(stat->st_mode)) gather(stat->st_dev,stat->st_ino,stat->st_nlink,stat->st_size,path); else ++nonreg_found; return 0; case FTW_NS: /* A file for which no stat(2) * information was available. The * contents of the stat structure are * undefined. */ ++unstat_files; prog_clean(); fprintf(stderr,"cannot stat() >%s<\n",path); return 0; case FTW_DNR: /* A directory which cannot be read. The * directory will not be descended into. */ ++unreadable_directories; prog_clean(); fprintf(stderr,"cannot read directory >%s<: ",path); perror(0); return 0; case FTW_D: /* A directory being visited in * pre-order. */ case FTW_DP: /* A directory being visited in post-order * (nftw() only). */ ++directories_visited; if(progress) { fprintf(stderr,"%.*s",(int)(columns-1),path); prog_clean(); putc('\r',stderr); fflush(stderr); } return 0; case FTW_SL: /* A symbolic link. */ case FTW_SLN: /* A symbolic link with a non-existent * target (nftw() only). */ ++symlinks_found; if(symbolics) { char buf[2000]; int len = readlink(path,buf, sizeof buf); if(len==-1) { prog_clean(); fprintf(stderr,"cannot read symlink >%s<: ",path); perror(0); } else prog_clean(),printf("%s >%.*s<\t>%s<\t>%lld<\n",flag==FTW_SL?"->":"o>",len,buf,path,stat->st_size); } return 0; /* ftw() errors */ default: return 1; } } /* statistics are output at program end, if desired, and on * SIGINFO */ static void stat_dump(FILE *f) { prog_clean(); fprintf(f, "%8u directories visited\n" "%8u unreadable directories skipped\n" "%8u regular files found\n" "%8u files of size 0\n" "%8u symbolic links found\n" "%8u devices and sockets skipped\n" "%8u files without stat information skipped\n" "%8u unreadable files\n" "%8u files having more than one link\n" "%8u hardlink pairs confirmed\n" "%8u duplicates considered\n" "%8u cases of matching filesizes\n" "%8u calculated CRCs\n" "%8u times found CRCs in database\n" "%8u pairs of matching CRCs found\n" "%8u duplicates found\n" "%8llu bytes in duplicates\n" "%8u filenames in database\n" "%8u sizes and CRCs in database\n" "%8g bytes/sec read\n", directories_visited, unreadable_directories, found_files, zero_files, symlinks_found, nonreg_found, unstat_files, unreadable_files, possible_hardlinks, found_hardlinks, duplicates_considered, matching_sizes, calculated_crcs, cached_crcs, matching_crcs_found, duplicates_found, bytes_in_dups, filenames_stored, filesizes_stored, (double)bytes_read/(usec_read?usec_read:1)*1000000 ); } static void siginfo(int signo) { const int e = errno; signo=signo; stat_dump(stderr); errno = e; } int main(int argc, char **argv) { int c; /* look where to place the databases */ char *tmp = getenv("TMPDIR"); if(!tmp) tmp = "/tmp"; setvbuf(stdout,0,_IOLBF,0); while((c=getopt(argc,argv,"cshdv0p"))!=-1) switch(c) { case 'c': counters=1; break; case 's': symbolics=1; break; case 'h': hards=1; break; case 'd': duplicates=1; break; case 'v': debug=1; break; case '0': zero=1; break; case 'p': if(setupterm(0, STDERR_FILENO, 0)==OK) progress=1; else { perror("cannot show progress indicator"); return 1; } break; case '?': default: fputs("usage: dupfind [-cshdv0p] [path ...]\n",stderr); return 2; } argc-=optind; argv+=optind; { /* btree access method for name database */ char namedb_name[80]; BTREEINFO bi = { 0,0,0,0,0,device_inode_compare,0,0 }; int fd; int namlen = snprintf(namedb_name,sizeof namedb_name,"%s/dupfind.names.XXXXXX",tmp); assert(namlen>0 && namlen<(int)sizeof namedb_name); assert((fd=mkstemp(namedb_name))>=0); /* can't use the fd for the database... */ close(fd); if(!(name_db = dbopen(namedb_name,O_EXLOCK|O_RDWR,0,DB_BTREE,&bi))) { perror("dbopen() failed"); return 3; } /* don't leave the file hanging around after program end */ if(!debug) remove(namedb_name); } { /* btree access method for size database */ char sizedb_name[80]; BTREEINFO bi = { R_DUP,0,0,0,0,size_compare,0,0 }; int fd; int namlen=snprintf(sizedb_name,sizeof sizedb_name,"%s/dupfind.sizes.XXXXXX",tmp); assert(namlen>0 && namlen<(int)sizeof sizedb_name); assert((fd=mkstemp(sizedb_name))>=0); close(fd); if(!(size_db = dbopen(sizedb_name,O_EXLOCK|O_RDWR,0,DB_BTREE,&bi))) { perror("dbopen() failed"); return 4; } if(!debug) remove(sizedb_name); } signal(SIGINFO, siginfo); srand(time(0)); for(;*argv;argv++) if(nftw(*argv,addfile,10,FTW_PHYS)==-1) { perror("nftw() failed"); return 5; } name_db->close(name_db); size_db->close(size_db); prog_clean(); if(counters) stat_dump(stdout); return 0; }