/* * Copyright (c) 2001-2005 Greg Becker. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $Id: dits.c 30 2005-10-28 01:38:08Z greg $ */ /* dits - Data Integrity Test Suite * * dits is a tool for validating disk drivers, file systems, and * lock managers. More than an exerciser, dits is able to verify * that no detectable data integrity errors have occurred. * * The tool does its work in three mutually exclusive phases: * * (1) Init - Write a unique ID to each sector in the test bed. * * (2) Test - Continuously select arbitrary non-overlapping ranges of * sectors and swap them. File range locking is used to ensure mutual * exclusion amongst concurrently executing processes working in the * same test bed. 
 *
 * (3) Check - Verify that each and every unique ID written in
 * the init phase is intact and neither missing nor duplicated.
 */

/* Version strings; svnrev is passed to svnrev2num() to obtain the
 * numeric revision that init() stamps into every sector and that
 * main() prints for -V.
 */
static char svnid[] = "$Id: dits.c 30 2005-10-28 01:38:08Z greg $";
static char svnrev[] = "$Revision: 30 $";

/* NOTE(review): every system #include directive below has lost its
 * header name (the <...> part appears to have been stripped from this
 * copy of the file).  The directives are left exactly as found; restore
 * the header names from a pristine copy before building.
 */
#if HAVE_CONFIG_H
#include "config.h"
#endif

#include

#if STDC_HEADERS
#include
#include
#include
#else
#if HAVE_STDLIB_H
#include
#endif
#endif

#if HAVE_STRING_H
#if !STDC_HEADERS && HAVE_MEMORY_H
#include
#endif
#include
#endif

#if HAVE_STRINGS_H
#include
#endif

#if HAVE_INTTYPES_H
#include
#else
#if HAVE_STDINT_H
#include
#endif
#endif

#if HAVE_UNISTD_H
#include
#endif

#if HAVE_ERRNO_H
#include
#endif

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#if HAVE_AIO
#include
#endif /* HAVE_AIO */

/* NOTE(review): NDEBUG is defined after the system headers above, so
 * whether it actually disables the assert() calls in this file depends
 * on where <assert.h> is (re)included -- confirm.
 */
#define NDEBUG

#include "dits.h"

/* Print a printf-style message on stdout when the global verbosity
 * level is at least lvl.
 */
#define DPRINTF(lvl, ...) \
do { \
    if (verbosity >= (lvl)) { \
        printf(__VA_ARGS__); \
    } \
} while (0)

/* Sector size in bytes, used to convert a partition size in bytes to
 * a sector count; only defined here if the system headers did not.
 */
#ifndef DEV_BSIZE
#define DEV_BSIZE (512)
#endif

/* Default min/max number of sectors to manage read/write per syscall
 * during test().
 */
#define MINSWAPSECTORS 1LL /* Must be a power of 2 !!! */
#define MAXSWAPSECTORS 2048LL /* Must be a power of 2 !!! */

/* All the test processes share a table of info records into which they
 * record their individual current active range locks.  If a test
 * process dies, its file locks are released automatically by the
 * OS, but its range locks in the info table persist, which is a
 * good thing because it keeps other test processes from using those
 * locked ranges until the parent has a chance to roll back the
 * transaction and release the locks.
 */
typedef struct {
    int locked;         /* Range locks are valid */
    pid_t pid;          /* Pid of test process */
    struct flock src;   /* Source range lock */
    struct flock dst;   /* Destination range lock */
} info_t;

/* Run time check object.  For a given testbed of nsectors, an array of
 * nsectors rtck_t objects tracks the ID and CRC of each record on disk.
 * This provides a modicum of run time sanity checking, but comes at
 * the expense of memory usage directly proportional to the size of
 * the testbed.  The rtck array is indexed by sector to retrieve the
 * in core information related to what is supposed to be in that sector
 * on disk.  It is compared to the data returned from disk for the same
 * sector, thereby allowing dits to catch errant underlying subsystems
 * that return the wrong data.
 */
typedef struct {
    int64_t id;         /* Record ID expected in this sector */
    crc32_t crc;        /* CRC expected for this sector */
} rtck_t;

/* The only type of repair that is possible is recovery from a
 * swap that has only partially taken place.  E.g., while swapping
 * blocks A and B, after reading them both into core we write A's
 * data to block B and then crash.  On the death of a child, the
 * parent scans the blocks of the child's locked ranges and
 * generates a defect list of all the id's found and the address
 * at which they were found (x and y).  So, x[id] contains the
 * sector number of the record with the id.  If an id is encountered
 * a second time, its address is stored in y[id].  We can then
 * simply scan y for valid entries and roll them all back.
 *
 * All five arrays hold nsectors entries (see defect_alloc()); an
 * entry of -1 in x or y means "not seen".
 */
typedef struct {
    int64_t nsectors;   /* Number of sectors in x&y lists */
    int64_t inplace;    /* # whose id matched block */
    int64_t *id;        /* Record ID */
    off_t *x;           /* First occurrence list */
    off_t *y;           /* Second occurrence list */
    char **msg;         /* Detailed message */
    crc32_t *crc;       /* Per-entry CRC, printed in check()'s CRC column */
} defect_t;

#if HAVE_AIO

/* Wrappers that hide the argument differences between legacy AIX aio
 * and POSIX aio for aio_error() and aio_suspend().
 */
#ifdef _AIO_AIX_SOURCE /* Legacy aio on aix. */
#define AIO_ERROR(aiocb) aio_error((aiocb)->aio_handle)
#define AIO_SUSPEND(ap, apsz) aio_suspend((apsz), (ap))
#else /* POSIX aio. */
#define AIO_ERROR(aiocb) aio_error((aiocb))
#define AIO_SUSPEND(ap, apsz) \
    aio_suspend((const struct aiocb **)(ap), (apsz), (struct timespec *)0)
#endif /* _AIO_AIX_SOURCE */

/* xaio is used to track aio reads and writes.  Basically, every
 * swap involves two xaio records which are tied together via
 * the "sibling" link.  The records are initialized and dispatched.
* Rather than wait on the request, the caller continues, * generating more requests, periodically checking and dealing * with requests that finished. */ typedef struct xaio { struct aiocb aio; /* Must be first field!!! */ struct flock lk; sector_t *sb; int nrgns; int done; int idx; struct xaio *sibling; int read; struct xaio *next; /* xaio free queue list node */ u_long magic; } xaio_t; static xaio_t *xaiohead = 0; int getidx(int n, xaio_t **ap, int *pidx); int chkoverlap(int n, xaio_t **ap, xaio_t *tgt); int getxaiobuf(int nrgns, int fd, xaio_t **x1, xaio_t **x2); void freexaiobuf(xaio_t *x1, xaio_t *x2); void testaio(info_t *infobase, rtck_t *rtckbase, int fdInfo, int fdLock, int fdPart, int64_t nsectors); typedef struct aiocb *aiocbptr_t; #else typedef void *aiocbptr_t; #endif /* HAVE_AIO */ /* Variables set by the option parser. * * f{varname} - flag, true if option was given * arg{varname} - optarg argument */ int fAIO, fCheck, fConcurrency, fDump, fDup, fHelp, fInit, fOSeek; int fRepair, fTest, fVersion; int fExec = 1; int fHdrs = 1; int fRtck = 1; char *argSession; int verbosity = 0; char *excludes[256]; char given[256]; int64_t maxswapsectors = MAXSWAPSECTORS; int64_t minswapsectors = MINSWAPSECTORS; long maxiterations = 0; u_long maxblksps = 0; /* Max blocks per second */ off_t oseek = 0; /* Device seek offset from zero */ int naioreqs = 2; int nprocs = 4; char *progname; time_t gctime = 0; char *gtmpdir = "/var/tmp"; int fSigAlarm; int fSig; char state[256]; /* For initstate() */ void init(char *partition, int64_t sectors); void check(char *partition); void teststart(char *partition); void test(info_t *infobase, rtck_t *rtckbase, int fdInfo, int fdLock, int fdPart, int64_t nsectors); void dump(char *partition, off_t start, off_t stop); void chkrgn(FILE *fp, sector_t *sb, off_t offset, size_t len, rtck_t *rtckbase, defect_t *d, crc32_t *signature); void saverollback(sector_t *x, sector_t *y, int64_t nrgns); void dumprgn(FILE *fp, sector_t *sb, off_t 
offset, int64_t nrgns, rtck_t *rtck); void updatergn(sector_t *sb, off_t src, off_t dst, size_t len, pid_t pid, time_t time, rtck_t *rtckbase); int repair(int fdPart, defect_t *d); int cleanlocks(info_t *victim, int fdInfo, int fdPart, int64_t nsectors); RETSIGTYPE sigHandler(int sig); RETSIGTYPE sigAlarm(int sig); void getrgn(int fd, sector_t *sb, off_t offset, size_t len, aiocbptr_t aio); void putrgn(int fd, sector_t *sb, off_t offset, size_t len, aiocbptr_t aio); long svnrev2num(char *revision); int getpsize(int fd, off_t *psize); void getustime(long *utime, long *stime); int dupsession(char *old, char *new); void usage() { printf("usage: %s -i [-Hv] [-o skip] [-S max] partition nsectors\n", progname); printf("usage: %s -t [-Hnv]" #if HAVE_AIO " [-a naioreqs]" #endif /* HAVE_AIO */ " [-b bps] [-C nprocs] [-I iter] [-o skip]" #if HAVE_MMAP " [-r]" #endif /* HAVE_MMAP */ " [-S max] [-s min] partition\n", progname); printf("usage: %s -c [-HRv] " #if HAVE_AIO " [-a naioreqs]" #endif /* HAVE_AIO */ " [-o skip]" #if HAVE_MMAP " [-r]" #endif /* HAVE_MMAP */ " [-S max] partition\n", progname); printf("usage: %s -D src dst\n", progname); printf("usage: %s -d [-Hv] [-o skip] [-S max] " "partition [start [stop]]\n", progname); printf("usage: %s -h\n", progname); printf("usage: %s -V\n", progname); #if HAVE_AIO printf("-a nreqs use aio, limit to nreqs outstanding requests\n"); #endif /* HAVE_AIO */ printf("-b bps limit throughput to bps blocks/sec\n"); printf("-C nprocs set concurrency (default: %d)\n", nprocs); printf("-c check\n"); printf("-D old new duplicate session 'old' to 'new'\n"); printf("-d dump the given range of sectors\n"); printf("-H suppress column headers\n"); printf("-h show this help list\n"); printf("-I iter max iterations\n"); printf("-i initialize test bed\n"); printf("-N name specify the session name\n"); printf("-n don't perform write operations\n"); printf("-o skip skip first nsectors in partition\n"); printf("-R attempt to repair a corrupted test 
bed\n"); #if HAVE_MMAP printf("-r disable run time sanity checking\n"); #endif /* HAVE_MMAP */ printf("-S max maximum sectors to swap (default: %lld)\n", maxswapsectors); printf("-s min minimum sectors to swap (default: %lld)\n", minswapsectors); printf("-t test\n"); printf("-V print version\n"); printf("-v increase verbosity\n"); printf("partition a disk partition, volume, file, etc, ...\n"); printf("nsectors number of sectors in test bed\n"); printf("start starting block number\n"); printf("stop ending block number\n"); } int chkexcludes(int c) { char *msg = (char *)0; char *pc; int i; /* First, check to see if any option already given excludes * this option. */ for (i = 0; i < 256; ++i) { if (given[i]) { for (pc = excludes[i]; pc && *pc; ++pc) { if (*pc == (char)c) { msg = "excludes"; goto done; } } } } /* Next, check to see if this option excludes any already given. */ for (pc = excludes[c]; pc && *pc; ++pc) { if (given[(int)*pc]) { msg = "is excluded by"; i = *pc; goto done; } } done: if (msg) { eprintf("%s: option `-%c' %s `-%c', use `-h' for help\n", progname, (char)i, msg, (char)c); return i; } return 0; } int main(int argc, char **argv) { int nargsexpected; int nargsoptional; unsigned long ul; char *envopts; int c; DPRINTF(3, "sizeof(sector_t) == %lu\n", sizeof(sector_t)); assert(sizeof(sector_t) == 512); #if HAVE_AIO assert(sizeof(xaio_t) < sizeof(sector_t)); #endif /* HAVE_AIO */ progname = strrchr(argv[0], (int)'/'); progname = (progname ? progname+1 : argv[0]); (void)initstate((u_long)time((time_t *)0), state, sizeof(state)); crc32_init(); /* TODO: Get options from the environment. */ envopts = getenv("DITS"); if (envopts) { eprintf("DITS=%s ignored\n", envopts); } /* Initialize the option exlusion tables. 
*/ excludes['c'] = "bCDIhinstVX"; excludes['D'] = "bcCIhinstVX"; excludes['d'] = "abCDcIhinoRstVX"; excludes['h'] = "abCDcdIinoRrstVvX"; excludes['i'] = "abCDcdIhRstV"; excludes['R'] = "abCDdhIirstVX"; excludes['t'] = "cDdhiRV"; excludes['V'] = "abCDcdIhinoRrsStvX"; nargsexpected = 0; nargsoptional = 0; while (-1 != (c = getopt(argc, argv, ":a:b:C:DcdHhI:iN:no:RrS:s:tVvX:"))) { if (chkexcludes(c)) { exit(EX_USAGE); } given[c] = c; switch (c) { case 'a': #if HAVE_AIO fAIO = !0; naioreqs = atoi(optarg); if (naioreqs < 2) { eprintf("nreqs must be [2 < nreqs]\n"); exit(EX_USAGE); } if (naioreqs & 0x0001) { eprintf("nreqs must be a multiple of 2\n"); exit(EX_USAGE); } /* TODO: For now we set nprocs to 1 because mulitple * procs using aio don't play well together. */ if (fConcurrency && (nprocs > 1)) { DPRINTF(0, "Use of aio not compatible with " "more than one process. Using -C1\n"); } nprocs = 1; #else eprintf("Not built with -DHAVE_AIO.\n"); exit(EX_USAGE); #endif /* HAVE_AIO */ break; case 'b': maxblksps = strtoul(optarg, (char **)0, 0); if (maxblksps < 0) { eprintf("max blks/sec must be >= 0\n"); exit(EX_USAGE); } break; case 'C': fConcurrency = !0; nprocs = atoi(optarg); if (nprocs < 1) { eprintf("nprocs must be > 0\n"); exit(EX_USAGE); } if (fAIO && (nprocs > 1)) { DPRINTF(0, "Use of aio not compatible with " "more than one process. 
Using -C1\n"); nprocs = 1; } break; #if HAVE_MMAP case 'D': fDup = !0; nargsexpected = 2; break; #endif /* HAVE_MMAP */ case 'c': fCheck = !0; nargsexpected = 1; break; case 'd': fDump = !0; nargsexpected = 1; nargsoptional = 2; break; case 'H': fHdrs = 0; break; case 'h': fHelp = !0; break; case 'I': maxiterations = strtol(optarg, (char **)0, 0); if (maxiterations < 1) { eprintf("max iterations must be > 0\n"); exit(EX_USAGE); } break; case 'i': fInit = !0; nargsexpected = 1; nargsoptional = 1; break; case 'N': argSession = optarg; break; case 'n': fExec = 0; break; case 'o': fOSeek = !0; oseek = strtoll(optarg, (char **)0, 0); if (oseek < 0) { eprintf("oseek must be >= 0\n"); exit(EX_USAGE); } break; case 'R': fRepair = !0; break; case 'r': fRtck = 0; break; case 'S': maxswapsectors = strtoll(optarg, (char **)0, 0); if (maxswapsectors < 0) { eprintf("max swap sectors must be > 0\n"); exit(EX_USAGE); } else if (maxswapsectors < minswapsectors) { eprintf("max swap sectors (%lld) must be " ">= %lld\n", maxswapsectors, minswapsectors); exit(EX_USAGE); } break; case 's': minswapsectors = strtoll(optarg, (char **)0, 0); if (minswapsectors < 1) { eprintf("min swap sectors must be > 0\n"); exit(EX_USAGE); } else if (minswapsectors > maxswapsectors) { eprintf("min swap sectors must be <= %lld\n", maxswapsectors); exit(EX_USAGE); } break; case 't': fTest = !0; nargsexpected = 1; break; case 'V': fVersion = !0; break; case 'v': ++verbosity; break; case 'X': ul = strtoul(optarg, (char **)0, 0); (void)initstate(ul, state, sizeof(state)); break; case '?': eprintf("invalid option `-%c'\n", optopt); exit(EX_USAGE); default: eprintf("option `-%c' requires a parameter\n", optopt); exit(EX_USAGE); } } argc -= optind; argv += optind; if (argc < nargsexpected) { eprintf("mandatory arguments required, use `-h' for help\n"); exit(EX_USAGE); } else if (argc > nargsexpected + nargsoptional) { eprintf("extraneous arguments detected, use `-h' for help\n"); exit(EX_USAGE); } if (fInit) { 
int64_t nsectors = 0; if (argc > 1) { nsectors = strtoll(argv[1], (char **)0, 0); if (nsectors < 0) { eprintf("nsectors < 0... surely you jest?\n"); exit(EX_USAGE); } } init(argv[0], nsectors); } else if (fTest) { teststart(argv[0]); } else if (fCheck) { check(argv[0]); } else if (fDup) { dupsession(argv[0], argv[1]); } else if (fDump) { off_t start = 0; off_t stop = 0; if (argc > 1) { start = strtoll(argv[1], (char **)0, 0); if (start < 0) { eprintf("start must be >= 0\n"); exit(EX_USAGE); } } if (argc > 2) { stop = strtoll(argv[2], (char **)0, 0); if (stop < start) { eprintf("stop must be >= start (%lld)\n", start); exit(EX_USAGE); } } dump(argv[0], start, stop); } else if (fHelp) { usage(); } else if (fVersion) { printf("%s.%ld\n", VERSION, svnrev2num(svnrev)); } else { eprintf("one of [-cdhitV] required, use `-h' for help\n"); exit(EX_USAGE); } return 0; } /* Write a unique record to each sector of the given partition. */ void init(char *partition, int64_t nsectors) { char *partbasename; char infopath[128]; char lockpath[128]; char rtckpath[128]; uint32_t revision; uint64_t sbflags; rtck_t *rtckbase; char *src, *dst; sector_t *sb; size_t sbsz; time_t now; int fdPart; int fdRTCK; int i, j; now = time((time_t *)0); fdPart = open(partition, O_RDWR); if (-1 == fdPart) { eprintf("open(%s): %s\n", partition, strerror(errno)); exit(EX_OSERR); } partbasename = strrchr(partition, (int)'/'); if (!partbasename) { partbasename = partition; } else { ++partbasename; } /* Use the partition base name for the session name if the * session name wasn't specified. */ if (!argSession) { argSession = partbasename; } /* If nsectors is zero then we try to determine and use the full * size of the partition. 
*/ if (nsectors == 0) { struct stat sb; int64_t psize; int rc; rc = fstat(fdPart, &sb); if (rc) { eprintf("init: fstat(%s): %s\n", partition, strerror(errno)); exit(EX_OSERR); } if (S_ISBLK(sb.st_mode)) { eprintf("Warning: Using block device yields inaccurate " "results in determining partition size\n"); eprintf("Please specify the test bed size.\n"); exit(EX_USAGE); } else if (S_ISCHR(sb.st_mode)) { rc = getpsize(fdPart, &psize); } else { psize = sb.st_size; } nsectors = psize / DEV_BSIZE; if (nsectors <= 0) { eprintf("Partition/volume/file must be at least " "one %d-byte sector\n", DEV_BSIZE); exit(EX_USAGE); } } if (-1 == lseek(fdPart, oseek * sizeof(*sb), SEEK_SET)) { eprintf("init: lseek(%s, %lld): %s\n", partition, oseek * sizeof(*sb), strerror(errno)); exit(EX_OSERR); } DPRINTF(2, "DEVICE: \t%s\n", partition); DPRINTF(2, "NSECTORS:\t%lld\n", nsectors); DPRINTF(2, "OSEEK: \t%lld\n", oseek); DPRINTF(2, "SESSION: \t%s\n", argSession); /* Create the shared run time check table. */ fdRTCK = -1; rtckbase = (rtck_t *)0; sbflags = 0; /* Try to create the run time check file. */ while (fRtck && (fdRTCK == -1)) { sbflags |= DITS_FRTCK; (void)snprintf(rtckpath, sizeof(rtckpath), "%s/%s-rtck-%s", gtmpdir, progname, argSession); DPRINTF(2, "Creating primary rtck file: %s\n", rtckpath); fdRTCK = open(rtckpath, O_CREAT|O_TRUNC|O_RDWR, 0600); if (-1 == fdRTCK) { DPRINTF(2, "open(%s): %s\n", rtckpath, strerror(errno)); exit(EX_OSERR); } if (-1 == ftruncate(fdRTCK, nsectors * sizeof(*rtckbase))) { eprintf("ftruncate(%s, %lld): %s\n", rtckpath, nsectors * sizeof(*rtckbase), strerror(errno)); exit(EX_OSERR); } rtckbase = (rtck_t *) mmap((void *)0, nsectors * sizeof(*rtckbase), PROT_READ|PROT_WRITE, MAP_SHARED, fdRTCK, 0); if (rtckbase == MAP_FAILED) { eprintf("mmap(%s): %s\n", rtckpath, strerror(errno)); exit(EX_OSERR); } /* Fill the whole file with zero writes so as * to prevent fragmentation. 
*/ bzero(rtckbase, nsectors * sizeof(*rtckbase)); } sbsz = maxswapsectors * sizeof(*sb); sb = malloc(sbsz); if (!sb) { eprintf("malloc(%d) failed\n", sbsz); exit(EX_OSERR); } bzero(sb, sbsz); revision = svnrev2num(svnrev); /* We buffer a "region" of changes and write them out * in one call to write(). */ for (i = 0; i < nsectors; i += maxswapsectors) { size_t min; min = DITS_MIN(nsectors - i, maxswapsectors); for (j = 0; j < min; ++j) { record_t *r = &sb[j].rec[0]; sb[j].magic = DITS_MAGIC; sb[j].revision = revision; sb[j].flags = sbflags; /* Initialize the invariant fields. */ r[0].id = i + j + oseek; r[0].nsectors = nsectors; r[0].ctime = now; r[1] = r[0]; r[1].id = -1; (void)strncpy(sb[j].session, argSession, sizeof(sb[j].session)); sb[j].session[sizeof(sb[j].session)-1] = '\0'; } bcopy(state, sb->payload, sizeof(sb->payload)); DPRINTF(3, "Initializing sectors [%lld - %lld]\n", i + oseek, i + oseek + min - 1); updatergn(sb, i + oseek, i + oseek, min, -1, now, rtckbase); putrgn(fdPart, sb, i + oseek, min, (aiocbptr_t)0); } if (-1 != fdRTCK) { (void)close(fdRTCK); } free(sb); (void)close(fdPart); /* Remove the info and lock files if they for some reason exist. */ (void)snprintf(infopath, sizeof(infopath), "%s/%s-info-%s", gtmpdir, progname, argSession); (void)unlink(infopath); (void)snprintf(lockpath, sizeof(lockpath), "%s/%s-locks-%s", gtmpdir, progname, argSession); (void)unlink(lockpath); } /* Allocate a defect table. This can be quite large for a large test bed. 
*/ defect_t * defect_alloc(int64_t nsectors) { defect_t *d; d = malloc(sizeof(*d)); if (!d) { eprintf("defect_alloc: malloc(%lu) failed\n", sizeof(*d)); return (defect_t *)0; } bzero((char *)d, sizeof(*d)); d->x = malloc(sizeof(*d->x) * nsectors); if (!d->x) { eprintf("defect_alloc: malloc(%lu) failed\n", sizeof(*d->x) * nsectors); free(d); return (defect_t *)0; } memset(d->x, -1, sizeof(*d->x) * nsectors); d->y = malloc(sizeof(*d->y) * nsectors); if (!d->y) { eprintf("defect_alloc: malloc(%lu) failed\n", sizeof(*d->y) * nsectors); free(d->x); return (defect_t *)0; } memset(d->y, -1, sizeof(*d->y) * nsectors); d->id = malloc(sizeof(*d->id) * nsectors); if (!d->id) { eprintf("defect_alloc: malloc(%lu) failed\n", sizeof(*d->id) * nsectors); free(d->y); free(d->x); return (defect_t *)0; } memset(d->id, -1, sizeof(*d->id) * nsectors); d->crc = malloc(sizeof(*d->crc) * nsectors); if (!d->crc) { eprintf("defect_alloc: malloc(%lu) failed\n", sizeof(*d->crc) * nsectors); free(d->id); free(d->y); free(d->x); return (defect_t *)0; } memset(d->crc, -1, sizeof(*d->crc) * nsectors); d->msg = malloc(sizeof(*d->msg) * nsectors); if (!d->msg) { eprintf("defect_alloc: malloc(%lu) failed\n", sizeof(*d->msg) * nsectors); free(d->crc); free(d->id); free(d->y); free(d->x); return (defect_t *)0; } memset(d->msg, 0, sizeof(*d->msg) * nsectors); d->nsectors = nsectors; return d; } void defect_free(defect_t *d) { int i; if (d) { for (i = 0; i < d->nsectors; ++i) { if (d->msg[i]) { free(d->msg[i]); } } free(d->msg); free(d->crc); free(d->id); free(d->y); free(d->x); free(d); } } /* Check that all the data written by init() is intact and that there * are no missing nor duplicated records. 
*/ void check(char *partition) { char *partbasename; char infopath[128]; char lockpath[128]; char rtckpath[128]; crc32_t signature; struct stat stat; rtck_t *rtckbase; int64_t nsectors; char *src, *dst; char hdr[128]; sector_t *sb; defect_t *d; size_t sbsz; int64_t rgn; int fdRTCK; int fdPart; int nerrs; int64_t i; int rc; fdPart = open(partition, (fRepair ? O_RDWR : O_RDONLY)); if (-1 == fdPart) { eprintf("check(%s): open: %s\n", partition, strerror(errno)); exit(EX_OSERR); } /* Get the base name of the partition. */ partbasename = strrchr(partition, '/'); if (!partbasename) { partbasename = partition; } else { ++partbasename; } sbsz = maxswapsectors * sizeof(*sb); sb = malloc(sbsz); if (!sb) { eprintf("check(%s): malloc(%lu) failed\n", partition, sbsz); exit(EX_OSERR); } /* Get the first record which tells how many records are * in the test bed. */ getrgn(fdPart, sb, oseek, (int64_t)1, (aiocbptr_t)0); chkrgn(stderr, sb, oseek, (int64_t)1, (rtck_t *)0, (defect_t *)0, (crc32_t *)0); nsectors = sb->rec[0].nsectors; gctime = sb->rec[0].ctime; DPRINTF(2, "DEVICE: \t%s\n", partition); DPRINTF(2, "NSECTORS:\t%lld\n", nsectors); DPRINTF(2, "OSEEK: \t%lld\n", oseek); DPRINTF(2, "SESSION: \t%s\n", sb->session); DPRINTF(2, "CTIME: \t%s", ctime((time_t *)&gctime)); /* Use the partition base name for the session name if the * session name wasn't specified. */ if (!argSession) { static char session[32]; (void)strncpy(session, sb->session, sizeof(session)); session[sizeof(session) - 1] = '\000'; argSession = session; } #if HAVE_MMAP /* Open the run time check table. */ fdRTCK = -1; rtckbase = (rtck_t *)0; /* Try to open the run time check file, but proceed regardless * of whether one is found. 
*/ while (fRtck && (fdRTCK == -1) && (sb->flags & DITS_FRTCK)) { (void)snprintf(rtckpath, sizeof(rtckpath), "%s/%s-rtck-%s", gtmpdir, progname, argSession); fdRTCK = open(rtckpath, O_RDWR); if (-1 == fdRTCK) { eprintf("The run time check file %s could not be " "opened.\n", rtckpath); eprintf("open(%s): %s\n", rtckpath, strerror(errno)); break; } rtckbase = (rtck_t *) mmap((void *)0, nsectors * sizeof(*rtckbase), PROT_READ|PROT_WRITE, MAP_SHARED, fdRTCK, 0); if (rtckbase == MAP_FAILED) { eprintf("mmap(%s): %s\n", rtckpath, strerror(errno)); exit(EX_OSERR); } /* Check that the number of entries in the mapped run time * cross check file agrees with the size of the test bed that * we just determined from the first sector in the test bed. */ rc = fstat(fdRTCK, &stat); if (rc) { eprintf("fstat(%s): %s\n", rtckpath, strerror(errno)); exit(EX_OSERR); } if (stat.st_size != (nsectors * sizeof(*rtckbase))) { eprintf("The number of sectors (%lld) as read from " "the first sector in the test bed doesn't " "agree with the number of entries in the run " "time cross check file (%lld)\n", (long long)nsectors, (long long)stat.st_size / sizeof(*rtckbase)); exit(EX_DATAERR); } /* TODO: Attempt to lock the file and deal with issues * arising from some other process using the file. */ DPRINTF(2, "Using rtck file: %s\n", rtckpath); } #else #error "TODO - this implementation does not support mmap" #endif /* HAVE_MMAP */ d = defect_alloc(nsectors + oseek); if (!d) { exit(EX_OSERR); } /* Iteratively read in chunks of the test bed so that chkrgn can * check the region and populate the defect table for each chunk. 
*/ signature = 0; if (fAIO) { #if HAVE_AIO xaio_t **axaio; xaio_t *xrd; size_t min; ssize_t cc; int gbrc; int ichk; int ird; ird = ichk = 0; axaio = malloc(naioreqs * sizeof(*axaio)); assert(axaio); bzero(axaio, naioreqs * sizeof(*axaio)); for (rgn = 0; rgn < nsectors; rgn += maxswapsectors) { min = DITS_MIN(nsectors - rgn, maxswapsectors); gbrc = getxaiobuf(min, -1, &xrd, (xaio_t **)0); assert(!gbrc); getrgn(fdPart, xrd->sb, rgn + oseek, xrd->nrgns, &xrd->aio); assert(!axaio[ird % naioreqs]); axaio[ird++ % naioreqs] = xrd; if ((ird % naioreqs) == (ichk % naioreqs)) { struct aiocb *araio[2]; int64_t rgnchk; xaio_t *xchk; xchk = axaio[ichk % naioreqs]; assert(xchk); araio[0] = &xchk->aio; rc = AIO_ERROR(&xchk->aio); if (rc) { if (rc != EINPROGRESS) { perror("aio_error"); exit(1); } rc = AIO_SUSPEND(araio, 1); if (rc) { perror("aio_suspend"); exit(1); } } cc = aio_return(&xchk->aio); if (cc != xchk->aio.aio_nbytes) { eprintf("check: aio_return: cc (%lu) " "!= aio_nbytes (%lu)\n", cc, xchk->aio.aio_nbytes); exit(EX_OSERR); } rgnchk = xchk->aio.aio_offset / sizeof(*xchk->sb); DPRINTF(3, "Checking sectors [%lld - %lld] " "Signature: ", rgnchk + oseek, rgnchk + oseek + xchk->nrgns, signature); chkrgn(stderr, xchk->sb, rgnchk + oseek, xchk->nrgns, rtckbase, d, &signature); DPRINTF(3, "%lX\n", signature); freexaiobuf(xchk, (xaio_t *)0); axaio[ichk++ % naioreqs] = (xaio_t *)0; } } while (ichk < ird) { struct aiocb *araio[1]; xaio_t *xchk; int64_t rgnchk; xchk = axaio[ichk % naioreqs]; assert(xchk); araio[0] = &xchk->aio; rc = AIO_ERROR(&xchk->aio); if (rc) { if (rc != EINPROGRESS) { perror("aio_error"); exit(1); } rc = AIO_SUSPEND(araio, 1); if (rc) { perror("aio_suspend"); exit(1); } } cc = aio_return(&xchk->aio); if (cc != xchk->aio.aio_nbytes) { eprintf("check: aio_return: cc (%lu) " "!= aio_nbytes (%lu)\n", cc, xchk->aio.aio_nbytes); exit(EX_OSERR); } rgnchk = xchk->aio.aio_offset / sizeof(*xchk->sb); DPRINTF(3, "Checking sectors [%lld - %lld] " "Signature: ", rgnchk + 
oseek, rgnchk + oseek + xchk->nrgns, signature); chkrgn(stderr, xchk->sb, rgnchk + oseek, xchk->nrgns, rtckbase, d, &signature); DPRINTF(3, "%lX\n", signature); freexaiobuf(xchk, (xaio_t *)0); axaio[ichk % naioreqs] = (xaio_t *)0; ++ichk; } free(axaio); #endif /* HAVE_AIO */ } else { for (rgn = 0; rgn < nsectors; rgn += maxswapsectors) { size_t min; min = DITS_MIN(nsectors - rgn, maxswapsectors); DPRINTF(3, "Checking sectors [%lld - %lld] " "Signature: ", rgn + oseek, rgn + min + oseek, signature); getrgn(fdPart, sb, rgn + oseek, min, (aiocbptr_t)0); chkrgn(stderr, sb, rgn + oseek, min, rtckbase, d, &signature); DPRINTF(3, "%lX\n", signature); } DPRINTF(2, "\n"); } if (fRepair) { repair(fdPart, d); } /* Check that each record id maps to one and only one sector. */ nerrs = 0; hdr[0] = '\000'; if (fHdrs) { (void)snprintf(hdr, sizeof(hdr), " SECTOR ID TO DUPTO CRC DETAILS\n"); } for (i = oseek; i < nsectors + oseek; ++i) { if (d->x[i] == -1) { eprintf("%s%8lu %8lld %8lld %8lld %08lX M %s\n", hdr, (u_long)i, d->id[i], d->x[i], d->y[i], d->crc[i], d->msg[i] ? d->msg[i] : "Record missing"); ++nerrs; hdr[0] = '\000'; } else if (d->y[i] != -1) { eprintf("%s%8lu %8lld %8lld %8lld %08lX D %s\n", hdr, (u_long)i, d->id[i], d->x[i], d->y[i], d->crc[i], d->msg[i] ? d->msg[i] : "Duplicate record at sector ???"); ++nerrs; hdr[0] = '\000'; } else if ((verbosity > 2) || d->msg[i]) { eprintf("%s%8lu %8lld %8lld %8lld %08lX %s\n", hdr, (u_long)i, d->id[i], d->x[i], d->y[i], d->crc[i], d->msg[i] ? d->msg[i] : ""); hdr[0] = '\000'; } } if (nerrs) { eprintf("Data integrity errors: %d\n", nerrs); exit(EX_DATAERR); } if (fHdrs) { DPRINTF(1, "DEVICE NSECTORS INPLACE " "SIGNATURE\n"); } DPRINTF(1, "%-12s %8lld %8lld (%07.3f%%) %lX\n", partbasename, d->nsectors, d->inplace, (d->inplace * 100.0)/d->nsectors, signature); /* Remove the info file if it exists. 
*/ (void)snprintf(infopath, sizeof(infopath), "%s/%s-info-%s", gtmpdir, progname, argSession); if (-1 != unlink(infopath)) { DPRINTF(2, "Info file %s removed.\n", infopath); } (void)snprintf(lockpath, sizeof(lockpath), "%s/%s-locks-%s", gtmpdir, progname, argSession); if (-1 != unlink(lockpath)) { DPRINTF(2, "Lock file %s removed.\n", lockpath); } if (-1 != fdRTCK) { (void)close(fdRTCK); } defect_free(d); free(sb); (void)close(fdPart); } /* Scan the defect list for duplicates. The sector address of the first * occurrence of an id is in d->x[id], and the second occurrence, if any, * is in d->y[id]. */ int repair(int fdPart, defect_t *d) { int64_t nrepaired; sector_t x, y; int64_t i; nrepaired = 0; for (i = 0; i < d->nsectors; ++i) { if (d->y[i] == -1) { continue; } getrgn(fdPart, &x, d->x[i], (int64_t)1, (aiocbptr_t)0); dumprgn(stderr, &x, d->x[i], (int64_t)1, (rtck_t *)0); getrgn(fdPart, &y, d->y[i], (int64_t)1, (aiocbptr_t)0); dumprgn(stderr, &y, d->y[i], (int64_t)1, (rtck_t *)0); /* If the id's don't match, then these sectors were not * involved in the same interrupted swap transaction. The * implications are that this record has occurred more than * twice, meaning there are serious problems somewhere. */ if (x.rec[0].id != y.rec[0].id) { eprintf("repair: id mismatch: sector/id: " "%lld/%lld %lld/%lld\n", d->x[i], x.rec[0].id, d->y[i], y.rec[0].id); continue; } DPRINTF(2, "repair: repairing sectors %lld %lld\n", d->x[i], d->y[i]); /* Roll back the problem sector. 
*/ if (d->x[i] < d->y[i]) { x.rec[0] = x.rec[1]; bzero(&x.rec[1], sizeof(x.rec[1])); x.rec[1].id = -1; putrgn(fdPart, &x, d->x[i], 1, (aiocbptr_t)0); assert(d->x[x.rec[0].id] == -1); d->x[x.rec[0].id] = d->x[i]; } else { y.rec[0] = y.rec[1]; bzero(&y.rec[1], sizeof(y.rec[1])); y.rec[1].id = -1; putrgn(fdPart, &y, d->y[i], 1, (aiocbptr_t)0); assert(d->x[y.rec[0].id] == -1); d->x[y.rec[0].id] = d->y[i]; } d->y[i] = -1; ++nrepaired; } DPRINTF(1, "repair: repaired %lld sectors\n", nrepaired); return 0; } /* Print detailed information about the records in the specified range. */ void dump(char *partition, off_t start, off_t stop) { char *partbasename; char infopath[128]; char rtckpath[128]; int64_t nsectors; rtck_t *rtckbase; char *src, *dst; sector_t *sb; size_t sbsz; int64_t rgn; int fdRTCK; int fdPart; int64_t i; assert(start >= 0); fdPart = open(partition, O_RDONLY); if (-1 == fdPart) { eprintf("dump: open(%s): %s\n", partition, strerror(errno)); exit(EX_OSERR); } sbsz = maxswapsectors * sizeof(*sb); sb = malloc(sbsz); if (!sb) { eprintf("dump: malloc(%lu) failed\n", sbsz); exit(EX_OSERR); } /* Get the first record which tells how many records are * in the test bed. */ getrgn(fdPart, sb, oseek, (int64_t)1, (aiocbptr_t)0); chkrgn(stdout, sb, oseek, (int64_t)1, (rtck_t *)0, (defect_t *)0, (crc32_t *)0); nsectors = sb->rec[0].nsectors; gctime = sb->rec[0].ctime; DPRINTF(2, "DEVICE: \t%s\n", partition); DPRINTF(2, "NSECTORS:\t%lld\n", nsectors); DPRINTF(2, "OSEEK: \t%lld\n", oseek); DPRINTF(2, "SESSION: \t%s\n", sb->session); if (start >= nsectors) { eprintf("start (%lld) lies outside the test bed " "of %lld sectors\n", start, nsectors); exit(EX_USAGE); } if (stop == 0) { stop = nsectors; } else if (stop < start) { DPRINTF(1, "limiting stop to %lld\n", nsectors); stop = nsectors; } /* Get the base name of the partition. 
*/ partbasename = strrchr(partition, '/'); if (!partbasename) { partbasename = partition; } else { ++partbasename; } /* Use the session name from the testbed if the session * name wasn't specified. */ if (!argSession) { static char session[32]; (void)strncpy(session, sb->session, sizeof(session)); session[sizeof(session) - 1] = '\000'; argSession = session; } #if HAVE_MMAP fdRTCK = -1; rtckbase = (rtck_t *)0; /* Try to open the run time check table. */ while (fRtck && (fdRTCK == -1) && (sb->flags & DITS_FRTCK)) { (void)snprintf(rtckpath, sizeof(rtckpath), "%s/%s-rtck-%s", gtmpdir, progname, argSession); fdRTCK = open(rtckpath, O_RDWR); if (-1 == fdRTCK) { eprintf("The run time check file %s could not be " "opened.\n", rtckpath); eprintf("open(%s): %s\n", rtckpath, strerror(errno)); break; } rtckbase = (rtck_t *) mmap((void *)0, nsectors * sizeof(*rtckbase), PROT_READ|PROT_WRITE, MAP_SHARED, fdRTCK, 0); if (rtckbase == MAP_FAILED) { eprintf("mmap(%s): %s\n", rtckpath, strerror(errno)); exit(EX_OSERR); } DPRINTF(2, "using rtck file: %s\n", rtckpath); } #else #error TODO - this implementation does not support mmap #endif /* HAVE_MMAP */ /* Iteratively read in chunks of the test bed so that chkrgn * can populate the defect table for each chunk. 
*/ for (rgn = start; rgn < stop; rgn += maxswapsectors) { size_t min; min = DITS_MIN(stop - rgn, maxswapsectors); DPRINTF(3, "reading sectors [%lld-%lld]\n", rgn, rgn + min); getrgn(fdPart, sb, rgn, min, (aiocbptr_t)0); dumprgn(stdout, sb, rgn, min, rtckbase); } if (-1 != fdRTCK) { (void)close(fdRTCK); } free(sb); (void)close(fdPart); } void dumprgn(FILE *fp, sector_t *sb, off_t offset, int64_t nrgns, rtck_t *rtckbase) { char msg[256]; int i, j; snprintf(msg, sizeof(msg), " SECTOR REC ID PID CTIME " " MTIME SRC DST LEN CRC\n"); fprintf(fp, "%s", msg); for (i = 0; i < nrgns; ++i, ++sb) { snprintf(msg, sizeof(msg), "%8lld %3d %8lld %5lld %8llX %8llX " "%8lld %8lld %5d %08X\n", offset+i, 0, sb->rec[0].id, sb->rec[0].pid, sb->rec[0].ctime, sb->rec[0].mtime, sb->rec[0].src, sb->rec[0].dst, sb->rec[0].len, sb->crc); fprintf(fp, "%s", msg); if (verbosity > 1) { snprintf(msg, sizeof(msg), "%8lld %3d %8lld %5lld %8llX %8llX " "%8lld %8lld %5d %08X\n", offset+i, 1, sb->rec[1].id, sb->rec[1].pid, sb->rec[1].ctime, sb->rec[1].mtime, sb->rec[1].src, sb->rec[1].dst, sb->rec[1].len, sb->crc); fprintf(fp, "%s", msg); } } } /* Start some number of child processes to do the actual work of * region swapping. */ void teststart(char *partition) { char *partbasename; char lockpath[128]; char infopath[128]; char rtckpath[128]; char *psrc, *pdst; int64_t nsectors; info_t *infobase; rtck_t *rtckbase; struct stat stat; info_t *victim; char *devname; sector_t sb; int fdRTCK; int fdInfo; int fdLock; int fdPart; pid_t pid; int64_t i; int rc; /* Open the data partition (i.e., the testbed). */ fdPart = open(partition, O_RDWR); if (-1 == fdPart) { eprintf("open(%s): %s\n", partition, strerror(errno)); exit(EX_OSERR); } /* Get the base name of the partition. */ partbasename = strrchr(partition, '/'); if (!partbasename) { partbasename = partition; } else { ++partbasename; } /* Get the first record so as to find out the total number * of records involved in the test. 
*/ getrgn(fdPart, &sb, oseek, (int64_t)1, (aiocbptr_t)0); chkrgn(stderr, &sb, oseek, (int64_t)1, (rtck_t *)0, (defect_t *)0, (crc32_t *)0); nsectors = sb.rec[0].nsectors; gctime = sb.rec[0].ctime; DPRINTF(2, "DEVICE: \t%s\n", partition); DPRINTF(2, "NSECTORS:\t%lld\n", nsectors); DPRINTF(2, "OSEEK: \t%lld\n", oseek); DPRINTF(2, "SESSION: \t%s\n", sb.session); DPRINTF(2, "CTIME: \t%s", ctime((time_t *)&gctime)); /* Use the session name from the testbed if the session * name wasn't specified. */ if (!argSession) { static char session[32]; (void)strncpy(session, sb.session, sizeof(session)); session[sizeof(session) - 1] = '\000'; argSession = session; } #if HAVE_MMAP /* Open the run time check table. */ fdRTCK = -1; rtckbase = (rtck_t *)0; /* Try to open the run time check file, but proceed regardless * of whether one is found. */ while (fRtck && (fdRTCK == -1) && (sb.flags & DITS_FRTCK)) { (void)snprintf(rtckpath, sizeof(rtckpath), "%s/%s-rtck-%s", gtmpdir, progname, argSession); fdRTCK = open(rtckpath, O_RDWR); if (-1 == fdRTCK) { eprintf("The primary run time check file %s could " "not be opened.\n", rtckpath); eprintf("open(%s): %s\n", rtckpath, strerror(errno)); break; } rtckbase = (rtck_t *) mmap((void *)0, nsectors * sizeof(*rtckbase), PROT_READ|PROT_WRITE, MAP_SHARED, fdRTCK, 0); if (rtckbase == MAP_FAILED) { eprintf("mmap(%s): %s\n", rtckpath, strerror(errno)); exit(EX_OSERR); } rc = fstat(fdRTCK, &stat); if (rc) { eprintf("fstat(%s): %s\n", rtckpath, strerror(errno)); exit(EX_OSERR); } DPRINTF(2, "Using primary rtck file: %s\n", rtckpath); /* Check that the number of entries in the mapped run time * cross check file agrees with the size of the test bed that * we just determined from the first sector in the test bed. 
*/ if (stat.st_size != (nsectors * sizeof(*rtckbase))) { eprintf("The number of sectors (%lld) as read from " "the first sector in the test bed doesn't " "agree with the number of entries in the run " "time cross check file (%lld)\n", (long long)nsectors, (long long)stat.st_size / sizeof(*rtckbase)); exit(EX_DATAERR); } } #else #error TODO - this implementation does not support mmap #endif /* HAVE_MMAP */ /* Create the lock file. We use a lock file rather than the * partition because file locking doesn't work on device special * files. Each byte in the lock file represents one sector in * the partition file. */ (void)snprintf(lockpath, sizeof(lockpath), "%s/%s-locks-%s", gtmpdir, progname, argSession); fdLock = open(lockpath, O_CREAT|O_EXCL|O_RDWR, 0600); if (-1 == fdLock) { eprintf("Test bed is dirty. Run `%s -cRv' to repair.\n", lockpath, progname); eprintf("open(%s): %s\n", lockpath, strerror(errno)); exit(EX_OSERR); } if (-1 == ftruncate(fdLock, nsectors)) { eprintf("ftruncate(%s): %s\n", lockpath, strerror(errno)); exit(EX_OSERR); } (void)close(fdLock); DPRINTF(2, "Using lock file: %s\n", lockpath); /* The info file name is constructed from the base name of the * given partition. Existence of this file means that the program * terminated abnormally and the test bed should be checked. */ (void)snprintf(infopath, sizeof(infopath), "%s/%s-info-%s", gtmpdir, progname, argSession); /* The parent removes the lock info file on normal termination. * If the file exists, it means that something bad happened * during the test phase, e.g., the machine crashed or the * parent process died. In any event, the file needs to be * examined and all in progress transactions rolled back. */ if (0 == access(infopath, R_OK|W_OK)) { eprintf("access(%s): %s\n", infopath, strerror(errno)); eprintf("Test bed is dirty. 
Run `%s -cRv %s' to repair.\n", progname, partition); exit(EX_DATAERR); } /* Create the shared info file, which is a table of range locks * currently held by each child processes. */ fdInfo = open(infopath, O_CREAT|O_EXCL|O_RDWR, 0600); if (-1 == fdInfo) { eprintf("open(%s): %s", infopath, strerror(errno)); exit(EX_OSERR); } if (-1 == ftruncate(fdInfo, (nprocs + 1) * sizeof(*infobase))) { eprintf("ftruncate(%s): %s\n", infopath, strerror(errno)); exit(EX_OSERR); } DPRINTF(2, "Using info file: %s\n", infopath); #if HAVE_MMAP infobase = (info_t *)mmap((void *)0, (nprocs + 1) * sizeof(*infobase), PROT_READ|PROT_WRITE, MAP_SHARED, fdInfo, 0); if (infobase == MAP_FAILED) { eprintf("mmap(%s): %s", infopath, strerror(errno)); exit(EX_OSERR); } bzero((char *)infobase, (nprocs + 1) * sizeof(*infobase)); infobase[nprocs].pid = -1; /* End of table sentinel */ victim = infobase; #else #error TODO - this implementation does not support mmap #endif /* HAVE_MMAP */ #if HAVE_ALARM (void)signal(SIGHUP, sigHandler); (void)signal(SIGINT, sigHandler); (void)signal(SIGPIPE, sigHandler); (void)signal(SIGXCPU, sigHandler); (void)signal(SIGXFSZ, sigHandler); (void)signal(SIGVTALRM, sigHandler); (void)signal(SIGPROF, sigHandler); (void)signal(SIGUSR1, sigHandler); (void)signal(SIGUSR2, sigHandler); (void)signal(SIGTERM, sigHandler); #if defined(AIX4) || defined(AIX5) (void)signal(SIGDANGER, sigHandler); #endif #else #error TODO - this implementation does not support alarm #endif /* HAVE_ALARM */ rc = setpgid(0, getpid()); if (rc) { eprintf("teststart: setpgrp: %s\n", strerror(errno)); } spawn: while (victim->pid == 0) { pid = fork(); switch (pid) { case -1: eprintf("fork: %s\n", strerror(errno)); sleep(3); continue; case 0: victim->pid = getpid(); /* Each child needs his own private file offsets. 
*/ (void)close(fdPart); fdPart = open(partition, O_RDWR); if (-1 == fdPart) { eprintf("open(%s): %s\n", partition, strerror(errno)); _exit(EX_OSERR); } fdLock = open(lockpath, O_RDWR); if (-1 == fdLock) { eprintf("open(%s): %s\n", lockpath, strerror(errno)); _exit(EX_OSERR); } if (fAIO) { #if HAVE_AIO testaio(infobase, rtckbase, fdInfo, fdLock, fdPart, nsectors); #endif /* HAVE_AIO */ _exit(0); } test(infobase, rtckbase, fdInfo, fdLock, fdPart, nsectors); _exit(0); default: DPRINTF(3, "teststart: child %d started...\n", pid); ++victim; break; } } if (maxiterations > 0) { DPRINTF(1, "%-6s %7s %7s %8s %9s %10s %10s\n", "PID", "USRTIME", "SYSTIME", "NSWAPS", "SWAPS/SEC", "NBLKSWR", "BLKSWR/SEC"); } while (1) { unsigned int status; if (fSig) { rc = killpg(0, SIGTERM); if (rc) { eprintf("teststart: killpg(0, SIGTERM): %s\n", strerror(errno)); } fSig = 0; } pid = wait((int *)&status); if (-1 == pid) { if (EINTR == errno) { DPRINTF(3, "teststart: wait interrupted\n", pid); DPRINTF(1, "\n%-6s %7s %7s %8s %9s %10s %10s\n", "PID", "USRTIME", "SYSTIME", "NSWAPS", "SWAPS/SEC", "NBLKSWR", "BLKSWR/SEC"); continue; } else if (ECHILD == errno) { break; } eprintf("teststart: wait: %s\n", strerror(errno)); exit(EX_OSERR); } /* Find this child's entry in the info table. */ for (victim = infobase; victim->pid != pid; ++victim) { /* Do nothing */ } assert(victim->pid == pid); if (WIFSTOPPED(status)) { DPRINTF(0, "teststart: child %d stopped(%d)\n", pid, WSTOPSIG(status)); } else if (WIFEXITED(status) && WEXITSTATUS(status)) { DPRINTF(0, "teststart: child %d exited(%d)\n", pid, WEXITSTATUS(status)); } else if (WIFSIGNALED(status)) { DPRINTF(0, "teststart: child %d signaled(%d)\n", pid, WTERMSIG(status)); } /* It may be necessary to roll back any changes if * the child terminated abnomally. 
*/ if ((WIFEXITED(status) && WEXITSTATUS(status)) || WIFSIGNALED(status)) { cleanlocks(victim, fdInfo, fdPart, nsectors); victim->pid = 0; goto spawn; } victim->pid = 0; } DPRINTF(2, "Removing info file: %s\n", infopath); (void)unlink(infopath); DPRINTF(2, "Removing lock file: %s\n", lockpath); (void)unlink(lockpath); } int dupsession(char *old, char *new) { char ortckpath[128]; char nrtckpath[128]; struct stat stat; int fdOld, fdNew; rtck_t *ortck; rtck_t *nrtck; ssize_t cc; int rc; #if HAVE_MMAP (void)snprintf(ortckpath, sizeof(ortckpath), "%s/%s-rtck-%s", gtmpdir, progname, old); DPRINTF(2, "Opening old rtck file: %s\n", ortckpath); fdOld = open(ortckpath, O_RDWR); if (-1 == fdOld) { exit(EX_DATAERR); } rc = fstat(fdOld, &stat); if (rc) { eprintf("fstat(%s): %s\n", ortckpath, strerror(errno)); exit(EX_OSERR); } ortck = (rtck_t *)mmap((void *)0, stat.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fdOld, 0); if (ortck == MAP_FAILED) { eprintf("mmap(%s): %s\n", old, strerror(errno)); exit(EX_OSERR); } (void)snprintf(nrtckpath, sizeof(nrtckpath), "%s/%s-rtck-%s", gtmpdir, progname, new); DPRINTF(2, "Creating new rtck file: %s\n", nrtckpath); fdNew = open(nrtckpath, O_CREAT|O_TRUNC|O_RDWR, 0600); if (-1 == fdNew) { eprintf("open(%s): %s\n", nrtckpath, strerror(errno)); return; } if (-1 == ftruncate(fdNew, stat.st_size)) { eprintf("ftruncate(%s): %s\n", nrtckpath, strerror(errno)); exit(EX_OSERR); } nrtck = (rtck_t *)mmap((void *)0, stat.st_size, PROT_READ|PROT_WRITE, MAP_SHARED, fdNew, 0); if (nrtck == MAP_FAILED) { eprintf("mmap(%s): %s\n", new, strerror(errno)); exit(EX_OSERR); } DPRINTF(2, "Copying old rtck file (%s) " "to new (%s)...\n", ortckpath, nrtckpath); bcopy(ortck, nrtck, stat.st_size); (void)close(fdOld); (void)close(fdNew); #endif /* HAVE_MMAP */ return 0; } /* Lock the info file and scan all the lock ranges for conflicts. Conflicts * arise due to locks stranded by children who suffered traumatic deaths * (e.g., kill -9). 
The parent process will eventually clean up the info * table and rollback any half baked swaps. */ int itlock(int fdInfo, info_t *infobase, pid_t pid, struct flock *sw, struct flock *dw) { struct flock *sh, *dh; /* src held, dst held */ struct flock lk; info_t *me; int rc; /* Lock the entire info file (i.e., the shared memory file). */ lk.l_start = 0; lk.l_len = 0; lk.l_pid = 0; lk.l_whence = SEEK_SET; lk.l_type = F_WRLCK; if (-1 == fcntl(fdInfo, F_SETLKW, &lk)) { eprintf("chklocks(%d): fcntl(%d, F_SETLK): %s\n", pid, fdInfo, strerror(errno)); exit(EX_OSERR); } /* Check for conflicts. Conflicts arise due to child processes * that exited (abnormally) without releasing their locked ranges. */ rc = EBUSY; for (me = 0; infobase->pid != -1; ++infobase) { if (infobase->pid == pid) { me = infobase; continue; } else if (!infobase->locked) { continue; } sh = &infobase->src; dh = &infobase->dst; if (!((sw->l_start + sw->l_len <= sh->l_start) || (sw->l_start >= sh->l_start + sh->l_len))) { goto release; } if (!((sw->l_start + sw->l_len <= dh->l_start) || (sw->l_start >= dh->l_start + dh->l_len))) { goto release; } if (!((dw->l_start + dw->l_len <= sh->l_start) || (dw->l_start >= sh->l_start + sh->l_len))) { goto release; } if (!((dw->l_start + dw->l_len <= dh->l_start) || (dw->l_start >= dh->l_start + dh->l_len))) { goto release; } } /* Update the lock ranges and set "locked" to one, thereby * reserving these ranges even if the fcntl locks are lost * due to the child exiting. */ assert(me && !me->locked); me->src = *sw; me->dst = *dw; me->locked = 1; rc = 0; release: lk.l_type = F_UNLCK; if (-1 == fcntl(fdInfo, F_SETLKW, &lk)) { eprintf("chklocks(%d): fcntl(F_UNLCK): %s\n", pid, strerror(errno)); exit(EX_OSERR); } return rc; } void itunlock(int fdInfo, info_t *me) { struct flock lk; assert(me); lk.l_start = 0; lk.l_len = 0; lk.l_pid = 0; lk.l_whence = SEEK_SET; /* Release info table record lock. 
*/ lk.l_type = F_WRLCK; if (-1 == fcntl(fdInfo, F_SETLKW, &lk)) { eprintf("test(%d): fcntl(info, F_WRLCK): %s\n", getpid(), strerror(errno)); exit(EX_OSERR); } assert(me->locked); me->locked = 0; lk.l_type = F_UNLCK; if (-1 == fcntl(fdInfo, F_SETLKW, &lk)) { eprintf("test(%d): fcntl(info, F_UNLCK): %s\n", getpid(), strerror(errno)); exit(EX_OSERR); } } /* cleanlocks is called to clean up stale locks left by a process * that exited abnormally. If the process died while holding locks, * then me->locked will be `true' and we have to examine the disk * regions covered by the child's range locks (src and dst), and * roll back records involved in an interrupted swap (if any). */ int cleanlocks(info_t *me, int fdInfo, int fdPart, int64_t nsectors) { sector_t *sb; defect_t *d; if (!me->locked) { return 0; } if ((me->src.l_len == 0) || (me->dst.l_len == 0)) { DPRINTF(1, "cleanlocks: len = 0 %ld %ld\n", me->src.l_len, me->dst.l_len); me->locked = 0; return 0; } assert(me->src.l_len == me->dst.l_len); /* At this point we know the child had some range locks, * so we need to see if there are any records that need * to be rolled back. */ DPRINTF(1, "cleanlocks: child %d has range locks...\n", me->pid); DPRINTF(2, "cleanlocks: src: l_start=%lld l_len=%d\n", (long long)me->src.l_start, me->src.l_len); DPRINTF(2, "cleanlocks: dst: l_start=%lld l_len=%d\n", (long long)me->dst.l_start, me->dst.l_len); sb = malloc(maxswapsectors * sizeof(*sb)); if (!sb) { eprintf("cleanlocks: out of memory\n"); return 0; } d = defect_alloc(nsectors + oseek); if (!d) { eprintf("cleanlocks: out of memory\n"); free(sb); return 0; } /* Get the regions involved in the swap. 
*/ getrgn(fdPart, sb, me->src.l_start, me->src.l_len, (aiocbptr_t)0); chkrgn(stderr, sb, me->src.l_start, me->src.l_len, (rtck_t *)0, d, (crc32_t *)0); getrgn(fdPart, sb, me->dst.l_start, me->dst.l_len, (aiocbptr_t)0); chkrgn(stderr, sb, me->dst.l_start, me->dst.l_len, (rtck_t *)0, d, (crc32_t *)0); repair(fdPart, d); me->locked = 0; free(sb); defect_free(d); DPRINTF(1, "cleanlocks: child %d cleaned\n", me->pid); return 0; } /* Continuously select two arbitrary contiguous regions and swap them. */ void test(info_t *infobase, rtck_t *rtckbase, int fdInfo, int fdLock, int fdPart, int64_t nsectors) { time_t starttime, now; struct flock src, dst; sector_t *sb1, *sb2; int64_t nblkswr; int64_t tmp64; long nswaps; long utime; long stime; size_t len; info_t *me; pid_t pid; pid = getpid(); /* Limit maxswapsectors so that the workers have * a fighting chance to run without complete deadlock. */ tmp64 = nsectors / ((nprocs * 2) + 1); if (maxswapsectors > tmp64) { maxswapsectors = tmp64; DPRINTF(1, "test(%d): limiting maxswapsectors " "to %lld\n", pid, tmp64); if (maxswapsectors < 1) { eprintf("test(%d): the test bed needs to contain " "at least %d sectors given %d worker " "processes\n", pid, (nprocs * 2) + 1, nprocs); sleep(3); exit(EX_USAGE); } if (minswapsectors > maxswapsectors) { tmp64 /= 2; DPRINTF(1, "test(%d): limiting minswapsectors " "to %lld\n", pid, tmp64); minswapsectors = tmp64; } } DPRINTF(3, "test(%d) nsectors=%lld oseek=%lld\n", pid, nsectors, oseek); DPRINTF(3, "test(%d): minswapsectors=%lld maxswapsectors=%lld\n", pid, minswapsectors, maxswapsectors); /* Add pid to the seed so that the child processes * don't all generate the same sequence. */ starttime = time((time_t *)0); nswaps = 0; /* Allocate two buffers into which to hold the data to be swapped. 
*/ sb1 = malloc(maxswapsectors * sizeof(*sb1)); sb2 = malloc(maxswapsectors * sizeof(*sb2)); if (!(sb1 && sb2)) { eprintf("test(%d): insufficient memory for %lld swapsector " "buffers\n", pid, maxswapsectors); exit(EX_OSERR); } bzero(sb1, sizeof(*sb1) * maxswapsectors); bzero(sb2, sizeof(*sb2) * maxswapsectors); if ((void(*)(int))-1 == signal(SIGALRM, sigAlarm)) { eprintf("test(%d): signal: %s\n", pid, strerror(errno)); exit(EX_OSERR); } /* Find my entry in the info table. */ for (me = infobase; me->pid != pid; ++me) { /* Do nothing */ } assert(me->pid == pid); nblkswr = 0; while (!fSig) { len = (random() % (maxswapsectors - minswapsectors + 1)) + minswapsectors; /* Get write locks. */ src.l_start = (random() % (nsectors - len - oseek + 1)) + oseek; src.l_len = len; src.l_pid = 0; src.l_type = F_WRLCK; src.l_whence = SEEK_SET; /* Set a watchdog so as to break deadlocks. */ fSigAlarm = 0; alarm(30); /* Try for an exclusive lock over the src region, * blocking if necessary (F_SETLKW vs F_SETLK). */ if (-1 == fcntl(fdLock, F_SETLKW, &src)) { alarm(0); if (EINTR == errno && fSig) { continue; } else if (EINTR == errno && fSigAlarm) { eprintf("test(%d): fcntl: deadlock?\n", pid); continue; } eprintf("test(%d) fcntl(srclock, F_WRLCK): %s\n", pid, strerror(errno)); exit(EX_OSERR); } alarm(0); /* Try for an exclusive lock over the dst region, * but do not block (F_SETLK). */ while (1) { dst.l_start = (random() % (nsectors - len - oseek+1)) + oseek; dst.l_len = len; dst.l_pid = 0; dst.l_type = F_WRLCK; dst.l_whence = SEEK_SET; /* If the dst region overlaps the src region * then reselect the dst region. 
*/ if (!((dst.l_start > src.l_start + len) || (dst.l_start + len < src.l_start))) { continue; } if (-1 == fcntl(fdLock, F_SETLK, &dst)) { if (EAGAIN == errno) { DPRINTF(4, "test(%d): [%lld %lld %d] " "busy\n", pid, (long long)src.l_start, (long long)dst.l_start, len); continue; } else if (EINTR == errno) { continue; #if defined(AIX4) || defined(AIX5) } else if (EACCES == errno) { continue; #endif } eprintf("test(%d): fcntl(dstlock, F_WRLCK): " "%s start=%lld len=%d\n", pid, strerror(errno), (long long)dst.l_start, len); exit(EX_OSERR); } break; } if (src.l_start < 0 || dst.l_start < 0) { eprintf("test(%d): huh?", pid); exit(69); } /* Acquire info table lock. */ if (0 != itlock(fdInfo, infobase, pid, &src, &dst)) { DPRINTF(1, "test(%d): stranded lock detected\n", pid); goto release; } DPRINTF(3, "test(%d): swapping [%lld %lld %d]\n", pid, (long long)src.l_start, (long long)dst.l_start, len); /* Read and check the data from the selected regions. */ getrgn(fdPart, sb1, src.l_start, len, (aiocbptr_t)0); chkrgn(stderr, sb1, src.l_start, len, rtckbase, (defect_t *)0, (crc32_t *)0); getrgn(fdPart, sb2, dst.l_start, len, (aiocbptr_t)0); chkrgn(stderr, sb2, dst.l_start, len, rtckbase, (defect_t *)0, (crc32_t *)0); /* Save the rollback records. */ saverollback(sb1, sb2, len); /* Update all records. */ now = time((time_t *)0); updatergn(sb1, src.l_start, dst.l_start, len, pid, now, rtckbase); updatergn(sb2, dst.l_start, src.l_start, len, pid, now, rtckbase); /* Swap and write the regions. Always write the range * with the lowered disk block address first to make * rollback easier. */ if (src.l_start < dst.l_start) { putrgn(fdPart, sb2, src.l_start, len, (aiocbptr_t)0); putrgn(fdPart, sb1, dst.l_start, len, (aiocbptr_t)0); } else { putrgn(fdPart, sb1, dst.l_start, len, (aiocbptr_t)0); putrgn(fdPart, sb2, src.l_start, len, (aiocbptr_t)0); } /* Release info table record lock. */ itunlock(fdInfo, me); /* Release file range locks. 
*/ release: dst.l_type = F_UNLCK; if (-1 == fcntl(fdLock, F_SETLKW, &dst)) { eprintf("test(%d): fcntl(dstlock, F_UNLCK): %s\n", pid, strerror(errno)); exit(EX_OSERR); } src.l_type = F_UNLCK; if (-1 == fcntl(fdLock, F_SETLKW, &src)) { eprintf("test(%d): fcntl(srclock, F_UNLCK): %s\n", pid, strerror(errno)); exit(EX_OSERR); } ++nswaps; if ((maxiterations > 0) && (nswaps >= maxiterations)) { break; } /* Add the number of blocks just written to the running * total (nblkswr). If the number of blocks written per * second thus far (nblkswrps) exceeds the maximum desired * number of blocks written per second (maxblksps), then * sleep long enough to try and keep the rates as * close as possible. */ nblkswr += len * 2; if ((maxblksps > 0) && (now > starttime)) { static int64_t nblkswr_saved = 0; static u_long delay = 0; if (nblkswr_saved < nblkswr) { int64_t nblkswrps; u_long adjust; nblkswrps = nblkswr / (now - starttime); nblkswr_saved = nblkswr; adjust = nblkswrps * 100000 / maxblksps; DPRINTF(3, "(%d): nblkswrps=%lld nblkswr=%lld " "delay=%lu\n", pid, nblkswrps, nblkswr, delay); if (nblkswrps > maxblksps) { delay += adjust; usleep(delay); } else if (delay > adjust) { delay -= adjust; } } } } getustime(&utime, &stime); now = time((time_t *)0); DPRINTF(1, "%-6d %7ld %7ld %8ld %9.1f %10lld %10.1f\n", pid, stime, utime, nswaps, (float)nswaps / (now - starttime), nblkswr, (float)nblkswr / (now - starttime)); free(sb1); free(sb2); } #if HAVE_AIO void testaio(info_t *infobase, rtck_t *rtckbase, int fdInfo, int fdLock, int fdPart, int64_t nsectors) { time_t starttime, now; struct aiocb **arwaio; int maxtries = 8; int64_t nblkswr; xaio_t **axaio; int64_t tmp64; int64_t nrgns; long nswaps; long utime; long stime; int rwidx; pid_t pid; int nrw; pid = getpid(); starttime = time((time_t *)0); nswaps = 0; /* Limit maxswapsectors so that the workers have * a fighting chance to run without complete deadlock. 
*/ tmp64 = nsectors / ((naioreqs * 2) + 1); if (maxswapsectors > tmp64) { maxswapsectors = tmp64; DPRINTF(1, "testaio(%d): limiting maxswapsectors " "to %lld\n", pid, tmp64); if (maxswapsectors < 1) { eprintf("testaio(%d): the test bed needs to contain " "at least %d sectors given %d AIO requests\n", pid, (naioreqs * 2) + 1, naioreqs); sleep(3); exit(EX_USAGE); } if (minswapsectors > maxswapsectors) { tmp64 /= 2; DPRINTF(1, "testaio(%d): limiting minswapsectors " "to %lld\n", pid, tmp64); minswapsectors = tmp64; } } DPRINTF(3, "testaio(%d) nsectors=%lld oseek=%lld\n", pid, nsectors, oseek); DPRINTF(3, "testaio(%d): minswapsectors=%lld maxswapsectors=%lld\n", pid, minswapsectors, maxswapsectors); /* Allocate the aiocb pointer array. This array holds * pointers to aiocbs that are in flight. */ arwaio = malloc(naioreqs * sizeof(*arwaio)); if (!arwaio) { eprintf("testaio(%d): insufficient memory for %d " "aio requests\n", pid, naioreqs); exit(EX_OSERR); } bzero(arwaio, naioreqs * sizeof(*arwaio)); /* Allocate the xaio pointer array. This array hold * pointers to xaios for which there is an active * swap in progress. */ axaio = malloc(naioreqs * sizeof(*axaio)); if (!axaio) { eprintf("testaio(%d): insufficient memory for %d " "aio requests\n", pid, naioreqs); exit(EX_OSERR); } bzero(axaio, naioreqs * sizeof(*axaio)); if ((void(*)(int))-1 == signal(SIGALRM, sigAlarm)) { eprintf("testaio(%d): signal: %s\n", pid, strerror(errno)); exit(EX_OSERR); } nrw = 0; rwidx = 0; nblkswr = 0; while (1) { xaio_t *x1, *x2; int tries; int gbrc; int rc; int i; nrgns = (random() % (maxswapsectors - minswapsectors + 1)) + minswapsectors; /* Create two xaio control buffers, one for * each region that we are going to swap. */ assert(nrgns <= maxswapsectors); gbrc = getxaiobuf(nrgns, fdLock, &x1, &x2); if (gbrc) { goto reaprw; } x1->read = 1; x2->read = 1; /* Try for an exclusive lock over the first region, * but do not block (F_SETLK). 
*/ x1->idx = getidx(naioreqs, axaio, &rwidx); for (tries = 0; tries < maxtries; ++tries) { x1->lk.l_start = random() % (nsectors - nrgns - oseek+1) + oseek; x1->lk.l_len = nrgns; x1->lk.l_pid = 0; x1->lk.l_type = F_WRLCK; x1->lk.l_whence = SEEK_SET; /* Check that this lock range doesn't overlap * any other request that is in progress. * * TODO: This code spins if naioreqs is too * large for the testbed. */ if (chkoverlap(naioreqs, axaio, x1)) { DPRINTF(3, "testaio(%d): overlapping " "lock rgns start=%lld\n", pid, (long long)x1->lk.l_start); continue; } /* Get an exclusive range lock to prevent * concurrent access to this region by any * other process working in this test bed. */ if (-1 == fcntl(fdLock, F_SETLK, &x1->lk)) { DPRINTF(4, "testaio(%d): fcntl(F_SETLK): %s\n", pid, strerror(errno)); sleep(1); if (EAGAIN == errno) { continue; } else if (EINTR == errno) { continue; #if defined(AIX4) || defined(AIX5) } else if (EACCES == errno) { continue; #endif } eprintf("testaio(%d): fcntl(F_SETLK): %s\n", pid, strerror(errno)); exit(EX_OSERR); } break; } if (tries < maxtries) { axaio[x1->idx] = x1; arwaio[x1->idx] = &x1->aio; } else { DPRINTF(3, "testaio(%d): couldn't lock first " "region\n", pid); x1->read = 0; x2->read = 0; freexaiobuf(x1, x2); goto reaprw; } /* Try for an exclusive lock over the second region, * but do not block (F_SETLK). */ x2->idx = getidx(naioreqs, axaio, &rwidx); for (tries = 0; tries < maxtries; ++tries) { x2->lk.l_start = random() % (nsectors - nrgns - oseek+1) + oseek; x2->lk.l_len = nrgns; x2->lk.l_pid = 0; x2->lk.l_type = F_WRLCK; x2->lk.l_whence = SEEK_SET; /* Check that this lock range doesn't overlap * any other request that is in progress. * * TODO: This code spins if naioreqs is too * large for the testbed. 
*/ if (chkoverlap(naioreqs, axaio, x2)) { DPRINTF(3, "testaio(%d): overlapping " "lock rgns start=%lld\n", pid, (long long)x2->lk.l_start); continue; } /* Get an exclusive range lock to prevent * concurrent access to this region by any * other process working in this test bed. */ if (-1 == fcntl(fdLock, F_SETLK, &x2->lk)) { DPRINTF(4, "testaio(%d): F_SETLK: %s\n", pid, strerror(errno)); sleep(1); if (EAGAIN == errno) { continue; } else if (EINTR == errno) { continue; #if defined(AIX4) || defined(AIX5) } else if (EACCES == errno) { continue; #endif } eprintf("testaio(%d): fcntl(F_WRLCK): " "%s start=%lld nrgns=%lld\n", pid, strerror(errno), (long long)x1->lk.l_start, nrgns); exit(EX_OSERR); } break; } if (tries < maxtries) { axaio[x2->idx] = x2; arwaio[x2->idx] = &x2->aio; DPRINTF(3, "testaio(%d): aio read [%lld %lld %lu]\n", pid, (long long)x1->lk.l_start, (long long)x2->lk.l_start, x1->nrgns); /* Read the data from the selected regions. */ getrgn(fdPart, x1->sb, x1->lk.l_start, x1->nrgns, &x1->aio); getrgn(fdPart, x2->sb, x2->lk.l_start, x2->nrgns, &x2->aio); nrw += 2; } else { x1->lk.l_type = F_UNLCK; if (-1 == fcntl(fdLock, F_SETLKW, &x1->lk)) { eprintf("testaio(%d): fcntl(F_UNLCK): " "%s\n", pid, strerror(errno)); exit(EX_OSERR); } DPRINTF(3, "testaio(%d): couldn't lock second " "region\n", pid); x1->read = 0; x2->read = 0; axaio[x1->idx] = 0; arwaio[x1->idx] = 0; freexaiobuf(x1, x2); goto reaprw; } reaprw: /* Here we scan the array of requests to reap * read requests that have completed. 
*/ for (i = 0; i < naioreqs; ++i) { ssize_t cc; x1 = axaio[i]; if (!x1 || !x1->read || !arwaio[i]) { continue; } assert(!x1->done); rc = AIO_ERROR(&x1->aio); if (rc) { if (rc != EINPROGRESS) { eprintf("testaio(%d): aio_error: %s\n", pid, strerror(rc)); exit(EX_OSERR); } continue; } cc = aio_return(&x1->aio); if (cc != x1->aio.aio_nbytes) { eprintf("testaio(%d): aio_return: cc (%lu) " "!= aio_nbytes (%lu)\n", pid, cc, x1->aio.aio_nbytes); exit(EX_OSERR); } x1->done = 1; x1->read = 0; x2 = x1->sibling; arwaio[i] = 0; DPRINTF(3, "testaio(%d): read done [%lld %lld %lu] " "cc=%ld\n", pid, (long long)x1->lk.l_start, (long long)x2->lk.l_start, x1->nrgns, cc); chkrgn(stderr, x1->sb, x1->lk.l_start, x1->nrgns, rtckbase, (defect_t *)0, (crc32_t *)0); /* If the sibling is done, then both reads have * completed. We can now initiate the write/swap * phase of the transaction. */ if (x2->done) { x1->done = x2->done = 0; /* Save the rollback records. */ saverollback(x1->sb, x2->sb, x1->nrgns); arwaio[x1->idx] = &x1->aio; arwaio[x2->idx] = &x2->aio; DPRINTF(3, "testaio(%d): aio write " "[%lld %lld %lu]\n", pid, (long long)x1->lk.l_start, (long long)x2->lk.l_start, x1->nrgns); now = time((time_t *)0); updatergn(x1->sb, x1->lk.l_start, x2->lk.l_start, x1->nrgns, pid, now, rtckbase); updatergn(x2->sb, x2->lk.l_start, x1->lk.l_start, x2->nrgns, pid, now, rtckbase); putrgn(fdPart, x1->sb, x2->lk.l_start, x1->nrgns, &x1->aio); putrgn(fdPart, x2->sb, x1->lk.l_start, x2->nrgns, &x2->aio); nblkswr += x1->nrgns + x2->nrgns; } } /* Here we scan the array of requests to reap * write requests that have completed. 
*/ for (i = 0; i < naioreqs; ++i) { ssize_t cc; x1 = axaio[i]; if (!x1 || x1->read || !arwaio[i]) { continue; } assert(!x1->done); if (fExec) { rc = AIO_ERROR(&x1->aio); if (rc) { if (rc != EINPROGRESS) { eprintf("testaio(%d): " "aio_error: %s\n", pid, strerror(rc)); exit(EX_OSERR); } continue; } cc = aio_return(&x1->aio); if (cc != x1->aio.aio_nbytes) { eprintf("testaio(%d): aio_return: cc " "(%lu) != aio_nbytes (%lu)\n", pid, cc, x1->aio.aio_nbytes); exit(EX_OSERR); } } x1->done = 1; x2 = x1->sibling; arwaio[i] = 0; DPRINTF(3, "testaio(%d): write done [%lld %lld %lu] " "cc=%lu\n", pid, (long long)x1->lk.l_start, (long long)x2->lk.l_start, x1->nrgns, cc); /* Release range lock. */ x1->lk.l_type = F_UNLCK; if (-1 == fcntl(fdLock, F_SETLKW, &x1->lk)) { eprintf("testaio(%d): fcntl(F_UNLCK): " "%s\n", pid, strerror(errno)); exit(EX_OSERR); } /* If the sibling is done, then both writes * have completed and the swap is complete. */ if (x2->done) { axaio[x1->idx] = 0; axaio[x2->idx] = 0; freexaiobuf(x1, x2); nrw -= 2; ++nswaps; if ((maxiterations > 0) && (--maxiterations == 0)) { ++fSig; } } } /* If we caught a signal to terminate, then we must * wait for all the in flight I/O to finish. */ if (fSig) { static int gavemsg = 0; if (!nrw) { break; } else if (!gavemsg) { DPRINTF(3, "testaio(%d): flushing %d " "buffers...\n", pid, nrw); fflush(stdout); gavemsg = 1; } } /* Here we must stay in the reaper loop until space * frees up in the aio request tracking table (axio[]). */ if ((naioreqs - nrw < 2) || gbrc || fSig) { DPRINTF(3, "testaio(%d): nrw=%d gbrc=%d fsig=%d\n", pid, nrw, gbrc, fSig); if (nrw > 0) { rc = AIO_SUSPEND(arwaio, naioreqs); if (rc == -1) { DPRINTF(4, "testaio(%d): aio_suspend: " "%s nrw=%d\n", pid, strerror(errno), nrw); sleep(1); } } goto reaprw; } /* Add the number of blocks just written to the running * total (nblkswr). 
If the number of blocks written per * second thus far (nblkswrps) exceeds the maximum desired * number of blocks written per second (maxblksps), then * sleep long enough to try and keep the rates as * close as possible. */ if ((maxblksps > 0) && (now > starttime)) { static int64_t nblkswr_saved = 0; static u_long delay = 0; if (nblkswr_saved < nblkswr) { int64_t nblkswrps; u_long adjust; nblkswrps = nblkswr / (now - starttime); nblkswr_saved = nblkswr; adjust = nblkswrps * 100000 / maxblksps; DPRINTF(3, "(%d): nblkswrps=%ld nblkswr=%lld " "delay=%lu\n", pid, nblkswrps, nblkswr, delay); if (nblkswrps > maxblksps) { delay += adjust; usleep(delay); } else if (delay > adjust) { delay -= adjust; } } } } free(arwaio); free(axaio); getustime(&utime, &stime); now = time((time_t *)0); DPRINTF(1, "%-6d %7ld %7ld %8ld %9.1f %10lld %10.1f\n", pid, utime, stime, nswaps, (float)nswaps / (now - starttime), nblkswr, (float)nblkswr / (now - starttime)); } #endif /* HAVE_AIO */ /* Update region prepares each record in the given range * to be written to disk. */ void updatergn(sector_t *sb, off_t src, off_t dst, size_t len, pid_t pid, time_t curtime, rtck_t *rtckbase) { long randupd; int i; randupd = random(); for (i = 0; i < len; ++i, ++sb) { record_t *r = &sb->rec[0]; r->pid = pid; r->mtime = curtime; r->src = src + i; r->dst = dst + i; r->len = len; ++r->nswaps; /* Randomly updated number for increasing the probability * that the record is different from its predecessor. */ sb->randupd = randupd; sb->kscratch = 0; sb->crc = 0; sb->crc = crc32((u_char *)sb, sizeof(*sb), CRC32_PRELOAD); if (rtckbase && fExec) { rtckbase[dst + i].id = r->id; rtckbase[dst + i].crc = sb->crc; } } } /* For each record in the given range, check region calculates * the crc and compares it to the stored crc in effort to * validate that the record has not been corrupted. 
*/ void chkrgn(FILE *fp, sector_t *sb, off_t offset, size_t len, rtck_t *rtckbase, defect_t *d, crc32_t *signature) { int i; /* Check the integrity of the data read. */ for (i = 0; i < len; ++i, ++sb) { record_t *r = &sb->rec[0]; char msg[1024]; crc32_t crc; /* The kernel scratch area is used for in-kernel * validation (sequence number validation, etc, ... * We must set it to zero before computing the crc. */ sb->kscratch = 0; /* If the testbed is corrupted, then we want to kill off all * the test processes rather than let the parent try to fix * things. The base defect list pointer `d' is usually null * during test mode and valid during check mode. We use * that to decide whether we should abort or continue. */ crc = sb->crc; sb->crc = 0; sb->crc = crc32((u_char *)sb, sizeof(*sb), CRC32_PRELOAD); if (crc != sb->crc) { (void)snprintf(msg, sizeof(msg), "Sector %lld corrupted: Expected CRC " "of %lX but got %lX", offset + i, sb->crc, crc); if (d) { d->msg[offset + i] = strdup(msg); continue; } fprintf(fp, msg); fprintf(fp, "\n"); dumprgn(fp, sb, 0, 1, rtckbase); (void)killpg(0, SIGTERM); exit(EX_DATAERR); } if (rtckbase) { msg[0] = '\000'; if (rtckbase[offset + i].id != r->id) { snprintf(msg, sizeof(msg), "Sector %lld: Expected ID of %lld " "but got %lld", offset + i, rtckbase[offset + i].id, r->id); } else if (rtckbase[offset + i].crc != sb->crc) { snprintf(msg, sizeof(msg), "Sector %lld: Expected CRC of %lX " "but got %lX", offset +i, rtckbase[offset + i].crc, sb->crc); } if (msg[0]) { if (d) { d->msg[offset + i] = strdup(msg); continue; } fprintf(fp, msg); fprintf(fp, "\n"); dumprgn(fp, sb, 0, 1, rtckbase); (void)killpg(0, SIGTERM); exit(EX_DATAERR); } } if ((gctime > 0) && (gctime != r->ctime)) { snprintf(msg, sizeof(msg), "Sector %lld: Expected ctime of %llX " "but got %llX", offset + i, ctime, r->ctime); if (d) { d->msg[offset + i] = strdup(msg); continue; } fprintf(fp, msg); fprintf(fp, "\n"); dumprgn(fp, sb, 0, 1, rtckbase); (void)killpg(0, SIGTERM); 
exit(EX_DATAERR); } /* The record's dst field should match the sector * from which the record was read. */ if (r->dst != offset + i) { snprintf(msg, sizeof(msg), "Sector %lld: Expected dst block of %lld " "but got %lld\n", offset + i, offset + i, r->dst); if (d) { d->msg[offset + i] = strdup(msg); } else { fprintf(fp, msg); fprintf(fp, "\n"); dumprgn(fp, sb, 0, 1, rtckbase); (void)killpg(0, SIGTERM); exit(EX_DATAERR); } } if (d) { if ((r->id < 0) || (r->id > d->nsectors)) { fprintf(fp, "chkrgn: invalid record id %lld " "at sector %lld\n", r->id, offset + i); fprintf(fp, "chkrgn: nsectors=%lld\n", d->nsectors); exit(EX_DATAERR); } if (r->id == (offset + i)) { ++d->inplace; } d->id[offset + i] = r->id; d->crc[offset + i] = crc; /* Save in d->x[r->id] the sector at which the record * r->id was found. If d->x[r->id] is not (-1), then * this is a duplicate, and so we store the duplicate * info in d->y[r->id] in hopes that we can repair * the problem. */ if (d->x[r->id] == -1) { d->x[r->id] = offset + i; } else if (d->y[r->id] == -1) { d->y[r->id] = offset + i; if (!d->msg[r->id]) { (void)snprintf(msg, sizeof(msg), "Duplicate record at " "sector %lld", offset + i); d->msg[r->id] = strdup(msg); } } else { (void)snprintf(msg, sizeof(msg), "%s, %lld", d->msg[r->id], offset + i); free(d->msg[r->id]); d->msg[r->id] = strdup(msg); } } if (signature) { *signature = crc32((u_char *)&crc, sizeof(crc), *signature); } } } /* Called prior to a swap write. Saves the previous contents of rec[0] * into rec[1] so that rec[0] can be rolled back in the case of * catastrophic error (e.g., system crash, kill -9, etc, ...) */ void saverollback(sector_t *x, sector_t *y, int64_t nrgns) { while (nrgns-- > 0) { x[nrgns].rec[1] = y[nrgns].rec[0]; y[nrgns].rec[1] = x[nrgns].rec[0]; } } /* Note that we received a signal. */ RETSIGTYPE sigHandler(int sig) { ++fSig; } /* Note that the watchdog timer expired. */ RETSIGTYPE sigAlarm(int sig) { ++fSigAlarm; } /* Error print. 
*/ void eprintf(char *fmt, ...) { char msg[1024]; va_list ap; sprintf(msg, "%s: ", progname); va_start(ap, fmt); vsnprintf(msg+strlen(msg), sizeof(msg)-strlen(msg), fmt, ap); va_end(ap); fputs(msg, stderr); } #if HAVE_AIO int getxaiobuf(int nrgns, int fd, xaio_t **x1, xaio_t **x2) { assert(sizeof(xaio_t) <= sizeof(sector_t)); if (xaiohead) { *x1 = xaiohead; xaiohead = (*x1)->next; if ((*x1)->next) { (*x1)->next = (xaio_t *)0; } assert((*x1)->magic == 0xdeadbeef); } else { (*x1) = malloc((maxswapsectors + 1) * sizeof(sector_t)); if (!(*x1)) { eprintf("getxaiobuf: malloc(%lu) failed...\n", (nrgns + 1) * sizeof(sector_t)); return ENOMEM; } } bzero(*x1, sizeof(**x1)); (*x1)->sb = (sector_t *)((char *)(*x1) + sizeof(sector_t)); (*x1)->nrgns = nrgns; (*x1)->magic = 0x01234567; if (!x2) { return 0; } if (xaiohead) { *x2 = xaiohead; xaiohead = (*x2)->next; if ((*x2)->next) { (*x2)->next = (xaio_t *)0; } assert((*x2)->magic == 0xdeadbeef); } else { *x2 = malloc((maxswapsectors + 1) * sizeof(sector_t)); if (!(*x2)) { eprintf("getxaiobuf: malloc(%lu) failed...\n", (nrgns + 1) * sizeof(sector_t)); (*x1)->magic = 0xdeadbeef; (*x1)->next = xaiohead; xaiohead = (*x1); return ENOMEM; } } bzero(*x2, sizeof(**x2)); (*x2)->sb = (sector_t *)((char *)(*x2) + sizeof(sector_t)); (*x2)->nrgns = nrgns; (*x2)->magic = 0x01234567; (*x1)->sibling = *x2; (*x2)->sibling = *x1; return 0; } void freexaiobuf(xaio_t *xaio1, xaio_t *xaio2) { assert(xaio1->magic == 0x01234567); xaio1->magic = 0xdeadbeef; xaio1->next = xaiohead; xaiohead = xaio1; if (xaio2) { assert(xaio2->magic == 0x01234567); xaio2->magic = 0xdeadbeef; xaio2->next = xaiohead; xaiohead = xaio2; } } int getidx(int n, xaio_t **ap, int *pidx) { int idx = *pidx; int i; for (i = 0; i < n; ++i) { if (!ap[idx]) { return (*pidx = idx); } idx = (idx + 1) % n; } eprintf("getidx: array is full!\n"); for (i = 0; i < n; ++i) { fprintf(stderr, "%d: %p\n", i, ap[i]); } sleep(3); exit(EX_DATAERR); } int chkoverlap(int n, xaio_t **ap, xaio_t *tgt) { 
int i; for (i = 0; i < n; ++i) { xaio_t *x = ap[i]; if (x) { if (x->lk.l_start + x->lk.l_len < tgt->lk.l_start) { continue; } if (tgt->lk.l_start + tgt->lk.l_len < x->lk.l_start) { continue; } return 1; } } return 0; } #endif /* HAVE_AIO */ #if 0 /* The following C code (by Rob Warnock ) does CRC-32 in * BigEndian/BigEndian byte/bit order. That is, the data is sent most * significant byte first, and each of the bits within a byte is sent most * significant bit first, as in FDDI. You will need to twiddle with it to do * Ethernet CRC, i.e., BigEndian/LittleEndian byte/bit order. [Left as an * exercise for the reader.] * The CRCs this code generates agree with the vendor-supplied Verilog models * of several of the popular FDDI "MAC" chips. */ /* Build auxiliary table for parallel byte-at-a-time CRC-32. */ #define CRC32_POLY 0x04c11db7 /* AUTODIN II, Ethernet, & FDDI */ static crc32_t crc32_table[256]; void crc32_init(void) { crc32_t c; int i, j; for (i = 0; i < 256; ++i) { for (c = i << 24, j = 8; j > 0; --j) { c = c & 0x80000000 ? (c << 1) ^ CRC32_POLY : (c << 1); } crc32_table[i] = c; } } crc32_t crc32(u_char *buf, int len, crc32_t crc) { u_char *p; for (p = buf; len > 0; ++p, --len) { crc = (crc << 8) ^ crc32_table[(crc >> 24) ^ *p]; } return ~crc; /* transmit complement, per CRC-32 spec */ } #else /* This code is copyright © 1993 Richard Black. All rights are reserved. You * may use this code only if it includes a statement to that effect. * * This algorithm takes the observation used to produce algorithm three one * stage further. Whereas it still performs the division eight bits at a time * for cache performance reasons, it is designed in such a way that the data * can be fed into the remainder in thirty two bit units (which are more * efficient units on most computers). This necessitates re-ordering the bits * of the polynomial in a non-monotonic fashion depending on the endian of the * computer on which the algorithm is running. 
The polynomials in the lookup
 * table likewise have the same non-linear transform applied to them as they
 * are generated.
 *
 * Of course this now only works for word aligned data and assumes that the
 * data is an exact number of words. I do not regard these as significant
 * limitations. This code is approximately twice as fast as algorithm three.
 * It should also be noticed that since the data is not broken up into bytes,
 * this code has an even larger benefit when used as an integral part of a
 * data copy routine. The result is also in the local form and should be
 * written directly as a word to the data and not as a sequence of bytes.
 * The tinkerer will observe that the C can be made slightly more beautiful
 * by rearranging the insertion of the data to the start of the loop, using
 * a pre-condition of [expression missing from this transcription], and
 * stopping early. However, apart from the issue at
 * the receiver, such code will be considerably slower, because the data is
 * required synchronously. In the code as I have written it, the compiler
 * will perform the load on p early in the loop body, and so the load delay
 * will have passed by the time the data is required.
 * This loop compiles to 16 instructions on the Arm, 29 on the Mips, 30 on
 * the Alpha, and 19 on the HP-PA. The Arm's instruction count is so low
 * because of the ability to perform shifts for free on all operations, but
 * it loses out on its blocking loads to about the same number of cycles as
 * the Mips. The Alpha gains over the Mips with its s4addq instruction, but
 * loses this win because 32bit loads sign extend and require zeroing. The
 * HP-PA assembler is too weird to comment further.
 * On all architectures, assuming the 1K lookup table is in the cache, the
 * algorithm proceeds at an average of about 1 bit per clock cycle. This
 * represents 25 Mbit/sec on the FPC3 or Maxine, and 150 Mbit/sec on the
 * Sandpiper.
 * This implementation is particularly suitable for hardware at a point where
 * the data path is thirty two bits wide.
The generating register can be * implemented as a 32 bit register with an 8 bit barrel roll operation. * Between each data word being exclusive-ored in, four rolls with exclusive * or of the quotient can be performed. (or two of sixteen). This makes the * xor circuitry much simpler and reduces the percentage of cycles which must * involve the actual data. */ #define CRC32_POLY 0x04c11db7 /* AUTODIN II, Ethernet, & FDDI */ static crc32_t crctab[256]; void crc32_init(void) { int i,j; unsigned int crc; for (i = 0; i < 256; i++) { crc = i << 24; for (j = 0; j < 8; j++) { if (crc & 0x80000000) crc = (crc << 1) ^ CRC32_POLY; else crc = crc << 1; } crctab[i] = crc; } } crc32_t crc32(unsigned char *data, int len, crc32_t crc) { unsigned int *p = (unsigned int *)data; unsigned int *e = (unsigned int *)(data + len); unsigned int result; if (len < 4) { abort(); } result = crc; result ^= ~*p++; while (p < e) { result = crctab[result >> 24] ^ result << 8; result = crctab[result >> 24] ^ result << 8; result = crctab[result >> 24] ^ result << 8; result = crctab[result >> 24] ^ result << 8; result ^= *p++; } return ~result; } #endif /* Read "len" number of sectors into "sb" starting from "offset". */ void getrgn(int fd, sector_t *sb, off_t offset, size_t len, aiocbptr_t aio) { ssize_t cc; /* Read the data over the locked regions. */ #if HAVE_AIO if (aio) { int rc; bzero(aio, sizeof(*aio)); aio->aio_offset = offset * sizeof(*sb); aio->aio_buf = (char *)sb; aio->aio_nbytes = len * sizeof(*sb); #if defined(AIX4) || defined(AIX5) aio->aio_whence = SEEK_SET; aio->aio_flag = AIO_SIGNAL; #endif /* AIX4 || AIX4 */ #ifdef _AIO_AIX_SOURCE /* Legacy aio. */ /* aio->aio_event Not Used */ rc = aio_read(fd, aio); #else /* Posix aio. 
*/ aio->aio_lio_opcode = LIO_READ; aio->aio_fildes = fd; rc = aio_read(aio); #endif /* _AIO_AIX_SOURCE */ if (rc) { eprintf("getrgn: aio_read: %s\n", strerror(errno)); assert(0); exit(EX_OSERR); } return; } #endif /* HAVE_AIO */ /* TODO: We should use probably use pread(), if available. */ if (-1 == lseek(fd, offset * sizeof(*sb), SEEK_SET)) { eprintf("getrgn: lseek: %s\n", strerror(errno)); exit(EX_OSERR); } cc = read(fd, sb, len * sizeof(*sb)); if (-1 == cc) { eprintf("getrgn: read: %s\n", strerror(errno)); exit(EX_IOERR); } else if (cc != len * sizeof(*sb)) { eprintf("getrgn: read: tried to read %ld bytes at offset " "%ld but only %ld bytes were read\n", (long)cc, (long)offset * sizeof(*sb), (long)len * sizeof(*sb)); assert(0); exit(EX_IOERR); } } /* Write the given records to disk. */ void putrgn(int fd, sector_t *sb, off_t offset, size_t len, aiocbptr_t aio) { off_t off; ssize_t cc; if (!fExec) { return; } #if HAVE_AIO if (aio) { int rc; bzero(aio, sizeof(*aio)); aio->aio_offset = offset * sizeof(*sb); aio->aio_buf = (char *)sb; aio->aio_nbytes = len * sizeof(*sb); #if defined(AIX4) || defined(AIX5) aio->aio_flag = AIO_SIGNAL; aio->aio_whence = SEEK_SET; #endif /* AIX4 || AIX4 */ #ifdef _AIO_AIX_SOURCE /* Legacy aio. */ /* aio->aio_event Not Used */ rc = aio_write(fd, aio); #else /* Posix aio. */ aio->aio_lio_opcode = LIO_WRITE; aio->aio_fildes = fd; rc = aio_write(aio); #endif /* _AIO_AIX_SOURCE */ if (rc == -1) { eprintf("putrgn: aio_write: %s\n", strerror(errno)); exit(EX_OSERR); } return; } #endif /* HAVE_AIO */ /* TODO: We should use probably use pwrite(), if available. 
*/ off = lseek(fd, offset * sizeof(*sb), SEEK_SET); if (off != (offset * sizeof(*sb))) { if (-1 == off) { eprintf("putrgn: lseek: %s\n", strerror(errno)); } else { eprintf("putrgn: lseek: invalid offset %d\n", off); } exit(EX_OSERR); } cc = write(fd, sb, len * sizeof(*sb)); if (-1 == cc) { eprintf("putrgn: write: %s\n", strerror(errno)); exit(EX_IOERR); } else if (cc != (len * sizeof(*sb))) { eprintf("putrgn: write: tried to write %ld bytes at offset " "%ld but only %ld bytes were written\n", (long)cc, (long)offset * sizeof(*sb), (long)len * sizeof(*sb)); assert(0); exit(EX_IOERR); } } /* Convert the nnn part of svn's "$Revision: 30 $" into a number. */ long svnrev2num(char *revision) { while (revision && *revision && (*revision != ' ')) { ++revision; } return strtoul(revision, (char **)0, 10) ; } /* Get the user and system time in milliseconds. */ void getustime(long *utime, long *stime) { struct rusage r; int rc; rc = getrusage(RUSAGE_SELF, &r); if (rc) { assert(0); } *utime = (r.ru_utime.tv_sec * 1000000 + r.ru_utime.tv_usec) / 1000; *stime = (r.ru_stime.tv_sec * 1000000 + r.ru_stime.tv_usec) / 1000; } /* $Id: dits.c 30 2005-10-28 01:38:08Z greg $ * * This program finds the size of a disk partition. It does so by * repeated brute force application of the half split method, starting * with the (maximum possible partition size / 2) and moving forward or * back depending on whether the device can be read at that offset. * * To be useful, this program needs to operate in a large file environment. * To attain that, it either needs to be compiled as a 64bit program, or * as a 32bit program that groks large files. * * While FreeBSD5 has native 32bit LFS support, AIX 5.2 and Solaris 9 * do not, and so compiling on those platforms requires additional * flags in order to pull in the correct definitions. In a typical 32bit * environment, off_t is a long. In a 32bit LFS environment, off_t becomes * a long long. 
* * AIX: cc -D_LARGEFILES getpsize.c * Solaris: cc -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64 getpsize.c * FreeBSD: cc getpsize.c */ /* Max number of bytes in a device. This number must be greater * than the number of blocks in any device or else getpsize() * won't work correctly. */ #define DEV_BYTES_MAX (1024LL*1024*1024*1024*1024) /* 1PB */ /*#define DEV_BYTES_MAX (128LL*1024*1024*1024*1024) /* 128TB */ /* Typically, I/O to a block disk device may only take place * on a sector boundary. This macro rounds the given byte * offset down to a sector boundary. */ #define DEV_ROUND(o) ((o) & ~(DEV_BSIZE - 1)) /* Returns the number of bytes in `psize' of the device given by `fd'. */ int getpsize(int fd, off_t *psize) { int64_t offset; int64_t diff; ssize_t cc; *psize = 0; offset = DEV_BYTES_MAX / 2; for (diff = offset / 2; diff > 0; diff /= 2) { char buf[512]; DPRINTF(4, "getpsize: %lld/%lld %lld/%lld %lld\n", offset, offset / DEV_BSIZE, DEV_ROUND(offset), DEV_ROUND(offset) / DEV_BSIZE, diff); cc = pread(fd, buf, sizeof(buf), (off_t)DEV_ROUND(offset)); if (-1 == cc) { if ((ENXIO == errno) || (EOVERFLOW == errno) || (EIO == errno)) { offset -= diff; } else if ((EINTR != errno) && (EAGAIN != errno)) { return errno; } } else if (sizeof(buf) == cc) { offset += diff; } else if (0 == cc) { break; } else if (cc != sizeof(buf)) { return EIO; } } *psize = DEV_ROUND(offset); DPRINTF(4, "getpsize: offset=%lld/%lld diff=%lld psize=%lld/%lld\n", offset, offset/DEV_BSIZE, diff, *psize, *psize / DEV_BSIZE); return 0; }