/* $Id: scan.hh,v 1.6 2006/05/19 14:46:25 atterer Exp $ -*- C++ -*- __ _ |_) /| Copyright (C) 2001-2002 | richard@ | \/¯| Richard Atterer | atterer.net ¯ '` ¯ This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2. See the file COPYING for details. *//** @file Scanning of input files, cache of information for each scanned file. */ #ifndef SCAN_HH #define SCAN_HH #include #include #include #include #include #include #include #include #include #include #include #include #include //______________________________________________________________________ /** First part of the filename of a "part", a directory on the local filesystem. It will not appear in the location list - it is just prepended to the filename to actually access the file data. */ class LocationPath { /** Objects are only created by JigdoCache */ friend class JigdoCache; public: /** Sort LocationPaths by the directory name */ bool operator<(const LocationPath& x) const { return path < x.path; } const string& getPath() const { return path; } const string& getLabel() const { return label; } LocationPath& setLabel(const string& l) { label = l; return *this; } const string& getUri() const { return uri; } private: template inline LocationPath(const StringP& p, const StringL& l, const StringU& u = ""); // Using default dtor & copy ctor // The data members are accessed directly by JigdoCache methods string path; // name of directory string label; // short label to identify path in location list string uri; // e.g. URL for directory on FTP server }; typedef set LocationPathSet; //______________________________________________________________________ /** One of the "parts" (=single files) that the image (=big file) consists of. The object is also responsible for scanning the file and creating a) an RsyncSum64 of the first blockLength bytes (as determined by JigdoCache). Caching is done in a "lazy" manner; first, only the file size and name is read, then the MD5 sum of the file's first block as well as the Rsync sum is calculated, and only when the 2nd MD5 sum or the entire file's sum is requested, the file is scanned til EOF. */ class FilePart { /** Objects are only created by JigdoCache */ friend class JigdoCache; public: /** Sort FileParts by RsyncSum of first bytes */ inline const string& getPath() const; LocationPathSet::iterator getLocation() { return path; } /** @return The further dir names and the leafname, after what getPath() returns. */ inline const string& leafName() const; inline uint64 size() const; inline time_t mtime() const; /** Returns null ptr if error and you don't throw it in your JigdoCache error handler */ inline const MD5* getSums(JigdoCache* c, size_t blockNr); /** Returns null ptr if error and you don't throw it */ inline const MD5Sum* getMD5Sum(JigdoCache* c); /** Returns null ptr if error and you don't throw it */ inline const RsyncSum64* getRsyncSum(JigdoCache* c); /** Mark the FilePart as deleted. Unlike the STL containers' erase(), this means that any iterator pointing to the element stays valid, you just cannot access the element. TODO: Check whether this feature is really needed anywhere anymore - I suspect not, maybe we should just delete the element. */ inline void markAsDeleted(JigdoCache* c); /** True if this file will be ignored (but not destroyed with "delete") by JigdoCache because attempts to open/read it failed, or other reasons. */ bool deleted() const { return fileSize == 0; } /** Do not call - this is public only because list<> must be able to delete FileParts */ ~FilePart() { } // default copy ctor / assignment op //__________ private: inline FilePart(LocationPathSet::iterator p, string rest, uint64 fSize, time_t fMtime); /* Called when the methods getSums/getMD5Sum() need data read from file. Might return null. */ const MD5* getSumsRead(JigdoCache* c, size_t blockNr); const MD5Sum* getMD5SumRead(JigdoCache* c); //__________ /* There are 3 states of a FilePart: a) sums.empty(): File has not been read from so far b) !sums.empty() && !mdValid: sums[0] and rsyncSum are valid c) !sums.empty() && mdValid: all sums[] and rsyncSum and md5Sum valid*/ LocationPathSet::iterator path; string pathRest; // further dir names after "path", and leafname of file uint64 fileSize; time_t fileMtime; /* RsyncSum64 of the first MkTemplate::blockLength bytes of the file. */ RsyncSum64 rsyncSum; bool rsyncValid() const { return sums.size() > 0; } /* File is split up into chunks of length md5BlockLength (the last one may be smaller) and the MD5 checksum of each is calculated. */ vector sums; /* Hash of complete file contents. mdValid is true iff md5Sum is cached and valid. NB, it is possible that mdValid==true, but sums.size()==0, after md5BlockSize has been changed. If sums.size()==fileSize/md5ChunkSize, then not necessarily mdValid==true */ MD5Sum md5Sum; enum Flags { EMPTY = 0, // Bit flag is set iff md5Sum contains the whole file's md5 sum MD_VALID = 1, /* This file was looked up in the cache file (whether successfully or not doesn't matter) - don't look it up again. */ WAS_LOOKED_UP = 2, // Write this file's info into the cache file during ~JigdoCache() TO_BE_WRITTEN = 4 }; Flags flags; bool getFlag(Flags f) const { return (flags & f) != 0; } inline void setFlag(Flags f); inline void clearFlag(Flags f); bool mdValid() const { return getFlag(MD_VALID); } # if HAVE_LIBDB // Offsets for binary representation in database (see cachefile.hh) enum { BLOCKLEN = 0, MD5BLOCKLEN = 4, MD5BLOCKS = 8, RSYNCSUM = 12, FILE_MD5SUM = 20, PART_MD5SUM = 36 }; size_t unserializeCacheEntry(const byte* data, size_t dataSize, size_t md5BlockLength); // Byte stream => FilePart struct SerializeCacheEntry; // FilePart => byte stream friend struct SerializeCacheEntry; # endif }; //______________________________________________________________________ /** If a JigdoCacheError happens during any call to a FilePart or JigdoCache method, repeating that call is possible to continue. */ struct JigdoCacheError : Error { explicit JigdoCacheError(const string& m) : Error(m) { } explicit JigdoCacheError(const char* m) : Error(m) { } }; /** A list of FileParts that is "lazy": Nothing is actually read from the files passed to it until that is really necessary. JigdoCache behaves like a list. A JigdoCache cannot hold zero-length files. They will be silently skipped when you try to add them. Important note: If you decide to throw the errors handed to your errorHandler, JigdoCacheErrors can be thrown by some methods. The default cerrPrint (guess what) only outputs the error message to cerr. */ class JigdoCache { friend class FilePart; public: class ProgressReporter; /** cacheFileName can be "" for no file cache */ explicit JigdoCache(const string& cacheFileName, size_t expiryInSeconds = 60*60*24*30, size_t bufLen = 128*1024, ProgressReporter& pr = noReport); /** The dtor will try to write cached data to the cache file */ ~JigdoCache(); /** Read a list of filenames from the object and store them in the JigdoCache. Only the file size is read during the call, Rsync/MD5 sums are calculated later, if/when needed. */ template inline void readFilenames(RecurseDir& rd); /** Set the sizes of cache's blockLength and md5BlockLength parameters. This means that the checksum returned by a FilePart's getRsyncSum() method will cover the specified size. NB: If the cache already contains files and the new parameters are not the same as the old, the start of the files will (eventually) have to be re-read to re-calculate the checksums for blockLength, and the *whole* file will have to be re-read for a changed md5BlockLength. */ void setParams(size_t blockLen,size_t md5BlockLen); size_t getBlockLen() const { return blockLength; } size_t getMD5BlockLen() const { return md5BlockLength; } /** Amount of data that JigdoCache will attempt to read per call to ifstream::read(), and size of buffer allocated. Minimum: 64k */ inline void setReadAmount(size_t bytes); /** To speed things up, JigdoCache by default keeps one file open and a read buffer allocated. You can close the file and deallocate the buffer with a call to this method. They will be re-opened/re-allocated automatically if/when needed. */ void deallocBuffer() { buffer.resize(0); } /** Return reporter supplied by JigdoCache creator */ ProgressReporter* getReporter() { return &reporter; } /** Set and get whether to check if files exist in the filesystem */ inline void setCheckFiles(bool check) { checkFiles = check; } inline bool getCheckFiles() const { return checkFiles; } /** Returns number of files in cache */ inline size_t size() const { return nrOfFiles; } /** Make a label for a certain directory name known. E.g. addLabel("pub/debian/","deb") will cause the file pub/debianf;/dists/stable/x.deb to appear as deb:dists/stable/x.deb in the location list. Each of the paramters can bei either string or const char*. Note: If no labels are defined, a name will automatically be created later: dirA, dirB, ..., dirZ, dirAA, ... */ template const LocationPathSet::iterator addLabel( const StringP& path, const StringL& label, const StringU& uri = ""); /** Access to the members like for a list<> */ typedef FilePart value_type; typedef FilePart& reference; //____________________ /** Only part of the iterator functionality implemented ATM */ class iterator { friend class JigdoCache; friend class FilePart; public: iterator() { } inline iterator& operator++(); // might throw(RecurseError, bad_alloc) FilePart& operator*() { return *part; } FilePart* operator->() { return &*part; } // Won't compare cache members - their being different is usu. a bug bool operator==(const iterator& i) const { Paranoid(cache == i.cache); return part == i.part; } bool operator!=(const iterator& i) const { return !(*this == i); } // Default dtor private: iterator(JigdoCache* c, const list::iterator& p) : cache(c), part(p) { } JigdoCache* cache; list::iterator part; }; friend class JigdoCache::iterator; /** First element of the JigdoCache */ inline iterator begin(); /** NB the list auto-extends, so the value of end() may change while you iterate over a JigdoCache. */ iterator end() { return iterator(this, files.end()); } //____________________ private: // Read one filename from recurseDir and (if success) add entry to "files" void addFile(const string& name); /// Default reporter: Only prints error messages to stderr static ProgressReporter noReport; size_t blockLength, md5BlockLength; /* Check if files exist in the filesystem */ bool checkFiles; /* List of files in the cache (not vector<> because jigdo-file keeps ptrs, and if a vector realloc()s, all elements' addresses may change) */ list files; // Equal to files.size() less any files that are deleted() size_t nrOfFiles; // Temporarily used during readFilenames() static struct stat fileInfo; // Look up LocationPath by directory name LocationPathSet locationPaths; size_t readAmount; vector buffer; ProgressReporter& reporter; # if HAVE_LIBDB CacheFile* cacheFile; size_t cacheExpiry; # endif }; //______________________________________________________________________ /** Class allowing JigdoCache to convey information back to the creator of a JigdoCache object. */ class JigdoCache::ProgressReporter { public: virtual ~ProgressReporter() { } /** General-purpose error reporting. NB: With JigdoCache, you can throw an exception inside this to abort whatever operation JigdoCache is doing. May be called during all checksum-returning methods of FilePart */ virtual void error(const string& message); /** Like error(), but for purely informational messages. Default implementation, just like that of error(), prints message to cerr */ virtual void info(const string& message); /** Called when the individual files are read. JigdoCache sometimes reads only the first md5BlockLength bytes and sometimes the whole file. This is *only* called when the whole file is read (if the file only consists of one MD5 block, it is also called). @param file File being scanned, or null if not applicable (Contains filename info). Default version does nothing at all - you need to supply an object of a derived class to JigdoCache() to act on this. @param offInFile Offset in this file */ virtual void scanningFile(const FilePart* file, uint64 offInFile); }; //______________________________________________________________________ FilePart::FilePart(LocationPathSet::iterator p, string rest, uint64 fSize, time_t fMtime) : path(p), pathRest(rest), fileSize(fSize), fileMtime(fMtime), rsyncSum(), sums(), md5Sum(), flags(EMPTY) { //pathRest.reserve(0); } const string& FilePart::getPath() const { Paranoid(!deleted()); return path->getPath(); } const string& FilePart::leafName() const { Paranoid(!deleted()); return pathRest; } uint64 FilePart::size() const { Paranoid(!deleted()); return fileSize; } time_t FilePart::mtime() const { Paranoid(!deleted()); return fileMtime; } const MD5* FilePart::getSums(JigdoCache* c, size_t blockNr) { Paranoid(!deleted()); if (mdValid() || (blockNr == 0 && !sums.empty())) return &sums[blockNr]; else return getSumsRead(c, blockNr); } const MD5Sum* FilePart::getMD5Sum(JigdoCache* c) { Paranoid(!deleted()); if (mdValid()) return &md5Sum; else return getMD5SumRead(c); } const RsyncSum64* FilePart::getRsyncSum(JigdoCache* c) { Paranoid(!deleted()); if (!rsyncValid()) { if (getSumsRead(c, 0) == 0) return 0; } Paranoid(rsyncValid()); return &rsyncSum; } void FilePart::markAsDeleted(JigdoCache* c) { fileSize = 0; sums.resize(0); --(c->nrOfFiles); } void FilePart::setFlag(Flags f) { flags = static_cast(static_cast(flags) | static_cast(f)); } void FilePart::clearFlag(Flags f) { flags = static_cast(static_cast(flags) & ~static_cast(f)); } //________________________________________ void JigdoCache::setReadAmount(size_t bytes) { if (bytes < 64*1024) bytes = 64*1024; readAmount = bytes; } template void JigdoCache::readFilenames(RecurseDir& rd) { string name; while (true) { bool status = rd.getName(name, &fileInfo, checkFiles); // Might throw error if (status == FAILURE) return; // No more names off_t stSize = fileInfo.st_size; # if HAVE_LIBDB if (!checkFiles) { const byte* data; size_t dataSize; try { if (cacheFile->findName(data, dataSize, name, stSize, fileInfo.st_mtime).failed()) continue; } catch (DbError e) { string err = subst(_("Error accessing cache: %1"), e.message); reporter.error(err); } } # endif if (stSize == 0) continue; // Skip zero-length files addFile(name); } } JigdoCache::iterator JigdoCache::begin() { list::iterator i = files.begin(); while (i != files.end() && i->deleted()) ++i; return iterator(this, i); } JigdoCache::iterator& JigdoCache::iterator::operator++() { list::iterator end = cache->files.end(); do ++part; while (part != end && part->deleted()); return *this; } //______________________________________________________________________ template LocationPath::LocationPath(const StringP& p, const StringL& l, const StringU& u) : path(p), label(l), uri(u) { //path.reserve(0); label.reserve(0); uri.reserve(0); } //________________________________________ /* NB: If an entry with the given path already exists, we must not delete it and create another, because it is referenced from at least one FilePart. Because of our version of LocationPath::operator<, only the path matters during the find(). */ template const LocationPathSet::iterator JigdoCache::addLabel(const StringP& path, const StringL& label, const StringU& uri) { LocationPath tmp(path, label, uri); if (!tmp.path.empty() && tmp.path[tmp.path.length() - 1] != DIRSEP) tmp.path += DIRSEP; LocationPathSet::iterator i = locationPaths.find(tmp); if (i == locationPaths.end()) { return locationPaths.insert(tmp).first; // Create new entry } else { implicit_cast(i->label) = tmp.label; // Overwrite old entry implicit_cast(i->uri) = tmp.uri; return i; } } #endif