#
# This file is part of Documancer (http://documancer.sf.net)
#
# Copyright (C) 2004-2005 Vaclav Slavik
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
# $Id: cache.py,v 1.8 2005/01/30 21:50:33 vaclavslavik Exp $
#
# Cache management functions (cache stores fulltext index and any other
# data that can be regenerated if needed, but speed up operation)
#

import os.path
import pickle
import shutil
import string
import threading

try:
    import cPickle
except ImportError:
    # Python 3 has no separate cPickle module; pickle is already accelerated
    import pickle as cPickle

import book
import utils

#
# A brief explanation is in order here: Documancer help providers generate
# HTML code from other sources and they produce the process often involves
# parsing data in one form and presenting them in another form. For example,
# list of available info pages has to be generated on startup and this is
# lengthy process. Since it only rarely changes, it makes sense to store it
# in a cache that can be loaded much faster. Book content, local copies of
# HTML docs from a web server or fulltext search index are another examples
# of data with two important properties:
#
# - they can be regenerated from the source
# - they take some time to generate, so its worth storing them in cache
#
# This module implements generic cache mechanism and takes care of
# invalidating and regenerating data automatically. Every book (or, rather,
# provider) can have several CacheObjects with various data.
# A cache object in turn depends on Dependency objects - a dependency is an
# object that can tell whether it's outdated and if it is, its dependent
# cache objects must be regenerated. Finally, a CacheObject may itself be a
# Dependency, although it's rare in practice.
#

import time, os.path

# --------------------------------------------------------------------------
# Dependencies:
# --------------------------------------------------------------------------

# invalid timestamp - the object is always out of date:
TIMESTAMP_FORCE_OUTDATED = None


class Dependency:
    """Representation of dependency information. Dependency is capable of
       telling its time of last change; if a CacheObject depends on it and
       has older timestamp, it must be recreated."""

    def getTimestamp(self):
        """Returns object's timestamp interval, as seconds since Epoch.
           The returned value is tuple of two floats and all files that
           make the object were created within the returned interval.
           May return TIMESTAMP_FORCE_OUTDATED value, which indicates that
           the object needs remaking regardless of time values."""
        raise NotImplementedError

    def isOutdated(self):
        """Check if the object is outdated or not (meaningful only if it
           has dependencies of its own)."""
        return False


class FilesDependency(Dependency):
    """Dependency implementation for the common case when we depend on
       a set of files."""

    def __init__(self, getFiles):
        """Ctor.
           @param getFiles callable that will be called (without arguments)
                           to obtain the list of files that form the
                           dependency."""
        self.getFilesFunc = getFiles

    def getTimestamp(self):
        """Returns (oldest, newest) modification time of the files, or
           TIMESTAMP_FORCE_OUTDATED if any of them cannot be stat'ed.
           For an empty list of files the result is (-1, 0)."""
        smin = -1    # -1 means "no file seen yet"
        smax = 0
        for f in self.getFilesFunc():
            try:
                stamp = os.path.getmtime(f)
            except os.error:
                # if the object doesn't exist, we are certainly out of date,
                # so return special value:
                return TIMESTAMP_FORCE_OUTDATED
            smax = max(smax, stamp)
            if smin == -1:
                smin = stamp
            else:
                smin = min(smin, stamp)
        return (smin, smax)


class FilesListDependency(FilesDependency):
    """FilesDependency with explicit list of files instead of a callback."""

    def __init__(self, files):
        """Ctor.
           @param files list of file names that form the dependency"""
        self.files = files
        FilesDependency.__init__(self, self._getFiles)

    def _getFiles(self):
        return self.files


# --------------------------------------------------------------------------
# Cache objects:
# --------------------------------------------------------------------------

# States in which cache objects can be:

# Unknown state - it wasn't yet determined if the object is up to date or not:
STATE_UNKNOWN = 1
# The object is up to date, it doesn't need regenerating:
STATE_UP_TO_DATE = 2
# The object is outdated, it must be regenerated:
STATE_OUTDATED = 3
# The object is being regenerated right now:
STATE_WORKED_ON = 4


class CacheObject(FilesDependency):
    """Cache item."""

    def __init__(self, book, name, deps=None):
        """CacheObject ctor.
           @param book  The book that owns this object
           @param name  Name of the item (file or dir name in cache dir)
           @param deps  list of Dependency objects that make this object
                        (a fresh empty list is used if omitted)"""
        self.book = book
        self.name = name
        # never share one default list between all instances:
        if deps is None:
            deps = []
        self.deps = deps
        self.state = STATE_UNKNOWN
        self.objlock = threading.Lock()
        FilesDependency.__init__(self, self._getFiles)

    def _getFiles(self):
        return [self.getFilename()]

    def getFilename(self):
        """Returns full file/dir name of the object."""
        return os.path.join(cache.get(self.book).getDir(), self.name)

    def exists(self):
        """Returns true if the object exists in filesystem."""
        self.lock()
        try:
            return os.path.exists(self.getFilename())
        finally:
            self.unlock()

    def isOutdated(self):
        """Check if the object is outdated or not."""
        # if we don't exist, we are certainly out of date:
        timestamp = self.getTimestamp()
        if timestamp is TIMESTAMP_FORCE_OUTDATED:
            return True
        # find the newest dependency:
        for d in self.deps:
            # a dependency is out of date, that means that so must be we:
            if d.isOutdated():
                return True
            depstamp = d.getTimestamp()
            # the dependency has no usable timestamp (e.g. one of its files
            # is missing), so we cannot claim to be newer than it:
            if depstamp is TIMESTAMP_FORCE_OUTDATED:
                return True
            # if our oldest file/part is older than the newest part of a
            # dependency, we are out of date:
            if timestamp[0] < depstamp[1]:
                return True
        # if we survived all the tests above, we are up to date:
        return False

    def isActive(self):
        """Returns true if the cache object is active (i.e. should be
           updated) and false otherwise. This can be used e.g. to disable
           fulltext search easily."""
        return True

    def clear(self):
        """Clears the cache object, so that it has to be regenerated next
           time it is used."""
        raise NotImplementedError

    def update(self, ctrl):
        """Updates the object; outdated dependencies that implement
           update() are updated first. Calls doUpdate().
           @param ctrl is controller object. If update is capable of
                       interrupting execution, it should periodically check
                       the value of ctrl.cancel and stop if it becomes True.
           @return returns True if finished successfully, False if
                   interrupted
        """
        self.setState(STATE_WORKED_ON)

        # update dependencies:
        for d in self.deps:
            if d.isOutdated():
                # plain Dependency objects don't implement update(); look the
                # method up explicitly so that an AttributeError raised
                # *inside* a dependency's update() is not silently swallowed
                updateDep = getattr(d, 'update', None)
                if updateDep is not None and not updateDep(ctrl):
                    self.setState(STATE_OUTDATED)
                    return False

        # update ourselves in temporary file/directory:
        filename = self.getFilename() + '.work'
        if not self.doUpdate(filename, ctrl):
            # cancelled, undo what was done so far:
            if os.path.isfile(filename):
                os.remove(filename)
            elif os.path.isdir(filename):
                shutil.rmtree(filename)
            self.setState(STATE_OUTDATED)
            return False

        # if the update succeeded, replace the old version with new one
        if os.path.exists(filename):
            self.lock()
            try:
                self.clear()
                os.rename(filename, self.getFilename())
            finally:
                self.unlock()

        self.setState(STATE_UP_TO_DATE)
        return True

    def doUpdate(self, filename, ctrl):
        """Implementation of update().
           @param filename temporary file/dir name the data should be
                           written to
           @param ctrl     controller object, see update()
           @return true value on success, false value if cancelled"""
        raise NotImplementedError

    def setState(self, state):
        """Sets cache object's state to one of STATE_XXX constants and
           sends notifications about it."""
        self.state = state
        notifications.notify(self)

    def lock(self):
        """Locks the object - should be called before the object
           is manipulated."""
        self.objlock.acquire()

    def unlock(self):
        """Unlocks the object - should be called when the object is no
           longer being manipulated."""
        self.objlock.release()


class DirCacheObject(CacheObject):
    """Cache object that consists of directory with arbitrary files in it.
       Derived class must override doUpdate() to fill the directory with
       data."""

    def clear(self):
        shutil.rmtree(self.getFilename(), ignore_errors=1)

    def doUpdate(self, filename, ctrl):
        """Prepares empty temporary directory @a filename. Returns nothing,
           the overriding method is responsible for the return value."""
        # (re)create the temporary directory as empty one:
        if os.path.exists(filename):
            shutil.rmtree(filename, ignore_errors=0)
        os.mkdir(filename)


class FileCacheObject(CacheObject):
    """Cache object that consists of single file.
       Derived class must override doUpdate() to fill the file with data."""

    def clear(self):
        f = self.getFilename()
        if os.path.exists(f):
            os.remove(f)


class DataCacheObject(FileCacheObject):
    """Cache object that is used to store any object (that supports
       cPickle) in the file."""

    def __init__(self, book, name, deps=None):
        """Ctor, takes the same arguments as CacheObject's one."""
        FileCacheObject.__init__(self, book, name, deps)
        self.object = None

    def getObject(self):
        """Returns the stored object, possibly loading it from file.
           Returns None if nothing was stored yet."""
        # identity test: the stored object may define its own comparison
        if self.object is not None:
            return self.object
        # load the object if it isn't in memory, but *don't* regenerate it
        # if it's only outdated (we want the update to happen in background
        # in that case):
        filename = self.getFilename()
        if os.path.isfile(filename):
            # NOTE: unpickling executes code from the file; this is only
            #       safe because the cache directory is written by us
            f = open(filename, 'rb')
            try:
                self.object = cPickle.load(f)
            finally:
                f.close()
        return self.object

    def setObject(self, obj):
        """Stores @a obj both in memory and in the cache file."""
        self.object = obj
        f = open(self.getFilename(), 'wb')
        try:
            cPickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
        finally:
            f.close()


# --------------------------------------------------------------------------
# Cache for single book:
# --------------------------------------------------------------------------

def _ensureDir(path):
    """Creates directory @a path (including missing parents) unless it
       already exists."""
    if not os.path.isdir(path):
        try:
            os.makedirs(path)
        except OSError:
            # another thread may have created it between the check and the
            # call; only fail if the directory is really not there:
            if not os.path.isdir(path):
                raise


class BookCache:
    """Cache associated with single book."""

    def __init__(self, parent, book):
        """Ctor.
           @param parent CacheMgr that owns this cache
           @param book   the book this cache belongs to"""
        self.parent = parent
        self.book = book
        self.cacheDir = None
        self.contentsDir = None
        self.objects = {}

    def getDir(self):
        """Returns cache dir where this book's data are stored; the
           directory is created the first time it is asked for."""
        if self.cacheDir is None:
            # compute the name in a local variable so that other threads
            # never see a half-computed self.cacheDir:
            if self.book.hasAttr(book.ATTR_CACHEDIR):
                cacheDir = self.book.getAttr(book.ATTR_CACHEDIR)
            else:
                x = utils.bookNameToFileName(self.book.title)
                cacheDir = '%s%s' % (self.parent.cacheDir, x)
            if (self.book.definitionFile is not None and
                    self.book.hasAttr('basedir')):
                cacheDir = os.path.join(
                        os.path.dirname(self.book.definitionFile),
                        self.book.getAttr('basedir'),
                        cacheDir)
                # FIXME: basedir is html-only, nuke it
            _ensureDir(cacheDir)
            self.cacheDir = cacheDir
        return self.cacheDir

    def addObject(self, obj):
        """Adds new cacheable object to the cache."""
        self.objects[obj.name] = obj

    def getActiveObjects(self):
        """Returns active objects."""
        return [o for o in self.objects.values() if o.isActive()]

    def remakeAll(self):
        """Regenerates all objects, regardless of whether they're up to date
           or not."""
        # mark all cache objects as needing update:
        for o in self.getActiveObjects():
            o.setState(STATE_OUTDATED)
        # and wake the cache worker thread:
        worker.workToDo.set()


# --------------------------------------------------------------------------
# Cache management:
# --------------------------------------------------------------------------

class CacheMgr:
    """Cache manager class."""

    def __init__(self):
        self.cacheDir = None
        self.caches = {}

    def get(self, book):
        """Returns book-specific cache object."""
        if book not in self.caches:
            if self.cacheDir is None:
                # create directory if it doesn't exist
                cacheDir = '%scache/' % utils.getConfigDir()
                _ensureDir(cacheDir)
                self.cacheDir = cacheDir
            self.caches[book] = BookCache(self, book)
        return self.caches[book]

    def delete(self, book):
        """Deletes all traces of given book from the cache."""
        # use our own BookCache, not the one of the global manager:
        dirname = self.get(book).getDir()
        del self.caches[book]
        shutil.rmtree(dirname, ignore_errors=1)

    def getAllActiveObjects(self):
        """Returns all active objects in all books' caches."""
        objs = []
        for bk in self.caches.values():
            objs += bk.getActiveObjects()
        return objs

    def setCurrentBook(self, bk):
        """Tells the updater which book is currently selected; this is used
           to make sure its cache items are updated more quickly."""
        worker.setCurrentBook(bk)


cache = CacheMgr()
get = cache.get


# --------------------------------------------------------------------------
# Notifications about cache changes:
# --------------------------------------------------------------------------

class Notifications:
    """List of callables that are called whenever a cache object changes
       its state."""

    def __init__(self):
        self.listeners = []

    def add(self, listener):
        """Registers @a listener, a callable taking the changed object."""
        self.listeners.append(listener)

    def remove(self, listener):
        """Unregisters previously added @a listener."""
        self.listeners.remove(listener)

    def notify(self, object):
        """Calls all listeners with @a object as the only argument."""
        # iterate over a copy, a listener may (un)register listeners:
        for l in self.listeners[:]:
            l(object)


notifications = Notifications()


# --------------------------------------------------------------------------
# Cache updating thread:
# --------------------------------------------------------------------------

class UpdateController:
    """This is passed to CacheObject.update() so that it knows when it
       should stop processing."""

    def __init__(self):
        self.cancel = False
        self.clear(1)

    def clear(self, cnt):
        """Resets progress information; @a cnt is the number of items that
           are going to be updated."""
        self.book = ''
        self.itemCount = cnt
        self.itemNum = 0
        self.msg = ''

    def startBook(self, book):
        """Notes that the next item, owned by @a book, is being updated."""
        self.book = book.title
        self.itemNum += 1
        self.message('')

    def message(self, text):
        """Displays progress message."""
        self.msg = text
        self._showMessage()

    def _showMessage(self):
        msg = '%i/%i : Updating %s' % (self.itemNum, self.itemCount,
                                        self.book)
        if self.msg != '':
            msg += ': %s' % self.msg
        utils.uiCallback.setBusyText(msg)


# re-check cache's state every minute:
POLLING_TIMEOUT = 60


class WorkerThread(threading.Thread):
    """Thread that watches the cache and regenerates outdated objects
       as needed."""

    def __init__(self):
        threading.Thread.__init__(self)
        self.stopRequested = False
        # event object for signaling the need to re-scan things:
        self.workToDo = threading.Event()
        self.ctrl = UpdateController()
        # objects waiting for update; guarded by queueLock together with
        # currentObj and currentBook:
        self.queue = []
        self.queueLock = threading.Lock()
        self.currentObj = None
        self.currentBook = None

    def run(self):
        """Thread's main loop: scan objects, update the outdated ones,
           sleep until woken up or until POLLING_TIMEOUT passes."""
        while not self.stopRequested:
            objects = cache.getAllActiveObjects()

            # first, determine state of the objects:
            for o in objects:
                if o.state == STATE_UNKNOWN:
                    if o.isOutdated():
                        o.setState(STATE_OUTDATED)
                    else:
                        o.setState(STATE_UP_TO_DATE)
                if self.stopRequested:
                    return

            # then do something about the outdated ones:
            self.queueLock.acquire()
            try:
                # find all objects, make those that are for currentBook
                # the first ones:
                objects = [o for o in objects if o.state == STATE_OUTDATED]
                self.queue = (
                    [o for o in objects if o.book == self.currentBook] +
                    [o for o in objects if o.book != self.currentBook])
            finally:
                self.queueLock.release()

            if len(self.queue) > 0:
                utils.uiCallback.showBusyIndicator()
                try:
                    self.ctrl.clear(len(objects))
                    while len(self.queue) > 0:
                        self.queueLock.acquire()
                        try:
                            o = self.currentObj = self.queue.pop(0)
                        finally:
                            self.queueLock.release()
                        self.ctrl.startBook(o.book)
                        # forget cancellation of the previous job:
                        self.ctrl.cancel = self.stopRequested
                        try:
                            o.update(self.ctrl)
                        finally:
                            self.queueLock.acquire()
                            self.currentObj = None
                            self.queueLock.release()
                        if self.ctrl.cancel or self.stopRequested:
                            break
                finally:
                    # don't leave the UI "busy" even if an update failed:
                    utils.uiCallback.hideBusyIndicator()

            if self.stopRequested:
                return

            # nothing more to do -- wait until there's more work or until
            # enough time passed for it to be worth re-checking everything
            self.workToDo.wait(POLLING_TIMEOUT)
            self.workToDo.clear()

    def cancelCurrentJob(self):
        """Cancels current job-in-progress."""
        self.ctrl.cancel = True
        self.workToDo.set()

    def setCurrentBook(self, bk):
        """Sets current book, tells the worker thread to restart itself
           and start working on its cache items."""
        self.queueLock.acquire()
        try:
            self.currentBook = bk
            interesting = [o for o in self.queue if o.book == bk]
            if (len(interesting) > 0 and
                    self.currentObj is not None and
                    self.currentObj.book != bk):
                # some updates for this book are scheduled, cancel current
                # job and re-run the queue, so that cache objects needed by
                # this book have higher priority:
                self.cancelCurrentJob()
        finally:
            self.queueLock.release()


worker = WorkerThread()


def shutdown():
    """Asks the worker thread to stop and waits until it does."""
    worker.stopRequested = True
    worker.cancelCurrentJob()
    # join() fails on a thread that was never started; is_alive() is called
    # isAlive() in old Python versions
    isAlive = getattr(worker, 'is_alive', None) or getattr(worker, 'isAlive')
    if isAlive():
        worker.join()