def is_url(name):
    """Return non-zero when "name" is a URL, zero when it's a local file.

    A name whose second character is a colon (e.g. "c:/name") is taken to be
    an MS-Windows drive path.  Any other name is a URL when urlparse() finds
    a non-empty scheme in it."""
    # On MS-Windows c:/name is a file, not a URL.
    if len(name) > 2 and name[1] == ':':
        return 0
    scheme = urlparse(name, '', 0)[0]
    return scheme != ''


def url_split3(name):
    """Split a URL into scheme, machine and path.

    Returns a tuple (scheme, machine, path).  For a local file, including an
    MS-Windows "c:/name" path and a "file:" URL, the scheme is empty.
    For "ftp" and "rsync" URLs the slash that separates the machine from the
    path is not included in the path."""
    # On MS-Windows c:/name is a file, not a URL.
    if len(name) > 2 and name[1] == ':':
        return '', '', name

    scheme, mach, path, parm, query, frag = urlparse(name, '', 0)
    # NOTE(review): the parts are glued back together without their ';' and
    # '?' separators; kept as it was since callers may depend on it.
    path = path + parm + query + frag

    # Python 2.4 understands rsync, but includes an extra slash.
    if scheme == 'rsync' and mach != '':
        # Start looking after "scheme:", otherwise a machine name such as
        # "sync" is found inside the "rsync:" prefix and the slash stays.
        start = name.find(mach, len(scheme) + 1)
        if start >= 0:
            end = start + len(mach)
            if end < len(name) and name[end] == '/':
                path = name[end + 1:]

    if scheme != '' and mach == '' and path[:2] == '//':
        # urlparse doesn't handle scp://machine/path correctly
        # Python 2.3 doesn't handle rsync either.
        mach = path[2:]
        path = ''
        i = mach.find('/')
        if i > 0:
            path = mach[i + 1:]
            mach = mach[:i]

    if scheme == "file":
        scheme = ''     # A file is the same as no scheme.

    # For "ftp://mach/foobar" the file is "foobar", not "/foobar".
    if scheme == "ftp" and len(path) > 1 and path[0] == '/':
        path = path[1:]

    return scheme, mach, path


def url_time(recdict, name):
    """Obtain the timestamp in seconds (in GMT if possible) for the URL "name".
    Returns zero (very old) if the timestamp can't be obtained.

    For a URL the timestamp of the cached copy is used when cache_lookup()
    finds one, otherwise the remote side is asked with remote_time().
    For a local file the modification time of the file is used."""
    if not is_url(name):
        try:
            return os.path.getmtime(name)
        except (IOError, OSError):
            # Missing or unreadable file: report it as very old.
            return 0

    from Cache import cache_lookup
    c = cache_lookup(recdict, name)
    if c:
        # use timestamp for cached file.
        return c.timestamp()
    # obtain timestamp for remote files.
    return remote_time(recdict, name)


def remote_time(recdict, name):
    """Get the timestamp of a remote file.

    Opens "name" with urlopen() and takes the time from the headers with
    get_header_date().  Returns 1 when the file can be found but has no
    timestamp and zero when the timestamp could not be obtained at all.
    Progress and failure are reported with msg_info()."""
    try:
        msg_info(recdict, _('getting timestamp for "%s"') % name)
        up = urlopen(name)
        try:
            t = get_header_date(up.info())
        finally:
            # Also close the connection when reading the headers fails.
            up.close()
        if t == 1:
            msg_info(recdict,
                     _('"%s" can be found but has no timestamp') % name)
    except (KeyboardInterrupt, SystemExit):
        # The user wants to stop, that is not a missing timestamp.
        raise
    except Exception:
        msg_info(recdict, _('Could not obtain timestamp for "%s"') % name)
        t = 0
    return t


def get_header_date(headers):
    """Get the date from a MIME header.

    "headers" is a mapping with a get() method.  The "Last-Modified" header
    is tried first, then "Date".  The result is in seconds since the epoch;
    a time zone given in the header is taken into account.
    Returns 1 when no usable date is available, so that a file that exists
    but has no timestamp is different from a file that does not exist."""
    try:
        from rfc822 import parsedate_tz, mktime_tz
    except ImportError:
        # No rfc822 module in this Python, email.utils has the same functions.
        from email.utils import parsedate_tz, mktime_tz

    for key in ("Last-Modified", "Date"):
        value = headers.get(key)
        if value:
            parsed = parsedate_tz(value)
            # parsedate_tz() returns None for text it doesn't understand.
            if parsed is not None:
                # mktime_tz() applies the zone offset; time.mktime() would
                # wrongly take the GMT header value as local time.
                return mktime_tz(parsed)

    # When a file does exist but has no timestamp return 1, so that it's
    # different from a file that does not exist.
    return 1


def get_progname_rsync(recdict):
    """
    Use $RSYNC if defined, otherwise use "rsync -p --rsh=ssh --copy-links".
    The lookup itself is done by get_progname().
    """
    return get_progname(recdict, "RSYNC", "rsync",
                        " -p --rsh=ssh --copy-links")


def get_progname_scp(recdict):
    """
    Use $SCP if defined, otherwise use "scp -C -p".
    The lookup itself is done by get_progname().
    """
    return get_progname(recdict, "SCP", "scp", " -C -p")


def get_progname_rcp(recdict):
    """
    Use $RCP if defined, otherwise use "rcp -p".
    The lookup itself is done by get_progname().
    """
    return get_progname(recdict, "RCP", "rcp", " -p")
Throws an IOError if downloading failed.""" msg_info(recdict, _('Attempting download of "%s"' % url)) rtime = 0 fscheme, fmach, fpath = url_split3(url) # First try using a function the user specified, this overrules our own # stuff. scope = recdict.get("_no") if not scope is None: user_func = scope.get("fetch_" + fscheme) else: user_func = None if (user_func or fscheme == 'rcp' or fscheme == 'scp' or fscheme == 'rsync' or fscheme == 'ftp'): if fname == '': from RecPython import tempfname resfile = tempfname() else: resfile = fname if user_func: res = apply(user_func, (recdict, fmach, fpath, resfile)) if not res: raise IOError, (_("fetch_%s() could not download %s") % (fscheme, url)) elif fscheme == 'rcp': # Install rcp when needed. from DoInstall import assert_pkg from Work import getrpstack assert_pkg(getrpstack(recdict), recdict, "rcp") cmd = '%s %s:%s %s' % (get_progname_rcp(recdict), fmach, fpath, resfile) logged_system(recdict, cmd) elif fscheme == 'scp': # Install scp when needed. from DoInstall import assert_pkg from Work import getrpstack assert_pkg(getrpstack(recdict), recdict, "scp") cmd = '%s %s:%s %s' % (get_progname_scp(recdict), fmach, fpath, resfile) if os.name != "posix": # Can't use "tee" and scp may prompt for a password. cmd = "{interactive} " + cmd logged_system(recdict, cmd) elif fscheme == 'rsync': # Install rsync when needed. from DoInstall import assert_pkg from Work import getrpstack assert_pkg(getrpstack(recdict), recdict, "rsync") cmd = '%s %s:%s %s' % (get_progname_rsync(recdict), fmach, fpath, resfile) if os.name != "posix": # Can't use "tee" and ssh may prompt for a password. cmd = "{interactive} " + cmd logged_system(recdict, cmd) elif fscheme == 'ftp': # urlretrieve() doesn't work well for ftp in Python 1.5, use ftplib. # This also allows us to cache the connections. # And it avoids a bug in urllib that trying to download a file without # read permission results in a directory listing. 
from CopyMove import ftpConnect # Create the output file first (opening ftp connection may take time). msg = '' try: ftpfile = open(resfile, "wb") except StandardError, e: msg = _('Cannot open "%s" for writing: %s') % (resfile, e) if not msg: ftp, msg = ftpConnect(fmach) if not msg: # Invoke the ftp command. Use a passive connection, this # appears to work best. import ftplib ftp.set_pasv(1) try: ftp.retrbinary("RETR " + fpath, ftpfile.write, 8192) except ftplib.all_errors, e: msg = e ftpfile.close() if msg: # Delete an empty or truncated result file. os.remove(resfile) if msg: raise IOError, msg else: if fname == '': # read to temporary file resfile, h = urlretrieve(url) else: resfile, h = urlretrieve(url, fname) if resfile != fname: # Using a cached file, need to make a copy. shutil.copy2(resfile, fname) resfile = fname urlcleanup() # When obtaining a file through http:// an non-existing page isn't # noticed. Check for a 404 error by looking in the file. Limit the # search to the first 1000 bytes, an error page should not be longer, # while an actual file can be very long. f = open(resfile) txt = f.read(1000) f.close() import re if re.search("