# Part of the A-A-P recipe executive: Access files which may be remote

# Copyright (C) 2002-2003 Stichting NLnet Labs
# Permission to copy and use this file is specified in the file COPYING.
# If this file is missing you can find it here: http://www.a-a-p.org/COPYING

#
# Access files by their URL.
# If they are remote, may download or upload the file.
# Uses the Cache to avoid up/downloading too often.
#

import os.path
import shutil
import string
import sys
import time
from urlparse import urlparse
from urllib import urlretrieve, urlcleanup, urlopen

from Util import *
from Message import *


def is_url(name):
    """Return non-zero when "name" is a URL, zero when it's a local file."""
    # On MS-Windows c:/name is a file, not a URL.
    if len(name) > 2 and name[1] == ':':
        return 0
    scheme = urlparse(name, '', 0)[0]
    return scheme != ''


def url_split3(name):
    """Split a URL into scheme, machine and path.
       Returns a tuple (scheme, machine, path).  The scheme is empty for a
       local file and for a "file:" URL.  Parameters, query and fragment that
       urlparse() splits off are put back into the path."""
    # On MS-Windows c:/name is a file, not a URL.
    if len(name) > 2 and name[1] == ':':
        return '', '', name

    scheme, mach, path, parm, query, frag = urlparse(name, '', 0)

    # urlparse() strips the separators.  Put them back, otherwise
    # "file;type=i" would silently turn into "filetype=i".
    if parm != '':
        path = path + ';' + parm
    if query != '':
        path = path + '?' + query
    if frag != '':
        path = path + '#' + frag

    # Python 2.4 understands rsync, but includes an extra slash.
    if scheme == 'rsync' and mach != '':
        i = string.index(name, mach) + len(mach)
        if i < len(name) and name[i] == '/':
            path = name[i+1:]

    if scheme != '' and mach == '' and path[:2] == '//':
        # urlparse doesn't handle scp://machine/path correctly
        # Python 2.3 doesn't handle rsync either.
        mach = path[2:]
        path = ''
        i = string.find(mach, '/')
        if i > 0:
            path = mach[i + 1:]
            mach = mach[:i]

    if scheme == "file":
        scheme = ''         # A file is the same as no scheme.

    # For "ftp://mach/foobar" the file is "foobar", not "/foobar".
    if scheme == "ftp" and len(path) > 1 and path[0] == '/':
        path = path[1:]

    return scheme, mach, path


def url_time(recdict, name):
    """Obtain the timestamp in seconds (in GMT if possible) for the URL "name".
       Returns zero (very old) if the timestamp can't be obtained."""
    if is_url(name):
        from Cache import cache_lookup
        c = cache_lookup(recdict, name)
        if c:
            # use timestamp for cached file.
            t = c.timestamp()
        else:
            # obtain timestamp for remote files.
            t = remote_time(recdict, name)
    else:
        try:
            t = os.path.getmtime(name)
        except (IOError, OSError):
            t = 0
    return t


def remote_time(recdict, name):
    """Get the timestamp of a remote file.
       Returns zero when the file cannot be accessed and 1 when it can be
       accessed but has no usable timestamp."""
    try:
        msg_info(recdict, _('getting timestamp for "%s"') % name)
        up = urlopen(name)
        try:
            t = get_header_date(up.info())
        finally:
            # Always release the connection, also when reading the headers
            # fails.
            up.close()
        if t == 1:
            msg_info(recdict, _('"%s" can be found but has no timestamp')
                                                                       % name)
    except (KeyboardInterrupt, SystemExit):
        # Never turn a user interrupt into "no timestamp".
        raise
    except:
        # Deliberately broad: any failure to reach the file means "very old".
        msg_info(recdict, _('Could not obtain timestamp for "%s"') % name)
        t = 0
    return t


def get_header_date(headers):
    """Get the date from a MIME header.
       "Last-Modified" is preferred, "Date" is the fallback.
       Returns 1 when no usable date is available, so that the result is
       different from that of a file that does not exist (zero)."""
    from rfc822 import parsedate
    for key in ("Last-Modified", "Date"):
        if headers.has_key(key):
            parsed = parsedate(headers[key])
            if parsed is None:
                # Malformed header, time.mktime(None) would throw a
                # TypeError.  Try the next header.
                continue
            try:
                return time.mktime(parsed)
            except (ValueError, OverflowError):
                # Date is out of range for this platform, try the next one.
                pass
    # When a file does exist but has no timestamp return 1, so that it's
    # different from a file that does not exist.
    return 1


def get_progname_rsync(recdict):
    """
    Use $RSYNC if defined, otherwise use "rsync -p --rsh=ssh --copy-links".
    """
    return get_progname(recdict, "RSYNC", "rsync",
                                              " -p --rsh=ssh --copy-links")


def get_progname_scp(recdict):
    """
    Use $SCP if defined, otherwise use "scp -C -p".
    """
    return get_progname(recdict, "SCP", "scp", " -C -p")


def get_progname_rcp(recdict):
    """
    Use $RCP if defined, otherwise use "rcp -p".
    """
    return get_progname(recdict, "RCP", "rcp", " -p")


def _download_with_program(recdict, scheme, fmach, fpath, resfile):
    """Download "fpath" from machine "fmach" to "resfile" with the external
       program for "scheme", which must be 'rcp', 'scp' or 'rsync'."""
    # Install the program when needed.
    from DoInstall import assert_pkg
    from Work import getrpstack
    assert_pkg(getrpstack(recdict), recdict, scheme)

    if scheme == 'rcp':
        progname = get_progname_rcp(recdict)
    elif scheme == 'scp':
        progname = get_progname_scp(recdict)
    else:
        progname = get_progname_rsync(recdict)

    # NOTE(review): machine and path are put in the command unquoted, a name
    # containing white space or shell special characters will not work.
    cmd = '%s %s:%s %s' % (progname, fmach, fpath, resfile)
    if scheme != 'rcp' and os.name != "posix":
        # Can't use "tee" and scp/ssh may prompt for a password.
        cmd = "{interactive} " + cmd

    # NOTE(review): the result of logged_system() is not checked here,
    # presumably it reports a failing command itself -- confirm.
    logged_system(recdict, cmd)


def _download_ftp(fmach, fpath, resfile):
    """Download "fpath" from ftp server "fmach" to "resfile" with ftplib.
       Throws an IOError when it fails."""
    # urlretrieve() doesn't work well for ftp in Python 1.5, use ftplib.
    # This also allows us to cache the connections.
    # And it avoids a bug in urllib that trying to download a file without
    # read permission results in a directory listing.
    from CopyMove import ftpConnect

    # Create the output file first (opening ftp connection may take time).
    # sys.exc_info() is used to get the exception, this works with every
    # Python version.
    msg = ''
    try:
        ftpfile = open(resfile, "wb")
    except StandardError:
        msg = (_('Cannot open "%s" for writing: %s')
                                             % (resfile, sys.exc_info()[1]))

    if not msg:
        ftp, msg = ftpConnect(fmach)
        if not msg:
            # Invoke the ftp command.  Use a passive connection, this
            # appears to work best.
            import ftplib
            ftp.set_pasv(1)
            try:
                ftp.retrbinary("RETR " + fpath, ftpfile.write, 8192)
            except ftplib.all_errors:
                msg = sys.exc_info()[1]
        ftpfile.close()
        if msg:
            # Delete an empty or truncated result file.  Failing to do so
            # must not hide the error that is reported below.
            try:
                os.remove(resfile)
            except EnvironmentError:
                pass

    if msg:
        raise IOError(msg)


def _download_urllib(url, fname, fscheme):
    """Download "url" with urlretrieve(), to "fname" or to a temporary file
       when "fname" is empty.
       Returns a tuple of the resulting file name and the remote timestamp.
       Throws an IOError when the result looks like a "404" error page."""
    rtime = 0
    if fname == '':
        # read to temporary file
        resfile, h = urlretrieve(url)
    else:
        resfile, h = urlretrieve(url, fname)
        if resfile != fname:
            # Using a cached file, need to make a copy.
            shutil.copy2(resfile, fname)
            resfile = fname
        urlcleanup()

    # When obtaining a file through http:// an non-existing page isn't
    # noticed.  Check for a 404 error by looking in the file.  Limit the
    # search to the first 1000 bytes, an error page should not be longer,
    # while an actual file can be very long.
    f = open(resfile)
    try:
        txt = f.read(1000)
    finally:
        f.close()
    import re
    if re.search("\\s*404\\s*not\\s*found", txt, re.IGNORECASE):
        try_delete(resfile)
        raise IOError(_("fetch_%s() encountered a 404 error for %s")
                                                         % (fscheme, url))

    if h:
        rtime = get_header_date(h)
    return resfile, rtime


def url_download(recdict, url, fname):
    """Attempt downloading file "url" to file "fname".
       Overwrite "fname" if it already exists.
       When "fname" is empty, use a temporary file.  The caller has to use
       "url_cleanup()" when done with it.
       Returns a tuple of the filename and the timestamp of the remote file
       when possible.
       Throws an IOError if downloading failed."""
    # Translate the message first and fill in the name afterwards, otherwise
    # the text is never found in the message catalog.
    msg_info(recdict, _('Attempting download of "%s"') % url)
    rtime = 0

    fscheme, fmach, fpath = url_split3(url)

    # First try using a function the user specified, this overrules our own
    # stuff.
    scope = recdict.get("_no")
    if scope is not None:
        user_func = scope.get("fetch_" + fscheme)
    else:
        user_func = None

    if user_func or fscheme in ('rcp', 'scp', 'rsync', 'ftp'):
        if fname == '':
            from RecPython import tempfname
            resfile = tempfname()
        else:
            resfile = fname

        if user_func:
            res = user_func(recdict, fmach, fpath, resfile)
            if not res:
                raise IOError(_("fetch_%s() could not download %s")
                                                         % (fscheme, url))
        elif fscheme == 'ftp':
            _download_ftp(fmach, fpath, resfile)
        else:
            _download_with_program(recdict, fscheme, fmach, fpath, resfile)
    else:
        resfile, rtime = _download_urllib(url, fname, fscheme)

    if fname == '':
        msg_info(recdict, _('Downloaded "%s"') % url)
    else:
        msg_info(recdict, _('Downloaded "%s" to "%s"') % (url, fname))

    return resfile, rtime


def url_cleanup(scheme):
    """Cleanup after using url_download with scheme "scheme"."""
    if scheme not in ['scp', 'rsync']:
        urlcleanup()    # remove any cached file from urlretrieve()


def download_file(recdict, url_dl, node, use_cache):
    """Download a file according to "url_dl" and copy it over "node.name".
       Use the cache when "use_cache" is non-zero, otherwise obtain a fresh
       copy.
       Can also be used for a local file, it is copied.
       Return non-zero for success.
       Throws a UserError when copying the downloaded file fails."""
    from Cache import local_name
    from VersCont import separate_scheme

    # When copying a local file invoke local_name() and copy the file, the
    # cache isn't really used, even though "use_cache" is set.
    url = url_dl["name"]
    scheme, fname = separate_scheme(url)
    if scheme == "file" or not is_url(url):
        use_cache = 1

    # When not using the cache download directly to the destination file.
    # Avoids that the cache is filled with files that are never used again.
    if not use_cache:
        if skip_commands():
            msg_info(recdict, _('Skip download for "%s"')
                                                       % node.short_name())
            return 1
        try:
            f, rtime = url_download(recdict, url, node.absname)
        except EnvironmentError:
            e = sys.exc_info()[1]
            msg_note(recdict, _('Cannot download "%s" to "%s": %s')
                                      % (url, node.short_name(), str(e)))
            return 0
        return 1

    cu = url_dl.get("cache_update")

    # This downloads the file when it's not in the cache already.
    # TODO: handle attributes (e.g., login and password)
    fname, used_cache = local_name(recdict, url, cu)
    if fname and os.path.exists(fname):
        if skip_commands():
            msg_info(recdict, _('Skip copy file: "%s"') % node.short_name())
        else:
            # copy the downloaded file over the original one.
            # Catch EnvironmentError: besides IOError the copy may fail with
            # an OSError or shutil.Error, these must be reported as well.
            try:
                shutil.copyfile(fname, node.absname)
            except EnvironmentError:
                e = sys.exc_info()[1]
                raise UserError(_('Cannot copy "%s" to "%s": ')
                                           % (fname, node.name) + str(e))
            if used_cache:
                msg_info(recdict, _('Copied file from cache: "%s"')
                                                       % node.short_name())
            else:
                msg_info(recdict, _('Copied file "%s" to "%s"')
                                              % (fname, node.short_name()))
        return 1

    return 0


def upload_file(recdict, url_dl, nodelist):
    """Upload nodes in "nodelist" according to "url_dl".
       Return list of nodes that failed."""
    # TODO: use other attributes in url_dl, e.g. a login name.
    from CopyMove import remote_copy_move
    from VersCont import repl_file_name

    failed = []

    # Make a copy of the nodelist.  Remove items that have been done until
    # none are left.
    todolist = nodelist[:]
    while todolist:
        # Collect nodes that are in the same directory and where the source
        # and destination file names are identical.
        to_item = ''
        fromlist = []
        renamed = 0     # set when "to_item" already is the full file name
        for node in todolist[:]:
            to_name = repl_file_name(url_dl["name"], node.name)
            if os.path.basename(node.absname) == os.path.basename(to_name):
                d = os.path.dirname(to_name)
                if not d:
                    d = "."
                if not to_item or d == to_item:
                    fromlist.append({"name" : node.absname})
                    todolist.remove(node)
                    to_item = d
            elif not to_item:
                # Source and target name are different, must copy this one
                # by itself.
                fromlist = [ {"name" : node.absname } ]
                todolist.remove(node)
                to_item = to_name
                renamed = 1
                break

        # When there is only one item include the file name in the to_item,
        # this avoids using the name of a directory for a file.
        # Use the flag instead of comparing with "to_name": after the loop
        # "to_name" may belong to a node that was not put in "fromlist".
        if len(fromlist) == 1 and not renamed:
            to_item = path_join(to_item,
                                    os.path.basename(fromlist[0]["name"]))

        msg_info(recdict, 'Uploading %s to %s'
                % (str(map(lambda x: x["name"], shorten_dictlist(fromlist))),
                                                                    to_item))
        flist = remote_copy_move([], recdict, 1, fromlist,
                                  { "name" : to_item }, {"mkdir": 1}, 0,
                                                                  errmsg = 0)

        # Find the nodes for the failed file names.
        for f in flist:
            for node in nodelist:
                if node.absname == f:
                    failed.append(node)

    return failed


def remote_remove(recdict, url_dl, node):
    """Delete remote file for node "node" according to "url_dl".
       Not implemented yet: only gives a message and returns 1."""
    msg_info(recdict, 'Removing "%s" NOT IMPLEMENTED YET' % url_dl["name"])
    return 1


# vim: set sw=4 et sts=4 tw=79 fo+=l: