# ports//devel/aap/work/Remote.py

# Part of the A-A-P recipe executive: Access files which may be remote

# Copyright (C) 2002-2003 Stichting NLnet Labs
# Permission to copy and use this file is specified in the file COPYING.
# If this file is missing you can find it here: http://www.a-a-p.org/COPYING

#
# Access files by their URL.
# If they are remote, may download or upload the file.
# Uses the Cache to avoid up/downloading too often.
#

import os.path
import shutil
import time
from urlparse import urlparse
from urllib import urlretrieve, urlcleanup, urlopen

from Util import *
from Message import *

def is_url(name):
    """Tell whether "name" is a URL (true value) or a local file (false
       value)."""
    # A drive letter followed by a colon, as in "c:/name" on MS-Windows, is
    # a local file and must not be mistaken for a URL scheme.
    looks_like_drive = len(name) > 2 and name[1] == ':'
    if looks_like_drive:
        return 0
    # Only the scheme matters; fragments are not split off.
    parts = urlparse(name, '', 0)
    return parts[0] != ''


def url_split3(name):
    """Split a URL into scheme, machine and path.
       Returns a tuple (scheme, machine, path).
       A name that looks like an MS-Windows "c:/name" file is returned
       unmodified as the path, with empty scheme and machine.
       The "file" scheme is returned as an empty scheme.
       For "rsync", "scp", "rcp" and "ftp" the slash that separates the
       machine from the path is not included in the path."""
    # On MS-Windows c:/name is a file, not a URL.
    if len(name) > 2 and name[1] == ':':
        return '', '', name
    scheme, mach, path, parm, query, frag = urlparse(name, '', 0)
    # NOTE: when urlparse() splits off parameters or a query, the ';' and '?'
    # separators are not put back here.
    path = path + parm + query + frag

    # When urlparse() recognizes the machine it leaves an extra slash at the
    # start of the path (Python 2.4 does this for rsync, later versions for
    # any scheme).  Take the path from the original name instead, so that the
    # result is the same as for the fallback below.
    if scheme in ('rsync', 'scp', 'rcp') and mach != '':
        # Search for the machine name after the "//", otherwise a short
        # machine name may be found inside the scheme.
        i = name.find(mach, name.find('//') + 2)
        if i >= 0:
            i = i + len(mach)
            if name[i:i + 1] == '/':
                path = name[i + 1:]

    if scheme != '' and mach == '' and path[:2] == '//':
        # urlparse doesn't handle scp://machine/path correctly
        # Python 2.3 doesn't handle rsync either.
        mach = path[2:]
        path = ''
        i = mach.find('/')
        if i > 0:
            path = mach[i + 1:]
            mach = mach[:i]

    if scheme == "file":
        scheme = ''     # A file is the same as no scheme.

    # For "ftp://mach/foobar" the file is "foobar", not "/foobar".
    if scheme == "ftp" and len(path) > 1 and path[0] == '/':
        path = path[1:]
    return scheme, mach, path


def url_time(recdict, name):
    """Return the timestamp in seconds for "name", which may be a URL or a
       local file.
       For a URL the timestamp of a cached copy is used when there is one,
       otherwise the remote timestamp is requested.
       Returns zero (very old) if the timestamp can't be obtained."""
    if not is_url(name):
        # Local file: use its modification time, zero when that fails.
        try:
            return os.path.getmtime(name)
        except (IOError, OSError):
            return 0

    from Cache import cache_lookup
    cached = cache_lookup(recdict, name)
    if cached:
        # There is a cache entry, use its timestamp.
        return cached.timestamp()
    # Not cached, ask for the timestamp of the remote file.
    return remote_time(recdict, name)


def remote_time(recdict, name):
    """Get the timestamp of a remote file.
       Opens the URL "name" and takes the timestamp from the returned
       headers with get_header_date().
       Returns zero when the URL can't be opened or the headers can't be
       used.  A message is given in that case and also when the result is 1
       (the file was found but has no timestamp)."""
    msg_info(recdict, _('getting timestamp for "%s"') % name)
    try:
        up = urlopen(name)
        try:
            t = get_header_date(up.info())
        finally:
            # Also close the connection when reading the headers failed.
            up.close()
    except (KeyboardInterrupt, SystemExit):
        # Never swallow a request to stop.
        raise
    except Exception:
        msg_info(recdict, _('Could not obtain timestamp for "%s"') % name)
        return 0

    if t == 1:
        msg_info(recdict, _('"%s" can be found but has no timestamp') % name)
    return t


def get_header_date(headers):
    """Get the date from a MIME header.
       "headers" must support get(), such as a dictionary or the message
       object returned by info() of an opened URL.
       The "Last-Modified" header is used when it holds a usable date,
       otherwise the "Date" header.
       Returns the timestamp in seconds, or 1 when there is no usable date
       (so that it's different from the zero used for a file that does not
       exist)."""
    try:
        from email.utils import parsedate
    except ImportError:
        # Older Python versions only have the rfc822 module.
        from rfc822 import parsedate

    # NOTE(review): time.mktime() interprets the fields as local time, while
    # these headers are normally in GMT - confirm whether that is intended.
    for key in ("Last-Modified", "Date"):
        value = headers.get(key)
        if value:
            # parsedate() returns None when the text is not a valid date.
            fields = parsedate(value)
            if fields is not None:
                try:
                    return time.mktime(fields)
                except (ValueError, OverflowError):
                    # Date is out of range, try the next header.
                    pass

    # When a file does exist but has no timestamp return 1, so that it's
    # different from a file that does not exist.
    return 1


def get_progname_rsync(recdict):
    """
    Return the command for rsync: $RSYNC if defined, otherwise
    "rsync -p --rsh=ssh --copy-links".
    """
    default_args = " -p --rsh=ssh --copy-links"
    return get_progname(recdict, "RSYNC", "rsync", default_args)

def get_progname_scp(recdict):
    """
    Return the command for scp: $SCP if defined, otherwise "scp -C -p".
    """
    default_args = " -C -p"
    return get_progname(recdict, "SCP", "scp", default_args)

def get_progname_rcp(recdict):
    """
    Return the command for rcp: $RCP if defined, otherwise "rcp -p".
    """
    default_args = " -p"
    return get_progname(recdict, "RCP", "rcp", default_args)


def url_download(recdict, url, fname):
    """Attempt downloading file "url" to file "fname".
       Overwrite "fname" if it already exists.
       When "fname" is empty, use a temporary file.  The caller has to use
       "url_cleanup()" when done with it.
       Returns a tuple of the filename and the timestamp of the remote file
       when possible.
       Throws an IOError if downloading failed."""
    msg_info(recdict, _('Attempting download of "%s"' % url))
    rtime = 0

    fscheme, fmach, fpath = url_split3(url)

    # First try using a function the user specified, this overrules our own
    # stuff.
    scope = recdict.get("_no")
    if not scope is None:
        user_func = scope.get("fetch_" + fscheme)
    else:
        user_func = None

    if (user_func
            or fscheme == 'rcp'
            or fscheme == 'scp'
            or fscheme == 'rsync'
            or fscheme == 'ftp'):
        if fname == '':
            from RecPython import tempfname
            resfile = tempfname()
        else:
            resfile = fname

    if user_func:
        res = apply(user_func, (recdict, fmach, fpath, resfile))
        if not res:
            raise IOError, (_("fetch_%s() could not download %s")
                                                              % (fscheme, url))

    elif fscheme == 'rcp':
        # Install rcp when needed.
        from DoInstall import assert_pkg
        from Work import getrpstack
        assert_pkg(getrpstack(recdict), recdict, "rcp")

        cmd = '%s %s:%s %s' % (get_progname_rcp(recdict), fmach, fpath, resfile)
        logged_system(recdict, cmd)

    elif fscheme == 'scp':
        # Install scp when needed.
        from DoInstall import assert_pkg
        from Work import getrpstack
        assert_pkg(getrpstack(recdict), recdict, "scp")

        cmd = '%s %s:%s %s' % (get_progname_scp(recdict), fmach, fpath, resfile)
        if os.name != "posix":
            # Can't use "tee" and scp may prompt for a password.
            cmd = "{interactive} " + cmd

        logged_system(recdict, cmd)

    elif fscheme == 'rsync':
        # Install rsync when needed.
        from DoInstall import assert_pkg
        from Work import getrpstack
        assert_pkg(getrpstack(recdict), recdict, "rsync")

        cmd = '%s %s:%s %s' % (get_progname_rsync(recdict),
                                                         fmach, fpath, resfile)
        if os.name != "posix":
            # Can't use "tee" and ssh may prompt for a password.
            cmd = "{interactive} " + cmd

        logged_system(recdict, cmd)

    elif fscheme == 'ftp':
        # urlretrieve() doesn't work well for ftp in Python 1.5, use ftplib.
        # This also allows us to cache the connections.
        # And it avoids a bug in urllib that trying to download a file without
        # read permission results in a directory listing.
        from CopyMove import ftpConnect

        # Create the output file first (opening ftp connection may take time).
        msg = ''
        try:
            ftpfile = open(resfile, "wb")
        except StandardError, e:
            msg = _('Cannot open "%s" for writing: %s') % (resfile, e)

        if not msg:
            ftp, msg = ftpConnect(fmach)

            if not msg:
                # Invoke the ftp command.  Use a passive connection, this
                # appears to work best.
                import ftplib
                ftp.set_pasv(1)
                try:
                    ftp.retrbinary("RETR " + fpath, ftpfile.write, 8192)
                except ftplib.all_errors, e:
                    msg = e

            ftpfile.close()
            if msg:
                # Delete an empty or truncated result file.
                os.remove(resfile)

        if msg:
            raise IOError, msg

    else:
        if fname == '':
            # read to temporary file
            resfile, h = urlretrieve(url)
        else:
            resfile, h = urlretrieve(url, fname)
            if resfile != fname:
                # Using a cached file, need to make a copy.
                shutil.copy2(resfile, fname)
                resfile = fname
            urlcleanup()

        # When obtaining a file through http:// an non-existing page isn't
        # noticed.  Check for a 404 error by looking in the file.  Limit the
        # search to the first 1000 bytes, an error page should not be longer,
        # while an actual file can be very long.
        f = open(resfile)
        txt = f.read(1000)
        f.close()

        import re
        if re.search("<title>\\s*404\\s*not\\s*found", txt, re.IGNORECASE):
            try_delete(resfile)
            raise IOError, (_("fetch_%s() encountered a 404 error for %s")
                                                              % (fscheme, url))
        if h:
            rtime = get_header_date(h)

    if fname == '':
        msg_info(recdict, _('Downloaded "%s"' % url))
    else:
        msg_info(recdict, _('Downloaded "%s" to "%s"' % (url, fname)))

    return resfile, rtime


def url_cleanup(scheme):
    """Cleanup after using url_download with scheme "scheme".
       Nothing is done for the "scp" and "rsync" schemes."""
    if scheme in ('scp', 'rsync'):
        return
    # Remove any cached file that urlretrieve() left behind.
    urlcleanup()


def download_file(recdict, url_dl, node, use_cache):
    """Download a file according to "url_dl" and copy it over "node.name".
       Use the cache when "use_cache" is non-zero, otherwise obtain a fresh
       copy.
       Can also be used for a local file, it is copied.
       Return non-zero for success."""
    from Cache import local_name
    from VersCont import separate_scheme

    # When copying a local file invoke local_name() and copy the file, the
    # cache isn't really used, even though "use_cache" is set.
    url = url_dl["name"]
    scheme, fname = separate_scheme(url)
    if scheme == "file" or not is_url(url):
        use_cache = 1

    # When not using the cache download directly to the destination file.
    # Avoids that the cache is filled with files that are never used again.
    if not use_cache:
        if skip_commands():
            msg_info(recdict, _('Skip download for "%s"') % node.short_name())
            return 1
        try:
            f, rtime = url_download(recdict, url, node.absname)
        except EnvironmentError, e:
            msg_note(recdict, _('Cannot download "%s" to "%s": %s')
                                        % (url, node.short_name(), str(e)))
            return 0
        return 1

    if url_dl.has_key("cache_update"):
        cu = url_dl["cache_update"]
    else:
        cu = None
    # This downloads the file when it's not in the cache already.
    # TODO: handle attributes (e.g., login and password)
    fname, used_cache = local_name(recdict, url, cu)

    if fname and os.path.exists(fname):
        if skip_commands():
            msg_info(recdict, _('Skip copy file: "%s"') % node.short_name())
        else:
            # copy the downloaded file over the original one.
            try:
                shutil.copyfile(fname, node.absname)
            except IOError, e:
                raise UserError, (_('Cannot copy "%s" to "%s": ')
                                                 % (fname, node.name) + str(e))
            if used_cache:
                msg_info(recdict, _('Copied file from cache: "%s"')
                                                           % node.short_name())
            else:
                msg_info(recdict, _('Copied file "%s" to "%s"')
                                                  % (fname, node.short_name()))
        return 1
    return 0


def upload_file(recdict, url_dl, nodelist):
    """Upload nodes in "nodelist" according to "url_dl".
       The destination name of each node is obtained with repl_file_name()
       from the "name" entry of "url_dl" and the node name.  Nodes that keep
       their file name and go to the same directory are uploaded together.
       Return list of nodes that failed."""
    # TODO: use other attributes in url_dl, e.g. a login name.
    from CopyMove import remote_copy_move
    from VersCont import repl_file_name
    failed = []

    # Make a copy of the nodelist.  Remove items that have been done until none
    # are left.
    todolist = nodelist[:]
    while todolist:
        # Collect nodes that are in the same directory and where the source and
        # destination file names are identical.
        to_item = ''
        fromlist = []
        # Set when "to_item" is the full destination file name instead of a
        # directory.  Used instead of comparing with the "to_name" left over
        # from the loop, which may belong to another node.
        to_is_file = 0
        for node in todolist[:]:
            to_name = repl_file_name(url_dl["name"], node.name)
            if os.path.basename(node.absname) == os.path.basename(to_name):
                d = os.path.dirname(to_name)
                if not d:
                    d = "."
                if not to_item or d == to_item:
                    fromlist.append({"name" : node.absname})
                    todolist.remove(node)
                    to_item = d
            elif not to_item:
                # Source and target name are different, must copy this one by
                # itself.
                fromlist = [ {"name" : node.absname } ]
                todolist.remove(node)
                to_item = to_name
                to_is_file = 1
                break

        # When there is only one item include the file name in the to_item,
        # this avoids using the name of a directory for a file.
        if len(fromlist) == 1 and not to_is_file:
            to_item = path_join(to_item, os.path.basename(fromlist[0]["name"]))

        names = [item["name"] for item in shorten_dictlist(fromlist)]
        msg_info(recdict, 'Uploading %s to %s' % (str(names), to_item))

        flist = remote_copy_move([], recdict, 1, fromlist,
                             { "name" : to_item }, {"mkdir": 1}, 0, errmsg = 0)

        # Find the nodes for the failed file names.
        for f in flist:
            for node in nodelist:
                if node.absname == f:
                    failed.append(node)

    return failed


def remote_remove(recdict, url_dl, node):
    """Delete remote file for node "node" according to "url_dl".
       Not implemented yet: only gives a message and returns 1."""
    target = url_dl["name"]
    msg_info(recdict, 'Removing "%s" NOT IMPLEMENTED YET' % target)
    return 1


# vim: set sw=4 et sts=4 tw=79 fo+=l:
# syntax highlighted by Code2HTML, v. 0.9.1