/*
 * XML Catalog Manager (xmlcatmgr)
 * $Id: xmldoc.c,v 1.1 2004/08/31 19:07:23 jmmv Exp $
 *
 * Copyright (c) 2003, 2004 Julio M. Merino Vidal.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of the author nor the names of contributors may
 *    be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
 * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This file implements the XML document parser.  It constructs a
 * parse tree, using xmlnode and xmlattr types as helpers.
 *
 * The parser is quite limited, but it is enough for xmlcatmgr's
 * purposes.  In particular, it lacks:
 * - Unicode support.
 * - DOCTYPE support (as it stands, it just reads the declaration's
 *   contents without parsing them).
 * - Entity support.
 */

#include "system.h"

#ifndef lint
__RCSID("$Id: xmldoc.c,v 1.1 2004/08/31 19:07:23 jmmv Exp $");
#endif

#include "grstr.h"
#include "mem.h"
#include "linklist.h"
#include "xmldoc.h"
#include "xmlnode.h"

/* Lexer scopes: select which read_token_*() helper read_token() calls. */
#define SCOPE_GLOBAL 0
#define SCOPE_TAG 1
#define SCOPE_COMMENT 2
#define SCOPE_STRING 3

/* Token kinds returned by read_token() and its helpers. */
#define TOKEN_ERROR 0
#define TOKEN_LT 1
#define TOKEN_GT 2
#define TOKEN_EQUAL 3
#define TOKEN_SLASH 4
#define TOKEN_QUEST 5
#define TOKEN_EXCL 6
#define TOKEN_DASH 7
#define TOKEN_WORD 8
#define TOKEN_STRING 9
#define TOKEN_COMMENT 10

/* True for the token kinds that carry a heap-allocated text. */
#define TOKEN_HAS_MEMORY(tk) \
    ((tk) == TOKEN_WORD || (tk) == TOKEN_STRING || (tk) == TOKEN_COMMENT)

/*
 * Releases the text attached to a token, if its kind has one.  Wrapped
 * in do/while(0) so that it behaves as a single statement and can be
 * used safely as the body of an if/else.  Must be followed by ';'.
 */
#define TOKEN_FREE(tk, ptr) \
    do { \
        if (TOKEN_HAS_MEMORY(tk)) \
            free(ptr); \
    } while (0)

/*
 * True if 'ch' can be part of a word.  'ch' is evaluated twice and is
 * handed to isalnum(), so it must be a side-effect free value in the
 * unsigned char range (e.g., a non-EOF result of fgetc()).
 */
#define ISWORD(ch) (isalnum(ch) || (ch) == '_')

/* Tokenizer: read_token() dispatches to one helper per SCOPE_* value. */
static int read_token(FILE *, int, char **);
static int read_token_comment(FILE *, struct grstr *);
static int read_token_global(FILE *, struct grstr *);
static int read_token_string(FILE *, struct grstr *);
static int read_token_tag(FILE *, struct grstr *);

/* Parser: prolog readers and the node/attribute builders. */
static bool read_prolog(FILE *, struct xmldoc *);
static bool read_xmldecl(FILE *, struct xmldoc *);
static struct doctype *read_doctype(FILE *, struct xmldoc *);
static struct xmlnode *get_node(FILE *, bool *);
static void get_node2(FILE *f, struct xmlnode **, bool *);
static struct xmlnode *get_node3(FILE *f, char *name);
static struct xmlnode *get_special_node(FILE *);
static struct xmlattr *get_attr(FILE *, char *);

/* --------------------------------------------------------------------- */

/*
 * Creates a new xmldoc object with no nodes, whose DOCTYPE text is a
 * copy of 'doctype' and whose declaration carries a single
 * version="1.0" attribute.  Returns a pointer to it, or NULL if any
 * allocation fails (in which case nothing is leaked).
 */
struct xmldoc *
xmldoc_new(const char *doctype)
{
    char *name, *value;
    struct doctype *dt;
    struct xmlattr *xa;
    struct xmldoc *xd;

    assert(doctype != NULL);

    xd = malloc(sizeof(*xd));
    if (xd == NULL)
        return NULL;

    dt = malloc(sizeof(*dt));
    if (dt == NULL) {
        free(xd);
        return NULL;
    }

    dt->dt_text = strdup(doctype);
    if (dt->dt_text == NULL) {
        free(dt);
        free(xd);
        return NULL;
    }

    xd->xd_doctype = dt;
    xd->xd_root = NULL;
    LINKLIST_INIT(&xd->xd_attrs);
    LINKLIST_INIT(&xd->xd_nodes);

    /* Only build the attribute if both of its strings were allocated. */
    name = strdup("version");
    value = strdup("1.0");
    xa = NULL;
    if (name != NULL && value != NULL)
        xa = xmlattr_new(name, value);
    if (xa == NULL) {
        /* NOTE(review): assumes xmlattr_new() does not release its
         * arguments when it fails -- confirm against xmlattr_new(). */
        free(name);
        free(value);
        xmldoc_free(xd);
        return NULL;
    }
    LINKLIST_APPEND(&xd->xd_attrs, xa);

    return xd;
}

/* --------------------------------------------------------------------- */

/*
 * Builds an XML parse tree from the given stream, which is rewound
 * first.  Returns the new document, or NULL if memory could not be
 * allocated, the prolog could not be read or no root element was found.
 */
struct xmldoc *
xmldoc_parse(FILE *f)
{
    struct xmldoc *doc;
    struct xmlnode *node;

    doc = (struct xmldoc *)malloc(sizeof(struct xmldoc));
    if (doc == NULL)
        return NULL;

    doc->xd_doctype = NULL;
    doc->xd_root = NULL;
    LINKLIST_INIT(&doc->xd_attrs);
    LINKLIST_INIT(&doc->xd_nodes);

    rewind(f);

    if (!read_prolog(f, doc)) {
        warnx("catalog does not look like an XML file; missing prolog");
        xmldoc_free(doc);
        return NULL;
    }

    for (node = get_node(f, NULL); node != NULL; node = get_node(f, NULL)) {
        /* Anything that is not an element is attached as is. */
        if (XMLNODE_TYPE(node) != XMLNODE_TYPE_ELEMENT) {
            LINKLIST_APPEND(&doc->xd_nodes, node);
            continue;
        }

        /* Only the first element is kept: it becomes the root. */
        if (doc->xd_root != NULL) {
            warnx("root node already seen; <%s> discarded",
                  XMLNODE_TAG(node));
            xmlnode_free(node);
            continue;
        }

        xmlnode_become_root(node);
        LINKLIST_APPEND(&doc->xd_nodes, node);
        doc->xd_root = node;
    }

    if (doc->xd_root != NULL)
        return doc;

    warnx("no root node");
    xmldoc_free(doc);
    return NULL;
}

/* --------------------------------------------------------------------- */

/*
 * Destroys the given xmldoc object.  It frees all attached attributes
 * and childs.
 */
void
xmldoc_free(struct xmldoc *xd)
{
    struct xmlattr *xa;
    struct xmlnode *xn;

    assert(xd != NULL);

    if (xd->xd_doctype != NULL) {
        free(xd->xd_doctype->dt_text);
        free(xd->xd_doctype);
    }

    xa = LINKLIST_FIRST(&(xd->xd_attrs));
    while (xa != NULL) {
        struct xmlattr *tmp;

        tmp = LINKLIST_NEXT(xa);
        xmlattr_free(xa);
        xa = tmp;
    }

    xn = LINKLIST_FIRST(&(xd->xd_nodes));
    while (xn != NULL) {
        struct xmlnode *tmp;

        tmp = LINKLIST_NEXT(xn);
        xmlnode_free(xn);
        xn = tmp;
    }

    free(xd);
}

/* --------------------------------------------------------------------- */

/*
 * Dumps the given XML document to a stream, at its current position:
 * the XML declaration with its attributes, the DOCTYPE (if the document
 * has one) and then every attached node.  All parts are always
 * attempted; returns true only if every write succeeded.
 */
bool
xmldoc_write(struct xmldoc *xd, FILE *f)
{
    bool res;
    struct xmlattr *aiter;
    struct xmlnode *niter;

    assert(xd != NULL && f != NULL);

    /* fprintf() signals failure with any negative value, not just -1. */
    res = fprintf(f, "<?xml") >= 0;

    LINKLIST_FOREACH(aiter, &(xd->xd_attrs)) {
        res &= fprintf(f, " ") >= 0;
        res &= xmlattr_write(aiter, f);
    }
    res &= fprintf(f, "?>\n") >= 0;

    if (xd->xd_doctype != NULL) {
        res &= fprintf(f, "<!DOCTYPE %s>\n\n", xd->xd_doctype->dt_text) >= 0;
    }

    LINKLIST_FOREACH(niter, &(xd->xd_nodes)) {
        res &= xmlnode_write(niter, f);
    }

    return res;
}

/* --------------------------------------------------------------------- */

/*
 * Attaches the given node to the document.  A node of type element or
 * root is also recorded as the document's root, so the document must
 * not have one yet (this is asserted).  Any other node is just added
 * to the node list.
 */
void
xmldoc_append_node(struct xmldoc *xd, struct xmlnode *xn)
{
    const bool takes_root_slot =
        XMLNODE_TYPE(xn) == XMLNODE_TYPE_ELEMENT ||
        XMLNODE_TYPE(xn) == XMLNODE_TYPE_ROOT;

    if (takes_root_slot) {
        assert(xd->xd_root == NULL);
        xd->xd_root = xn;
    }

    LINKLIST_APPEND(&xd->xd_nodes, xn);
}

/* --------------------------------------------------------------------- */

/*
 * Reads the XML prolog: the XML declaration followed by the DOCTYPE.
 * A missing XML declaration makes the function fail; a missing DOCTYPE
 * only produces a warning.
 */
static bool
read_prolog(FILE *f, struct xmldoc *xd)
{
    struct doctype *dt;

    assert(f != NULL && xd != NULL);

    if (!read_xmldecl(f, xd)) {
        warnx("missing XML declaration; invalid document");
        return false;
    }

    dt = read_doctype(f, xd);
    if (dt != NULL)
        xd->xd_doctype = dt;
    else
        warnx("missing DOCTYPE declaration; omitting error");

    return true;
}

/* --------------------------------------------------------------------- */

/*
 * Reads the XML declaration, i.e. '<?xml name="value" ... ?>'.  Each
 * attribute found is appended to the document's attribute list.
 * Returns true if the whole declaration, up to the closing '>', was
 * read; false otherwise.
 */
static bool
read_xmldecl(FILE *f, struct xmldoc *xd)
{
    char *token;
    int type;

    token = NULL;

    if (read_token(f, SCOPE_GLOBAL, NULL) != TOKEN_LT)
        return false;

    if (read_token(f, SCOPE_TAG, NULL) != TOKEN_QUEST)
        return false;

    type = read_token(f, SCOPE_TAG, &token);
    if (type != TOKEN_WORD || strcmp(token, "xml") != 0) {
        TOKEN_FREE(type, token);
        return false;
    }
    free(token);

    while ((type = read_token(f, SCOPE_TAG, &token)) == TOKEN_WORD) {
        struct xmlattr *attr;

        /* On success the attribute keeps the name; otherwise it is
         * still ours to release. */
        attr = get_attr(f, token);
        if (attr != NULL) {
            LINKLIST_APPEND(&xd->xd_attrs, attr);
        } else {
            free(token);
        }
    }

    /* The token that ended the loop is released exactly once here,
     * before its kind is inspected, so no path can free it twice. */
    TOKEN_FREE(type, token);

    if (type != TOKEN_QUEST)
        return false;

    return read_token(f, SCOPE_TAG, NULL) == TOKEN_GT;
}

/* --------------------------------------------------------------------- */

/*
 * Reads a '<!word ...>' declaration from the document, expected to be
 * the DOCTYPE.  The word itself is not checked and the contents are
 * not parsed: everything between the word and the next '>' is stored
 * verbatim.  Returns a new doctype object, or NULL if there is no such
 * declaration or an error occurs.  When what follows in the stream
 * does not start such a declaration, the stream position is restored
 * so the caller can read it again.
 */
static struct doctype *
read_doctype(FILE *f, struct xmldoc *xd)
{
    char *token;
    int ch, type;
    long int pos;
    struct doctype *dt;
    struct grstr *buf;

    assert(f != NULL && xd != NULL);

    pos = ftell(f);

    if (read_token(f, SCOPE_GLOBAL, NULL) != TOKEN_LT ||
        read_token(f, SCOPE_TAG, NULL) != TOKEN_EXCL)
        goto restore;

    token = NULL;
    type = read_token(f, SCOPE_TAG, &token);
    TOKEN_FREE(type, token);
    if (type != TOKEN_WORD)
        goto restore;

    /* Skip the blanks that separate the word from the contents. */
    while ((ch = fgetc(f)) != EOF && isspace(ch))
        ;
    if (ch == EOF)
        return NULL;

    buf = grstr_new();
    if (buf == NULL)
        return NULL;

    do {
        if (!grstr_append_char(buf, ch)) {
            grstr_free(buf);
            return NULL;
        }
    } while ((ch = fgetc(f)) != EOF && ch != '>');

    /* The declaration was cut short; drop what was collected. */
    if (ch == EOF) {
        grstr_free(buf);
        return NULL;
    }

    dt = malloc(sizeof(*dt));
    if (dt == NULL) {
        grstr_free(buf);
        return NULL;
    }

    /* NOTE(review): grstr_to_text() presumably hands over the text and
     * disposes of 'buf', as read_token() relies on -- confirm. */
    dt->dt_text = grstr_to_text(buf);
    if (dt->dt_text == NULL) {
        free(dt);
        return NULL;
    }

    return dt;

restore:
    if (pos != -1L)
        fseek(f, pos, SEEK_SET);
    return NULL;
}

/* --------------------------------------------------------------------- */

/*
 * Get the next node from the file, which may be either a tag (a real
 * tag or a comment) or a text node.  If what we get is a closing tag,
 * i.e., we read something like '</tagname>', 'closing' is set to true
 * to tell the caller that a base case of the recursion was reached.
 * 'closing' may be NULL when the caller is not inside any element.
 * Returns NULL at end of file or on error.
 */
static struct xmlnode *
get_node(FILE *f, bool *closing)
{
    char *token;
    int type;
    struct xmlnode *xn;

    assert(f != NULL);

    token = NULL;
    type = read_token(f, SCOPE_GLOBAL, &token);
    switch (type) {
    case TOKEN_ERROR:
        xn = NULL;
        break;
    case TOKEN_LT:
        get_node2(f, &xn, closing);
        break;
    default:
        assert(TOKEN_HAS_MEMORY(type));

        if (closing != NULL)
            *closing = false;

        xn = xmlnode_new(XMLNODE_TYPE_TEXT, strdup("#TEXT#"));
        if (xn == NULL) {
            /* No node was created to receive the text, so release it
             * here instead of handing a NULL node to xmlnode_set_text(). */
            free(token);
        } else {
            xmlnode_set_text(xn, token);
        }
    }

    return xn;
}

/* --------------------------------------------------------------------- */

/*
 * Get the next node from the file (part two).  We assume that we've
 * already read the opening character '<'.  The result is stored in
 * '*xn', which is set to NULL on error.  If what we get is a closing
 * tag, i.e., we read something like '/tagname>', '*xn' is a node named
 * after that tag and 'closing' is set to true, to tell the caller that
 * a base case of the recursion was reached.  A closing tag found while
 * 'closing' is NULL is reported as unbalanced.
 */
static void
get_node2(FILE *f, struct xmlnode **xn, bool *closing)
{
    char *token;
    int type;

    assert(f != NULL && xn != NULL);

    if (closing != NULL)
        *closing = false;

    token = NULL;
    type = read_token(f, SCOPE_TAG, &token);
    switch (type) {
    case TOKEN_ERROR:
        warn("error reading element name");
        *xn = NULL;
        break;
    case TOKEN_EXCL:
        *xn = get_special_node(f);
        break;
    case TOKEN_SLASH:
        *xn = NULL;
        type = read_token(f, SCOPE_TAG, &token);
        if (type != TOKEN_WORD) {
            TOKEN_FREE(type, token);
            warnx("unexpected token (word expected)");
        } else if (closing == NULL) {
            warnx("unbalanced tags (</%s> unexpected)", token);
            free(token);
        } else if (read_token(f, SCOPE_TAG, NULL) != TOKEN_GT) {
            warnx("unexpected token ('>' expected)");
            /* No node takes the tag name on this path; release it. */
            free(token);
        } else {
            *xn = xmlnode_new(XMLNODE_TYPE_ELEMENT, token);
            *closing = true;
        }
        break;
    case TOKEN_WORD:
        *xn = get_node3(f, token);
        break;
    default:
        TOKEN_FREE(type, token);
        warnx("unexpected token (element name expected)");
        *xn = NULL;
    }
}

/* --------------------------------------------------------------------- */

/*
 * Get the next node from the file (part three).  We assume we've already
 * read the opening character for the tag '<' and its name (which is
 * provided in the 'name' parameter), so we are ready to read its
 * attributes (if any), its closing character(s) and, for a non-empty
 * element, its children up to the matching closing tag.  Returns the
 * element, or NULL if it could not be created or turned out to be
 * malformed.
 */
static struct xmlnode *
get_node3(FILE *f, char *name)
{
    bool done;
    char *token;
    int type;
    struct xmlnode *xn;

    xn = xmlnode_new(XMLNODE_TYPE_ELEMENT, name);
    if (xn == NULL)
        return NULL;

    done = false;
    while (!done && (type = read_token(f, SCOPE_TAG, &token)) != TOKEN_ERROR) {
        bool closing;
        struct xmlattr *attr;
        struct xmlnode *xn2;

        switch (type) {
        case TOKEN_WORD:
            attr = get_attr(f, token);
            if (attr != NULL) {
                XMLNODE_APPEND_ATTR(xn, attr);
            } else {
                TOKEN_FREE(type, token);
            }
            break;
        case TOKEN_SLASH:
            if (read_token(f, SCOPE_TAG, NULL) != TOKEN_GT) {
                warnx("unexpected token ('>' expected)");
                xmlnode_free(xn);
                xn = NULL;
            }
            /* Stop in both cases: once 'xn' is gone, going on would
             * attach attributes or children to a NULL node. */
            done = true;
            break;
        case TOKEN_GT:
            while (!done && (xn2 = get_node(f, &closing)) != NULL) {
                if (!closing) {
                    XMLNODE_APPEND_CHILD(xn, xn2);
                    continue;
                }

                if (strcmp(XMLNODE_TAG(xn), XMLNODE_TAG(xn2)) != 0) {
                    warnx("unbalanced tags (</%s> expected but "
                          "</%s> found)", XMLNODE_TAG(xn),
                          XMLNODE_TAG(xn2));
                    xmlnode_free(xn);
                    xn = NULL;
                }
                /* A closing tag ends this element whether it matched
                 * or not; 'xn' must not be used again after a mismatch. */
                done = true;
                xmlnode_free(xn2);
            }
            break;
        default:
            TOKEN_FREE(type, token);
            warnx("unexpected token");
            break;
        }
    }

    return xn;
}

/* --------------------------------------------------------------------- */

/*
 * Parse a special node, one that starts with the '<!' sequence, which
 * has already been read.  We only recognize comments so far, so the
 * next two tokens must be dashes.  Returns the comment node, or NULL
 * on error.  A missing '>' after the comment only produces a warning.
 */
static struct xmlnode *
get_special_node(FILE *f)
{
    char *token;
    int type;
    struct xmlnode *n;

    if (read_token(f, SCOPE_TAG, NULL) != TOKEN_DASH) {
        warnx("unexpected token (\"!--\" sequence expected)");
        return NULL;
    }

    if (read_token(f, SCOPE_TAG, NULL) != TOKEN_DASH) {
        warnx("unexpected token ('-' expected, comment start)");
        return NULL;
    }

    token = NULL;
    type = read_token(f, SCOPE_COMMENT, &token);
    if (type != TOKEN_COMMENT) {
        TOKEN_FREE(type, token);
        return NULL;
    }

    n = xmlnode_new(XMLNODE_TYPE_COMMENT, strdup("#COMMENT#"));
    if (n == NULL) {
        /* No node was created to receive the comment text. */
        free(token);
        return NULL;
    }
    xmlnode_set_text(n, token);

    if (read_token(f, SCOPE_TAG, NULL) != TOKEN_GT)
        warnx("unexpected token ('>' expected)");

    return n;
}

/* --------------------------------------------------------------------- */

/*
 * Get an attribute from the file.  We assume that its name has been
 * already read and is passed in the 'name' parameter, so the file should
 * be pointing to an equal character followed by a quoted string.
 * Returns the new attribute, or NULL on error; in the latter case
 * 'name' is left untouched and remains the caller's to release.
 */
static struct xmlattr *
get_attr(FILE *f, char *name)
{
    char *token;
    int type;
    struct xmlattr *attr;

    if (read_token(f, SCOPE_TAG, NULL) != TOKEN_EQUAL) {
        warnx("unexpected token ('=' expected)");
        return NULL;
    }

    token = NULL;
    type = read_token(f, SCOPE_TAG, &token);
    if (type != TOKEN_STRING) {
        TOKEN_FREE(type, token);
        warnx("unexpected token (quoted string expected)");
        return NULL;
    }

    attr = xmlattr_new(name, token);
    if (attr == NULL) {
        /* The value was read here and nobody else knows about it.
         * NOTE(review): assumes xmlattr_new() does not release its
         * arguments on failure, as the callers freeing 'name' imply. */
        free(token);
    }

    return attr;
}

/* --------------------------------------------------------------------- */

/*
 * Read the next token from the document.  scope's value is one of
 * SCOPE_*, and changes the semantics of the subsequent parsing (i.e.,
 * a string can contain spaces, but a word cannot).  If dest is NULL,
 * the token we expect from the file must not have memory associated
 * to it, so it can be neither a word, nor a string, nor a comment.
 * Otherwise '*dest' receives the token's text, or NULL when the token
 * returned has no text.  Returns one of TOKEN_*.
 */
static int
read_token(FILE *f, int scope, char **dest)
{
    int type;
    struct grstr *buf;

    assert(f != NULL);

    buf = NULL;
    if (dest != NULL) {
        /* Give the caller a defined value on every return path. */
        *dest = NULL;

        buf = grstr_new();
        if (buf == NULL)
            return TOKEN_ERROR;
    }

    switch (scope) {
    case SCOPE_COMMENT:
        type = read_token_comment(f, buf);
        break;
    case SCOPE_GLOBAL:
        type = read_token_global(f, buf);
        break;
    case SCOPE_STRING:
        type = read_token_string(f, buf);
        break;
    case SCOPE_TAG:
        type = read_token_tag(f, buf);
        break;
    default:
        type = TOKEN_ERROR;
        break;
    }

    if (TOKEN_HAS_MEMORY(type)) {
        assert(dest != NULL);
        *dest = grstr_to_text(buf);
        /* Never report a token with text when there is no text. */
        if (*dest == NULL)
            type = TOKEN_ERROR;
    } else if (buf != NULL) {
        grstr_free(buf);
    }

    return type;
}

/* --------------------------------------------------------------------- */

/*
 * Read a comment from the document.  We assume we have already read the
 * '<!--' part, so we are ready to get all text until we find the '--'
 * character sequence.
 */
static int
read_token_comment(FILE *f, struct grstr *gs)
{
    int ch, type;

    assert(f != NULL && gs != NULL);

    type = TOKEN_COMMENT;

    ch = EOF;
    while (type != TOKEN_ERROR) {
        while (type != TOKEN_ERROR && (ch = fgetc(f)) != EOF && ch != '-')
            if (!grstr_append_char(gs, ch))
                type = TOKEN_ERROR;

        if (ferror(f))
            break;

        ch = fgetc(f);
        if (ch == '-')
            break;
        else {
            if (!grstr_append_char(gs, '-'))
                type = TOKEN_ERROR;
            ungetc(ch, f);
        }
    }

    return ferror(f) ? TOKEN_ERROR : type;
}

/* --------------------------------------------------------------------- */

/*
 * Read a token from the document, assuming we are in the global scope
 * (i.e., not inside a string, tag, etc.).  Therefore, we can only find
 * the start of a tag '<', or the start of a text node.  Leading blanks
 * are skipped.  Text is stored in 'gs' with each run of blanks reduced
 * to a single space, and ends right before the next '<' or at end of
 * file.  If text is found but 'gs' is NULL, its first character is
 * pushed back and TOKEN_ERROR is returned.
 */
static int
read_token_global(FILE *f, struct grstr *gs)
{
    int ch;

    assert(f != NULL);

    while ((ch = fgetc(f)) != EOF && isspace(ch))
        ;
    if (ch == EOF)
        return TOKEN_ERROR;

    if (ch == '<')
        return TOKEN_LT;

    if (gs == NULL) {
        /* Text was not expected here; do not swallow its first
         * character. */
        ungetc(ch, f);
        return TOKEN_ERROR;
    }

    do {
        if (isspace(ch)) {
            if (!grstr_append_char(gs, ' '))
                return TOKEN_ERROR;
            while ((ch = fgetc(f)) != EOF && isspace(ch))
                ;
            if (ch == EOF || ch == '<')
                break;
        }
        if (!grstr_append_char(gs, ch))
            return TOKEN_ERROR;
    } while ((ch = fgetc(f)) != EOF && ch != '<');

    if (ferror(f))
        return TOKEN_ERROR;

    /* The '<' belongs to the next token. */
    if (ch == '<')
        ungetc(ch, f);

    return TOKEN_STRING;
}

/* --------------------------------------------------------------------- */

/*
 * Read a string from the document.  We assume we have already read its
 * opening character '"', and we have to read until we find another one.
 */
static int
read_token_string(FILE *f, struct grstr *gs)
{
    int ch, type;

    assert(f != NULL && gs != NULL);

    type = TOKEN_STRING;

    while (type != TOKEN_ERROR && (ch = fgetc(f)) != EOF && ch != '"')
        if (!grstr_append_char(gs, ch))
            type = TOKEN_ERROR;

    return ferror(f) ? TOKEN_ERROR : type;
}

/* --------------------------------------------------------------------- */

/*
 * Read the next token found inside a tag, after skipping any blanks.
 * Punctuation characters map directly to their TOKEN_* kind.  A double
 * quote starts a string and any other character starts a word; both
 * need 'gs' to store their text, so TOKEN_ERROR is returned for them
 * when 'gs' is NULL (i.e., when the caller did not expect them).
 */
static int
read_token_tag(FILE *f, struct grstr *gs)
{
    int c;

    assert(f != NULL);

    do {
        c = fgetc(f);
    } while (c != EOF && isspace(c));

    if (ferror(f))
        return TOKEN_ERROR;

    switch (c) {
    case EOF:
        return TOKEN_ERROR;
    case '-':
        return TOKEN_DASH;
    case '=':
        return TOKEN_EQUAL;
    case '!':
        return TOKEN_EXCL;
    case '>':
        return TOKEN_GT;
    case '?':
        return TOKEN_QUEST;
    case '/':
        return TOKEN_SLASH;
    case '"':
        return (gs != NULL) ? read_token_string(f, gs) : TOKEN_ERROR;
    default:
        break;
    }

    /* Anything else starts a word, which needs somewhere to be stored. */
    if (gs == NULL)
        return TOKEN_ERROR;

    for (;;) {
        if (!grstr_append_char(gs, c)) {
            /* The character that could not be stored goes back to
             * the stream. */
            ungetc(c, f);
            return TOKEN_ERROR;
        }

        c = fgetc(f);
        if (c == EOF)
            return TOKEN_WORD;
        if (!ISWORD(c)) {
            /* The first non-word character belongs to the next token. */
            ungetc(c, f);
            return TOKEN_WORD;
        }
    }
}

/*
 * Local Variables: ***
 * mode: c ***
 * c-file-style: "stroustrup" ***
 * End: ***
 * vim: syntax=c:expandtab:shiftwidth=4:softtabstop=4
 */