/* * XML Catalog Manager (xmlcatmgr) * $Id: xmldoc.c,v 1.1 2004/08/31 19:07:23 jmmv Exp $ * * Copyright (c) 2003, 2004 Julio M. Merino Vidal. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in * the documentation and/or other materials provided with the * distribution. * 3. Neither the name of the author nor the names of contributors may * be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ /* * This file implements the XML document parser. It constructs a * parse tree, using xmlnode and xmlattr types as helpers. * * The parser is quite limited, but for xmlcatmgr's purpose, it's * enough. Specially it lacks: * - Unicode support. * - DOCTYPE support (as it's now, it just reads its contents without * parsing them). * - Entity support. 
*/ #include "system.h" #ifndef lint __RCSID("$Id: xmldoc.c,v 1.1 2004/08/31 19:07:23 jmmv Exp $"); #endif #include "grstr.h" #include "mem.h" #include "linklist.h" #include "xmldoc.h" #include "xmlnode.h" #define SCOPE_GLOBAL 0 #define SCOPE_TAG 1 #define SCOPE_COMMENT 2 #define SCOPE_STRING 3 #define TOKEN_ERROR 0 #define TOKEN_LT 1 #define TOKEN_GT 2 #define TOKEN_EQUAL 3 #define TOKEN_SLASH 4 #define TOKEN_QUEST 5 #define TOKEN_EXCL 6 #define TOKEN_DASH 7 #define TOKEN_WORD 8 #define TOKEN_STRING 9 #define TOKEN_COMMENT 10 #define TOKEN_HAS_MEMORY(tk) \ (tk == TOKEN_WORD || tk == TOKEN_STRING || tk == TOKEN_COMMENT) #define TOKEN_FREE(tk, ptr) \ if (TOKEN_HAS_MEMORY(tk)) \ free(ptr); #define ISWORD(ch) (isalnum(ch) || ch == '_') static int read_token(FILE *, int, char **); static int read_token_comment(FILE *, struct grstr *); static int read_token_global(FILE *, struct grstr *); static int read_token_string(FILE *, struct grstr *); static int read_token_tag(FILE *, struct grstr *); static bool read_prolog(FILE *, struct xmldoc *); static bool read_xmldecl(FILE *, struct xmldoc *); static struct doctype *read_doctype(FILE *, struct xmldoc *); static struct xmlnode *get_node(FILE *, bool *); static void get_node2(FILE *f, struct xmlnode **, bool *); static struct xmlnode *get_node3(FILE *f, char *name); static struct xmlnode *get_special_node(FILE *); static struct xmlattr *get_attr(FILE *, char *); /* --------------------------------------------------------------------- */ /* * Creates a new empty xmldoc object, returning a pointer to it. 
*/ struct xmldoc * xmldoc_new(const char *doctype) { struct doctype *dt; struct xmlattr *xa; struct xmldoc *xd; xd = (struct xmldoc *)malloc(sizeof(struct xmldoc)); if (xd == NULL) return NULL; dt = (struct doctype *)malloc(sizeof(struct doctype)); dt->dt_text = strdup(doctype); xd->xd_doctype = dt; xd->xd_root = NULL; LINKLIST_INIT(&xd->xd_attrs); LINKLIST_INIT(&xd->xd_nodes); xa = xmlattr_new(strdup("version"), strdup("1.0")); LINKLIST_APPEND(&xd->xd_attrs, xa); return xd; } /* --------------------------------------------------------------------- */ /* * Constructs an XML parse tree from what is read from the stream. */ struct xmldoc * xmldoc_parse(FILE *f) { struct xmldoc *xd; struct xmlnode *xn; xd = (struct xmldoc *)malloc(sizeof(struct xmldoc)); if (xd == NULL) return NULL; xd->xd_doctype = NULL; xd->xd_root = NULL; LINKLIST_INIT(&xd->xd_attrs); LINKLIST_INIT(&xd->xd_nodes); rewind(f); if (!read_prolog(f, xd)) { warnx("catalog does not look like an XML file; missing prolog"); xmldoc_free(xd); return NULL; } while ((xn = get_node(f, NULL)) != NULL) { /* Got a node from the document. If it is an element, we cannot * attach more than one, as it becomes the root element. * Otherwise, we attach it (it's a comment). */ if (XMLNODE_TYPE(xn) == XMLNODE_TYPE_ELEMENT) { if (xd->xd_root != NULL) { warnx("root node already seen; <%s> discarded", XMLNODE_TAG(xn)); xmlnode_free(xn); } else { xmlnode_become_root(xn); LINKLIST_APPEND(&xd->xd_nodes, xn); xd->xd_root = xn; } } else { LINKLIST_APPEND(&xd->xd_nodes, xn); } } if (xd->xd_root == NULL) { warnx("no root node"); xmldoc_free(xd); xd = NULL; } return xd; } /* --------------------------------------------------------------------- */ /* * Destroys the given xmldoc object. It frees all attached attributes * and childs. 
*/ void xmldoc_free(struct xmldoc *xd) { struct xmlattr *xa; struct xmlnode *xn; assert(xd != NULL); if (xd->xd_doctype != NULL) { free(xd->xd_doctype->dt_text); free(xd->xd_doctype); } xa = LINKLIST_FIRST(&(xd->xd_attrs)); while (xa != NULL) { struct xmlattr *tmp; tmp = LINKLIST_NEXT(xa); xmlattr_free(xa); xa = tmp; } xn = LINKLIST_FIRST(&(xd->xd_nodes)); while (xn != NULL) { struct xmlnode *tmp; tmp = LINKLIST_NEXT(xn); xmlnode_free(xn); xn = tmp; } free(xd); } /* --------------------------------------------------------------------- */ /* * Dumps the given XML document to a stream, at its actual position. */ bool xmldoc_write(struct xmldoc *xd, FILE *f) { bool res; struct xmlattr *aiter; struct xmlnode *niter; res = true; res &= fprintf(f, "xd_attrs)) { res &= fprintf(f, " ") != -1; res &= xmlattr_write(aiter, f); } res &= fprintf(f, "?>\n") != -1; if (xd->xd_doctype != NULL) { res &= fprintf(f, "\n\n", xd->xd_doctype->dt_text) != -1; } LINKLIST_FOREACH(niter, &(xd->xd_nodes)) { res &= xmlnode_write(niter, f); } return res; } /* --------------------------------------------------------------------- */ /* * Appends the given node to the document. If it's an element, the * document must not contain a root element yet. Otherwise, it's simply * attached (i.e., it's a comment). */ void xmldoc_append_node(struct xmldoc *xd, struct xmlnode *xn) { if (XMLNODE_TYPE(xn) == XMLNODE_TYPE_ELEMENT || XMLNODE_TYPE(xn) == XMLNODE_TYPE_ROOT) { assert(xd->xd_root == NULL); xd->xd_root = xn; } LINKLIST_APPEND(&xd->xd_nodes, xn); } /* --------------------------------------------------------------------- */ /* * Reads the XML prolog, i.e. the XML declaration and the DOCTYPE. 
*/ static bool read_prolog(FILE *f, struct xmldoc *xd) { bool res; assert(f != NULL && xd != NULL); res = read_xmldecl(f, xd); if (!res) { warnx("missing XML declaration; invalid document"); } else { struct doctype *dt; dt = read_doctype(f, xd); if (dt == NULL) warnx("missing DOCTYPE declaration; omitting error"); else xd->xd_doctype = dt; } return res; } /* --------------------------------------------------------------------- */ /* * Reads the XML declaration. Attributes from it are directly attached * to the node. */ static bool read_xmldecl(FILE *f, struct xmldoc *xd) { char *token; int type; type = read_token(f, SCOPE_GLOBAL, NULL); if (type != TOKEN_LT) return false; type = read_token(f, SCOPE_TAG, NULL); if (type != TOKEN_QUEST) return false; type = read_token(f, SCOPE_TAG, &token); if (type != TOKEN_WORD || strcmp(token, "xml") != 0) { TOKEN_FREE(type, token); return false; } TOKEN_FREE(type, token); while ((type = read_token(f, SCOPE_TAG, &token)) != TOKEN_ERROR && type == TOKEN_WORD) { struct xmlattr *attr; attr = get_attr(f, token); if (attr != NULL) { LINKLIST_APPEND(&xd->xd_attrs, attr); } else { free(token); } } TOKEN_FREE(type, token); if (type != TOKEN_QUEST) { TOKEN_FREE(type, token); return false; } type = read_token(f, SCOPE_TAG, NULL); if (type != TOKEN_GT) return false; return true; } /* --------------------------------------------------------------------- */ /* * Reads the DOCTYPE declaration from the document. We don't try to * parse it. 
*/ static struct doctype * read_doctype(FILE *f, struct xmldoc *xd) { char *token; int type; long int pos; struct doctype *dt; assert(f != NULL && xd != NULL); dt = NULL; pos = ftell(f); type = read_token(f, SCOPE_GLOBAL, NULL); if (type == TOKEN_LT) { type = read_token(f, SCOPE_TAG, NULL); if (type != TOKEN_EXCL) { fseek(f, pos, SEEK_SET); } else { type = read_token(f, SCOPE_TAG, &token); TOKEN_FREE(type, token); if (type != TOKEN_WORD) { fseek(f, pos, SEEK_SET); } else { int ch; struct grstr *buf; while ((ch = fgetc(f)) != EOF && isspace(ch)); if (ch != EOF) { buf = grstr_new(); do { grstr_append_char(buf, ch); } while ((ch = fgetc(f)) != EOF && ch != '>'); if (ch != EOF) { dt = (struct doctype *)malloc(sizeof(struct doctype)); dt->dt_text = grstr_to_text(buf); } } } } } return dt; } /* --------------------------------------------------------------------- */ /* * Get the next node from the file, which may be either a tag (either * a real tag or comment) or a text node. If what we get is a closing * tag, i.e., we read something like '', the function sets * 'closing' to true, to indicate the caller we have reached a base case * of this recursive function. */ static struct xmlnode * get_node(FILE *f, bool *closing) { char *token; int type; struct xmlnode *xn; assert(f != NULL); type = read_token(f, SCOPE_GLOBAL, &token); switch (type) { case TOKEN_ERROR: xn = NULL; break; case TOKEN_LT: get_node2(f, &xn, closing); break; default: assert(TOKEN_HAS_MEMORY(type)); xn = xmlnode_new(XMLNODE_TYPE_TEXT, strdup("#TEXT#")); xmlnode_set_text(xn, token); if (closing != NULL) *closing = false; } return xn; } /* --------------------------------------------------------------------- */ /* * Get the next node from the file (part two). We assume that we've * already read the opening character '<'. 
If what we get is a closing * tag, i.e., we read something like '/tagname>', the function sets * 'closing' to true, to indicate the caller we have reached a base case * of this recursive function. */ static void get_node2(FILE *f, struct xmlnode **xn, bool *closing) { char *token; int type; assert(f != NULL && xn != NULL); if (closing != NULL) *closing = false; type = read_token(f, SCOPE_TAG, &token); switch (type) { case TOKEN_ERROR: warn("error reading element name"); *xn = NULL; break; case TOKEN_EXCL: *xn = get_special_node(f); break; case TOKEN_SLASH: *xn = NULL; type = read_token(f, SCOPE_TAG, &token); if (type != TOKEN_WORD) { TOKEN_FREE(type, token); warnx("unexpected token (word expected)"); } else { if (closing == NULL) { warnx("unbalanced tags ( unexpected)", token); free(token); } else { char *tagname; tagname = token; type = read_token(f, SCOPE_TAG, NULL); if (type != TOKEN_GT) warnx("unexpected token ('>' expected)"); else { *xn = xmlnode_new(XMLNODE_TYPE_ELEMENT, tagname); *closing = true; } } } break; case TOKEN_WORD: *xn = get_node3(f, token); break; default: TOKEN_FREE(type, token); warnx("unexpected token (element name expected)"); *xn = NULL; } } /* --------------------------------------------------------------------- */ /* * Get the next node from the file (part three). We assume we've already * read the opening character for the tag '<' and its name (which is * provided in the 'name' parameter), so we are ready to read its arguments * (if any) or the closing character(s). 
*/
/*
 * Returns the element, with its attributes and children attached, or
 * NULL if the node could not be created or the tag turned out to be
 * malformed or unbalanced.
 */
static struct xmlnode *
get_node3(FILE *f, char *name)
{
    bool done;
    char *token;
    int type;
    struct xmlnode *xn;

    xn = xmlnode_new(XMLNODE_TYPE_ELEMENT, name);
    if (xn == NULL)
        return NULL;

    done = false;
    while (!done &&
           (type = read_token(f, SCOPE_TAG, &token)) != TOKEN_ERROR) {
        bool closing;
        struct xmlattr *attr;
        struct xmlnode *xn2;

        switch (type) {
        case TOKEN_WORD:
            /* A word inside the tag is the name of an attribute. */
            attr = get_attr(f, token);
            if (attr != NULL) {
                XMLNODE_APPEND_ATTR(xn, attr);
            } else {
                free(token);
            }
            break;

        case TOKEN_SLASH:
            /* Empty element: '/' must be followed by '>'. */
            type = read_token(f, SCOPE_TAG, NULL);
            if (type != TOKEN_GT) {
                warnx("unexpected token ('>' expected)");
                /* Stop right here: going on would use the node that
                 * has just been destroyed. */
                xmlnode_free(xn);
                return NULL;
            }
            done = true;
            break;

        case TOKEN_GT:
            /* End of the opening tag; collect children until the
             * matching closing tag shows up. */
            while (!done && (xn2 = get_node(f, &closing)) != NULL) {
                if (!closing) {
                    XMLNODE_APPEND_CHILD(xn, xn2);
                    continue;
                }

                if (strcmp(XMLNODE_TAG(xn), XMLNODE_TAG(xn2)) != 0) {
                    warnx("unbalanced tags (</%s> expected but "
                          "</%s> found)", XMLNODE_TAG(xn),
                          XMLNODE_TAG(xn2));
                    xmlnode_free(xn2);
                    xmlnode_free(xn);
                    return NULL;
                }

                /* The closing node only carried the tag name. */
                xmlnode_free(xn2);
                done = true;
            }
            break;

        default:
            TOKEN_FREE(type, token);
            warnx("unexpected token");
            break;
        }
    }

    return xn;
}

/* --------------------------------------------------------------------- */

/*
 * Parse a special node, one that starts with the '