/*********************************************************************** * Copyright (C) 1995 Joe English * Freely redistributable *********************************************************************** * * rdsgmls.c,v 1.24 1998/11/20 03:48:50 joe Exp * * Author: Joe English * Created: Jan 1995 * Description: read output of sgmls. * Bugs: * This is *really* short on error checking. * It never tests the return value of malloc(), * and input errors are handled by dumping core. * Needless to say, it should be a *bit* more robust. * * 1998/11/20 03:48:50 * 1.24 */ #include #include #include #include "project.h" #include "strmap.h" #include "strmgt.h" #include "pile.h" #include "esis.h" #include "esisp.h" #include "lineout.h" /* from sgmls source distribution */ typedef enum /* character codes for SGMLS data escape characters */ { CHCODE_DATA, /* single characters, \\, \nnn octal escapes */ CHCODE_RE, /* "\n", record-end */ CHCODE_RS, /* \012, record-start */ CHCODE_SDATA, /* \|, SDATA entity bracket */ CHCODE_EOLN, /* true newline */ CHCODE_EOF, /* EOF */ CHCODE_ERR /* unparseable */ } CHCODE; /* Read a single character or \-escape sequence from sgmls. * Returns character and code. * %%% TODO: Handle \#d; escape sequences (decimal escape; new in nsgmls). */ static int rdchar(ESISInputStream stream, CHCODE *code_rtn) { int ch = ESISgetc(stream); if (ch == '\n') *code_rtn = CHCODE_EOLN; else if (ch == -1) *code_rtn = CHCODE_EOF; else if (ch == '\\') { switch (ch = ESISgetc(stream)) { case '\\' : *code_rtn = CHCODE_DATA; break; case 'n' : *code_rtn = CHCODE_RE; break; case '|' : *code_rtn = CHCODE_SDATA; break; case '0' : case '1' : case '2' : case '3' : case '4' : case '5' : case '6' : case '7' : { /* parse octal sequence */ char buf[4]; buf[0] = ch; buf[1] = ESISgetc(stream); if (isdigit(buf[1])) { buf[2] = ESISgetc(stream); buf[3] = '\0'; if (isdigit(buf[2])) { buf[3] = '\0'; ch = strtol(buf, NULL, 8); if (ch == 012) *code_rtn = CHCODE_RS; else *code_rtn = CHCODE_DATA; } else *code_rtn = CHCODE_ERR; } else *code_rtn = CHCODE_ERR; break; } default : ASSERT(0, "Bad character in escape sequence"); *code_rtn = CHCODE_ERR; } /* switch */ } /* if ch == '\\' */ else { *code_rtn = CHCODE_DATA; } return ch; } /* Read (and ignore) the rest of an input line. * (This is for unused/unimplemented/unrecognized SGMLS events) */ static void eatline(ESISInputStream stream) { int ch; do { ch = ESISgetc(stream); } while (ch != '\n' && ch != -1); } /* Read next alphanumeric token and intern it; * set *eoln = 1 if separator is '\n', 0 if it was a space */ static ESISToken rdtoken(ESISInputStream stream, int *eoln) { static char *tokbuf = 0; static int tokbufsize = 0; int n, ch; if (!tokbuf) tokbuf = malloc(tokbufsize=80); /* %%%check */ /* skip whitespace */ do { ch = ESISgetc(stream); } while (isspace(ch)); /* read token */ n=0; while (ch != -1 && !isspace(ch)) { tokbuf[n++] = ch; if (n >= tokbufsize) tokbuf = realloc(tokbuf,tokbufsize *= 2); /* %%%check */ ch = ESISgetc(stream); } *eoln = (ch == '\n'); tokbuf[n] = '\0'; return intern(tokbuf); } /* Read remainder of input line, * unescaping data and storing it on 'p'. * Returns: '\0'-terminated string. */ static char *rdunescape(ESISInputStream stream, pile p) { pstart(p); for (;;) { int ch; CHCODE chcode; ch = rdchar(stream,&chcode); switch (chcode) { case CHCODE_DATA: case CHCODE_RE: paddch(p, ch); continue; case CHCODE_EOLN : case CHCODE_EOF : break; case CHCODE_SDATA: /* %%% this can happen if SDATA entities are referenced * %%% in CDATA declared attribute values. Ignore it. */ continue; case CHCODE_RS : /* This can happen if there are newlines in PIs. * Why, I don't know, but ignore it. */ continue; default : ASSERT(0, "Bad character in escape sequence"); } break; } paddch(p,'\0'); return pfinish(p); } /* Read and parse character data record * add data nodes (CDATA, SDATA, RE) as children of specified nodes. * return: first node read (there can be more than one...) * %%% AIEEE!!! LOGIC!!! */ static void rddata(ESISBuilder ep, ESISInputStream stream) { pile p = ep->datapile; char *text; int ch; CHCODE chcode; enum { s_nonode, s_incdata, s_insdata, s_readre, s_end } state = s_nonode, nextstate = s_nonode; do { nextstate = state; ASSERT(state == s_nonode || state == s_insdata || state == s_incdata, "Bad state"); ch = rdchar(stream, &chcode); switch (chcode) { case CHCODE_EOLN: case CHCODE_EOF: nextstate = s_end; break; case CHCODE_DATA: addch: if (state == s_nonode) { pstart(p); state = s_incdata; } paddch(p,ch); continue; case CHCODE_RS: /* ignore */ continue; case CHCODE_RE: if (state == s_insdata) { ch = '\n'; goto addch; } nextstate = s_readre; break; case CHCODE_SDATA: nextstate = (state == s_insdata) ? s_nonode : s_insdata; break; default : ASSERT(0, "Bad chcode"); } /* leave current state: */ leavestate: if (state == s_incdata || state == s_insdata) { paddch(p,'\0'); text = pfinish(p); esis_create_datanode( ep, state == s_incdata ? EN_CDATA : EN_SDATA, text); } else if (state == s_readre) { esis_create_datanode(ep,EN_RE, "\n"); } /* enter new state: */ state = nextstate; if (state == s_insdata) pstart(p); if (state == s_readre) { nextstate = s_nonode; goto leavestate; } } while (state != s_end); return; } /* * Aname val * Dename name val * Read and parse the "name val" part of an attribute record, * add attribute to specified node. * VAL is one of: * IMPLIED * CDATA data * NOTATION nname * ENTITY name... * TOKEN token... * * BUGS: Does not distinguish between CDATA and SDATA in * CDATA attribute values. * Should treat different declared value types differently... */ static ESISNode rdattribute(ESISNode n, ESISInputStream stream, pile p) { ESISToken attname, dv; int eoln; char *attval; attname = rdtoken(stream, &eoln); ASSERT(!eoln, "attribute ended prematurely"); dv = rdtoken(stream, &eoln); if (!strcmp(dv,"IMPLIED")) { attval = 0; ASSERT(eoln,"Extra data after IMPLIED attribute record"); } else { attval = rdunescape(stream, p); eoln = 1; if (!strcmp(dv,"NOTATION")) esis_setprop(n, ENTPROP_NOTATION, attval); /* %%% other cases ? */ } return esis_create_attribute(n, attname, attval); } /* * "All data in syntactic content is a pseudo-element. [...] * References to data entities that are not replaced in ESIS * are treated as peers of characters" * Charles Goldfarb in 28 Apr 95 */ static void ensure_pel(ESISBuilder ep) { if (ep->curnode->type == EN_PEL) return; ASSERT(ep->curnode->type == EN_EL, "data in non-EL node"); esis_open_node(ep, EN_PEL); return; } static void ensure_not_pel(ESISBuilder ep) { if (ep->curnode->type == EN_PEL) (void)esis_close_node(ep); } /* estream_load_sgmls: * Reads SGMLS event stream, builds an ESIS tree. * Returns: new ESISDocument */ ESISDocument estream_load_sgmls(ESISInputStream stream) { int done = 0; ESISNode newelement = 0; char *sysid = 0; char *pubid = 0; int eoln = 0; ESISBuilder ep = esis_builder_start(); pile p = ep->datapile; while (!done) { int code = ESISgetc(stream); switch (code) { case -1 : done = 1; break; case CONFORMING_CODE: /* ASSERT "current node is document root" * return EOF */ eatline(stream); done = 1; break; /* * Auxilliary information codes: */ case SYSID_CODE: case PUBID_CODE: { char *str; pilemark m; m = pmark(p); str = rdunescape(stream,p); if (code == SYSID_CODE) { ASSERT(!sysid, "Input error: sysid read but not consumed"); sysid = malloc(strlen(str)+1); strcpy(sysid,str); } else { ASSERT(!pubid, "Input error: pubid read but not consumed"); pubid = malloc(strlen(str)+1); strcpy(pubid,str); } prelease(p,m); continue; } case FILE_CODE: case LOCATION_CODE: case LINK_ATTRIBUTE_CODE: case INCLUDED_ELEMENT_CODE: case DEFINE_EXTERNAL_TEXT_ENTITY_CODE: /* ignore these for now */ eatline(stream); continue; /* * Declarations: */ case DEFINE_NOTATION_CODE: /* %%% not used */ { (void)rdtoken(stream,&eoln); /* notation name */ ASSERT(eoln, "Misparsed NOTATION code"); if (pubid) free(pubid); /* %%% save these */ if (sysid) free(sysid); /* %%% map name->pub,sysid */ pubid = sysid = 0; continue; } /* Dename name val */ case DATA_ATTRIBUTE_CODE: { ESISNode ent; ESISToken name = rdtoken(stream, &eoln); ASSERT(!eoln, "Incomplete data attribute definition"); ent = esis_find_entity(ep, name); ASSERT(ent, "Data attribute definition for nonexistant entity"); rdattribute(ent, stream, p); continue; } /* Eename typ nname */ /* Iename typ */ /* Sename */ /* type in { CDATA, NDATA, SDATA } */ /* %%% logic is way too convoluted */ case DEFINE_EXTERNAL_ENTITY_CODE: case DEFINE_INTERNAL_ENTITY_CODE: case DEFINE_SUBDOC_ENTITY_CODE: { ESISNode ent; ESISToken nname, ename; ename = rdtoken(stream, &eoln); ent = esis_create_entity(ep, ename); if (code != DEFINE_INTERNAL_ENTITY_CODE) { if (sysid) { esis_setprop(ent, ENTPROP_SYSID, sysid); free(sysid); } if (pubid) { esis_setprop(ent, ENTPROP_PUBID, pubid); free(pubid); } sysid=pubid=0; } if (code == DEFINE_SUBDOC_ENTITY_CODE) { ASSERT(eoln, "Bad S command"); continue; } ASSERT(!eoln,"Incomplete code"); (void)rdtoken(stream, &eoln); /* %%% entity type; not used */ if (code == DEFINE_EXTERNAL_ENTITY_CODE) { nname = rdtoken(stream, &eoln); esis_setprop(ent, ENTPROP_NOTATION, nname); /* %%% set props: NOTSYSID, NOTPUBID */ } else { ent->text = rdunescape(stream,p); eoln = 1; } ASSERT(eoln,"Bad entity defn"); continue; } case APPINFO_CODE: /* %%% ignore this for now */ eatline(stream); continue; /*+++ * Event codes: */ case DATA_CODE: { ensure_pel(ep); rddata(ep, stream); break; } /* &name */ case REFERENCE_ENTITY_CODE: { ESISToken ename = rdtoken(stream,&eoln); ESISNode ent = esis_find_entity(ep, ename); ensure_pel(ep); esis_open_node(ep,EN_REFERENCE); ep->curnode->reference = ent; ep->curnode->name = ename; esis_close_node(ep); break; } /* ?data */ case PI_CODE: { /* PIs are not addressible in HyTime, so they * can appear anywhere in the tree. In particular, * a PEL node is *not* created. */ char *text = rdunescape(stream,p); esis_create_datanode(ep,EN_PI, text); break; } /* * Elements: */ case ATTRIBUTE_CODE: /* if (!newelement), newelement = new EL node (no GI, added later); * create AT attribute of newelement. * Continue. */ if (!newelement) { ensure_not_pel(ep); newelement = esis_open_node(ep,EN_EL); } rdattribute(newelement, stream, p); continue; case START_CODE: /* if newelement (created when attributes seen) assign GI * else create new EL node. * Return. */ if (!newelement) { ensure_not_pel(ep); newelement = esis_open_node(ep,EN_EL); } newelement->name = rdtoken(stream, &eoln); ASSERT(eoln, "Bad ( code)"); newelement = NULL; break; case END_CODE: { #if DEBUG ESISToken gi = #endif rdtoken(stream, &eoln); ensure_not_pel(ep); ASSERT(ep->curnode->type == EN_EL, "totally out of sync"); ASSERT(ep->curnode->name == gi, "out of sync"); esis_close_node(ep); break; } case START_SUBDOC_CODE: case END_SUBDOC_CODE: { (void)rdtoken(stream, &eoln);/* %%% document entity name; unused */ ASSERT(eoln, "Misgrokked SUBDOC event"); continue; /* %%% should create new SD node, DTD mgt */ /* %%% create PEL node? I think so... */ } default: ASSERT(0,"Forgot to implement an event type..."); eatline(stream); continue; } ASSERT(pubid == 0, "PUBID read but not used"); ASSERT(sysid == 0, "SYSID read but not used"); ASSERT(newelement == 0, "Attributes read but not used"); } /* while (!done...) */ if (done == 1) { /* Success */ return esis_builder_finish(ep); } else { esis_free_document(esis_builder_finish(ep)); return NULL; } } /*EOF*/