block? */ gboolean style; /* Are we in a <style> block? */ gboolean script; /* Are we in a <script> block? */ gboolean textarea; /* Are we in a <textarea> block? */ gint pre; /* Are we in a <pre> block? */ gboolean select; /* Are we in a <select> block? */ gboolean charEntity; /* Are we in an &... sequence? */ gboolean extension; /* Are we in an " */ if (p->searchCount < 2) p->searchCount++; } else if (p->searchCount == 2 && (**src == '>')) { p->comment = FALSE; /* We've got a "-->" sequence */ } else if (tolower (**src) == gtkhtmlStart [p->searchGtkHTMLCount]) { if (p->searchGtkHTMLCount == 8) { p->extension = TRUE; p->comment = FALSE; p->searchCount = 0; p->searchExtensionEndCount = 0; p->searchGtkHTMLCount = 0; } else p->searchGtkHTMLCount ++; } else { p->searchGtkHTMLCount = 0; if (p->searchCount < 2) p->searchCount = 0; } (*src)++; } static inline void extension_one_char (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; p->extension = FALSE; html_tokenizer_tokenize_one_char (t, src); p->extension = TRUE; } static void in_extension (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; /* check for "-->" */ if (!p->tquote && **src == '-') { if (p->searchExtensionEndCount < 2) p->searchExtensionEndCount ++; (*src) ++; } else if (!p->tquote && p->searchExtensionEndCount == 2 && **src == '>') { p->extension = FALSE; (*src) ++; } else { if (p->searchExtensionEndCount > 0) { if (p->extension) { const gchar *c = "-->"; while (p->searchExtensionEndCount) { extension_one_char (t, &c); p->searchExtensionEndCount --; } } } extension_one_char (t, src); } } static void in_script_or_style (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; /* Allocate memory to store the script or style */ if (p->scriptCodeSize + 11 > p->scriptCodeMaxSize) p->scriptCode = g_realloc (p->scriptCode, p->scriptCodeMaxSize += 1024); if ((**src == '>' ) && ( p->searchFor [p->searchCount] == '>')) { 
(*src)++; p->scriptCode [p->scriptCodeSize] = 0; p->scriptCode [p->scriptCodeSize + 1] = 0; if (p->script) { p->script = FALSE; } else { p->style = FALSE; } g_free (p->scriptCode); p->scriptCode = NULL; } /* Check if a </script> tag is on its way */ else if (p->searchCount > 0) { if (tolower (**src) == p->searchFor [p->searchCount]) { p->searchBuffer [p->searchCount] = **src; p->searchCount++; (*src)++; } else { gchar *c; p->searchBuffer [p->searchCount] = 0; c = p->searchBuffer; while (*c) p->scriptCode [p->scriptCodeSize++] = *c++; p->scriptCode [p->scriptCodeSize] = **src; (*src)++; p->searchCount = 0; } } else if (**src == '<') { p->searchCount = 1; p->searchBuffer [0] = '<'; (*src)++; } else { p->scriptCode [p->scriptCodeSize] = **src; (*src)++; } } static void add_unichar (HTMLTokenizer *t, gunichar wc) { struct _HTMLTokenizerPrivate *p = t->priv; p->utf8_length = 0; if (wc != '\0') { p->dest += g_unichar_to_utf8 (wc, p->dest); *(p->dest) = 0; } /* g_assert (g_utf8_validate (p->buffer, p->dest - p->buffer, NULL)); */ } static void add_byte (HTMLTokenizer *t, const gchar **src) { gunichar wc; struct _HTMLTokenizerPrivate *p = t->priv; if (p->utf8) { p->utf8_buffer[p->utf8_length] = **src; p->utf8_length++; wc = g_utf8_get_char_validated ((const gchar *)p->utf8_buffer, p->utf8_length); if (wc == -1 || p->utf8_length >= (sizeof(p->utf8_buffer)/sizeof(p->utf8_buffer[0]))) { add_unichar (t, '?'); (*src)++; return; } else if (wc == -2) { /* incomplete character check again */ (*src)++; return; } } else { wc = (guchar)**src; } add_unichar (t, wc); (*src)++; } static void flush_entity (HTMLTokenizer *t) { struct _HTMLTokenizerPrivate *p = t->priv; /* ignore the TAG_ESCAPE when flushing */ const char *str = p->searchBuffer + 1; while (p->searchCount--) { add_byte (t, &str); } } static void in_entity (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; gunichar entityValue = 0; /* See 
http://www.mozilla.org/newlayout/testcases/layout/entities.html for a complete entity list, ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT (or 'man iso_8859_1') for the character encodings. */ p->searchBuffer [p->searchCount + 1] = **src; p->searchBuffer [p->searchCount + 2] = '\0'; /* Check for � sequence */ if (p->searchBuffer[2] == '#') { if ((p->searchCount > 1) && (!isdigit (**src)) && (p->searchBuffer[3] != 'x')) { /* { */ p->searchBuffer [p->searchCount + 1] = '\0'; entityValue = strtoul (&(p->searchBuffer [3]), NULL, 10); p->charEntity = FALSE; } if ((p->searchCount > 1) && (!isalnum (**src)) && (p->searchBuffer[3] == 'x')) { /* &x12AB */ p->searchBuffer [p->searchCount + 1] = '\0'; entityValue = strtoul (&(p->searchBuffer [4]), NULL, 16); p->charEntity = FALSE; } } else { /* Check for &abc12 sequence */ if (!isalnum (**src)) { p->charEntity = FALSE; if ((p->searchBuffer [p->searchCount + 1] == ';') || (!p->tag)) { char *ename = p->searchBuffer + 2; p->searchBuffer [p->searchCount + 1] = '\0'; /* FIXME sucks */ entityValue = html_entity_parse (ename, 0); } } } if (p->searchCount > 9) { /* Ignore this sequence since it's too long */ p->charEntity = FALSE; flush_entity (t); } else if (p->charEntity) { /* Keep searching for end of character entity */ p->searchCount++; (*src)++; } else { /* * my reading of http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2 makes * seem correct to always collapse entity references, even in element names * and attributes. 
*/ if (entityValue) { /* Insert plain char */ if (entityValue != TAG_ESCAPE) add_unichar (t, entityValue); if (**src == ';') (*src)++; } else { /* Ignore the sequence, just add it as plaintext */ flush_entity (t); } } } static void in_tag (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; p->startTag = FALSE; if (**src == '/') { if (p->pending == LFPending) { p->pending = NonePending; } } else if (((**src >= 'a') && (**src <= 'z')) || ((**src >= 'A') && (**src <= 'Z'))) { /* Start of a start tag */ } else if (**src == '!') { /*  */ } else if (**src == '?') { /* <? meta ?> */ } else { /* Invalid tag, just add it */ if (p->pending) html_tokenizer_add_pending (t); add_unichar (t, '<'); add_byte (t, src); return; } if (p->pending) html_tokenizer_add_pending (t); if (p->dest > p->buffer) { html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer); p->dest = p->buffer; } add_unichar (t, TAG_ESCAPE); add_unichar (t, '<'); p->tag = TRUE; p->searchCount = 1; /* Look for <!-- to start comment */ } static void start_entity (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; (*src)++; p->discard = NoneDiscard; if (p->pending) html_tokenizer_add_pending (t); p->charEntity = TRUE; p->searchBuffer[0] = TAG_ESCAPE; p->searchBuffer[1] = '&'; p->searchCount = 1; } static void start_tag (HTMLTokenizer *t, const gchar **src) { (*src)++; t->priv->startTag = TRUE; t->priv->discard = NoneDiscard; } static void end_tag (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; gchar *ptr; p->searchCount = 0; /* Stop looking for <!-- sequence */ add_unichar (t, '>'); /* Make the tag lower case */ ptr = p->buffer + 2; if (p->pre || *ptr == '/') { /* End tag */ p->discard = NoneDiscard; } else { /* Start tag */ /* Ignore CRLFs after a start tag */ p->discard = LFDiscard; } while (*ptr && *ptr !=' ') { *ptr = tolower (*ptr); ptr++; } html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer); 
p->dest = p->buffer; p->tag = FALSE; p->pending = NonePending; (*src)++; if (strncmp (p->buffer + 2, "pre", 3) == 0) { p->pre++; } else if (strncmp (p->buffer + 2, "/pre", 4) == 0) { p->pre--; } else if (strncmp (p->buffer + 2, "textarea", 8) == 0) { p->textarea = TRUE; } else if (strncmp (p->buffer + 2, "/textarea", 9) == 0) { p->textarea = FALSE; } else if (strncmp (p->buffer + 2, "title", 5) == 0) { p->title = TRUE; } else if (strncmp (p->buffer + 2, "/title", 6) == 0) { p->title = FALSE; } else if (strncmp (p->buffer + 2, "script", 6) == 0) { p->script = TRUE; p->searchCount = 0; p->searchFor = scriptEnd; p->scriptCode = g_malloc (1024); p->scriptCodeSize = 0; p->scriptCodeMaxSize = 1024; } else if (strncmp (p->buffer + 2, "style", 5) == 0) { p->style = TRUE; p->searchCount = 0; p->searchFor = styleEnd; p->scriptCode = g_malloc (1024); p->scriptCodeSize = 0; p->scriptCodeMaxSize = 1024; } else if (strncmp (p->buffer + 2, "select", 6) == 0) { p->select = TRUE; } else if (strncmp (p->buffer + 2, "/select", 7) == 0) { p->select = FALSE; } else if (strncmp (p->buffer + 2, "cell", 4) == 0) { g_warning ("<cell> tag not supported"); } else if (strncmp (p->buffer + 2, "table", 5) == 0) { html_tokenizer_blocking_push (t, Table); } else { if (p->blocking) { const gchar *bn = html_tokenizer_blocking_get_name (t); if (strncmp (p->buffer + 1, bn, strlen (bn)) == 0) { html_tokenizer_blocking_pop (t); } } } } static void in_crlf (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; if (p->tquote) { if (p->discard == NoneDiscard) p->pending = SpacePending; } else if (p->tag) { p->searchCount = 0; /* Stop looking for <!-- sequence */ if (p->discard == NoneDiscard) p->pending = SpacePending; /* Treat LFs inside tags as spaces */ } else if (p->pre || p->textarea) { if (p->discard == LFDiscard) { /* Ignore this LF */ p->discard = NoneDiscard; /* We have discarded 1 LF */ } else { /* Process this LF */ if (p->pending) html_tokenizer_add_pending (t); 
p->pending = LFPending; } } else { if (p->discard == LFDiscard) { /* Ignore this LF */ p->discard = NoneDiscard; /* We have discarded 1 LF */ } else { /* Process this LF */ if (p->pending == NonePending) p->pending = LFPending; } } /* Check for MS-DOS CRLF sequence */ if (**src == '\r') { p->skipLF = TRUE; } (*src)++; } static void in_space_or_tab (HTMLTokenizer *t, const gchar **src) { if (t->priv->tquote) { if (t->priv->discard == NoneDiscard) t->priv->pending = SpacePending; } else if (t->priv->tag) { t->priv->searchCount = 0; /* Stop looking for <!-- sequence */ if (t->priv->discard == NoneDiscard) t->priv->pending = SpacePending; } else if (t->priv->pre || t->priv->textarea) { if (t->priv->pending) html_tokenizer_add_pending (t); if (**src == ' ') t->priv->pending = SpacePending; else t->priv->pending = TabPending; } else { t->priv->pending = SpacePending; } (*src)++; } static void in_quoted (HTMLTokenizer *t, const gchar **src) { /* We treat ' and " the same in tags " */ t->priv->discard = NoneDiscard; if (t->priv->tag) { t->priv->searchCount = 0; /* Stop looking for <!-- sequence */ if ((t->priv->tquote == SINGLE_QUOTE && **src == '\"') /* match " */ || (t->priv->tquote == DOUBLE_QUOTE && **src == '\'')) { add_unichar (t, **src); (*src)++; } else if (*(t->priv->dest-1) == '=' && !t->priv->tquote) { t->priv->discard = SpaceDiscard; t->priv->pending = NonePending; if (**src == '\"') /* match " */ t->priv->tquote = DOUBLE_QUOTE; else t->priv->tquote = SINGLE_QUOTE; add_unichar (t, **src); (*src)++; } else if (t->priv->tquote) { t->priv->tquote = NO_QUOTE; add_byte (t, src); t->priv->pending = SpacePending; } else { /* Ignore stray "\'" */ (*src)++; } } else { if (t->priv->pending) html_tokenizer_add_pending (t); add_byte (t, src); } } static void in_assignment (HTMLTokenizer *t, const gchar **src) { t->priv->discard = NoneDiscard; if (t->priv->tag) { t->priv->searchCount = 0; /* Stop looking for <!-- sequence */ add_unichar (t, '='); if (!t->priv->tquote) { 
t->priv->pending = NonePending; t->priv->discard = SpaceDiscard; } } else { if (t->priv->pending) html_tokenizer_add_pending (t); add_unichar (t, '='); } (*src)++; } inline static void in_plain (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; p->discard = NoneDiscard; if (p->pending) html_tokenizer_add_pending (t); if (p->tag) { if (p->searchCount > 0) { if (**src == commentStart[p->searchCount]) { p->searchCount++; if (p->searchCount == 4) { /* Found <!-- sequence */ p->comment = TRUE; p->dest = p->buffer; p->tag = FALSE; p->searchCount = 0; return; } } else { p->searchCount = 0; /* Stop lookinf for <!-- sequence */ } } } add_byte (t, src); } static void html_tokenizer_tokenize_one_char (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; prepare_enough_space (t); if (p->skipLF && **src != '\n') p->skipLF = FALSE; if (p->skipLF) (*src) ++; else if (p->comment) in_comment (t, src); else if (p->extension) in_extension (t, src); else if (p->script || p->style) in_script_or_style (t, src); else if (p->charEntity) in_entity (t, src); else if (p->startTag) in_tag (t, src); else if (**src == '&') start_entity (t, src); else if (**src == '<' && !p->tag) start_tag (t, src); else if (**src == '>' && p->tag && !p->tquote) end_tag (t, src); else if ((**src == '\n') || (**src == '\r')) in_crlf (t, src); else if ((**src == ' ') || (**src == '\t')) in_space_or_tab (t, src); else if (**src == '\"' || **src == '\'') /* match " ' */ in_quoted (t, src); else if (**src == '=') in_assignment (t, src); else in_plain (t, src); } static void html_tokenizer_real_write (HTMLTokenizer *t, const gchar *string, size_t size) { const gchar *src = string; while ((src - string) < size) html_tokenizer_tokenize_one_char (t, &src); } static gchar * html_tokenizer_blocking_get_name (HTMLTokenizer *t) { switch (GPOINTER_TO_INT (t->priv->blocking->data)) { case Table: return "</table"; } return ""; } static void 
html_tokenizer_blocking_push (HTMLTokenizer *t, HTMLTokenType tt) { struct _HTMLTokenizerPrivate *p = t->priv; /* block tokenizer - we must block last token in buffers as it was already added */ if (!p->blocking) { p->tokens_num--; p->blocking_tokens_num++; } p->blocking = g_list_prepend (p->blocking, GINT_TO_POINTER (tt)); } static void html_tokenizer_blocking_pop (HTMLTokenizer *t) { struct _HTMLTokenizerPrivate *p = t->priv; p->blocking = g_list_remove (p->blocking, p->blocking->data); /* unblock tokenizer */ if (!p->blocking) { p->tokens_num += p->blocking_tokens_num; p->blocking_tokens_num = 0; } } /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ void html_tokenizer_begin (HTMLTokenizer *t, gchar *content_type) { g_return_if_fail (t && HTML_IS_TOKENIZER (t)); gtk_signal_emit (GTK_OBJECT (t), html_tokenizer_signals[HTML_TOKENIZER_BEGIN_SIGNAL], content_type); } void html_tokenizer_end (HTMLTokenizer *t) { g_return_if_fail (t && HTML_IS_TOKENIZER (t)); gtk_signal_emit (GTK_OBJECT (t), html_tokenizer_signals[HTML_TOKENIZER_END_SIGNAL]); } void html_tokenizer_write (HTMLTokenizer *t, const gchar *str, size_t size) { HTMLTokenizerClass *klass; g_return_if_fail (t && HTML_IS_TOKENIZER (t)); klass = HTML_TOKENIZER_CLASS (GTK_OBJECT (t)->klass); if (klass->write) klass->write (t, str, size); else g_warning ("No write method defined."); } gchar * html_tokenizer_peek_token (HTMLTokenizer *t) { HTMLTokenizerClass *klass; g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), NULL); klass = HTML_TOKENIZER_CLASS (GTK_OBJECT (t)->klass); if (klass->peek_token) return klass->peek_token (t); g_warning ("No peek_token method defined."); return NULL; } gchar * html_tokenizer_next_token (HTMLTokenizer *t) { HTMLTokenizerClass *klass; g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), NULL); klass = HTML_TOKENIZER_CLASS (GTK_OBJECT (t)->klass); if (klass->next_token) return klass->next_token (t); g_warning ("No next_token method defined."); return NULL; } 
/*
 * html_tokenizer_has_more_tokens:
 * @t: the tokenizer to query.
 *
 * Forwards to the has_more method of @t's class.  If the class does not
 * provide that method a warning is logged and FALSE is returned.  FALSE is
 * also returned when @t fails the g_return_val_if_fail() precondition.
 */
gboolean
html_tokenizer_has_more_tokens (HTMLTokenizer *t)
{
	HTMLTokenizerClass *tokenizer_class;

	g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), FALSE);

	tokenizer_class = HTML_TOKENIZER_CLASS (GTK_OBJECT (t)->klass);

	if (tokenizer_class->has_more == NULL) {
		g_warning ("No has_more method defined.");
		return FALSE;
	}

	return tokenizer_class->has_more (t);
}

/*
 * html_tokenizer_clone:
 * @t: the tokenizer to clone, may be NULL.
 *
 * Forwards to the clone method of @t's class and returns its result.
 * A NULL @t yields NULL without any warning; a class without a clone
 * method yields NULL after logging a warning.
 */
HTMLTokenizer *
html_tokenizer_clone (HTMLTokenizer *t)
{
	HTMLTokenizerClass *tokenizer_class;

	/* NULL is accepted silently, unlike the other entry points. */
	if (!t)
		return NULL;

	g_return_val_if_fail (HTML_IS_TOKENIZER (t), NULL);

	tokenizer_class = HTML_TOKENIZER_CLASS (GTK_OBJECT (t)->klass);

	if (tokenizer_class->clone == NULL) {
		g_warning ("No clone method defined.");
		return NULL;
	}

	return tokenizer_class->clone (t);
}

/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* Copyright (C) 1997 Martin Jones (mjones@kde.org) (C) 1997 Torben Weis (weis@kde.org) (C) 1999 Anders Carlsson (andersca@gnu.org) (C) 2000 Helix Code, Inc., Radek Doulik (rodo@helixcode.com) (C) 2001 Ximian, Inc. This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; see the file COPYING.LIB. If not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
*/ /* The HTML Tokenizer */ #include #include #include #include #include "htmltokenizer.h" #include "htmlentity.h" enum { HTML_TOKENIZER_BEGIN_SIGNAL, HTML_TOKENIZER_END_SIGNAL, HTML_TOKENIZER_LAST_SIGNAL }; static guint html_tokenizer_signals[HTML_TOKENIZER_LAST_SIGNAL] = { 0 }; #define TOKEN_BUFFER_SIZE (1 << 10) typedef struct _HTMLBlockingToken HTMLBlockingToken; typedef struct _HTMLTokenBuffer HTMLTokenBuffer; typedef enum { Table } HTMLTokenType; struct _HTMLTokenBuffer { gint size; gint used; gchar * data; }; struct _HTMLTokenizerPrivate { /* token buffers list */ GList *token_buffers; /* current read_buf position in list */ GList *read_cur; /* current read buffer */ HTMLTokenBuffer * read_buf; HTMLTokenBuffer * write_buf; /* position in the read_buf */ gint read_pos; /* non-blocking and blocking unreaded tokens in tokenizer */ gint tokens_num; gint blocking_tokens_num; gchar *dest; gchar *buffer; gint size; gboolean skipLF; /* Skip the LF par of a CRLF sequence */ gboolean tag; /* Are we in an html tag? */ gboolean tquote; /* Are we in quotes in an html tag? */ gboolean startTag; gboolean comment; /* Are we in a comment block? */ gboolean title; /* Are we in a block? */ gboolean style; /* Are we in a <style> block? */ gboolean script; /* Are we in a <script> block? */ gboolean textarea; /* Are we in a <textarea> block? */ gint pre; /* Are we in a <pre> block? */ gboolean select; /* Are we in a <select> block? */ gboolean charEntity; /* Are we in an &... sequence? 
*/ gboolean extension; /* Are we in an " */ if (p->searchCount < 2) p->searchCount++; } else if (p->searchCount == 2 && (**src == '>')) { p->comment = FALSE; /* We've got a "-->" sequence */ } else if (tolower (**src) == gtkhtmlStart [p->searchGtkHTMLCount]) { if (p->searchGtkHTMLCount == 8) { p->extension = TRUE; p->comment = FALSE; p->searchCount = 0; p->searchExtensionEndCount = 0; p->searchGtkHTMLCount = 0; } else p->searchGtkHTMLCount ++; } else { p->searchGtkHTMLCount = 0; if (p->searchCount < 2) p->searchCount = 0; } (*src)++; } static inline void extension_one_char (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; p->extension = FALSE; html_tokenizer_tokenize_one_char (t, src); p->extension = TRUE; } static void in_extension (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; /* check for "-->" */ if (!p->tquote && **src == '-') { if (p->searchExtensionEndCount < 2) p->searchExtensionEndCount ++; (*src) ++; } else if (!p->tquote && p->searchExtensionEndCount == 2 && **src == '>') { p->extension = FALSE; (*src) ++; } else { if (p->searchExtensionEndCount > 0) { if (p->extension) { const gchar *c = "-->"; while (p->searchExtensionEndCount) { extension_one_char (t, &c); p->searchExtensionEndCount --; } } } extension_one_char (t, src); } } static void in_script_or_style (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; /* Allocate memory to store the script or style */ if (p->scriptCodeSize + 11 > p->scriptCodeMaxSize) p->scriptCode = g_realloc (p->scriptCode, p->scriptCodeMaxSize += 1024); if ((**src == '>' ) && ( p->searchFor [p->searchCount] == '>')) { (*src)++; p->scriptCode [p->scriptCodeSize] = 0; p->scriptCode [p->scriptCodeSize + 1] = 0; if (p->script) { p->script = FALSE; } else { p->style = FALSE; } g_free (p->scriptCode); p->scriptCode = NULL; } /* Check if a </script> tag is on its way */ else if (p->searchCount > 0) { if (tolower (**src) == p->searchFor 
[p->searchCount]) { p->searchBuffer [p->searchCount] = **src; p->searchCount++; (*src)++; } else { gchar *c; p->searchBuffer [p->searchCount] = 0; c = p->searchBuffer; while (*c) p->scriptCode [p->scriptCodeSize++] = *c++; p->scriptCode [p->scriptCodeSize] = **src; (*src)++; p->searchCount = 0; } } else if (**src == '<') { p->searchCount = 1; p->searchBuffer [0] = '<'; (*src)++; } else { p->scriptCode [p->scriptCodeSize] = **src; (*src)++; } } static void add_unichar (HTMLTokenizer *t, gunichar wc) { struct _HTMLTokenizerPrivate *p = t->priv; p->utf8_length = 0; if (wc != '\0') { p->dest += g_unichar_to_utf8 (wc, p->dest); *(p->dest) = 0; } /* g_assert (g_utf8_validate (p->buffer, p->dest - p->buffer, NULL)); */ } static void add_byte (HTMLTokenizer *t, const gchar **src) { gunichar wc; struct _HTMLTokenizerPrivate *p = t->priv; if (p->utf8) { p->utf8_buffer[p->utf8_length] = **src; p->utf8_length++; wc = g_utf8_get_char_validated ((const gchar *)p->utf8_buffer, p->utf8_length); if (wc == -1 || p->utf8_length >= (sizeof(p->utf8_buffer)/sizeof(p->utf8_buffer[0]))) { add_unichar (t, '?'); (*src)++; return; } else if (wc == -2) { /* incomplete character check again */ (*src)++; return; } } else { wc = (guchar)**src; } add_unichar (t, wc); (*src)++; } static void flush_entity (HTMLTokenizer *t) { struct _HTMLTokenizerPrivate *p = t->priv; /* ignore the TAG_ESCAPE when flushing */ const char *str = p->searchBuffer + 1; while (p->searchCount--) { add_byte (t, &str); } } static void in_entity (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; gunichar entityValue = 0; /* See http://www.mozilla.org/newlayout/testcases/layout/entities.html for a complete entity list, ftp://ftp.unicode.org/Public/MAPPINGS/ISO8859/8859-1.TXT (or 'man iso_8859_1') for the character encodings. 
*/ p->searchBuffer [p->searchCount + 1] = **src; p->searchBuffer [p->searchCount + 2] = '\0'; /* Check for � sequence */ if (p->searchBuffer[2] == '#') { if ((p->searchCount > 1) && (!isdigit (**src)) && (p->searchBuffer[3] != 'x')) { /* { */ p->searchBuffer [p->searchCount + 1] = '\0'; entityValue = strtoul (&(p->searchBuffer [3]), NULL, 10); p->charEntity = FALSE; } if ((p->searchCount > 1) && (!isalnum (**src)) && (p->searchBuffer[3] == 'x')) { /* &x12AB */ p->searchBuffer [p->searchCount + 1] = '\0'; entityValue = strtoul (&(p->searchBuffer [4]), NULL, 16); p->charEntity = FALSE; } } else { /* Check for &abc12 sequence */ if (!isalnum (**src)) { p->charEntity = FALSE; if ((p->searchBuffer [p->searchCount + 1] == ';') || (!p->tag)) { char *ename = p->searchBuffer + 2; p->searchBuffer [p->searchCount + 1] = '\0'; /* FIXME sucks */ entityValue = html_entity_parse (ename, 0); } } } if (p->searchCount > 9) { /* Ignore this sequence since it's too long */ p->charEntity = FALSE; flush_entity (t); } else if (p->charEntity) { /* Keep searching for end of character entity */ p->searchCount++; (*src)++; } else { /* * my reading of http://www.w3.org/TR/html4/intro/sgmltut.html#h-3.2.2 makes * seem correct to always collapse entity references, even in element names * and attributes. */ if (entityValue) { /* Insert plain char */ if (entityValue != TAG_ESCAPE) add_unichar (t, entityValue); if (**src == ';') (*src)++; } else { /* Ignore the sequence, just add it as plaintext */ flush_entity (t); } } } static void in_tag (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; p->startTag = FALSE; if (**src == '/') { if (p->pending == LFPending) { p->pending = NonePending; } } else if (((**src >= 'a') && (**src <= 'z')) || ((**src >= 'A') && (**src <= 'Z'))) { /* Start of a start tag */ } else if (**src == '!') { /*  */ } else if (**src == '?') { /* <? 
meta ?> */ } else { /* Invalid tag, just add it */ if (p->pending) html_tokenizer_add_pending (t); add_unichar (t, '<'); add_byte (t, src); return; } if (p->pending) html_tokenizer_add_pending (t); if (p->dest > p->buffer) { html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer); p->dest = p->buffer; } add_unichar (t, TAG_ESCAPE); add_unichar (t, '<'); p->tag = TRUE; p->searchCount = 1; /* Look for <!-- to start comment */ } static void start_entity (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; (*src)++; p->discard = NoneDiscard; if (p->pending) html_tokenizer_add_pending (t); p->charEntity = TRUE; p->searchBuffer[0] = TAG_ESCAPE; p->searchBuffer[1] = '&'; p->searchCount = 1; } static void start_tag (HTMLTokenizer *t, const gchar **src) { (*src)++; t->priv->startTag = TRUE; t->priv->discard = NoneDiscard; } static void end_tag (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; gchar *ptr; p->searchCount = 0; /* Stop looking for <!-- sequence */ add_unichar (t, '>'); /* Make the tag lower case */ ptr = p->buffer + 2; if (p->pre || *ptr == '/') { /* End tag */ p->discard = NoneDiscard; } else { /* Start tag */ /* Ignore CRLFs after a start tag */ p->discard = LFDiscard; } while (*ptr && *ptr !=' ') { *ptr = tolower (*ptr); ptr++; } html_tokenizer_append_token (t, p->buffer, p->dest - p->buffer); p->dest = p->buffer; p->tag = FALSE; p->pending = NonePending; (*src)++; if (strncmp (p->buffer + 2, "pre", 3) == 0) { p->pre++; } else if (strncmp (p->buffer + 2, "/pre", 4) == 0) { p->pre--; } else if (strncmp (p->buffer + 2, "textarea", 8) == 0) { p->textarea = TRUE; } else if (strncmp (p->buffer + 2, "/textarea", 9) == 0) { p->textarea = FALSE; } else if (strncmp (p->buffer + 2, "title", 5) == 0) { p->title = TRUE; } else if (strncmp (p->buffer + 2, "/title", 6) == 0) { p->title = FALSE; } else if (strncmp (p->buffer + 2, "script", 6) == 0) { p->script = TRUE; p->searchCount = 0; p->searchFor = 
scriptEnd; p->scriptCode = g_malloc (1024); p->scriptCodeSize = 0; p->scriptCodeMaxSize = 1024; } else if (strncmp (p->buffer + 2, "style", 5) == 0) { p->style = TRUE; p->searchCount = 0; p->searchFor = styleEnd; p->scriptCode = g_malloc (1024); p->scriptCodeSize = 0; p->scriptCodeMaxSize = 1024; } else if (strncmp (p->buffer + 2, "select", 6) == 0) { p->select = TRUE; } else if (strncmp (p->buffer + 2, "/select", 7) == 0) { p->select = FALSE; } else if (strncmp (p->buffer + 2, "cell", 4) == 0) { g_warning ("<cell> tag not supported"); } else if (strncmp (p->buffer + 2, "table", 5) == 0) { html_tokenizer_blocking_push (t, Table); } else { if (p->blocking) { const gchar *bn = html_tokenizer_blocking_get_name (t); if (strncmp (p->buffer + 1, bn, strlen (bn)) == 0) { html_tokenizer_blocking_pop (t); } } } } static void in_crlf (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; if (p->tquote) { if (p->discard == NoneDiscard) p->pending = SpacePending; } else if (p->tag) { p->searchCount = 0; /* Stop looking for <!-- sequence */ if (p->discard == NoneDiscard) p->pending = SpacePending; /* Treat LFs inside tags as spaces */ } else if (p->pre || p->textarea) { if (p->discard == LFDiscard) { /* Ignore this LF */ p->discard = NoneDiscard; /* We have discarded 1 LF */ } else { /* Process this LF */ if (p->pending) html_tokenizer_add_pending (t); p->pending = LFPending; } } else { if (p->discard == LFDiscard) { /* Ignore this LF */ p->discard = NoneDiscard; /* We have discarded 1 LF */ } else { /* Process this LF */ if (p->pending == NonePending) p->pending = LFPending; } } /* Check for MS-DOS CRLF sequence */ if (**src == '\r') { p->skipLF = TRUE; } (*src)++; } static void in_space_or_tab (HTMLTokenizer *t, const gchar **src) { if (t->priv->tquote) { if (t->priv->discard == NoneDiscard) t->priv->pending = SpacePending; } else if (t->priv->tag) { t->priv->searchCount = 0; /* Stop looking for <!-- sequence */ if (t->priv->discard == NoneDiscard) 
t->priv->pending = SpacePending; } else if (t->priv->pre || t->priv->textarea) { if (t->priv->pending) html_tokenizer_add_pending (t); if (**src == ' ') t->priv->pending = SpacePending; else t->priv->pending = TabPending; } else { t->priv->pending = SpacePending; } (*src)++; } static void in_quoted (HTMLTokenizer *t, const gchar **src) { /* We treat ' and " the same in tags " */ t->priv->discard = NoneDiscard; if (t->priv->tag) { t->priv->searchCount = 0; /* Stop looking for <!-- sequence */ if ((t->priv->tquote == SINGLE_QUOTE && **src == '\"') /* match " */ || (t->priv->tquote == DOUBLE_QUOTE && **src == '\'')) { add_unichar (t, **src); (*src)++; } else if (*(t->priv->dest-1) == '=' && !t->priv->tquote) { t->priv->discard = SpaceDiscard; t->priv->pending = NonePending; if (**src == '\"') /* match " */ t->priv->tquote = DOUBLE_QUOTE; else t->priv->tquote = SINGLE_QUOTE; add_unichar (t, **src); (*src)++; } else if (t->priv->tquote) { t->priv->tquote = NO_QUOTE; add_byte (t, src); t->priv->pending = SpacePending; } else { /* Ignore stray "\'" */ (*src)++; } } else { if (t->priv->pending) html_tokenizer_add_pending (t); add_byte (t, src); } } static void in_assignment (HTMLTokenizer *t, const gchar **src) { t->priv->discard = NoneDiscard; if (t->priv->tag) { t->priv->searchCount = 0; /* Stop looking for <!-- sequence */ add_unichar (t, '='); if (!t->priv->tquote) { t->priv->pending = NonePending; t->priv->discard = SpaceDiscard; } } else { if (t->priv->pending) html_tokenizer_add_pending (t); add_unichar (t, '='); } (*src)++; } inline static void in_plain (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; p->discard = NoneDiscard; if (p->pending) html_tokenizer_add_pending (t); if (p->tag) { if (p->searchCount > 0) { if (**src == commentStart[p->searchCount]) { p->searchCount++; if (p->searchCount == 4) { /* Found <!-- sequence */ p->comment = TRUE; p->dest = p->buffer; p->tag = FALSE; p->searchCount = 0; return; } } else { 
p->searchCount = 0; /* Stop lookinf for <!-- sequence */ } } } add_byte (t, src); } static void html_tokenizer_tokenize_one_char (HTMLTokenizer *t, const gchar **src) { struct _HTMLTokenizerPrivate *p = t->priv; prepare_enough_space (t); if (p->skipLF && **src != '\n') p->skipLF = FALSE; if (p->skipLF) (*src) ++; else if (p->comment) in_comment (t, src); else if (p->extension) in_extension (t, src); else if (p->script || p->style) in_script_or_style (t, src); else if (p->charEntity) in_entity (t, src); else if (p->startTag) in_tag (t, src); else if (**src == '&') start_entity (t, src); else if (**src == '<' && !p->tag) start_tag (t, src); else if (**src == '>' && p->tag && !p->tquote) end_tag (t, src); else if ((**src == '\n') || (**src == '\r')) in_crlf (t, src); else if ((**src == ' ') || (**src == '\t')) in_space_or_tab (t, src); else if (**src == '\"' || **src == '\'') /* match " ' */ in_quoted (t, src); else if (**src == '=') in_assignment (t, src); else in_plain (t, src); } static void html_tokenizer_real_write (HTMLTokenizer *t, const gchar *string, size_t size) { const gchar *src = string; while ((src - string) < size) html_tokenizer_tokenize_one_char (t, &src); } static gchar * html_tokenizer_blocking_get_name (HTMLTokenizer *t) { switch (GPOINTER_TO_INT (t->priv->blocking->data)) { case Table: return "</table"; } return ""; } static void html_tokenizer_blocking_push (HTMLTokenizer *t, HTMLTokenType tt) { struct _HTMLTokenizerPrivate *p = t->priv; /* block tokenizer - we must block last token in buffers as it was already added */ if (!p->blocking) { p->tokens_num--; p->blocking_tokens_num++; } p->blocking = g_list_prepend (p->blocking, GINT_TO_POINTER (tt)); } static void html_tokenizer_blocking_pop (HTMLTokenizer *t) { struct _HTMLTokenizerPrivate *p = t->priv; p->blocking = g_list_remove (p->blocking, p->blocking->data); /* unblock tokenizer */ if (!p->blocking) { p->tokens_num += p->blocking_tokens_num; p->blocking_tokens_num = 0; } } /** ** ** ** ** ** 
** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ void html_tokenizer_begin (HTMLTokenizer *t, gchar *content_type) { g_return_if_fail (t && HTML_IS_TOKENIZER (t)); gtk_signal_emit (GTK_OBJECT (t), html_tokenizer_signals[HTML_TOKENIZER_BEGIN_SIGNAL], content_type); } void html_tokenizer_end (HTMLTokenizer *t) { g_return_if_fail (t && HTML_IS_TOKENIZER (t)); gtk_signal_emit (GTK_OBJECT (t), html_tokenizer_signals[HTML_TOKENIZER_END_SIGNAL]); } void html_tokenizer_write (HTMLTokenizer *t, const gchar *str, size_t size) { HTMLTokenizerClass *klass; g_return_if_fail (t && HTML_IS_TOKENIZER (t)); klass = HTML_TOKENIZER_CLASS (GTK_OBJECT (t)->klass); if (klass->write) klass->write (t, str, size); else g_warning ("No write method defined."); } gchar * html_tokenizer_peek_token (HTMLTokenizer *t) { HTMLTokenizerClass *klass; g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), NULL); klass = HTML_TOKENIZER_CLASS (GTK_OBJECT (t)->klass); if (klass->peek_token) return klass->peek_token (t); g_warning ("No peek_token method defined."); return NULL; } gchar * html_tokenizer_next_token (HTMLTokenizer *t) { HTMLTokenizerClass *klass; g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), NULL); klass = HTML_TOKENIZER_CLASS (GTK_OBJECT (t)->klass); if (klass->next_token) return klass->next_token (t); g_warning ("No next_token method defined."); return NULL; } gboolean html_tokenizer_has_more_tokens (HTMLTokenizer *t) { HTMLTokenizerClass *klass; g_return_val_if_fail (t && HTML_IS_TOKENIZER (t), FALSE); klass = HTML_TOKENIZER_CLASS (GTK_OBJECT (t)->klass); if (klass->has_more) { return klass->has_more (t); } g_warning ("No has_more method defined."); return FALSE; } HTMLTokenizer * html_tokenizer_clone (HTMLTokenizer *t) { HTMLTokenizerClass *klass; if (t == NULL) return NULL; g_return_val_if_fail (HTML_IS_TOKENIZER (t), NULL); klass = HTML_TOKENIZER_CLASS (GTK_OBJECT (t)->klass); if (klass->clone) return klass->clone (t); g_warning ("No clone method defined."); return NULL; }