#
# xmltoken.rb
#
#   Copyright (C) Ueno Katsuhiro 2000
#
# $Id: xmltoken.rb,v 1.2 2000/12/19 11:36:13 katsu Exp $
#
# Token layer on top of XMLScanner: every scanner callback is turned into
# a small token object (CharData, Comment, PI, STag, ...) whose #to_s
# renders the token back as XML text.
#

require 'xmlscan'


class XMLScanner

  # Mix-in that overrides the scanner's on_* callbacks so that they build
  # token objects instead of (or in addition to) whatever the superclass
  # does.  Tokens that cannot be handed back directly as a callback result
  # are queued in @__token_parsed__ and delivered by #get_token / #each.
  module Tokenizer

    # Common base class of all tokens.
    class Node

      # Replaces the XML special characters in +str+ with entity
      # references.  Works IN PLACE and returns +str+; callers that must
      # not modify their own data pass a copy.
      def escape!(str)
        # '&' has to be handled first, otherwise the ampersands produced
        # by the following substitutions would be escaped a second time.
        str.gsub!(/&/, '&amp;')
        str.gsub!(/</, '&lt;')
        str.gsub!(/>/, '&gt;')
        str.gsub!(/"/, '&quot;')
        str
      end
      private :escape!

      # First word of the default inspect output followed by the
      # inspected XML text of the token.
      def inspect
        "#{super.split(' ',2)[0]} #{to_s.inspect}"
      end

      # Raw content of the token; nil unless a subclass provides one.
      def content
        nil
      end

      # XML text of the token.  Defaults to the raw content.
      def to_s
        content
      end

    end


    # A run of character data.
    class CharData < Node

      def initialize(str)
        @content = str
      end

      # Appends +s+ to the stored text (modifies the stored string).
      def concat(s)
        @content << s
      end

      attr_reader :content

      # Escaped text.  A copy is escaped so that #content stays raw and
      # repeated calls (including those made by #inspect) give the same
      # result instead of escaping the text again and again.
      def to_s
        escape!(@content.dup)
      end

    end


    # A comment.  +src+ is either a String or an Array of strings.
    class Comment < Node

      def initialize(src)
        @src = src
      end

      # Comment text; an Array source is joined once and then cached.
      def content
        @src = @src.join if @src.is_a? Array
        @src
      end

      def to_s
        "<!--#{content}-->"
      end

    end


    # A processing instruction.
    class PI < Node

      def initialize(target, pi)
        @target, @content = target, pi
      end

      attr_reader :target, :content

      # NOTE(review): assumes the content does not carry its own leading
      # whitespace -- confirm against the caller of on_pi.
      def to_s
        "<?#{@target} #{@content}?>"
      end

    end


    # The XML declaration.
    class XMLDecl < Node

      def initialize(version, encoding, standalone)
        @version, @encoding, @standalone = version, encoding, standalone
      end

      attr_reader :version, :encoding, :standalone

      # The encoding and standalone pseudo-attributes are written only
      # when they are set.
      # NOTE(review): assumes +standalone+ already holds the literal text
      # to emit (e.g. "yes" / "no") -- confirm against on_xmldecl's caller.
      def to_s
        s = %'<?xml version="#{@version}"'
        s << %' encoding="#{@encoding}"' if @encoding
        s << %' standalone="#{@standalone}"' if @standalone
        s << '?>'
      end

    end


    # The document type declaration.
    class Doctype < Node

      def initialize(root, pubid, sysid)
        @root, @pubid, @sysid = root, pubid, sysid
      end

      attr_reader :root, :pubid, :sysid

      # True if a public identifier is present.
      def public?
        not pubid.nil?
      end

      # True if only a system identifier is present.
      def system?
        pubid.nil? and not sysid.nil?
      end

      def to_s
        s = "<!DOCTYPE #{@root}"
        if public? then
          s << %' PUBLIC "#{@pubid}"'
          s << %' "#{@sysid}"' if @sysid
        elsif system? then
          s << %' SYSTEM "#{@sysid}"'
        end
        s << '>'
      end

    end


    # Base class of the tag tokens.
    class Tag < Node
      attr_reader :name
    end


    # An end tag.
    class ETag < Tag

      def initialize(name)
        @name = name
      end

      def to_s
        "</#{@name}>"
      end

    end


    # A start tag.  +attr+ is enumerated as name/value pairs.
    class STag < Tag

      def initialize(name, attr)
        @name, @attr = name, attr
      end

      # ' name="value"' for every attribute (empty string if there are
      # none).  Values are escaped on a copy so that the stored
      # attributes are left untouched by rendering.
      def attr_to_s
        @attr.map { |k, v| "#{k}=\"#{escape!(v.dup)}\"" }.unshift('').join(' ')
      end
      private :attr_to_s

      def to_s
        "<#{@name}#{attr_to_s}>"
      end

    end


    # An empty-element tag.
    class EmptyElem < STag

      def to_s
        "<#{@name}#{attr_to_s}/>"
      end

    end


    # Base class of the reference tokens.
    class Reference < Node
      attr_reader :content
    end


    # An entity reference.  +s+ is the replacement text used as content.
    class EntityRef < Reference

      def initialize(name,s)
        @name, @content = name, s
      end

      def to_s
        "&#{@name};"
      end

    end


    # A character reference.  +code+ is emitted verbatim after "&#".
    class CharRef < Reference

      def initialize(code)
        @content = code
      end

      def to_s
        "&\##{@content};"
      end

    end



    # Forwards all arguments to the superclass and creates the queue of
    # tokens that are waiting to be delivered.
    def initialize(*args)
      super
      @__token_parsed__ = []
    end


    private

    # Replacement text of entity +ref+: the PredefinedEntity entry, or an
    # empty string if there is none.
    def entityref_literal(ref)
      PredefinedEntity[ref] or ''
    end

    # Runs the superclass' scan_prolog.  If tokens were queued, the
    # result (if any) is appended to the queue and the oldest queued
    # token is returned, so that tokens come out in document order.
    def scan_prolog
      ret = super
      if @__token_parsed__.empty? then
        ret
      else
        @__token_parsed__.push ret if ret
        @__token_parsed__.shift
      end
    end

    # Queues character data; it is merged into the last queued token if
    # that token is a CharData as well.  Always returns nil.
    def on_chardata(str)
      super
      if (l = @__token_parsed__[-1]) and l.is_a? CharData then
        l.concat str
      else
        @__token_parsed__.push CharData.new(str)
      end
      nil
    end

    # Queues an EntityRef token.  Always returns nil.
    def on_entityref(ref)
      super
      @__token_parsed__.push EntityRef.new(ref, entityref_literal(ref))
      nil
    end

    # Queues a CharRef token.  Always returns nil.
    def on_charref(ref)
      super
      @__token_parsed__.push CharRef.new(ref)
      nil
    end

    # Builds a Comment token.  While @prolog is set the token is queued
    # and nil is returned, otherwise the token itself is returned.
    def on_comment(strs)
      super
      ret = Comment.new(strs)
      if @prolog then
        @__token_parsed__.push ret
        ret = nil
      end
      ret
    end

    # Builds a PI token.  While @prolog is set the token is queued and
    # nil is returned, otherwise the token itself is returned.
    def on_pi(target, pi)
      super
      ret = PI.new(target, pi)
      if @prolog then
        @__token_parsed__.push ret
        ret = nil
      end
      ret
    end

    # Queues an XMLDecl token.  Always returns nil.
    def on_xmldecl(version, encoding, standalone)
      super
      @__token_parsed__.push XMLDecl.new(version, encoding, standalone)
      nil
    end

    # Queues a Doctype token.  Always returns nil.
    def on_doctype(root, pubid, sysid)
      super
      @__token_parsed__.push Doctype.new(root, pubid, sysid)
      nil
    end

    # Returns an ETag token.
    def on_etag(name)
      super
      ETag.new(name)
    end

    # Returns an STag token.
    def on_stag(name, attr)
      super
      STag.new(name, attr)
    end

    # Returns an EmptyElem token.
    def on_emptyelem(name, attr)
      super
      EmptyElem.new(name, attr)
    end


    public

    # Returns the next token.  Queued tokens are delivered first;
    # otherwise the result of +step+ is returned, falling back to the
    # queue once more if +step+ returned nothing.
    # (+step+ is not defined in this file -- presumably by XMLScanner.)
    def get_token
      unless @__token_parsed__.empty? then
        @__token_parsed__.shift
      else
        step or @__token_parsed__.shift
      end
    end


    include Enumerable

    # Yields the tokens one by one and returns self.  If +src+ is given
    # it is fed to @src first.
    # (@src, @prolog and scan_text are not defined in this file --
    # presumably they belong to XMLScanner.)
    def each(src = nil)
      @src.feed src if src
      yield(scan_prolog) if @prolog
      while true
        # Deliver everything the callbacks queued before scanning on.
        yield @__token_parsed__.shift until @__token_parsed__.empty?
        break unless s = @src.pop
        s = scan_text(s)
        yield s if s
      end
      self
    end

  end

end



# Scanner that is meant to be driven through the token interface
# (get_token / each); +step+ and +parse+ are made private.
class XMLTokenizer < XMLScanner
  include Tokenizer
  private :step, :parse
end





# Test driver, run only when this file is executed directly:
#
#   ruby xmltoken.rb [file...]      tokenize with get_token, print timing
#   ruby xmltoken.rb -  [file...]   print the re-serialized tokens
#   ruby xmltoken.rb -- file        pipe the re-serialized tokens into
#                                   "diff -u file -"
if $0 == __FILE__ then
  class TestScanner < XMLTokenizer
    def on_error(path, lineno, msg)
      STDERR.printf "%s:%d: %s\n", path, lineno, msg
    end
  end

  if /\A--?\z/ === ARGV[0] then
    if (opt = ARGV.shift) == '--' and ARGV.size == 1 then
      p = IO.popen("diff -u #{ARGV[0]} -", 'w')
      STDOUT.reopen p
      # Hash is replaced by an insertion-ordered pair list so that
      # attributes are printed in the order they were stored; this
      # affects every Hash in the process and is meant for this diff
      # mode only.
      class Hash
        def []=(k,v)
          (@a ||= []).push [ k, v ]
        end
        def each(&b)
          @a.each(&b) if defined? @a
        end
      end
    end
  end

  src = ARGF.read
  scan = TestScanner.new(src)
  # NOTE(review): Time.times appears to be unavailable on current Ruby
  # releases (Process.times is its replacement) -- confirm the targeted
  # Ruby version before changing it.
  if opt then
    t1 = Time.times.utime
    scan.each { |i| print i.to_s }
    t2 = Time.times.utime
  else
    t1 = Time.times.utime
    while t = scan.get_token
    end
    t2 = Time.times.utime
  end
  STDERR.printf "%2.3f sec\n", t2 - t1
end