# # xmlscan.rb # # Copyright (C) Ueno Katsuhiro 2000,2001 # # $Id: xmlscan.rb,v 1.8 2001/01/04 09:56:56 katsu Exp $ # class XMLScanner class ParseError < StandardError; end class PrivateArray < Array private(*superclass.instance_methods) end class XMLSource < PrivateArray class PortWrapper def gets ; @port.gets ; end def lineno ; 0 ; end def path ; '-' ; end def initialize(port) @port = port unless port.respond_to? :gets then if port.is_a? Array then @n = -1 def self.gets ; @port[@n += 1] ; end def self.lineno ; @n + 1 ; end else @port = port.to_s def self.gets ; s = @port ; @port = nil ; s ; end end end if port.respond_to? :lineno then def self.lineno ; @port.lineno ; end end if port.respond_to? :path then def self.path ; @port.path ; end end end def send_port(method, *args) @port.send(method, *args) end def self.wrap(port) if instance_methods.find { |i| not port.respond_to? i } then new port else port end end end module DummyPort def self.gets ; nil ; end def self.lineno ; 0 ; end def self.path ; '-' ; end end def initialize(port = nil) super() feed port end public def feed(port) if port then @port = PortWrapper.wrap(port) else @port = DummyPort end @eof = false @lineno_size, @lineno_count = -1, 0 self end def abort @eof = true clear self end def pop if at(1) or first == '>' or @eof then # at(1) == (size > 1) super else begin src = @port.gets unless src then @eof = true break end a = src.split(/(?=>?<)|>/, -1) a[0] = super << a[0] if last and (c = a[0][0]) != ?< and c != ?> concat a end until at(1) # at(1) == (size > 1) reverse! @lineno_size = size - 1 @lineno_count = super end end def tag_end? s = last and s[0] != ?< end def tag_start? s = last and s[0] == ?< end def eof? @eof and empty? end def lineno unless size == @lineno_size then @port.lineno else unless @lineno_count.is_a? 
Integer then if @lineno_count then @lineno_count = @lineno_count.sub(/\A\s+/, '').scan(/^/).size - 1 else @lineno_count = 0 end end @port.lineno - @lineno_count end end def path @port.path end def send_port(method, *args) if @port.is_a? PortWrapper then @port.send_port(method, *args) else @port.send(method, *args) end end end def initialize(port = nil) @src = XMLSource.new @prolog = false feed port if port end def feed(port) @src.feed port @prolog = true self end attr_reader :prolog alias in_prolog? prolog undef prolog private def on_error(path, lineno, msg) raise ParseError, sprintf('%s:%d: %s', path, lineno, msg) end def on_xmldecl(version, encoding, standalone) end def on_doctype(root, pubid, sysid) end def on_comment(strs) end def on_pi(target, pi) end def on_chardata(str) end def on_etag(name) end def on_stag(name, attr) end def on_emptyelem(name, attr) end def on_entityref(ref) s = entityref_literal(ref) on_chardata s if s end def on_charref(ref) s = parse_charref(ref) on_chardata s if s end def on_eof end private def parse_error(msg) on_error @src.path, @src.lineno, msg end PredefinedEntity = { 'lt' => '<', 'gt' => '>', 'quot' => '"', 'apos' => '\'', 'amp' => '&', } def charref_literal(code) [code].pack('N').sub(/\A\000+/mn, '') end def entityref_literal(ref) PredefinedEntity[ref] or begin parse_error "undefined general entity `#{ref}'" nil end end def parse_charref(ref) if /\A#(\d+)\z/ =~ ref then charref_literal $1.to_i elsif /\A#x([\dA-Fa-f]+)\z/ =~ ref then charref_literal $1.hex else parse_error "parse error at `&#{ref}'" nil end end def on_attribute_value(key, val) inc = 0 val.gsub!(/&([^;\s<>]+)?;?/) { |m| if m[-1] == ?; and (s = $1) then if s[0] == ?\# then rep = parse_charref(s) else rep = entityref_literal(s) end unless rep then @unexpanded_entityrefs = [] unless defined? 
@unexpanded_entityrefs @unexpanded_entityrefs.push [ key.dup, $~.begin(0) + inc, s ] rep = '' end inc += rep.size - m.size rep else parse_error "parse error at `#{m}'" m end } true end def scan_content(s) while true unless /&/ =~ s then on_chardata s else on_chardata s unless (s = $`).empty? $'.split(/&/, -1).each { |i| unless /;/ =~ i then parse_error "parse error at `&#{i.split(/\b|\s/,2)[0]}'" on_chardata('&' << i) next end e, i = $`, $' if /\s/ =~ e then parse_error "parse error at `&#{$`}'" on_chardata('&' << i) next end if e[0] == ?\# then parse_charref e else on_entityref e end on_chardata i unless i.empty? } end break if @src.tag_start? s = @src.pop break unless s s[0,0] = '>' unless s == '>' end end def scan_comment(s) s[0,4] = '' # '" until @src.tag_end? and (s = @src.pop) comm.push '>' if (c = s[0]) != ?< and c != ?> comm.push s end end on_comment comm end def scan_pi(s) unless /\A<\?(\S+)(?:\s+|(?=\?\z))/ =~ s then parse_error "parse error at `' if (c = s[0]) != ?< and c != ?> pi << s end pi.chop! on_pi target, pi end end def scan_cdata(s) cdata = [ s ] until @src.tag_end? and s[-1] == ?] and s[-2] == ?] s = @src.pop unless s then parse_error "unterminated CDATA section meets EOF" return on_chardata(cdata.join) end cdata.push '>' if (c = s[0]) != ?< and c != ?> cdata.push s end s.chop!.chop! on_chardata cdata.join end def unclosed_tag(t) if @src.eof? then # ' if (c = s[0]) != ?< and c != ?> v, s = s.split(re, 2) dst << v end until s s end def scan_etag(s) s[0,2] = '' # '" else parse_error "parse error at ` if @src.tag_end? then parse_error "found an empty start tag `<>'" else parse_error "parse error at `<'" return on_chardata('<' + s) end end else name = $` s = $' name[0,1] = '' if name.empty? 
then # < tag parse_error "parse error at `<'" return on_chardata('<' + s) end begin complete = true s.scan(/\s+(?:([^\s\/=]+)\s*=\s*('[^']*'?|"[^"]*"?)|\z)|\s*(\/\z)|\s*(.[^='"\s]*)/m ) { |key,val,e,err| if key then qmark = val.slice!(0,1) if val[-1] == qmark[0] then val.chop! else complete = false re = /#{qmark}/ begin s = @src.pop if not s then parse_error "unterminated #{t} meets EOF" complete = true break elsif (c = s[0]) == ?< then parse_error "`<' is found in attribute value" elsif c != ?> then val << '>' end v, s = s.split(re, 2) val << v end until s # always break here. end if on_attribute_value(key, val) then parse_error "doubled attribute `#{key}'" if attr.key? key attr[key] = val end elsif e then method = :on_emptyelem elsif err then parse_error "parse error at `#{err.split(/\b|\s/,2)[0]}'" end } end until complete end unclosed_tag 'start tag' unless @src.tag_end? send method, name, attr end def parse_internal_dtd(s) parse_error "internal DTD subset is not supported" end DOCTYPEPattern = /\A([^\s\["']+)(?:\s+(?:SYSTEM|PUBLIC\s+("[^"]*"|'[^"']*'))\s+("[^"]*"?|'[^']*'?))\s*/ def scan_doctype(s) unless DOCTYPEPattern =~ s then parse_error "parse error in DOCTYPE" return end root, pubid, sysid, s = $1, $2, $3, $' if pubid then pubid.chop! pubid[0,1] = '' end if sysid then c = sysid.slice!(0,1) if c[0] == sysid[-1] then sysid.chop! else s = read_until(/#{c}\s*/, sysid, 'DOCTYPE') end end if s[0] == ?[ then s[0,1] = '' parse_internal_dtd s elsif not s.empty? then parse_error "parse error at `#{s.split(/\b|\s/,2)[0]}'" end unclosed_tag 'DOCTYPE' unless @src.tag_end? 
on_doctype root, pubid, sysid end def scan_bang_tag(s) parse_error "parse error at ` then scan_text @src.pop else scan_content s end end XMLDeclPattern, TextDeclPattern = instance_eval { version = '\\s+version\\s*=\\s*("[^"\']+"|\'[^"\']+\')' encoding = '\\s+encoding\\s*=\\s*("[^"\']+"|\'[^"\']+\')' standalone = '\\s+standalone\\s*=\\s*("[^"]+"|\'[^\']+\')' [ /\A<\?xml#{version}(?:#{encoding}(?:#{standalone})?)?\s*\?\z/, /\A<\?xml(?:#{version})?#{encoding}\s*\?\z/ ] } def scan_prolog s = @src.pop if s and /\A<\?xml\b/ =~ s then unless XMLDeclPattern =~ s then parse_error 'parse error in XML declaration' else version, encoding, standalone = $1, $2, $3 version.chop! version[0,1] = '' if encoding then encoding.chop! encoding[0,1] = '' encoding.downcase! end if standalone then standalone.chop! standalone[0,1] = '' if standalone == 'yes' then standalone = true elsif standalone == 'no' then standalone = false else parse_error 'invalid standalone document declaration' standalone = nil end end on_xmldecl version, encoding, standalone unclosed_tag 'XML declaration' unless @src.tag_end? s = @src.pop end end while s if s[0] == ?< then if (c = s[1]) == ?! then if s[2] == ?- and s[3] == ?- then scan_comment s elsif /\A 'http://www.w3.org/XML/1998/namespace', } class ElementStack < superclass::ElementStack def initialize super @namespace = {} end attr_reader :namespace def default_namespace @namespace[:default] end def get_namespace(name) @namespace[name] or PredefinedNamespace[name] end def set_namespace(name, uri) push [ name, @namespace[name] ] if uri.empty? 
then @namespace.delete name else @namespace[name] = uri end end def pop_element(name) if name == last then pop while log = last and not log[2] pop @namespace[log[0]] = log[1] end push nil unless log self else nil end end def each reverse_each { |i| yield i if i[2] } end end private def expand_qualified_name(name, default = nil) unless /:/ =~ name then [ default, nil, name ] else prefix, localpart = $`, $' if localpart.empty? then parse_error "parse error at `:'" return [ nil, nil, name ] elsif /:/ =~ localpart then parse_error "localpart `#{localpart}' includes a colon" end unless namespace = @elemstack.get_namespace(prefix) then parse_error "undeclared namespace `#{prefix}'" namespace = nil end [ namespace, prefix, localpart ] end end def expand_attr_namespace(attr) dst = {} attr.each { |key,val| namespace, prefix, name = expand_qualified_name(key) h = dst[namespace] dst[namespace] = h = {} unless h h[name] = val } dst end def expand_attr_namespace_2(attr) attr.keys.each { |key| namespace, prefix, name = expand_qualified_name(key) if namespace then k = namespace + ' ' + name else k = ' ' + name end attr[k] = attr.delete(key) } attr end def name_in_errmsg(name) if name[1] then "#{name[1]}:#{name[2]}" else name[2] end end def on_stag(name, attr) super(expand_qualified_name(name, @elemstack.default_namespace), expand_attr_namespace(attr)) end def on_emptyelem(name, attr) super(expand_qualified_name(name, @elemstack.default_namespace), expand_attr_namespace(attr)) end def on_etag(name) super expand_qualified_name(name, @elemstack.default_namespace) end def on_pi(target, pi) parse_error "PI target must not include `:'" if /:/ =~ target end def on_attribute_value(key, val) super f = nil if key == 'xmlns' or f = (key[0,6] == 'xmlns:') then if f then name = key[6..-1] if name.empty? 
then parse_error "parse error at `:'" elsif /:/ =~ name then parse_error "namespace name `#{name}' includes a colon" elsif name[0,3].downcase == 'xml' then parse_error "prefix `#{name}' is reserved" elsif /\s/ =~ val then parse_error "invalid namespace `#{val}'" elsif val.empty? then parse_error "null namespace is declared as `#{name}'" else @elemstack.set_namespace name, val end else @elemstack.set_namespace :default, val end false else true end end end =begin supported by XMLScanner Well-Formedness Constraint: 属性指定の一意性 開始タグ又は空要素タグでは,同一の属性名が二回以上出現してはならない。 Well-Formedness Constraint: 属性値に<を含まないこと 属性値内で直接的又は間接的に参照する実体の置換テキストには,<を 含んではならない。 supported by WellFormedXMLScanner Well-Formedness Constraint: 要素型のマッチ 要素の終了タグの名前は,その要素の開始タグにおける要素型(の名前)とマッチしな ければならない。 Well-Formedness Constraint: 実体が宣言されていること DTDをもたない文書,パラメタ実体参照を含まない内部DTDサブセットだけをもつ文書, 又は "standalone='yes'" をもつ文書において,実体参照で用いる Name は, 外部サブセット及びパラメタ実体内以外に現れる実体宣言に含まれる名前と マッチしなければならない。ただし,整形式の文書は,実体amp, lt, gt, apos, quot を宣言する必要はない。一般実体の場合は,属性リスト宣言の デフォルト値内での参照より先に,宣言が現れなければならない。外部サブセット又は 外部パラメタ実体で実体を宣言するとき,妥当性を検証しないプロセサが,宣言を 読み,処理することを義務づけないことに注意。それらの文書では,実体は 宣言されなければならないという規則は,standalone='yes'の場合のみ, 整形式制約となる。 supported by XMLScanner with xmldtd.rb Well-Formedness Constraint: 内部サブセット内のパラメタ実体 DTDの内部サブセットでは,パラメタ実体参照は,マーク付け宣言が出現可能な場所だけ に出現できる。マーク付け宣言の一部としては出現できない。この制約は,外部パラメ タ実体又は外部サブセットでの参照には適用しない。 Well-Formedness Constraint: DTDの中 パラメタ実体参照は,DTD内にだけ,出現してよい。 supported by XMLParsedEntity Well-Formedness Constraint: 再帰なし 解析対象実体は,それ自体への参照を,直接にも間接にも含んではならない。 Well-Formedness Constraint: 外部実体への参照がないこと 属性値には,外部実体への直接的又は間接的な参照を含むことはできない。 Well-Formedness Constraint: 使用できる文字 文字参照で参照する文字は,Charの生成規則にマッチしなければならない。 Well-Formedness Constraint: 解析対象実体 実体参照は,解析対象外実体の名前を含んでいてはならない。解析対象外実体は, ENTITY型又はENTITIES 型として宣言した属性値としてだけ参照できる。 =end ## for internal general parsed entities class XMLParsedEntity < WellFormedXMLScanner def initialize(name, src) @name = name super src end #undef feed private end ## for external general parsed entities 
# Scanner for external general parsed entities.
#
# Differs from XMLScanner only in how the prolog is handled: an external
# parsed entity may begin with a text declaration (matched by
# TextDeclPattern) rather than a full XML declaration.
class ExtXMLScanner < XMLScanner

  # Consumes the optional text declaration at the start of the entity.
  #
  # Pops the first token from the source. If it looks like `<?xml ...` it is
  # matched against TextDeclPattern; on success the quotes are stripped from
  # the version (when given) and the encoding, the encoding is lower-cased,
  # and on_xmldecl(version, encoding, nil) is invoked. On a malformed
  # declaration a parse error is reported and the token is handed on to
  # scan_text like any other token.
  #
  # In every case @prolog is cleared before returning.
  #
  # Returns the result of on_xmldecl when a text declaration was consumed,
  # otherwise the result of scan_text for the first token (or nil/false when
  # the source yielded no token).
  def scan_prolog
    s = @src.pop
    if s && /\A<\?xml\b/ =~ s
      if TextDeclPattern =~ s
        version, encoding = $1, $2
        # Both captures still carry their surrounding quote characters.
        # The version is optional in a text declaration, so it may be nil.
        version = version[1..-2] if version
        encoding = encoding[1..-2].downcase if encoding
        ret = on_xmldecl(version, encoding, nil)
        unclosed_tag 'text declaration' unless @src.tag_end?
        @prolog = false
        return ret
      else
        parse_error 'parse error in text declaration'
      end
    end
    @prolog = false
    s && scan_text(s)
  end

end


# Self-test driver: scans the input named on the command line (or stdin)
# and prints the elapsed user CPU time to stderr.
#
#   ruby xmlscan.rb FILE       scan only, report errors and timing
#   ruby xmlscan.rb - FILE     additionally print every token via xmltoken
#   ruby xmlscan.rb -- FILE    as above, but pipe the output through
#                              `diff -u FILE -`
if __FILE__ == $0

  #class TestScanner < XMLScanner
  #class TestScanner < WellFormedXMLScanner
  class TestScanner < XMLScannerWithNamespace
    # Report errors on stderr instead of raising.
    def on_error(path, lineno, msg)
      STDERR.printf "%s:%d: %s\n", path, lineno, msg
    end
  end

  STDOUT.sync = STDERR.sync = true

  if /\A--?\z/ === ARGV[0]
    if ARGV.shift == '--' && ARGV.size == 1
      # NOTE(review): the file name is interpolated into a shell command
      # unquoted; names containing spaces or shell metacharacters will
      # misbehave.
      diff_pipe = IO.popen("diff -u #{ARGV[0]} -", 'w')
      STDOUT.reopen diff_pipe

      # Test-only override: Hash#[]= records each pair in an array in call
      # order (nothing is stored in the hash itself) and Hash#each replays
      # that array. Presumably this keeps attributes in source order for
      # the diff -- confirm before relying on it.
      class Hash
        def []=(k,v)
          (@a ||= []).push [ k, v ]
        end
        def each(&b)
          @a.each(&b) if defined? @a
        end
      end
    end

    class TestScanner
      # Register this file as already loaded before pulling in xmltoken
      # (presumably so that xmltoken does not load xmlscan.rb a second time).
      $".push 'xmlscan.rb'
      require 'xmltoken'

      # Defines, for each given token class name, an on_<name> handler that
      # calls the inherited handler and then prints the corresponding
      # Tokenizer::<Name> object built from the handler arguments.
      def self.def_handler(*name)
        name.each { |i|
          eval %{
            def on_#{i.downcase}(*a)
              super
              print Tokenizer::#{i}.new(*a).to_s
            end
          }
        }
      end

      def_handler 'CharData', 'Comment', 'PI', 'XMLDecl', 'Doctype'
      def_handler 'ETag', 'STag', 'EmptyElem'
    end
  end

  src = ARGF.read
  scan = TestScanner.new
  # Process.times provides the same utime field that Time.times did and is
  # available on both old and current interpreters.
  t1 = Process.times.utime
  scan.parse(src)
  t2 = Process.times.utime
  STDERR.printf "%2.3f sec\n", t2 - t1
end