# # htmlscan.rb # # Copyright (C) Ueno Katsuhiro 2000,2001 # # $Id: htmlscan.rb,v 1.5 2001/01/04 12:36:04 katsu Exp $ # require 'xmlscan' class XMLScanner module HTML private def entityref_literal(ref) PredefinedEntity[ref] end def on_attribute_value(key, val) if val then inc = 0 val.gsub!(/&([^;\s]+?\b);?/) { |m| if (s = $1)[0] == ?\# then rep = parse_charref(s) else rep = entityref_literal(s) end unless rep then @unexpanded_entityrefs = [] unless defined? @unexpanded_entityrefs @unexpanded_entityrefs.push [ key.dup, $~.begin(0) + inc, s ] rep = '' end inc += rep.size - m.size rep } end true end def scan_content(s) while true unless /&/ =~ s then on_chardata s else on_chardata s unless (s = $`).empty? $'.split(/&/, -1).each { |i| unless /\A([^;\s]+?\b);?/ =~ i then i = '&' << i else e, i = $1, $' if e[0] == ?\# then parse_charref e else on_entityref e end end on_chardata i unless i.empty? } end break if @src.tag_start? s = @src.pop break unless s on_chardata '>' unless s == '>' end end def scan_pi(s) s[0,2] = '' pi = s until @src.tag_end? s = @src.pop unless s then parse_error "unterminated PI meets EOF" break end pi << '>' if s[0] != ?< pi << s end on_pi '', pi end def scan_stag(s) attr = {} unless /(?=[\/\s])/ =~ s then name = s name[0,1] = '' if name.empty? then # << or <> if @src.tag_end? then parse_error "found an empty start tag `<>'" else parse_error "parse error at `<'" return on_chardata('<' + s) end end else name, s = $`, $' name[0,1] = '' if name.empty? then # < tag parse_error "parse error at `<'" unless /\A(?:(?!\n\s*\n)\s)*([^\/\s]+)/ =~ s then return on_chardata('<' << s) end name, s = $1, $' end begin complete = true s.scan(/\s+(?:([^=\s]+)(?:\s*=\s*('[^']*'?|"[^"]*"?|[^="'\s]+))?|\z)|\s*(.[^='"\s]*)/m ) { |key,val,err| if key then if val then if val[0] == ?" or val[0] == ?' then #'" qmark = val.slice!(0,1) if val[-1] == qmark[0] then val.chop! else s = read_until(/#{qmark}/, val, 'attribute value') complete = false # always break here. end end end if on_attribute_value(key, val) then parse_error "doubled attribute `#{key}'" if attr.key? key attr[key] = val || true end elsif err then parse_error "parse error at `#{err.split(/\b|\s/,2)[0]}'" end } end until complete end unclosed_tag 'start tag' unless @src.tag_end? on_stag name, attr end DOCTYPEPattern = instance_eval { pidc = '[-\'()+,./:=?;!*#@$_% \\r\\na-zA-Z0-9]' pidc2 = pidc.delete("'") /\A([^\s\["']+)(?:\s+(?:SYSTEM|PUBLIC(?:\s+("#{pidc}*"|'#{pidc2}*'))?)\s+("[^"]*"?|'[^']*'?))\s*/i } def scan_doctype(s) unless DOCTYPEPattern =~ s then parse_error "parse error in DOCTYPE" return end root, pubid, sysid, s = $1, $2, $3, $' if pubid then pubid.chop! pubid[0,1] = '' pubid.gsub!(/\s+/, ' ') end if sysid then c = sysid.slice!(0,1) if c[0] == sysid[-1] then sysid.chop! else s = read_until(/#{c}\s*/, sysid, 'DOCTYPE') end end parse_error "parse error at `#{s.split(/\b|\s/,2)[0]}'" unless s.empty? unclosed_tag 'DOCTYPE' unless @src.tag_end? pubid, sysid = sysid, nil if pubid.nil? and sysid on_doctype root, pubid, sysid end def scan_prolog while s = @src.pop and not /\A