#
# htmlscan.rb
#
# Copyright (C) Ueno Katsuhiro 2000,2001
#
# $Id: htmlscan.rb,v 1.5 2001/01/04 12:36:04 katsu Exp $
#
require 'xmlscan'
class XMLScanner
module HTML
private
# Returns whatever PredefinedEntity holds for the entity name +ref+.
# on_attribute_value uses the result as the replacement text and treats a
# nil/false result as "this reference cannot be expanded".
# NOTE(review): PredefinedEntity is not defined in this file; presumably it
# comes from xmlscan and maps entity names to their literal text -- confirm.
def entityref_literal(ref)
PredefinedEntity[ref]
end
# Expands character and entity references inside an attribute value, in place.
#
# key:: attribute name; only a copy of it is stored, next to each reference
#       that could not be expanded.
# val:: attribute value String (modified destructively), or nil for an
#       attribute without a value, in which case nothing is done.
#
# Every `&name' / `&#...' (a trailing `;' is consumed when present) is
# replaced by the result of entityref_literal / parse_charref. When that
# result is nil or false the reference is removed from the value and
# [key, offset, name] is appended to @unexpanded_entityrefs, where offset is
# the position in the expanded value at which the reference was dropped.
#
# Always returns true.
def on_attribute_value(key, val)
  if val then
    # Running difference between expanded and original length; it converts
    # match offsets (which refer to the original text) into offsets in the
    # expanded text.
    shift = 0
    # The leading `#' is matched explicitly. With `[^;\s]+?\b' alone the lazy
    # match stops at the word boundary between `#' and the first digit, so
    # `&#65;' would be read as the reference `#' and `65;' would stay behind.
    val.gsub!(/&(#?[^;\s]+?\b);?/) { |matched|
      ref = $1
      if ref[0] == ?\# then
        rep = parse_charref(ref)
      else
        rep = entityref_literal(ref)
      end
      unless rep then
        @unexpanded_entityrefs ||= []
        @unexpanded_entityrefs.push [ key.dup, $~.begin(0) + shift, ref ]
        rep = ''
      end
      shift += rep.size - matched.size
      rep
    }
  end
  true
end
# Reports a run of character data, splitting out the references it contains.
#
# s:: the first chunk of text to report.
#
# Plain text goes to on_chardata, `&name' to on_entityref and `&#...' to
# parse_charref (a trailing `;' is consumed when present). An `&' that does
# not start a reference is reported as character data together with the
# text that follows it.
#
# After each chunk the loop stops when @src.tag_start? is true or when
# @src.pop returns nothing. Otherwise the popped chunk is processed next,
# preceded by a literal `>' unless the chunk is itself `>'.
def scan_content(s)
  while true
    if /&/ =~ s then
      head, tail = $`, $'
      on_chardata head unless head.empty?
      # Each piece is the text that followed one `&'.
      tail.split(/&/, -1).each { |piece|
        # The leading `#' is matched explicitly. With `[^;\s]+?\b' alone the
        # lazy match stops at the word boundary between `#' and the first
        # digit, so `#65;' would give the reference `#' and the stray
        # character data `65;'.
        if /\A(#?[^;\s]+?\b);?/ =~ piece then
          ref, piece = $1, $'
          if ref[0] == ?\# then
            parse_charref ref
          else
            on_entityref ref
          end
        else
          piece = '&' << piece
        end
        on_chardata piece unless piece.empty?
      }
    else
      on_chardata s
    end
    break if @src.tag_start?
    s = @src.pop
    break unless s
    on_chardata '>' unless s == '>'
  end
end
# Collects a processing instruction and reports it as on_pi('', text).
#
# s:: the first chunk of the instruction. Its first two characters
#     (presumably the `<?' opener) are removed in place, and the same String
#     object is then extended with the following chunks.
#
# Chunks are popped from @src until @src.tag_end? is true. A chunk that does
# not begin with `<' is joined with a `>' in front of it. Running out of
# chunks is reported through parse_error, and whatever was collected so far
# is still passed to on_pi.
def scan_pi(s)
  s.slice!(0, 2)
  body = s
  while true
    break if @src.tag_end?
    chunk = @src.pop
    if chunk then
      body << '>' unless chunk[0] == ?<
      body << chunk
    else
      parse_error "unterminated PI meets EOF"
      break
    end
  end
  on_pi '', body
end
# Parses one start tag and reports it as on_stag(name, attributes).
#
# s:: the tag text, beginning with its `<'. In the branch without any
#     whitespace or `/' the String is trimmed in place.
#
# attributes is a Hash from attribute name to value; an attribute written
# without a value is stored as true. Each attribute is first offered to
# on_attribute_value and is only stored when that returns a true value.
#
# Problems are reported through parse_error: a bare `<', an empty `<>',
# text that cannot be read as an attribute, and a repeated attribute name
# (the later value wins). A `<' that cannot be read as a tag at all is handed
# to on_chardata and the result of that call is returned instead.
#
# NOTE(review): read_until is defined elsewhere; from its use here it
# presumably appends the rest of an unterminated quoted value to the buffer
# it is given and returns the tag text that follows the closing quote --
# confirm.
def scan_stag(s)
  attrs = {}
  if /(?=[\/\s])/ =~ s then
    # The tag contains whitespace or `/': the name is what precedes it.
    tag, s = $`, $'
    tag[0,1] = ''
    if tag.empty? then # < tag
      parse_error "parse error at `<'"
      if /\A(?:(?!\n\s*\n)\s)*([^\/\s]+)/ =~ s then
        tag, s = $1, $'
      else
        return on_chardata('<' << s)
      end
    end
    # Scan the attribute list. An unterminated quoted value swallows the
    # rest of the current text, so after read_until has supplied the
    # continuation the scan is started again on what it returned.
    pending = true
    while pending
      pending = false
      s.scan(/\s+(?:([^=\s]+)(?:\s*=\s*('[^']*'?|"[^"]*"?|[^="'\s]+))?|\z)|\s*(.[^='"\s]*)/m) { |key, val, junk|
        if key then
          if val && (val[0] == ?" || val[0] == ?') then #'"
            quote = val.slice!(0,1)
            if val[-1] == quote[0] then
              val.chop!
            else
              s = read_until(/#{quote}/, val, 'attribute value')
              pending = true
            end
          end
          if on_attribute_value(key, val) then
            parse_error "doubled attribute `#{key}'" if attrs.key? key
            attrs[key] = val || true
          end
        elsif junk then
          parse_error "parse error at `#{junk.split(/\b|\s/,2)[0]}'"
        end
      }
    end
  else
    # No whitespace and no `/': everything after `<' is the name.
    tag = s
    tag[0,1] = ''
    if tag.empty? then # << or <>
      unless @src.tag_end? then
        parse_error "parse error at `<'"
        return on_chardata('<' + s)
      end
      parse_error "found an empty start tag `<>'"
    end
  end
  unclosed_tag 'start tag' unless @src.tag_end?
  on_stag tag, attrs
end
# Pattern for the text of a DOCTYPE declaration, matched case-insensitively
# from the start of the string:
#   $1  the root element name,
#   $2  the public identifier literal, quotes included (only after PUBLIC,
#       and only when a second literal follows it),
#   $3  the following literal, with its opening quote and, if it was
#       found, its closing quote.
# The SYSTEM/PUBLIC part is mandatory. Trailing whitespace is consumed.
# The helper strings are kept inside the lambda so they do not become
# locals of the enclosing scope.
DOCTYPEPattern = lambda {
  # Character class allowed in a double-quoted public identifier ...
  pubid_char = '[-\'()+,./:=?;!*#@$_% \\r\\na-zA-Z0-9]'
  # ... and the same class without the apostrophe, for single-quoted ones.
  pubid_char_sq = pubid_char.delete("'")
  /\A([^\s\["']+)(?:\s+(?:SYSTEM|PUBLIC(?:\s+("#{pubid_char}*"|'#{pubid_char_sq}*'))?)\s+("[^"]*"?|'[^']*'?))\s*/i
}.call
# Parses the text of a DOCTYPE declaration and reports it as
# on_doctype(root, pubid, sysid).
#
# s:: declaration text, starting at the root element name (this is what
#     DOCTYPEPattern is anchored to).
#
# Text that does not match DOCTYPEPattern is reported with parse_error and
# nothing else happens (nil is returned). Otherwise the quotes are stripped
# from both identifiers, whitespace runs in the public identifier are
# collapsed to single spaces, leftover text after the identifiers is reported
# with parse_error, and a missing tag end is reported with unclosed_tag.
#
# NOTE(review): read_until is defined elsewhere; from its use here it
# presumably appends the rest of an unterminated literal to the buffer it is
# given and returns the text that follows the closing quote -- confirm.
def scan_doctype(s)
  unless DOCTYPEPattern =~ s then
    parse_error "parse error in DOCTYPE"
    return
  end
  root, pubid, sysid, rest = $1, $2, $3, $'
  # DOCTYPEPattern does not capture the keyword. It directly follows the
  # root name, which the pattern anchors at the start of the text.
  is_public = (/\A\s+PUBLIC/i =~ s[root.size..-1]) ? true : false
  if pubid then
    # The pattern only captures a public identifier with both of its quotes.
    pubid.chop!
    pubid[0,1] = ''
    pubid.gsub!(/\s+/, ' ')
  end
  if sysid then
    quote = sysid.slice!(0,1)
    if quote[0] == sysid[-1] then
      sysid.chop!
    else
      # The closing quote is optional in the pattern; fetch the rest.
      rest = read_until(/#{quote}\s*/, sysid, 'DOCTYPE')
    end
  end
  parse_error "parse error at `#{rest.split(/\b|\s/,2)[0]}'" unless rest.empty?
  unclosed_tag 'DOCTYPE' unless @src.tag_end?
  # `PUBLIC "pubid"' without a system identifier leaves its only literal in
  # the second capture, so it is moved over. This must not happen for
  # `SYSTEM "sysid"', whose only literal really is the system identifier.
  pubid, sysid = sysid, nil if is_public && pubid.nil? && sysid
  on_doctype root, pubid, sysid
end
def scan_prolog
while s = @src.pop and not /\A