#!/usr/bin/ruby
$:.push './html-parser'
require 'htmlscan'
require 'sgml-parser'
# Benchmark adapter: gives LooseHTMLScanner the same +parse_ary+ entry point
# that the benchmark loop invokes on every parser class.
class LooseHTMLScanner
  # Hands the array +a+ to the scanner's own +parse+ unchanged and returns
  # whatever +parse+ returns.
  def parse_ary(a)
    parse(a)
  end
end
# Benchmark adapter: gives SGMLParser the +parse+ / +parse_ary+ entry points
# that the benchmark loop invokes on every parser class.
class SGMLParser
  # Whole-input entry point: +parse+ is simply another name for +feed+.
  alias parse feed

  # Chunked entry point: calls +feed+ once per element of +a+, in order.
  # Returns +a+.
  def parse_ary(a)
    a.each { |chunk| feed chunk }
  end
end
# Parser classes to benchmark; header names and timing columns are emitted in
# this order.
Entries = [SGMLParser, LooseHTMLScanner]
require 'nkf'
#$KCODE = 'E'
# Times a single parser call.
#
# klass  - class to benchmark; a fresh instance is built with +klass.new+.
# method - name of the instance method to invoke on that instance.
# arg    - the single argument passed to that method.
#
# Returns a String: the user CPU time (seconds) consumed by the call,
# formatted with two decimals, or "=ExceptionClass=" if instantiating or
# calling the parser raised. Interrupt is re-raised so the run can still be
# stopped from the keyboard.
def benchmark(klass, method, arg)
  scanner = klass.new
  # Process.times replaces Time.times, which no longer exists on Ruby >= 1.9.
  started = Process.times.utime
  scanner.send method, arg
  finished = Process.times.utime
  sprintf "%.2f", finished - started
rescue Interrupt
  raise
rescue Exception => e
  # Deliberately broad: any failure inside a parser (including errors that are
  # not StandardError) is reported in the result column instead of aborting
  # the whole benchmark run.
  # Object#type was removed in Ruby 1.9; #class works on every version.
  sprintf "=%s=", e.class
end
# Build the list of input files: with `-f LISTFILE` read one path per line
# from LISTFILE, otherwise treat every command-line argument as a path.
if ARGV[0] == '-f'
  list_path = ARGV[1]
  abort "usage: #{$0} -f LISTFILE | FILE..." if list_path.nil?
  # File.readlines closes the list file once it has been read; the previous
  # File.open(...).readlines left the handle open.
  files = File.readlines(list_path).map { |line| line.chomp }
else
  files = ARGV
end
# Flush after every write so each column appears as soon as it is measured.
STDOUT.sync = true

# Header: the benchmarked class names, then the two input modes.
print Entries.collect { |klass| klass.name }.join("\t"), "\n"
print "at once (read)\tby line (gets)\n"

# One row per file: path, size of the converted text, number of lines, then
# for each parser class the whole-string timing followed by the line-array
# timing.
files.each { |path|
  print path
  src = File.open(path.chomp) { |f|
    # Skip a leading HTTP header: read up to and including the first
    # CRLF CRLF separator.
    head = f.gets("\r\n\r\n")
    rest = f.read
    # When nothing follows (no separator was found, so gets consumed the whole
    # file) the text already read is the document. IO#read returns "" rather
    # than nil at end of file on current Ruby, so test for emptiness instead
    # of relying on `or`. An empty file yields "" rather than nil.
    if rest.nil? || rest.empty?
      head || ""
    else
      rest
    end
  }
  # NOTE(review): nkf flags as given by the original author; -e selects EUC-JP
  # output, the remaining flags (-d, -x, -m0) should be checked against the
  # nkf documentation.
  src = NKF.nkf('-dexm0', src)
  print "\t", src.size
  # String#to_a no longer exists on Ruby >= 1.9; split into lines explicitly.
  src_a = src.lines.to_a
  print "\t", src_a.size
  Entries.each { |klass|
    print "\t", benchmark(klass, :parse, src)
    print "\t", benchmark(klass, :parse_ary, src_a)
  }
  print "\n"
}