require 'htree/htmlinfo'
require 'htree/regexp-util'
require 'htree/fstr'

module HTree
  # :stopdoc:

  # Regular expressions for the markup constructs recognized by HTree.scan.
  #
  # Constants ending in +_C+ keep their capture groups; the same-named
  # constant without the suffix is produced by +disable_capture+
  # (from htree/regexp-util -- presumably a copy of the pattern with its
  # groups made non-capturing; confirm there) so that it can be embedded in
  # the master scan pattern without shifting group numbers.
  module Pat
    NameChar = /[-A-Za-z0-9._:]/
    Name = /[A-Za-z_:]#{NameChar}*/
    Nmtoken = /#{NameChar}+/

    # <!-- ... -->; /m lets the body span lines.
    Comment_C = /<!--(.*?)-->/m
    Comment = Comment_C.disable_capture
    # <![CDATA[ ... ]]>; /m lets the body span lines.
    CDATA_C = /<!\[CDATA\[(.*?)\]\]>/m
    CDATA = CDATA_C.disable_capture

    # name = "value" or name = 'value'
    QuotedAttr_C = /(#{Name})\s*=\s*(?:"([^"]*)"|'([^']*)')/
    QuotedAttr = QuotedAttr_C.disable_capture
    # Additionally accepts an unquoted NameChar value, or a bare Nmtoken.
    ValidAttr_C = /(#{Name})\s*=\s*(?:"([^"]*)"|'([^']*)'|(#{NameChar}*))|(#{Nmtoken})/
    ValidAttr = ValidAttr_C.disable_capture
    # Lenient form: quoted values may not contain < or >, and an unquoted
    # value runs up to whitespace, <, >, or a quote character.
    InvalidAttr1_C = /(#{Name})\s*=\s*(?:'([^'<>]*)'|"([^"<>]*)"|([^\s<>"']*(?![^\s<>"'])))|(#{Nmtoken})/
    InvalidAttr1 = InvalidAttr1_C.disable_capture
    # Trailing attribute whose opening quote is never closed.
    InvalidAttr1End_C = /(#{Name})(?:\s*=\s*(?:'([^'<>]*)|"([^"<>]*)))/
    InvalidAttr1End = InvalidAttr1End_C.disable_capture

    QuotedStartTag_C = /<(#{Name})((?:\s+#{QuotedAttr})*)\s*>/
    QuotedStartTag = QuotedStartTag_C.disable_capture
    ValidStartTag_C = /<(#{Name})((?:\s+#{ValidAttr})*)\s*>/
    ValidStartTag = ValidStartTag_C.disable_capture
    InvalidStartTag_C = /<(#{Name})((?:(?:\b|\s+)#{InvalidAttr1})*)((?:\b|\s+)#{InvalidAttr1End})?\s*>/
    InvalidStartTag = InvalidStartTag_C.disable_capture
    StartTag = /#{QuotedStartTag}|#{ValidStartTag}|#{InvalidStartTag}/

    QuotedEmptyTag_C = %r{<(#{Name})((?:\s+#{QuotedAttr})*)\s*/>}
    QuotedEmptyTag = QuotedEmptyTag_C.disable_capture
    ValidEmptyTag_C = %r{<(#{Name})((?:\s+#{ValidAttr})*)\s*/>}
    ValidEmptyTag = ValidEmptyTag_C.disable_capture
    InvalidEmptyTag_C = %r{<(#{Name})((?:(?:\b|\s+)#{InvalidAttr1})*)((?:\b|\s+)#{InvalidAttr1End})?\s*/>}
    InvalidEmptyTag = InvalidEmptyTag_C.disable_capture
    EmptyTag = /#{QuotedEmptyTag}|#{ValidEmptyTag}|#{InvalidEmptyTag}/

    # </name>
    EndTag_C = %r{</(#{Name})\s*>}
    EndTag = EndTag_C.disable_capture

    XmlVersionNum = /[a-zA-Z0-9_.:-]+/
    XmlVersionInfo_C = /\s+version\s*=\s*(?:'(#{XmlVersionNum})'|"(#{XmlVersionNum})")/
    XmlVersionInfo = XmlVersionInfo_C.disable_capture
    XmlEncName = /[A-Za-z][A-Za-z0-9._-]*/
    XmlEncodingDecl_C = /\s+encoding\s*=\s*(?:"(#{XmlEncName})"|'(#{XmlEncName})')/
    XmlEncodingDecl = XmlEncodingDecl_C.disable_capture
    XmlSDDecl_C = /\s+standalone\s*=\s*(?:'(yes|no)'|"(yes|no)")/
    XmlSDDecl = XmlSDDecl_C.disable_capture
    XmlDecl_C = /<\?xml#{XmlVersionInfo_C}#{XmlEncodingDecl_C}?#{XmlSDDecl_C}?\s*\?>/
    XmlDecl = /<\?xml#{XmlVersionInfo}#{XmlEncodingDecl}?#{XmlSDDecl}?\s*\?>/

    # xxx: internal DTD subset is not recognized: '[' (markupdecl | DeclSep)* ']' S?)?
    SystemLiteral_C = /"([^"]*)"|'([^']*)'/
    PubidLiteral_C = %r{"([\sa-zA-Z0-9\-'()+,./:=?;!*\#@$_%]*)"|'([\sa-zA-Z0-9\-()+,./:=?;!*\#@$_%]*)'}
    ExternalID_C = /(?:SYSTEM|PUBLIC\s+#{PubidLiteral_C})(?:\s+#{SystemLiteral_C})?/
    # Capture layout relied upon by HTree.scan:
    #   1 = root element name, 2/3 = public identifier (double/single quoted),
    #   4/5 = system identifier (double/single quoted).
    # A bracketed internal subset, if present, is skipped over without being
    # parsed. NOTE(review): the subset-skipping group is a reconstruction;
    # confirm against the upstream htree source.
    DocType_C = /<!DOCTYPE\s+(#{Name})(?:\s+#{ExternalID_C})?\s*(?:\[.*?\]\s*)?>/m
    DocType = DocType_C.disable_capture

    XmlProcIns_C = /<\?(#{Name})(?:\s+(.*?))?\?>/m
    XmlProcIns = XmlProcIns_C.disable_capture
    #ProcIns = /<\?([^>]*)>/m
  end

  # Tokenizes +input+ into markup and text pieces.
  #
  # input::  the String to scan.
  # is_xml:: initial value of the "document is XML" flag (default false).
  #
  # Yields two-element arrays <tt>[kind, string]</tt> in document order,
  # where +string+ is the result of HTree.frozen_string on the matched
  # source text and +kind+ is one of :xmldecl, :doctype, :procins, :stag,
  # :etag, :emptytag, :comment, :text_cdata_section, :text_pcdata or
  # :text_cdata_content.
  #
  # Returns <tt>[is_xml, is_html]</tt>:
  # * is_xml becomes true on an XML declaration, on a DOCTYPE whose public
  #   identifier starts with "-//W3C//DTD XHTML ", or when the first start
  #   tag is not one of the listed HTML element names.
  # * is_html becomes true on a DOCTYPE whose root element name is "html"
  #   (case-insensitive), or when the first start tag is one of the listed
  #   HTML element names.
  #
  # Raises Exception if the master pattern matches but no known group did
  # (an internal inconsistency).
  def HTree.scan(input, is_xml=false)
    is_html = false
    # Name of the element whose raw (CDATA) content is currently being
    # skipped, or nil when scanning ordinary markup.
    cdata_content = nil
    # Offset in +input+ where the not-yet-emitted text run begins.
    text_start = 0
    first_element = true

    # Group numbers of the alternatives in the master pattern below; every
    # embedded pattern is capture-free, so these stay stable.
    index_xmldecl = 1
    index_doctype = 2
    index_xmlprocins = 3
    index_quotedstarttag = 4
    index_quotedemptytag = 5
    index_starttag = 6
    index_endtag = 7
    index_emptytag = 8
    index_comment = 9
    index_cdata = 10

    input.scan(/(#{Pat::XmlDecl})
               |(#{Pat::DocType})
               |(#{Pat::XmlProcIns})
               |(#{Pat::QuotedStartTag})
               |(#{Pat::QuotedEmptyTag})
               |(#{Pat::StartTag})
               |(#{Pat::EndTag})
               |(#{Pat::EmptyTag})
               |(#{Pat::Comment})
               |(#{Pat::CDATA})
               /ox) {
      match = $~
      if cdata_content
        # Inside a CDATA-content element: every match is ignored (and so
        # remains part of the pending text) until the end tag naming the
        # same element appears.
        str = $&
        if match.begin(index_endtag) && str[Pat::Name] == cdata_content
          text_end = match.begin(0)
          if text_start < text_end
            yield [:text_cdata_content, HTree.frozen_string(input[text_start...text_end])]
          end
          yield [:etag, HTree.frozen_string(str)]
          text_start = match.end(0)
          cdata_content = nil
        end
      else
        str = match[0]
        # Emit the text lying between the previous match and this one.
        text_end = match.begin(0)
        if text_start < text_end
          yield [:text_pcdata, HTree.frozen_string(input[text_start...text_end])]
        end
        text_start = match.end(0)

        if match.begin(index_xmldecl)
          yield [:xmldecl, HTree.frozen_string(str)]
          is_xml = true
        elsif match.begin(index_doctype)
          # Re-match with the capturing pattern to extract the pieces
          # (groups 4/5 would hold the system identifier; it is not used).
          Pat::DocType_C =~ str
          root_element_name = $1
          public_identifier = $2 || $3
          is_html = true if /\Ahtml\z/i =~ root_element_name
          is_xml = true if public_identifier && %r{\A-//W3C//DTD XHTML } =~ public_identifier
          yield [:doctype, HTree.frozen_string(str)]
        elsif match.begin(index_xmlprocins)
          yield [:procins, HTree.frozen_string(str)]
        elsif match.begin(index_starttag) || match.begin(index_quotedstarttag)
          yield [:stag, HTree.frozen_string(str)]
          tagname = str[Pat::Name]
          if first_element
            # The first start tag decides the flavour when nothing earlier did.
            if /\A(?:html|head|title|isindex|base|script|style|meta|link|object)\z/i =~ tagname
              is_html = true
            else
              is_xml = true
            end
            first_element = false
          end
          # ElementContent is not defined in this file (presumably provided
          # by htree/htmlinfo). In non-XML mode, elements it marks :CDATA
          # have their content passed through as raw text.
          if !is_xml && ElementContent[tagname] == :CDATA
            cdata_content = tagname
          end
        elsif match.begin(index_endtag)
          yield [:etag, HTree.frozen_string(str)]
        elsif match.begin(index_emptytag) || match.begin(index_quotedemptytag)
          yield [:emptytag, HTree.frozen_string(str)]
          first_element = false
          #is_xml = true
        elsif match.begin(index_comment)
          yield [:comment, HTree.frozen_string(str)]
        elsif match.begin(index_cdata)
          yield [:text_cdata_section, HTree.frozen_string(str)]
        else
          raise Exception, "unknown match [bug]"
        end
      end
    }

    # Flush whatever text follows the last match.
    text_end = input.length
    if text_start < text_end
      if cdata_content
        yield [:text_cdata_content, HTree.frozen_string(input[text_start...text_end])]
      else
        yield [:text_pcdata, HTree.frozen_string(input[text_start...text_end])]
      end
    end

    return is_xml, is_html
  end
  # :startdoc:
end