'''
abstractParser.py

Copyright 2006 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

'''

import core.controllers.outputManager as om
from core.controllers.w3afException import w3afException
from sgmllib import SGMLParser
import core.data.dc.form as form
import core.data.parsers.urlParser as urlParser
import string


class abstractParser(SGMLParser):
    '''
    This class is an abstract document parser.

    @author: Andres Riancho ( andres.riancho@gmail.com )
    '''

    def __init__(self, document, baseUrl, useTidy=True, verbose=0):
        '''
        Set up the parser state and start working on the document.

        @parameter document: The document; it is handed to self._preParse().
        @parameter baseUrl: The URL that relative references are joined with.
        @parameter useTidy: Stored in self._useTidy.
        @parameter verbose: Forwarded to SGMLParser.__init__().
        '''
        SGMLParser.__init__(self, verbose)

        # Tags and attributes that are inspected when looking for URLs
        self._tagsContainingURLs = ('go', 'a','img', 'link', 'script', 'iframe', 'object',
                'embed', 'area', 'frame', 'applet', 'input', 'base',
                'div', 'layer', 'ilayer', 'bgsound', 'form')
        self._urlAttrs = ('href', 'src', 'data', 'action' )

        # Everything that is collected while the document is parsed
        self._urlsInDocument = []
        self._forms = []
        self._insideForm = False
        self._insideSelect = False
        self._commentsInDocument = []
        self._metaRedirs = []

        # "setBaseUrl"
        self._baseUrl = baseUrl
        self._baseDomain = urlParser.getDomain(baseUrl)

        self._useTidy = useTidy

        # Now we are ready to work
        self._preParse( document )

    def unknown_starttag(self, tag, attrs):
        '''
        Called for each start tag.

        attrs is a list of (attr, value) tuples
        e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]

        Note that improperly embedded non-HTML code (like client-side Javascript)
        may be parsed incorrectly by the ancestor, causing runtime script errors.
        All non-HTML code must be enclosed in HTML comment tags (<!-- code -->)
        to ensure that it will pass through this parser unaltered (in handle_comment).
        '''
        # Every start tag is offered to each of the three finders, in this order.
        self._findReferences(tag, attrs)
        self._findForms(tag, attrs)
        self._findMetaRedir(tag, attrs)
def _findMetaRedir( self, tag, attrs):
'''
Find meta tag redirections, like this one:
'''
if tag.lower() == 'meta':
hasHTTP_EQUIV = False
hasContent = False
content = ''
for attr in attrs:
if attr[0].lower() == 'http-equiv' and attr[1].lower() == 'refresh':
hasHTTP_EQUIV = True
if attr[0].lower() == 'content':
hasContent = True
content = attr[1].lower()
if hasContent and hasHTTP_EQUIV:
self._metaRedirs.append( content )
def _findReferences(self, tag, attrs):
'''
This method finds references inside an document document.
'''
if tag.lower() in self._tagsContainingURLs:
for attr in attrs:
if attr[0].lower() in self._urlAttrs:
if len( attr[1] ):
if attr[1][0] != '#':
self._urlsInDocument.append ( urlParser.urlJoin( self._baseUrl ,attr[1] ) )
break
def _parse(self, s):
'''
This method parses the document.
@parameter s: The document to parse.
'''
try:
self.feed(s)
self.close()
except Exception, e:
raise w3afException('Exception found while parsing document. Exception: ' + str(e) )
else:
# Saves A LOT of memory
# without this a run will use 4,799,936
# with this, a run will use 113,696
del self.rawdata
def getAccounts( self , documentString ):
    '''
    Collect the mail accounts of the base domain that appear in a document.

    Every token of the form "user@<self._baseDomain>" contributes "user" to
    the result. Tokens that start with "@" and addresses that belong to a
    different domain (or sub domain) are ignored.

    @parameter documentString: The text (for example HTML) to search in.
    @return: A list with all mail users that are present in the documentString,
    without duplicates and in order of first appearance.
    '''
    accounts = []

    # Fast path: without an "@" there can not be any mail address
    if '@' not in documentString:
        return accounts

    # The tokenizing idea was taken from Sergio Alvarez fingergoogle.py
    # Markup and punctuation that may be glued to an address become blanks.
    # '<br>' is listed before the lone '<' and '>' so it is still intact
    # when its turn comes.
    for separator in ('=', '"', '\'', '<br>', '[', ']', '<', '>', ':', ';',
                      '&', '(', ')', '{', '}'):
        documentString = documentString.replace(separator, ' ')

    mailSuffix = '@' + self._baseDomain

    # split() without arguments breaks on any run of whitespace, so a "\r"
    # from CRLF line endings or a tab never stays attached to an address.
    for token in documentString.split():
        if mailSuffix not in token:
            continue

        pieces = token.split('@')
        user = pieces[0]

        # An empty user means that the token started with "@"
        if not user:
            continue
        # The text right after the first "@" has to be exactly the domain
        if pieces[1] != self._baseDomain:
            continue

        # If account aint in account list , then add
        if user not in accounts:
            accounts.append(user)

    return accounts
def handle_comment( self , text ):
    '''
    This method is called by the parser when a comment is found; the text
    of the comment is stored at the end of self._commentsInDocument.

    @parameter text: The text of the comment.
    '''
    foundComments = self._commentsInDocument
    foundComments.append( text )