'''
htmlParser.py

Copyright 2006 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

'''

import core.controllers.outputManager as om
from core.controllers.w3afException import w3afException
import core.data.kb.config as cf

# Prefer the bundled utidy library and fall back to a system-wide one.
# "except Exception" (instead of a bare "except:") keeps the fallback for any
# failure while loading the library, but no longer swallows
# KeyboardInterrupt / SystemExit.
try:
    import extlib.utidylib as tidy
    om.out.debug('htmlParser is using the bundled utidy library')
except Exception:
    try:
        import tidy
        om.out.debug('htmlParser is using the systems utidy library')
    except Exception:
        raise w3afException('You have to install utidy lib.')

from core.data.parsers.abstractParser import abstractParser
import core.data.parsers.urlParser as urlParser
import core.data.dc.form as form


class htmlParser(abstractParser):
    '''
    This class parses HTML's.

    @author: Andres Riancho ( andres.riancho@gmail.com )
    '''

    def __init__(self, document, baseUrl, useTidy=True, verbose=0):
        '''
        Set up the HTML specific tables and delegate to abstractParser.

        @parameter document: Forwarded unchanged to abstractParser.__init__.
        @parameter baseUrl: Forwarded unchanged to abstractParser.__init__.
        @parameter useTidy: Forwarded unchanged to abstractParser.__init__.
        @parameter verbose: Forwarded unchanged to abstractParser.__init__.
        '''
        # Keyword arguments handed to tidy.parseString() by _preParse().
        self._tidyOptions = dict(tidy_mark=0,
                                 quote_ampersand=0, quote_marks=0,
                                 quote_nbsp=0, force_output=1)

        # NOTE(review): the three tables below are not read by any code that
        # is visible in this part of the file; judging by their names they
        # list the tags / attributes that may carry URLs and the accepted
        # <input> types - confirm against the code that consumes them.
        self._tagsContainingURLs = ('a', 'img', 'link', 'script', 'iframe',
                                    'object', 'embed', 'area', 'frame',
                                    'applet', 'input', 'base', 'div', 'layer',
                                    'ilayer', 'bgsound', 'form')
        self._urlAttrs = ('href', 'src', 'data', 'action')
        self._goodInputType = ['password', 'text', 'checkbox', 'radio',
                               'hidden', 'file', 'submit']

        # The attributes above are assigned before the base class constructor
        # runs; keep this order.
        abstractParser.__init__(self, document, baseUrl, useTidy, verbose)

    def _preParse(self, HTMLDocument):
        '''
        Optionally clean the document with tidy and then parse it.

        @parameter HTMLDocument: The HTML to parse.
        '''
        # NOTE: assert statements are removed when python runs with -O.
        assert self._baseUrl != '', 'The base URL must be setted.'

        # If tidy is on, we should "fix" the html with it
        if self._useTidy:
            HTMLDocument = str(tidy.parseString(HTMLDocument, **self._tidyOptions))

        # Now we are ready to work
        self._parse(HTMLDocument)

    def getForms(self, HTML):
        '''
        @parameter HTML: Accepted for interface compatibility; it is not read
                         by this method.
        @return: Returns self._forms, the forms collected by the parser.
        '''
        return self._forms

    def getReferences(self, HTML):
        '''
        Searches for references on a page. w3af searches references in every
        html tag, including:
            - a
            - forms
            - images
            - frames
            - etc.

        @parameter HTML: Accepted for interface compatibility; it is not read
                         by this method.
        @return: Returns a set built from self._urlsInDocument.
        '''
        return set(self._urlsInDocument)

    def getComments(self, HTML):
        '''
        @parameter HTML: Accepted for interface compatibility; it is not read
                         by this method.
        @return: Returns a set built from self._commentsInDocument.
        '''
        return set(self._commentsInDocument)

    def getMetaRedir(self, HTML):
        '''
        @parameter HTML: Accepted for interface compatibility; it is not read
                         by this method.
        @return: Returns a set built from self._metaRedirs.
        '''
        return set(self._metaRedirs)

    def unknown_endtag(self, tag):
        '''
        called for each end tag, e.g. for </pre>, tag will be "pre"

        Clears the "inside form" / "inside select" flags when the matching
        closing tag is seen.
        '''
        closedTag = tag.lower()
        if closedTag == 'form':
            self._insideForm = False
        if closedTag == 'select':
            self._insideSelect = False

    def _findForms(self, tag, attrs):
        '''
        This method finds forms inside an HTML document.
        '''

        '''
        '''
        if tag.lower() == 'form' :
            #Find the method
            method = 'GET'
            foundMethod = False
            for attr in attrs:
                if attr[0].lower() == 'method':
                    method = attr[1].upper()
                    foundMethod = True

            if not foundMethod:
                om.out.debug('htmlParser found a form without a method. Using GET as the default.')

            #Find the action
            foundAction = False
            for attr in attrs:
                if attr[0].lower() == 'action':
                    action = urlParser.urlJoin( self._baseUrl ,attr[1] )
                    foundAction = True

            if not foundAction:
                om.out.debug('htmlParser found a form without an action. Javascript is being used.')
                #