''' xUrllib.py Copyright 2006 Andres Riancho This file is part of w3af, w3af.sourceforge.net . w3af is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation version 2 of the License. w3af is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with w3af; if not, write to the Free Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA ''' import urlOpenerSettings import core.controllers.outputManager as om import core.data.url.timeAnalysis as timeAnalysis from core.controllers.w3afException import w3afException from core.controllers.threads.threadManager import threadManager as tm from core.data.parsers.urlParser import * from core.data.constants.httpConstants import * import core.data.parsers.urlParser as urlParser import core.data.request.httpPostDataRequest as httpPostDataRequest import core.data.request.httpQsRequest as httpQsRequest from core.data.url.httpResponse import httpResponse as httpResponse from core.controllers.misc.lru import LRU # For subclassing requests and other things import urllib2 # for better debugging of handlers import traceback class sizeExceeded( Exception ): pass class xUrllib: ''' This is a urllib2 wrapper. @author: Andres Riancho ( andres.riancho@gmail.com ) ''' def __init__(self): self.settings = urlOpenerSettings.urlOpenerSettings() self._opener = None self._cacheOpener = None self._timeAnalysis = None # FIXME: Not thread safe self._errorCount = 0 # End of not thread safe self._dnsCache() self._tm = tm() self._sizeLRU = LRU(200) # User configured options (in an indirect way) self._grepPlugins = [] self._evasionPlugins = [] def _dnsCache( self ): ''' DNS cache trick This will speed up all the test ! Before this dns cache voodoo magic every request to the http server needed a dns query, this is slow on some networks so I added this feature. This method was taken from: # $Id: download.py,v 1.30 2004/05/13 09:55:30 torh Exp $ That is part of : swup-0.0.20040519/ Developed by: # Copyright 2001 - 2003 Trustix AS - # Copyright 2003 - 2004 Tor Hveem - # Copyright 2004 Omar Kilani for tinysofa - ''' om.out.debug('Enabling _dnsCache()') import socket if not hasattr( socket, 'alreadyConfigured' ): socket._getaddrinfo = socket.getaddrinfo _dns_cache = LRU(200) def _caching_getaddrinfo(*args, **kwargs): try: query = (args) res = _dns_cache[query] #This was too noisy and not so usefull #om.out.debug('Cached DNS response for domain: ' + query[0] ) return res except KeyError: res = socket._getaddrinfo(*args, **kwargs) _dns_cache[args] = res om.out.debug('DNS response from DNS server for domain: ' + query[0] ) return res if not hasattr( socket, 'alreadyConfigured' ): socket.getaddrinfo = _caching_getaddrinfo socket.alreadyConfigured = True def _init( self ): if self.settings.needUpdate or \ self._opener == None or self._cacheOpener == None: self.settings.needUpdate = False self.settings.buildOpeners() self._opener = self.settings.getCustomUrlopen() self._cacheOpener = self.settings.getCachedUrlopen() self._timeAnalysis = timeAnalysis.timeAnalysis() def getHeaders( self, uri ): ''' Returns a dict with the headers that would be used when sending a request to the remote server. ''' req = urllib2.Request( uri ) req = self._addHeaders( req ) return req.headers def GET(self, uri, data='', headers={}, useCache=False, grepResult=True, getSize=False ): ''' Gets a uri using a proxy, user agents, and other settings that where set previously. @param uri: This is the url to GET @param data: A Query String object with the data for the GET, the query string. @return: An httpResponse object. ''' self._init() if len( data ) > 0: req = urllib2.Request(uri + '?' + data ) else: req = urllib2.Request(uri ) req = self._addHeaders( req, headers ) if getSize: # Check the file size try: self._checkFileSize( req ) except sizeExceeded, se: return httpResponse( NO_CONTENT, '', {}, uri, uri ) except Exception, e: raise e return self._send( req , useCache=useCache, grepResult=grepResult) def POST(self, uri, data='', headers={}, grepResult=True, getSize=False ): ''' POST's data to a uri using a proxy, user agents, and other settings that where set previously. @param uri: This is the url where to post. @param data: A string with the data for the POST. @return: An httpResponse object. ''' self._init() req = urllib2.Request(uri, data ) req = self._addHeaders( req, headers ) if getSize: # Check the file size try: self._checkFileSize( req ) except sizeExceeded, se: return httpResponse( NO_CONTENT, '', {}, uri, uri ) except Exception, e: raise e return self._send( req , grepResult=grepResult) def getRemoteFileSize( self, uri, headers={}, useCache=True ): ''' @return: The file size of the remote file. ''' res = self.HEAD( uri, headers=headers, useCache=useCache ) fileLen = None for i in res.getHeaders(): if i.lower() == 'content-length': fileLen = res.getHeaders()[ i ] if fileLen.isdigit(): fileLen = int( fileLen ) else: msg = 'The content length header value of the response wasn\'t an integer... this is strange... The value is: ' + res.getHeaders()[ i ] om.out.error( msg ) raise w3afException( msg ) if fileLen != None: return fileLen else: om.out.debug( 'The response didn\'t contain a content-length header. Unable to return the remote file size of request with id: ' + str(res.id) ) # I prefer to fetch the file, before this om.out.debug was a "raise w3afException", but this didnt make much sense return 0 def __getattr__( self, methodName ): ''' This is a "catch-all" way to be able to handle every HTTP method. ''' class anyMethod: class methodRequest(urllib2.Request): def get_method(self): return self._method def set_method( self, method ): self._method = method def __init__( self, xu, method ): self._xurllib = xu self._method = method #(self, uri, data='', headers={}, useCache=False, grepResult=True, getSize=False ) def __call__( self, *args, **keywords ): if len( args ) != 1: raise w3afException('Invalid number of arguments. This method receives one argument and N keywords.') uri = args[0] self._xurllib._init() if 'data' in keywords: req = self.methodRequest( uri, keywords['data'] ) keywords.pop('data') else: req = self.methodRequest( uri ) req.set_method( self._method ) if 'headers' in keywords: req = self._xurllib._addHeaders( req, keywords['headers'] ) keywords.pop('headers') om.out.debug( req.get_method() + ' ' + uri) # def _send( self , req , useCache=False, useMultipart=False, grepResult=True ) return self._xurllib._send( req, keywords ) am = anyMethod( self, methodName ) return am def _addHeaders( self , req, headers={} ): # Add all custom Headers if they exist for i in self.settings.HeaderList: req.add_header( i[0], i[1] ) for h in headers.keys(): req.add_header( h, headers[h] ) return req def _checkURI( self, req ): if req.get_full_url().find( 'http' ) == 0: return True else: if not req.get_full_url().count( 'javascript:' ) and not req.get_full_url().count( 'mailto:' ): raise w3afException('Unsupported URL: ' + req.get_full_url() ) def _checkFileSize( self, req ): # No max file size. if self.settings.getMaxFileSize() == 0: pass else: # This will speed up the most frequent request, no HEAD is done to the last recently used URL's if req.get_full_url() not in self._sizeLRU: size = self.getRemoteFileSize( req.get_full_url() ) self._sizeLRU[ req.get_full_url() ] = size else: size = self._sizeLRU[ req.get_full_url() ] #om.out.debug('Size of response got from self._sizeLRU.') if self.settings.getMaxFileSize() < size : msg = 'File size of URL: ' + req.get_full_url() + ' exceeds the configured file size limit. Max size limit is: ' + str(greek(self.settings.getMaxFileSize())) + ' and file size is: ' + str(greek(size)) + ' .' om.out.debug( msg ) raise sizeExceeded( msg ) def _send( self , req , useCache=False, useMultipart=False, grepResult=True ): # Sanitize the URL self._checkURI( req ) # Evasion originalUrl = req._Request__original req._Request__original = self._evasion( req._Request__original ) res = None try: if useCache: res = self._cacheOpener.open( req ) else: res = self._opener.open( req ) except urllib2.URLError, e: # I get to this section of the code if a 400 error is returned # also possible when a proxy is configured and not available # also possible when auth credentials are wrong for the URI if hasattr(e, 'reason'): if e.reason[0] == -2: raise w3afException('Failed to resolve domain name for URL: ' + req.get_full_url() ) else: om.out.debug( 'We failed to reach the server. Reason: "' + str(e.reason) + '"; going to retry.') om.out.debug( 'Traceback for this error: ' + str( traceback.format_exc() ) ) req._Request__original = originalUrl return self._retry( req, useCache ) elif hasattr(e, 'code'): om.out.debug( req.get_method() + ' ' + originalUrl +' returned HTTP code "' + str(e.code) + '"' ) # Return this info to the caller code = int(e.code) info = e.info() geturl = e.geturl() read = self._readRespose( e ) httpResObj = httpResponse(code, read, info, geturl, originalUrl, id=e.id ) if grepResult: self._grepResult( req, httpResObj ) else: om.out.debug('No grep for : ' + geturl + ' , the plugin sent grepResult=False.') return httpResObj except KeyboardInterrupt, k: # Correct control+c handling... raise k except Exception, e: # This except clause will catch errors like # "(-3, 'Temporary failure in name resolution')" # "(-2, 'Name or service not known')" # for now... i think re-raising the exception is the best solution # contact me at andres.riancho@gmail.com if you have a better idea #raise w3afException( str(e) ) # best is to post a message om.out.debug( req.get_method() + ' ' + originalUrl +' returned HTTP code "' + str(NO_CONTENT) + '"' ) om.out.debug( 'Unhandled exception in xUrllib._send(): ' + str ( e ) ) om.out.debug( str( traceback.format_exc() ) ) return httpResponse( NO_CONTENT, '', {}, originalUrl, originalUrl ) else: # Everything ok ! om.out.debug( req.get_method() + ' ' + originalUrl +' returned HTTP code "' + str(res.code) + '"' ) code = int(res.code) info = res.info() geturl = res.geturl() read = self._readRespose( res ) httpResObj = httpResponse(code, read, info, geturl, originalUrl, id=res.id ) if grepResult: self._grepResult( req, httpResObj ) else: om.out.debug('No grep for : ' + geturl + ' , the plugin sent grepResult=False.') return httpResObj def _readRespose( self, res ): read = '' try: read = res.read() except KeyboardInterrupt, k: raise k except Exception, e: om.out.error( str ( e ) ) return read return read def _retry( self, req , useCache ): ''' Try to send the request again. ''' if self._errorCount < self.settings.getMaxRetrys() : self._errorCount += 1 return self._send( req, useCache ) else: self._errorCount = 0 raise w3afException('Too many retries when trying to get: ' + req.get_full_url() ) def setGrepPlugins(self, grepPlugins ): self._grepPlugins = grepPlugins def setEvasionPlugins( self, evasionPlugins ): evasionPlugins.sort() self._evasionPlugins = evasionPlugins def _evasion( self, uri ): path = getPathQs( uri ) for eplugin in self._evasionPlugins: path = eplugin.fuzzUrl( path ) uri = getProtocol( uri ) + '://' + getDomain( uri ) + path return uri def _grepResult(self, request, response): # The grep process is all done in another thread. This improved the # speed of all w3af. if len( self._grepPlugins ): # I'll create a fuzzable request based on the urllib2 request object fuzzReq = self._createFuzzFromRequest( request.get_method(), request.get_full_url(), request.get_data(), request.headers ) targs = (fuzzReq, response) self._tm.startFunction( target=self._grepWorker, args=targs, ownerObj=self ) def _createFuzzFromRequest( self, command, url, postData, headers ): ''' Creates a fuzzable request based on a request. This is for sending to the grep plugins. ''' res = None if postData and len( postData ): pdr = httpPostDataRequest.httpPostDataRequest() pdr.setURL( url ) pdr.setMethod( command ) if 'content-length' in headers.keys(): headers.pop('content-length') pdr.setHeaders( headers ) if 'content-Type' in headers.keys() and headers['content-Type'] == 'multipart/form-data': dc = cgi.parse_multipart( postData, headers ) for i in dc.keys(): dc = dc[ i ][0] pdr.setDc( dc ) else: dc = urlParser.getQueryString( 'http://a/?' + postData ) pdr.setDc( dc ) res = pdr else: # Just a query string request ! no postdata qsr = httpQsRequest.httpQsRequest() qsr.setURL( url ) qsr.setMethod( command ) dc = urlParser.getQueryString( url ) qsr.setDc( dc ) res = qsr return res def _grepWorker( self , request, response): for grepPlugin in self._grepPlugins: try: grepPlugin.testResponse( request, response) except KeyboardInterrupt: # Correcting control+c handling... raise _abbrevs = [ (1<<50L, 'P'), (1<<40L, 'T'), (1<<30L, 'G'), (1<<20L, 'M'), (1<<10L, 'k'), (1, '') ] def greek(size): """ Return a string representing the greek/metric suffix of a size """ for factor, suffix in _abbrevs: if size > factor: break return str( int(size/factor) ) + suffix