'''
urlParser.py

Copyright 2006 Andres Riancho

This file is part of w3af, w3af.sourceforge.net .

w3af is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation version 2 of the License.

w3af is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with w3af; if not, write to the Free Software
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA

'''

from core.controllers.w3afException import w3afException
from core.data.dc.queryString import queryString
import core.controllers.outputManager as om
import urlparse as _uparse
import urllib
import cgi
import socket
import re

'''
This module parses Url's.

@author: Andres Riancho ( andres.riancho@gmail.com )
'''

# TODO: this list should be updated from time to time, automatically.
# taken from http://en.wikipedia.org/wiki/List_of_Internet_top-level_domains
# Built once at import time; membership tests are done on lower-cased labels.
_TOP_LEVEL_DOMAINS = frozenset( """
    ac ad ae aero af ag ai al am
    an ao aq ar arpa as at au aw
    az ba bb bd be bf bg bh bi
    biz bj bm bn bo br bs bt bv
    bw by bz ca cc cd cf cg ch
    ci ck cl cm cn co com coop cr
    cu cv cx cy cz de dj dk dm
    do dz ec edu ee eg er es et
    fi fj fk fm fo fr ga gb gd
    ge gf gg gh gi gl gm gn gov
    gp gq gr gs gt gu gw gy hk
    hm hn hr ht hu id ie il im
    in info int io iq ir is it je
    jm jo jp ke kg kh ki km kn
    kr kw ky kz la lb lc li lk
    lr ls lt lu lv ly ma mc md
    mg mh mil mk ml mm mn mo mp
    mq mr ms mt mu museum mv mw mx
    my mz na name nc ne net nf ng
    ni nl no np nr nu nz om org
    pa pe pf pg ph pk pl pm pn
    pr pro ps pt pw py qa re ro
    ru rw sa sb sc sd se sg sh
    si sj sk sl sm sn so sr st
    su sv sy sz tc td tf tg th
    tj tk tm tn to tp tr tt tv
    tw tz ua ug uk um us uy uz
    va vc ve vg vi vn vu wf ws
    ye yt yu za zm zw
""".split() )

# Generic URI splitter, see http://www.faqs.org/rfcs/rfc2396.html
# Group 2 is the scheme, group 4 the authority, group 5 the path,
# group 7 the query and group 9 the fragment.
_URI_RE = re.compile( r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?" )


def hasQueryString( url ):
    '''
    Analyzes the uri to check for a query string.

    @parameter url: The uri to analyze.
    @return: True if the URL has a (non empty) query string.
    '''
    return _uparse.urlparse( url )[4] != ''


def getQueryString( url ):
    '''
    Parses the query string and returns a queryString object.

    Only the first value of a repeated parameter is kept. A query string that
    can not be parsed in strict mode yields an empty result.

    @parameter url: The url with the query string to parse.
    @return: A QueryString Object, example :
        - input url : http://localhost/foo.asp?xx=yy&bb=dd
        - output dict : { xx:yy , bb:dd }
    '''
    result = queryString()

    qs = _uparse.urlparse( url )[4]
    if qs == '':
        return result

    try:
        parsedQs = cgi.parse_qs( qs, keep_blank_values=True, strict_parsing=True )
    except ValueError:
        # Strict parsing rejects malformed fields with a ValueError; this is a
        # best effort parser, so a broken query string means "no parameters".
        #om.out.debug('Strange things found when parsing query string: ' + qs)
        return result

    for name in parsedQs:
        result[ name ] = parsedQs[ name ][0]

    return result


def uri2url( url ):
    '''
    @parameter url: The url with the query string.
    @return: Returns a string containing the URL without the query string, the
    fragment and the path parameters ( ;params ). Example :
        - input url : http://localhost/foo.asp?xx=yy&bb=dd#fragment
        - output url string : http://localhost/foo.asp
    '''
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
    return scheme + '://' + domain + path


def removeFragment( url ):
    '''
    @parameter url: The url with fragments
    @return: Returns a string containing the URL without the fragment. Example :
        - input url : http://localhost/foo.asp?xx=yy&bb=dd#fragment
        - output url string : http://localhost/foo.asp?xx=yy&bb=dd
    '''
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
    res = scheme + '://' + domain + path
    if qs != '':
        res += '?' + qs
    return res


def baseUrl( url ):
    '''
    @parameter url: The url with the query string.
    @return: Returns a string containing the URL without the query string and
    without any path. Example :
        - input url : http://localhost/dir1/foo.asp?xx=yy&bb=dd
        - output url string : http://localhost/
    '''
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
    return scheme + '://' + domain + '/'


def urlJoin( baseurl , relative ):
    '''
    Construct a full (``absolute'') URL by combining a ``base URL'' (base) with a
    ``relative URL'' (url). Informally, this uses components of the base URL, in
    particular the addressing scheme, the network location and (part of) the
    path, to provide missing components in the relative URL.

    Example:
        urljoin('http://www.cwi.nl/%7Eguido/Python.html', 'FAQ.html')
    yields the string
        'http://www.cwi.nl/%7Eguido/FAQ.html'

    A relative URL that starts with '//' is NOT treated as a network path
    reference: it is appended to the directory of the base url instead.

    @param baseurl: The base url to join
    @param relative: The relative url to add to the base url
    @return: The joined URL, as a string.
    '''
    if not relative.startswith('//'):
        return _uparse.urljoin( baseurl, relative )

    # This special case had to be generated cause of some pykto tests
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( baseurl )
    lastSlash = path.rfind( '/' )
    if lastSlash != 0:
        # I have more than one /
        path = path[: lastSlash]
    # Collapse the leading '//' of the relative part into a single '/'
    relative = relative[1:]
    return scheme + '://' + domain + path + relative


def getDomain( url ):
    '''
    @parameter url: The url to parse.
    @return: Returns the network location of the url, exactly as it was written
    ( this includes the port when the url has one ).
    '''
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
    return domain


def getProtocol( url ):
    '''
    @parameter url: The url to parse.
    @return: Returns the protocol ( scheme ) of the url, for example 'http'.
    '''
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
    return scheme


def _splitAuthority( authority ):
    '''
    Break a host name into two parts: subdomain(s), and base authority, e.g.
        images.google.com --> ['images', 'google.com']
        www.popo.com.au   --> ['www', 'popo.com.au']

    @parameter authority: The host name, without port or user information.
    @return: A list with two strings: [ subdomain, baseAuthority ]
    '''
    baseChunks = []
    subChunks = []
    foundBreak = False

    # walk down from right, stop at (but include) first non-toplevel domain
    for chunk in reversed( authority.split('.') ):
        if foundBreak:
            subChunks.insert( 0, chunk )
            continue

        baseChunks.insert( 0, chunk )
        # Host names are case insensitive, the TLD table is lower case
        if chunk.lower() not in _TOP_LEVEL_DOMAINS:
            foundBreak = True

    return [ '.'.join( subChunks ), '.'.join( baseChunks ) ]


def getRootDomain( domain ):
    '''
    Get the root domain name. Examples:

    input: www.ciudad.com.ar
    output: ciudad.com.ar

    input: i.love.myself.ru
    output: myself.ru

    The input may be a bare host name or a complete URL; user information and
    port, if present, are ignored and are not part of the result.

    Code taken from: http://getoutfoxed.com/node/41

    @parameter domain: The domain ( or url ) to analyze.
    @return: The root domain as a string, or False if the input can not be
    decomposed as an URI.
    '''
    if '://' not in domain:
        # sometimes i make mistakes...
        domain = 'http://' + domain

    match = _URI_RE.match( domain )
    if not match:
        return False

    authority = match.group(4) or ''

    # Drop "user:password@" so it is not mistaken for part of the host name
    hostAndPort = authority[ authority.rfind('@') + 1 : ]

    if hostAndPort.startswith('['):
        # Bracketed IPv6 literal: the colons are part of the address
        host = hostAndPort
    else:
        # Drop ":port", otherwise the last label would never match a TLD
        host = hostAndPort.split(':')[0]

    return _splitAuthority( host )[1]


def getDomainPath( url ):
    '''
    @parameter url: The url to parse.
    @return: Returns the scheme, the domain name and the path of the url up to
    ( and including ) the last '/'. Example :
        - input url : http://localhost/dir1/foo.asp?xx=yy
        - output url string : http://localhost/dir1/
    '''
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
    return scheme + '://' + domain + path[ : path.rfind('/') + 1 ]


def getFileName( url ):
    '''
    @parameter url: The url to parse.
    @return: Returns the filename of the url, this is, whatever follows the
    last '/' of the path ( '' if the path ends with '/' ).
    '''
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
    return path[ path.rfind('/') + 1 : ]


def getExtension( url ):
    '''
    @parameter url: The url to parse.
    @return: Returns the extension of the filename, if possible, else, ''.
    '''
    fname = getFileName( url )
    extension = fname[ fname.rfind('.') + 1 : ]
    # When there is no '.', rfind returns -1 and the slice is the whole name
    if extension == fname:
        return ''
    return extension


def allButScheme( url ):
    '''
    @parameter url: The url to parse.
    @return: Returns the domain name and the path of the url up to ( and
    including ) the last '/', without the scheme. Example :
        - input url : http://localhost/dir1/foo.asp
        - output string : localhost/dir1/
    '''
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
    return domain + path[ : path.rfind('/') + 1 ]


def getPath( url ):
    '''
    @parameter url: The url to parse.
    @return: Returns the path of the url ( no domain, no query string ).
    '''
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
    return path


def getPathQs( url ):
    '''
    @parameter url: The url to parse.
    @return: Returns the path of the url, followed by '?' and the query string
    when the url has one.
    '''
    scheme, domain, path, x1, qs, x3 = _uparse.urlparse( url )
    if qs != '':
        return path + '?' + qs
    return path


def urlDecode( url ):
    '''
    UrlDecode the url.

    @parameter url: The string to decode.
    @return: The decoded string, or None if url is not a str.
    '''
    if not isinstance( url, str ):
        return None
    # '+' has to become a space BEFORE unquoting, so that an encoded plus sign
    # ( %2B ) survives as a literal '+'
    return urllib.unquote( url.replace( '+', ' ' ) )


def getDirectories( url ):
    '''
    Get a list of all directories and subdirectories, from the root of the site
    down to the directory of the url.

    Example:
        - url = 'http://www.o.com/a/b/c/'
        - return: ['http://www.o.com/','http://www.o.com/a/','http://www.o.com/a/b/','http://www.o.com/a/b/c/']

    @parameter url: The url to analyze.
    @return: A list of URL strings, every one of them ends with '/'.
    '''
    res = []

    dp = getDomainPath( url )
    bu = baseUrl( url )

    # Only strip the base url when it is a real prefix. An url without any path
    # ( http://www.o.com ) has a domain path that is shorter than the base url,
    # and a path may itself contain the text of the base url.
    if dp.startswith( bu ):
        directories = dp[ len(bu) : ]
    else:
        directories = ''

    splittedDirs = directories.split('/')
    for i in range( len(splittedDirs) ):
        directory = bu + '/'.join( splittedDirs[:i] )
        if not directory.endswith('/'):
            directory += '/'
        res.append( directory )

    return res