""" slimmer.py Peter Bengtsson, mail@peterbe.com, 2004-2006 slimmer.py is a simple set of functions for compressing/optimizing HTML, XHTML and CSS documents as strings. Ideally used from other modules used something like this:: >>> import slimmer >>> code = open('file.html').read() >>> slimmed = slimmer.xhtml_slimmer(code) >>> print len(code), len(slimmed) You have to estimate yourself if you think it's worth using slimmer on your documents if you're running a dynamic setting such as a web application (e.g. Zope with CheckoutableTemplates). On my PC I slimmed a 1MB .html document in 2.2 seconds and saved 100KB. Saved 31KB on a 110KB .css file in 0.063 seconds. And lastly, saved 17% in size in 0.016 seconds for www.python.org. Changes:: 0.1.21 May 2006 Serious bug fix in _js_slimmer() with code like: '''for (var e in somearray)''' the result could be '''for (vareinsomearray)''' 0.1.20 Feb 2006 Incorporated new experimental --hardcore option 0.1.19 Feb 2006 Fixed bug in how js_slimmer() removes // comments 0.1.18 Jan 2006 Improved js_slimmer() floppy whitespace in parameter lists 0.1.17 Aug 2005 Fix in css_slimmer() for voice-family: hack (thanks Jens) 0.1.16 Jun 2005 Improved js_slimmer() for sloppy function definitions 0.1.15 Jun 2005 Improved js_slimmer() for sloppy if|else|else if statements 0.1.14 Apr 2005 Added unit test of Holly-hack for CSS 0.1.13 Apr 2005 Improved js_slimmer() to make 'y = 123;y = document;' to instead become 'y=123;y=document;' 0.1.12 Mar 2005 Fixed css_slimmer() to put a linebreak before //--> 0.1.11 Feb 2005 Fixed js_slimmer() for some curly bracket endings 0.1.10 Jan 2005 (Major patch by Baruch Even) - Fixed the -t option for testing, it didn't work, --test did work. - Fixed a typo s/whatspace/whitespace/ - Fixed a bug were more than one consecutive space turned into nothing, added test 6 for this. - Revamped other code to completely eliminate end of lines. 
It works in FireFox 1.0 - Changed the test cases to fit - Removed the last ; before } -> s/;}/}/ - Changed the test cases to fit 0.1.9 Jan 2005 CLI interface can accept URLs 0.1.8 Dec 2004 Added an option (UNQUOTE_HTML_ATTRIBUTES) to remove quotes from HTML attributes. (default is off) 0.1.7 Dec 2004 Separate out from CheckoutableTemplates and __all__ variable fixed for js_slimmer. 0.1.6 Dec 2004 Care for MacIE5 CSS Hack (http://www.sam-i-am.com/work/sandbox/css/mac_ie5_hack.html) 0.1.5 Nov 2004 Some improvements to js_slimmer() 0.1.4 Nov 2004 Added first draft of js_slimmer() 0.1.3 Nov 2004 Much improved CLI functions 0.1.2 Sep 2004 Added basic CLI functions (see run()) 0.1.1 Sep 2004 Major speed improvment by removing the unquote_numerical feature. 0.1.0 Sep 2004 First version numbering """ __version__='0.1.21' __all__=['acceptableSyntax','slimmer','css_slimmer', 'html_slimmer','xhtml_slimmer','js_slimmer', '__version__'] import re, os, sys, getopt import urllib2 try: from js_function_slimmer import slim as js_function_slimmer except ImportError: js_function_slimmer = None ## Options # # If you're slimming HTML docs and really want to # convert border="0" to border=0, be aware that this # can take 5 times longer than without but compresses # the document at least twice as good. 
UNQUOTE_HTML_ATTRIBUTES = 0

# Define the syntax options we accept
HTML = 'html'
XHTML = 'xhtml'
CSS = 'css'
JS = 'js'

OK_SYNTAX = (HTML, XHTML, CSS, JS)


def acceptableSyntax(syntax):
    """Return the syntax name as we recognize it, or None.

    The argument is converted with str(), lower-cased, and stripped of
    surrounding whitespace, inner spaces and dashes; 'stylesheet' is
    accepted as an alias for 'css'.  The result is one of the entries of
    OK_SYNTAX, or None when the normalized value is not in OK_SYNTAX.
    """
    syntax = str(syntax).lower().strip().replace(' ', '').replace('-', '')
    syntax = syntax.replace('stylesheet', 'css')  # allow for alias
    if syntax in OK_SYNTAX:
        return syntax
    return None


def slimmer(code, syntax=XHTML, hardcore=False):
    """Slim `code` with the slimmer function that matches `syntax`.

    code     -- the document to slim, passed straight to the chosen function
    syntax   -- one of the OK_SYNTAX values (default XHTML)
    hardcore -- only consulted for JS; when true, _js_slimmer() is called
                with slim_functions=True

    Returns whatever the chosen function returns, or None when `syntax`
    is not one of the OK_SYNTAX values.
    """
    if syntax == XHTML:
        return _xhtml_slimmer(code)
    elif syntax == HTML:
        return _html_slimmer(code)
    elif syntax == CSS:
        return _css_slimmer(code)
    elif syntax == JS:
        if hardcore:
            return _js_slimmer(code, slim_functions=True)
        else:
            return _js_slimmer(code)
    # Unknown syntax: previously this fell off the end of the function;
    # the None result is now explicit.
    return None


try:
    # The module-level name is kept in case code elsewhere in the file
    # relies on it; anyTrue() itself no longer needs it.
    import itertools
except ImportError:
    pass


def anyTrue(pred, seq):
    """Return True if pred(e) is truthy for at least one e in seq.

    Stops at the first hit and returns False for an empty sequence.

    This single loop replaces two variants that disagreed: the
    itertools.imap based one tested ``True in ...`` and therefore missed
    truthy results that do not compare equal to True (e.g. a non-empty
    string), and it failed at call time on interpreters whose itertools
    module has no imap().
    """
    for e in seq:
        if pred(e):
            return True
    return False


# CSS
css_comments = re.compile(r'/\*.*?\*/', re.MULTILINE|re.DOTALL)
hex_colour = re.compile(r'#\w{2}\w{2}\w{2}')


def _css_slimmer(css):
    """Slim a CSS string by removing comments and superfluous whitespace."""

    #css = css_comments.sub('', css) # remove comments
    # A comment that ends in backslash-star-slash is kept, and so is the
    # comment that follows it (presumably the MacIE5 CSS hack mentioned
    # in the changelog -- confirm).  Every other comment is removed.
    remove_next_comment = 1
    for css_comment in css_comments.findall(css):
        if css_comment[-3:] == r'\*/':
            remove_next_comment = 0
            continue
        if remove_next_comment:
            css = css.replace(css_comment, '')
        else:
            remove_next_comment = 1

    css = re.sub(r'\s\s+', ' ', css) # >= 2 whitespace becomes one whitespace
    css = re.sub(r'\s+\n', '', css) # no whitespace before end of line
    # Remove space before and after certain chars
    for char in ('{', '}', ':', ';', ','):
        css = re.sub(char+r'\s', char, css)
        css = re.sub(r'\s'+char, char, css)
    # NOTE(review): one statement is truncated in this copy of the file.
    # It begins "css = re.sub(r'\s+" and its remainder is missing; restore
    # it from the upstream source instead of guessing the pattern.
    css = re.sub(r'}\s(#|\w)', r'}\1', css)
    css = re.sub(r';}', r'}', css) # no need for the ; before end of attributes
    css = re.sub(r'}//-->', r'}\n//-->', css)
    css = simplifyHexColours(css)

    # voice-family hack.
The declation: '''voice-family: "\"}\""''' requires # that extra space between the ':' and the first '"' which _css_slimmer() # removed. Put it back (http://real.issuetrackerproduct.com/0168) css = re.sub(r'voice-family:"\\"}\\""', r'voice-family: "\\"}\\""', css) return css.strip() # HTML f_IMD = re.I|re.MULTILINE|re.DOTALL f_MD = re.MULTILINE|re.DOTALL f_M = re.MULTILINE html_comments_oneline = re.compile(r'', re.I) html_inline_css = re.compile(r'.*?', f_IMD) html_inline_js = re.compile(r'.*?', f_IMD) any_tag = re.compile(r"<\w.*?>", f_IMD) excess_whitespace = re.compile(r' \s+|\s +', f_M) excess_whitespace1 = re.compile(r'\w\s+\w', f_M) excess_whitespace2 = re.compile(r'"\s+>', f_M) excess_whitespace3 = re.compile(r"'\s+>", f_M) excess_whitespace4 = re.compile('"\s\s+\w+="|\'\s\s+\w+=\'|"\s\s+\w+=|\'\s\s+\w+=', f_M) excess_whitespace6 = re.compile(r"\d\s+>", f_M) quotes_in_tag = re.compile('([a-zA-Z]+)="([a-zA-Z0-9-_\.]+)"') def _html_slimmer(html, xml=0): """ Optimize like XHTML but go one step further """ # 1. optimize inline CSS for styletag in html_inline_css.findall(html): html = html.replace(styletag, css_slimmer(styletag)) # 2. optimize inline Javascript for scripttag in html_inline_js.findall(html): html = html.replace(scripttag, js_slimmer(scripttag)) # 2. Remove excessive whitespace between tags html = re.sub(r'>\s+<','><', html) # 3. Remove oneline comments html = html_comments_oneline.sub('', html) # 4. In every tag, remove quotes on numerical attributes and all # excessive whitespace ew1 = excess_whitespace1 # shortcut ew6 = excess_whitespace6 # shortcut ew4 = excess_whitespace4 # shortcut for tag in uniqify(any_tag.findall(html)): # 4a. observe exceptions if tag.startswith('-1: continue original = tag # 4b. 
remove excess whitespace inside the tag tag= excess_whitespace2.sub('">', tag) tag= excess_whitespace3.sub("'>", tag) for each in ew1.findall(tag)+ew6.findall(tag): tag = tag.replace(each, excess_whitespace.sub(' ',each)) for each in ew4.findall(tag): tag = tag.replace(each, each[0]+' '+each[1:].lstrip()) # 4c. remove quotes if not xml and UNQUOTE_HTML_ATTRIBUTES: tag= quotes_in_tag.sub(r'\1=\2', tag) # has the tag been improved? if original != tag: html = html.replace(original, tag) return html.strip() def _xhtml_slimmer(xhtml): # currently not difference return _html_slimmer(xhtml, xml=1) excess_whitespace_js = re.compile('^\s+(\S)',re.MULTILINE) excess_whitespace_js2 = re.compile('(\S+);\s+(\S+)', re.MULTILINE) whitespaced_func_def = re.compile('(function)\s+(\S+\(.*?\))\s*{\s*(\S+)', f_IMD) whitespaced_func_def2 = re.compile('function\s*\(\)\s*{\s*(\S+)', f_IMD) js_comments_singlelines = re.compile('^//.*?$|\s+//.*?$', re.DOTALL|re.MULTILINE|re.I) js_comments_singlelines2 = re.compile('((^|;|\s)//.*?$)', re.DOTALL|re.MULTILINE|re.I) js_comment_end = re.compile('-->') js_comment_start = re.compile('(')==-1: return '' else: return match.group() js = js_comments_singlelines.sub(_reject_slashslash_comment, js) _=''' for comment, start in js_comments_singlelines2.findall(js): # ...except those that contain --> replacewith = '' if start == ';': replacewith = ';' if not js_comment_end.findall(comment): js = js.replace(comment, replacewith) ''' js = js_comment_start.sub(r'