#! /usr/bin/env awk -f
#  __   _
#  |_) /|  Copyright (C) 2000  |  richard@
#  | \/¯|  Richard Atterer     |  atterer.net
#  ¯ '` ¯
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License, version 2. See
#  the file COPYING for details.

# appendWord(word, spaceAfterWord)
#
# Add one word to the output line currently being assembled, starting a
# new output line when necessary.
#
#   word            text to add (may be empty)
#   spaceAfterWord  separator to emit *before the next* word if that word
#                   ends up on the same line; callers pass "", " " or,
#                   while preserving whitespace, the literal whitespace
#                   character that followed the word
#
# Works entirely on globals:
#   reads   indentStr, curMaxLen, nextInd, doPreserve
#   updates doc (finished lines), docLine (line in progress),
#           ind (indentation of docLine), prevSpaceAfterWord
function appendWord(word, spaceAfterWord) {
  #print "appendWord \"" word "\" \"" gensub(/\n/, "\\\\n", "g", spaceAfterWord) "\"";
  if (prevSpaceAfterWord == "\n") {
    # Linebreak while inside a whitespace-preserving element (e.g. <pre>):
    # the previous word was followed by a literal newline, so flush the
    # current line unconditionally and start the next one unindented.
    doc = doc substr(indentStr, 1, ind) docLine "\n";
    docLine = word;
    ind = 0;
    prevSpaceAfterWord = spaceAfterWord;
    return;
  }
  if (ind + length(docLine) + length(word) < curMaxLen) {
    # Append: the word still fits on the current line. Empty words are
    # only materialised (together with their separator) while preserving
    # whitespace; otherwise they merely replace the pending separator.
    if (word != "" || doPreserve > 0)
      docLine = docLine prevSpaceAfterWord word;
    prevSpaceAfterWord = spaceAfterWord;
  } else {
    # New line: flush the current line (if it has any content) and start
    # a fresh one at the indentation requested for following lines.
    # Callers force this branch by temporarily setting curMaxLen to 0.
    if (docLine != "") doc = doc substr(indentStr, 1, ind) docLine "\n";
    #print ">>> " docLine;
    docLine = word;
    ind = nextInd;
    prevSpaceAfterWord = spaceAfterWord;
  }
}
#______________________________________________________________________
# Main program. Everything happens in BEGIN: the input file named by the
# first command-line argument is read completely, re-wrapped and
# re-indented, and the result is printed on standard output.
BEGIN {
  if (ARGC < 2) {
    print "usage: awk -f <this script> INPUT.html" > "/dev/stderr";
    exit 1;
  }
  get = ARGV[1];   # Input file
  put = ARGV[2];   # Second argument; not used anywhere in this block
  maxLen = 75;     # Lines are wrapped so they stay shorter than this
  indent = 1;      # Columns of indentation added per nesting level
  # NOTE(review): this string is one character wide, so indentation is
  # capped at a single column regardless of nesting depth. Widen it if
  # deeper indentation is wanted.
  indentStr = " "; # Won't indent by more than this
  killClass = 1;   # If nonzero, remove all " class=...>" attributes

  # Only tags that come with closing tags are allowed!
  # Every tag listed here has its opening tag forced onto a new line and
  # increases the indentation of what follows. Tags with value 1 also get
  # their closing tag forced onto a new line; for the others (value left
  # unset, which compares equal to 0) the closing tag stays on the
  # current line.
  tags["html"]=1; tags["body"]=1; tags["head"]; tags["title"];
  tags["div"]; tags["h1"]; tags["h2"]; tags["h3"]; tags["h4"]; tags["h5"];
  tags["h6"]; tags["p"]; tags["dl"]; tags["dt"]; tags["dd"]; tags["table"];
  tags["tr"]; tags["td"];
  # Tags inside which whitespace is copied through verbatim
  preserve["pre"];
  maxTagLength = 100; # Max. characters inspected when extracting a tag name
  curMaxLen = maxLen;

  # Join lines. Every input line, including the first, is terminated with
  # "\n" so that words on adjacent lines stay separated by whitespace.
  rest = "";
  while ((readStatus = (getline line < get)) == 1)
    rest = rest line "\n";
  if (readStatus < 0) {
    print "error: cannot read input file \"" get "\"" > "/dev/stderr";
    exit 1;
  }

  # Only strips a class attribute that is the last thing before ">"
  if (killClass)
    gsub(/[ \t\n]+(class|CLASS)=("[^"]*"|'[^']*')[ \t\n]*>/, ">", rest); #"

  # Split lines at whitespace and some tags
  nextInd = ind = 0; # Nr of characters of indentation
  doc = "";          # Output document
  docLine = "";      # Current line to append words to
  doPreserve = 0;    # Nesting level of whitespace-preserving tags (<pre>)

  # Each iteration consumes text up to and including the next run of
  # whitespace or the next tag opener ("<" or "</" plus the tag name).
  while (match(rest, /([ \n\t]+|< *(\/ *)?)/)) {
    #print "MATCH \"" substr(rest, RSTART, RLENGTH) "\"";
    #print "xxx "nextInd" " gensub(/\n/, "\\\\n", "g", substr(rest, 1, 90));
    if (substr(rest, RSTART, 1) == "<") {
      # Tag found. The name is the run of [a-z0-9] right after the match.
      tagName = tolower(substr(rest, RSTART + RLENGTH, maxTagLength));
      gsub(/[^a-z0-9].*$/, "", tagName);
      closing = index(substr(rest, RSTART + 1, RLENGTH - 1), "/");
      # Position in rest of the first character after the tag name
      tagEnd = RSTART + RLENGTH + length(tagName);

      # Is tag a whitespace-preserving one (<pre>)?
      if (tagName in preserve) {
        if (closing && doPreserve > 0) {
          appendWord(substr(rest, 1, tagEnd - 1), "");
          nextInd -= indent;
          --doPreserve;
          if (doPreserve == 0) {
            # Left the outermost preserved element: wrap and indent again
            curMaxLen = maxLen; nextInd = nonpreserveInd;
          }
          rest = substr(rest, tagEnd);
          continue;
        }
        if (!closing) {
          # Disable indentation and wrapping while inside <pre>
          if (doPreserve == 0) { nonpreserveInd = nextInd; nextInd = 0; }
          ++doPreserve; curMaxLen = 9999999;
        }
      }

      if (!(tagName in tags)) {
        # No known tag name: treat text up to and including it as a word
        appendWord(substr(rest, 1, tagEnd - 1), "");
        rest = substr(rest, tagEnd);
        continue;
      } else if (closing) {
        #print "---/" tagName;
        # Closing tag
        if (tags[tagName] == 0) {
          # Keep the closing tag on the current line
          appendWord(substr(rest, 1, tagEnd - 1), "");
          nextInd -= indent;
        } else {
          # Put the closing tag on a line of its own, already outdented
          nextInd -= indent;
          appendWord(substr(rest, 1, RSTART - 1), "");
          curMaxLen = 0; # Force new line with next appendWord()
          appendWord(substr(rest, RSTART, RLENGTH + length(tagName)), "");
          curMaxLen = maxLen;
        }
      } else {
        #print "--- " tagName;
        # Opening tag: start a new line, indent what follows one level more
        appendWord(substr(rest, 1, RSTART - 1), "");
        curMaxLen = 0; # Force new line with next appendWord()
        appendWord(substr(rest, RSTART, RLENGTH + length(tagName)), "");
        curMaxLen = maxLen;
        nextInd += indent;
      }
      rest = substr(rest, tagEnd);
      continue;
    } # endif tag found

    # Whitespace
    #print "dop " doPreserve ", RSTART=" RSTART ", RLENGTH=" RLENGTH;
    if (doPreserve) {
      # Preserve spaces and newlines in output: consume one whitespace
      # character at a time and hand it to appendWord() as the separator
      appendWord(substr(rest, 1, RSTART - 1), substr(rest, RSTART, 1));
      rest = substr(rest, RSTART + 1);
    } else {
      # Wrap words: collapse the whitespace run to one space, or to
      # nothing when it directly precedes the ">" that closes a tag
      if (substr(rest, RSTART + RLENGTH, 1) == ">")
        appendWord(substr(rest, 1, RSTART - 1), "");
      else
        appendWord(substr(rest, 1, RSTART - 1), " ");
      rest = substr(rest, RSTART + RLENGTH);
    }
  }

  # Flush the line in progress plus any trailing text without a delimiter
  doc = doc substr(indentStr, 1, ind) docLine rest;
  print doc;
}