# SCRIPT.awk -- awk script to generate SCRIPT.tab
# Copyright (C) 2007
# National Institute of Advanced Industrial Science and Technology (AIST)
# Registration Number H15PRO112
# This file is part of the m17n database; a sub-part of the m17n
# library.
# The m17n library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public License
# as published by the Free Software Foundation; either version 2.1 of
# the License, or (at your option) any later version.
# The m17n library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
# You should have received a copy of the GNU Lesser General Public
# License along with the m17n library; if not, write to the Free
# Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.
# One-time setup, run before any input is read: build the hex digit
# lookup table, set the field separator, and reset the global state.
BEGIN {
    # tohex maps a hex digit character (upper or lower case) to its
    # numeric value PLUS ONE.  The offset lets decode_hex tell the
    # digit "0" (stored as 1) apart from a character that has no entry
    # at all (which compares equal to 0).
    hexdigits = "0123456789ABCDEF";
    for (pos = 1; pos <= length(hexdigits); pos++) {
        upper = substr(hexdigits, pos, 1);
        tohex[upper] = pos;
        tohex[tolower(upper)] = pos;
    }

    # Fields are separated by ";" or "#", together with any blanks or
    # tabs on either side of the separator.
    FS = "[ \t]*[;#][ \t]*";

    range_index = 0;    # number of slots used in range[] (two per range)
    charcount = 0;
    initialized = 0;    # becomes 1 once maybe_emit has run initialize()
}
# Parse a hexadecimal number out of STR, starting at the 1-based
# character position IDX and stopping at the first character that is
# not a hex digit (or at the end of STR).  Return its numeric value;
# the result is 0 when there is no hex digit at position IDX.
#
# The names after the wide gap in the parameter list are local
# variables, not arguments.  awk has no other way to declare locals;
# without them the accumulator and loop index would be globals and
# would overwrite any same-named variable used elsewhere.
function decode_hex(str, idx,    value, len, pos, digit) {
    value = 0;
    len = length(str);
    for (pos = idx; pos <= len; pos++) {
        # tohex holds "digit value + 1"; a character that is not a hex
        # digit has no entry and therefore compares equal to 0.
        digit = tohex[substr(str, pos, 1)];
        if (digit == 0)
            break;
        value = value * 16 + digit - 1;
    }
    return value;
}
# Load the set of existing character codes from
# "UNIDATA/UnicodeData.txt".
#
# Every line that starts with an upper-case hex code is decoded with
# decode_hex.  A line containing "First>" opens a range; the next code
# line (presumably the matching "Last>" entry -- the code does not
# check) closes it, and the pair is appended to range[] at
# range_index.  Any other code is recorded as exists[code] = 1.
#
# If the file cannot be read, a warning is written to standard error
# and both tables are left as they were; processing continues.
#
# The names after the wide gap in the parameter list are locals, so
# this function no longer disturbs the globals "first" and "last" that
# maybe_emit uses for the pending output range.
function initialize(    path, line, code, range_start, status) {
    path = "UNIDATA/UnicodeData.txt";
    range_start = -1;    # -1 means "not inside a First>/Last> pair"

    while ((status = (getline line < path)) > 0) {
        if (line !~ /^[0-9A-F][0-9A-F]*/)
            continue;
        code = decode_hex(line, 1);
        if (range_start >= 0) {
            # The previous code line opened a range; this one ends it.
            range[range_index++] = range_start;
            range[range_index++] = code;
            range_start = -1;
        } else if (line ~ /First>/) {
            range_start = code;
        } else {
            exists[code] = 1;
        }
    }

    # getline returns a negative value when the file cannot be opened
    # or read; report it rather than silently treating every character
    # as non-existent.
    if (status < 0) {
        print "SCRIPT.awk: warning: cannot read " path | "cat 1>&2";
        close("cat 1>&2");
    }
    close(path);
}
# Return 1 if character code C is known to exist, 0 otherwise.
#
# C exists when it was recorded individually in exists[], or when it
# lies inside one of the inclusive [start, end] pairs stored in
# range[0 .. range_index - 1] (start at even slots, end at the
# following odd slot).
#
# K is a local variable (awk declares locals as extra parameters).
function char_exist_p(c,    k) {
    # Test membership first so that probing a code does not create an
    # empty exists[] element for it.
    if ((c in exists) && exists[c] == 1)
        return 1;
    for (k = 0; k < range_index; k += 2)
        if (range[k] <= c && c <= range[k + 1])
            return 1;
    return 0;
}
# Feed one input range CH1..CH2 belonging to THIS_SCRIPT into the
# output.
#
# The globals first, last and script hold a pending range that has not
# been printed yet.  A new range is merged into the pending one when it
# has the same script and no existing character (per char_exist_p)
# lies in the gap between them.  Otherwise the pending range is
# printed -- unless its script is "Common" -- and the new range
# becomes the pending one.
#
# The very first call also runs initialize() and prints the header
# lines, including the catch-all "common" range.
function maybe_emit(ch1, ch2, this_script) {
    if (initialized == 0) {
        initialize();
        initialized = 1;
        print "# Ranges may contain non-existing character codes.";
        print "0x0-0x3FFFFF common";
    } else {
        if (script == this_script) {
            # Walk the gap; stop early at the first existing character.
            j = last + 1;
            while (j < ch1 && !char_exist_p(j))
                j++;
            if (j == ch1) {
                # Nothing exists in the gap: extend the pending range.
                last = ch2;
                return;
            }
        }
        if (script != "Common") {
            if (first != last)
                printf "0x%04X-0x%04X %s\n", first, last, tolower(script);
            else
                printf "0x%04X %s\n", first, tolower(script);
        }
    }

    # Start a new pending range.
    first = ch1;
    last = ch2;
    script = this_script;
}
# Input line of the form "XXXX..YYYY ; Script ...": a range of codes.
# The start is decoded from the beginning of the line, the end from
# just past the ".." separator; the script name is the second field.
/^[0-9A-F]+\.\./ {
    range_lo = decode_hex($0, 1);
    dots_at = match($0, "\\.\\.");
    range_hi = decode_hex($0, dots_at + 2);
    maybe_emit(range_lo, range_hi, $2);
    next;
}
# Input line of the form "XXXX ; Script ...": a single code, handled
# as a range whose start and end are the same.  Range lines never get
# here because the rule for them ends with "next".
/^[0-9A-F]/ {
    codepoint = decode_hex($0, 1);
    maybe_emit(codepoint, codepoint, $2);
    next;
}
# After all input: print the range that is still pending, then append
# the entries from "SCRIPT.ext".
END {
    # first/last/script are only meaningful once maybe_emit has run at
    # least once.  Without this guard an empty input would print a
    # bogus "0x0000 " line built from unset variables.
    if (initialized != 0 && script != "Common") {
        if (first == last)
            printf "0x%04X %s\n", first, tolower(script);
        else
            printf "0x%04X-0x%04X %s\n", first, last, tolower(script);
    }

    # Copy through every line of SCRIPT.ext that starts with an
    # upper-case hex digit.  A missing file simply contributes nothing
    # (getline returns a non-positive value and the loop is skipped).
    while ((getline ext_line < "SCRIPT.ext") > 0) {
        if (ext_line ~ /^[0-9A-F][0-9A-F]*/)
            print ext_line;
    }
    close("SCRIPT.ext");
}
# syntax highlighted by Code2HTML, v. 0.9.1