#! /bin/sh

# Generate the Big5 character mapping table cjk-b5.h from Unihan database
# information (kBigFive and kHKSCS entries) and additional entries from the
# obsolete unicode.org BIG5 mapping; in ambiguous cases, the Unihan entry
# takes precedence.
#
# Usage:   run without arguments; "make Unihan.txt" and "make BIG5.TXT" must
#          be able to provide the two data files in the current directory.
# Output:  cjk-b5.h, sorted lines of the form " {0xBIG5, 0xUNICODE},".
# Scratch: cjk-b5.h1 .. cjk-b5.h5 in the current directory (removed on exit).
# Exit:    0 on success, 1 on any failure (diagnostics go to stderr).

set -u

# Print a diagnostic on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Make sure a Unicode data file is available, via the local makefile.
acquire() {
  make "$1" || die "Could not acquire Unicode data file $1"
}

# Remove the auxiliary files; unmatched names are silently ignored by rm -f.
cleanup() {
  rm -f cjk-b5.h[1-5]
}

acquire Unihan.txt
acquire BIG5.TXT

# From here on scratch files exist: remove them on every exit path.
trap cleanup EXIT
trap 'exit 1' HUP INT TERM

# Byte-wise collation keeps sort, uniq and grep consistent with each other
# and makes the generated table independent of the caller's locale.
LC_ALL=C
export LC_ALL

echo extracting mappings from Unihan data
# Field separators are matched as any run of blanks (the Unicode data files
# are presumably tab-separated -- a literal single space would not match).
# "-e t -e d" keeps only the lines on which a substitution succeeded.
sed -e 's/^U+\([^[:blank:]]*\)[[:blank:]]\{1,\}kBigFive[[:blank:]]\{1,\}\([^[:blank:]]*\)$/ {0x\2, 0x\1},/' \
    -e 's/^U+\([^[:blank:]]*\)[[:blank:]]\{1,\}kHKSCS[[:blank:]]\{1,\}\([^[:blank:]]*\)$/ {0x\2, 0x\1},/' \
    -e t -e d Unihan.txt > cjk-b5.h1 \
  || die "Could not extract mappings from Unihan.txt"

echo extracting further mappings from old BIG5 data
sed -e 's/^0x\([^[:blank:]]*\)[[:blank:]]\{1,\}0x\([^[:blank:]]*\).*$/ {0x\1, 0x\2},/' \
    -e t -e d BIG5.TXT > cjk-b5.h2 \
  || die "Could not extract mappings from BIG5.TXT"

echo determining unique entries in further BIG5 data
# Listing the Unihan entries twice guarantees that each of them occurs at
# least twice, so "uniq -u" leaves only entries found in the BIG5 data alone.
sort cjk-b5.h2 cjk-b5.h1 cjk-b5.h1 | uniq -u > cjk-b5.h3 \
  || die "Could not determine unique BIG5 entries"

echo determining ambiguous entries
# Reduce every entry to its Big5 code ("XXXX,"); codes that occur more than
# once across the Unihan entries and the BIG5-only entries are ambiguous.
sed -e 's/^[[:blank:]]*{0x\([^,]*\),.*$/\1,/' cjk-b5.h1 cjk-b5.h3 \
  | sort | uniq -d > cjk-b5.h4 \
  || die "Could not determine ambiguous entries"

echo filtering out ambiguous entries
# Drop BIG5-only entries whose code is ambiguous, so the Unihan entry wins.
# grep exits with 1 when it selects no line, which is not an error here.
grep -F -v -f cjk-b5.h4 cjk-b5.h3 > cjk-b5.h5
grep_status=$?
[ "$grep_status" -le 1 ] || die "Could not filter out ambiguous entries"

echo merging mappings
sort cjk-b5.h1 cjk-b5.h5 > cjk-b5.h \
  || die "Could not write cjk-b5.h"

echo removing auxiliary files
cleanup
trap - EXIT