#!/usr/bin/perl -wn
#
# utf8map.pl - remap ascii to utf8
#
# Program was created to build conversion table from ascii into utf8.
# Ascii table's first half does not require any changes (because utf8
# [0-127] encoding is the same as in ascii). To control second half's
# encoding we have to specify all unicode codes for characters in
# [128-255] interval. Program takes unicode codes for 128 characters
# on input (in hex format with leading 0x) and generates conversion table.
# Every utf8 code is padded by '0' and occupies 4 bytes.
# It is suitable for use in 'C' programs.
#
# For example,
#  in windows-1257 table character 169 '(c)' has code 0x00A9 in unicode.
#  Program will generate following string:
#
#  0xC2, 0xA9, 0x00, 0x00, /*   169           0x00a9 */
# \______________________/     \___/         \______/
#    utf8 code (2 bytes        ascii         unicode
#    with padding)
#
# USAGE:
#   perl utf8map.pl asci_128-255_unicode_table.txt
#
# Andrejs Dubovskis
#

use strict ;

# $N is the 8-bit character code that the next generated table row describes.
# It is deliberately a package variable: the -n switch wraps the whole script
# in a per-line loop, so a 'my' variable would be reset for every input line,
# while the counter has to keep its value from one line to the next.
our $N ;

BEGIN {
  # initialise once, at compile time, before the per-line loop starts;
  # we are going to prepare the table for characters in the 128-255 interval
  $N = 128 ;
}

# Scan the current input line (held in $_ by the -n switch) for hexadecimal
# Unicode code points written with a leading "0x" and print one row of the
# C conversion table for every code point found.
#
# A row consists of the UTF-8 encoding of the code point, padded with zero
# bytes to a width of four, followed by a C comment that shows the 8-bit
# character code ($N) and the code point exactly as spelled in the input.
#
# Dies on code points above 0xFFFF, on surrogates (0xD800-0xDFFF) and on
# non-zero code points below 0x80. Zero is accepted and yields an all-zero
# row (presumably for positions that have no Unicode mapping).
#
# The script exits as soon as the row for character 255 has been printed.
for my $hex (/0x[\da-f]+/ig) {
  my $num = hex($hex) ;
  my @out = () ;

  if ($num > 0xffff) {
    die "too large number: $hex" ;
  } elsif ($num >= 0xd800 && $num <= 0xdfff) {
    # surrogates are not characters; encoding them would produce
    # ill-formed utf8 (RFC 3629), so refuse instead of emitting garbage
    die "surrogate code point is not valid in utf8: $hex" ;
  } elsif ($num > 0x07ff) {
    # result is three bytes long: 1110xxxx 10xxxxxx 10xxxxxx
    @out = (
      (($num >> 12) & 0xf) | 0xe0,
      (($num >> 6) & 0x3f) | 0x80,
      ($num & 0x3f) | 0x80
    ) ;
  } elsif ($num > 0x7f) {
    # result is two bytes long: 110xxxxx 10xxxxxx
    @out = (
      (($num >> 6) & 0x1f) | 0xc0,
      ($num & 0x3f) | 0x80
    ) ;
  } else {
    # only zero is legal here
    die "wrong input data: $hex" if $num ;
  }

  # pad by '0' so that every table entry occupies exactly four bytes
  push(@out, 0) while @out < 4 ;

  # output utf8 code
  printf("0x%02X,\t0x%02X,\t0x%02X,\t0x%02X,\t", @out) ;
  # output comments
  print "/*\t$N\t$hex\t*/\n" ;

  # characters in [128-255] interval only
  exit if ++$N > 255 ;
}