# LINEBREAK.awk -- awk script to produce a compact linebreak property map
# Copyright (C) 2005
#   National Institute of Advanced Industrial Science and Technology (AIST)
#   Registration Number H15PRO112

# This file is part of the m17n database; a sub-part of the m17n
# library.

# The m17n library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public License
# as published by the Free Software Foundation; either version 2.1 of
# the License, or (at your option) any later version.

# The m17n library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public
# License along with the m17n library; if not, write to the Free
# Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.

# setLBC(name, code) -- register the line breaking class NAME under the
# integer CODE.
#
# Appends a "CODE:NAME" entry to the legend being printed as a comment
# at the top of the output; every code that is a multiple of 10 starts
# a new "# " comment line, the others continue the current line.
# The mapping is stored in the global array to_lbc[] for later lookup.
function setLBC(name, code) {
  printf "%s%2d:%s", (code % 10 == 0 ? "\n# " : "  "), code, name;
  to_lbc[name] = code;
}

BEGIN {
  # Fields are separated by a single ";" or blank, so $1 is the code
  # point (or range) and $2 the class name.
  FS = "[; ]";

  # State of the pending run of single code points; -1 means "no run".
  prev_lbc = -1;
  from = "";
  to = "";

  printf "# Code:LineBreakingClass";

  # Assign a unique integer code to each line breaking class.  Codes
  # are handed out in list order, starting at 0.
  # The codes must be the same as "enum LineBreakingClass" of
  # m17n-lib/src/linebreak.c
  #
  #   OP open                     CL close
  #   QU quotation                GL glue
  #   NS no-start                 EX exclamation/interrogation
  #   SY syntax (slash)           IS infix (numeric) separator
  #   PR prefix                   PO postfix
  #   NU numeric                  AL alphabetic
  #   ID ideograph (atomic)       IN inseparable
  #   HY hyphen                   BA break after
  #   BB break before             B2 break both
  #   ZW ZW space                 CM combining mark
  #   WJ word joiner
  lbc_pair = "OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ";

  # For UAX#14 7.6 Korean Syllable Block Pair Table.
  #
  #   H2 Hangul 2 Jamo Syllable   H3 Hangul 3 Jamo Syllable
  #   JL Jamo leading consonant   JV Jamo vowel
  #   JT Jamo trailing consonant
  lbc_korean = "H2 H3 JL JV JT";

  # Not handled in the pair table.
  #
  #   SA south (east) asian       SP space
  #   PS paragraph and line separators
  #   BK hard break (newline)     CR carriage return
  #   LF line feed                NL next line
  #   CB contingent break opportunity
  #   SG surrogate                AI ambiguous
  #   XX unknown
  lbc_other = "SA SP PS BK CR LF NL CB SG AI XX";

  lbc_count = split(lbc_pair " " lbc_korean " " lbc_other, lbc_names, " ");
  for (k = 1; k <= lbc_count; k++)
    setLBC(lbc_names[k], k - 1);

  # "XX" is the last class in the list, so i ends up as its code.
  i = lbc_count - 1;

  # The default is "XX".
  printf "\n0x0000-0x3FFFFF %d\n", i;
}

# Single code point line ("XXXX;CLASS ...").
#
# Consecutive lines of this kind that share a class are coalesced into
# one "FROM-TO CODE" output entry; the pending run is written out as
# soon as a line with a different class is seen.  Only the class is
# compared: the code points themselves are not checked for adjacency.
/^[0-9A-Za-z]*;/ {
  # A class name that was never registered with setLBC() would yield an
  # empty value, which "%d" prints as 0 (the code of "OP").  Fall back
  # to "XX" (unknown) instead.
  lbc = ($2 in to_lbc) ? to_lbc[$2] : to_lbc["XX"];
  cp = "0x" $1;

  if (lbc == prev_lbc)
    {
      # Same class as the pending run: just extend it.
      to = cp;
      next;
    }

  # Class changed: flush the pending run, if there is one.
  if (prev_lbc != -1)
    {
      if (from == to)
        printf "%s %d\n", from, prev_lbc;
      else
        printf "%s-%s %d\n", from, to, prev_lbc;
    }

  # Start a new run at this code point.
  from = cp;
  to = cp;
  prev_lbc = lbc;
  next;
}

# Range line ("XXXX..YYYY;CLASS ...").
#
# Flushes any pending run of single code points, then prints the range
# immediately as "0xXXXX-0xYYYY CODE" and resets the run state, so a
# range is never merged with neighbouring entries.
/^[0-9A-Za-z]*\.\.[0-9A-Za-z]*;/ {
  # A class name that was never registered with setLBC() would yield an
  # empty value, which "%d" prints as 0 (the code of "OP").  Fall back
  # to "XX" (unknown) instead.
  lbc = ($2 in to_lbc) ? to_lbc[$2] : to_lbc["XX"];

  if (prev_lbc != -1)
    {
      if (from == to)
        printf "%s %d\n", from, prev_lbc;
      else
        printf "%s-%s %d\n", from, to, prev_lbc;
    }

  # Turn "XXXX..YYYY" into "XXXX-0xYYYY"; the leading "0x" is supplied
  # by the format string.  Work on a copy so $0 is left untouched.
  range = $1;
  sub(/\.\./, "-0x", range);
  printf "0x%s %d\n", range, lbc;

  # No run is pending after a range.
  from = "";
  to = "";
  prev_lbc = -1;
  next;
}

# At end of input, write out the run of single code points that is
# still pending (if any).
END {
  if (prev_lbc != -1)
    {
      # from and to already carry their "0x" prefix (it is added when
      # they are assigned), so the formats must not add another one.
      if (from == to)
        printf "%s %d\n", from, prev_lbc;
      else
        printf "%s-%s %d\n", from, to, prev_lbc;
    }
}


# syntax highlighted by Code2HTML, v. 0.9.1