summaryrefslogtreecommitdiff
path: root/LINEBREAK.awk
blob: 22318facdc069cc339117bd479018f362196804a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
# LINEBREAK.awk -- awk script to produce a compact linebreak property map
# Copyright (C) 2005
#   National Institute of Advanced Industrial Science and Technology (AIST)
#   Registration Number H15PRO112

# This file is part of the m17n database; a sub-part of the m17n
# library.

# The m17n library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public License
# as published by the Free Software Foundation; either version 2.1 of
# the License, or (at your option) any later version.

# The m17n library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.

# You should have received a copy of the GNU Lesser General Public
# License along with the m17n library; if not, write to the Free
# Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
# Boston, MA 02110-1301, USA.

# Record that line breaking class NAME is represented by integer CODE
# (in the global table to_lbc[]) and append "CODE:NAME" to the comment
# legend being written to the output.  A new "# " comment line is
# started whenever CODE is a multiple of 10; otherwise the entry is
# appended to the current line after two spaces.
function setLBC(name, code) {
  to_lbc[name] = code;
  printf "%s%2d:%s", ((code % 10 == 0) ? "\n# " : "  "), code, name;
}

BEGIN {
  # Split input lines on ";" or " ", so that $1 is the code point (or
  # code point range) and $2 is the line breaking class name.
  FS = "[; ]";

  # State of the pending run of single code points that share one
  # class: FROM and TO are its bounds, PREV_LBC its class code
  # (-1 while no run is pending).
  prev_lbc = -1;
  from = to = "";

  printf "# Code:LineBreakingClass";

  # Assign a unique integer code to each line breaking class, counting
  # up from 0 in the order listed below.  The codes must be the same as
  # "enum LineBreakingClass" of m17n-lib/src/linebreak.c, so the order
  # of the names is significant.
  #
  #   OP open                        CL close
  #   QU quotation                   GL glue
  #   NS no-start                    EX exclamation/interrogation
  #   SY syntax (slash)              IS infix (numeric) separator
  #   PR prefix                      PO postfix
  #   NU numeric                     AL alphabetic
  #   ID ideograph (atomic)          IN inseparable
  #   HY hyphen                      BA break after
  #   BB break before                B2 break both
  #   ZW ZW space                    CM combining mark
  #   WJ word joiner
  lbc_list = "OP CL QU GL NS EX SY IS PR PO NU AL ID IN HY BA BB B2 ZW CM WJ";

  # For UAX#14 7.6 Korean Syllable Block Pair Table.
  #
  #   H2 Hangul 2 Jamo Syllable      H3 Hangul 3 Jamo Syllable
  #   JL Jamo leading consonant      JV Jamo vowel
  #   JT Jamo trailing consonant
  lbc_list = lbc_list " H2 H3 JL JV JT";

  # Not handled in the pair table.
  #
  #   SA south (east) asian          SP space
  #   PS paragraph and line separators
  #   BK hard break (newline)        CR carriage return
  #   LF line feed                   NL next line
  #   CB contingent break opportunity
  #   SG surrogate                   AI ambiguous
  lbc_list = lbc_list " SA SP PS BK CR LF NL CB SG AI";

  n_lbc = split(lbc_list, lbc_names, " ");
  for (i = 0; i < n_lbc; i++)
    setLBC(lbc_names[i + 1], i);

  # "XX" (unknown) takes the next code; I is left equal to that code.
  setLBC("XX", i);

  # The default is "XX".
  printf "\n0x0000-0x3FFFFF %d\n", i;
}

# A line describing a single code point: "XXXX;CLASS ...".
# Successive such lines with the same class are accumulated into one
# pending run FROM..TO; the run is written out only when a line with a
# different class arrives.
/^[0-9A-Za-z]*;/ {
  lbc = to_lbc[$2];

  # Same class as the pending run: just extend the run.
  if (prev_lbc == lbc)
    {
      to = "0x" $1;
      next;
    }

  # Class changed: flush the pending run, if there is one.
  if (prev_lbc != -1)
    {
      if (from != to)
        printf "%s-%s %d\n", from, to, prev_lbc;
      else
        printf "%s %d\n", from, prev_lbc;
    }

  # Start a new run consisting of this code point only.
  from = to = ("0x" $1);
  prev_lbc = lbc;
  next;
}

# A line describing an explicit range: "XXXX..YYYY;CLASS ...".
# Any pending run of single code points is flushed first, then the
# range is written out immediately as "0xXXXX-0xYYYY CODE"; ranges are
# never merged with neighbouring lines.
/^[0-9A-Za-z]*\.\.[0-9A-Za-z]*;/ {
  # Look the class up before gsub() below rewrites the record.
  lbc = to_lbc[$2];

  if (prev_lbc != -1)
    {
      if (from != to)
        printf "%s-%s %d\n", from, to, prev_lbc;
      else
        printf "%s %d\n", from, prev_lbc;
    }

  # No run is pending any more.
  from = to = "";
  prev_lbc = -1;

  # Turn "XXXX..YYYY" into "XXXX-0xYYYY"; the leading "0x" is supplied
  # by the format string.
  gsub("\\.\\.", "-0x");
  printf "0x%s %d\n", $1, lbc;
  next;
}

# At end of input, write out the run of single code points that is
# still pending (PREV_LBC is -1 when there is none).
END {
  if (prev_lbc != -1)
    {
      # FROM and TO already carry their "0x" prefix (they are stored as
      # "0x" $1), so the format must not add another one; otherwise the
      # last entry would come out as "0x0xXXXX".
      if (from == to)
	printf "%s %d\n", from, prev_lbc;
      else
	printf "%s-%s %d\n", from, to, prev_lbc;
    }
}