TransliterationRuleSet.java
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 1996-2010, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.UtilityExtensions;
import java.util.ArrayList;
import java.util.List;
/**
* A set of rules for a <code>RuleBasedTransliterator</code>. This set encodes the transliteration
* in one direction from one set of characters or short strings to another. A <code>
* RuleBasedTransliterator</code> consists of up to two such sets, one for the forward direction,
* and one for the reverse.
*
* <p>A <code>TransliterationRuleSet</code> has one important operation, that of finding a matching
* rule at a given point in the text. This is accomplished by the <code>findMatch()</code> method.
*
* <p>Copyright © IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
*/
class TransliterationRuleSet {
/** Vector of rules, in the order added. */
private List<TransliterationRule> ruleVector;
/** Length of the longest preceding context */
private int maxContextLength;
/**
* Sorted and indexed table of rules. This is created by freeze() from the rules in ruleVector.
* rules.length >= ruleVector.size(), and the references in rules[] are aliases of the
* references in ruleVector. A single rule in ruleVector is listed one or more times in rules[].
*/
private TransliterationRule[] rules;
/**
* Index table. For text having a first character c, compute x = c&0xFF. Now use
* rules[index[x]..index[x+1]-1]. This index table is created by freeze().
*/
private int[] index;
/** Construct a new empty rule set. */
public TransliterationRuleSet() {
ruleVector = new ArrayList<TransliterationRule>();
maxContextLength = 0;
}
/**
* Return the maximum context length.
*
* @return the length of the longest preceding context.
*/
public int getMaximumContextLength() {
return maxContextLength;
}
/**
* Add a rule to this set. Rules are added in order, and order is significant.
*
* @param rule the rule to add
*/
public void addRule(TransliterationRule rule) {
ruleVector.add(rule);
int len;
if ((len = rule.getAnteContextLength()) > maxContextLength) {
maxContextLength = len;
}
rules = null;
}
/**
* Close this rule set to further additions, check it for masked rules, and index it to optimize
* performance.
*
* @exception IllegalArgumentException if some rules are masked
*/
public void freeze() {
/* Construct the rule array and index table. We reorder the
* rules by sorting them into 256 bins. Each bin contains all
* rules matching the index value for that bin. A rule
* matches an index value if string whose first key character
* has a low byte equal to the index value can match the rule.
*
* Each bin contains zero or more rules, in the same order
* they were found originally. However, the total rules in
* the bins may exceed the number in the original vector,
* since rules that have a variable as their first key
* character will generally fall into more than one bin.
*
* That is, each bin contains all rules that either have that
* first index value as their first key character, or have
* a set containing the index value as their first character.
*/
int n = ruleVector.size();
index = new int[257]; // [sic]
List<TransliterationRule> v =
new ArrayList<TransliterationRule>(2 * n); // heuristic; adjust as needed
/* Precompute the index values. This saves a LOT of time.
*/
int[] indexValue = new int[n];
for (int j = 0; j < n; ++j) {
TransliterationRule r = ruleVector.get(j);
indexValue[j] = r.getIndexValue();
}
for (int x = 0; x < 256; ++x) {
index[x] = v.size();
for (int j = 0; j < n; ++j) {
if (indexValue[j] >= 0) {
if (indexValue[j] == x) {
v.add(ruleVector.get(j));
}
} else {
// If the indexValue is < 0, then the first key character is
// a set, and we must use the more time-consuming
// matchesIndexValue check. In practice this happens
// rarely, so we seldom tread this code path.
TransliterationRule r = ruleVector.get(j);
if (r.matchesIndexValue(x)) {
v.add(r);
}
}
}
}
index[256] = v.size();
/* Freeze things into an array.
*/
rules = new TransliterationRule[v.size()];
v.toArray(rules);
StringBuilder errors = null;
/* Check for masking. This is MUCH faster than our old check,
* which was each rule against each following rule, since we
* only have to check for masking within each bin now. It's
* 256*O(n2^2) instead of O(n1^2), where n1 is the total rule
* count, and n2 is the per-bin rule count. But n2<<n1, so
* it's a big win.
*/
for (int x = 0; x < 256; ++x) {
for (int j = index[x]; j < index[x + 1] - 1; ++j) {
TransliterationRule r1 = rules[j];
for (int k = j + 1; k < index[x + 1]; ++k) {
TransliterationRule r2 = rules[k];
if (r1.masks(r2)) {
if (errors == null) {
errors = new StringBuilder();
} else {
errors.append("\n");
}
errors.append("Rule " + r1 + " masks " + r2);
}
}
}
}
if (errors != null) {
throw new IllegalArgumentException(errors.toString());
}
}
/**
* Transliterate the given text with the given UTransPosition indices. Return true if the
* transliteration should continue or false if it should halt (because of a U_PARTIAL_MATCH
* match). Note that false is only ever returned if isIncremental is true.
*
* @param text the text to be transliterated
* @param pos the position indices, which will be updated
* @param incremental if true, assume new text may be inserted at index.limit, and return false
* if there is a partial match.
* @return true unless a U_PARTIAL_MATCH has been obtained, indicating that transliteration
* should stop until more text arrives.
*/
public boolean transliterate(
Replaceable text, Transliterator.Position pos, boolean incremental) {
int indexByte = text.char32At(pos.start) & 0xFF;
for (int i = index[indexByte]; i < index[indexByte + 1]; ++i) {
int m = rules[i].matchAndReplace(text, pos, incremental);
switch (m) {
case UnicodeMatcher.U_MATCH:
if (Transliterator.DEBUG) {
System.out.println(
(incremental ? "Rule.i: match " : "Rule: match ")
+ rules[i].toRule(true)
+ " => "
+ UtilityExtensions.formatInput(text, pos));
}
return true;
case UnicodeMatcher.U_PARTIAL_MATCH:
if (Transliterator.DEBUG) {
System.out.println(
(incremental ? "Rule.i: partial match " : "Rule: partial match ")
+ rules[i].toRule(true)
+ " => "
+ UtilityExtensions.formatInput(text, pos));
}
return false;
default:
if (Transliterator.DEBUG) {
System.out.println("Rule: no match " + rules[i]);
}
}
}
// No match or partial match from any rule
pos.start += UTF16.getCharCount(text.char32At(pos.start));
if (Transliterator.DEBUG) {
System.out.println(
(incremental ? "Rule.i: no match => " : "Rule: no match => ")
+ UtilityExtensions.formatInput(text, pos));
}
return true;
}
/** Create rule strings that represents this rule set. */
String toRules(boolean escapeUnprintable) {
int i;
int count = ruleVector.size();
StringBuilder ruleSource = new StringBuilder();
for (i = 0; i < count; ++i) {
if (i != 0) {
ruleSource.append('\n');
}
TransliterationRule r = ruleVector.get(i);
ruleSource.append(r.toRule(escapeUnprintable));
}
return ruleSource.toString();
}
// TODO Handle the case where we have :: [a] ; a > |b ; b > c ;
// TODO Merge into r.addSourceTargetSet, to avoid duplicate testing
void addSourceTargetSet(UnicodeSet filter, UnicodeSet sourceSet, UnicodeSet targetSet) {
UnicodeSet currentFilter = new UnicodeSet(filter);
UnicodeSet revisiting = new UnicodeSet();
int count = ruleVector.size();
for (int i = 0; i < count; ++i) {
TransliterationRule r = ruleVector.get(i);
r.addSourceTargetSet(currentFilter, sourceSet, targetSet, revisiting.clear());
currentFilter.addAll(revisiting);
}
}
}