// StringMatcher.java
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2001-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.Utility;
/**
 * An object that matches a fixed input string, implementing the UnicodeMatcher API. This object
 * also implements the UnicodeReplacer API, allowing it to emit the matched text as output. Since
 * the match text may contain flexible match elements, such as UnicodeSets, the emitted text is not
 * the match pattern, but instead a substring of the actual matched text. The emitted text is the
 * rightmost match recorded since the last call to {@link #resetMatch()}: a successful forward
 * match always overwrites the recorded span, while a successful reverse match records its span
 * only if no span is recorded yet.
 *
 * <p>A StringMatcher may represent a segment, in which case it has a positive segment number. This
 * affects how the matcher converts itself to a pattern but does not otherwise affect its function.
 *
 * <p>A StringMatcher that is not a segment should not be used as a UnicodeReplacer.
 */
class StringMatcher implements UnicodeMatcher, UnicodeReplacer {

    /** The text to be matched. */
    private final String pattern;

    /**
     * Start offset, in the match text, of the <em>rightmost</em> match, or a negative value if no
     * match has been recorded since construction or the last {@link #resetMatch()}.
     */
    private int matchStart;

    /** Limit offset, in the match text, of the <em>rightmost</em> match. */
    private int matchLimit;

    /** The segment number, 1-based, or 0 if not a segment. */
    private final int segmentNumber;

    /** Context object that maps stand-ins to matcher and replacer objects. */
    private final RuleBasedTransliterator.Data data;

    /**
     * Construct a matcher that matches the given pattern string.
     *
     * @param theString the pattern to be matched, possibly containing stand-ins that represent
     *     nested UnicodeMatcher objects.
     * @param segmentNum the segment number from 1..n, or 0 if this is not a segment.
     * @param theData context object mapping stand-ins to UnicodeMatcher objects.
     */
    public StringMatcher(String theString, int segmentNum, RuleBasedTransliterator.Data theData) {
        data = theData;
        pattern = theString;
        matchStart = matchLimit = -1; // no match recorded yet
        segmentNumber = segmentNum;
    }

    /**
     * Construct a matcher that matches a substring of the given pattern string.
     *
     * @param theString the pattern to be matched, possibly containing stand-ins that represent
     *     nested UnicodeMatcher objects.
     * @param start first character of theString to be matched
     * @param limit index after the last character of theString to be matched.
     * @param segmentNum the segment number from 1..n, or 0 if this is not a segment.
     * @param theData context object mapping stand-ins to UnicodeMatcher objects.
     * @throws IndexOutOfBoundsException if {@code start..limit} is not a valid range of {@code
     *     theString} (raised by {@link String#substring(int, int)}).
     */
    public StringMatcher(
            String theString,
            int start,
            int limit,
            int segmentNum,
            RuleBasedTransliterator.Data theData) {
        this(theString.substring(start, limit), segmentNum, theData);
    }

    /**
     * Implement UnicodeMatcher. Attempts to match the pattern against {@code text}, starting at
     * {@code offset[0]}. If {@code limit} is less than {@code offset[0]} the pattern is matched
     * in the reverse direction (last pattern character first, moving towards lower indices);
     * otherwise it is matched in the forward direction.
     *
     * <p>Note (1): We process text in 16-bit code units, rather than 32-bit code points. This
     * works because stand-ins are always in the BMP and because we are doing a literal match
     * operation, which can be done 16-bits at a time.
     *
     * @param text the text to match against.
     * @param offset single-element in/out array. On entry {@code offset[0]} is the position at
     *     which matching starts; it is updated to the position reached by the match only when
     *     {@code U_MATCH} is returned, and is left untouched otherwise.
     * @param limit the boundary that matching must not cross.
     * @param incremental if true, a forward match that reaches {@code limit} before the pattern
     *     is exhausted yields {@code U_PARTIAL_MATCH}; the flag is also passed through to nested
     *     matchers.
     * @return {@code U_MATCH} on a complete match, {@code U_MISMATCH} when a literal fails to
     *     match, {@code U_PARTIAL_MATCH} as described above, or whatever non-{@code U_MATCH}
     *     value a nested matcher returned.
     */
    @Override
    public int matches(Replaceable text, int[] offset, int limit, boolean incremental) {
        return (limit < offset[0])
                ? matchReverse(text, offset, limit, incremental)
                : matchForward(text, offset, limit, incremental);
    }

    /** Matches the pattern right-to-left; see {@link #matches} for the contract. */
    private int matchReverse(Replaceable text, int[] offset, int limit, boolean incremental) {
        // Work on a private cursor so that offset[0] is only modified on success.
        int[] cursor = {offset[0]};
        for (int i = pattern.length() - 1; i >= 0; --i) {
            char keyChar = pattern.charAt(i); // OK; see note (1) on matches()
            UnicodeMatcher subm = data.lookupMatcher(keyChar);
            if (subm == null) {
                // No nested matcher for this code unit: compare it literally.
                if (cursor[0] > limit
                        && keyChar == text.charAt(cursor[0])) { // OK; see note (1) on matches()
                    --cursor[0];
                } else {
                    return U_MISMATCH;
                }
            } else {
                int m = subm.matches(text, cursor, limit, incremental);
                if (m != U_MATCH) {
                    return m;
                }
            }
        }
        // Record the match position, but adjust for a normal
        // forward start, limit, and only if a prior match does not
        // exist -- we want the rightmost match.
        if (matchStart < 0) {
            matchStart = cursor[0] + 1;
            matchLimit = offset[0] + 1;
        }
        offset[0] = cursor[0];
        return U_MATCH;
    }

    /** Matches the pattern left-to-right; see {@link #matches} for the contract. */
    private int matchForward(Replaceable text, int[] offset, int limit, boolean incremental) {
        // Work on a private cursor so that offset[0] is only modified on success.
        int[] cursor = {offset[0]};
        for (int i = 0; i < pattern.length(); ++i) {
            if (incremental && cursor[0] == limit) {
                // We've reached the context limit without a mismatch and
                // without completing our match.
                return U_PARTIAL_MATCH;
            }
            char keyChar = pattern.charAt(i); // OK; see note (1) on matches()
            UnicodeMatcher subm = data.lookupMatcher(keyChar);
            if (subm == null) {
                // Don't need the cursor < limit check if
                // incremental is true (because it's done above); do need
                // it otherwise.
                if (cursor[0] < limit
                        && keyChar == text.charAt(cursor[0])) { // OK; see note (1) on matches()
                    ++cursor[0];
                } else {
                    return U_MISMATCH;
                }
            } else {
                int m = subm.matches(text, cursor, limit, incremental);
                if (m != U_MATCH) {
                    return m;
                }
            }
        }
        // Record the match position; a forward match always replaces any earlier record.
        matchStart = offset[0];
        matchLimit = cursor[0];
        offset[0] = cursor[0];
        return U_MATCH;
    }

    /**
     * Implement UnicodeMatcher. Builds a pattern string for this matcher: each code unit of the
     * pattern is appended either as itself or, if a nested matcher is registered for it, as that
     * matcher's own pattern. The result is wrapped in parentheses when this matcher is a segment.
     *
     * @param escapeUnprintable passed through to {@code Utility.appendToRule} and to nested
     *     matchers' {@code toPattern}.
     * @return the pattern string.
     */
    @Override
    public String toPattern(boolean escapeUnprintable) {
        final boolean isSegment = segmentNumber > 0;
        StringBuilder result = new StringBuilder();
        StringBuilder quoteBuf = new StringBuilder();
        if (isSegment) {
            result.append('(');
        }
        for (int i = 0; i < pattern.length(); ++i) {
            char keyChar = pattern.charAt(i); // OK; see note (1) on matches()
            UnicodeMatcher m = data.lookupMatcher(keyChar);
            if (m == null) {
                Utility.appendToRule(result, keyChar, false, escapeUnprintable, quoteBuf);
            } else {
                Utility.appendToRule(
                        result, m.toPattern(escapeUnprintable), true, escapeUnprintable, quoteBuf);
            }
        }
        // NOTE(review): the closing parenthesis is appended before quoteBuf is flushed;
        // confirm that pending quoted text cannot end up after the ')'.
        if (isSegment) {
            result.append(')');
        }
        // Flush quoteBuf out to result
        Utility.appendToRule(result, -1, true, escapeUnprintable, quoteBuf);
        return result.toString();
    }

    /**
     * Implement UnicodeMatcher. An empty pattern reports true for every value. Otherwise only the
     * first code point of the pattern is examined: if a nested matcher is registered for it the
     * question is delegated to that matcher, else the low byte of the code point is compared
     * with {@code v}.
     *
     * @param v the index value to test.
     * @return whether this matcher reports a match for {@code v} as described above.
     */
    @Override
    public boolean matchesIndexValue(int v) {
        if (pattern.length() == 0) {
            return true;
        }
        int c = UTF16.charAt(pattern, 0);
        UnicodeMatcher m = data.lookupMatcher(c);
        if (m != null) {
            return m.matchesIndexValue(v);
        }
        return (c & 0xFF) == v;
    }

    /**
     * Implementation of UnicodeMatcher API. Union the set of all characters that may be matched by
     * this object into the given set.
     *
     * @param toUnionTo the set into which to union the source characters
     */
    @Override
    public void addMatchSetTo(UnicodeSet toUnionTo) {
        int i = 0;
        while (i < pattern.length()) {
            int ch = UTF16.charAt(pattern, i);
            UnicodeMatcher matcher = data.lookupMatcher(ch);
            if (matcher == null) {
                toUnionTo.add(ch);
            } else {
                matcher.addMatchSetTo(toUnionTo);
            }
            // Step by whole code points.
            i += UTF16.getCharCount(ch);
        }
    }

    /**
     * UnicodeReplacer API. Replaces {@code text[start, limit)} with the text of the recorded
     * match, if any; with no recorded match (or an empty one) the range is simply deleted.
     *
     * @param text the text being edited.
     * @param start start of the range to replace.
     * @param limit limit of the range to replace.
     * @param cursor not used by this implementation.
     * @return the length of the recorded match that was copied, or 0 if nothing was copied.
     */
    @Override
    public int replace(Replaceable text, int start, int limit, int[] cursor) {
        int outLen = 0;
        // If there was no match, that means that a quantifier
        // matched zero-length. E.g., x (a)* y matched "xy".
        if (matchStart >= 0 && matchStart != matchLimit) {
            // Copy segment with out-of-band data. The copy is placed at limit, i.e.
            // directly after the range that is deleted below.
            text.copy(matchStart, matchLimit, limit);
            outLen = matchLimit - matchStart;
        }
        text.replace(start, limit, ""); // delete original text
        return outLen;
    }

    /**
     * UnicodeReplacer API. Returns a reference to this segment of the form {@code $n}, where n
     * is the segment number. Only meaningful when this matcher is a segment (segment number
     * greater than zero); this precondition is not checked.
     *
     * @param escapeUnprintable not used by this implementation.
     * @return the segment reference string.
     */
    @Override
    public String toReplacerPattern(boolean escapeUnprintable) {
        StringBuilder rule = new StringBuilder("$");
        Utility.appendNumber(rule, segmentNumber, 10, 1);
        return rule.toString();
    }

    /**
     * Remove any match data. This must be called before performing a set of matches with this
     * segment.
     */
    public void resetMatch() {
        matchStart = matchLimit = -1;
    }

    /**
     * Union the set of all characters that may be output by this object into the given set. This
     * implementation intentionally adds nothing.
     *
     * @param toUnionTo the set into which to union the output characters
     */
    @Override
    public void addReplacementSetTo(UnicodeSet toUnionTo) {
        // The output of this replacer varies; it is the source text between
        // matchStart and matchLimit. Since this varies depending on the
        // input text, we can't compute it here. We can either do nothing
        // or we can add ALL characters to the set. It's probably more useful
        // to do nothing.
    }
}
// eof