FCDIterCollationIterator.java
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2012-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
* FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp
*
* C++ version created on: 2012sep23 (from utf16collationiterator.h)
* created by: Markus W. Scherer
*/
package com.ibm.icu.impl.coll;
import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.text.UCharacterIterator;
/** Incrementally checks the input text for FCD and normalizes where necessary. */
public final class FCDIterCollationIterator extends IterCollationIterator {
public FCDIterCollationIterator(
CollationData data, boolean numeric, UCharacterIterator ui, int startIndex) {
super(data, numeric, ui);
state = State.ITER_CHECK_FWD;
start = startIndex;
nfcImpl = data.nfcImpl;
}
@Override
public void resetToOffset(int newOffset) {
super.resetToOffset(newOffset);
start = newOffset;
state = State.ITER_CHECK_FWD;
}
@Override
public int getOffset() {
if (state.compareTo(State.ITER_CHECK_BWD) <= 0) {
return iter.getIndex();
} else if (state == State.ITER_IN_FCD_SEGMENT) {
return pos;
} else if (pos == 0) {
return start;
} else {
return limit;
}
}
@Override
public int nextCodePoint() {
int c;
for (; ; ) {
if (state == State.ITER_CHECK_FWD) {
c = iter.next();
if (c < 0) {
return c;
}
if (CollationFCD.hasTccc(c)) {
if (CollationFCD.maybeTibetanCompositeVowel(c)
|| CollationFCD.hasLccc(iter.current())) {
iter.previous();
if (!nextSegment()) {
return Collation.SENTINEL_CP;
}
continue;
}
}
if (isLeadSurrogate(c)) {
int trail = iter.next();
if (isTrailSurrogate(trail)) {
return Character.toCodePoint((char) c, (char) trail);
} else if (trail >= 0) {
iter.previous();
}
}
return c;
} else if (state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
c = iter.nextCodePoint();
pos += Character.charCount(c);
assert (c >= 0);
return c;
} else if (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0
&& pos != normalized.length()) {
c = normalized.codePointAt(pos);
pos += Character.charCount(c);
return c;
} else {
switchToForward();
}
}
}
@Override
public int previousCodePoint() {
int c;
for (; ; ) {
if (state == State.ITER_CHECK_BWD) {
c = iter.previous();
if (c < 0) {
start = pos = 0;
state = State.ITER_IN_FCD_SEGMENT;
return Collation.SENTINEL_CP;
}
if (CollationFCD.hasLccc(c)) {
int prev = Collation.SENTINEL_CP;
if (CollationFCD.maybeTibetanCompositeVowel(c)
|| CollationFCD.hasTccc(prev = iter.previous())) {
iter.next();
if (prev >= 0) {
iter.next();
}
if (!previousSegment()) {
return Collation.SENTINEL_CP;
}
continue;
}
// hasLccc(trail)=true for all trail surrogates
if (isTrailSurrogate(c)) {
if (prev < 0) {
prev = iter.previous();
}
if (isLeadSurrogate(prev)) {
return Character.toCodePoint((char) prev, (char) c);
}
}
if (prev >= 0) {
iter.next();
}
}
return c;
} else if (state == State.ITER_IN_FCD_SEGMENT && pos != start) {
c = iter.previousCodePoint();
pos -= Character.charCount(c);
assert (c >= 0);
return c;
} else if (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) {
c = normalized.codePointBefore(pos);
pos -= Character.charCount(c);
return c;
} else {
switchToBackward();
}
}
}
@Override
protected long handleNextCE32() {
int c;
for (; ; ) {
if (state == State.ITER_CHECK_FWD) {
c = iter.next();
if (c < 0) {
return NO_CP_AND_CE32;
}
if (CollationFCD.hasTccc(c)) {
if (CollationFCD.maybeTibetanCompositeVowel(c)
|| CollationFCD.hasLccc(iter.current())) {
iter.previous();
if (!nextSegment()) {
c = Collation.SENTINEL_CP;
return Collation.FALLBACK_CE32;
}
continue;
}
}
break;
} else if (state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
c = iter.next();
++pos;
assert (c >= 0);
break;
} else if (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0
&& pos != normalized.length()) {
c = normalized.charAt(pos++);
break;
} else {
switchToForward();
}
}
return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char) c));
}
@Override
protected char handleGetTrailSurrogate() {
if (state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) {
int trail = iter.next();
if (isTrailSurrogate(trail)) {
if (state == State.ITER_IN_FCD_SEGMENT) {
++pos;
}
} else if (trail >= 0) {
iter.previous();
}
return (char) trail;
} else {
assert (pos < normalized.length());
char trail;
if (Character.isLowSurrogate(trail = normalized.charAt(pos))) {
++pos;
}
return trail;
}
}
@Override
protected void forwardNumCodePoints(int num) {
// Specify the class to avoid a virtual-function indirection.
// In Java, we would declare this class final.
while (num > 0 && nextCodePoint() >= 0) {
--num;
}
}
@Override
protected void backwardNumCodePoints(int num) {
// Specify the class to avoid a virtual-function indirection.
// In Java, we would declare this class final.
while (num > 0 && previousCodePoint() >= 0) {
--num;
}
}
/** Switches to forward checking if possible. */
private void switchToForward() {
assert (state == State.ITER_CHECK_BWD
|| (state == State.ITER_IN_FCD_SEGMENT && pos == limit)
|| (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0
&& pos == normalized.length()));
if (state == State.ITER_CHECK_BWD) {
// Turn around from backward checking.
start = pos = iter.getIndex();
if (pos == limit) {
state = State.ITER_CHECK_FWD; // Check forward.
} else { // pos < limit
state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
}
} else {
// Reached the end of the FCD segment.
if (state == State.ITER_IN_FCD_SEGMENT) {
// The input text segment is FCD, extend it forward.
} else {
// The input text segment needed to be normalized.
// Switch to checking forward from it.
if (state == State.IN_NORM_ITER_AT_START) {
iter.moveIndex(limit - start);
}
start = limit;
}
state = State.ITER_CHECK_FWD;
}
}
/**
* Extends the FCD text segment forward or normalizes around pos.
*
* @return true if success
*/
private boolean nextSegment() {
assert (state == State.ITER_CHECK_FWD);
// The input text [start..(iter index)[ passes the FCD check.
pos = iter.getIndex();
// Collect the characters being checked, in case they need to be normalized.
if (s == null) {
s = new StringBuilder();
} else {
s.setLength(0);
}
int prevCC = 0;
for (; ; ) {
// Fetch the next character and its fcd16 value.
int c = iter.nextCodePoint();
if (c < 0) {
break;
}
int fcd16 = nfcImpl.getFCD16(c);
int leadCC = fcd16 >> 8;
if (leadCC == 0 && s.length() != 0) {
// FCD boundary before this character.
iter.previousCodePoint();
break;
}
s.appendCodePoint(c);
if (leadCC != 0
&& (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
// Fails FCD check. Find the next FCD boundary and normalize.
for (; ; ) {
c = iter.nextCodePoint();
if (c < 0) {
break;
}
if (nfcImpl.getFCD16(c) <= 0xff) {
iter.previousCodePoint();
break;
}
s.appendCodePoint(c);
}
normalize(s);
start = pos;
limit = pos + s.length();
state = State.IN_NORM_ITER_AT_LIMIT;
pos = 0;
return true;
}
prevCC = fcd16 & 0xff;
if (prevCC == 0) {
// FCD boundary after the last character.
break;
}
}
limit = pos + s.length();
assert (pos != limit);
iter.moveIndex(-s.length());
state = State.ITER_IN_FCD_SEGMENT;
return true;
}
/** Switches to backward checking. */
private void switchToBackward() {
assert (state == State.ITER_CHECK_FWD
|| (state == State.ITER_IN_FCD_SEGMENT && pos == start)
|| (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0));
if (state == State.ITER_CHECK_FWD) {
// Turn around from forward checking.
limit = pos = iter.getIndex();
if (pos == start) {
state = State.ITER_CHECK_BWD; // Check backward.
} else { // pos > start
state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
}
} else {
// Reached the start of the FCD segment.
if (state == State.ITER_IN_FCD_SEGMENT) {
// The input text segment is FCD, extend it backward.
} else {
// The input text segment needed to be normalized.
// Switch to checking backward from it.
if (state == State.IN_NORM_ITER_AT_LIMIT) {
iter.moveIndex(start - limit);
}
limit = start;
}
state = State.ITER_CHECK_BWD;
}
}
/**
* Extends the FCD text segment backward or normalizes around pos.
*
* @return true if success
*/
private boolean previousSegment() {
assert (state == State.ITER_CHECK_BWD);
// The input text [(iter index)..limit[ passes the FCD check.
pos = iter.getIndex();
// Collect the characters being checked, in case they need to be normalized.
if (s == null) {
s = new StringBuilder();
} else {
s.setLength(0);
}
int nextCC = 0;
for (; ; ) {
// Fetch the previous character and its fcd16 value.
int c = iter.previousCodePoint();
if (c < 0) {
break;
}
int fcd16 = nfcImpl.getFCD16(c);
int trailCC = fcd16 & 0xff;
if (trailCC == 0 && s.length() != 0) {
// FCD boundary after this character.
iter.nextCodePoint();
break;
}
s.appendCodePoint(c);
if (trailCC != 0
&& ((nextCC != 0 && trailCC > nextCC)
|| CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
// Fails FCD check. Find the previous FCD boundary and normalize.
while (fcd16 > 0xff) {
c = iter.previousCodePoint();
if (c < 0) {
break;
}
fcd16 = nfcImpl.getFCD16(c);
if (fcd16 == 0) {
iter.nextCodePoint();
break;
}
s.appendCodePoint(c);
}
s.reverse();
normalize(s);
limit = pos;
start = pos - s.length();
state = State.IN_NORM_ITER_AT_START;
pos = normalized.length();
return true;
}
nextCC = fcd16 >> 8;
if (nextCC == 0) {
// FCD boundary before the following character.
break;
}
}
start = pos - s.length();
assert (pos != start);
iter.moveIndex(s.length());
state = State.ITER_IN_FCD_SEGMENT;
return true;
}
private void normalize(CharSequence s) {
if (normalized == null) {
normalized = new StringBuilder();
}
// NFD without argument checking.
nfcImpl.decompose(s, normalized);
}
private enum State {
/**
* The input text [start..(iter index)[ passes the FCD check. Moving forward checks
* incrementally. pos & limit are undefined.
*/
ITER_CHECK_FWD,
/**
* The input text [(iter index)..limit[ passes the FCD check. Moving backward checks
* incrementally. start & pos are undefined.
*/
ITER_CHECK_BWD,
/**
* The input text [start..limit[ passes the FCD check. pos tracks the current text index.
*/
ITER_IN_FCD_SEGMENT,
/**
* The input text [start..limit[ failed the FCD check and was normalized. pos tracks the
* current index in the normalized string. The text iterator is at the limit index.
*/
IN_NORM_ITER_AT_LIMIT,
/**
* The input text [start..limit[ failed the FCD check and was normalized. pos tracks the
* current index in the normalized string. The text iterator is at the start index.
*/
IN_NORM_ITER_AT_START
}
private State state;
private int start;
private int pos;
private int limit;
private final Normalizer2Impl nfcImpl;
private StringBuilder s;
private StringBuilder normalized;
}