RuleBasedBreakIterator.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2005-2016 International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.text;

import static com.ibm.icu.impl.CharacterIteration.DONE32;
import static com.ibm.icu.impl.CharacterIteration.next32;
import static com.ibm.icu.impl.CharacterIteration.nextTrail32;

import com.ibm.icu.impl.CharacterIteration;
import com.ibm.icu.impl.ICUBinary;
import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.impl.RBBIDataWrapper;
import com.ibm.icu.impl.breakiter.BurmeseBreakEngine;
import com.ibm.icu.impl.breakiter.CjkBreakEngine;
import com.ibm.icu.impl.breakiter.DictionaryBreakEngine;
import com.ibm.icu.impl.breakiter.KhmerBreakEngine;
import com.ibm.icu.impl.breakiter.LSTMBreakEngine;
import com.ibm.icu.impl.breakiter.LanguageBreakEngine;
import com.ibm.icu.impl.breakiter.LaoBreakEngine;
import com.ibm.icu.impl.breakiter.ThaiBreakEngine;
import com.ibm.icu.impl.breakiter.UnhandledBreakEngine;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.UnicodeSet.XSymbolTable;
import com.ibm.icu.util.CodePointTrie;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.text.CharacterIterator;
import java.util.MissingResourceException;
import java.util.concurrent.ConcurrentLinkedQueue;

/**
 * Rule Based Break Iterator This is a port of the C++ class RuleBasedBreakIterator from ICU4C.
 *
 * @stable ICU 2.0
 */
public class RuleBasedBreakIterator extends BreakIterator implements Cloneable {
    // =======================================================================
    // Constructors & Factories
    // =======================================================================

    /** private constructor */
    private RuleBasedBreakIterator() {
        fDictionaryCharCount = 0;
    }

    /**
     * Create a break iterator from a precompiled set of break rules.
     *
     * <p>Creating a break iterator from the binary rules is much faster than creating one from
     * source rules.
     *
     * <p>The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
     * Binary break iterator rules are not guaranteed to be compatible between different versions of
     * ICU.
     *
     * @param is an input stream supplying the compiled binary rules.
     * @throws IOException if there is an error while reading the rules from the InputStream.
     * @see #compileRules(String, OutputStream)
     * @stable ICU 4.8
     */
    public static RuleBasedBreakIterator getInstanceFromCompiledRules(InputStream is)
            throws IOException {
        RuleBasedBreakIterator This = new RuleBasedBreakIterator();
        This.fRData = RBBIDataWrapper.get(ICUBinary.getByteBufferFromInputStreamAndCloseStream(is));
        This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
        return This;
    }

    /**
     * This factory method doesn't have an access modifier; it is only accessible in the same
     * package.
     *
     * <p>Create a break iterator from a precompiled set of break rules.
     *
     * <p>Creating a break iterator from the binary rules is much faster than creating one from
     * source rules.
     *
     * <p>The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
     * Binary break iterator rules are not guaranteed to be compatible between different versions of
     * ICU.
     *
     * @param bytes a buffer supplying the compiled binary rules.
     * @param phraseBreaking a flag indicating if phrase breaking is required.
     * @throws IOException if there is an error while reading the rules from the buffer.
     * @see #compileRules(String, OutputStream)
     * @internal
     */
    /* package-potected */ static RuleBasedBreakIterator getInstanceFromCompiledRules(
            ByteBuffer bytes, boolean phraseBreaking) throws IOException {
        RuleBasedBreakIterator instance = getInstanceFromCompiledRules(bytes);
        instance.fPhraseBreaking = phraseBreaking;
        return instance;
    }

    /**
     * Create a break iterator from a precompiled set of break rules.
     *
     * <p>Creating a break iterator from the binary rules is much faster than creating one from
     * source rules.
     *
     * <p>The binary rules are generated by the RuleBasedBreakIterator.compileRules() function.
     * Binary break iterator rules are not guaranteed to be compatible between different versions of
     * ICU.
     *
     * @param bytes a buffer supplying the compiled binary rules.
     * @throws IOException if there is an error while reading the rules from the buffer.
     * @see #compileRules(String, OutputStream)
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static RuleBasedBreakIterator getInstanceFromCompiledRules(ByteBuffer bytes)
            throws IOException {
        RuleBasedBreakIterator This = new RuleBasedBreakIterator();
        This.fRData = RBBIDataWrapper.get(bytes);
        This.fLookAheadMatches = new int[This.fRData.fFTable.fLookAheadResultsSize];
        return This;
    }

    /**
     * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
     *
     * @param rules The break rules to be used.
     * @stable ICU 2.2
     */
    public RuleBasedBreakIterator(String rules) {
        this(rules, /* customProperties= */ null);
    }

    /**
     * Construct a RuleBasedBreakIterator from a set of rules supplied as a string, using custom
     * properties.
     *
     * @param rules The break rules to be used.
     * @param customProperties An XSymbolTable that supplies the properties to use in the rules, or
     *     null to use ICU properties.
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public RuleBasedBreakIterator(String rules, XSymbolTable customProperties) {
        this();
        try {
            ByteArrayOutputStream ruleOS = new ByteArrayOutputStream();
            RBBIRuleBuilder.compileRules(rules, ruleOS, customProperties);
            fRData = RBBIDataWrapper.get(ByteBuffer.wrap(ruleOS.toByteArray()));
            fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
        } catch (IOException e) {
            // An IO exception can only arrive here if there is a bug in the RBBI Rule compiler,
            //  causing bogus compiled rules to be produced, but with no compile error raised.
            RuntimeException rte =
                    new RuntimeException(
                            "RuleBasedBreakIterator rule compilation internal error: "
                                    + e.getMessage());
            throw rte;
        }
    }

    // =======================================================================
    // Boilerplate
    // =======================================================================

    /**
     * Clones this iterator.
     *
     * @return A newly-constructed RuleBasedBreakIterator with the same behavior as this one.
     * @stable ICU 2.0
     */
    @Override
    public RuleBasedBreakIterator clone() {
        RuleBasedBreakIterator result;
        result = (RuleBasedBreakIterator) super.clone();
        if (fText != null) {
            result.fText = (CharacterIterator) fText.clone();
        }
        result.fLookAheadMatches = new int[fRData.fFTable.fLookAheadResultsSize];
        result.fBreakCache = result.new BreakCache(fBreakCache);
        result.fDictionaryCache = result.new DictionaryCache(fDictionaryCache);
        return result;
    }

    /**
     * Returns true if both BreakIterators are of the same class, have the same rules, and iterate
     * over the same text.
     *
     * @stable ICU 2.0
     */
    @Override
    public boolean equals(Object that) {
        if (that == null) {
            return false;
        }
        if (this == that) {
            return true;
        }
        try {
            RuleBasedBreakIterator other = (RuleBasedBreakIterator) that;
            if (fRData != other.fRData && (fRData == null || other.fRData == null)) {
                return false;
            }
            if (fRData != null
                    && other.fRData != null
                    && (!fRData.fRuleSource.equals(other.fRData.fRuleSource))) {
                return false;
            }
            if (fText == null && other.fText == null) {
                return true;
            }
            if (fText == null || other.fText == null || !fText.equals(other.fText)) {
                return false;
            }
            return fPosition == other.fPosition;
        } catch (ClassCastException e) {
            return false;
        }
    }

    /**
     * Returns the description (rules) used to create this iterator. (In ICU4C, the same function is
     * RuleBasedBreakIterator::getRules())
     *
     * @stable ICU 2.0
     */
    @Override
    public String toString() {
        String retStr = "";
        if (fRData != null) {
            retStr = fRData.fRuleSource;
        }
        return retStr;
    }

    /**
     * Compute a hashcode for this BreakIterator
     *
     * @return A hash code
     * @stable ICU 2.0
     */
    @Override
    public int hashCode() {
        return fRData.fRuleSource.hashCode();
    }

    private static final int START_STATE = 1; // The state number of the starting state
    private static final int STOP_STATE = 0; // The state-transition value indicating "stop"

    // RBBIRunMode - the state machine runs an extra iteration at the beginning and end
    //               of user text.  A variable with this enum type keeps track of where we
    //               are.  The state machine only fetches user text input while in RUN mode.
    private static final int RBBI_START = 0;
    private static final int RBBI_RUN = 1;
    private static final int RBBI_END = 2;

    /** The character iterator through which this BreakIterator accesses the text. */
    private CharacterIterator fText = new java.text.StringCharacterIterator("");

    /**
     * The rule data for this BreakIterator instance. Not intended for public use. Declared public
     * for testing purposes only.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated public RBBIDataWrapper fRData;

    /**
     * The iteration state - current position, rule status for the current position, and whether the
     * iterator ran off the end, yielding UBRK_DONE. Current position is pinned to be 0 < position
     * <= text.length. Current position is always set to a boundary.
     *
     * <p>The current position of the iterator. Pinned, 0 < fPosition <= text.length. Never has the
     * value UBRK_DONE (-1).
     */
    private int fPosition;

    /** Index of the Rule {tag} values for the most recent match. */
    private int fRuleStatusIndex;

    /** True when iteration has run off the end, and iterator functions should return UBRK_DONE. */
    private boolean fDone;

    /** Array of look-ahead tentative results. */
    private int[] fLookAheadMatches;

    /** Cache of previously determined boundary positions. */
    private BreakCache fBreakCache = new BreakCache();

    /** Flag used to indicate if phrase breaking is required. */
    private boolean fPhraseBreaking = false;

    /**
     * Counter for the number of characters encountered with the "dictionary" flag set. Normal RBBI
     * iterators don't use it, although the code for updating it is live. Dictionary Based break
     * iterators (a subclass of us) access this field directly.
     *
     * @internal
     */
    private int fDictionaryCharCount;

    private DictionaryCache fDictionaryCache = new DictionaryCache();

    /** ICU debug argument name for RBBI */
    private static final String RBBI_DEBUG_ARG = "rbbi";

    /** Debugging flag. Trace operation of state machine when true. */
    private static final boolean TRACE =
            ICUDebug.enabled(RBBI_DEBUG_ARG)
                    && ICUDebug.value(RBBI_DEBUG_ARG).indexOf("trace") >= 0;

    /**
     * The "default" break engine - just skips over ranges of dictionary words, producing no breaks.
     * Should only be used if characters need to be handled by a dictionary but we have no
     * dictionary implementation for them.
     *
     * <p>Only one instance; shared by all break iterators.
     */
    private static final UnhandledBreakEngine gUnhandledBreakEngine;

    /**
     * List of all known break engines, common for all break iterators. Lazily updated as break
     * engines are needed, because instantiation of break engines is expensive.
     *
     * <p>Important notes:
     *
     * <ul>
     *   Because we don't want to add the same LanguageBreakEngine multiple times, all writes are
     *   synchronized.
     *   <ul>
     *     Read access avoids explicit synchronization, but will end up being synchronized if
     *     needed.
     */
    private static final ConcurrentLinkedQueue<LanguageBreakEngine> gAllBreakEngines;

    static {
        gUnhandledBreakEngine = new UnhandledBreakEngine();
        gAllBreakEngines = new ConcurrentLinkedQueue<>();
        gAllBreakEngines.add(gUnhandledBreakEngine);
    }

    /**
     * Dump the contents of the state table and character classes for this break iterator. For
     * debugging only.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public void dump(java.io.PrintStream out) {
        if (out == null) {
            out = System.out;
        }
        this.fRData.dump(out);
    }

    /**
     * Compile a set of source break rules into the binary state tables used by the break iterator
     * engine. Creating a break iterator from precompiled rules is much faster than creating one
     * from source rules.
     *
     * <p>Binary break rules are not guaranteed to be compatible between different versions of ICU.
     *
     * @param rules The source form of the break rules
     * @param ruleBinary An output stream to receive the compiled rules.
     * @throws IOException If there is an error writing the output.
     * @see #getInstanceFromCompiledRules(InputStream)
     * @stable ICU 4.8
     */
    public static void compileRules(String rules, OutputStream ruleBinary) throws IOException {
        RBBIRuleBuilder.compileRules(rules, ruleBinary, /* customProperties= */ null);
    }

    // =======================================================================
    // BreakIterator overrides
    // =======================================================================

    /**
     * Sets the current iteration position to the beginning of the text. (i.e., the
     * CharacterIterator's starting offset).
     *
     * @return The offset of the beginning of the text.
     * @stable ICU 2.0
     */
    @Override
    public int first() {
        if (fText == null) {
            return BreakIterator.DONE;
        }
        fText.first();
        int start = fText.getIndex();
        if (!fBreakCache.seek(start)) {
            fBreakCache.populateNear(start);
        }
        fBreakCache.current();
        assert (fPosition == start);
        return fPosition;
    }

    /**
     * Sets the current iteration position to the end of the text. (i.e., the CharacterIterator's
     * ending offset).
     *
     * @return The text's past-the-end offset.
     * @stable ICU 2.0
     */
    @Override
    public int last() {
        if (fText == null) {
            return BreakIterator.DONE;
        }
        int endPos = fText.getEndIndex();
        boolean endShouldBeBoundary =
                isBoundary(endPos); // Has side effect of setting iterator position.
        assert (endShouldBeBoundary);
        if (fPosition != endPos) {
            assert (fPosition == endPos);
        }
        return endPos;
    }

    /**
     * Advances the iterator either forward or backward the specified number of steps. Negative
     * values move backward, and positive values move forward. This is equivalent to repeatedly
     * calling next() or previous().
     *
     * @param n The number of steps to move. The sign indicates the direction (negative is
     *     backwards, and positive is forwards).
     * @return The character offset of the boundary position n boundaries away from the current one.
     * @stable ICU 2.0
     */
    @Override
    public int next(int n) {
        int result = 0;
        if (n > 0) {
            for (; n > 0 && result != DONE; --n) {
                result = next();
            }
        } else if (n < 0) {
            for (; n < 0 && result != DONE; ++n) {
                result = previous();
            }
        } else {
            result = current();
        }
        return result;
    }

    /**
     * Advances the iterator to the next boundary position.
     *
     * @return The position of the first boundary after this one.
     * @stable ICU 2.0
     */
    @Override
    public int next() {
        fBreakCache.next();
        return fDone ? DONE : fPosition;
    }

    /**
     * Moves the iterator backwards, to the boundary preceding the current one.
     *
     * @return The position of the boundary position immediately preceding the starting position.
     * @stable ICU 2.0
     */
    @Override
    public int previous() {
        fBreakCache.previous();
        return fDone ? DONE : fPosition;
    }

    /**
     * Sets the iterator to refer to the first boundary position following the specified position.
     *
     * @param startPos The position from which to begin searching for a break position.
     * @return The position of the first break after the current position.
     * @stable ICU 2.0
     */
    @Override
    public int following(int startPos) {
        // if the supplied position is before the beginning, return the
        // text's starting offset
        if (startPos < fText.getBeginIndex()) {
            return first();
        }

        // Move requested offset to a code point start. It might be between a lead and trail
        // surrogate.
        // Or it may be beyond the end of the text.
        startPos = CISetIndex32(fText, startPos);
        fBreakCache.following(startPos);
        return fDone ? DONE : fPosition;
    }

    /**
     * Sets the iterator to refer to the last boundary position before the specified position.
     *
     * @param offset The position to begin searching for a break from.
     * @return The position of the last boundary before the starting position.
     * @stable ICU 2.0
     */
    @Override
    public int preceding(int offset) {
        if (fText == null || offset > fText.getEndIndex()) {
            return last();
        } else if (offset < fText.getBeginIndex()) {
            return DONE;
        }

        // Move requested offset to a code point start. It might be between a lead and trail
        // surrogate.
        // int adjustedOffset = CISetIndex32(fText, offset);    // TODO: restore to match ICU4C
        // behavior.
        int adjustedOffset = offset;
        fBreakCache.preceding(adjustedOffset);
        return fDone ? DONE : fPosition;
    }

    /**
     * Throw IndexOutOfBoundsException unless begin &lt;= offset &lt; end.
     *
     * @stable ICU 2.0
     */
    protected static final void checkOffset(int offset, CharacterIterator text) {
        if (offset < text.getBeginIndex() || offset > text.getEndIndex()) {
            throw new IndexOutOfBoundsException(offset);
        }
    }

    /**
     * Returns true if the specified position is a boundary position. As a side effect, leaves the
     * iterator pointing to the first boundary position at or after "offset".
     *
     * @param offset the offset to check.
     * @return True if "offset" is a boundary position.
     * @stable ICU 2.0
     */
    @Override
    public boolean isBoundary(int offset) {
        // TODO: behavior difference with ICU4C, which considers out-of-range offsets
        //       to not be boundaries, and to not be errors.
        checkOffset(offset, fText);

        // Adjust offset to be on a code point boundary and not beyond the end of the text.
        // Note that isBoundary() is always false for offsets that are not on code point boundaries.
        // But we still need the side effect of leaving iteration at the following boundary.
        int adjustedOffset = CISetIndex32(fText, offset);

        boolean result = false;
        if (fBreakCache.seek(adjustedOffset) || fBreakCache.populateNear(adjustedOffset)) {
            result = (fBreakCache.current() == offset);
        }

        if (!result) {
            // Not on a boundary. isBoundary() must leave iterator on the following boundary.
            // fBreakCache.seek(), above, left us on the preceding boundary, so advance one.
            next();
        }
        return result;
    }

    /**
     * Returns the current iteration position. Note that DONE is never returned from this function;
     * if iteration has run to the end of a string, current() will return the length of the string
     * while next() will return BreakIterator.DONE).
     *
     * @return The current iteration position.
     * @stable ICU 2.0
     */
    @Override
    public int current() {
        return (fText != null) ? fPosition : BreakIterator.DONE;
    }

    /**
     * Return the status tag from the break rule that determined the boundary at the current
     * iteration position. The values appear in the rule source within brackets, {123}, for example.
     * For rules that do not specify a status, a default value of 0 is returned. If more than one
     * rule applies, the numerically largest of the possible status values is returned.
     *
     * <p>Of the standard types of ICU break iterators, only the word and line break iterator
     * provides status values. The values are defined in class RuleBasedBreakIterator, and allow
     * distinguishing between words that contain alphabetic letters, "words" that appear to be
     * numbers, punctuation and spaces, words containing ideographic characters, and more. Call
     * <code>getRuleStatus</code> after obtaining a boundary position from <code>next()</code>,
     * <code>previous()</code>, or any other break iterator functions that returns a boundary
     * position.
     *
     * <p>Note that <code>getRuleStatus()</code> returns the value corresponding to <code>current()
     * </code> index even after <code>next()</code> has returned DONE.
     *
     * <p>
     *
     * @return the status from the break rule that determined the boundary at the current iteration
     *     position.
     * @stable ICU 60
     */
    @Override
    public int getRuleStatus() {
        //   Status records have this form:
        //           Count N         <--  fLastRuleStatusIndex points here.
        //           Status val 0
        //           Status val 1
        //              ...
        //           Status val N-1  <--  the value we need to return
        //   The status values are sorted in ascending order.
        //   This function returns the last (largest) of the array of status values.
        int idx = fRuleStatusIndex + fRData.fStatusTable[fRuleStatusIndex];
        int tagVal = fRData.fStatusTable[idx];
        return tagVal;
    }

    /**
     * Get the status (tag) values from the break rule(s) that determined the boundary at the
     * current iteration position. The values appear in the rule source within brackets, {123}, for
     * example. The default status value for rules that do not explicitly provide one is zero.
     *
     * <p>The status values used by the standard ICU break rules are defined as public constants in
     * class RuleBasedBreakIterator.
     *
     * <p>If the size of the output array is insufficient to hold the data, the output will be
     * truncated to the available length. No exception will be thrown.
     *
     * @param fillInArray an array to be filled in with the status values.
     * @return The number of rule status values from the rules that determined the boundary at the
     *     current iteration position. In the event that the array is too small, the return value is
     *     the total number of status values that were available, not the reduced number that were
     *     actually returned.
     * @stable ICU 60
     */
    @Override
    public int getRuleStatusVec(int[] fillInArray) {
        int numStatusVals = fRData.fStatusTable[fRuleStatusIndex];
        if (fillInArray != null) {
            int numToCopy = Math.min(numStatusVals, fillInArray.length);
            for (int i = 0; i < numToCopy; i++) {
                fillInArray[i] = fRData.fStatusTable[fRuleStatusIndex + i + 1];
            }
        }
        return numStatusVals;
    }

    /**
     * Returns a CharacterIterator over the text being analyzed.
     *
     * <p><b><i>Caution:</i></b>The state of the returned CharacterIterator must not be modified in
     * any way while the BreakIterator is still in use. Doing so will lead to undefined behavior of
     * the BreakIterator. Clone the returned CharacterIterator first and work with that.
     *
     * <p>The returned CharacterIterator is a reference to the <b>actual iterator being used</b> by
     * the BreakIterator. No guarantees are made about the current position of this iterator when it
     * is returned; it may differ from the BreakIterators current position. If you need to move that
     * position to examine the text, clone this function's return value first.
     *
     * @return An iterator over the text being analyzed.
     * @stable ICU 2.0
     */
    @Override
    public CharacterIterator getText() {
        return fText;
    }

    /**
     * Set the iterator to analyze a new piece of text. This function resets the current iteration
     * position to the beginning of the text. (The old iterator is dropped.)
     *
     * <p><b><i>Caution:</i></b> The supplied CharacterIterator is used directly by the
     * BreakIterator, and must not be altered in any way by code outside of the BreakIterator. Doing
     * so will lead to undefined behavior of the BreakIterator.
     *
     * @param newText An iterator over the text to analyze.
     * @stable ICU 2.0
     */
    @Override
    public void setText(CharacterIterator newText) {
        if (newText != null) {
            fBreakCache.reset(newText.getBeginIndex(), 0);
        } else {
            fBreakCache.reset();
        }
        fDictionaryCache.reset();
        fText = newText;
        this.first();
    }

    /**
     * Control debug, trace and dump options.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public static final String fDebugEnv =
            ICUDebug.enabled(RBBI_DEBUG_ARG) ? ICUDebug.value(RBBI_DEBUG_ARG) : null;

    private LanguageBreakEngine getLanguageBreakEngine(int c) {

        // We have a dictionary character.
        // Does an already instantiated break engine handle it?
        // First read without synchronization, which could lead to a new language
        // break engine being added and we didn't go over it.
        for (LanguageBreakEngine candidate : gAllBreakEngines) {
            if (candidate.handles(c)) {
                return candidate;
            }
        }

        synchronized (gAllBreakEngines) {
            // Another break iterator may have instantiated the desired engine.
            for (LanguageBreakEngine candidate : gAllBreakEngines) {
                if (candidate.handles(c)) {
                    return candidate;
                }
            }

            // The global list doesn't have an existing engine, build one.
            int script = UCharacter.getIntPropertyValue(c, UProperty.SCRIPT);
            if (script == UScript.KATAKANA || script == UScript.HIRAGANA) {
                // Katakana, Hiragana and Han are handled by the same dictionary engine.
                // Fold them together for mapping from script -> engine.
                script = UScript.HAN;
            }

            LanguageBreakEngine eng;
            try {
                switch (script) {
                    case UScript.THAI:
                        try {
                            eng =
                                    LSTMBreakEngine.create(
                                            script, LSTMBreakEngine.createData(script));
                        } catch (MissingResourceException e) {
                            eng = new ThaiBreakEngine();
                        }
                        break;
                    case UScript.LAO:
                        eng = new LaoBreakEngine();
                        break;
                    case UScript.MYANMAR:
                        try {
                            eng =
                                    LSTMBreakEngine.create(
                                            script, LSTMBreakEngine.createData(script));
                        } catch (MissingResourceException e) {
                            eng = new BurmeseBreakEngine();
                        }
                        break;
                    case UScript.KHMER:
                        eng = new KhmerBreakEngine();
                        break;
                    case UScript.HAN:
                        eng = new CjkBreakEngine(false);
                        break;
                    case UScript.HANGUL:
                        eng = new CjkBreakEngine(true);
                        break;
                    default:
                        gUnhandledBreakEngine.handleChar(c);
                        eng = gUnhandledBreakEngine;
                        break;
                }
            } catch (IOException e) {
                eng = null;
            }

            if (eng != null && eng != gUnhandledBreakEngine) {
                gAllBreakEngines.add(eng);
            }
            return eng;
        } // end synchronized(gAllBreakEngines)
    }

    /**
     * The State Machine Engine for moving forward is here. This function is the heart of the RBBI
     * run time engine.
     *
     * <p>Input fPosition, the position in the text to begin from. Output fPosition: the boundary
     * following the starting position. fDictionaryCharCount the number of dictionary characters
     * encountered. If > 0, the segment will be further subdivided fRuleStatusIndex Info from the
     * state table indicating which rules caused the boundary.
     *
     * @return the new iterator position
     *     <p>A note on supplementary characters and the position of underlying Java
     *     CharacterIterator: Normally, a character iterator is positioned at the char most recently
     *     returned by next(). Within this function, when a supplementary char is being processed,
     *     the char iterator is left sitting on the trail surrogate, in the middle of the code
     *     point. This is different from everywhere else, where an iterator always points at the
     *     lead surrogate of a supplementary.
     */
    private int handleNext() {
        if (TRACE) {
            System.out.println("Handle Next   pos      char  state category");
        }

        // handleNext always sets the break tag value.
        // Set the default for it.
        fRuleStatusIndex = 0;
        fDictionaryCharCount = 0;

        // caches for quicker access
        CharacterIterator text = fText;
        CodePointTrie trie = fRData.fTrie;

        char[] stateTable = fRData.fFTable.fTable;
        int initialPosition = fPosition;
        text.setIndex(initialPosition);
        int result = initialPosition;

        // Set up the starting char
        int c = text.current();
        if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
            c = nextTrail32(text, c);
            if (c == DONE32) {
                fDone = true;
                return BreakIterator.DONE;
            }
        }

        // Set the initial state for the state machine
        int state = START_STATE;
        int row = fRData.getRowIndex(state);
        short category = 3;
        int dictStart = fRData.fFTable.fDictCategoriesStart;
        int mode = RBBI_RUN;

        // loop until we reach the end of the text or transition to state 0
        while (state != STOP_STATE) {
            if (c == DONE32) {
                // Reached end of input string.
                if (mode == RBBI_END) {
                    // We have already run the loop one last time with the
                    // character set to the pseudo {eof} value. Now it is time
                    // to unconditionally bail out.
                    break;
                }
                // Run the loop one last time with the fake end-of-input character category
                mode = RBBI_END;
                category = 1;
            } else if (mode == RBBI_RUN) {
                // Get the char category.  An incoming category of 1 or 2 mens that
                //      we are preset for doing the beginning or end of input, and
                //      that we shouldn't get a category from an actual text input character.
                //

                // look up the current character's character category, which tells us
                // which column in the state table to look at.
                //
                category = (short) trie.get(c);

                // Check for categories that require word dictionary handling.
                if (category >= dictStart) {
                    fDictionaryCharCount++;
                }

                if (TRACE) {
                    System.out.print(
                            "            " + RBBIDataWrapper.intToString(text.getIndex(), 5));
                    System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                    System.out.println(
                            RBBIDataWrapper.intToString(state, 7)
                                    + RBBIDataWrapper.intToString(category, 6));
                }

                // Advance to the next character.
                // If this is a beginning-of-input loop iteration, don't advance.
                //    The next iteration will be processing the first real input character.
                c = text.next();
                if (c >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
                    c = nextTrail32(text, c);
                }
            } else {
                mode = RBBI_RUN;
            }

            // look up a state transition in the state table
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row = fRData.getRowIndex(state);
            int accepting = stateTable[row + RBBIDataWrapper.ACCEPTING];
            if (accepting == RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
                // Match found, common case
                result = text.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the start of it.
                    result--;
                }

                //  Remember the break status (tag) values.
                fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
            } else if (accepting > RBBIDataWrapper.ACCEPTING_UNCONDITIONAL) {
                // Lookahead match is completed
                int lookaheadResult = fLookAheadMatches[accepting];
                if (lookaheadResult >= 0) {
                    fRuleStatusIndex = stateTable[row + RBBIDataWrapper.TAGSIDX];
                    fPosition = lookaheadResult;
                    return lookaheadResult;
                }
            }

            // If we are at the position of the '/' in a look-ahead (hard break) rule;
            // record the current position, to be returned later, if the full rule matches.
            // TODO: Move this check before the previous check of fAccepting.
            //       This would enable hard-break rules with no following context.
            //       But there are line break test failures when trying this. Investigate.
            //       Issue ICU-20837
            int rule = stateTable[row + RBBIDataWrapper.LOOKAHEAD];
            if (rule != 0) {
                int pos = text.getIndex();
                if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && c <= UTF16.CODEPOINT_MAX_VALUE) {
                    // The iterator has been left in the middle of a surrogate pair.
                    // We want the beginning  of it.
                    pos--;
                }
                fLookAheadMatches[rule] = pos;
            }
        } // End of state machine main loop

        // The state machine is done.  Check whether it found a match...

        // If the iterator failed to advance in the match engine force it ahead by one.
        // This indicates a defect in the break rules, which should always match
        // at least one character.

        if (result == initialPosition) {
            if (TRACE) {
                System.out.println("Iterator did not move. Advancing by 1.");
            }
            text.setIndex(initialPosition);
            next32(text);
            result = text.getIndex();
            fRuleStatusIndex = 0;
        }

        // Leave the iterator at our result position.
        //   (we may have advanced beyond the last accepting position chasing after
        //    longer matches that never completed.)
        fPosition = result;

        if (TRACE) {
            System.out.println("result = " + result);
        }
        return result;
    }

    /**
     * Iterate backwards from an arbitrary position in the input text using the Safe Reverse rules.
     * This locates a "Safe Position" from which the forward break rules will operate correctly. A
     * Safe Position is not necessarily a boundary itself.
     *
     * <p>The logic of this function is very similar to handleNext(), above, but simpler because the
     * safe table does not require as many options.
     *
     * @param fromPosition the position in the input text to begin the iteration.
     * @internal
     */
    private int handleSafePrevious(int fromPosition) {
        char state;
        short category = 0;
        int result = 0;

        // caches for quicker access
        CharacterIterator text = fText;
        CodePointTrie trie = fRData.fTrie;
        char[] stateTable = fRData.fRTable.fTable;

        CISetIndex32(text, fromPosition);
        if (TRACE) {
            System.out.print("Handle Previous   pos   char  state category");
        }

        // if we're already at the start of the text, return DONE.
        if (text.getIndex() == text.getBeginIndex()) {
            return BreakIterator.DONE;
        }

        //  Set the initial state for the state machine
        int c = CharacterIteration.previous32(text);
        state = START_STATE;
        int row = fRData.getRowIndex(state);

        // loop until we reach the start of the text or transition to state 0
        //
        for (; c != DONE32; c = CharacterIteration.previous32(text)) {

            // look up the current character's character category, which tells us
            // which column in the state table to look at.
            //
            //  And off the dictionary flag bit. For reverse iteration it is not used.
            category = (short) trie.get(c);
            if (TRACE) {
                System.out.print("            " + RBBIDataWrapper.intToString(text.getIndex(), 5));
                System.out.print(RBBIDataWrapper.intToHexString(c, 10));
                System.out.println(
                        RBBIDataWrapper.intToString(state, 7)
                                + RBBIDataWrapper.intToString(category, 6));
            }

            // State Transition - move machine to its next state
            //
            assert (category < fRData.fHeader.fCatCount);
            state = stateTable[row + RBBIDataWrapper.NEXTSTATES + category];
            row = fRData.getRowIndex(state);

            if (state == STOP_STATE) {
                // This is the normal exit from the lookup state machine.
                // Transition to state zero means we have found a safe point.
                break;
            }
        }

        // The state machine is done.
        result = text.getIndex();
        if (TRACE) {
            System.out.println("result = " + result);
        }
        return result;
    }

    /**
     * Set the index of a CharacterIterator. Pin the index to the valid range range of BeginIndex <=
     * index <= EndIndex. If the index points to a trail surrogate of a supplementary character,
     * adjust it to the start (lead surrogate) index.
     *
     * @param ci A CharacterIterator to set
     * @param index the index to set
     * @return the resulting index, possibly pinned or adjusted.
     */
    private static int CISetIndex32(CharacterIterator ci, int index) {
        if (index <= ci.getBeginIndex()) {
            ci.first();
        } else if (index >= ci.getEndIndex()) {
            ci.setIndex(ci.getEndIndex());
        } else if (Character.isLowSurrogate(ci.setIndex(index))) {
            if (!Character.isHighSurrogate(ci.previous())) {
                ci.next();
            }
        }
        return ci.getIndex();
    }

    /**
     * DictionaryCache stores the boundaries obtained from a run of dictionary characters.
     * Dictionary boundaries are moved first to this cache, then from here to the main BreakCache,
     * where they may inter-leave with non-dictionary boundaries. The public BreakIterator API
     * always fetches directly from the main BreakCache, not from here.
     *
     * <p>In common situations, the number of boundaries in a single dictionary run should be quite
     * small, it will be terminated by punctuation, spaces, or any other non-dictionary characters.
     * The main BreakCache may end up with boundaries from multiple dictionary based runs.
     *
     * <p>The boundaries are stored in a simple ArrayList (vector), with the assumption that they
     * will be accessed sequentially.
     */
    class DictionaryCache {

        void reset() {
            fPositionInCache = -1;
            fStart = 0;
            fLimit = 0;
            fFirstRuleStatusIndex = 0;
            fOtherRuleStatusIndex = 0;
            fBreaks.removeAllElements();
            int maxStatus = 0;
            for (int status : fRData.fStatusTable) {
                maxStatus = Math.max(status, maxStatus);
            }
            isWordLike = maxStatus > 100;
        }
        ;

        boolean following(int fromPos) {
            if (fromPos >= fLimit || fromPos < fStart) {
                fPositionInCache = -1;
                return false;
            }

            // Sequential iteration, move from previous boundary to the following

            int r = 0;
            if (fPositionInCache >= 0
                    && fPositionInCache < fBreaks.size()
                    && fBreaks.elementAt(fPositionInCache) == fromPos) {
                ++fPositionInCache;
                if (fPositionInCache >= fBreaks.size()) {
                    fPositionInCache = -1;
                    return false;
                }
                r = fBreaks.elementAt(fPositionInCache);
                assert (r > fromPos);
                fBoundary = r;
                fStatusIndex = isWordLike || r == fLimit ? fOtherRuleStatusIndex : 0;
                return true;
            }

            // Random indexing. Linear search for the boundary following the given position.

            for (fPositionInCache = 0; fPositionInCache < fBreaks.size(); ++fPositionInCache) {
                r = fBreaks.elementAt(fPositionInCache);
                if (r > fromPos) {
                    fBoundary = r;
                    fStatusIndex = isWordLike || r == fLimit ? fOtherRuleStatusIndex : 0;
                    return true;
                }
            }

            // Internal error. fStart <= fromPos < fLimit, but no cached boundary.
            assert (false);
            fPositionInCache = -1;
            return false;
        }
        ;

        boolean preceding(int fromPos) {
            if (fromPos <= fStart || fromPos > fLimit) {
                fPositionInCache = -1;
                return false;
            }

            if (fromPos == fLimit) {
                fPositionInCache = fBreaks.size() - 1;
                if (fPositionInCache >= 0) {
                    assert (fBreaks.elementAt(fPositionInCache) == fromPos);
                }
            }

            int r;
            if (fPositionInCache > 0
                    && fPositionInCache < fBreaks.size()
                    && fBreaks.elementAt(fPositionInCache) == fromPos) {
                --fPositionInCache;
                r = fBreaks.elementAt(fPositionInCache);
                assert (r < fromPos);
                fBoundary = r;
                fStatusIndex =
                        (r == fStart)
                                ? fFirstRuleStatusIndex
                                : isWordLike ? fOtherRuleStatusIndex : 0;
                return true;
            }

            if (fPositionInCache == 0) {
                fPositionInCache = -1;
                return false;
            }

            for (fPositionInCache = fBreaks.size() - 1; fPositionInCache >= 0; --fPositionInCache) {
                r = fBreaks.elementAt(fPositionInCache);
                if (r < fromPos) {
                    fBoundary = r;
                    fStatusIndex =
                            (r == fStart)
                                    ? fFirstRuleStatusIndex
                                    : isWordLike ? fOtherRuleStatusIndex : 0;
                    return true;
                }
            }
            assert (false);
            fPositionInCache = -1;
            return false;
        }
        ;

        /**
         * Populate the cache with the dictionary based boundaries within a region of text.
         *
         * @param startPos The start position of a range of text
         * @param endPos The end position of a range of text
         * @param firstRuleStatus The rule status index that applies to the break at startPos
         * @param otherRuleStatus The rule status index that applies to boundaries other than
         *     startPos
         * @internal
         */
        void populateDictionary(
                int startPos, int endPos, int firstRuleStatus, int otherRuleStatus) {
            if ((endPos - startPos) <= 1) {
                return;
            }

            reset();
            fFirstRuleStatusIndex = firstRuleStatus;
            fOtherRuleStatusIndex = otherRuleStatus;

            int rangeStart = startPos;
            int rangeEnd = endPos;

            int category;
            int current;
            int foundBreakCount = 0;

            // Loop through the text, looking for ranges of dictionary characters.
            // For each span, find the appropriate break engine, and ask it to find
            // any breaks within the span.

            fText.setIndex(rangeStart);
            int c = CharacterIteration.current32(fText);
            category = (short) fRData.fTrie.get(c);
            int dictStart = fRData.fFTable.fDictCategoriesStart;

            while (true) {
                while ((current = fText.getIndex()) < rangeEnd && (category < dictStart)) {
                    c = CharacterIteration.next32(fText); // pre-increment
                    category = (short) fRData.fTrie.get(c);
                }
                if (current >= rangeEnd) {
                    break;
                }

                // We now have a dictionary character. Get the appropriate language object
                // to deal with it.
                LanguageBreakEngine lbe = getLanguageBreakEngine(c);

                // Ask the language object if there are any breaks. It will add them to the cache
                // and
                // leave the text pointer on the other side of its range, ready to search for the
                // next one.
                if (lbe != null) {
                    foundBreakCount +=
                            lbe.findBreaks(fText, rangeStart, rangeEnd, fBreaks, fPhraseBreaking);
                }

                // Reload the loop variables for the next go-round
                c = CharacterIteration.current32(fText);
                category = (short) fRData.fTrie.get(c);
            }

            // If we found breaks, ensure that the first and last entries are
            // the original starting and ending position. And initialize the
            // cache iteration position to the first entry.

            // System.out.printf("foundBreakCount = %d%n", foundBreakCount);
            if (foundBreakCount > 0) {
                assert (foundBreakCount == fBreaks.size());
                if (startPos < fBreaks.elementAt(0)) {
                    // The dictionary did not place a boundary at the start of the segment of text.
                    // Add one now. This should not commonly happen, but it would be easy for
                    // interactions
                    // of the rules for dictionary segments and the break engine implementations to
                    // inadvertently cause it. Cover it here, just in case.
                    fBreaks.offer(startPos);
                }
                if (endPos > fBreaks.peek()) {
                    fBreaks.push(endPos);
                }
                fPositionInCache = 0;
                // Note: Dictionary matching may extend beyond the original limit.
                fStart = fBreaks.elementAt(0);
                fLimit = fBreaks.peek();
            } else {
                // there were no language-based breaks, even though the segment contained
                // dictionary characters. Subsequent attempts to fetch boundaries from the
                // dictionary cache
                // for this range will fail, and the calling code will fall back to the rule based
                // boundaries.
            }
        }
        ;

        DictionaryCache() {
            fPositionInCache = -1;
            fBreaks = new DictionaryBreakEngine.DequeI();
        }

        /**
         * copy constructor. Used by RuleBasedBreakIterator.clone().
         *
         * @param src the source object to be copied.
         */
        DictionaryCache(DictionaryCache src) {
            try {
                fBreaks = src.fBreaks.clone();
            } catch (CloneNotSupportedException e) {
                throw new RuntimeException(e);
            }
            fPositionInCache = src.fPositionInCache;
            fStart = src.fStart;
            fLimit = src.fLimit;
            fFirstRuleStatusIndex = src.fFirstRuleStatusIndex;
            fOtherRuleStatusIndex = src.fOtherRuleStatusIndex;
            fBoundary = src.fBoundary;
            fStatusIndex = src.fStatusIndex;
        }

        // A data structure containing the boundaries themselves. Essentially a vector of raw ints.
        DictionaryBreakEngine.DequeI fBreaks;
        int fPositionInCache; // Index in fBreaks of last boundary returned by following()
        //                                      //    or preceding(). Optimizes sequential access.
        int fStart; // Text position of first boundary in cache.
        int fLimit; // Last boundary in cache. Which is the limit of the
        //                                      //    text segment being handled by the dictionary.
        int fFirstRuleStatusIndex; // Rule status info for first boundary.
        int fOtherRuleStatusIndex; // Rule status info for 2nd through last boundaries.
        int fBoundary; // Current boundary. Set by preceding(), following().
        int fStatusIndex; // Current rule status index. Set by preceding, following().

        /**
         * If `this.isWordLike`, the status of dictionary breaks is equal to the status of the final
         * break of the rule-based segment they refine (fOtherRuleStatusIndex); otherwise,
         * dictionary breaks have status 0. For compatibility, this property is determined by the
         * largest status value used by the rules: rules that have a largest status greater than 100
         * are considered word-like.
         */
        private boolean isWordLike;
    }
    ;

    /*
     * class BreakCache
     *
     * Cache of break boundary positions and rule status values.
     * Break iterator API functions, next(), previous(), etc., will use cached results
     * when possible, and otherwise cache new results as they are obtained.
     *
     * Uniformly caches both dictionary and rule based (non-dictionary) boundaries.
     *
     * The cache is implemented as a single circular buffer.
     */

    /*
     * size of the circular cache buffer.
     */

    class BreakCache {

        BreakCache() {
            reset();
        }
        ;

        void reset(int pos, int ruleStatus) {
            fStartBufIdx = 0;
            fEndBufIdx = 0;
            fTextIdx = pos;
            fBufIdx = 0;
            fBoundaries[0] = pos;
            fStatuses[0] = (short) ruleStatus;
        }

        void reset() {
            reset(0, 0);
        }
        ;

        void next() {
            if (fBufIdx == fEndBufIdx) {
                fDone = !populateFollowing();
                fPosition = fTextIdx;
                fRuleStatusIndex = fStatuses[fBufIdx];
            } else {
                fBufIdx = modChunkSize(fBufIdx + 1);
                fTextIdx = fPosition = fBoundaries[fBufIdx];
                fRuleStatusIndex = fStatuses[fBufIdx];
            }
        }
        ;

        void previous() {
            int initialBufIdx = fBufIdx;
            if (fBufIdx == fStartBufIdx) {
                // At start of cache. Prepend to it.
                populatePreceding();
            } else {
                // Cache already holds the next boundary
                fBufIdx = modChunkSize(fBufIdx - 1);
                fTextIdx = fBoundaries[fBufIdx];
            }
            fDone = (fBufIdx == initialBufIdx);
            fPosition = fTextIdx;
            fRuleStatusIndex = fStatuses[fBufIdx];
        }
        ;

        // Move the iteration state to the position following the startPosition.
        // Input position must be pinned to the input length.
        void following(int startPos) {
            if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
                // startPos is in the cache. Do a next() from that position.
                // TODO: an awkward set of interactions with bi->fDone
                //       seek() does not clear it; it can't because of interactions with
                // populateNear().
                //       next() does not clear it in the fast-path case, where everything matters.
                // Maybe it should.
                //       So clear it here, for the case where seek() succeeded on an iterator that
                // had previously run off the end.
                fDone = false;
                next();
            }
        }
        ;

        void preceding(int startPos) {
            if (startPos == fTextIdx || seek(startPos) || populateNear(startPos)) {
                if (startPos == fTextIdx) {
                    previous();
                } else {
                    // seek() leaves the BreakCache positioned at the preceding boundary
                    //        if the requested position is between two boundaries.
                    // current() pushes the BreakCache position out to the BreakIterator itself.
                    assert (startPos > fTextIdx);
                    current();
                }
            }
        }
        ;

        /**
         * Update the state of the public BreakIterator (fBI) to reflect the current state of the
         * break iterator cache (this).
         */
        int current() {
            fPosition = fTextIdx;
            fRuleStatusIndex = fStatuses[fBufIdx];
            fDone = false;
            return fTextIdx;
        }
        ;

        /**
         * Add boundaries to the cache near the specified position. The given position need not be a
         * boundary itself. The input position must be within the range of the text, and on a code
         * point boundary. If the requested position is a break boundary, leave the iteration
         * position on it. If the requested position is not a boundary, leave the iteration position
         * on the preceding boundary and include both the preceding and following boundaries in the
         * cache. Additional boundaries, either preceding or following, may be added to the cache as
         * a side effect.
         *
         * <p>Return false if the operation failed.
         */
        boolean populateNear(int position) {
            assert (position < fBoundaries[fStartBufIdx] || position > fBoundaries[fEndBufIdx]);

            // Add boundaries to the cache near the specified position.
            // The given position need not be a boundary itself.
            // The input position must be within the range of the text, and
            // on a code point boundary.
            // If the requested position is a break boundary, leave the iteration
            // position on it.
            // If the requested position is not a boundary, leave the iteration
            // position on the preceding boundary and include both the
            // preceding and following boundaries in the cache.
            // Additional boundaries, either preceding or following, may be added
            // to the cache as a side effect.

            // If the requested position is not near already cached positions, clear the existing
            // cache,
            // find a near-by boundary and begin new cache contents there.

            // Threshold for a text position to be considered near to existing cache contents.
            // TODO: See issue ICU-22024 "perf tuning of Cache needed."
            //       This value is subject to change. See the ticket for more details.
            final int CACHE_NEAR = 15;

            int startOfText = fText.getBeginIndex();
            int aBoundary = -1;
            int ruleStatusIndex = 0;
            boolean retainCache = false;
            if ((position > fBoundaries[fStartBufIdx] - CACHE_NEAR)
                    && position < (fBoundaries[fEndBufIdx] + CACHE_NEAR)) {
                // Requested position is near the existing cache. Retain it.
                retainCache = true;
            } else if (position <= startOfText + CACHE_NEAR) {
                // Requested position is near the start of the text. Fill cache from start, skipping
                // the need to find a safe point.
                retainCache = false;
                aBoundary = startOfText;
            } else {
                // Requested position is not near the existing cache.
                // Find a safe point to refill the cache from.
                int backupPos = handleSafePrevious(position);

                if (fBoundaries[fEndBufIdx] < position
                        && fBoundaries[fEndBufIdx] >= (backupPos - CACHE_NEAR)) {
                    // The requested position is beyond the end of the existing cache, but the
                    // reverse rules produced a position near or before the cached region.
                    // Retain the existing cache, and fill from the end of it.
                    retainCache = true;
                } else if (backupPos < startOfText + CACHE_NEAR) {
                    // The safe reverse rules moved us to near the start of text.
                    // Take that (index 0) as the backup boundary, avoiding the complication
                    // (in the following block) of moving forward from the safe point to a known
                    // boundary.
                    //
                    // Retain the cache if it begins not too far from the requested position.
                    aBoundary = startOfText;
                    retainCache = (fBoundaries[fStartBufIdx] <= (position + CACHE_NEAR));
                } else {
                    // The safe reverse rules produced a position that is neither near the existing
                    // cache, nor near the start of text.
                    // Advance to the boundary following.
                    // There is a complication: the safe reverse rules identify pairs of code points
                    // that are safe. If advancing from the safe point moves forwards by less than
                    // two code points, we need to advance one more time to ensure that the boundary
                    // is good, including a correct rules status value.
                    //
                    retainCache = false;
                    fPosition = backupPos;
                    aBoundary = handleNext();
                    if (aBoundary == backupPos + 1
                            || (aBoundary == backupPos + 2
                                    && Character.isHighSurrogate(fText.setIndex(backupPos))
                                    && Character.isLowSurrogate(fText.next()))) {
                        // The initial handleNext() only advanced by a single code point. Go again.
                        // Safe rules identify safe pairs.
                        aBoundary = handleNext();
                    }
                    if (aBoundary == BreakIterator.DONE) {
                        aBoundary = fText.getEndIndex();
                    }
                    ruleStatusIndex = fRuleStatusIndex;
                }
            }

            if (!retainCache) {
                assert (aBoundary != -1);
                reset(
                        aBoundary,
                        ruleStatusIndex); // Reset cache to hold aBoundary as a single starting
                // point.
            }

            // Fill in boundaries between existing cache content and the new requested position.

            if (fBoundaries[fEndBufIdx] < position) {
                // The last position in the cache precedes the requested position.
                // Add following position(s) to the cache.
                while (fBoundaries[fEndBufIdx] < position) {
                    if (!populateFollowing()) {
                        assert false;
                        return false;
                    }
                }
                fBufIdx = fEndBufIdx; // Set iterator position to the end of the buffer.
                fTextIdx = fBoundaries[fBufIdx]; // Required because populateFollowing may add extra
                // boundaries.
                while (fTextIdx
                        > position) { // Move backwards to a position at or preceding the requested
                    // pos.
                    previous();
                }
                return true;
            }

            if (fBoundaries[fStartBufIdx] > position) {
                // The first position in the cache is beyond the requested position.
                // back up more until we get a boundary <= the requested position.
                while (fBoundaries[fStartBufIdx] > position) {
                    populatePreceding();
                }
                fBufIdx = fStartBufIdx; // Set iterator position to the start of the buffer.
                fTextIdx = fBoundaries[fBufIdx]; // Required because populatePreceding may add extra
                // boundaries.
                while (fTextIdx
                        < position) { // Move forwards to a position at or following the requested
                    // pos.
                    next();
                }
                if (fTextIdx > position) {
                    // If position is not itself a boundary, the next() loop above will overshoot.
                    // Back up one, leaving cache position at the boundary preceding the requested
                    // position.
                    previous();
                }
                return true;
            }

            assert fTextIdx == position;
            return true;
        }
        ;

        /**
         * Add boundary(s) to the cache following the current last boundary. Return false if at the
         * end of the text, and no more boundaries can be added. Leave iteration position at the
         * first newly added boundary, or unchanged if no boundary was added.
         */
        boolean populateFollowing() {
            int fromPosition = fBoundaries[fEndBufIdx];
            int fromRuleStatusIdx = fStatuses[fEndBufIdx];
            int pos = 0;
            int ruleStatusIdx = 0;

            if (fDictionaryCache.following(fromPosition)) {
                addFollowing(
                        fDictionaryCache.fBoundary,
                        fDictionaryCache.fStatusIndex,
                        UpdateCachePosition);
                return true;
            }

            fPosition = fromPosition;
            pos = handleNext();
            if (pos == BreakIterator.DONE) {
                return false;
            }

            ruleStatusIdx = fRuleStatusIndex;
            if (fDictionaryCharCount > 0) {
                // The text segment obtained from the rules includes dictionary characters.
                // Subdivide it, with subdivided results going into the dictionary cache.
                fDictionaryCache.populateDictionary(
                        fromPosition, pos, fromRuleStatusIdx, ruleStatusIdx);
                if (fDictionaryCache.following(fromPosition)) {
                    addFollowing(
                            fDictionaryCache.fBoundary,
                            fDictionaryCache.fStatusIndex,
                            UpdateCachePosition);
                    return true;
                    // TODO: may want to move a sizable chunk of the dictionary cache to the break
                    // cache at this point.
                    //       But be careful with interactions with populateNear().
                }
            }

            // Rule based segment did not include dictionary characters.
            // Or, it did contain dictionary chars, but the dictionary segmenter didn't handle them,
            //    meaning that we didn't take the return, above.
            // Add its end point to the cache.
            addFollowing(pos, ruleStatusIdx, UpdateCachePosition);

            // Add several non-dictionary boundaries at this point, to optimize straight forward
            // iteration.
            //    (subsequent calls to BreakIterator::next() will take the fast path, getting cached
            // results.
            //
            for (int count = 0; count < 6; ++count) {
                pos = handleNext();
                if (pos == BreakIterator.DONE || fDictionaryCharCount > 0) {
                    break;
                }
                addFollowing(pos, fRuleStatusIndex, RetainCachePosition);
            }
            return true;
        }
        ;

        /**
         * Add one or more boundaries to the cache preceding the first currently cached boundary.
         * Leave the iteration position on the first added boundary. Return false if no boundaries
         * could be added (if at the start of the text.)
         */
        boolean populatePreceding() {
            int textBegin = fText.getBeginIndex();
            int fromPosition = fBoundaries[fStartBufIdx];
            if (fromPosition == textBegin) {
                return false;
            }

            int position = textBegin;
            int positionStatusIdx = 0;

            if (fDictionaryCache.preceding(fromPosition)) {
                addPreceding(
                        fDictionaryCache.fBoundary,
                        fDictionaryCache.fStatusIndex,
                        UpdateCachePosition);
                return true;
            }

            int backupPosition = fromPosition;

            // Find a boundary somewhere preceding the first already-cached boundary
            do {
                backupPosition = backupPosition - 30;
                if (backupPosition <= textBegin) {
                    backupPosition = textBegin;
                } else {
                    backupPosition = handleSafePrevious(backupPosition);
                }
                if (backupPosition == BreakIterator.DONE || backupPosition == textBegin) {
                    position = textBegin;
                    positionStatusIdx = 0;
                } else {
                    // Advance to the boundary following the backup position.
                    // There is a complication: the safe reverse rules identify pairs of code points
                    // that are safe. If advancing from the safe point moves forwards by less than
                    // two code points, we need to advance one more time to ensure that the boundary
                    // is good, including a correct rules status value.
                    //
                    fPosition = backupPosition; // TODO: pass starting position in a clearer way.
                    position = handleNext();
                    if (position == backupPosition + 1
                            || (position == backupPosition + 2
                                    && Character.isHighSurrogate(fText.setIndex(backupPosition))
                                    && Character.isLowSurrogate(fText.next()))) {
                        // The initial handleNext() only advanced by a single code point. Go again.
                        // Safe rules identify safe pairs.
                        position = handleNext();
                    }
                    positionStatusIdx = fRuleStatusIndex;
                }
            } while (position >= fromPosition);

            // Find boundaries between the one we just located and the first already-cached boundary
            // Put them in a side buffer, because we don't yet know where they will fall in the
            // circular cache buffer.

            fSideBuffer.removeAllElements();
            fSideBuffer.push(position);
            fSideBuffer.push(positionStatusIdx);

            do {
                int prevPosition = fPosition = position;
                int prevStatusIdx = positionStatusIdx;
                position = handleNext();
                positionStatusIdx = fRuleStatusIndex;
                if (position == BreakIterator.DONE) {
                    break;
                }

                boolean segmentHandledByDictionary = false;
                if (fDictionaryCharCount != 0) {
                    // Segment from the rules includes dictionary characters.
                    // Subdivide it, with subdivided results going into the dictionary cache.
                    int dictSegEndPosition = position;
                    fDictionaryCache.populateDictionary(
                            prevPosition, dictSegEndPosition, prevStatusIdx, positionStatusIdx);
                    while (fDictionaryCache.following(prevPosition)) {
                        position = fDictionaryCache.fBoundary;
                        positionStatusIdx = fDictionaryCache.fStatusIndex;
                        segmentHandledByDictionary = true;
                        assert (position > prevPosition);
                        if (position >= fromPosition) {
                            break;
                        }
                        assert (position <= dictSegEndPosition);
                        fSideBuffer.push(position);
                        fSideBuffer.push(positionStatusIdx);
                        prevPosition = position;
                    }
                    assert (position == dictSegEndPosition || position >= fromPosition);
                }

                if (!segmentHandledByDictionary && position < fromPosition) {
                    fSideBuffer.push(position);
                    fSideBuffer.push(positionStatusIdx);
                }
            } while (position < fromPosition);

            // Move boundaries from the side buffer to the main circular buffer.
            boolean success = false;
            if (!fSideBuffer.isEmpty()) {
                positionStatusIdx = fSideBuffer.pop();
                position = fSideBuffer.pop();
                addPreceding(position, positionStatusIdx, UpdateCachePosition);
                success = true;
            }

            while (!fSideBuffer.isEmpty()) {
                positionStatusIdx = fSideBuffer.pop();
                position = fSideBuffer.pop();
                if (!addPreceding(position, positionStatusIdx, RetainCachePosition)) {
                    // No space in circular buffer to hold a new preceding result while
                    // also retaining the current cache (iteration) position.
                    // Bailing out is safe; the cache will refill again if needed.
                    break;
                }
            }
            return success;
        }
        ;

        static final boolean RetainCachePosition = false;
        static final boolean UpdateCachePosition = true;

        /**
         * Add the boundary following the current position. The current position can be left as it
         * was, or changed to the newly added boundary, as specified by the update parameter.
         */
        void addFollowing(int position, int ruleStatusIdx, boolean update) {
            assert (position > fBoundaries[fEndBufIdx]);
            assert (ruleStatusIdx <= Short.MAX_VALUE);
            int nextIdx = modChunkSize(fEndBufIdx + 1);
            if (nextIdx == fStartBufIdx) {
                fStartBufIdx =
                        modChunkSize(fStartBufIdx + 6); // TODO: experiment. Probably revert to 1.
            }
            fBoundaries[nextIdx] = position;
            fStatuses[nextIdx] = (short) ruleStatusIdx;
            fEndBufIdx = nextIdx;
            if (update == UpdateCachePosition) {
                // Set current position to the newly added boundary.
                fBufIdx = nextIdx;
                fTextIdx = position;
            } else {
                // Retaining the original cache position.
                // Check if the added boundary wraps around the buffer, and would over-write the
                // original position.
                // It's the responsibility of callers of this function to not add too many.
                assert (nextIdx != fBufIdx);
            }
        }
        ;

        /**
         * Add the boundary preceding the current position. The current position can be left as it
         * was, or changed to the newly added boundary, as specified by the update parameter.
         */
        boolean addPreceding(int position, int ruleStatusIdx, boolean update) {
            assert (position < fBoundaries[fStartBufIdx]);
            assert (ruleStatusIdx <= Short.MAX_VALUE);
            int nextIdx = modChunkSize(fStartBufIdx - 1);
            if (nextIdx == fEndBufIdx) {
                if (fBufIdx == fEndBufIdx && update == RetainCachePosition) {
                    // Failure. The insertion of the new boundary would claim the buffer position
                    // that is the
                    // current iteration position. And we also want to retain the current iteration
                    // position.
                    // (The buffer is already completely full of entries that precede the iteration
                    // position.)
                    return false;
                }
                fEndBufIdx = modChunkSize(fEndBufIdx - 1);
            }
            fBoundaries[nextIdx] = position;
            fStatuses[nextIdx] = (short) ruleStatusIdx;
            fStartBufIdx = nextIdx;
            if (update == UpdateCachePosition) {
                fBufIdx = nextIdx;
                fTextIdx = position;
            }
            return true;
        }
        ;

        /**
         * Set the cache position to the specified position, or, if the position falls between to
         * cached boundaries, to the preceding boundary. Fails if the requested position is outside
         * of the range of boundaries currently held by the cache. The startPosition must be on a
         * code point boundary.
         *
         * <p>Return true if successful, false if the specified position is after the last cached
         * boundary or before the first.
         */
        boolean seek(int pos) {
            if (pos < fBoundaries[fStartBufIdx] || pos > fBoundaries[fEndBufIdx]) {
                return false;
            }
            if (pos == fBoundaries[fStartBufIdx]) {
                // Common case: seek(0), from BreakIterator::first()
                fBufIdx = fStartBufIdx;
                fTextIdx = fBoundaries[fBufIdx];
                return true;
            }
            if (pos == fBoundaries[fEndBufIdx]) {
                fBufIdx = fEndBufIdx;
                fTextIdx = fBoundaries[fBufIdx];
                return true;
            }

            int min = fStartBufIdx;
            int max = fEndBufIdx;
            while (min != max) {
                int probe = (min + max + (min > max ? CACHE_SIZE : 0)) / 2;
                probe = modChunkSize(probe);
                if (fBoundaries[probe] > pos) {
                    max = probe;
                } else {
                    min = modChunkSize(probe + 1);
                }
            }
            assert (fBoundaries[max] > pos);
            fBufIdx = modChunkSize(max - 1);
            fTextIdx = fBoundaries[fBufIdx];
            assert (fTextIdx <= pos);
            return true;
        }
        ;

        /**
         * copy constructor, used from RuleBasedBreakIterator.clone().
         *
         * @param src
         */
        BreakCache(BreakCache src) {
            fStartBufIdx = src.fStartBufIdx;
            fEndBufIdx = src.fEndBufIdx;
            fTextIdx = src.fTextIdx;
            fBufIdx = src.fBufIdx;
            fBoundaries = src.fBoundaries.clone();
            fStatuses = src.fStatuses.clone();
            fSideBuffer =
                    new DictionaryBreakEngine.DequeI(); // Transient, no need to clone contents.
        }

        void dumpCache() {
            System.out.printf("fTextIdx:%d   fBufIdx:%d%n", fTextIdx, fBufIdx);
            for (int i = fStartBufIdx; ; i = modChunkSize(i + 1)) {
                System.out.printf("%d  %d%n", i, fBoundaries[i]);
                if (i == fEndBufIdx) {
                    break;
                }
            }
        }
        ;

        private final int modChunkSize(int index) {
            return index & (CACHE_SIZE - 1);
        }
        ;

        static final int CACHE_SIZE = 128;
        // static_assert((CACHE_SIZE & (CACHE_SIZE-1)) == 0, "CACHE_SIZE must be power of two.");

        int fStartBufIdx;
        int fEndBufIdx; // inclusive

        int fTextIdx;
        int fBufIdx;

        int[] fBoundaries = new int[CACHE_SIZE];
        short[] fStatuses = new short[CACHE_SIZE];

        DictionaryBreakEngine.DequeI fSideBuffer = new DictionaryBreakEngine.DequeI();
    }
    ;
}