FCDIterCollationIterator.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2012-2014, International Business Machines
 * Corporation and others.  All Rights Reserved.
 *******************************************************************************
 * FCDIterCollationIterator.java, ported from uitercollationiterator.h/.cpp
 *
 * C++ version created on: 2012sep23 (from utf16collationiterator.h)
 * created by: Markus W. Scherer
 */

package com.ibm.icu.impl.coll;

import com.ibm.icu.impl.Normalizer2Impl;
import com.ibm.icu.text.UCharacterIterator;

/** Incrementally checks the input text for FCD and normalizes where necessary. */
public final class FCDIterCollationIterator extends IterCollationIterator {
    public FCDIterCollationIterator(
            CollationData data, boolean numeric, UCharacterIterator ui, int startIndex) {
        super(data, numeric, ui);
        state = State.ITER_CHECK_FWD;
        start = startIndex;
        nfcImpl = data.nfcImpl;
    }

    @Override
    public void resetToOffset(int newOffset) {
        super.resetToOffset(newOffset);
        start = newOffset;
        state = State.ITER_CHECK_FWD;
    }

    @Override
    public int getOffset() {
        if (state.compareTo(State.ITER_CHECK_BWD) <= 0) {
            return iter.getIndex();
        } else if (state == State.ITER_IN_FCD_SEGMENT) {
            return pos;
        } else if (pos == 0) {
            return start;
        } else {
            return limit;
        }
    }

    @Override
    public int nextCodePoint() {
        int c;
        for (; ; ) {
            if (state == State.ITER_CHECK_FWD) {
                c = iter.next();
                if (c < 0) {
                    return c;
                }
                if (CollationFCD.hasTccc(c)) {
                    if (CollationFCD.maybeTibetanCompositeVowel(c)
                            || CollationFCD.hasLccc(iter.current())) {
                        iter.previous();
                        if (!nextSegment()) {
                            return Collation.SENTINEL_CP;
                        }
                        continue;
                    }
                }
                if (isLeadSurrogate(c)) {
                    int trail = iter.next();
                    if (isTrailSurrogate(trail)) {
                        return Character.toCodePoint((char) c, (char) trail);
                    } else if (trail >= 0) {
                        iter.previous();
                    }
                }
                return c;
            } else if (state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
                c = iter.nextCodePoint();
                pos += Character.charCount(c);
                assert (c >= 0);
                return c;
            } else if (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0
                    && pos != normalized.length()) {
                c = normalized.codePointAt(pos);
                pos += Character.charCount(c);
                return c;
            } else {
                switchToForward();
            }
        }
    }

    @Override
    public int previousCodePoint() {
        int c;
        for (; ; ) {
            if (state == State.ITER_CHECK_BWD) {
                c = iter.previous();
                if (c < 0) {
                    start = pos = 0;
                    state = State.ITER_IN_FCD_SEGMENT;
                    return Collation.SENTINEL_CP;
                }
                if (CollationFCD.hasLccc(c)) {
                    int prev = Collation.SENTINEL_CP;
                    if (CollationFCD.maybeTibetanCompositeVowel(c)
                            || CollationFCD.hasTccc(prev = iter.previous())) {
                        iter.next();
                        if (prev >= 0) {
                            iter.next();
                        }
                        if (!previousSegment()) {
                            return Collation.SENTINEL_CP;
                        }
                        continue;
                    }
                    // hasLccc(trail)=true for all trail surrogates
                    if (isTrailSurrogate(c)) {
                        if (prev < 0) {
                            prev = iter.previous();
                        }
                        if (isLeadSurrogate(prev)) {
                            return Character.toCodePoint((char) prev, (char) c);
                        }
                    }
                    if (prev >= 0) {
                        iter.next();
                    }
                }
                return c;
            } else if (state == State.ITER_IN_FCD_SEGMENT && pos != start) {
                c = iter.previousCodePoint();
                pos -= Character.charCount(c);
                assert (c >= 0);
                return c;
            } else if (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos != 0) {
                c = normalized.codePointBefore(pos);
                pos -= Character.charCount(c);
                return c;
            } else {
                switchToBackward();
            }
        }
    }

    @Override
    protected long handleNextCE32() {
        int c;
        for (; ; ) {
            if (state == State.ITER_CHECK_FWD) {
                c = iter.next();
                if (c < 0) {
                    return NO_CP_AND_CE32;
                }
                if (CollationFCD.hasTccc(c)) {
                    if (CollationFCD.maybeTibetanCompositeVowel(c)
                            || CollationFCD.hasLccc(iter.current())) {
                        iter.previous();
                        if (!nextSegment()) {
                            c = Collation.SENTINEL_CP;
                            return Collation.FALLBACK_CE32;
                        }
                        continue;
                    }
                }
                break;
            } else if (state == State.ITER_IN_FCD_SEGMENT && pos != limit) {
                c = iter.next();
                ++pos;
                assert (c >= 0);
                break;
            } else if (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0
                    && pos != normalized.length()) {
                c = normalized.charAt(pos++);
                break;
            } else {
                switchToForward();
            }
        }
        return makeCodePointAndCE32Pair(c, trie.getFromU16SingleLead((char) c));
    }

    @Override
    protected char handleGetTrailSurrogate() {
        if (state.compareTo(State.ITER_IN_FCD_SEGMENT) <= 0) {
            int trail = iter.next();
            if (isTrailSurrogate(trail)) {
                if (state == State.ITER_IN_FCD_SEGMENT) {
                    ++pos;
                }
            } else if (trail >= 0) {
                iter.previous();
            }
            return (char) trail;
        } else {
            assert (pos < normalized.length());
            char trail;
            if (Character.isLowSurrogate(trail = normalized.charAt(pos))) {
                ++pos;
            }
            return trail;
        }
    }

    @Override
    protected void forwardNumCodePoints(int num) {
        // Specify the class to avoid a virtual-function indirection.
        // In Java, we would declare this class final.
        while (num > 0 && nextCodePoint() >= 0) {
            --num;
        }
    }

    @Override
    protected void backwardNumCodePoints(int num) {
        // Specify the class to avoid a virtual-function indirection.
        // In Java, we would declare this class final.
        while (num > 0 && previousCodePoint() >= 0) {
            --num;
        }
    }

    /** Switches to forward checking if possible. */
    private void switchToForward() {
        assert (state == State.ITER_CHECK_BWD
                || (state == State.ITER_IN_FCD_SEGMENT && pos == limit)
                || (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0
                        && pos == normalized.length()));
        if (state == State.ITER_CHECK_BWD) {
            // Turn around from backward checking.
            start = pos = iter.getIndex();
            if (pos == limit) {
                state = State.ITER_CHECK_FWD; // Check forward.
            } else { // pos < limit
                state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
            }
        } else {
            // Reached the end of the FCD segment.
            if (state == State.ITER_IN_FCD_SEGMENT) {
                // The input text segment is FCD, extend it forward.
            } else {
                // The input text segment needed to be normalized.
                // Switch to checking forward from it.
                if (state == State.IN_NORM_ITER_AT_START) {
                    iter.moveIndex(limit - start);
                }
                start = limit;
            }
            state = State.ITER_CHECK_FWD;
        }
    }

    /**
     * Extends the FCD text segment forward or normalizes around pos.
     *
     * @return true if success
     */
    private boolean nextSegment() {
        assert (state == State.ITER_CHECK_FWD);
        // The input text [start..(iter index)[ passes the FCD check.
        pos = iter.getIndex();
        // Collect the characters being checked, in case they need to be normalized.
        if (s == null) {
            s = new StringBuilder();
        } else {
            s.setLength(0);
        }
        int prevCC = 0;
        for (; ; ) {
            // Fetch the next character and its fcd16 value.
            int c = iter.nextCodePoint();
            if (c < 0) {
                break;
            }
            int fcd16 = nfcImpl.getFCD16(c);
            int leadCC = fcd16 >> 8;
            if (leadCC == 0 && s.length() != 0) {
                // FCD boundary before this character.
                iter.previousCodePoint();
                break;
            }
            s.appendCodePoint(c);
            if (leadCC != 0
                    && (prevCC > leadCC || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
                // Fails FCD check. Find the next FCD boundary and normalize.
                for (; ; ) {
                    c = iter.nextCodePoint();
                    if (c < 0) {
                        break;
                    }
                    if (nfcImpl.getFCD16(c) <= 0xff) {
                        iter.previousCodePoint();
                        break;
                    }
                    s.appendCodePoint(c);
                }
                normalize(s);
                start = pos;
                limit = pos + s.length();
                state = State.IN_NORM_ITER_AT_LIMIT;
                pos = 0;
                return true;
            }
            prevCC = fcd16 & 0xff;
            if (prevCC == 0) {
                // FCD boundary after the last character.
                break;
            }
        }
        limit = pos + s.length();
        assert (pos != limit);
        iter.moveIndex(-s.length());
        state = State.ITER_IN_FCD_SEGMENT;
        return true;
    }

    /** Switches to backward checking. */
    private void switchToBackward() {
        assert (state == State.ITER_CHECK_FWD
                || (state == State.ITER_IN_FCD_SEGMENT && pos == start)
                || (state.compareTo(State.IN_NORM_ITER_AT_LIMIT) >= 0 && pos == 0));
        if (state == State.ITER_CHECK_FWD) {
            // Turn around from forward checking.
            limit = pos = iter.getIndex();
            if (pos == start) {
                state = State.ITER_CHECK_BWD; // Check backward.
            } else { // pos > start
                state = State.ITER_IN_FCD_SEGMENT; // Stay in FCD segment.
            }
        } else {
            // Reached the start of the FCD segment.
            if (state == State.ITER_IN_FCD_SEGMENT) {
                // The input text segment is FCD, extend it backward.
            } else {
                // The input text segment needed to be normalized.
                // Switch to checking backward from it.
                if (state == State.IN_NORM_ITER_AT_LIMIT) {
                    iter.moveIndex(start - limit);
                }
                limit = start;
            }
            state = State.ITER_CHECK_BWD;
        }
    }

    /**
     * Extends the FCD text segment backward or normalizes around pos.
     *
     * @return true if success
     */
    private boolean previousSegment() {
        assert (state == State.ITER_CHECK_BWD);
        // The input text [(iter index)..limit[ passes the FCD check.
        pos = iter.getIndex();
        // Collect the characters being checked, in case they need to be normalized.
        if (s == null) {
            s = new StringBuilder();
        } else {
            s.setLength(0);
        }
        int nextCC = 0;
        for (; ; ) {
            // Fetch the previous character and its fcd16 value.
            int c = iter.previousCodePoint();
            if (c < 0) {
                break;
            }
            int fcd16 = nfcImpl.getFCD16(c);
            int trailCC = fcd16 & 0xff;
            if (trailCC == 0 && s.length() != 0) {
                // FCD boundary after this character.
                iter.nextCodePoint();
                break;
            }
            s.appendCodePoint(c);
            if (trailCC != 0
                    && ((nextCC != 0 && trailCC > nextCC)
                            || CollationFCD.isFCD16OfTibetanCompositeVowel(fcd16))) {
                // Fails FCD check. Find the previous FCD boundary and normalize.
                while (fcd16 > 0xff) {
                    c = iter.previousCodePoint();
                    if (c < 0) {
                        break;
                    }
                    fcd16 = nfcImpl.getFCD16(c);
                    if (fcd16 == 0) {
                        iter.nextCodePoint();
                        break;
                    }
                    s.appendCodePoint(c);
                }
                s.reverse();
                normalize(s);
                limit = pos;
                start = pos - s.length();
                state = State.IN_NORM_ITER_AT_START;
                pos = normalized.length();
                return true;
            }
            nextCC = fcd16 >> 8;
            if (nextCC == 0) {
                // FCD boundary before the following character.
                break;
            }
        }
        start = pos - s.length();
        assert (pos != start);
        iter.moveIndex(s.length());
        state = State.ITER_IN_FCD_SEGMENT;
        return true;
    }

    private void normalize(CharSequence s) {
        if (normalized == null) {
            normalized = new StringBuilder();
        }
        // NFD without argument checking.
        nfcImpl.decompose(s, normalized);
    }

    private enum State {
        /**
         * The input text [start..(iter index)[ passes the FCD check. Moving forward checks
         * incrementally. pos & limit are undefined.
         */
        ITER_CHECK_FWD,
        /**
         * The input text [(iter index)..limit[ passes the FCD check. Moving backward checks
         * incrementally. start & pos are undefined.
         */
        ITER_CHECK_BWD,
        /**
         * The input text [start..limit[ passes the FCD check. pos tracks the current text index.
         */
        ITER_IN_FCD_SEGMENT,
        /**
         * The input text [start..limit[ failed the FCD check and was normalized. pos tracks the
         * current index in the normalized string. The text iterator is at the limit index.
         */
        IN_NORM_ITER_AT_LIMIT,
        /**
         * The input text [start..limit[ failed the FCD check and was normalized. pos tracks the
         * current index in the normalized string. The text iterator is at the start index.
         */
        IN_NORM_ITER_AT_START
    }

    private State state;

    private int start;
    private int pos;
    private int limit;

    private final Normalizer2Impl nfcImpl;
    private StringBuilder s;
    private StringBuilder normalized;
}