CollationRootElements.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2013-2014, International Business Machines
 * Corporation and others.  All Rights Reserved.
 *******************************************************************************
 * CollationRootElements.java, ported from collationrootelements.h/.cpp
 *
 * C++ version created on: 2013mar01
 * created by: Markus W. Scherer
 */

package com.ibm.icu.impl.coll;

/**
 * Container and access methods for collation elements and weights that occur in the root collator.
 * Needed for finding boundaries for building a tailoring.
 *
 * <p>This class takes and returns 16-bit secondary and tertiary weights.
 */
public final class CollationRootElements {
    public CollationRootElements(long[] rootElements) {
        elements = rootElements;
    }

    /** Higher than any root primary. */
    public static final long PRIMARY_SENTINEL = 0xffffff00L;

    /**
     * Flag in a root element, set if the element contains secondary & tertiary weights, rather than
     * a primary.
     */
    public static final int SEC_TER_DELTA_FLAG = 0x80;

    /** Mask for getting the primary range step value from a primary-range-end element. */
    public static final int PRIMARY_STEP_MASK = 0x7f;

    /**
     * Index of the first CE with a non-zero tertiary weight. Same as the start of the compact root
     * elements table.
     */
    public static final int IX_FIRST_TERTIARY_INDEX = 0;

    /** Index of the first CE with a non-zero secondary weight. */
    static final int IX_FIRST_SECONDARY_INDEX = 1;

    /** Index of the first CE with a non-zero primary weight. */
    static final int IX_FIRST_PRIMARY_INDEX = 2;

    /** Must match Collation.COMMON_SEC_AND_TER_CE. */
    static final int IX_COMMON_SEC_AND_TER_CE = 3;

    /**
     * Secondary & tertiary boundaries. Bits 31..24: [fixed last secondary common byte 45] Bits
     * 23..16: [fixed first ignorable secondary byte 80] Bits 15.. 8: reserved, 0 Bits 7.. 0: [fixed
     * first ignorable tertiary byte 3C]
     */
    static final int IX_SEC_TER_BOUNDARIES = 4;

    /** The current number of indexes. Currently the same as elements[IX_FIRST_TERTIARY_INDEX]. */
    static final int IX_COUNT = 5;

    /**
     * Returns the boundary between tertiary weights of primary/secondary CEs and those of tertiary
     * CEs. This is the upper limit for tertiaries of primary/secondary CEs. This minus one is the
     * lower limit for tertiaries of tertiary CEs.
     */
    public int getTertiaryBoundary() {
        return ((int) elements[IX_SEC_TER_BOUNDARIES] << 8) & 0xff00;
    }

    /** Returns the first assigned tertiary CE. */
    long getFirstTertiaryCE() {
        return elements[(int) elements[IX_FIRST_TERTIARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
    }

    /** Returns the last assigned tertiary CE. */
    long getLastTertiaryCE() {
        return elements[(int) elements[IX_FIRST_SECONDARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
    }

    /**
     * Returns the last common secondary weight. This is the lower limit for secondaries of primary
     * CEs.
     */
    public int getLastCommonSecondary() {
        return ((int) elements[IX_SEC_TER_BOUNDARIES] >> 16) & 0xff00;
    }

    /**
     * Returns the boundary between secondary weights of primary CEs and those of secondary CEs.
     * This is the upper limit for secondaries of primary CEs. This minus one is the lower limit for
     * secondaries of secondary CEs.
     */
    public int getSecondaryBoundary() {
        return ((int) elements[IX_SEC_TER_BOUNDARIES] >> 8) & 0xff00;
    }

    /** Returns the first assigned secondary CE. */
    long getFirstSecondaryCE() {
        return elements[(int) elements[IX_FIRST_SECONDARY_INDEX]] & ~SEC_TER_DELTA_FLAG;
    }

    /** Returns the last assigned secondary CE. */
    long getLastSecondaryCE() {
        return elements[(int) elements[IX_FIRST_PRIMARY_INDEX] - 1] & ~SEC_TER_DELTA_FLAG;
    }

    /** Returns the first assigned primary weight. */
    long getFirstPrimary() {
        return elements[(int) elements[IX_FIRST_PRIMARY_INDEX]]; // step=0: cannot be a range end
    }

    /** Returns the first assigned primary CE. */
    long getFirstPrimaryCE() {
        return Collation.makeCE(getFirstPrimary());
    }

    /**
     * Returns the last root CE with a primary weight before p. Intended only for reordering group
     * boundaries.
     */
    long lastCEWithPrimaryBefore(long p) {
        if (p == 0) {
            return 0;
        }
        assert (p > elements[(int) elements[IX_FIRST_PRIMARY_INDEX]]);
        int index = findP(p);
        long q = elements[index];
        long secTer;
        if (p == (q & 0xffffff00L)) {
            // p == elements[index] is a root primary. Find the CE before it.
            // We must not be in a primary range.
            assert ((q & PRIMARY_STEP_MASK) == 0);
            secTer = elements[index - 1];
            if ((secTer & SEC_TER_DELTA_FLAG) == 0) {
                // Primary CE just before p.
                p = secTer & 0xffffff00L;
                secTer = Collation.COMMON_SEC_AND_TER_CE;
            } else {
                // secTer = last secondary & tertiary for the previous primary
                index -= 2;
                for (; ; ) {
                    p = elements[index];
                    if ((p & SEC_TER_DELTA_FLAG) == 0) {
                        p &= 0xffffff00L;
                        break;
                    }
                    --index;
                }
            }
        } else {
            // p > elements[index] which is the previous primary.
            // Find the last secondary & tertiary weights for it.
            p = q & 0xffffff00L;
            secTer = Collation.COMMON_SEC_AND_TER_CE;
            for (; ; ) {
                q = elements[++index];
                if ((q & SEC_TER_DELTA_FLAG) == 0) {
                    // We must not be in a primary range.
                    assert ((q & PRIMARY_STEP_MASK) == 0);
                    break;
                }
                secTer = q;
            }
        }
        return (p << 32) | (secTer & ~SEC_TER_DELTA_FLAG);
    }

    /**
     * Returns the first root CE with a primary weight of at least p. Intended only for reordering
     * group boundaries.
     */
    long firstCEWithPrimaryAtLeast(long p) {
        if (p == 0) {
            return 0;
        }
        int index = findP(p);
        if (p != (elements[index] & 0xffffff00L)) {
            for (; ; ) {
                p = elements[++index];
                if ((p & SEC_TER_DELTA_FLAG) == 0) {
                    // First primary after p. We must not be in a primary range.
                    assert ((p & PRIMARY_STEP_MASK) == 0);
                    break;
                }
            }
        }
        // The code above guarantees that p has at most 3 bytes: (p & 0xff) == 0.
        return (p << 32) | Collation.COMMON_SEC_AND_TER_CE;
    }

    /** Returns the primary weight before p. p must be greater than the first root primary. */
    long getPrimaryBefore(long p, boolean isCompressible) {
        int index = findPrimary(p);
        int step;
        long q = elements[index];
        if (p == (q & 0xffffff00L)) {
            // Found p itself. Return the previous primary.
            // See if p is at the end of a previous range.
            step = (int) q & PRIMARY_STEP_MASK;
            if (step == 0) {
                // p is not at the end of a range. Look for the previous primary.
                do {
                    p = elements[--index];
                } while ((p & SEC_TER_DELTA_FLAG) != 0);
                return p & 0xffffff00L;
            }
        } else {
            // p is in a range, and not at the start.
            long nextElement = elements[index + 1];
            assert (isEndOfPrimaryRange(nextElement));
            step = (int) nextElement & PRIMARY_STEP_MASK;
        }
        // Return the previous range primary.
        if ((p & 0xffff) == 0) {
            return Collation.decTwoBytePrimaryByOneStep(p, isCompressible, step);
        } else {
            return Collation.decThreeBytePrimaryByOneStep(p, isCompressible, step);
        }
    }

    /** Returns the secondary weight before [p, s]. */
    int getSecondaryBefore(long p, int s) {
        int index;
        int previousSec, sec;
        if (p == 0) {
            index = (int) elements[IX_FIRST_SECONDARY_INDEX];
            // Gap at the beginning of the secondary CE range.
            previousSec = 0;
            sec = (int) (elements[index] >> 16);
        } else {
            index = findPrimary(p) + 1;
            previousSec = Collation.BEFORE_WEIGHT16;
            sec = (int) getFirstSecTerForPrimary(index) >>> 16;
        }
        assert (s >= sec);
        while (s > sec) {
            previousSec = sec;
            assert ((elements[index] & SEC_TER_DELTA_FLAG) != 0);
            sec = (int) (elements[index++] >> 16);
        }
        assert (sec == s);
        return previousSec;
    }

    /** Returns the tertiary weight before [p, s, t]. */
    int getTertiaryBefore(long p, int s, int t) {
        assert ((t & ~Collation.ONLY_TERTIARY_MASK) == 0);
        int index;
        int previousTer;
        long secTer;
        if (p == 0) {
            if (s == 0) {
                index = (int) elements[IX_FIRST_TERTIARY_INDEX];
                // Gap at the beginning of the tertiary CE range.
                previousTer = 0;
            } else {
                index = (int) elements[IX_FIRST_SECONDARY_INDEX];
                previousTer = Collation.BEFORE_WEIGHT16;
            }
            secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
        } else {
            index = findPrimary(p) + 1;
            previousTer = Collation.BEFORE_WEIGHT16;
            secTer = getFirstSecTerForPrimary(index);
        }
        long st = ((long) s << 16) | t;
        while (st > secTer) {
            if ((int) (secTer >> 16) == s) {
                previousTer = (int) secTer;
            }
            assert ((elements[index] & SEC_TER_DELTA_FLAG) != 0);
            secTer = elements[index++] & ~SEC_TER_DELTA_FLAG;
        }
        assert (secTer == st);
        return previousTer & 0xffff;
    }

    /** Finds the index of the input primary. p must occur as a root primary, and must not be 0. */
    int findPrimary(long p) {
        // Requirement: p must occur as a root primary.
        assert ((p & 0xff) == 0); // at most a 3-byte primary
        int index = findP(p);
        // If p is in a range, then we just assume that p is an actual primary in this range.
        // (Too cumbersome/expensive to check.)
        // Otherwise, it must be an exact match.
        assert (isEndOfPrimaryRange(elements[index + 1]) || p == (elements[index] & 0xffffff00L));
        return index;
    }

    /**
     * Returns the primary weight after p where index=findPrimary(p). p must be at least the first
     * root primary.
     */
    long getPrimaryAfter(long p, int index, boolean isCompressible) {
        assert (p == (elements[index] & 0xffffff00L) || isEndOfPrimaryRange(elements[index + 1]));
        long q = elements[++index];
        int step;
        if ((q & SEC_TER_DELTA_FLAG) == 0 && (step = (int) q & PRIMARY_STEP_MASK) != 0) {
            // Return the next primary in this range.
            if ((p & 0xffff) == 0) {
                return Collation.incTwoBytePrimaryByOffset(p, isCompressible, step);
            } else {
                return Collation.incThreeBytePrimaryByOffset(p, isCompressible, step);
            }
        } else {
            // Return the next primary in the list.
            while ((q & SEC_TER_DELTA_FLAG) != 0) {
                q = elements[++index];
            }
            assert ((q & PRIMARY_STEP_MASK) == 0);
            return q;
        }
    }

    /**
     * Returns the secondary weight after [p, s] where index=findPrimary(p) except use index=0 for
     * p=0.
     *
     * <p>Must return a weight for every root [p, s] as well as for every weight returned by
     * getSecondaryBefore(). If p!=0 then s can be BEFORE_WEIGHT16.
     *
     * <p>Exception: [0, 0] is handled by the CollationBuilder: Both its lower and upper boundaries
     * are special.
     */
    int getSecondaryAfter(int index, int s) {
        long secTer;
        int secLimit;
        if (index == 0) {
            // primary = 0
            assert (s != 0);
            index = (int) elements[IX_FIRST_SECONDARY_INDEX];
            secTer = elements[index];
            // Gap at the end of the secondary CE range.
            secLimit = 0x10000;
        } else {
            assert (index >= (int) elements[IX_FIRST_PRIMARY_INDEX]);
            secTer = getFirstSecTerForPrimary(index + 1);
            // If this is an explicit sec/ter unit, then it will be read once more.
            // Gap for secondaries of primary CEs.
            secLimit = getSecondaryBoundary();
        }
        for (; ; ) {
            int sec = (int) (secTer >> 16);
            if (sec > s) {
                return sec;
            }
            secTer = elements[++index];
            if ((secTer & SEC_TER_DELTA_FLAG) == 0) {
                return secLimit;
            }
        }
    }

    /**
     * Returns the tertiary weight after [p, s, t] where index=findPrimary(p) except use index=0 for
     * p=0.
     *
     * <p>Must return a weight for every root [p, s, t] as well as for every weight returned by
     * getTertiaryBefore(). If s!=0 then t can be BEFORE_WEIGHT16.
     *
     * <p>Exception: [0, 0, 0] is handled by the CollationBuilder: Both its lower and upper
     * boundaries are special.
     */
    int getTertiaryAfter(int index, int s, int t) {
        long secTer;
        int terLimit;
        if (index == 0) {
            // primary = 0
            if (s == 0) {
                assert (t != 0);
                index = (int) elements[IX_FIRST_TERTIARY_INDEX];
                // Gap at the end of the tertiary CE range.
                terLimit = 0x4000;
            } else {
                index = (int) elements[IX_FIRST_SECONDARY_INDEX];
                // Gap for tertiaries of primary/secondary CEs.
                terLimit = getTertiaryBoundary();
            }
            secTer = elements[index] & ~SEC_TER_DELTA_FLAG;
        } else {
            assert (index >= (int) elements[IX_FIRST_PRIMARY_INDEX]);
            secTer = getFirstSecTerForPrimary(index + 1);
            // If this is an explicit sec/ter unit, then it will be read once more.
            terLimit = getTertiaryBoundary();
        }
        long st = (((long) s & 0xffffffffL) << 16) | t;
        for (; ; ) {
            if (secTer > st) {
                assert ((secTer >> 16) == s);
                return (int) secTer & 0xffff;
            }
            secTer = elements[++index];
            // No tertiary greater than t for this primary+secondary.
            if ((secTer & SEC_TER_DELTA_FLAG) == 0 || (secTer >> 16) > s) {
                return terLimit;
            }
            secTer &= ~SEC_TER_DELTA_FLAG;
        }
    }

    /** Returns the first secondary & tertiary weights for p where index=findPrimary(p)+1. */
    private long getFirstSecTerForPrimary(int index) {
        long secTer = elements[index];
        if ((secTer & SEC_TER_DELTA_FLAG) == 0) {
            // No sec/ter delta.
            return Collation.COMMON_SEC_AND_TER_CE;
        }
        secTer &= ~SEC_TER_DELTA_FLAG;
        if (secTer > Collation.COMMON_SEC_AND_TER_CE) {
            // Implied sec/ter.
            return Collation.COMMON_SEC_AND_TER_CE;
        }
        // Explicit sec/ter below common/common.
        return secTer;
    }

    /**
     * Finds the largest index i where elements[i]<=p. Requires first primary<=p<0xffffff00
     * (PRIMARY_SENTINEL). Does not require that p is a root collator primary.
     */
    private int findP(long p) {
        // p need not occur as a root primary.
        // For example, it might be a reordering group boundary.
        assert ((p >> 24) != Collation.UNASSIGNED_IMPLICIT_BYTE);
        // modified binary search
        int start = (int) elements[IX_FIRST_PRIMARY_INDEX];
        assert (p >= elements[start]);
        int limit = elements.length - 1;
        assert (elements[limit] >= PRIMARY_SENTINEL);
        assert (p < elements[limit]);
        while ((start + 1) < limit) {
            // Invariant: elements[start] and elements[limit] are primaries,
            // and elements[start]<=p<=elements[limit].
            int i = (int) (((long) start + (long) limit) / 2);
            long q = elements[i];
            if ((q & SEC_TER_DELTA_FLAG) != 0) {
                // Find the next primary.
                int j = i + 1;
                for (; ; ) {
                    if (j == limit) {
                        break;
                    }
                    q = elements[j];
                    if ((q & SEC_TER_DELTA_FLAG) == 0) {
                        i = j;
                        break;
                    }
                    ++j;
                }
                if ((q & SEC_TER_DELTA_FLAG) != 0) {
                    // Find the preceding primary.
                    j = i - 1;
                    for (; ; ) {
                        if (j == start) {
                            break;
                        }
                        q = elements[j];
                        if ((q & SEC_TER_DELTA_FLAG) == 0) {
                            i = j;
                            break;
                        }
                        --j;
                    }
                    if ((q & SEC_TER_DELTA_FLAG) != 0) {
                        // No primary between start and limit.
                        break;
                    }
                }
            }
            if (p < (q & 0xffffff00L)) { // Reset the "step" bits of a range end primary.
                limit = i;
            } else {
                start = i;
            }
        }
        return start;
    }

    private static boolean isEndOfPrimaryRange(long q) {
        return (q & SEC_TER_DELTA_FLAG) == 0 && (q & PRIMARY_STEP_MASK) != 0;
    }

    /** Data structure: See ICU4C source/i18n/collationrootelements.h. */
    private long[] elements;
}