TrieBuilder.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 ******************************************************************************
 * Copyright (C) 1996-2010, International Business Machines Corporation and   *
 * others. All Rights Reserved.                                               *
 ******************************************************************************
 */

package com.ibm.icu.impl;

import com.ibm.icu.lang.UCharacter;
import java.util.Arrays;

/**
 * Builder class to manipulate and generate a trie. This is useful for ICU data in primitive types.
 * Provides a compact way to store information that is indexed by Unicode values, such as character
 * properties, types, keyboard values, etc. This is very useful when you have a block of Unicode
 * data that contains significant values while the rest of the Unicode data is unused in the
 * application or when you have a lot of redundance, such as where all 21,000 Han ideographs have
 * the same value. However, lookup is much faster than a hash table. A trie of any primitive data
 * type serves two purposes:
 *
 * <UL>
 *   <LI>Fast access of the indexed values.
 *   <LI>Smaller memory footprint.
 * </UL>
 *
 * This is a direct port from the ICU4C version
 *
 * @author Syn Wee Quek
 */
public class TrieBuilder {
    // public data member ----------------------------------------------

    /** Number of data values in a stage 2 (data array) block. 2, 4, 8, .., 0x200 */
    public static final int DATA_BLOCK_LENGTH = 1 << Trie.INDEX_STAGE_1_SHIFT_;

    // public class declaration ----------------------------------------

    /**
     * Character data in com.ibm.impl.Trie have different user-specified format for different
     * purposes. This interface specifies methods to be implemented in order for com.ibm.impl.Trie,
     * to surrogate offset information encapsulated within the data.
     */
    public static interface DataManipulate {
        /**
         * Build-time trie callback function, used with serialize(). This function calculates a lead
         * surrogate's value including a folding offset from the 1024 supplementary code points
         * [start..start+1024[ . It is U+10000 <= start <= U+10fc00 and (start&0x3ff)==0. The
         * folding offset is provided by the caller. It is
         * offset=UTRIE_BMP_INDEX_LENGTH+n*UTRIE_SURROGATE_BLOCK_COUNT with n=0..1023. Instead of
         * the offset itself, n can be stored in 10 bits - or fewer if it can be assumed that few
         * lead surrogates have associated data. The returned value must be - not zero if and only
         * if there is relevant data for the corresponding 1024 supplementary code points - such
         * that UTrie.getFoldingOffset(UNewTrieGetFoldedValue(..., offset))==offset
         *
         * @return a folded value, or 0 if there is no relevant data for the lead surrogate.
         */
        public int getFoldedValue(int start, int offset);
    }

    // public methods ----------------------------------------------------

    /**
     * Checks if the character belongs to a zero block in the trie
     *
     * @param ch codepoint which data is to be retrieved
     * @return true if ch is in the zero block
     */
    public boolean isInZeroBlock(int ch) {
        // valid, uncompacted trie and valid c?
        if (m_isCompacted_ || ch > UCharacter.MAX_VALUE || ch < UCharacter.MIN_VALUE) {
            return true;
        }

        return m_index_[ch >> SHIFT_] == 0;
    }

    // package private method -----------------------------------------------

    // protected data member -----------------------------------------------

    /**
     * Index values at build-time are 32 bits wide for easier processing. Bit 31 is set if the data
     * block is used by multiple index values (from setRange()).
     */
    protected int m_index_[];

    protected int m_indexLength_;
    protected int m_dataCapacity_;
    protected int m_dataLength_;
    protected boolean m_isLatin1Linear_;
    protected boolean m_isCompacted_;

    /** Map of adjusted indexes, used in utrie_compact(). Maps from original indexes to new ones. */
    protected int m_map_[];

    /** Shift size for shifting right the input index. 1..9 */
    protected static final int SHIFT_ = Trie.INDEX_STAGE_1_SHIFT_;

    /**
     * Length of the index (stage 1) array before folding. Maximum number of Unicode code points
     * (0x110000) shifted right by SHIFT.
     */
    protected static final int MAX_INDEX_LENGTH_ = (0x110000 >> SHIFT_);

    /** Length of the BMP portion of the index (stage 1) array. */
    protected static final int BMP_INDEX_LENGTH_ = 0x10000 >> SHIFT_;

    /**
     * Number of index (stage 1) entries per lead surrogate. Same as number of indexe entries for
     * 1024 trail surrogates, ==0x400>>UTRIE_SHIFT 10 - SHIFT == Number of bits of a trail surrogate
     * that are used in index table lookups.
     */
    protected static final int SURROGATE_BLOCK_COUNT_ = 1 << (10 - SHIFT_);

    /** Mask for getting the lower bits from the input index. DATA_BLOCK_LENGTH - 1. */
    protected static final int MASK_ = Trie.INDEX_STAGE_3_MASK_;

    /**
     * Shift size for shifting left the index array values. Increases possible data size with 16-bit
     * index values at the cost of compactability. This requires blocks of stage 2 data to be
     * aligned by UTRIE_DATA_GRANULARITY. 0..UTRIE_SHIFT
     */
    protected static final int INDEX_SHIFT_ = Trie.INDEX_STAGE_2_SHIFT_;

    /**
     * Maximum length of the runtime data (stage 2) array. Limited by 16-bit index values that are
     * left-shifted by INDEX_SHIFT_.
     */
    protected static final int MAX_DATA_LENGTH_ = (0x10000 << INDEX_SHIFT_);

    /** Shifting to position the index value in options */
    protected static final int OPTIONS_INDEX_SHIFT_ = 4;

    /** If set, then the data (stage 2) array is 32 bits wide. */
    protected static final int OPTIONS_DATA_IS_32_BIT_ = 0x100;

    /**
     * If set, then Latin-1 data (for U+0000..U+00ff) is stored in the data (stage 2) array as a
     * simple, linear array at data + DATA_BLOCK_LENGTH.
     */
    protected static final int OPTIONS_LATIN1_IS_LINEAR_ = 0x200;

    /** The alignment size of a stage 2 data block. Also the granularity for compaction. */
    protected static final int DATA_GRANULARITY_ = 1 << INDEX_SHIFT_;

    // protected constructor ----------------------------------------------

    protected TrieBuilder() {
        m_index_ = new int[MAX_INDEX_LENGTH_];
        m_map_ = new int[MAX_BUILD_TIME_DATA_LENGTH_ >> SHIFT_];
        m_isLatin1Linear_ = false;
        m_isCompacted_ = false;
        m_indexLength_ = MAX_INDEX_LENGTH_;
    }

    protected TrieBuilder(TrieBuilder table) {
        m_index_ = new int[MAX_INDEX_LENGTH_];
        m_indexLength_ = table.m_indexLength_;
        System.arraycopy(table.m_index_, 0, m_index_, 0, m_indexLength_);
        m_dataCapacity_ = table.m_dataCapacity_;
        m_dataLength_ = table.m_dataLength_;
        m_map_ = new int[table.m_map_.length];
        System.arraycopy(table.m_map_, 0, m_map_, 0, m_map_.length);
        m_isLatin1Linear_ = table.m_isLatin1Linear_;
        m_isCompacted_ = table.m_isCompacted_;
    }

    // protected functions ------------------------------------------------

    /** Compare two sections of an array for equality. */
    protected static final boolean equal_int(int[] array, int start1, int start2, int length) {
        while (length > 0 && array[start1] == array[start2]) {
            ++start1;
            ++start2;
            --length;
        }
        return length == 0;
    }

    /**
     * Set a value in the trie index map to indicate which data block is referenced and which one is
     * not. utrie_compact() will remove data blocks that are not used at all. Set - 0 if it is used
     * - -1 if it is not used
     */
    protected void findUnusedBlocks() {
        // fill the entire map with "not used"
        Arrays.fill(m_map_, 0xff);

        // mark each block that _is_ used with 0
        for (int i = 0; i < m_indexLength_; ++i) {
            m_map_[Math.abs(m_index_[i]) >> SHIFT_] = 0;
        }

        // never move the all-initial-value block 0
        m_map_[0] = 0;
    }

    /**
     * Finds the same index block as the otherBlock
     *
     * @param index array
     * @param indexLength size of index
     * @param otherBlock
     * @return same index block
     */
    protected static final int findSameIndexBlock(int index[], int indexLength, int otherBlock) {
        for (int block = BMP_INDEX_LENGTH_; block < indexLength; block += SURROGATE_BLOCK_COUNT_) {
            if (equal_int(index, block, otherBlock, SURROGATE_BLOCK_COUNT_)) {
                return block;
            }
        }
        return indexLength;
    }

    // private data member ------------------------------------------------

    /**
     * Maximum length of the build-time data (stage 2) array. The maximum length is 0x110000 +
     * DATA_BLOCK_LENGTH + 0x400. (Number of Unicode code points + one all-initial-value block +
     * possible duplicate entries for 1024 lead surrogates.)
     */
    private static final int MAX_BUILD_TIME_DATA_LENGTH_ = 0x110000 + DATA_BLOCK_LENGTH + 0x400;
}