UnicodeSetSpanner.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2014-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.text;

import com.ibm.icu.text.UnicodeSet.SpanCondition;
import com.ibm.icu.util.OutputInt;

/**
 * A helper class used to count, replace, and trim CharSequences based on UnicodeSet matches. An
 * instance is immutable (and thus thread-safe) iff the source UnicodeSet is frozen.
 *
 * <p><b>Note:</b> The counting, deletion, and replacement depend on alternating a {@link
 * SpanCondition} with its inverse. That is, the code spans, then spans for the inverse, then spans,
 * and so on. For the inverse, the following mapping is used:
 *
 * <ul>
 *   <li>{@link UnicodeSet.SpanCondition#SIMPLE} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}
 *   <li>{@link UnicodeSet.SpanCondition#CONTAINED} → {@link UnicodeSet.SpanCondition#NOT_CONTAINED}
 *   <li>{@link UnicodeSet.SpanCondition#NOT_CONTAINED} → {@link UnicodeSet.SpanCondition#SIMPLE}
 * </ul>
 *
 * These are actually not complete inverses. However, the alternating works because there are no
 * gaps. For example, with [a{ab}{bc}], you get the following behavior when scanning forward:
 *
 * <table border="1">
 * <tr><th>SIMPLE</th><td>xxx[ab]cyyy</td></tr>
 * <tr><th>CONTAINED</th><td>xxx[abc]yyy</td></tr>
 * <tr><th>NOT_CONTAINED</th><td>[xxx]ab[cyyy]</td></tr>
 * </table>
 *
 * <p>So here is what happens when you alternate:
 *
 * <table border="1">
 * <tr><th>start</th><td>|xxxabcyyy</td></tr>
 * <tr><th>NOT_CONTAINED</th><td>xxx|abcyyy</td></tr>
 * <tr><th>CONTAINED</th><td>xxxabc|yyy</td></tr>
 * <tr><th>NOT_CONTAINED</th><td>xxxabcyyy|</td></tr>
 * </table>
 *
 * <p>The entire string is traversed.
 *
 * @stable ICU 54
 */
public class UnicodeSetSpanner {

    private final UnicodeSet unicodeSet;

    /**
     * Create a spanner from a UnicodeSet. For speed and safety, the UnicodeSet should be frozen.
     * However, this class can be used with a non-frozen version to avoid the cost of freezing.
     *
     * @param source the original UnicodeSet
     * @stable ICU 54
     */
    public UnicodeSetSpanner(UnicodeSet source) {
        unicodeSet = source;
    }

    /**
     * Returns the UnicodeSet used for processing. It is frozen iff the original was.
     *
     * @return the construction set.
     * @stable ICU 54
     */
    public UnicodeSet getUnicodeSet() {
        return unicodeSet;
    }

    /**
     * {@inheritDoc}
     *
     * @stable ICU 54
     */
    @Override
    public boolean equals(Object other) {
        return other instanceof UnicodeSetSpanner
                && unicodeSet.equals(((UnicodeSetSpanner) other).unicodeSet);
    }

    /**
     * {@inheritDoc}
     *
     * @stable ICU 54
     */
    @Override
    public int hashCode() {
        return unicodeSet.hashCode();
    }

    /**
     * Options for replaceFrom and countIn to control how to treat each matched span. It is similar
     * to whether one is replacing [abc] by x, or [abc]* by x.
     *
     * @stable ICU 54
     */
    public enum CountMethod {
        /**
         * Collapse spans. That is, modify/count the entire matching span as a single item, instead
         * of separate set elements.
         *
         * @stable ICU 54
         */
        WHOLE_SPAN,
        /**
         * Use the smallest number of elements in the spanned range for counting and modification,
         * based on the {@link UnicodeSet.SpanCondition}. If the set has no strings, this will be
         * the same as the number of spanned code points.
         *
         * <p>For example, in the string "abab" with SpanCondition.SIMPLE:
         *
         * <ul>
         *   <li>spanning with [ab] will count four MIN_ELEMENTS.
         *   <li>spanning with [{ab}] will count two MIN_ELEMENTS.
         *   <li>spanning with [ab{ab}] will also count two MIN_ELEMENTS.
         * </ul>
         *
         * @stable ICU 54
         */
        MIN_ELEMENTS,
        // Note: could in the future have an additional option MAX_ELEMENTS
    }

    /**
     * Returns the number of matching characters found in a character sequence, counting by
     * CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. The code alternates spans; see the class
     * doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
     *
     * @param sequence the sequence to count characters in
     * @return the count. Zero if there are none.
     * @stable ICU 54
     */
    public int countIn(CharSequence sequence) {
        return countIn(sequence, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
    }

    /**
     * Returns the number of matching characters found in a character sequence, using
     * SpanCondition.SIMPLE. The code alternates spans; see the class doc for {@link
     * UnicodeSetSpanner} for a note about boundary conditions.
     *
     * @param sequence the sequence to count characters in
     * @param countMethod whether to treat an entire span as a match, or individual elements as
     *     matches
     * @return the count. Zero if there are none.
     * @stable ICU 54
     */
    public int countIn(CharSequence sequence, CountMethod countMethod) {
        return countIn(sequence, countMethod, SpanCondition.SIMPLE);
    }

    /**
     * Returns the number of matching characters found in a character sequence. The code alternates
     * spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
     *
     * @param sequence the sequence to count characters in
     * @param countMethod whether to treat an entire span as a match, or individual elements as
     *     matches
     * @param spanCondition the spanCondition to use. SIMPLE or CONTAINED means only count the
     *     elements in the span; NOT_CONTAINED is the reverse. <br>
     *     <b>WARNING: </b> when a UnicodeSet contains strings, there may be unexpected behavior in
     *     edge cases.
     * @return the count. Zero if there are none.
     * @stable ICU 54
     */
    public int countIn(
            CharSequence sequence, CountMethod countMethod, SpanCondition spanCondition) {
        int count = 0;
        int start = 0;
        SpanCondition skipSpan =
                spanCondition == SpanCondition.NOT_CONTAINED
                        ? SpanCondition.SIMPLE
                        : SpanCondition.NOT_CONTAINED;
        final int length = sequence.length();
        OutputInt spanCount = null;
        while (start != length) {
            int endOfSpan = unicodeSet.span(sequence, start, skipSpan);
            if (endOfSpan == length) {
                break;
            }
            if (countMethod == CountMethod.WHOLE_SPAN) {
                start = unicodeSet.span(sequence, endOfSpan, spanCondition);
                count += 1;
            } else {
                if (spanCount == null) {
                    spanCount = new OutputInt();
                }
                start = unicodeSet.spanAndCount(sequence, endOfSpan, spanCondition, spanCount);
                count += spanCount.value;
            }
        }
        return count;
    }

    /**
     * Delete all the matching spans in sequence, using SpanCondition.SIMPLE The code alternates
     * spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
     *
     * @param sequence charsequence to replace matching spans in.
     * @return modified string.
     * @stable ICU 54
     */
    public String deleteFrom(CharSequence sequence) {
        return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, SpanCondition.SIMPLE);
    }

    /**
     * Delete all matching spans in sequence, according to the spanCondition. The code alternates
     * spans; see the class doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
     *
     * @param sequence charsequence to replace matching spans in.
     * @param spanCondition specify whether to modify the matching spans (CONTAINED or SIMPLE) or
     *     the non-matching (NOT_CONTAINED)
     * @return modified string.
     * @stable ICU 54
     */
    public String deleteFrom(CharSequence sequence, SpanCondition spanCondition) {
        return replaceFrom(sequence, "", CountMethod.WHOLE_SPAN, spanCondition);
    }

    /**
     * Replace all matching spans in sequence by the replacement, counting by
     * CountMethod.MIN_ELEMENTS using SpanCondition.SIMPLE. The code alternates spans; see the class
     * doc for {@link UnicodeSetSpanner} for a note about boundary conditions.
     *
     * @param sequence charsequence to replace matching spans in.
     * @param replacement replacement sequence. To delete, use ""
     * @return modified string.
     * @stable ICU 54
     */
    public String replaceFrom(CharSequence sequence, CharSequence replacement) {
        return replaceFrom(sequence, replacement, CountMethod.MIN_ELEMENTS, SpanCondition.SIMPLE);
    }

    /**
     * Replace all matching spans in sequence by replacement, according to the CountMethod, using
     * SpanCondition.SIMPLE. The code alternates spans; see the class doc for {@link
     * UnicodeSetSpanner} for a note about boundary conditions.
     *
     * @param sequence charsequence to replace matching spans in.
     * @param replacement replacement sequence. To delete, use ""
     * @param countMethod whether to treat an entire span as a match, or individual elements as
     *     matches
     * @return modified string.
     * @stable ICU 54
     */
    public String replaceFrom(
            CharSequence sequence, CharSequence replacement, CountMethod countMethod) {
        return replaceFrom(sequence, replacement, countMethod, SpanCondition.SIMPLE);
    }

    /**
     * Replace all matching spans in sequence by replacement, according to the countMethod and
     * spanCondition. The code alternates spans; see the class doc for {@link UnicodeSetSpanner} for
     * a note about boundary conditions.
     *
     * @param sequence charsequence to replace matching spans in.
     * @param replacement replacement sequence. To delete, use ""
     * @param countMethod whether to treat an entire span as a match, or individual elements as
     *     matches
     * @param spanCondition specify whether to modify the matching spans (CONTAINED or SIMPLE) or
     *     the non-matching (NOT_CONTAINED)
     * @return modified string.
     * @stable ICU 54
     */
    public String replaceFrom(
            CharSequence sequence,
            CharSequence replacement,
            CountMethod countMethod,
            SpanCondition spanCondition) {
        SpanCondition copySpan =
                spanCondition == SpanCondition.NOT_CONTAINED
                        ? SpanCondition.SIMPLE
                        : SpanCondition.NOT_CONTAINED;
        final boolean remove = replacement.length() == 0;
        StringBuilder result = new StringBuilder();
        // TODO, we can optimize this to
        // avoid this allocation unless needed

        final int length = sequence.length();
        OutputInt spanCount = null;
        for (int endCopy = 0; endCopy != length; ) {
            int endModify;
            if (countMethod == CountMethod.WHOLE_SPAN) {
                endModify = unicodeSet.span(sequence, endCopy, spanCondition);
            } else {
                if (spanCount == null) {
                    spanCount = new OutputInt();
                }
                endModify = unicodeSet.spanAndCount(sequence, endCopy, spanCondition, spanCount);
            }
            if (remove || endModify == 0) {
                // do nothing
            } else if (countMethod == CountMethod.WHOLE_SPAN) {
                result.append(replacement);
            } else {
                for (int i = spanCount.value; i > 0; --i) {
                    result.append(replacement);
                }
            }
            if (endModify == length) {
                break;
            }
            endCopy = unicodeSet.span(sequence, endModify, copySpan);
            result.append(sequence.subSequence(endModify, endCopy));
        }
        return result.toString();
    }

    /**
     * Options for the trim() method
     *
     * @stable ICU 54
     */
    public enum TrimOption {
        /**
         * Trim leading spans.
         *
         * @stable ICU 54
         */
        LEADING,
        /**
         * Trim leading and trailing spans.
         *
         * @stable ICU 54
         */
        BOTH,
        /**
         * Trim trailing spans.
         *
         * @stable ICU 54
         */
        TRAILING;
    }

    /**
     * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements
     * at the start and end of the string, using TrimOption.BOTH and SpanCondition.SIMPLE. For
     * example:
     *
     * <pre>{@code
     * new UnicodeSet("[ab]").trim("abacatbab")
     * }</pre>
     *
     * ... returns {@code "cat"}.
     *
     * @param sequence the sequence to trim
     * @return a subsequence
     * @stable ICU 54
     */
    public CharSequence trim(CharSequence sequence) {
        return trim(sequence, TrimOption.BOTH, SpanCondition.SIMPLE);
    }

    /**
     * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements
     * at the start or end of the string, using the trimOption and SpanCondition.SIMPLE. For
     * example:
     *
     * <pre>{@code
     * new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING)
     * }</pre>
     *
     * ... returns {@code "catbab"}.
     *
     * @param sequence the sequence to trim
     * @param trimOption LEADING, TRAILING, or BOTH
     * @return a subsequence
     * @stable ICU 54
     */
    public CharSequence trim(CharSequence sequence, TrimOption trimOption) {
        return trim(sequence, trimOption, SpanCondition.SIMPLE);
    }

    /**
     * Returns a trimmed sequence (using CharSequence.subsequence()), that omits matching elements
     * at the start or end of the string, depending on the trimOption and spanCondition. For
     * example:
     *
     * <pre>{@code
     * new UnicodeSet("[ab]").trim("abacatbab", TrimOption.LEADING, SpanCondition.SIMPLE)
     * }</pre>
     *
     * ... returns {@code "catbab"}.
     *
     * @param sequence the sequence to trim
     * @param trimOption LEADING, TRAILING, or BOTH
     * @param spanCondition SIMPLE, CONTAINED or NOT_CONTAINED
     * @return a subsequence
     * @stable ICU 54
     */
    public CharSequence trim(
            CharSequence sequence, TrimOption trimOption, SpanCondition spanCondition) {
        int endLeadContained, startTrailContained;
        final int length = sequence.length();
        if (trimOption != TrimOption.TRAILING) {
            endLeadContained = unicodeSet.span(sequence, spanCondition);
            if (endLeadContained == length) {
                return "";
            }
        } else {
            endLeadContained = 0;
        }
        if (trimOption != TrimOption.LEADING) {
            startTrailContained = unicodeSet.spanBack(sequence, spanCondition);
        } else {
            startTrailContained = length;
        }
        return endLeadContained == 0 && startTrailContained == length
                ? sequence
                : sequence.subSequence(endLeadContained, startTrailContained);
    }
}