SearchIterator.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 1996-2015, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.text;

import java.text.CharacterIterator;

/**
 * {@code SearchIterator} is an abstract base class that provides methods to search for a pattern
 * within a text string. Instances of {@code SearchIterator} maintain a current position and scan
 * over the target text, returning the indices the pattern is matched and the length of each match.
 *
 * <p>{@code SearchIterator} defines a protocol for text searching. Subclasses provide concrete
 * implementations of various search algorithms. For example, {@code StringSearch} implements
 * language-sensitive pattern matching based on the comparison rules defined in a {@code
 * RuleBasedCollator} object.
 *
 * <p>Other options for searching include using a BreakIterator to restrict the points at which
 * matches are detected.
 *
 * <p>{@code SearchIterator} provides an API that is similar to that of other text iteration classes
 * such as {@code BreakIterator}. Using this class, it is easy to scan through text looking for all
 * occurrences of a given pattern. The following example uses a {@code StringSearch} object to find
 * all instances of "fox" in the target string. Any other subclass of {@code SearchIterator} can be
 * used in an identical manner.
 *
 * <pre><code>
 * String target = "The quick brown fox jumped over the lazy fox";
 * String pattern = "fox";
 * SearchIterator iter = new StringSearch(pattern, target);
 * for (int pos = iter.first(); pos != SearchIterator.DONE;
 *         pos = iter.next()) {
 *     System.out.println("Found match at " + pos +
 *             ", length is " + iter.getMatchLength());
 * }
 * </code></pre>
 *
 * @author Laura Werner, synwee
 * @stable ICU 2.0
 * @see BreakIterator
 * @see RuleBasedCollator
 */
public abstract class SearchIterator {
    /**
     * The BreakIterator to define the boundaries of a logical match. This value can be a null. See
     * class documentation for more information.
     *
     * @see #setBreakIterator(BreakIterator)
     * @see #getBreakIterator
     * @see BreakIterator
     * @stable ICU 2.0
     */
    protected BreakIterator breakIterator;

    /**
     * Target text for searching.
     *
     * @see #setTarget(CharacterIterator)
     * @see #getTarget
     * @stable ICU 2.0
     */
    protected CharacterIterator targetText;

    /**
     * Length of the most current match in target text. Value 0 is the default value.
     *
     * @see #setMatchLength
     * @see #getMatchLength
     * @stable ICU 2.0
     */
    protected int matchLength;

    /**
     * Java port of ICU4C struct USearch (usrchimp.h)
     *
     * <p>Note:
     *
     * <p>ICU4J already exposed some protected members such as targetText, breakIterator and
     * matchedLength as a part of stable APIs. In ICU4C, they are exposed through USearch struct,
     * although USearch struct itself is internal API.
     *
     * <p>This class was created for making ICU4J code parallel to ICU4C implementation. ICU4J
     * implementation access member fields like C struct (e.g. search_.isOverlap_) mostly, except
     * fields already exposed as protected member (e.g. search_.text()).
     */
    final class Search {

        CharacterIterator text() {
            return SearchIterator.this.targetText;
        }

        void setTarget(CharacterIterator text) {
            SearchIterator.this.targetText = text;
        }

        /**
         * Flag to indicate if overlapping search is to be done. E.g. looking for "aa" in "aaa" will
         * yield matches at offset 0 and 1.
         */
        boolean isOverlap_;

        boolean isCanonicalMatch_;

        ElementComparisonType elementComparisonType_;

        BreakIterator internalBreakIter_;

        BreakIterator breakIter() {
            return SearchIterator.this.breakIterator;
        }

        void setBreakIter(BreakIterator breakIter) {
            SearchIterator.this.breakIterator = breakIter;
        }

        int matchedIndex_;

        int matchedLength() {
            return SearchIterator.this.matchLength;
        }

        void setMatchedLength(int matchedLength) {
            SearchIterator.this.matchLength = matchedLength;
        }

        /** Flag indicates if we are doing a forwards search */
        boolean isForwardSearching_;

        /**
         * Flag indicates if we are at the start of a string search. This indicates that we are in
         * forward search and at the start of m_text.
         */
        boolean reset_;

        // Convenient methods for accessing begin/end index of the
        // target text. These are ICU4J only and are not data fields.
        int beginIndex() {
            if (targetText == null) {
                return 0;
            }
            return targetText.getBeginIndex();
        }

        int endIndex() {
            if (targetText == null) {
                return 0;
            }
            return targetText.getEndIndex();
        }
    }

    Search search_ = new Search();

    // public data members -------------------------------------------------

    /**
     * DONE is returned by previous() and next() after all valid matches have been returned, and by
     * first() and last() if there are no matches at all.
     *
     * @see #previous
     * @see #next
     * @stable ICU 2.0
     */
    public static final int DONE = -1;

    // public methods -----------------------------------------------------

    // public setters -----------------------------------------------------

    /**
     * Sets the position in the target text at which the next search will start. This method clears
     * any previous match.
     *
     * @param position position from which to start the next search
     * @exception IndexOutOfBoundsException thrown if argument position is out of the target text
     *     range.
     * @see #getIndex
     * @stable ICU 2.8
     */
    public void setIndex(int position) {
        if (position < search_.beginIndex() || position > search_.endIndex()) {
            throw new IndexOutOfBoundsException(
                    "setIndex(int) expected position to be between "
                            + search_.beginIndex()
                            + " and "
                            + search_.endIndex());
        }
        search_.reset_ = false;
        search_.setMatchedLength(0);
        search_.matchedIndex_ = DONE;
    }

    /**
     * Determines whether overlapping matches are returned. See the class documentation for more
     * information about overlapping matches.
     *
     * <p>The default setting of this property is false
     *
     * @param allowOverlap flag indicator if overlapping matches are allowed
     * @see #isOverlapping
     * @stable ICU 2.8
     */
    public void setOverlapping(boolean allowOverlap) {
        search_.isOverlap_ = allowOverlap;
    }

    /**
     * Set the BreakIterator that will be used to restrict the points at which matches are detected.
     *
     * @param breakiter A BreakIterator that will be used to restrict the points at which matches
     *     are detected. If a match is found, but the match's start or end index is not a boundary
     *     as determined by the {@link BreakIterator}, the match will be rejected and another will
     *     be searched for. If this parameter is {@code null}, no break detection is attempted.
     * @see BreakIterator
     * @stable ICU 2.0
     */
    public void setBreakIterator(BreakIterator breakiter) {
        search_.setBreakIter(breakiter);
        if (search_.breakIter() != null) {
            // Create a clone of CharacterItearator, so it won't
            // affect the position currently held by search_.text()
            if (search_.text() != null) {
                search_.breakIter().setText((CharacterIterator) search_.text().clone());
            }
        }
    }

    /**
     * Set the target text to be searched. Text iteration will then begin at the start of the text
     * string. This method is useful if you want to reuse an iterator to search within a different
     * body of text.
     *
     * @param text new text iterator to look for match,
     * @exception IllegalArgumentException thrown when text is null or has 0 length
     * @see #getTarget
     * @stable ICU 2.4
     */
    public void setTarget(CharacterIterator text) {
        if (text == null || text.getEndIndex() == text.getIndex()) {
            throw new IllegalArgumentException("Illegal null or empty text");
        }

        text.setIndex(text.getBeginIndex());
        search_.setTarget(text);
        search_.matchedIndex_ = DONE;
        search_.setMatchedLength(0);
        search_.reset_ = true;
        search_.isForwardSearching_ = true;
        if (search_.breakIter() != null) {
            // Create a clone of CharacterItearator, so it won't
            // affect the position currently held by search_.text()
            search_.breakIter().setText((CharacterIterator) text.clone());
        }
        if (search_.internalBreakIter_ != null) {
            search_.internalBreakIter_.setText((CharacterIterator) text.clone());
        }
    }

    // TODO: We may add APIs below to match ICU4C APIs
    // setCanonicalMatch

    // public getters ----------------------------------------------------

    /**
     * Returns the index to the match in the text string that was searched. This call returns a
     * valid result only after a successful call to {@link #first}, {@link #next}, {@link
     * #previous}, or {@link #last}. Just after construction, or after a searching method returns
     * {@link #DONE}, this method will return {@link #DONE}.
     *
     * <p>Use {@link #getMatchLength} to get the matched string length.
     *
     * @return index of a substring within the text string that is being searched.
     * @see #first
     * @see #next
     * @see #previous
     * @see #last
     * @stable ICU 2.0
     */
    public int getMatchStart() {
        return search_.matchedIndex_;
    }

    /**
     * Return the current index in the text being searched. If the iteration has gone past the end
     * of the text (or past the beginning for a backwards search), {@link #DONE} is returned.
     *
     * @return current index in the text being searched.
     * @stable ICU 2.8
     */
    public abstract int getIndex();

    /**
     * Returns the length of text in the string which matches the search pattern. This call returns
     * a valid result only after a successful call to {@link #first}, {@link #next}, {@link
     * #previous}, or {@link #last}. Just after construction, or after a searching method returns
     * {@link #DONE}, this method will return 0.
     *
     * @return The length of the match in the target text, or 0 if there is no match currently.
     * @see #first
     * @see #next
     * @see #previous
     * @see #last
     * @stable ICU 2.0
     */
    public int getMatchLength() {
        return search_.matchedLength();
    }

    /**
     * Returns the BreakIterator that is used to restrict the indexes at which matches are detected.
     * This will be the same object that was passed to the constructor or to {@link
     * #setBreakIterator}. If the {@link BreakIterator} has not been set, {@code null} will be
     * returned. See {@link #setBreakIterator} for more information.
     *
     * @return the BreakIterator set to restrict logic matches
     * @see #setBreakIterator
     * @see BreakIterator
     * @stable ICU 2.0
     */
    public BreakIterator getBreakIterator() {
        return search_.breakIter();
    }

    /**
     * Return the string text to be searched.
     *
     * @return text string to be searched.
     * @stable ICU 2.0
     */
    public CharacterIterator getTarget() {
        return search_.text();
    }

    /**
     * Returns the text that was matched by the most recent call to {@link #first}, {@link #next},
     * {@link #previous}, or {@link #last}. If the iterator is not pointing at a valid match (e.g.
     * just after construction or after {@link #DONE} has been returned, returns an empty string.
     *
     * @return the substring in the target test of the most recent match, or null if there is no
     *     match currently.
     * @see #first
     * @see #next
     * @see #previous
     * @see #last
     * @stable ICU 2.0
     */
    public String getMatchedText() {
        if (search_.matchedLength() > 0) {
            int limit = search_.matchedIndex_ + search_.matchedLength();
            StringBuilder result = new StringBuilder(search_.matchedLength());
            CharacterIterator it = search_.text();
            it.setIndex(search_.matchedIndex_);
            while (it.getIndex() < limit) {
                result.append(it.current());
                it.next();
            }
            it.setIndex(search_.matchedIndex_);
            return result.toString();
        }
        return null;
    }

    // miscellaneous public methods -----------------------------------------

    /**
     * Returns the index of the next point at which the text matches the search pattern, starting
     * from the current position The iterator is adjusted so that its current index (as returned by
     * {@link #getIndex}) is the match position if one was found. If a match is not found, {@link
     * #DONE} will be returned and the iterator will be adjusted to a position after the end of the
     * text string.
     *
     * @return The index of the next match after the current position, or {@link #DONE} if there are
     *     no more matches.
     * @see #getIndex
     * @stable ICU 2.0
     */
    public int next() {
        int index = getIndex(); // offset = getOffset() in ICU4C
        int matchindex = search_.matchedIndex_;
        int matchlength = search_.matchedLength();
        search_.reset_ = false;
        if (search_.isForwardSearching_) {
            int endIdx = search_.endIndex();
            if (index == endIdx
                    || matchindex == endIdx
                    || (matchindex != DONE && matchindex + matchlength >= endIdx)) {
                setMatchNotFound();
                return DONE;
            }
        } else {
            // switching direction.
            // if matchedIndex == DONE, it means that either a
            // setIndex (setOffset in C) has been called or that previous ran off the text
            // string. the iterator would have been set to offset 0 if a
            // match is not found.
            search_.isForwardSearching_ = true;
            if (search_.matchedIndex_ != DONE) {
                // there's no need to set the collation element iterator
                // the next call to next will set the offset.
                return matchindex;
            }
        }

        if (matchlength > 0) {
            // if matchlength is 0 we are at the start of the iteration
            if (search_.isOverlap_) {
                index++;
            } else {
                index += matchlength;
            }
        }

        return handleNext(index);
    }

    /**
     * Returns the index of the previous point at which the string text matches the search pattern,
     * starting at the current position. The iterator is adjusted so that its current index (as
     * returned by {@link #getIndex}) is the match position if one was found. If a match is not
     * found, {@link #DONE} will be returned and the iterator will be adjusted to the index {@link
     * #DONE}.
     *
     * @return The index of the previous match before the current position, or {@link #DONE} if
     *     there are no more matches.
     * @see #getIndex
     * @stable ICU 2.0
     */
    public int previous() {
        int index; // offset in ICU4C
        if (search_.reset_) {
            index = search_.endIndex(); // m_search_->textLength in ICU4C
            search_.isForwardSearching_ = false;
            search_.reset_ = false;
            setIndex(index);
        } else {
            index = getIndex();
        }

        int matchindex = search_.matchedIndex_;
        if (search_.isForwardSearching_) {
            // switching direction.
            // if matchedIndex == DONE, it means that either a
            // setIndex (setOffset in C) has been called or that next ran off the text
            // string. the iterator would have been set to offset textLength if
            // a match is not found.
            search_.isForwardSearching_ = false;
            if (matchindex != DONE) {
                return matchindex;
            }
        } else {
            int startIdx = search_.beginIndex();
            if (index == startIdx || matchindex == startIdx) {
                // not enough characters to match
                setMatchNotFound();
                return DONE;
            }
        }

        if (matchindex != DONE) {
            if (search_.isOverlap_) {
                matchindex += search_.matchedLength() - 2;
            }

            return handlePrevious(matchindex);
        }

        return handlePrevious(index);
    }

    /**
     * Return true if the overlapping property has been set. See {@link #setOverlapping(boolean)}
     * for more information.
     *
     * @see #setOverlapping
     * @return true if the overlapping property has been set, false otherwise
     * @stable ICU 2.8
     */
    public boolean isOverlapping() {
        return search_.isOverlap_;
    }

    // TODO: We may add APIs below to match ICU4C APIs
    // isCanonicalMatch

    /**
     * Resets the iteration. Search will begin at the start of the text string if a forward
     * iteration is initiated before a backwards iteration. Otherwise if a backwards iteration is
     * initiated before a forwards iteration, the search will begin at the end of the text string.
     *
     * @stable ICU 2.0
     */
    public void reset() {
        setMatchNotFound();
        setIndex(search_.beginIndex());
        search_.isOverlap_ = false;
        search_.isCanonicalMatch_ = false;
        search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
        search_.isForwardSearching_ = true;
        search_.reset_ = true;
    }

    /**
     * Returns the first index at which the string text matches the search pattern. The iterator is
     * adjusted so that its current index (as returned by {@link #getIndex()}) is the match position
     * if one
     *
     * <p>was found. If a match is not found, {@link #DONE} will be returned and the iterator will
     * be adjusted to the index {@link #DONE}.
     *
     * @return The character index of the first match, or {@link #DONE} if there are no matches.
     * @see #getIndex
     * @stable ICU 2.0
     */
    public final int first() {
        int startIdx = search_.beginIndex();
        setIndex(startIdx);
        return handleNext(startIdx);
    }

    /**
     * Returns the first index equal or greater than {@code position} at which the string text
     * matches the search pattern. The iterator is adjusted so that its current index (as returned
     * by {@link #getIndex()}) is the match position if one was found. If a match is not found,
     * {@link #DONE} will be returned and the iterator will be adjusted to the index {@link #DONE}.
     *
     * @param position where search if to start from.
     * @return The character index of the first match following {@code position}, or {@link #DONE}
     *     if there are no matches.
     * @throws IndexOutOfBoundsException If position is less than or greater than the text range for
     *     searching.
     * @see #getIndex
     * @stable ICU 2.0
     */
    public final int following(int position) {
        setIndex(position);
        return handleNext(position);
    }

    /**
     * Returns the last index in the target text at which it matches the search pattern. The
     * iterator is adjusted so that its current index (as returned by {@link #getIndex}) is the
     * match position if one was found. If a match is not found, {@link #DONE} will be returned and
     * the iterator will be adjusted to the index {@link #DONE}.
     *
     * @return The index of the first match, or {@link #DONE} if there are no matches.
     * @see #getIndex
     * @stable ICU 2.0
     */
    public final int last() {
        int endIdx = search_.endIndex();
        setIndex(endIdx);
        return handlePrevious(endIdx);
    }

    /**
     * Returns the first index less than {@code position} at which the string text matches the
     * search pattern. The iterator is adjusted so that its current index (as returned by {@link
     * #getIndex}) is the match position if one was found. If a match is not found, {@link #DONE}
     * will be returned and the iterator will be adjusted to the index {@link #DONE}
     *
     * <p>When the overlapping option ({@link #isOverlapping}) is off, the last index of the result
     * match is always less than {@code position}. When the overlapping option is on, the result
     * match may span across {@code position}.
     *
     * @param position where search is to start from.
     * @return The character index of the first match preceding {@code position}, or {@link #DONE}
     *     if there are no matches.
     * @throws IndexOutOfBoundsException If position is less than or greater than the text range for
     *     searching
     * @see #getIndex
     * @stable ICU 2.0
     */
    public final int preceding(int position) {
        setIndex(position);
        return handlePrevious(position);
    }

    // protected constructor ----------------------------------------------

    /**
     * Protected constructor for use by subclasses. Initializes the iterator with the argument
     * target text for searching and sets the BreakIterator. See class documentation for more
     * details on the use of the target text and {@link BreakIterator}.
     *
     * @param target The target text to be searched.
     * @param breaker A {@link BreakIterator} that is used to determine the boundaries of a logical
     *     match. This argument can be null.
     * @exception IllegalArgumentException thrown when argument target is null, or of length 0
     * @see BreakIterator
     * @stable ICU 2.0
     */
    protected SearchIterator(CharacterIterator target, BreakIterator breaker) {
        if (target == null || (target.getEndIndex() - target.getBeginIndex()) == 0) {
            throw new IllegalArgumentException(
                    "Illegal argument target. " + " Argument can not be null or of length 0");
        }

        search_.setTarget(target);
        search_.setBreakIter(breaker);
        if (search_.breakIter() != null) {
            search_.breakIter().setText((CharacterIterator) target.clone());
        }
        search_.isOverlap_ = false;
        search_.isCanonicalMatch_ = false;
        search_.elementComparisonType_ = ElementComparisonType.STANDARD_ELEMENT_COMPARISON;
        search_.isForwardSearching_ = true;
        search_.reset_ = true;
        search_.matchedIndex_ = DONE;
        search_.setMatchedLength(0);
    }

    // protected methods --------------------------------------------------

    /**
     * Sets the length of the most recent match in the target text. Subclasses' handleNext() and
     * handlePrevious() methods should call this after they find a match in the target text.
     *
     * @param length new length to set
     * @see #handleNext
     * @see #handlePrevious
     * @stable ICU 2.0
     */
    protected void setMatchLength(int length) {
        search_.setMatchedLength(length);
    }

    /**
     * Abstract method which subclasses override to provide the mechanism for finding the next match
     * in the target text. This allows different subclasses to provide different search algorithms.
     *
     * <p>If a match is found, the implementation should return the index at which the match starts
     * and should call {@link #setMatchLength} with the number of characters in the target text that
     * make up the match. If no match is found, the method should return {@link #DONE}.
     *
     * @param start The index in the target text at which the search should start.
     * @return index at which the match starts, else if match is not found {@link #DONE} is returned
     * @see #setMatchLength
     * @stable ICU 2.0
     */
    protected abstract int handleNext(int start);

    /**
     * Abstract method which subclasses override to provide the mechanism for finding the previous
     * match in the target text. This allows different subclasses to provide different search
     * algorithms.
     *
     * <p>If a match is found, the implementation should return the index at which the match starts
     * and should call {@link #setMatchLength} with the number of characters in the target text that
     * make up the match. If no match is found, the method should return {@link #DONE}.
     *
     * @param startAt The index in the target text at which the search should start.
     * @return index at which the match starts, else if match is not found {@link #DONE} is returned
     * @see #setMatchLength
     * @stable ICU 2.0
     */
    protected abstract int handlePrevious(int startAt);

    /**
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    // TODO: This protected method is @stable 2.0 in ICU4C
    protected void setMatchNotFound() {
        search_.matchedIndex_ = DONE;
        search_.setMatchedLength(0);
    }

    /**
     * Option to control how collation elements are compared. The default value will be {@link
     * #STANDARD_ELEMENT_COMPARISON}.
     *
     * <p>PATTERN_BASE_WEIGHT_IS_WILDCARD supports "asymmetric search" as described in <a
     * href="http://www.unicode.org/reports/tr10/#Asymmetric_Search">UTS #10 Unicode Collation
     * Algorithm</a>, while ANY_BASE_WEIGHT_IS_WILDCARD supports a related option in which
     * "unmarked" characters in either the pattern or the searched text are treated as wildcards
     * that match marked or unmarked versions of the same character.
     *
     * @see #setElementComparisonType(ElementComparisonType)
     * @see #getElementComparisonType()
     * @stable ICU 53
     */
    public enum ElementComparisonType {
        /**
         * Standard collation element comparison at the specified collator strength.
         *
         * @stable ICU 53
         */
        STANDARD_ELEMENT_COMPARISON,
        /**
         * Collation element comparison is modified to effectively provide behavior between the
         * specified strength and strength - 1.
         *
         * <p>Collation elements in the pattern that have the base weight for the specified strength
         * are treated as "wildcards" that match an element with any other weight at that collation
         * level in the searched text. For example, with a secondary-strength English collator, a
         * plain 'e' in the pattern will match a plain e or an e with any diacritic in the searched
         * text, but an e with diacritic in the pattern will only match an e with the same diacritic
         * in the searched text.
         *
         * @stable ICU 53
         */
        PATTERN_BASE_WEIGHT_IS_WILDCARD,

        /**
         * Collation element comparison is modified to effectively provide behavior between the
         * specified strength and strength - 1.
         *
         * <p>Collation elements in either the pattern or the searched text that have the base
         * weight for the specified strength are treated as "wildcards" that match an element with
         * any other weight at that collation level. For example, with a secondary-strength English
         * collator, a plain 'e' in the pattern will match a plain e or an e with any diacritic in
         * the searched text, but an e with diacritic in the pattern will only match an e with the
         * same diacritic or a plain e in the searched text.
         *
         * @stable ICU 53
         */
        ANY_BASE_WEIGHT_IS_WILDCARD
    }

    /**
     * Sets the collation element comparison type.
     *
     * <p>The default comparison type is {@link ElementComparisonType#STANDARD_ELEMENT_COMPARISON}.
     *
     * @see ElementComparisonType
     * @see #getElementComparisonType()
     * @stable ICU 53
     */
    public void setElementComparisonType(ElementComparisonType type) {
        search_.elementComparisonType_ = type;
    }

    /**
     * Returns the collation element comparison type.
     *
     * @see ElementComparisonType
     * @see #setElementComparisonType(ElementComparisonType)
     * @stable ICU 53
     */
    public ElementComparisonType getElementComparisonType() {
        return search_.elementComparisonType_;
    }
}