RuleBasedSegmenter.java

// © 2025 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html

package com.ibm.icu.segmenter;

import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import java.io.InputStream;

/**
 * Performs segmentation according to the provided rule string. The rule string must follow the same
 * guidelines as for {@link RuleBasedBreakIterator#RuleBasedBreakIterator(String)}.
 *
 * <p>Instances are created through {@link #builder()}; a rule string must be supplied with {@link
 * Builder#setRules(String)} before calling {@link Builder#build()}.
 *
 * @draft ICU 78
 */
public class RuleBasedSegmenter implements Segmenter {

    /** Break iterator compiled from the rule string; handed to every {@code Segments} created. */
    private final BreakIterator breakIterPrototype;

    private RuleBasedSegmenter(BreakIterator breakIter) {
        breakIterPrototype = breakIter;
    }

    /**
     * Returns a {@link Segments} object that encapsulates the segmentation of the input {@code
     * CharSequence}. The {@code Segments} object, in turn, provides the main APIs to support
     * traversal over the resulting segments and boundaries via the Java {@code Stream} abstraction.
     *
     * @param s input {@code CharSequence} on which segmentation is performed. The input must not be
     *     modified while using the resulting {@code Segments} object.
     * @return A {@code Segments} object with APIs to access the results of segmentation, including
     *     APIs that return {@code Stream}s of the segments and boundaries.
     * @draft ICU 78
     */
    @Override
    public Segments segment(CharSequence s) {
        return new SegmentsImpl(breakIterPrototype, s);
    }

    /**
     * Creates a new, empty builder. No rule string is set on the returned builder.
     *
     * @return a builder for constructing {@code RuleBasedSegmenter}
     * @draft ICU 78
     */
    public static Builder builder() {
        return new Builder();
    }

    /**
     * Builder for {@link RuleBasedSegmenter}
     *
     * @draft ICU 78
     */
    public static class Builder {

        /** Break iterator compiled by the last successful {@code setRules} call, or null. */
        private BreakIterator breakIter;

        private Builder() {}

        /**
         * Sets the rule string for segmentation. The rules are compiled immediately, so an invalid
         * rule string is reported by this method rather than by {@link #build()}. If compilation
         * fails, any rules set by an earlier successful call are kept.
         *
         * @param rules rule string. The rule string must follow the same guidelines as for {@link
         *     RuleBasedBreakIterator#RuleBasedBreakIterator(String)}.
         * @return this builder, to allow call chaining
         * @throws IllegalArgumentException if {@code rules} is null, or if creating the underlying
         *     {@link RuleBasedBreakIterator} from {@code rules} fails; in the latter case the
         *     original exception is attached as the cause
         * @draft ICU 78
         */
        public Builder setRules(String rules) {
            if (rules == null) {
                throw new IllegalArgumentException("rules cannot be set to null.");
            }
            // Keep the try scope limited to the one call that can fail on bad rules.
            try {
                breakIter = new RuleBasedBreakIterator(rules);
            } catch (RuntimeException rte) {
                throw new IllegalArgumentException(
                        "The provided rule string is invalid"
                                + " or there was an error in creating the RuleBasedSegmenter.",
                        rte);
            }
            return this;
        }

        /**
         * Builds the {@code Segmenter}
         *
         * @return the constructed {@code Segmenter} instance
         * @throws IllegalArgumentException if no rule string has been successfully set via {@link
         *     #setRules(String)}
         * @draft ICU 78
         */
        public Segmenter build() {
            if (breakIter == null) {
                throw new IllegalArgumentException("A rule string must be set.");
            }
            return new RuleBasedSegmenter(breakIter);
        }
    }
}