Segments.java
// © 2025 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html
package com.ibm.icu.segmenter;
import java.util.stream.IntStream;
import java.util.stream.Stream;
/**
* An interface that represents the segmentation results, including the APIs for iteration therein,
* that are yielded from passing an input {@code CharSequence} to a {@code Segmenter}.
*
* <p>The segmentation results can be provided either as the segmentation boundary indices ({code
* int}s) or as segments, which are represented by the {@link Segment} class. In turn, the {@code
* Segment} object can also provide the subsequence of the original input that it represents.
*
* <p>Example:
*
* <blockquote>
*
* <pre>
* Segmenter wordSeg =
* LocalizedSegmenter.builder()
* .setLocale(ULocale.forLanguageTag("de"))
* .setSegmentationType(SegmentationType.WORD)
* .build();
*
* Segments segments = wordSeg.segment("Das 21ste Jahrh. ist das beste.");
*
* List<CharSequence> words = segments.subSequences().collect(Collectors.toList());
* </pre>
*
* </blockquote>
*
* @see Segmenter
* @see Segment
* @draft ICU 78
*/
public interface Segments {
/**
* Returns a {@code Stream} of the {@code CharSequence}s for all of the segments in the source
* sequence. Start from the beginning of the sequence and iterate forwards until the end.
*
* @return a {@code Stream} of all {@code Segments} in the source sequence.
* @draft ICU 78
*/
default Stream<CharSequence> subSequences() {
return segments().map(Segment::getSubSequence);
}
/**
* Returns the segment that contains index {@code i}. Containment is inclusive of the start
* index and exclusive of the limit index.
*
* <p>Specifically, the containing segment is defined as the segment with start {@code s} and
* limit {@code l} such that {@code s ≤ i < l}.
*
* @param i index in the input {@code CharSequence} to the {@code Segmenter}
* @throws IndexOutOfBoundsException if {@code i} is less than 0 or greater than or equal to the
* length of the input {@code CharSequence} to the {@code Segmenter}
* @return A segment that either starts at or contains index {@code i}
* @draft ICU 78
*/
Segment segmentAt(int i);
/**
* Returns a {@code Stream} of all {@code Segment}s in the source sequence. Start with the first
* and iterate forwards until the end of the sequence.
*
* <p>This is equivalent to {@code segmentsFrom(0)}.
*
* @return a {@code Stream} of all {@code Segments} in the source sequence.
* @draft ICU 78
*/
default Stream<Segment> segments() {
return segmentsFrom(0);
}
/**
* Returns a {@code Stream} of all {@code Segment}s in the source sequence where all segment
* limits {@code l} satisfy {@code i < l}. Iteration moves forwards.
*
* <p>This means that the first segment in the stream is the same as what is returned by {@code
* segmentAt(i)}.
*
* <p>The word "from" is used here to mean "at or after", with the semantics of "at" for a
* {@code Segment} defined by {@link #segmentAt(int)}}. We cannot describe the segments all as
* being "after" since the first segment might contain {@code i} in the middle, meaning that in
* the forward direction, its start position precedes {@code i}.
*
* <p>{@code segmentsFrom} and {@link #segmentsBefore(int)} create a partitioning of the space
* of all {@code Segment}s.
*
* @param i index in the input {@code CharSequence} to the {@code Segmenter}
* @return a {@code Stream} of all {@code Segment}s at or after {@code i}
* @draft ICU 78
*/
Stream<Segment> segmentsFrom(int i);
/**
* Returns a {@code Stream} of all {@code Segment}s in the source sequence where all segment
* limits {@code l} satisfy {@code l ≤ i}. Iteration moves backwards.
*
* <p>This means that the all segments in the stream come before the one that is returned by
* {@code segmentAt(i)}. A segment is not considered to contain index {@code i} if {code i} is
* equal to limit {@code l}. Thus, "before" encapsulates the invariant {@code l ≤ i}.
*
* @param i index in the input {@code CharSequence} to the {@code Segmenter}
* @return a {@code Stream} of all {@code Segment}s before {@code i}
* @draft ICU 78
*/
Stream<Segment> segmentsBefore(int i);
/**
* Returns whether offset {@code i} is a segmentation boundary. Throws an exception when {@code
* i} is not a valid index position for the source sequence.
*
* @param i index in the input {@code CharSequence} to the {@code Segmenter}
* @throws IllegalArgumentException if {@code i} is less than 0 or greater than the length of
* the input {@code CharSequence} to the {@code Segmenter}
* @return Returns whether offset {@code i} is a segmentation boundary.
* @draft ICU 78
*/
boolean isBoundary(int i);
/**
* Returns all segmentation boundaries, starting from the beginning and moving forwards.
*
* <p><b>Note:</b> {@code boundaries() != boundariesAfter(0)}. This difference naturally results
* from the strict inequality condition in boundariesAfter, and the fact that 0 is the first
* boundary returned from the start of an input sequence.
*
* @return An {@code IntStream} of all segmentation boundaries, starting at the first boundary
* with index 0, and moving forwards in the input sequence.
* @draft ICU 78
*/
default IntStream boundaries() {
return boundariesAfter(-1);
}
/**
* Returns all segmentation boundaries after the provided index. Iteration moves forwards.
*
* @param i index in the input {@code CharSequence} to the {@code Segmenter}
* @return An {@code IntStream} of all boundaries {@code b} such that {@code b > i}
* @draft ICU 78
*/
IntStream boundariesAfter(int i);
/**
* Returns all segmentation boundaries on or before the provided index. Iteration moves
* backwards.
*
* <p>The phrase "back from" is used to indicate both that: 1) boundaries are "on or before" the
* input index; 2) the direction of iteration is backwards (towards the beginning). "on or
* before" indicates that the result set is {@code b} where {@code b ≤ i}, which is a weak
* inequality, while "before" might suggest the strict inequality {@code b < i}.
*
* <p>{@code boundariesBackFrom} and {@link #boundariesAfter(int)} create a partitioning of the
* space of all boundaries.
*
* @param i index in the input {@code CharSequence} to the {@code Segmenter}
* @return An {@code IntStream} of all boundaries {@code b} such that {@code b ≤ i}
* @draft ICU 78
*/
IntStream boundariesBackFrom(int i);
//
// Inner enums/classes in common for other inner classes
//
/**
* @draft ICU 78
*/
enum IterationDirection {
/**
* @draft ICU 78
*/
FORWARDS,
/**
* @draft ICU 78
*/
BACKWARDS,
}
}