// TitlecaseTransliterator.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 * Copyright (C) 1996-2011, International Business Machines Corporation and
 * others. All Rights Reserved.
 *
 */
package com.ibm.icu.text;

import com.ibm.icu.impl.UCaseProps;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.util.ULocale;

/**
 * A transliterator that title-cases text based on case properties rather than on a word-break
 * iterator.
 *
 * <p>Every character is classified with {@code UCaseProps.getTypeOrIgnorable()} as cased, uncased,
 * or case-ignorable:
 *
 * <ul>
 *   <li>A non-case-ignorable character that follows an uncased, non-case-ignorable character (or
 *       that has no such preceding context) is mapped with {@code UCaseProps.toFullTitle()}.
 *   <li>A non-case-ignorable character that follows a cased character is mapped with {@code
 *       UCaseProps.toFullLower()}.
 *   <li>Case-ignorable characters are left untouched and do not change the current mode.
 * </ul>
 *
 * @author Alan Liu
 */
class TitlecaseTransliterator extends Transliterator {

    static final String _ID = "Any-Title";

    // TODO: Add variants for tr/az, lt, default = default locale: ICU ticket #12720

    /**
     * System registration hook. Registers a factory for {@link #_ID} that creates instances for
     * {@code ULocale.US}, and registers the "Title"/"Lower" special inverse pair.
     */
    static void register() {
        Transliterator.registerFactory(
                _ID,
                new Transliterator.Factory() {
                    @Override
                    public Transliterator getInstance(String ID) {
                        return new TitlecaseTransliterator(ULocale.US);
                    }
                });

        registerSpecialInverse("Title", "Lower", false);
    }

    /** Locale passed to {@code UCharacter.toTitleCase()} in {@link #addSourceTargetSet}. */
    private final ULocale locale;

    /** Case properties used for classification and for the full case mappings. */
    private final UCaseProps csp;

    /** Case-mapping locale code derived from {@link #locale} via {@code UCaseProps.getCaseLocale()}. */
    private final int caseLocale;

    /**
     * Constructs a transliterator.
     *
     * @param loc the locale whose case-mapping rules are used
     */
    public TitlecaseTransliterator(ULocale loc) {
        super(_ID, null);
        locale = loc;
        // Need to look back 2 characters in the case of "can't"
        setMaximumContextLength(2);
        csp = UCaseProps.INSTANCE;
        caseLocale = UCaseProps.getCaseLocale(locale);
    }

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     *
     * <p>Walks the code points in {@code [offsets.start, offsets.limit)}, replacing each one whose
     * case mapping differs from itself, and keeps {@code offsets.limit} and {@code
     * offsets.contextLimit} in sync with any length change. On normal completion {@code
     * offsets.start} is set to {@code offsets.limit}. In incremental mode, if a mapping function
     * tried to look past the context limit, {@code offsets.start} is set to the start of the code
     * point being processed and the method returns early so that the caller can supply more input.
     *
     * @param text the text to modify in place
     * @param offsets the range to transliterate and its surrounding context; updated on return
     * @param isIncremental whether more text may follow after {@code offsets.contextLimit}
     */
    @Override
    protected void handleTransliterate(Replaceable text, Position offsets, boolean isIncremental) {
        // TODO reimplement, see ustrcase.c
        // using a real word break iterator
        //   instead of just looking for a transition between cased and uncased characters
        // call CaseMapTransliterator::handleTransliterate() for lowercasing? (set fMap)
        // needs to take isIncremental into account because case mappings are context-sensitive
        //   also detect when lowercasing function did not finish because of context

        if (offsets.start >= offsets.limit) {
            return;
        }

        // Our mode; we are either converting letter toTitle or toLower.
        // Start in toLower mode only if the preceding context is "cased case-ignorable*";
        // any other prior context (including empty) starts in toTitle mode.
        boolean doTitle = !isPrecededByCasedCharacter(text, offsets);

        // Convert things after a cased character toLower; things
        // after a uncased, non-case-ignorable character toTitle.  Case-ignorable
        // characters are copied directly and do not change the mode.

        ReplaceableContextIterator iter = new ReplaceableContextIterator();
        iter.setText(text);
        iter.setIndex(offsets.start);
        iter.setLimit(offsets.limit);
        iter.setContextLimits(offsets.contextStart, offsets.contextLimit);

        // Receives the mapping string when a full case mapping yields more than one code point.
        StringBuilder result = new StringBuilder();

        // Walk through original string
        // If there is a case change, modify corresponding position in replaceable
        int c;
        while ((c = iter.nextCaseMapCP()) >= 0) {
            // case type: >0 cased (UCaseProps.LOWER etc.)  ==0 uncased  <0 case-ignorable
            int type = csp.getTypeOrIgnorable(c);
            if (type >= 0) { // not case-ignorable
                if (doTitle) {
                    c = csp.toFullTitle(c, iter, result, caseLocale);
                } else {
                    c = csp.toFullLower(c, iter, result, caseLocale);
                }
                doTitle = type == 0; // doTitle=isUncased

                if (iter.didReachLimit() && isIncremental) {
                    // the case mapping function tried to look beyond the context limit
                    // wait for more input
                    offsets.start = iter.getCaseMapCPStart();
                    return;
                }

                /* decode the result */
                int delta;
                if (c < 0) {
                    /* c mapped to itself, no change */
                    continue;
                } else if (c <= UCaseProps.MAX_STRING_LENGTH) {
                    /* replace by the mapping string */
                    delta = iter.replace(result.toString());
                    result.setLength(0);
                } else {
                    /* replace by single-code point mapping */
                    delta = iter.replace(UTF16.valueOf(c));
                }

                // The replacement may have changed the text length; shift the limits to match.
                if (delta != 0) {
                    offsets.limit += delta;
                    offsets.contextLimit += delta;
                }
            }
        }
        offsets.start = offsets.limit;
    }

    /**
     * Scans backwards from just before {@code offsets.start} down to {@code offsets.contextStart},
     * skipping case-ignorable characters, and reports whether the first non-case-ignorable
     * character found is cased.
     *
     * @return {@code true} if the preceding context is "cased case-ignorable*"; {@code false} if an
     *     uncased, non-case-ignorable character is found first or the context is exhausted
     */
    private boolean isPrecededByCasedCharacter(Replaceable text, Position offsets) {
        int c;
        for (int index = offsets.start - 1;
                index >= offsets.contextStart;
                index -= UTF16.getCharCount(c)) {
            c = text.char32At(index);
            int type = csp.getTypeOrIgnorable(c);
            if (type > 0) { // cased
                return true;
            } else if (type == 0) { // uncased but not ignorable
                return false;
            }
            // else (type<0) case-ignorable: continue
        }
        return false;
    }

    // NOTE: normally this would be static, but because the results vary by locale....
    SourceTargetUtility sourceTargetUtility = null;

    /**
     * Lazily creates this instance's {@link SourceTargetUtility} (guarded by a lock on {@code
     * this}), whose transform applies {@code UCharacter.toTitleCase(locale, source, null)}, and
     * then delegates to its {@code addSourceTargetSet}.
     *
     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet,
     *     com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
     */
    @Override
    public void addSourceTargetSet(
            UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        synchronized (this) {
            if (sourceTargetUtility == null) {
                sourceTargetUtility =
                        new SourceTargetUtility(
                                new Transform<String, String>() {
                                    @Override
                                    public String transform(String source) {
                                        return UCharacter.toTitleCase(locale, source, null);
                                    }
                                });
            }
        }
        sourceTargetUtility.addSourceTargetSet(this, inputFilter, sourceSet, targetSet);
    }
}