// RbnfScannerProviderImpl.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2009-2014, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */

package com.ibm.icu.impl.text;

import com.ibm.icu.impl.ICUDebug;
import com.ibm.icu.text.CollationElementIterator;
import com.ibm.icu.text.Collator;
import com.ibm.icu.text.RbnfLenientScanner;
import com.ibm.icu.text.RbnfLenientScannerProvider;
import com.ibm.icu.text.RuleBasedCollator;
import com.ibm.icu.util.ULocale;
import java.util.concurrent.ConcurrentHashMap;

/**
 * Returns RbnfLenientScanners that use the old RuleBasedNumberFormat implementation behind
 * setLenientParseMode, which is based on Collator.
 *
 * <p>Scanners are cached per instance, keyed by the locale and the extra collation rules that
 * were used to build them.
 *
 * @internal
 * @deprecated This API is ICU internal only.
 */
@Deprecated
public class RbnfScannerProviderImpl implements RbnfLenientScannerProvider {
    private static final boolean DEBUG = ICUDebug.enabled("rbnf");

    /** Scanners built so far, keyed by the string produced by {@code cacheKey}. */
    private final ConcurrentHashMap<String, RbnfLenientScanner> cache;

    /**
     * Creates a provider with an empty scanner cache.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public RbnfScannerProviderImpl() {
        cache = new ConcurrentHashMap<>();
    }

    /**
     * Returns a collation-based scanner, building and caching it on first request.
     *
     * <p>Only primary differences are treated as significant. This means that case differences,
     * accent differences, alternate spellings of the same letter (e.g., ae and a-umlaut in German),
     * ignorable characters, etc. are ignored in matching the text. In many cases, numerals will be
     * accepted in place of words or phrases as well.
     *
     * <p>For example, all of the following will correctly parse as 255 in English in lenient-parse
     * mode: <br>
     * "two hundred fifty-five" <br>
     * "two hundred fifty five" <br>
     * "TWO HUNDRED FIFTY-FIVE" <br>
     * "twohundredfiftyfive" <br>
     * "2 hundred fifty-5"
     *
     * <p>The Collator used is determined by the {@code locale} argument. The {@code extras}
     * argument may supply additional collation rules that are appended to the end of the rules of
     * the default collator for the locale, enabling additional equivalences (such as adding more
     * ignorable characters or permitting spelled-out version of symbols; see the demo program for
     * examples).
     *
     * <p>It's important to emphasize that even strict parsing is relatively lenient: it will accept
     * some text that it won't produce as output. In English, for example, it will correctly parse
     * "two hundred zero" and "fifteen hundred".
     *
     * <p>If several threads request the same uncached scanner at once, each may build its own
     * instance, but all of them receive the instance that was stored in the cache first.
     *
     * @param locale the locale whose default collator is the basis of the scanner
     * @param extras additional collation rules to append, or {@code null} for none
     * @return the cached scanner for this locale and these extra rules
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    public RbnfLenientScanner get(ULocale locale, String extras) {
        String key = cacheKey(locale, extras);
        RbnfLenientScanner cached = cache.get(key);
        if (cached != null) {
            return cached;
        }
        // Avoid computeIfAbsent: createScanner() constructs a RuleBasedCollator (expensive)
        // and would hold the ConcurrentHashMap bin lock for the entire duration. Using
        // get-then-putIfAbsent allows parallel construction with first-write-wins.
        RbnfLenientScanner created = createScanner(locale, extras);
        RbnfLenientScanner winner = cache.putIfAbsent(key, created);
        return winner != null ? winner : created;
    }

    /**
     * Builds the cache key for a locale / extra-rules pair.
     *
     * <p>The locale id is length-prefixed so that the boundary between the id and the extra rules
     * is unambiguous, and the {@code "/" + extras} suffix is only present when {@code extras} is
     * non-null. A plain {@code locale + "/" + extras} concatenation would map a {@code null}
     * extras argument and the literal rule string {@code "null"} to the same entry, even though
     * createScanner() builds different collators for them.
     */
    private static String cacheKey(ULocale locale, String extras) {
        String localeId = locale.toString();
        StringBuilder key = new StringBuilder();
        key.append(localeId.length()).append(':').append(localeId);
        if (extras != null) {
            key.append('/').append(extras);
        }
        return key.toString();
    }

    /**
     * Builds a new, uncached scanner for the given locale and extra collation rules.
     *
     * <p>If building the collator throws, the exception is swallowed (and printed only when the
     * "rbnf" debug flag is enabled) and the returned scanner wraps a {@code null} collator. The
     * scanner methods dereference the collator without a null check, so such a scanner throws
     * {@link NullPointerException} when it is used.
     *
     * @param locale the locale whose default collator is the basis of the scanner
     * @param extras additional collation rules to append, or {@code null} for none
     * @return a new scanner; never {@code null}
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    protected RbnfLenientScanner createScanner(ULocale locale, String extras) {
        RuleBasedCollator collator = null;
        try {
            // create a default collator based on the locale,
            // then pull out that collator's rules, append any additional
            // rules specified in the description, and create a _new_
            // collator based on the combination of those rules
            collator = (RuleBasedCollator) Collator.getInstance(locale.toLocale());
            if (extras != null) {
                String rules = collator.getRules() + extras;
                collator = new RuleBasedCollator(rules);
            }
            collator.setDecomposition(Collator.CANONICAL_DECOMPOSITION);
        } catch (Exception e) {
            // If we get here, it means we have a malformed set of
            // collation rules, which hopefully won't happen
            if (DEBUG) { // debug hook
                e.printStackTrace();
                System.out.println("++++");
            }
            // Discard any partially configured collator (e.g. the locale default that was
            // obtained before the combined rules failed to parse).
            collator = null;
        }

        return new RbnfLenientScannerImpl(collator);
    }

    /** Scanner that compares text by the primary order of its collation elements. */
    private static class RbnfLenientScannerImpl implements RbnfLenientScanner {
        /** Collator supplying the collation elements; null if createScanner() failed. */
        private final RuleBasedCollator collator;

        private RbnfLenientScannerImpl(RuleBasedCollator rbc) {
            this.collator = rbc;
        }

        /**
         * Returns true if iterating the collation elements of {@code s} reaches the end without
         * encountering an element whose primary order is non-zero.
         */
        public boolean allIgnorable(String s) {
            CollationElementIterator iter = collator.getCollationElementIterator(s);

            int o = iter.next();
            while (o != CollationElementIterator.NULLORDER
                    && CollationElementIterator.primaryOrder(o) == 0) {
                o = iter.next();
            }
            return o == CollationElementIterator.NULLORDER;
        }

        /**
         * Searches {@code str}, beginning at index {@code startingAt}, for text that matches
         * {@code key} according to {@link #prefixLength}.
         *
         * @return a two-element array {@code {position, matchedLength}}, or {@code {-1, 0}} if
         *     no position yields a non-zero prefix length
         */
        public int[] findText(String str, String key, int startingAt) {
            // basically just isolate smaller and smaller substrings of
            // the target string (each running to the end of the string,
            // and with the first one running from startingAt to the end)
            // and then use prefixLength() to see if the search key is at
            // the beginning of each substring.  This is excruciatingly
            // slow, but it will locate the key and tell us how long the
            // matching text was.
            for (int p = startingAt; p < str.length(); ++p) {
                int keyLen = prefixLength(str.substring(p), key);
                if (keyLen != 0) {
                    return new int[] {p, keyLen};
                }
            }
            // if we make it to here, we didn't find it.  Return -1 for the
            // location.  The length should be ignored, but set it to 0,
            // which should be "safe"
            return new int[] {-1, 0};
        }

        // Alternative search with the same signature as findText that walks the collation
        // elements of both strings directly. It is not part of the scanner interface used by
        // this class and nothing in this file calls it.
        // NOTE(review): after keyIter.reset() below, oKey is not re-read, so the next
        // comparison still uses the element that just mismatched — confirm before ever
        // putting this method to use.
        @SuppressWarnings("unused")
        public int[] findText2(String str, String key, int startingAt) {

            CollationElementIterator strIter = collator.getCollationElementIterator(str);
            CollationElementIterator keyIter = collator.getCollationElementIterator(key);

            int keyStart = -1;

            strIter.setOffset(startingAt);

            int oStr = strIter.next();
            int oKey = keyIter.next();
            while (oKey != CollationElementIterator.NULLORDER) {
                // skip over ignorable elements in the target string
                while (oStr != CollationElementIterator.NULLORDER
                        && CollationElementIterator.primaryOrder(oStr) == 0) {
                    oStr = strIter.next();
                }

                // skip over ignorable elements in the key
                while (oKey != CollationElementIterator.NULLORDER
                        && CollationElementIterator.primaryOrder(oKey) == 0) {
                    oKey = keyIter.next();
                }

                // target exhausted before the key was: no match
                if (oStr == CollationElementIterator.NULLORDER) {
                    return new int[] {-1, 0};
                }

                // key exhausted: stop scanning
                if (oKey == CollationElementIterator.NULLORDER) {
                    break;
                }

                if (CollationElementIterator.primaryOrder(oStr)
                        == CollationElementIterator.primaryOrder(oKey)) {
                    keyStart = strIter.getOffset();
                    oStr = strIter.next();
                    oKey = keyIter.next();
                } else {
                    if (keyStart != -1) {
                        // a partial match failed: forget it and restart the key
                        keyStart = -1;
                        keyIter.reset();
                    } else {
                        oStr = strIter.next();
                    }
                }
            }

            return new int[] {keyStart, strIter.getOffset() - keyStart};
        }

        /**
         * Checks whether {@code str} begins with text matching {@code prefix}, comparing only
         * the primary order of collation elements and skipping elements whose primary order is
         * zero in both strings.
         *
         * @return 0 if the prefix does not match; otherwise a length derived from the target
         *     iterator's offset once the prefix has been consumed
         */
        public int prefixLength(String str, String prefix) {
            // Create two collation element iterators, one over the target string
            // and another over the prefix.
            //
            // Previous code was matching "fifty-" against " fifty" and leaving
            // the number " fifty-7" to parse as 43 (50 - 7).
            // Also it seems that if we consume the entire prefix, that's ok even
            // if we've consumed the entire string, so I switched the logic to
            // reflect this.

            CollationElementIterator strIter = collator.getCollationElementIterator(str);
            CollationElementIterator prefixIter = collator.getCollationElementIterator(prefix);

            // match collation elements between the strings
            int oStr = strIter.next();
            int oPrefix = prefixIter.next();

            while (oPrefix != CollationElementIterator.NULLORDER) {
                // skip over ignorable characters in the target string
                while (oStr != CollationElementIterator.NULLORDER
                        && CollationElementIterator.primaryOrder(oStr) == 0) {
                    oStr = strIter.next();
                }

                // skip over ignorable characters in the prefix
                while (oPrefix != CollationElementIterator.NULLORDER
                        && CollationElementIterator.primaryOrder(oPrefix) == 0) {
                    oPrefix = prefixIter.next();
                }

                // if skipping over ignorables brought us to the end of
                // the prefix, we DID match: drop out of the loop
                if (oPrefix == CollationElementIterator.NULLORDER) {
                    break;
                }

                // if skipping over ignorables brought us to the end
                // of the target string, we didn't match and return 0
                if (oStr == CollationElementIterator.NULLORDER) {
                    return 0;
                }

                // match collation elements from the two strings
                // (considering only primary differences).  If we
                // get a mismatch, dump out and return 0
                if (CollationElementIterator.primaryOrder(oStr)
                        != CollationElementIterator.primaryOrder(oPrefix)) {
                    return 0;
                }

                // otherwise, advance to the next character in each string
                // and loop (we drop out of the loop when we exhaust
                // collation elements in the prefix)

                oStr = strIter.next();
                oPrefix = prefixIter.next();
            }

            int result = strIter.getOffset();
            // If the target was not exhausted, oStr holds an element that was read past the
            // matched text, so back the offset up by one.
            // NOTE(review): presumably assumes that element covers exactly one char — confirm.
            if (oStr != CollationElementIterator.NULLORDER) {
                --result;
            }
            return result;
        }
    }
}