// LocaleValidityChecker.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2015-2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.impl.locale;

import com.ibm.icu.impl.ValidIdentifiers;
import com.ibm.icu.impl.ValidIdentifiers.Datasubtype;
import com.ibm.icu.impl.ValidIdentifiers.Datatype;
import com.ibm.icu.impl.locale.KeyTypeData.ValueType;
import com.ibm.icu.util.IllformedLocaleException;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;

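/**
 * Checks whether the language, script, region, variants and the -t-/-u- extensions of a {@link
 * ULocale} are valid for a configured set of {@link Datasubtype}s, recording the first failing
 * field and code in a {@link Where}.
 *
 * @author markdavis
 */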
public class LocaleValidityChecker {
    /**
     * Copy of the data subtypes supplied at construction; handed to {@code
     * ValidIdentifiers.isValid} whenever a code is checked.
     */
    private final Set<Datasubtype> datasubtypes;

    /**
     * Whether {@link #datasubtypes} contains {@code Datasubtype.deprecated}. When false, -t-/-u-
     * keys and types that {@code KeyTypeData.isDeprecated} reports are rejected.
     */
    private final boolean allowsDeprecated;

    /**
     * Out-parameter describing the first validation failure: which kind of field failed and the
     * offending code. Both fields are null when nothing has failed.
     */
    public static class Where {
        /** Kind of field that failed, or null if no failure has been recorded. */
        public Datatype fieldFailure;

        /** The code (or message) that failed, or null if no failure has been recorded. */
        public String codeFailure;

        /**
         * Records a failure (or clears it when both arguments are null).
         *
         * @param datatype the kind of field that failed
         * @param code the offending code
         * @return always false, so callers can write {@code return where.set(...)}
         */
        public boolean set(Datatype datatype, String code) {
            this.fieldFailure = datatype;
            this.codeFailure = code;
            return false;
        }

        /** Returns "OK" when no failure is recorded, otherwise "{field, code}". */
        @Override
        public String toString() {
            if (fieldFailure == null) {
                return "OK";
            }
            return "{" + fieldFailure + ", " + codeFailure + "}";
        }
    }

    public LocaleValidityChecker(Set<Datasubtype> datasubtypes) {
        this.datasubtypes = EnumSet.copyOf(datasubtypes);
        allowsDeprecated = datasubtypes.contains(Datasubtype.deprecated);
    }

    public LocaleValidityChecker(Datasubtype... datasubtypes) {
        this.datasubtypes = EnumSet.copyOf(Arrays.asList(datasubtypes));
        allowsDeprecated = this.datasubtypes.contains(Datasubtype.deprecated);
    }

    /**
     * Returns the data subtypes this checker accepts.
     *
     * @return a fresh copy of the configured subtypes; modifying it does not affect this checker
     */
    public Set<Datasubtype> getDatasubtypes() {
        final Set<Datasubtype> copy = EnumSet.copyOf(datasubtypes);
        return copy;
    }

    /**
     * Matches a single subtag separator, either '-' or '_'. Final because every subtag split in
     * this class depends on it and nothing should be able to swap it at runtime.
     */
    static final Pattern SEPARATOR = Pattern.compile("[-_]");

    /** Shape of a private-use (-x-) value: '-'-separated alphanumeric subtags of 2 to 8 chars. */
    @SuppressWarnings("unused")
    private static final Pattern VALID_X = Pattern.compile("[a-zA-Z0-9]{2,8}(-[a-zA-Z0-9]{2,8})*");

    /**
     * Checks the language, script, region, variants and extensions of a locale, stopping at the
     * first failure.
     *
     * @param locale the locale to check
     * @param where receives the failing field and code; it is cleared on entry and must not be
     *     null
     * @return true if every field and extension is valid; false otherwise, in which case {@code
     *     where} describes the first failure
     */
    public boolean isValid(ULocale locale, Where where) {
        where.set(null, null);
        final String language = locale.getLanguage();
        final String script = locale.getScript();
        final String region = locale.getCountry();
        final String variantString = locale.getVariant();
        final Set<Character> extensionKeys = locale.getExtensionKeys();

        if (!isValid(Datatype.language, language, where)) {
            // special case x
            if (language.equals("x")) {
                where.set(null, null); // for x, well-formed == valid
                return true;
            }
            return false;
        }
        if (!isValid(Datatype.script, script, where)) {
            return false;
        }
        if (!isValid(Datatype.region, region, where)) {
            return false;
        }
        if (!variantString.isEmpty()) {
            for (String variant : SEPARATOR.split(variantString)) {
                if (!isValid(Datatype.variant, variant, where)) {
                    return false;
                }
            }
        }
        for (Character c : extensionKeys) {
            try {
                // Throws IllegalArgumentException for a key with no matching Datatype constant.
                Datatype datatype = Datatype.valueOf(c + "");
                switch (datatype) {
                    case x:
                        // If it is syntactic (checked by ULocale) it is valid. Only this key is
                        // accepted: returning here would skip validation of every extension key
                        // that happens to be iterated after 'x'.
                        break;
                    case t:
                    case u:
                        if (!isValidU(locale, datatype, locale.getExtension(c), where)) {
                            return false;
                        }
                        break;
                    default:
                        break;
                }
            } catch (Exception e) {
                // Deliberately broad: besides the unknown-key case above, any exception raised
                // while validating the extension is reported as an illegal extension key.
                return where.set(Datatype.illegal, c + "");
            }
        }
        return true;
    }

    // TODO combine this with the KeyTypeData.SpecialType, and get it from the type, not the key
    /** Selects how the type subtags that follow an extension key are validated. */
    enum SpecialCase {
        normal,
        anything,
        reorder,
        codepoints,
        subdivision,
        rgKey;

        /**
         * Maps an extension key to its special handling.
         *
         * @param key the key, compared case-sensitively; must not be null
         * @return the special case for "kr", "vt", "sd", "rg" or "x0", otherwise {@link #normal}
         */
        static SpecialCase get(String key) {
            switch (key) {
                case "kr":
                    return reorder;
                case "vt":
                    return codepoints;
                case "sd":
                    return subdivision;
                case "rg":
                    return rgKey;
                case "x0":
                    return anything;
                default:
                    return normal;
            }
        }
    }

    /**
     * Validates the subtags of one -t- or -u- extension.
     *
     * <p>The value is split on '-' or '_'. A two-character subtag starts a new key; while the
     * leading part of a -t- value is still being collected, only two-character subtags whose
     * second character is at or below '9' count as keys, and the subtags collected before the
     * first key are validated as a locale. Every other subtag is a type of the current key and is
     * checked according to that key's {@code ValueType} and {@link SpecialCase}.
     *
     * @param locale the locale that carries the extension; used for the "sd" consistency check
     * @param datatype the extension being checked ({@code Datatype.t} selects the -t- handling)
     * @param extensionString the extension value, as returned by {@code ULocale.getExtension}
     * @param where receives the failing field and code
     * @return true if every key and type is valid; false otherwise, with {@code where} set
     */
    private boolean isValidU(
            ULocale locale, Datatype datatype, String extensionString, Where where) {
        String key = "";
        int typeCount = 0;
        ValueType valueType = null;
        SpecialCase specialCase = null;
        StringBuilder prefix = new StringBuilder();
        Set<String> seen = new HashSet<String>();

        // Non-null only while the leading (pre-key) part of a -t- value is being collected.
        StringBuilder tBuffer = datatype == Datatype.t ? new StringBuilder() : null;

        // TODO: is empty -u- valid?

        for (String subtag : SEPARATOR.split(extensionString)) {
            if (subtag.length() == 2 && (tBuffer == null || subtag.charAt(1) <= '9')) {
                // if we have accumulated a t buffer, check that first
                if (tBuffer != null) {
                    // Check t buffer. Empty after 't' is ok.
                    if (tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(), where)) {
                        return false;
                    }
                    tBuffer = null;
                }
                key = KeyTypeData.toBcpKey(subtag);
                if (key == null) {
                    return where.set(datatype, subtag);
                }
                if (!allowsDeprecated && KeyTypeData.isDeprecated(key)) {
                    return where.set(datatype, key);
                }
                valueType = KeyTypeData.getValueType(key);
                specialCase = SpecialCase.get(key);
                typeCount = 0;
            } else if (tBuffer != null) {
                if (tBuffer.length() != 0) {
                    tBuffer.append('-');
                }
                tBuffer.append(subtag);
            } else {
                if (valueType == null) {
                    // A type subtag appeared before any key, so there is no key to validate it
                    // against. Without this guard the switch below throws NullPointerException,
                    // which isValid(ULocale, Where) turns into exactly this {illegal, <key>}
                    // result through its catch-all; report it explicitly instead.
                    return where.set(Datatype.illegal, datatype.name());
                }
                ++typeCount;
                switch (valueType) {
                    case single:
                        // Only one type subtag is allowed for this key.
                        if (typeCount > 1) {
                            return where.set(datatype, key + "-" + subtag);
                        }
                        break;
                    case incremental:
                        // Each further subtag is validated as the '-'-joined sequence so far.
                        if (typeCount == 1) {
                            prefix.setLength(0);
                            prefix.append(subtag);
                        } else {
                            prefix.append('-').append(subtag);
                            subtag = prefix.toString();
                        }
                        break;
                    case multiple:
                        // New key: forget the values recorded for the previous one.
                        if (typeCount == 1) {
                            seen.clear();
                        }
                        break;
                    default:
                        break;
                }
                switch (specialCase) {
                    case anything:
                        continue;
                    case codepoints:
                        // Must parse as hexadecimal and not exceed U+10FFFF.
                        try {
                            if (Integer.parseInt(subtag, 16) > 0x10FFFF) {
                                return where.set(datatype, key + "-" + subtag);
                            }
                        } catch (NumberFormatException e) {
                            return where.set(datatype, key + "-" + subtag);
                        }
                        continue;
                    case reorder:
                        // "zzzz" and "others" count as the same value for duplicate detection.
                        boolean newlyAdded = seen.add(subtag.equals("zzzz") ? "others" : subtag);
                        if (!newlyAdded || !isScriptReorder(subtag)) {
                            return where.set(datatype, key + "-" + subtag);
                        }
                        continue;
                    case subdivision:
                        if (!isSubdivision(locale, subtag)) {
                            return where.set(datatype, key + "-" + subtag);
                        }
                        continue;
                    case rgKey:
                        // Expected shape: a region code followed by the literal "zzzz".
                        if (subtag.length() < 6 || !subtag.endsWith("zzzz")) {
                            return where.set(datatype, subtag);
                        }
                        if (!isValid(
                                Datatype.region, subtag.substring(0, subtag.length() - 4), where)) {
                            return false;
                        }
                        continue;
                    default:
                        break;
                }

                // en-u-sd-usca
                // en-US-u-sd-usca
                Output<Boolean> isKnownKey = new Output<Boolean>();
                Output<Boolean> isSpecialType = new Output<Boolean>();
                String type = KeyTypeData.toBcpType(key, subtag, isKnownKey, isSpecialType);
                if (type == null) {
                    return where.set(datatype, key + "-" + subtag);
                }
                if (!allowsDeprecated && KeyTypeData.isDeprecated(key, subtag)) {
                    return where.set(datatype, key + "-" + subtag);
                }
            }
        }
        // Check t buffer. Empty after 't' is ok.
        if (tBuffer != null && tBuffer.length() != 0 && !isValidLocale(tBuffer.toString(), where)) {
            return false;
        }
        return true;
    }

    /**
     * Checks an "sd" subdivision value: it must be a known subdivision and its region part must
     * match the locale's region.
     *
     * @param locale the locale carrying the value; when it has no region, the region of its
     *     likely-subtags expansion is used instead
     * @param subtag the subdivision value, i.e. a region code immediately followed by the
     *     subdivision code
     * @return true if the subdivision is known and consistent with the locale's region
     */
    private boolean isSubdivision(ULocale locale, String subtag) {
        if (subtag.length() < 3) {
            return false;
        }

        // The region part is 3 characters when the value starts with a character at or below
        // '9', otherwise 2; the remainder is the subdivision code.
        final int regionLength = subtag.charAt(0) <= '9' ? 3 : 2;
        final String region = subtag.substring(0, regionLength);
        final String subdivision = subtag.substring(regionLength);

        final boolean known =
                ValidIdentifiers.isValid(Datatype.subdivision, datasubtypes, region, subdivision)
                        != null;
        if (!known) {
            return false;
        }

        // The subdivision must belong to the region of the locale itself.
        String localeRegion = locale.getCountry();
        if (localeRegion.isEmpty()) {
            localeRegion = ULocale.addLikelySubtags(locale).getCountry();
        }
        return region.equalsIgnoreCase(localeRegion);
    }

    /** Reorder ("kr") values that isScriptReorder accepts without a script lookup. */
    static final Set<String> REORDERING_INCLUDE =
            new HashSet<String>(
                    Arrays.asList(
                            "space", "punct", "symbol", "currency", "digit", "others", "zzzz"));

    /** Script codes that isScriptReorder always rejects as reorder ("kr") values. */
    static final Set<String> REORDERING_EXCLUDE =
            new HashSet<String>(Arrays.asList("zinh", "zyyy"));

    /** Subtype filter used by isScriptReorder when looking up a script code. */
    static final Set<Datasubtype> REGULAR_ONLY = EnumSet.of(Datasubtype.regular);

    /**
     * Checks whether a subtag is an acceptable value for the reorder ("kr") key.
     *
     * <p>Accepted values are:
     *
     * <ul>
     *   <li>space, punct, symbol, currency, digit - core groups of characters below 'a';
     *   <li>any script code except Common (Zyyy) and Inherited (Zinh). Some pairs of scripts
     *       sort primary-equal and always reorder together; for example, Katakana characters
     *       are always reordered with Hiragana;
     *   <li>others - where all codes not explicitly mentioned should be ordered. The script code
     *       Zzzz (Unknown Script) is a synonym for others.
     * </ul>
     *
     * @param subtag the reorder value; compared after ASCII lower-casing
     * @return true if the value is one of the special groups or a regular script code that is
     *     not excluded
     */
    private boolean isScriptReorder(String subtag) {
        final String lowered = AsciiUtil.toLowerString(subtag);
        if (REORDERING_INCLUDE.contains(lowered)) {
            return true;
        }
        if (REORDERING_EXCLUDE.contains(lowered)) {
            return false;
        }
        final boolean regularScript =
                ValidIdentifiers.isValid(Datatype.script, REGULAR_ONLY, lowered) != null;
        return regularScript;
    }

    /**
     * Validates the locale embedded at the start of a -t- extension value.
     *
     * @param extensionString the '-'-joined subtags to parse as a language tag
     * @param where receives the failure: the result of validating the parsed locale, or {@code
     *     Datatype.t} with the offending subtag (or an exception message) when the tag cannot be
     *     parsed
     * @return true if the tag parses and the resulting locale is valid
     */
    private boolean isValidLocale(String extensionString, Where where) {
        try {
            ULocale locale = new ULocale.Builder().setLanguageTag(extensionString).build();
            return isValid(locale, where);
        } catch (IllformedLocaleException e) {
            final int startIndex = e.getErrorIndex();
            // The error index may be unknown (negative); guard it so that reporting the failure
            // cannot itself throw from inside this handler.
            if (startIndex < 0 || startIndex > extensionString.length()) {
                return where.set(Datatype.t, extensionString);
            }
            String[] list = SEPARATOR.split(extensionString.substring(startIndex));
            // split() drops trailing empty strings, so a remainder made only of separators
            // yields an empty array.
            return where.set(Datatype.t, list.length == 0 ? extensionString : list[0]);
        } catch (Exception e) {
            return where.set(Datatype.t, e.getMessage());
        }
    }

    /**
     * Checks a single code of the given kind against the configured data subtypes.
     *
     * @param datatype the kind of code being checked
     * @param code the code; an empty string means the field is absent and is always valid
     * @param where receives the failing field and code; may be null when the caller does not
     *     need the details
     * @return true if the code is empty, is the special "posix" variant, or is known to {@code
     *     ValidIdentifiers} for one of the configured subtypes
     */
    private boolean isValid(Datatype datatype, String code, Where where) {
        // An absent field has nothing to validate.
        if (code.isEmpty()) {
            return true;
        }

        // Note:
        // BCP 47 -u- locale extension '-u-va-posix' is mapped to variant 'posix' automatically.
        // For example, ULocale.forLanguageTag("en-u-va-posix").getVariant() returns "posix".
        // This is the only case in which a -u- locale extension is mapped to a subtag type
        // other than keyword.
        //
        // The locale validity data is based on IANA language subtag registry data and "posix"
        // is not a valid variant there, so this specific case is handled here. There are no
        // other exceptions.
        final boolean posixVariant =
                datatype == Datatype.variant && "posix".equalsIgnoreCase(code);
        if (posixVariant) {
            return true;
        }

        if (ValidIdentifiers.isValid(datatype, datasubtypes, code) != null) {
            return true;
        }
        if (where == null) {
            return false;
        }
        return where.set(datatype, code);
    }
}