UTS46.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2010-2014, International Business Machines
 * Corporation and others.  All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.impl;

import com.ibm.icu.impl.Normalizer2Impl.UTF16Plus;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UCharacterCategory;
import com.ibm.icu.lang.UCharacterDirection;
import com.ibm.icu.lang.UScript;
import com.ibm.icu.text.IDNA;
import com.ibm.icu.text.Normalizer2;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.util.ICUException;
import java.util.EnumSet;

// Note about tests for IDNA.Error.DOMAIN_NAME_TOO_LONG:
//
// The domain name length limit is 255 octets in an internal DNS representation
// where the last ("root") label is the empty label
// represented by length byte 0 alone.
// In a conventional string, this translates to 253 characters, or 254
// if there is a trailing dot for the root label.

/**
 * UTS #46 (IDNA2008) implementation.
 *
 * @author Markus Scherer
 * @since 2010jul09
 */
public final class UTS46 extends IDNA {
    /**
     * Creates a UTS #46 processor.
     *
     * @param options bit set of IDNA option flags; stored unchanged in {@link #options} and
     *     later tested against flags such as USE_STD3_RULES, CHECK_BIDI, CHECK_CONTEXTJ,
     *     CHECK_CONTEXTO, NONTRANSITIONAL_TO_ASCII and NONTRANSITIONAL_TO_UNICODE
     */
    public UTS46(int options) {
        this.options = options;
    }

    // Single-label ToASCII: runs process() with isLabel=true and toASCII=true.
    @Override
    public StringBuilder labelToASCII(CharSequence label, StringBuilder dest, Info info) {
        return process(label, true, true, dest, info);
    }

    // Single-label ToUnicode: runs process() with isLabel=true and toASCII=false.
    @Override
    public StringBuilder labelToUnicode(CharSequence label, StringBuilder dest, Info info) {
        return process(label, true, false, dest, info);
    }

    // Whole-domain-name ToASCII: runs process() with isLabel=false and toASCII=true,
    // then re-checks the overall length of the converted result.
    @Override
    public StringBuilder nameToASCII(CharSequence name, StringBuilder dest, Info info) {
        process(name, false, true, dest, info);

        // A result shorter than 254 characters always fits.
        final int resultLength = dest.length();
        if (resultLength < 254) {
            return dest;
        }
        // Nothing to add if the error has already been recorded.
        if (info.getErrors().contains(Error.DOMAIN_NAME_TOO_LONG)) {
            return dest;
        }
        // The length limit is only applied to an all-ASCII result.
        if (!isASCIIString(dest)) {
            return dest;
        }
        // Exactly 254 characters are acceptable only when the last one is
        // the trailing dot for the root label.
        final boolean fitsWithTrailingDot = resultLength == 254 && dest.charAt(253) == '.';
        if (!fitsWithTrailingDot) {
            addError(info, Error.DOMAIN_NAME_TOO_LONG);
        }
        return dest;
    }

    // Whole-domain-name ToUnicode: runs process() with isLabel=false and toASCII=false.
    @Override
    public StringBuilder nameToUnicode(CharSequence name, StringBuilder dest, Info info) {
        return process(name, false, false, dest, info);
    }

    // Shared normalizer instance loaded from the "uts46" data (uts46.nrm) in COMPOSE mode.
    // It is used for mapping/normalizing input, for re-normalizing after deviation
    // characters were mapped, and for validating decoded Punycode labels.
    private static final Normalizer2 uts46Norm2 =
            Normalizer2.getInstance(null, "uts46", Normalizer2.Mode.COMPOSE); // uts46.nrm
    // Option bit set passed to the constructor; never modified afterwards.
    final int options;

    // Severe errors which usually result in a U+FFFD replacement character in the result string.
    // When a label has any of these errors, processLabel() skips the contextual checks
    // (BiDi, CONTEXTJ, CONTEXTO) and the Punycode encoding for it, and process() does not
    // report Error.BIDI for the name.
    private static final EnumSet<Error> severeErrors =
            EnumSet.of(
                    Error.LEADING_COMBINING_MARK,
                    Error.DISALLOWED,
                    Error.PUNYCODE,
                    Error.LABEL_HAS_DOT,
                    Error.INVALID_ACE_LABEL);

    /**
     * Tells whether every UTF-16 code unit of the given text is in the ASCII range.
     *
     * @param dest the text to inspect
     * @return true if no code unit is above U+007F (also true for empty text)
     */
    private static boolean isASCIIString(CharSequence dest) {
        final int end = dest.length();
        int index = 0;
        while (index < end) {
            if (dest.charAt(index) >= 0x80) {
                return false;
            }
            ++index;
        }
        return true;
    }

    // UTS #46 data for ASCII characters.
    // The normalizer (using uts46.nrm) maps uppercase ASCII letters to lowercase
    // and passes through all other ASCII characters.
    // If USE_STD3_RULES is set, then non-LDH characters are disallowed
    // using this data.
    // The ASCII fastpath also uses this data.
    // Values: -1=disallowed  0==valid  1==mapped (lowercase)
    private static final byte asciiData[] = {
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        // 002D..002E; valid  #  HYPHEN-MINUS..FULL STOP
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        0,
        0,
        -1,
        // 0030..0039; valid  #  DIGIT ZERO..DIGIT NINE
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        -1,
        -1,
        -1,
        -1,
        -1,
        -1,
        // 0041..005A; mapped  #  LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
        -1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        1,
        -1,
        -1,
        -1,
        -1,
        -1,
        // 0061..007A; valid  #  LATIN SMALL LETTER A..LATIN SMALL LETTER Z
        -1,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        0,
        -1,
        -1,
        -1,
        -1,
        -1
    };

    /**
     * Shared implementation behind the four public conversion methods.
     *
     * <p>Clears {@code dest} and resets {@code info}, then runs an ASCII fastpath that copies
     * characters from {@code src} to {@code dest} (lowercasing A-Z) while recording simple
     * per-label errors. If the whole input is consumed by the fastpath, the method returns from
     * inside the loop. Otherwise the rest of {@code src}, from the position where the fastpath
     * stopped, is normalized and appended by processUnicode(), which also resumes label
     * processing at the start of the label the fastpath was in.
     *
     * @param src the input label or domain name
     * @param isLabel true when called from the single-label methods; a dot then ends the fastpath
     *     instead of separating labels
     * @param toASCII true for ToASCII processing (enables the length checks), false for ToUnicode
     * @param dest receives the result; must not be the same object as {@code src}
     * @param info receives the errors that were found
     * @return {@code dest}
     * @throws IllegalArgumentException if {@code dest == src}
     */
    private StringBuilder process(
            CharSequence src, boolean isLabel, boolean toASCII, StringBuilder dest, Info info) {
        // uts46Norm2.normalize() would do all of this error checking and setup,
        // but with the ASCII fastpath we do not always call it, and do not
        // call it first.
        if (dest == src) {
            throw new IllegalArgumentException();
        }
        // Arguments are fine, reset output values.
        dest.delete(0, 0x7fffffff);
        resetInfo(info);
        int srcLength = src.length();
        if (srcLength == 0) {
            addError(info, Error.EMPTY_LABEL);
            return dest;
        }
        // ASCII fastpath
        boolean disallowNonLDHDot = (options & USE_STD3_RULES) != 0;
        int labelStart = 0;
        int i;
        // The loop has no condition: it ends either by returning when the end of src is
        // reached, or by a break that hands the rest over to processUnicode().
        for (i = 0; ; ++i) {
            if (i == srcLength) {
                if (toASCII) {
                    if ((i - labelStart) > 63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    // There is a trailing dot if labelStart==i.
                    if (!isLabel && i >= 254 && (i > 254 || labelStart < i)) {
                        addError(info, Error.DOMAIN_NAME_TOO_LONG);
                    }
                }
                promoteAndResetLabelErrors(info);
                return dest;
            }
            char c = src.charAt(i);
            if (c > 0x7f) {
                // Non-ASCII: leave the fastpath; c has not been copied to dest.
                break;
            }
            int cData = asciiData[c];
            if (cData > 0) {
                dest.append((char) (c + 0x20)); // Lowercase an uppercase ASCII letter.
            } else if (cData < 0 && disallowNonLDHDot) {
                break; // Replacing with U+FFFD can be complicated for toASCII.
            } else {
                dest.append(c);
                if (c == '-') { // hyphen
                    if (i == (labelStart + 3) && src.charAt(i - 1) == '-') {
                        // "??--..." is Punycode or forbidden.
                        ++i; // '-' was copied to dest already
                        break;
                    }
                    if (i == labelStart) {
                        // label starts with "-"
                        addLabelError(info, Error.LEADING_HYPHEN);
                    }
                    if ((i + 1) == srcLength || src.charAt(i + 1) == '.') {
                        // label ends with "-"
                        addLabelError(info, Error.TRAILING_HYPHEN);
                    }
                } else if (c == '.') { // dot
                    if (isLabel) {
                        // Replacing with U+FFFD can be complicated for toASCII.
                        ++i; // '.' was copied to dest already
                        break;
                    }
                    if (i == labelStart) {
                        addLabelError(info, Error.EMPTY_LABEL);
                    }
                    if (toASCII && (i - labelStart) > 63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    promoteAndResetLabelErrors(info);
                    labelStart = i + 1;
                }
            }
        }
        // The fastpath stopped at index i; dest holds the mapped copy of src[0..i).
        promoteAndResetLabelErrors(info);
        processUnicode(src, labelStart, i, isLabel, toASCII, dest, info);
        // processUnicode() only checks labels from labelStart on, so the ASCII labels
        // before labelStart are checked separately here for a BiDi domain name.
        if (isBiDi(info)
                && !hasCertainErrors(info, severeErrors)
                && (!isOkBiDi(info) || (labelStart > 0 && !isASCIIOkBiDi(dest, labelStart)))) {
            addError(info, Error.BIDI);
        }
        return dest;
    }

    /**
     * Handles the part of the input that the ASCII fastpath in process() did not complete.
     *
     * <p>Normalizes {@code src} from {@code mappingStart} on with the UTS #46 normalizer into
     * {@code dest}, then walks {@code dest} from {@code labelStart}: it splits labels at dots
     * (unless {@code isLabel}), maps deviation characters when transitional processing is in
     * effect, replaces unpaired surrogates with U+FFFD, and calls processLabel() for each label.
     *
     * @param src the original input
     * @param labelStart index in {@code dest} of the start of the first label still to be
     *     processed
     * @param mappingStart index in {@code src} from which characters still have to be mapped;
     *     0 means nothing useful is in {@code dest} yet
     * @param isLabel true if a dot is not a label separator
     * @param toASCII true for ToASCII processing, false for ToUnicode
     * @param dest in/out result buffer
     * @param info receives the errors that were found
     * @return {@code dest}
     */
    private StringBuilder processUnicode(
            CharSequence src,
            int labelStart,
            int mappingStart,
            boolean isLabel,
            boolean toASCII,
            StringBuilder dest,
            Info info) {
        if (mappingStart == 0) {
            uts46Norm2.normalize(src, dest);
        } else {
            uts46Norm2.normalizeSecondAndAppend(dest, src.subSequence(mappingStart, src.length()));
        }
        // Deviation characters are mapped unless the nontransitional option for this
        // direction is set.
        boolean doMapDevChars =
                toASCII
                        ? (options & NONTRANSITIONAL_TO_ASCII) == 0
                        : (options & NONTRANSITIONAL_TO_UNICODE) == 0;
        int destLength = dest.length();
        int labelLimit = labelStart;
        while (labelLimit < destLength) {
            char c = dest.charAt(labelLimit);
            if (c == '.' && !isLabel) {
                int labelLength = labelLimit - labelStart;
                int newLength = processLabel(dest, labelStart, labelLength, toASCII, info);
                promoteAndResetLabelErrors(info);
                // processLabel() may have changed the label's length inside dest.
                destLength += newLength - labelLength;
                // Continue after the dot that follows the (possibly resized) label.
                labelLimit = labelStart += newLength + 1;
                continue;
            } else if (c < 0xdf) {
                // pass
            } else if (c <= 0x200d && (c == 0xdf || c == 0x3c2 || c >= 0x200c)) {
                // One of the deviation characters: U+00DF, U+03C2, U+200C, U+200D.
                setTransitionalDifferent(info);
                if (doMapDevChars) {
                    destLength = mapDevChars(dest, labelStart, labelLimit);
                    // All deviation characters have been mapped, no need to check for them again.
                    doMapDevChars = false;
                    // Do not increment labelLimit in case c was removed.
                    continue;
                }
            } else if (Character.isSurrogate(c)) {
                if (UTF16Plus.isSurrogateLead(c)
                        ? (labelLimit + 1) == destLength
                                || !Character.isLowSurrogate(dest.charAt(labelLimit + 1))
                        : labelLimit == labelStart
                                || !Character.isHighSurrogate(dest.charAt(labelLimit - 1))) {
                    // Map an unpaired surrogate to U+FFFD before normalization so that when
                    // that removes characters we do not turn two unpaired ones into a pair.
                    addLabelError(info, Error.DISALLOWED);
                    dest.setCharAt(labelLimit, '\ufffd');
                }
            }
            ++labelLimit;
        }
        // Permit an empty label at the end (0<labelStart==labelLimit==destLength is ok)
        // but not an empty label elsewhere nor a completely empty domain name.
        // processLabel() sets Error.EMPTY_LABEL when labelLength==0.
        if (0 == labelStart || labelStart < labelLimit) {
            processLabel(dest, labelStart, labelLimit - labelStart, toASCII, info);
            promoteAndResetLabelErrors(info);
        }
        return dest;
    }

    /**
     * Maps the deviation characters in {@code dest} from {@code mappingStart} to the end:
     * U+00DF becomes "ss", U+03C2 becomes U+03C3, and U+200C/U+200D are removed.
     * If anything was mapped, the text from {@code labelStart} to the end is normalized again.
     *
     * @param dest the buffer to modify in place
     * @param labelStart start of the range that is re-normalized after a change
     * @param mappingStart index of the first character to inspect
     * @return the new dest.length()
     */
    private int mapDevChars(StringBuilder dest, int labelStart, int mappingStart) {
        int currentLength = dest.length();
        boolean changed = false;
        int pos = mappingStart;
        while (pos < currentLength) {
            final char ch = dest.charAt(pos);
            if (ch == 0xdf) {
                // Sharp s: overwrite with 's' and insert a second 's' right after it.
                changed = true;
                dest.setCharAt(pos, 's');
                dest.insert(pos + 1, 's');
                pos += 2;
                ++currentLength;
            } else if (ch == 0x3c2) {
                // Final sigma becomes nonfinal sigma.
                changed = true;
                dest.setCharAt(pos, '\u03c3');
                ++pos;
            } else if (ch == 0x200c || ch == 0x200d) {
                // ZWNJ / ZWJ are dropped; pos now points at the next character.
                changed = true;
                dest.delete(pos, pos + 1);
                --currentLength;
            } else {
                ++pos;
            }
        }
        if (!changed) {
            return currentLength;
        }
        // Mapping deviation characters might have resulted in an un-NFC string.
        // Either the NFC or the UTS #46 normalizer would do; reusing the UTS #46
        // normalizer avoids loading a second .nrm data file.
        String normalized = uts46Norm2.normalize(dest.subSequence(labelStart, dest.length()));
        dest.replace(labelStart, 0x7fffffff, normalized);
        return dest.length();
    }

    /**
     * Writes a processed label back into {@code dest}, if it lives in a separate buffer.
     *
     * <p>When {@code label} is the very same object as {@code dest}, the label was already
     * modified in place and {@code labelLength} is its new length (which may differ from
     * {@code label.length()}); nothing is copied. Otherwise the old label range in
     * {@code dest} is removed and the whole {@code label} is inserted in its place; callers
     * pass {@code labelLength == label.length()} in that case.
     *
     * @param dest the result buffer
     * @param destLabelStart start of the old label in {@code dest}
     * @param destLabelLength length of the old label in {@code dest}
     * @param label the new label text, or {@code dest} itself
     * @param labelLength the new label length
     * @return {@code labelLength}
     */
    private static int replaceLabel(
            StringBuilder dest,
            int destLabelStart,
            int destLabelLength,
            CharSequence label,
            int labelLength) {
        if (label == dest) {
            return labelLength;
        }
        // Move characters inside the StringBuilder rather than going through
        // dest.replace(), which would need label.toString().
        final int destLabelLimit = destLabelStart + destLabelLength;
        dest.delete(destLabelStart, destLabelLimit);
        dest.insert(destLabelStart, label);
        return labelLength;
    }

    /**
     * Validates one label that sits in {@code dest} and, for ToASCII, converts it to Punycode
     * if it contains non-ASCII characters.
     *
     * <p>A label starting with "xn--" is decoded from Punycode first and the decoded text is
     * validated instead. Errors are recorded as label errors in {@code info}. Depending on
     * the outcome the label in {@code dest} is left alone, modified in place, replaced with the
     * decoded or encoded form, or marked as a bad ACE label via markBadACELabel().
     *
     * @param dest buffer containing the label; may be modified
     * @param labelStart start index of the label in {@code dest}
     * @param labelLength length of the label in {@code dest}
     * @param toASCII true for ToASCII processing, false for ToUnicode
     * @param info receives the label errors
     * @return the new label length in {@code dest}
     */
    // returns the new label length
    private int processLabel(
            StringBuilder dest, int labelStart, int labelLength, boolean toASCII, Info info) {
        StringBuilder fromPunycode;
        StringBuilder labelString;
        // Remember where the label is in dest; labelStart/labelLength are redirected to the
        // decoded buffer below if the label was Punycode.
        int destLabelStart = labelStart;
        int destLabelLength = labelLength;
        boolean wasPunycode;
        if (labelLength >= 4
                && dest.charAt(labelStart) == 'x'
                && dest.charAt(labelStart + 1) == 'n'
                && dest.charAt(labelStart + 2) == '-'
                && dest.charAt(labelStart + 3) == '-') {
            // Label starts with "xn--", try to un-Punycode it.
            // In IDNA2008, labels like "xn--" (decodes to an empty string) and
            // "xn--ASCII-" (decodes to just "ASCII") fail the round-trip validation from
            // comparing the ToUnicode input with the back-to-ToASCII output.
            // They are alternate encodings of the respective ASCII labels.
            // Ignore "xn---" here: It will fail Punycode.decode() which logically comes before
            // the round-trip verification.
            if (labelLength == 4
                    || (labelLength > 5 && dest.charAt(labelStart + labelLength - 1) == '-')) {
                addLabelError(info, Error.INVALID_ACE_LABEL);
                return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
            }
            wasPunycode = true;
            try {
                fromPunycode =
                        Punycode.decode(
                                dest.subSequence(labelStart + 4, labelStart + labelLength), null);
            } catch (StringPrepParseException e) {
                addLabelError(info, Error.PUNYCODE);
                return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
            }
            // Check for NFC, and for characters that are not
            // valid or deviation characters according to the normalizer.
            // If there is something wrong, then the string will change.
            // Note that the normalizer passes through non-LDH ASCII and deviation characters.
            // Deviation characters are ok in Punycode even in transitional processing.
            // In the code further below, if we find non-LDH ASCII and we have USE_STD3_RULES
            // then we will set Error.INVALID_ACE_LABEL there too.
            boolean isValid = uts46Norm2.isNormalized(fromPunycode);
            // Unicode 15.1 UTS #46:
            // Added an additional condition in 4.1 Validity Criteria to
            // disallow labels such as xn--xn---epa., which do not round-trip.
            // --> Validity Criteria new criterion 4:
            // If not CheckHyphens, the label must not begin with "xn--".
            if (!isValid || startsWithXNDashDash(fromPunycode)) {
                addLabelError(info, Error.INVALID_ACE_LABEL);
                return markBadACELabel(dest, labelStart, labelLength, toASCII, info);
            }
            // From here on, validate the decoded text, which starts at index 0 of its own buffer.
            labelString = fromPunycode;
            labelStart = 0;
            labelLength = fromPunycode.length();
        } else {
            wasPunycode = false;
            labelString = dest;
        }
        // Validity check
        if (labelLength == 0) {
            addLabelError(info, Error.EMPTY_LABEL);
            return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
        }
        // labelLength>0
        if (labelLength >= 4
                && labelString.charAt(labelStart + 2) == '-'
                && labelString.charAt(labelStart + 3) == '-') {
            // label starts with "??--"
            addLabelError(info, Error.HYPHEN_3_4);
        }
        if (labelString.charAt(labelStart) == '-') {
            // label starts with "-"
            addLabelError(info, Error.LEADING_HYPHEN);
        }
        if (labelString.charAt(labelStart + labelLength - 1) == '-') {
            // label ends with "-"
            addLabelError(info, Error.TRAILING_HYPHEN);
        }
        // If the label was not a Punycode label, then it was the result of
        // mapping, normalization and label segmentation.
        // If the label was in Punycode, then we mapped it again above
        // and checked its validity.
        // Now we handle the STD3 restriction to LDH characters (if set)
        // and we look for U+FFFD which indicates disallowed characters
        // in a non-Punycode label or U+FFFD itself in a Punycode label.
        // We also check for dots which can come from the input to a single-label function.
        // Modifying labelString in place is fine: it is either dest or the buffer
        // that was just decoded from Punycode.
        int i = labelStart;
        int limit = labelStart + labelLength;
        // Bitwise OR of all non-ASCII code units in the label; used below as a cheap
        // pre-filter for the contextual checks and to detect non-ASCII content.
        char oredChars = 0;
        // If we enforce STD3 rules, then ASCII characters other than LDH and dot are disallowed.
        boolean disallowNonLDHDot = (options & USE_STD3_RULES) != 0;
        do {
            char c = labelString.charAt(i);
            if (c <= 0x7f) {
                if (c == '.') {
                    addLabelError(info, Error.LABEL_HAS_DOT);
                    labelString.setCharAt(i, '\ufffd');
                } else if (disallowNonLDHDot && asciiData[c] < 0) {
                    addLabelError(info, Error.DISALLOWED);
                    labelString.setCharAt(i, '\ufffd');
                }
            } else {
                oredChars |= c;
                if (c == 0xfffd) {
                    addLabelError(info, Error.DISALLOWED);
                }
            }
            ++i;
        } while (i < limit);
        // Check for a leading combining mark after other validity checks
        // so that we don't report IDNA.Error.DISALLOWED for the U+FFFD from here.
        int c;
        // "Unsafe" is ok because unpaired surrogates were mapped to U+FFFD.
        c = labelString.codePointAt(labelStart);
        if ((U_GET_GC_MASK(c) & U_GC_M_MASK) != 0) {
            addLabelError(info, Error.LEADING_COMBINING_MARK);
            labelString.setCharAt(labelStart, '\ufffd');
            if (c > 0xffff) {
                // Remove c's trail surrogate.
                labelString.deleteCharAt(labelStart + 1);
                --labelLength;
                if (labelString == dest) {
                    --destLabelLength;
                }
            }
        }
        if (!hasCertainLabelErrors(info, severeErrors)) {
            // Do contextual checks only if we do not have U+FFFD from a severe error
            // because U+FFFD can make these checks fail.
            if ((options & CHECK_BIDI) != 0 && (!isBiDi(info) || isOkBiDi(info))) {
                checkLabelBiDi(labelString, labelStart, labelLength, info);
            }
            if ((options & CHECK_CONTEXTJ) != 0
                    && (oredChars & 0x200c) == 0x200c
                    && !isLabelOkContextJ(labelString, labelStart, labelLength)) {
                addLabelError(info, Error.CONTEXTJ);
            }
            if ((options & CHECK_CONTEXTO) != 0 && oredChars >= 0xb7) {
                checkLabelContextO(labelString, labelStart, labelLength, info);
            }
            if (toASCII) {
                if (wasPunycode) {
                    // Leave a Punycode label unchanged if it has no severe errors.
                    if (destLabelLength > 63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    return destLabelLength;
                } else if (oredChars >= 0x80) {
                    // Contains non-ASCII characters.
                    StringBuilder punycode;
                    try {
                        punycode =
                                Punycode.encode(
                                        labelString.subSequence(
                                                labelStart, labelStart + labelLength),
                                        null);
                    } catch (StringPrepParseException e) {
                        throw new ICUException(e); // unexpected
                    }
                    punycode.insert(0, "xn--");
                    if (punycode.length() > 63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                    return replaceLabel(
                            dest, destLabelStart, destLabelLength, punycode, punycode.length());
                } else {
                    // all-ASCII label
                    if (labelLength > 63) {
                        addLabelError(info, Error.LABEL_TOO_LONG);
                    }
                }
            }
        } else {
            // If a Punycode label has severe errors,
            // then leave it but make sure it does not look valid.
            if (wasPunycode) {
                addLabelError(info, Error.INVALID_ACE_LABEL);
                return markBadACELabel(dest, destLabelStart, destLabelLength, toASCII, info);
            }
        }
        return replaceLabel(dest, destLabelStart, destLabelLength, labelString, labelLength);
    }

    /**
     * Tells whether the text begins with the four characters "xn--" (lowercase only).
     *
     * @param s the text to inspect
     * @return true if {@code s} has at least four characters and the first four are "xn--"
     */
    private static boolean startsWithXNDashDash(CharSequence s) {
        final String prefix = "xn--";
        final int prefixLength = prefix.length();
        if (s.length() < prefixLength) {
            return false;
        }
        for (int k = 0; k < prefixLength; ++k) {
            if (s.charAt(k) != prefix.charAt(k)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Makes sure that a rejected "xn--" label in {@code dest} does not look like a valid one.
     *
     * <p>Scans the characters after the "xn--" prefix. A dot is reported as
     * Error.LABEL_HAS_DOT and replaced with U+FFFD; with USE_STD3_RULES set, other non-LDH
     * ASCII characters are replaced with U+FFFD as well. If the rest of the label turns out to
     * consist only of LDH characters, a U+FFFD is inserted right after the label. Otherwise,
     * for ToASCII, a label that is still all-ASCII and longer than 63 is reported as
     * Error.LABEL_TOO_LONG.
     *
     * @param dest buffer containing the label; modified in place
     * @param labelStart start index of the label in {@code dest}
     * @param labelLength length of the label
     * @param toASCII true for ToASCII processing, false for ToUnicode
     * @param info receives the label errors
     * @return the new label length ({@code labelLength + 1} if U+FFFD was inserted)
     */
    private int markBadACELabel(
            StringBuilder dest, int labelStart, int labelLength, boolean toASCII, Info info) {
        final boolean std3 = (options & USE_STD3_RULES) != 0;
        final int labelLimit = labelStart + labelLength;
        boolean allASCII = true;
        boolean allLDH = true;
        // Start after the initial "xn--".
        int pos = labelStart + 4;
        while (pos < labelLimit) {
            final char ch = dest.charAt(pos);
            if (ch > 0x7f) {
                allASCII = false;
                allLDH = false;
            } else if (ch == '.') {
                addLabelError(info, Error.LABEL_HAS_DOT);
                dest.setCharAt(pos, '\ufffd');
                allASCII = false;
                allLDH = false;
            } else if (asciiData[ch] < 0) {
                allLDH = false;
                if (std3) {
                    dest.setCharAt(pos, '\ufffd');
                    allASCII = false;
                }
            }
            ++pos;
        }
        if (allLDH) {
            // Nothing in the label gives it away as invalid yet: insert U+FFFD after it.
            dest.insert(labelLimit, '\ufffd');
            return labelLength + 1;
        }
        if (toASCII && allASCII && labelLength > 63) {
            addLabelError(info, Error.LABEL_TOO_LONG);
        }
        return labelLength;
    }

    // Bit sets of BiDi classes, built with U_MASK() from UCharacterDirection values.
    // checkLabelBiDi() tests them against U_MASK(UBiDiProps.INSTANCE.getClass(c)).
    // Each name lists the classes that are combined in the mask.
    private static final int L_MASK = U_MASK(UCharacterDirection.LEFT_TO_RIGHT);
    private static final int R_AL_MASK =
            U_MASK(UCharacterDirection.RIGHT_TO_LEFT)
                    | U_MASK(UCharacterDirection.RIGHT_TO_LEFT_ARABIC);
    // Classes allowed for the first character of a label (BiDi rule condition 1).
    private static final int L_R_AL_MASK = L_MASK | R_AL_MASK;

    // Classes whose presence makes a label an RTL label.
    private static final int R_AL_AN_MASK = R_AL_MASK | U_MASK(UCharacterDirection.ARABIC_NUMBER);

    private static final int EN_AN_MASK =
            U_MASK(UCharacterDirection.EUROPEAN_NUMBER) | U_MASK(UCharacterDirection.ARABIC_NUMBER);
    // Classes allowed for the last non-NSM character of an RTL label (condition 3).
    private static final int R_AL_EN_AN_MASK = R_AL_MASK | EN_AN_MASK;
    // Classes allowed for the last non-NSM character of an LTR label (condition 6).
    private static final int L_EN_MASK = L_MASK | U_MASK(UCharacterDirection.EUROPEAN_NUMBER);

    private static final int ES_CS_ET_ON_BN_NSM_MASK =
            U_MASK(UCharacterDirection.EUROPEAN_NUMBER_SEPARATOR)
                    | U_MASK(UCharacterDirection.COMMON_NUMBER_SEPARATOR)
                    | U_MASK(UCharacterDirection.EUROPEAN_NUMBER_TERMINATOR)
                    | U_MASK(UCharacterDirection.OTHER_NEUTRAL)
                    | U_MASK(UCharacterDirection.BOUNDARY_NEUTRAL)
                    | U_MASK(UCharacterDirection.DIR_NON_SPACING_MARK);
    // All classes allowed anywhere in an LTR label (condition 5).
    private static final int L_EN_ES_CS_ET_ON_BN_NSM_MASK = L_EN_MASK | ES_CS_ET_ON_BN_NSM_MASK;
    // All classes allowed anywhere in an RTL label (condition 2).
    private static final int R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK =
            R_AL_MASK | EN_AN_MASK | ES_CS_ET_ON_BN_NSM_MASK;

    // We scan the whole label and check both for whether it contains RTL characters
    // and whether it passes the BiDi Rule.
    // In a BiDi domain name, all labels must pass the BiDi Rule, but we might find
    // that a domain name is a BiDi domain name (has an RTL label) only after
    // processing several earlier labels.
    /**
     * Checks one label against the six conditions of the IDNA2008 BiDi rule.
     *
     * <p>Results are recorded in {@code info}: setNotOkBiDi() when a condition fails, and
     * setBiDi() when the label contains a character of class R, AL or AN.
     *
     * @param label the text containing the label
     * @param labelStart start index of the label; the label must not be empty because the
     *     first code point is read unconditionally
     * @param labelLength length of the label in chars
     * @param info receives the BiDi state
     */
    private void checkLabelBiDi(CharSequence label, int labelStart, int labelLength, Info info) {
        // IDNA2008 BiDi rule
        // Get the directionality of the first character.
        int c;
        int i = labelStart;
        c = Character.codePointAt(label, i);
        i += Character.charCount(c);
        int firstMask = U_MASK(UBiDiProps.INSTANCE.getClass(c));
        // 1. The first character must be a character with BIDI property L, R
        // or AL.  If it has the R or AL property, it is an RTL label; if it
        // has the L property, it is an LTR label.
        if ((firstMask & ~L_R_AL_MASK) != 0) {
            setNotOkBiDi(info);
        }
        // Get the directionality of the last non-NSM character.
        // labelLimit moves backward over trailing NSM characters; i stays just after the
        // first character, so [i, labelLimit) ends up as the range of intervening characters.
        int lastMask;
        int labelLimit = labelStart + labelLength;
        for (; ; ) {
            if (i >= labelLimit) {
                // Only the first character is left: it is also the last non-NSM one.
                lastMask = firstMask;
                break;
            }
            c = Character.codePointBefore(label, labelLimit);
            labelLimit -= Character.charCount(c);
            int dir = UBiDiProps.INSTANCE.getClass(c);
            if (dir != UCharacterDirection.DIR_NON_SPACING_MARK) {
                lastMask = U_MASK(dir);
                break;
            }
        }
        // 3. In an RTL label, the end of the label must be a character with
        // BIDI property R, AL, EN or AN, followed by zero or more
        // characters with BIDI property NSM.
        // 6. In an LTR label, the end of the label must be a character with
        // BIDI property L or EN, followed by zero or more characters with
        // BIDI property NSM.
        if ((firstMask & L_MASK) != 0
                ? (lastMask & ~L_EN_MASK) != 0
                : (lastMask & ~R_AL_EN_AN_MASK) != 0) {
            setNotOkBiDi(info);
        }
        // Add the directionalities of the intervening characters.
        int mask = firstMask | lastMask;
        while (i < labelLimit) {
            c = Character.codePointAt(label, i);
            i += Character.charCount(c);
            mask |= U_MASK(UBiDiProps.INSTANCE.getClass(c));
        }
        if ((firstMask & L_MASK) != 0) {
            // 5. In an LTR label, only characters with the BIDI properties L, EN,
            // ES, CS, ET, ON, BN and NSM are allowed.
            if ((mask & ~L_EN_ES_CS_ET_ON_BN_NSM_MASK) != 0) {
                setNotOkBiDi(info);
            }
        } else {
            // 2. In an RTL label, only characters with the BIDI properties R, AL,
            // AN, EN, ES, CS, ET, ON, BN and NSM are allowed.
            if ((mask & ~R_AL_AN_EN_ES_CS_ET_ON_BN_NSM_MASK) != 0) {
                setNotOkBiDi(info);
            }
            // 4. In an RTL label, if an EN is present, no AN may be present, and
            // vice versa.
            if ((mask & EN_AN_MASK) == EN_AN_MASK) {
                setNotOkBiDi(info);
            }
        }
        // An RTL label is a label that contains at least one character of type
        // R, AL or AN. [...]
        // A "BIDI domain name" is a domain name that contains at least one RTL
        // label. [...]
        // The following rule, consisting of six conditions, applies to labels
        // in BIDI domain names.
        if ((mask & R_AL_AN_MASK) != 0) {
            setBiDi(info);
        }
    }

    // Special code for the ASCII prefix of a BiDi domain name.
    // The ASCII prefix is all-LTR.

    // IDNA2008 BiDi rule, parts relevant to ASCII labels:
    // 1. The first character must be a character with BIDI property L [...]
    // 5. In an LTR label, only characters with the BIDI properties L, EN,
    // ES, CS, ET, ON, BN and NSM are allowed.
    // 6. In an LTR label, the end of the label must be a character with
    // BIDI property L or EN [...]

    /**
     * Checks the already-mapped ASCII labels at the start of a domain name against the parts
     * of the BiDi rule that can fail for ASCII text.
     *
     * <p>UTF-16 version, called for the mapped ASCII prefix, which cannot contain uppercase
     * A-Z. {@code s.charAt(length - 1)} must be the trailing dot of the last prefix label.
     *
     * @param s the text whose first {@code length} chars are inspected
     * @param length number of chars to inspect
     * @return false if a label starts with something other than a-z, ends with something other
     *     than a-z or 0-9, or contains one of the characters U+0009..U+000D or U+001C..U+0020
     *     after its first character; true otherwise
     */
    private static boolean isASCIIOkBiDi(CharSequence s, int length) {
        int currentLabelStart = 0;
        for (int pos = 0; pos < length; ++pos) {
            final char ch = s.charAt(pos);
            if (ch == '.') {
                // End of a label: look at its last character, unless the label is empty.
                if (pos > currentLabelStart) {
                    final char last = s.charAt(pos - 1);
                    final boolean isLowerLetter = 'a' <= last && last <= 'z';
                    final boolean isDigit = '0' <= last && last <= '9';
                    if (!isLowerLetter && !isDigit) {
                        // Last character in the label is not an L or EN.
                        return false;
                    }
                }
                currentLabelStart = pos + 1;
                continue;
            }
            if (pos == currentLabelStart) {
                if (ch < 'a' || ch > 'z') {
                    // First character in the label is not an L.
                    return false;
                }
                continue;
            }
            if (ch <= 0x20 && (ch >= 0x1c || (9 <= ch && ch <= 0xd))) {
                // Intermediate character in the label is a B, S or WS.
                return false;
            }
        }
        return true;
    }

    /**
     * Checks the CONTEXTJ rules for every ZERO WIDTH NON-JOINER (U+200C) and ZERO WIDTH JOINER
     * (U+200D) found in one label.
     *
     * @param label the text that contains the label
     * @param labelStart index of the first char of the label within {@code label}
     * @param labelLength number of chars in the label
     * @return true if every U+200C and U+200D in the label satisfies its contextual rule (or the
     *     label contains neither), false as soon as one of them does not
     */
    private boolean isLabelOkContextJ(CharSequence label, int labelStart, int labelLength) {
        // [IDNA2008-Tables]
        // 200C..200D  ; CONTEXTJ    # ZERO WIDTH NON-JOINER..ZERO WIDTH JOINER
        int labelLimit = labelStart + labelLength;
        for (int i = labelStart; i < labelLimit; ++i) {
            char unit = label.charAt(i);
            if (unit == 0x200c) {
                // Appendix A.1. ZERO WIDTH NON-JOINER
                // Rule Set:
                //  False;
                //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
                //  If RegExpMatch((Joining_Type:{L,D})(Joining_Type:T)*\u200C
                //     (Joining_Type:T)*(Joining_Type:{R,D})) Then True;
                if (i == labelStart) {
                    // Nothing precedes the ZWNJ inside this label.
                    return false;
                }
                int c;
                int j = i;
                c = Character.codePointBefore(label, j);
                j -= Character.charCount(c);
                if (uts46Norm2.getCombiningClass(c) == 9) {
                    // Preceded by a Virama (ccc=9): rule satisfied, look at the next char.
                    continue;
                }
                // check precontext (Joining_Type:{L,D})(Joining_Type:T)*
                for (; ; ) {
                    /* UJoiningType */ int type = UBiDiProps.INSTANCE.getJoiningType(c);
                    if (type == UCharacter.JoiningType.TRANSPARENT) {
                        // Stop at the start of this label, not at the start of the whole text,
                        // so that the scan never reads code points outside the label.
                        if (j == labelStart) {
                            return false;
                        }
                        c = Character.codePointBefore(label, j);
                        j -= Character.charCount(c);
                    } else if (type == UCharacter.JoiningType.LEFT_JOINING
                            || type == UCharacter.JoiningType.DUAL_JOINING) {
                        break; // precontext fulfilled
                    } else {
                        return false;
                    }
                }
                // check postcontext (Joining_Type:T)*(Joining_Type:{R,D})
                for (j = i + 1; ; ) {
                    if (j == labelLimit) {
                        // Ran off the end of the label without finding an R or D character.
                        return false;
                    }
                    c = Character.codePointAt(label, j);
                    j += Character.charCount(c);
                    /* UJoiningType */ int type = UBiDiProps.INSTANCE.getJoiningType(c);
                    if (type == UCharacter.JoiningType.TRANSPARENT) {
                        // just skip this character
                    } else if (type == UCharacter.JoiningType.RIGHT_JOINING
                            || type == UCharacter.JoiningType.DUAL_JOINING) {
                        break; // postcontext fulfilled
                    } else {
                        return false;
                    }
                }
            } else if (unit == 0x200d) {
                // Appendix A.2. ZERO WIDTH JOINER (U+200D)
                // Rule Set:
                //  False;
                //  If Canonical_Combining_Class(Before(cp)) .eq.  Virama Then True;
                if (i == labelStart) {
                    return false;
                }
                int c = Character.codePointBefore(label, i);
                if (uts46Norm2.getCombiningClass(c) != 9) {
                    return false;
                }
            }
        }
        return true;
    }

    /**
     * Applies the IDNA2008 CONTEXTO rules (Appendix A.3 through A.9) to one label and reports each
     * violation through {@code addLabelError}.
     *
     * @param label the text that contains the label
     * @param labelStart index of the first char of the label within {@code label}
     * @param labelLength number of chars in the label
     * @param info receives {@code CONTEXTO_PUNCTUATION} and {@code CONTEXTO_DIGITS} label errors
     */
    private void checkLabelContextO(
            CharSequence label, int labelStart, int labelLength, Info info) {
        final int limit = labelStart + labelLength; // exclusive end of the label
        // Which Arabic digit block was seen most recently: -1 for 066x, +1 for 06Fx, 0 for none.
        int digitKind = 0;
        for (int pos = labelStart; pos < limit; pos++) {
            final int unit = label.charAt(pos);
            if (unit < 0xb7) {
                // ASCII fastpath: nothing below U+00B7 has a CONTEXTO rule.
                continue;
            }
            if (unit > 0x6f9) {
                if (unit == 0x30fb) {
                    // Appendix A.7. KATAKANA MIDDLE DOT (U+30FB)
                    // Rule Set:
                    //  False;
                    //  For All Characters:
                    //    If Script(cp) .in. {Hiragana, Katakana, Han} Then True;
                    //  End For;
                    boolean sawKanaOrHan = false;
                    int k = labelStart;
                    while (!sawKanaOrHan && k < limit) {
                        final int cp = Character.codePointAt(label, k);
                        final int script = UScript.getScript(cp);
                        sawKanaOrHan =
                                script == UScript.HIRAGANA
                                        || script == UScript.KATAKANA
                                        || script == UScript.HAN;
                        k += Character.charCount(cp);
                    }
                    if (!sawKanaOrHan) {
                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    }
                }
                continue;
            }
            // Here 0xb7 <= unit <= 0x6f9.
            switch (unit) {
                case 0xb7: {
                    // Appendix A.3. MIDDLE DOT (U+00B7)
                    // Rule Set:
                    //  False;
                    //  If Before(cp) .eq.  U+006C And
                    //     After(cp) .eq.  U+006C Then True;
                    final boolean betweenTwoLs =
                            pos > labelStart
                                    && label.charAt(pos - 1) == 'l'
                                    && pos + 1 < limit
                                    && label.charAt(pos + 1) == 'l';
                    if (!betweenTwoLs) {
                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    }
                    break;
                }
                case 0x375: {
                    // Appendix A.4. GREEK LOWER NUMERAL SIGN (KERAIA) (U+0375)
                    // Rule Set:
                    //  False;
                    //  If Script(After(cp)) .eq.  Greek Then True;
                    final boolean greekFollows =
                            pos + 1 < limit
                                    && UScript.getScript(Character.codePointAt(label, pos + 1))
                                            == UScript.GREEK;
                    if (!greekFollows) {
                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    }
                    break;
                }
                case 0x5f3:
                case 0x5f4: {
                    // Appendix A.5. HEBREW PUNCTUATION GERESH (U+05F3)
                    // Appendix A.6. HEBREW PUNCTUATION GERSHAYIM (U+05F4)
                    // Rule Set (same for both):
                    //  False;
                    //  If Script(Before(cp)) .eq.  Hebrew Then True;
                    final boolean hebrewPrecedes =
                            pos > labelStart
                                    && UScript.getScript(Character.codePointBefore(label, pos))
                                            == UScript.HEBREW;
                    if (!hebrewPrecedes) {
                        addLabelError(info, Error.CONTEXTO_PUNCTUATION);
                    }
                    break;
                }
                default:
                    // Appendix A.8. ARABIC-INDIC DIGITS (0660..0669)
                    // Rule Set:
                    //  True;
                    //  For All Characters:
                    //    If cp .in. 06F0..06F9 Then False;
                    //  End For;
                    //
                    // Appendix A.9. EXTENDED ARABIC-INDIC DIGITS (06F0..06F9)
                    // Rule Set:
                    //  True;
                    //  For All Characters:
                    //    If cp .in. 0660..0669 Then False;
                    //  End For;
                    if (0x660 <= unit && unit <= 0x669) {
                        if (digitKind > 0) {
                            addLabelError(info, Error.CONTEXTO_DIGITS);
                        }
                        digitKind = -1;
                    } else if (0x6f0 <= unit) {
                        if (digitKind < 0) {
                            addLabelError(info, Error.CONTEXTO_DIGITS);
                        }
                        digitKind = 1;
                    }
                    break;
            }
        }
    }

    // TODO: make public(?) -- in C, these are public in uchar.h
    /**
     * Returns the result of shifting 1 left by {@code x}, i.e. an int with a single bit set.
     *
     * @param x the shift distance
     * @return {@code 1 << x}
     */
    private static int U_MASK(int x) {
        final int singleBit = 1 << x;
        return singleBit;
    }

    /**
     * Returns a single-bit mask whose bit position is the value of
     * {@code UCharacter.getType(c)}.
     *
     * @param c the code point to look up
     * @return 1 shifted left by the type value of {@code c}
     */
    private static int U_GET_GC_MASK(int c) {
        final int type = UCharacter.getType(c);
        return 1 << type;
    }

    /**
     * Bit mask combining the {@code NON_SPACING_MARK}, {@code ENCLOSING_MARK} and
     * {@code COMBINING_SPACING_MARK} categories. Declared {@code final} because it is a constant
     * and must not be reassigned.
     */
    private static final int U_GC_M_MASK =
            U_MASK(UCharacterCategory.NON_SPACING_MARK)
                    | U_MASK(UCharacterCategory.ENCLOSING_MARK)
                    | U_MASK(UCharacterCategory.COMBINING_SPACING_MARK);
}