StringUtils.java

// © 2024 and later: Unicode, Inc. and others.
// License & terms of use: https://www.unicode.org/copyright.html

package com.ibm.icu.message2;

import com.ibm.icu.text.Normalizer2;

class StringUtils {

    // abnf: simple-start-char = %x01-08 ; omit NULL (%x00), HTAB (%x09) and LF (%x0A)
    // abnf:   / %x0B-0C ; omit CR (%x0D)
    // abnf:   / %x0E-1F ; omit SP (%x20)
    // abnf:   / %x21-2D ; omit . (%x2E)
    // abnf:   / %x2F-5B ; omit \ (%x5C)
    // abnf:   / %x5D-7A ; omit { (%x7B)
    // abnf:   / %x7C ; omit } (%x7D)
    // abnf:   / %x7E-2FFF ; omit IDEOGRAPHIC SPACE (%x3000)
    // abnf:   / %x3001-10FFFF
    static boolean isSimpleStartChar(int cp) {
        return (cp >= 0x01 && cp <= 0x08)
                || (cp >= 0x0B && cp <= 0x0C)
                || (cp >= 0x0E && cp <= 0x1F)
                || (cp >= 0x21 && cp <= 0x2D)
                || (cp >= 0x2F && cp <= 0x5B)
                || (cp >= 0x5D && cp <= 0x7A)
                || cp == 0x7C
                || (cp >= 0x7E && cp <= 0x2FFF)
                || (cp >= 0x3001 && cp <= 0x10FFFF);
    }

    // abnf: text-char = %x01-5B ; omit NULL (%x00) and \ (%x5C)
    // abnf:         / %x5D-7A ; omit { (%x7B)
    // abnf:         / %x7C ; omit } (%x7D)
    // abnf:         / %x7E-10FFFF
    static boolean isTextChar(int cp) {
        return (cp >= 0x01 && cp <= 0x5B)
                || (cp >= 0x5D && cp <= 0x7A)
                || cp == 0x7C
                || (cp >= 0x7E && cp <= 0x10FFFF);
    }

    // abnf: backslash = %x5C ; U+005C REVERSE SOLIDUS "\"
    static boolean isBackslash(int cp) {
        return cp == '\\';
    }

    /*
     * ; Whitespace
     * abnf: ws = SP / HTAB / CR / LF / %x3000
     */
    static boolean isWhitespace(int cp) {
        return cp == ' ' || cp == '\t' || cp == '\r' || cp == '\n' || cp == 0x3000;
    }

    /*
     * ; Bidirectional marks and isolates
     * ; ALM / LRM / RLM / LRI, RLI, FSI & PDI
     * abnf: bidi = %x061C / %x200E / %x200F / %x2066-2069
     */
    static boolean isBidi(int cp) {
        return cp == 0x061C || cp == 0x200E || cp == 0x200F || (cp >= 0x2066 && cp <= 0x2069);
    }

    /*
     * abnf: name-start = ALPHA
     * abnf:         ; omit Cc: %x0-1F, Whitespace: SPACE, Ascii: «!"#$%&'()*»
     * abnf:         / %x2B ; «+» omit Ascii: «,-./0123456789:;<=>?@» «[\]^»
     * abnf:         / %x5F ; «_» omit Cc: %x7F-9F, Whitespace: %xA0, Ascii: «`» «{|}~»
     * abnf:         / %xA1-61B ; omit BidiControl: %x61C
     * abnf:         / %x61D-167F ; omit Whitespace: %x1680
     * abnf:         / %x1681-1FFF ; omit Whitespace: %x2000-200A
     * abnf:         / %x200B-200D ; omit BidiControl: %x200E-200F
     * abnf:         / %x2010-2027 ; omit Whitespace: %x2028-2029 %x202F, BidiControl: %x202A-202E
     * abnf:         / %x2030-205E ; omit Whitespace: %x205F
     * abnf:         / %x2060-2065 ; omit BidiControl: %x2066-2069
     * abnf:         / %x206A-2FFF ; omit Whitespace: %x3000
     * abnf:         / %x3001-D7FF ; omit Cs: %xD800-DFFF
     * abnf:         / %xE000-FDCF ; omit NChar: %xFDD0-FDEF
     * abnf:         / %xFDF0-FFFD ; omit NChar: %xFFFE-FFFF
     * abnf:         / %x10000-1FFFD ; omit NChar: %x1FFFE-1FFFF
     * abnf:         / %x20000-2FFFD ; omit NChar: %x2FFFE-2FFFF
     * abnf:         / %x30000-3FFFD ; omit NChar: %x3FFFE-3FFFF
     * abnf:         / %x40000-4FFFD ; omit NChar: %x4FFFE-4FFFF
     * abnf:         / %x50000-5FFFD ; omit NChar: %x5FFFE-5FFFF
     * abnf:         / %x60000-6FFFD ; omit NChar: %x6FFFE-6FFFF
     * abnf:         / %x70000-7FFFD ; omit NChar: %x7FFFE-7FFFF
     * abnf:         / %x80000-8FFFD ; omit NChar: %x8FFFE-8FFFF
     * abnf:         / %x90000-9FFFD ; omit NChar: %x9FFFE-9FFFF
     * abnf:         / %xA0000-AFFFD ; omit NChar: %xAFFFE-AFFFF
     * abnf:         / %xB0000-BFFFD ; omit NChar: %xBFFFE-BFFFF
     * abnf:         / %xC0000-CFFFD ; omit NChar: %xCFFFE-CFFFF
     * abnf:         / %xD0000-DFFFD ; omit NChar: %xDFFFE-DFFFF
     * abnf:         / %xE0000-EFFFD ; omit NChar: %xEFFFE-EFFFF
     * abnf:         / %xF0000-FFFFD ; omit NChar: %xFFFFE-FFFFF
     * abnf:         / %x100000-10FFFD ; omit NChar: %x10FFFE-10FFFF
     */
    static boolean isNameStart(int cp) {
        return isAlpha(cp)
                || cp == 0x2B
                || cp == 0x5F
                || (cp >= 0xA1 && cp <= 0x61B)
                || (cp >= 0x61D && cp <= 0x167F)
                || (cp >= 0x1681 && cp <= 0x1FFF)
                || (cp >= 0x200B && cp <= 0x200D)
                || (cp >= 0x2010 && cp <= 0x2027)
                || (cp >= 0x2030 && cp <= 0x205E)
                || (cp >= 0x2060 && cp <= 0x2065)
                || (cp >= 0x206A && cp <= 0x2FFF)
                || (cp >= 0x3001 && cp <= 0xD7FF)
                || (cp >= 0xE000 && cp <= 0xFDCF)
                || (cp >= 0xFDF0 && cp <= 0xFFFD)
                || (cp >= 0x10000 && cp <= 0x1FFFD)
                || (cp >= 0x20000 && cp <= 0x2FFFD)
                || (cp >= 0x30000 && cp <= 0x3FFFD)
                || (cp >= 0x40000 && cp <= 0x4FFFD)
                || (cp >= 0x50000 && cp <= 0x5FFFD)
                || (cp >= 0x60000 && cp <= 0x6FFFD)
                || (cp >= 0x70000 && cp <= 0x7FFFD)
                || (cp >= 0x80000 && cp <= 0x8FFFD)
                || (cp >= 0x90000 && cp <= 0x9FFFD)
                || (cp >= 0xA0000 && cp <= 0xAFFFD)
                || (cp >= 0xB0000 && cp <= 0xBFFFD)
                || (cp >= 0xC0000 && cp <= 0xCFFFD)
                || (cp >= 0xD0000 && cp <= 0xDFFFD)
                || (cp >= 0xE0000 && cp <= 0xEFFFD)
                || (cp >= 0xF0000 && cp <= 0xFFFFD)
                || (cp >= 0x100000 && cp <= 0x10FFFD);
    }

    /*
     * abnf: name-char = name-start / DIGIT / "-" / "."
     */
    static boolean isNameChar(int cp) {
        return isNameStart(cp) || isDigit(cp) || cp == '-' || cp == '.';
    }

    // abnf: quoted-char = %x01-5B ; omit NULL (%x00) and \ (%x5C)
    // abnf:         / %x5D-7B ; omit | (%x7C)
    // abnf:         / %x7D-10FFFF
    static boolean isQuotedChar(int cp) {
        return (cp >= 0x01 && cp <= 0x5B)
                || (cp >= 0x5D && cp <= 0x7B)
                || (cp >= 0x7D && cp <= 0x10FFFF);
    }

    // ALPHA is predefined in ABNF as plain ASCII, A-Z and a-z
    // See https://en.wikipedia.org/wiki/Augmented_Backus%E2%80%93Naur_form
    static boolean isAlpha(int cp) {
        return (cp >= 'a' && cp <= 'z') || (cp >= 'A' && cp <= 'Z');
    }

    // DIGIT is predefined in ABNF as plain ASCII, 0-9
    // See https://en.wikipedia.org/wiki/Augmented_Backus%E2%80%93Naur_form
    static boolean isDigit(int cp) {
        return cp >= '0' && cp <= '9';
    }

    // abnf: function = ":" identifier *(s option)
    static boolean isFunctionSigil(int cp) {
        return cp == ':';
    }

    private static final Normalizer2 NFC_NORMALIZER = Normalizer2.getNFCInstance();

    static String toNfc(CharSequence value) {
        return value == null ? null : NFC_NORMALIZER.normalize(value);
    }
}