Punycode.java
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
*******************************************************************************
* Copyright (C) 2003-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package com.ibm.icu.impl;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.StringPrepParseException;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.util.ICUInputTooLongException;
/**
* Ported code from ICU punycode.c
*
* @author ram
*/
public final class Punycode {
/* Punycode parameters for Bootstring */
private static final int BASE = 36;
private static final int TMIN = 1;
private static final int TMAX = 26;
private static final int SKEW = 38;
private static final int DAMP = 700;
private static final int INITIAL_BIAS = 72;
private static final int INITIAL_N = 0x80;
/* "Basic" Unicode/ASCII code points */
private static final char HYPHEN = 0x2d;
private static final char DELIMITER = HYPHEN;
private static final int ZERO = 0x30;
// private static final int NINE = 0x39;
private static final int SMALL_A = 0x61;
private static final int SMALL_Z = 0x7a;
private static final int CAPITAL_A = 0x41;
private static final int CAPITAL_Z = 0x5a;
private static int adaptBias(int delta, int length, boolean firstTime) {
if (firstTime) {
delta /= DAMP;
} else {
delta /= 2;
}
delta += delta / length;
int count = 0;
for (; delta > ((BASE - TMIN) * TMAX) / 2; count += BASE) {
delta /= (BASE - TMIN);
}
return count + (((BASE - TMIN + 1) * delta) / (delta + SKEW));
}
/**
* @return the numeric value of a basic code point (for use in representing integers) in the
* range 0 to BASE-1, or a negative value if cp is invalid.
*/
private static final int decodeDigit(int cp) {
if (cp <= 'Z') {
if (cp <= '9') {
if (cp < '0') {
return -1;
} else {
return cp - '0' + 26; // 0..9 -> 26..35
}
} else {
return cp - 'A'; // A-Z -> 0..25
}
} else if (cp <= 'z') {
return cp - 'a'; // a..z -> 0..25
} else {
return -1;
}
}
/// CLOVER:OFF
private static char asciiCaseMap(char b, boolean uppercase) {
if (uppercase) {
if (SMALL_A <= b && b <= SMALL_Z) {
b -= (SMALL_A - CAPITAL_A);
}
} else {
if (CAPITAL_A <= b && b <= CAPITAL_Z) {
b += (SMALL_A - CAPITAL_A);
}
}
return b;
}
/// CLOVER:ON
/**
* digitToBasic() returns the basic code point whose value (when used for representing integers)
* is d, which must be in the range 0 to BASE-1. The lowercase form is used unless the uppercase
* flag is nonzero, in which case the uppercase form is used.
*/
private static char digitToBasic(int digit, boolean uppercase) {
/* 0..25 map to ASCII a..z or A..Z */
/* 26..35 map to ASCII 0..9 */
if (digit < 26) {
if (uppercase) {
return (char) (CAPITAL_A + digit);
} else {
return (char) (SMALL_A + digit);
}
} else {
return (char) ((ZERO - 26) + digit);
}
}
// ICU-13727: Limit input length for n^2 algorithm
// where well-formed strings are at most 59 characters long.
private static final int ENCODE_MAX_CODE_UNITS = 1000;
private static final int DECODE_MAX_CHARS = 2000;
/**
* Converts Unicode to Punycode. The input string must not contain single, unpaired surrogates.
* The output will be represented as an array of ASCII code points.
*
* @param src The source of the String Buffer passed.
* @param caseFlags The boolean array of case flags.
* @return An array of ASCII code points.
*/
public static StringBuilder encode(CharSequence src, boolean[] caseFlags)
throws StringPrepParseException {
int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount;
char c, c2;
int srcLength = src.length();
if (srcLength > ENCODE_MAX_CODE_UNITS) {
throw new ICUInputTooLongException(
"input too long: " + srcLength + " UTF-16 code units");
}
int[] cpBuffer = new int[srcLength];
StringBuilder dest = new StringBuilder(srcLength);
/*
* Handle the basic code points and
* convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
*/
srcCPCount = 0;
for (j = 0; j < srcLength; ++j) {
c = src.charAt(j);
if (isBasic(c)) {
cpBuffer[srcCPCount++] = 0;
dest.append(caseFlags != null ? asciiCaseMap(c, caseFlags[j]) : c);
} else {
n = ((caseFlags != null && caseFlags[j]) ? 1 : 0) << 31L;
if (!UTF16.isSurrogate(c)) {
n |= c;
} else if (UTF16.isLeadSurrogate(c)
&& (j + 1) < srcLength
&& UTF16.isTrailSurrogate(c2 = src.charAt(j + 1))) {
++j;
n |= UCharacter.getCodePoint(c, c2);
} else {
/* error: unmatched surrogate */
throw new StringPrepParseException(
"Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
cpBuffer[srcCPCount++] = n;
}
}
/* Finish the basic string - if it is not empty - with a delimiter. */
basicLength = dest.length();
if (basicLength > 0) {
dest.append(DELIMITER);
}
/*
* handledCPCount is the number of code points that have been handled
* basicLength is the number of basic code points
* destLength is the number of chars that have been output
*/
/* Initialize the state: */
n = INITIAL_N;
delta = 0;
bias = INITIAL_BIAS;
/* Main encoding loop: */
for (handledCPCount = basicLength; handledCPCount < srcCPCount; /* no op */ ) {
/*
* All non-basic code points < n have been handled already.
* Find the next larger one:
*/
for (m = 0x7fffffff, j = 0; j < srcCPCount; ++j) {
q = cpBuffer[j] & 0x7fffffff; /* remove case flag from the sign bit */
if (n <= q && q < m) {
m = q;
}
}
/*
* Increase delta enough to advance the decoder's
* <n,i> state to <m,0>, but guard against overflow:
*/
if (m - n > (0x7fffffff - handledCPCount - delta) / (handledCPCount + 1)) {
throw new IllegalStateException("Internal program error");
}
delta += (m - n) * (handledCPCount + 1);
n = m;
/* Encode a sequence of same code points n */
for (j = 0; j < srcCPCount; ++j) {
q = cpBuffer[j] & 0x7fffffff; /* remove case flag from the sign bit */
if (q < n) {
++delta;
} else if (q == n) {
/* Represent delta as a generalized variable-length integer: */
for (q = delta, k = BASE; /* no condition */ ; k += BASE) {
/**
* RAM: comment out the old code for conformance with
* draft-ietf-idn-punycode-03.txt
*
* <p>t=k-bias; if(t<TMIN) { t=TMIN; } else if(t>TMAX) { t=TMAX; }
*/
t = k - bias;
if (t < TMIN) {
t = TMIN;
} else if (k >= (bias + TMAX)) {
t = TMAX;
}
if (q < t) {
break;
}
dest.append(digitToBasic(t + (q - t) % (BASE - t), false));
q = (q - t) / (BASE - t);
}
dest.append(digitToBasic(q, (cpBuffer[j] < 0)));
bias = adaptBias(delta, handledCPCount + 1, (handledCPCount == basicLength));
delta = 0;
++handledCPCount;
}
}
++delta;
++n;
}
return dest;
}
private static boolean isBasic(int ch) {
return (ch < INITIAL_N);
}
/// CLOVER:OFF
private static boolean isBasicUpperCase(int ch) {
return (CAPITAL_A <= ch && ch >= CAPITAL_Z);
}
/// CLOVER:ON
private static boolean isSurrogate(int ch) {
return (((ch) & 0xfffff800) == 0xd800);
}
/**
* Converts Punycode to Unicode. The Unicode string will be at most as long as the Punycode
* string.
*
* @param src The source of the string buffer being passed.
* @param caseFlags The array of boolean case flags.
* @return StringBuilder string.
*/
public static StringBuilder decode(CharSequence src, boolean[] caseFlags)
throws StringPrepParseException {
int srcLength = src.length();
if (srcLength > DECODE_MAX_CHARS) {
throw new ICUInputTooLongException("input too long: " + srcLength + " characters");
}
StringBuilder dest = new StringBuilder(src.length());
int n,
i,
bias,
basicLength,
j,
in,
oldi,
w,
k,
digit,
t,
destCPCount,
firstSupplementaryIndex,
cpLength;
char b;
/*
* Handle the basic code points:
* Let basicLength be the number of input code points
* before the last delimiter, or 0 if there is none,
* then copy the first basicLength code points to the output.
*
* The following loop iterates backward.
*/
for (j = srcLength; j > 0; ) {
if (src.charAt(--j) == DELIMITER) {
break;
}
}
basicLength = destCPCount = j;
for (j = 0; j < basicLength; ++j) {
b = src.charAt(j);
if (!isBasic(b)) {
throw new StringPrepParseException(
"Illegal char found", StringPrepParseException.INVALID_CHAR_FOUND);
}
dest.append(b);
if (caseFlags != null && j < caseFlags.length) {
caseFlags[j] = isBasicUpperCase(b);
}
}
/* Initialize the state: */
n = INITIAL_N;
i = 0;
bias = INITIAL_BIAS;
firstSupplementaryIndex = 1000000000;
/*
* Main decoding loop:
* Start just after the last delimiter if any
* basic code points were copied; start at the beginning otherwise.
*/
for (in = basicLength > 0 ? basicLength + 1 : 0; in < srcLength; /* no op */ ) {
/*
* in is the index of the next character to be consumed, and
* destCPCount is the number of code points in the output array.
*
* Decode a generalized variable-length integer into delta,
* which gets added to i. The overflow checking is easier
* if we increase i as we go, then subtract off its starting
* value at the end to obtain delta.
*/
for (oldi = i, w = 1, k = BASE; /* no condition */ ; k += BASE) {
if (in >= srcLength) {
throw new StringPrepParseException(
"Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
digit = decodeDigit(src.charAt(in++));
if (digit < 0) {
throw new StringPrepParseException(
"Invalid char found", StringPrepParseException.INVALID_CHAR_FOUND);
}
if (digit > (0x7fffffff - i) / w) {
/* integer overflow */
throw new StringPrepParseException(
"Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
i += digit * w;
t = k - bias;
if (t < TMIN) {
t = TMIN;
} else if (k >= (bias + TMAX)) {
t = TMAX;
}
if (digit < t) {
break;
}
if (w > 0x7fffffff / (BASE - t)) {
/* integer overflow */
throw new StringPrepParseException(
"Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
w *= BASE - t;
}
/*
* Modification from sample code:
* Increments destCPCount here,
* where needed instead of in for() loop tail.
*/
++destCPCount;
bias = adaptBias(i - oldi, destCPCount, (oldi == 0));
/*
* i was supposed to wrap around from (incremented) destCPCount to 0,
* incrementing n each time, so we'll fix that now:
*/
if (i / destCPCount > (0x7fffffff - n)) {
/* integer overflow */
throw new StringPrepParseException(
"Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
n += i / destCPCount;
i %= destCPCount;
/* not needed for Punycode: */
/* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
if (n > 0x10ffff || isSurrogate(n)) {
/* Unicode code point overflow */
throw new StringPrepParseException(
"Illegal char found", StringPrepParseException.ILLEGAL_CHAR_FOUND);
}
/* Insert n at position i of the output: */
cpLength = Character.charCount(n);
int codeUnitIndex;
/*
* Handle indexes when supplementary code points are present.
*
* In almost all cases, there will be only BMP code points before i
* and even in the entire string.
* This is handled with the same efficiency as with UTF-32.
*
* Only the rare cases with supplementary code points are handled
* more slowly - but not too bad since this is an insertion anyway.
*/
if (i <= firstSupplementaryIndex) {
codeUnitIndex = i;
if (cpLength > 1) {
firstSupplementaryIndex = codeUnitIndex;
} else {
++firstSupplementaryIndex;
}
} else {
codeUnitIndex =
dest.offsetByCodePoints(
firstSupplementaryIndex, i - firstSupplementaryIndex);
}
/* use the UChar index codeUnitIndex instead of the code point index i */
if (caseFlags != null && (dest.length() + cpLength) <= caseFlags.length) {
if (codeUnitIndex < dest.length()) {
System.arraycopy(
caseFlags,
codeUnitIndex,
caseFlags,
codeUnitIndex + cpLength,
dest.length() - codeUnitIndex);
}
/* Case of last character determines uppercase flag: */
caseFlags[codeUnitIndex] = isBasicUpperCase(src.charAt(in - 1));
if (cpLength == 2) {
caseFlags[codeUnitIndex + 1] = false;
}
}
if (cpLength == 1) {
/* BMP, insert one code unit */
dest.insert(codeUnitIndex, (char) n);
} else {
/* supplementary character, insert two code units */
dest.insert(codeUnitIndex, UTF16.getLeadSurrogate(n));
dest.insert(codeUnitIndex + 1, UTF16.getTrailSurrogate(n));
}
++i;
}
return dest;
}
}