// EscapeTransliterator.java
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
**********************************************************************
* Copyright (c) 2001-2011, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
* Date Name Description
* 11/19/2001 aliu Creation.
**********************************************************************
*/
package com.ibm.icu.text;
import com.ibm.icu.impl.Utility;
/**
* A transliterator that converts Unicode characters to an escape form. Examples of escape forms are
* "U+4E01" and "{@code &#x10FFFF;}". Escape forms have a prefix and suffix, either of which may be
* empty, a radix, typically 16 or 10, a minimum digit count, typically 1, 4, or 8, and a boolean
* that specifies whether supplemental characters are handled as 32-bit code points or as two
* 16-bit code units. Most escape forms handle 32-bit code points, but some, such as the Java form,
* intentionally break them into a surrogate pair (two 16-bit code units), for backward
* compatibility.
*
* <p>Some escape forms actually have two different patterns, one for BMP characters (0..FFFF) and
* one for supplements (>FFFF). To handle this, a second EscapeTransliterator may be defined that
* specifies the pattern to be produced for supplementals. An example of a form that requires this
* is the C form, which uses "\\uFFFF" for BMP characters and "\\U0010FFFF" for supplementals.
*
* <p>This class is package private. It registers several standard variants with the system which
* are then accessed via their IDs.
*
* @author Alan Liu
*/
class EscapeTransliterator extends Transliterator {
    /** The prefix of the escape form; may be empty, but usually isn't. Never null. */
    private final String prefix;

    /** The suffix of the escape form; often empty. Never null. */
    private final String suffix;

    /** The radix to display the number in. Typically 16 or 10. Must be in the range 2 to 36. */
    private final int radix;

    /**
     * The minimum number of digits. Typically 1, 4, or 8. Values less than 1 are equivalent to 1.
     */
    private final int minDigits;

    /**
     * If true, supplementals are handled as 32-bit code points. If false, they are handled as two
     * 16-bit code units.
     */
    private final boolean grokSupplementals;

    /**
     * The form to be used for supplementals. If this is null then the same form is used for BMP
     * characters and supplementals. If this is not null and if grokSupplementals is true then the
     * prefix, suffix, radix, and minDigits of this object are used for supplementals.
     */
    private final EscapeTransliterator supplementalHandler;

    /**
     * Registers standard variants with the system. Called by Transliterator during initialization.
     *
     * <p>Each factory ignores the ID it is handed and builds a fresh instance carrying the ID it
     * was registered under.
     */
    static void register() {
        // Unicode: "U+10FFFF" hex, min=4, max=6
        Transliterator.registerFactory(
                "Any-Hex/Unicode",
                new Transliterator.Factory() {
                    @Override
                    public Transliterator getInstance(String ID) {
                        return new EscapeTransliterator(
                                "Any-Hex/Unicode", "U+", "", 16, 4, true, null);
                    }
                });
        // Java: "\\uFFFF" hex, min=4, max=4 (supplementals emitted as two code units)
        Transliterator.registerFactory(
                "Any-Hex/Java",
                new Transliterator.Factory() {
                    @Override
                    public Transliterator getInstance(String ID) {
                        return new EscapeTransliterator(
                                "Any-Hex/Java", "\\u", "", 16, 4, false, null);
                    }
                });
        // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
        Transliterator.registerFactory(
                "Any-Hex/C",
                new Transliterator.Factory() {
                    @Override
                    public Transliterator getInstance(String ID) {
                        // The nested instance only supplies the pattern for supplementals, so
                        // it is given an empty ID.
                        return new EscapeTransliterator(
                                "Any-Hex/C",
                                "\\u",
                                "",
                                16,
                                4,
                                true,
                                new EscapeTransliterator("", "\\U", "", 16, 8, true, null));
                    }
                });
        // XML: "&#x10FFFF;" hex, min=1, max=6
        Transliterator.registerFactory(
                "Any-Hex/XML",
                new Transliterator.Factory() {
                    @Override
                    public Transliterator getInstance(String ID) {
                        return new EscapeTransliterator(
                                "Any-Hex/XML", "&#x", ";", 16, 1, true, null);
                    }
                });
        // XML10: "&#1114111;" dec, min=1, max=7 (not really "Any-Hex")
        Transliterator.registerFactory(
                "Any-Hex/XML10",
                new Transliterator.Factory() {
                    @Override
                    public Transliterator getInstance(String ID) {
                        return new EscapeTransliterator(
                                "Any-Hex/XML10", "&#", ";", 10, 1, true, null);
                    }
                });
        // Perl: "\\x{263A}" hex, min=1, max=6
        Transliterator.registerFactory(
                "Any-Hex/Perl",
                new Transliterator.Factory() {
                    @Override
                    public Transliterator getInstance(String ID) {
                        return new EscapeTransliterator(
                                "Any-Hex/Perl", "\\x{", "}", 16, 1, true, null);
                    }
                });
        // Plain: "FFFF" hex, min=4, max=6
        Transliterator.registerFactory(
                "Any-Hex/Plain",
                new Transliterator.Factory() {
                    @Override
                    public Transliterator getInstance(String ID) {
                        return new EscapeTransliterator("Any-Hex/Plain", "", "", 16, 4, true, null);
                    }
                });
        // Generic: same parameters as the Java form
        Transliterator.registerFactory(
                "Any-Hex",
                new Transliterator.Factory() {
                    @Override
                    public Transliterator getInstance(String ID) {
                        return new EscapeTransliterator("Any-Hex", "\\u", "", 16, 4, false, null);
                    }
                });
    }

    /**
     * Constructs an escape transliterator with the given ID and parameters. See the class member
     * documentation for details.
     *
     * @param ID the transliterator ID passed to the superclass
     * @param prefix text emitted before the digits; may be empty, must not be null
     * @param suffix text emitted after the digits; may be empty, must not be null
     * @param radix the radix the code point (or code unit) value is written in
     * @param minDigits the minimum number of digits to emit
     * @param grokSupplementals true to read whole code points, false to read single code units
     * @param supplementalHandler alternate form for values above 0xFFFF, or null to use this form
     * @throws NullPointerException if {@code prefix} or {@code suffix} is null
     */
    EscapeTransliterator(
            String ID,
            String prefix,
            String suffix,
            int radix,
            int minDigits,
            boolean grokSupplementals,
            EscapeTransliterator supplementalHandler) {
        super(ID, null);
        // Enforce the documented non-null invariant up front: a null suffix would otherwise be
        // appended as the literal text "null", and a null prefix would only fail at use time.
        if (prefix == null) {
            throw new NullPointerException("prefix must not be null");
        }
        if (suffix == null) {
            throw new NullPointerException("suffix must not be null");
        }
        this.prefix = prefix;
        this.suffix = suffix;
        this.radix = radix;
        this.minDigits = minDigits;
        this.grokSupplementals = grokSupplementals;
        this.supplementalHandler = supplementalHandler;
    }

    /**
     * Implements {@link Transliterator#handleTransliterate}.
     *
     * <p>Replaces every character between {@code pos.start} and {@code pos.limit} with its escape
     * form, then advances {@code pos.start} to the (adjusted) limit and shifts {@code pos.limit}
     * and {@code pos.contextLimit} by the net change in text length. The {@code incremental} flag
     * is not consulted.
     */
    @Override
    protected void handleTransliterate(Replaceable text, Position pos, boolean incremental) {
        int start = pos.start;
        int limit = pos.limit;
        StringBuilder buf = new StringBuilder();

        while (start < limit) {
            // Read either a whole code point or a single UTF-16 code unit.
            int c = grokSupplementals ? text.char32At(start) : text.charAt(start);
            int charLen = grokSupplementals ? UTF16.getCharCount(c) : 1;

            // Values above 0xFFFF use the supplemental pattern when one is defined; everything
            // else uses this object's own pattern.
            EscapeTransliterator form =
                    ((c & 0xFFFF0000) != 0 && supplementalHandler != null)
                            ? supplementalHandler
                            : this;

            buf.setLength(0);
            buf.append(form.prefix);
            Utility.appendNumber(buf, c, form.radix, form.minDigits);
            buf.append(form.suffix);

            int escapeLen = buf.length();
            text.replace(start, start + charLen, buf.toString());
            // Step over the text just written and account for the change in length.
            start += escapeLen;
            limit += escapeLen - charLen;
        }

        pos.contextLimit += limit - pos.limit;
        pos.limit = limit;
        pos.start = start;
    }

    /**
     * Adds the filtered source characters to {@code sourceSet} and, when {@code inputFilter} is
     * non-empty, adds to {@code targetSet} the characters of the prefix, the suffix, and the
     * digits of this form and of every chained supplemental form.
     *
     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet,
     *     com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
     */
    @Override
    public void addSourceTargetSet(
            UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
        sourceSet.addAll(getFilterAsUnicodeSet(inputFilter));

        // NOTE(review): the emptiness test looks at inputFilter itself, not at the filtered set
        // added to sourceSet above - confirm that this is intended.
        if (inputFilter.size() == 0) {
            return;
        }

        for (EscapeTransliterator it = this; it != null; it = it.supplementalHandler) {
            targetSet.addAll(it.prefix);
            targetSet.addAll(it.suffix);
            // Format every single-digit value once so that each digit character of the radix
            // ends up in the buffer.
            StringBuilder buffer = new StringBuilder();
            for (int i = 0; i < it.radix; ++i) {
                Utility.appendNumber(buffer, i, it.radix, it.minDigits);
            }
            targetSet.addAll(
                    buffer.toString()); // TODO drop once String is changed to CharSequence in
            // UnicodeSet
        }
    }
}