TransliteratorParser.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 **********************************************************************
 *   Copyright (c) 2001-2011, International Business Machines
 *   Corporation and others.  All Rights Reserved.
 **********************************************************************
 */
package com.ibm.icu.text;

import com.ibm.icu.impl.IllegalIcuArgumentException;
import com.ibm.icu.impl.PatternProps;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.text.RuleBasedTransliterator.Data;
import java.text.ParsePosition;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class TransliteratorParser {

    // ----------------------------------------------------------------------
    // Data members
    // ----------------------------------------------------------------------

    /**
     * PUBLIC data member. A list of RuleBasedTransliterator.Data objects, one for each discrete
     * group of rules in the rule set.
     */
    public List<Data> dataVector;

    /** PUBLIC data member. A list of Strings containing all of the ID blocks in the rule set. */
    public List<String> idBlockVector;

    /**
     * The current data object for which we are parsing rules. Its variablesBase is the character
     * that maps to index 0 of variablesVector.
     */
    private Data curData;

    /** PUBLIC data member containing the parsed compound filter, if any. */
    public UnicodeSet compoundFilter;

    // NOTE(review): presumably the transliteration direction being parsed; it is neither
    // assigned nor read in the portion of the file reviewed here -- confirm against its users.
    private int direction;

    /** Temporary symbol table used during parsing. */
    private ParseData parseData;

    /**
     * Temporary list of the objects that stand-in characters refer to. When parsing is complete,
     * this is copied into the array data.variables. As with data.variables, element 0
     * corresponds to character data.variablesBase. Elements are not necessarily matchers, so
     * readers test the element type before using it.
     */
    private List<Object> variablesVector;

    /**
     * Temporary table of variable names, mapping each name to its value as a char array. When
     * parsing is complete, this is copied into data.variableNames.
     */
    private Map<String, char[]> variableNames;

    /**
     * String of standins for segments. Used during the parsing of a single rule.
     * segmentStandins.charAt(0) is the standin for "$1" and corresponds to StringMatcher object
     * segmentObjects.get(0), etc.
     */
    private StringBuilder segmentStandins;

    /**
     * List of StringMatcher objects for segments. Used during the parsing of a single rule.
     * segmentStandins.charAt(0) is the standin for "$1" and corresponds to StringMatcher object
     * segmentObjects.get(0), etc.
     */
    private List<StringMatcher> segmentObjects;

    /**
     * The next available stand-in for variables. This starts at some point in the private use area
     * (discovered dynamically) and increments up toward <code>variableLimit</code>. At any point
     * during parsing, available variables are <code>variableNext..variableLimit-1</code>.
     */
    private char variableNext;

    /**
     * The last available stand-in for variables. This is discovered dynamically. At any point
     * during parsing, available variables are <code>variableNext..variableLimit-1</code>. During
     * variable definition we use the special value variableLimit-1 as a placeholder.
     */
    private char variableLimit;

    /**
     * When we encounter an undefined variable, we do not immediately signal an error, in case we
     * are defining this variable, e.g., "$a = [a-z];". Instead, we save the name of the undefined
     * variable, and substitute in the placeholder char variableLimit - 1, and decrement
     * variableLimit.
     */
    private String undefinedVariableName;

    /**
     * The stand-in character for the 'dot' set, represented by '.' in patterns. This is allocated
     * the first time it is needed, and reused thereafter. The initial value -1 means that no
     * stand-in has been allocated yet.
     */
    private int dotStandIn = -1;

    // ----------------------------------------------------------------------
    // Constants
    // ----------------------------------------------------------------------

    // Indicator for ID blocks
    private static final String ID_TOKEN = "::";
    // Length of ID_TOKEN, in chars
    private static final int ID_TOKEN_LEN = 2;

    /*
    (reserved for future expansion)
        // markers for beginning and end of rule groups
        private static final String BEGIN_TOKEN = "BEGIN";
        private static final String END_TOKEN = "END";
    */

    // Operators
    private static final char VARIABLE_DEF_OP = '=';
    private static final char FORWARD_RULE_OP = '>';
    private static final char REVERSE_RULE_OP = '<';
    private static final char FWDREV_RULE_OP = '~'; // internal rep of <> op

    // All operator characters, ASCII forms followed by the arrow aliases declared below.
    private static final String OPERATORS = "=><\u2190\u2192\u2194";
    // OPERATORS plus END_OF_RULE: every character that terminates one half of a rule.
    // RuleHalf.parseSection() stops scanning when it meets one of these outside quotes,
    // escapes, and set patterns.
    private static final String HALF_ENDERS = "=><\u2190\u2192\u2194;";

    // Other special characters
    private static final char QUOTE = '\'';
    private static final char ESCAPE = '\\';
    private static final char END_OF_RULE = ';';
    private static final char RULE_COMMENT_CHAR = '#';

    private static final char CONTEXT_ANTE = '{'; // ante{key
    private static final char CONTEXT_POST = '}'; // key}post
    private static final char CURSOR_POS = '|';
    private static final char CURSOR_OFFSET = '@';
    private static final char ANCHOR_START = '^';

    // Quantifier characters; they apply to the preceding character, quoted string,
    // variable reference, or segment.
    private static final char KLEENE_STAR = '*';
    private static final char ONE_OR_MORE = '+';
    private static final char ZERO_OR_ONE = '?';

    private static final char DOT = '.';
    // UnicodeSet pattern for the set that DOT represents.
    // NOTE(review): presumably consumed when the dot stand-in is first allocated; the use
    // site is outside the portion of the file reviewed here -- confirm.
    private static final String DOT_SET = "[^[:Zp:][:Zl:]\\r\\n$]";

    // By definition, the ANCHOR_END special character is a
    // trailing SymbolTable.SYMBOL_REF character.
    // private static final char ANCHOR_END       = '$';

    // Segments of the input string are delimited by "(" and ")".  In the
    // output string these segments are referenced as "$1", "$2", etc.
    private static final char SEGMENT_OPEN = '(';
    private static final char SEGMENT_CLOSE = ')';

    // A function is denoted &Source-Target/Variant(text)
    private static final char FUNCTION = '&';

    // Aliases for some of the syntax characters. These are provided so
    // transliteration rules can be expressed in XML without clashing with
    // XML syntax characters '<', '>', and '&'.
    private static final char ALT_REVERSE_RULE_OP = '\u2190'; // Left Arrow
    private static final char ALT_FORWARD_RULE_OP = '\u2192'; // Right Arrow
    private static final char ALT_FWDREV_RULE_OP = '\u2194'; // Left Right Arrow
    private static final char ALT_FUNCTION = '\u2206'; // Increment (~Greek Capital Delta)

    // Special characters disallowed at the top level
    // (passed by RuleHalf.parse() when parsing a whole rule half)
    private static final UnicodeSet ILLEGAL_TOP = new UnicodeSet("[\\)]").freeze();

    // Special characters disallowed within a segment
    // (passed for the recursive parse that follows SEGMENT_OPEN)
    private static final UnicodeSet ILLEGAL_SEG = new UnicodeSet("[\\{\\}\\|\\@]").freeze();

    // Special characters disallowed within a function argument
    // (passed for the recursive parse that follows FUNCTION / ALT_FUNCTION)
    private static final UnicodeSet ILLEGAL_FUNC =
            new UnicodeSet("[\\^\\(\\.\\*\\+\\?\\{\\}\\|\\@]").freeze();

    // ----------------------------------------------------------------------
    // class ParseData
    // ----------------------------------------------------------------------

    /**
     * This class implements the SymbolTable interface. It is used during parsing to give UnicodeSet
     * access to variables that have been defined so far. Note that it uses variablesVector, _not_
     * data.variables, because the final variables array has not been constructed yet while rules
     * are still being parsed.
     *
     * <p>A stand-in character {@code ch} refers to element {@code ch - curData.variablesBase} of
     * variablesVector. Elements may be matchers, replacers, or not yet filled in, so every
     * accessor checks the element type before handing it out.
     */
    private class ParseData implements SymbolTable {

        /**
         * Implement SymbolTable API.
         *
         * @param name the variable name, without the leading symbol-reference character
         * @return the value recorded for {@code name} in variableNames, or null if there is none
         */
        @Override
        public char[] lookup(String name) {
            return variableNames.get(name);
        }

        /**
         * Implement SymbolTable API. Resolves a variable directly to a UnicodeSet.
         *
         * @param name the variable name
         * @return the set the variable refers to, or null if the variable is undefined, its value
         *     is not exactly one character, or that character is not a stand-in for a UnicodeSet
         */
        @Override
        public UnicodeSet lookupSet(String name) {
            final char[] value = variableNames.get(name);
            if (value == null || value.length != 1) {
                return null;
            }
            final int index = standInIndex(value[0]);
            if (index < 0) {
                return null;
            }
            final Object candidate = variablesVector.get(index);
            return (candidate instanceof UnicodeSet) ? (UnicodeSet) candidate : null;
        }

        /**
         * Implement SymbolTable API.
         *
         * @param ch a code point that may be a stand-in character
         * @return the matcher that {@code ch} stands for, or null if {@code ch} is not a stand-in
         *     or the object it stands for is not a UnicodeMatcher
         */
        @Override
        public UnicodeMatcher lookupMatcher(int ch) {
            // Note that we cannot use data.lookup() because the
            // set array has not been constructed yet.
            final int index = standInIndex(ch);
            if (index < 0) {
                return null;
            }
            // variablesVector also holds objects that are not matchers (see isMatcher and
            // isReplacer). Report those as "no matcher" rather than letting an unchecked cast
            // throw ClassCastException into the caller.
            final Object candidate = variablesVector.get(index);
            return (candidate instanceof UnicodeMatcher) ? (UnicodeMatcher) candidate : null;
        }

        /**
         * Implement SymbolTable API. Parse out a symbol reference name.
         *
         * @param text the text being parsed
         * @param pos on entry, the index of the first candidate name character; advanced past the
         *     name only when a name is found
         * @param limit index at which scanning stops
         * @return the name, or null (with {@code pos} unchanged) if no valid name starts at pos
         */
        @Override
        public String parseReference(String text, ParsePosition pos, int limit) {
            final int start = pos.getIndex();
            int end = start;
            while (end < limit) {
                final char c = text.charAt(end);
                // The first character must be an identifier start; every character, including
                // the first, must be an identifier part.
                final boolean badStart = end == start && !UCharacter.isUnicodeIdentifierStart(c);
                if (badStart || !UCharacter.isUnicodeIdentifierPart(c)) {
                    break;
                }
                ++end;
            }
            if (end == start) { // No valid name chars
                return null;
            }
            pos.setIndex(end);
            return text.substring(start, end);
        }

        /**
         * Return true if the given character is a matcher standin or a plain character (non
         * standin).
         */
        public boolean isMatcher(int ch) {
            // Note that we cannot use data.lookup() because the
            // set array has not been constructed yet.
            final int index = standInIndex(ch);
            return index < 0 || variablesVector.get(index) instanceof UnicodeMatcher;
        }

        /**
         * Return true if the given character is a replacer standin or a plain character (non
         * standin).
         */
        public boolean isReplacer(int ch) {
            // Note that we cannot use data.lookup() because the
            // set array has not been constructed yet.
            final int index = standInIndex(ch);
            return index < 0 || variablesVector.get(index) instanceof UnicodeReplacer;
        }

        /**
         * Map a character to its slot in variablesVector.
         *
         * @return the index of {@code ch} in variablesVector, or -1 if {@code ch} lies outside
         *     the range of stand-ins allocated so far
         */
        private int standInIndex(int ch) {
            final int index = ch - curData.variablesBase;
            return (index >= 0 && index < variablesVector.size()) ? index : -1;
        }
    }

    // ----------------------------------------------------------------------
    // classes RuleBody, RuleArray, and RuleReader
    // ----------------------------------------------------------------------

    /**
     * A private abstract class representing the interface to rule source code that is broken up
     * into lines. Handles the folding of lines terminated by a backslash. This folding is limited;
     * it does not account for comments, quotes, or escapes, so its use should be limited.
     */
    private abstract static class RuleBody {

        /**
         * Retrieve the next logical line of the source. A physical line whose last character is a
         * backslash is joined with the physical line that follows it, with the backslash itself
         * removed; this repeats for as long as the most recently appended line also ends in a
         * backslash. Comments, quotes, and escapes are not taken into account.
         *
         * @return the folded line, or null if the source is exhausted. If the source runs out in
         *     the middle of a fold, whatever has been accumulated so far is returned.
         */
        String nextLine() {
            final String first = handleNextLine();
            if (first == null || !first.endsWith("\\")) {
                // Nothing to fold: end of input, empty line, or an ordinary line.
                return first;
            }

            final StringBuilder folded = new StringBuilder(first);
            for (; ; ) {
                // On every pass the buffer ends with the continuation backslash; drop it.
                folded.setLength(folded.length() - 1);
                final String continuation = handleNextLine();
                if (continuation == null) {
                    break;
                }
                folded.append(continuation);
                if (!continuation.endsWith("\\")) {
                    break;
                }
            }
            return folded.toString();
        }

        /** Reset to the first line of the source. */
        abstract void reset();

        /** Subclass method to return the next line of the source. */
        abstract String handleNextLine();
    }

    /**
     * RuleBody subclass for a String[] array: each array element is served as one physical line,
     * in index order.
     */
    private static class RuleArray extends RuleBody {
        /** The physical lines; the array is referenced, not copied. */
        String[] array;

        /** Index of the next element of {@code array} to hand out. */
        int i;

        public RuleArray(String[] array) {
            this.array = array;
            this.i = 0;
        }

        /** Return the next array element, or null once every element has been returned. */
        @Override
        public String handleNextLine() {
            if (i >= array.length) {
                return null;
            }
            final String line = array[i];
            i++;
            return line;
        }

        /** Start over from the first array element. */
        @Override
        public void reset() {
            this.i = 0;
        }
    }

    /*
     * RuleBody subclass for a ResourceReader.
     */
    /*    private static class RuleReader extends RuleBody {
        ResourceReader reader;
        public RuleReader(ResourceReader reader) { this.reader = reader; }
        public String handleNextLine() {
            try {
                return reader.readLine();
            } catch (java.io.IOException e) {}
            return null;
        }
        public void reset() {
            reader.reset();
        }
    }*/

    // ----------------------------------------------------------------------
    // class RuleHalf
    // ----------------------------------------------------------------------

    /**
     * A class representing one side of a rule. This class knows how to parse half of a rule. It is
     * tightly coupled to the method TransliteratorParser.parseRule().
     */
    private static class RuleHalf {

        /**
         * The parsed pattern for this half of the rule: literal characters from the rule plus
         * stand-in characters. Assigned by parse().
         */
        public String text;

        public int cursor = -1; // position of cursor in text
        public int ante = -1; // position of ante context marker '{' in text
        public int post = -1; // position of post context marker '}' in text

        // Record the offset to the cursor either to the left or to the
        // right of the key.  This is indicated by characters on the output
        // side that allow the cursor to be positioned arbitrarily within
        // the matching text.  For example, abc{def} > | @@@ xyz; changes
        // def to xyz and moves the cursor to before abc.  Offset characters
        // must be at the start or end, and they cannot move the cursor past
        // the ante- or postcontext text.  Placeholders are only valid in
        // output text.  The length of the ante and post context is
        // determined at runtime, because of supplementals and quantifiers.
        public int cursorOffset = 0; // only nonzero on output side

        // Position of first CURSOR_OFFSET on _right_.  This will be -1
        // for |@, -2 for |@@, etc., and 1 for @|, 2 for @@|, etc.
        private int cursorOffsetPos = 0;

        // Set when ANCHOR_START ('^') is seen before any pattern text has been accumulated.
        public boolean anchorStart = false;
        // Set when a '$' is seen that is not followed by a valid variable name or segment
        // number; any further pattern text after that is reported as a syntax error.
        public boolean anchorEnd = false;

        /** The segment number from 1..n of the next '(' we see during parsing; 1-based. */
        private int nextSegmentNumber = 1;

        /**
         * Parse one side of a rule, stopping at either the limit, the END_OF_RULE character, or an
         * operator. The accumulated pattern is stored in {@link #text}; the other fields of this
         * object are updated as a side effect of the section parse.
         *
         * @param rule the rule source
         * @param pos index in {@code rule} at which this half starts
         * @param limit index at which parsing must stop
         * @param parser the owning parser, which supplies variables and stand-ins
         * @return the index after the terminating character, or if limit was reached, limit
         */
        public int parse(String rule, int pos, int limit, TransliteratorParser parser) {
            final StringBuilder pattern = new StringBuilder();
            final int end = parseSection(rule, pos, limit, parser, pattern, ILLEGAL_TOP, false);
            text = pattern.toString();

            // A positive cursor offset is only legal when the cursor sits exactly where the
            // offset characters were recorded.
            final boolean cursorMisplaced = cursorOffset > 0 && cursor != cursorOffsetPos;
            if (cursorMisplaced) {
                syntaxError("Misplaced " + CURSOR_POS, rule, pos);
            }

            return end;
        }

        /**
         * Parse a section of one side of a rule, stopping at either the limit, the END_OF_RULE
         * character, an operator, or a segment close character. This method parses both a top-level
         * rule half and a segment within such a rule half. It calls itself recursively to parse
         * segments and nested segments.
         *
         * @param buf buffer into which to accumulate the rule pattern characters, either literal
         *     characters from the rule or standins for UnicodeMatcher objects including segments.
         * @param illegal the set of special characters that is illegal during this parse.
         * @param isSegment if true, then we've already seen a '(' and pos on entry points right
         *     after it. Accumulate everything up to the closing ')', put it in a segment matcher
         *     object, generate a standin for it, and add the standin to buf. As a side effect,
         *     update the segments vector with a reference to the segment matcher. This works
         *     recursively for nested segments. If isSegment is false, just accumulate characters
         *     into buf.
         * @return the index after the terminating character, or if limit was reached, limit
         */
        private int parseSection(
                String rule,
                int pos,
                int limit,
                TransliteratorParser parser,
                StringBuilder buf,
                UnicodeSet illegal,
                boolean isSegment) {
            int start = pos;
            ParsePosition pp = null;
            int quoteStart = -1; // Most recent 'single quoted string'
            int quoteLimit = -1;
            int varStart = -1; // Most recent $variableReference
            int varLimit = -1;
            int[] iref = new int[1];
            int bufStart = buf.length();

            main:
            while (pos < limit) {
                // Since all syntax characters are in the BMP, fetching
                // 16-bit code units suffices here.
                char c = rule.charAt(pos++);
                if (PatternProps.isWhiteSpace(c)) {
                    continue;
                }
                // HALF_ENDERS is all chars that end a rule half: "<>=;"
                if (HALF_ENDERS.indexOf(c) >= 0) {
                    // isSegment is always false
                    if (isSegment) {
                        syntaxError("Unclosed segment", rule, start);
                    }
                    break main;
                }
                if (anchorEnd) {
                    // Text after a presumed end anchor is a syntax err
                    syntaxError("Malformed variable reference", rule, start);
                }
                if (UnicodeSet.resemblesPattern(rule, pos - 1)) {
                    if (pp == null) {
                        pp = new ParsePosition(0);
                    }
                    pp.setIndex(pos - 1); // Backup to opening '['
                    buf.append(parser.parseSet(rule, pp));
                    pos = pp.getIndex();
                    continue;
                }
                // Handle escapes
                if (c == ESCAPE) {
                    if (pos == limit) {
                        syntaxError("Trailing backslash", rule, start);
                    }
                    int cpAndLength = Utility.unescapeAndLengthAt(rule, pos);
                    if (cpAndLength < 0) {
                        syntaxError("Malformed escape", rule, start);
                    }
                    int escaped = Utility.cpFromCodePointAndLength(cpAndLength);
                    pos += Utility.lengthFromCodePointAndLength(cpAndLength);
                    parser.checkVariableRange(escaped, rule, start);
                    buf.appendCodePoint(escaped);
                    continue;
                }
                // Handle quoted matter
                if (c == QUOTE) {
                    int iq = rule.indexOf(QUOTE, pos);
                    if (iq == pos) {
                        buf.append(c); // Parse [''] outside quotes as [']
                        ++pos;
                    } else {
                        /* This loop picks up a run of quoted text of the
                         * form 'aaaa' each time through.  If this run
                         * hasn't really ended ('aaaa''bbbb') then it keeps
                         * looping, each time adding on a new run.  When it
                         * reaches the final quote it breaks.
                         */
                        quoteStart = buf.length();
                        for (; ; ) {
                            if (iq < 0) {
                                syntaxError("Unterminated quote", rule, start);
                            }
                            buf.append(rule.substring(pos, iq));
                            pos = iq + 1;
                            if (pos < limit && rule.charAt(pos) == QUOTE) {
                                // Parse [''] inside quotes as [']
                                iq = rule.indexOf(QUOTE, pos + 1);
                                // Continue looping
                            } else {
                                break;
                            }
                        }
                        quoteLimit = buf.length();

                        for (iq = quoteStart; iq < quoteLimit; ++iq) {
                            parser.checkVariableRange(buf.charAt(iq), rule, start);
                        }
                    }
                    continue;
                }

                parser.checkVariableRange(c, rule, start);

                if (illegal.contains(c)) {
                    syntaxError("Illegal character '" + c + '\'', rule, start);
                }

                switch (c) {

                    // ------------------------------------------------------
                    // Elements allowed within and out of segments
                    // ------------------------------------------------------
                    case ANCHOR_START:
                        if (buf.length() == 0 && !anchorStart) {
                            anchorStart = true;
                        } else {
                            syntaxError("Misplaced anchor start", rule, start);
                        }
                        break;
                    case SEGMENT_OPEN:
                        {
                            // bufSegStart is the offset in buf to the first
                            // character of the segment we are parsing.
                            int bufSegStart = buf.length();

                            // Record segment number now, since nextSegmentNumber
                            // will be incremented during the call to parseSection
                            // if there are nested segments.
                            int segmentNumber = nextSegmentNumber++; // 1-based

                            // Parse the segment
                            pos = parseSection(rule, pos, limit, parser, buf, ILLEGAL_SEG, true);

                            // After parsing a segment, the relevant characters are
                            // in buf, starting at offset bufSegStart.  Extract them
                            // into a string matcher, and replace them with a
                            // standin for that matcher.
                            StringMatcher m =
                                    new StringMatcher(
                                            buf.substring(bufSegStart),
                                            segmentNumber,
                                            parser.curData);

                            // Record and associate object and segment number
                            parser.setSegmentObject(segmentNumber, m);
                            buf.setLength(bufSegStart);
                            buf.append(parser.getSegmentStandin(segmentNumber));
                        }
                        break;
                    case FUNCTION:
                    case ALT_FUNCTION:
                        {
                            iref[0] = pos;
                            TransliteratorIDParser.SingleID single =
                                    TransliteratorIDParser.parseFilterID(rule, iref);
                            // The next character MUST be a segment open
                            if (single == null || !Utility.parseChar(rule, iref, SEGMENT_OPEN)) {
                                syntaxError("Invalid function", rule, start);
                            }

                            Transliterator t = single.getInstance();
                            if (t == null) {
                                syntaxError("Invalid function ID", rule, start);
                            }

                            // bufSegStart is the offset in buf to the first
                            // character of the segment we are parsing.
                            int bufSegStart = buf.length();

                            // Parse the segment
                            pos =
                                    parseSection(
                                            rule, iref[0], limit, parser, buf, ILLEGAL_FUNC, true);

                            // After parsing a segment, the relevant characters are
                            // in buf, starting at offset bufSegStart.
                            FunctionReplacer r =
                                    new FunctionReplacer(
                                            t,
                                            new StringReplacer(
                                                    buf.substring(bufSegStart), parser.curData));

                            // Replace the buffer contents with a stand-in
                            buf.setLength(bufSegStart);
                            buf.append(parser.generateStandInFor(r));
                        }
                        break;
                    case SymbolTable.SYMBOL_REF:
                        // Handle variable references and segment references "$1" .. "$9"
                        {
                            // A variable reference must be followed immediately
                            // by a Unicode identifier start and zero or more
                            // Unicode identifier part characters, or by a digit
                            // 1..9 if it is a segment reference.
                            if (pos == limit) {
                                // A variable ref character at the end acts as
                                // an anchor to the context limit, as in perl.
                                anchorEnd = true;
                                break;
                            }
                            // Parse "$1" "$2" .. "$9" .. (no upper limit)
                            c = rule.charAt(pos);
                            int r = UCharacter.digit(c, 10);
                            if (r >= 1 && r <= 9) {
                                iref[0] = pos;
                                r = Utility.parseNumber(rule, iref, 10);
                                if (r < 0) {
                                    syntaxError("Undefined segment reference", rule, start);
                                }
                                pos = iref[0];
                                buf.append(parser.getSegmentStandin(r));
                            } else {
                                if (pp == null) { // Lazy create
                                    pp = new ParsePosition(0);
                                }
                                pp.setIndex(pos);
                                String name = parser.parseData.parseReference(rule, pp, limit);
                                if (name == null) {
                                    // This means the '$' was not followed by a
                                    // valid name.  Try to interpret it as an
                                    // end anchor then.  If this also doesn't work
                                    // (if we see a following character) then signal
                                    // an error.
                                    anchorEnd = true;
                                    break;
                                }
                                pos = pp.getIndex();
                                // If this is a variable definition statement,
                                // then the LHS variable will be undefined.  In
                                // that case appendVariableDef() will append the
                                // special placeholder char variableLimit-1.
                                varStart = buf.length();
                                parser.appendVariableDef(name, buf);
                                varLimit = buf.length();
                            }
                        }
                        break;
                    case DOT:
                        buf.append(parser.getDotStandIn());
                        break;
                    case KLEENE_STAR:
                    case ONE_OR_MORE:
                    case ZERO_OR_ONE:
                        // Quantifiers.  We handle single characters, quoted strings,
                        // variable references, and segments.
                        //  a+      matches  aaa
                        //  'foo'+  matches  foofoofoo
                        //  $v+     matches  xyxyxy if $v == xy
                        //  (seg)+  matches  segsegseg
                        {
                            // isSegment is always false
                            if (isSegment && buf.length() == bufStart) {
                                // The */+ immediately follows '('
                                syntaxError("Misplaced quantifier", rule, start);
                                break;
                            }

                            int qstart, qlimit;
                            // The */+ follows an isolated character or quote
                            // or variable reference
                            if (buf.length() == quoteLimit) {
                                // The */+ follows a 'quoted string'
                                qstart = quoteStart;
                                qlimit = quoteLimit;
                            } else if (buf.length() == varLimit) {
                                // The */+ follows a $variableReference
                                qstart = varStart;
                                qlimit = varLimit;
                            } else {
                                // The */+ follows a single character, possibly
                                // a segment standin
                                qstart = buf.length() - 1;
                                qlimit = qstart + 1;
                            }

                            UnicodeMatcher m;
                            try {
                                m =
                                        new StringMatcher(
                                                buf.toString(), qstart, qlimit, 0, parser.curData);
                            } catch (RuntimeException e) {
                                final String precontext =
                                        pos < 50
                                                ? rule.substring(0, pos)
                                                : "..." + rule.substring(pos - 50, pos);
                                final String postContext =
                                        limit - pos <= 50
                                                ? rule.substring(pos, limit)
                                                : rule.substring(pos, pos + 50) + "...";
                                throw new IllegalIcuArgumentException(
                                                "Failure in rule: "
                                                        + precontext
                                                        + "$$$"
                                                        + postContext)
                                        .initCause(e);
                            }
                            int min = 0;
                            int max = Quantifier.MAX;
                            switch (c) {
                                case ONE_OR_MORE:
                                    min = 1;
                                    break;
                                case ZERO_OR_ONE:
                                    min = 0;
                                    max = 1;
                                    break;
                                    // case KLEENE_STAR:
                                    //    do nothing -- min, max already set
                            }
                            m = new Quantifier(m, min, max);
                            buf.setLength(qstart);
                            buf.append(parser.generateStandInFor(m));
                        }
                        break;

                    // ------------------------------------------------------
                    // Elements allowed ONLY WITHIN segments
                    // ------------------------------------------------------
                    case SEGMENT_CLOSE:
                        // assert(isSegment);
                        // We're done parsing a segment.
                        break main;

                    // ------------------------------------------------------
                    // Elements allowed ONLY OUTSIDE segments
                    // ------------------------------------------------------
                    case CONTEXT_ANTE:
                        if (ante >= 0) {
                            syntaxError("Multiple ante contexts", rule, start);
                        }
                        ante = buf.length();
                        break;
                    case CONTEXT_POST:
                        if (post >= 0) {
                            syntaxError("Multiple post contexts", rule, start);
                        }
                        post = buf.length();
                        break;
                    case CURSOR_POS:
                        if (cursor >= 0) {
                            syntaxError("Multiple cursors", rule, start);
                        }
                        cursor = buf.length();
                        break;
                    case CURSOR_OFFSET:
                        if (cursorOffset < 0) {
                            if (buf.length() > 0) {
                                syntaxError("Misplaced " + c, rule, start);
                            }
                            --cursorOffset;
                        } else if (cursorOffset > 0) {
                            if (buf.length() != cursorOffsetPos || cursor >= 0) {
                                syntaxError("Misplaced " + c, rule, start);
                            }
                            ++cursorOffset;
                        } else {
                            if (cursor == 0 && buf.length() == 0) {
                                cursorOffset = -1;
                            } else if (cursor < 0) {
                                cursorOffsetPos = buf.length();
                                cursorOffset = 1;
                            } else {
                                syntaxError("Misplaced " + c, rule, start);
                            }
                        }
                        break;

                    // ------------------------------------------------------
                    // Non-special characters
                    // ------------------------------------------------------
                    default:
                        // Disallow unquoted characters other than [0-9A-Za-z]
                        // in the printable ASCII range.  These characters are
                        // reserved for possible future use.
                        if (c >= 0x0021
                                && c <= 0x007E
                                && !((c >= '0' && c <= '9')
                                        || (c >= 'A' && c <= 'Z')
                                        || (c >= 'a' && c <= 'z'))) {
                            syntaxError("Unquoted " + c, rule, start);
                        }
                        buf.append(c);
                        break;
                }
            }
            return pos;
        }

        /**
         * Strip the ante and post context from this half, keeping only the key text, and clear
         * the context markers and anchor flags.
         */
        void removeContext() {
            final int keyStart;
            if (ante < 0) {
                keyStart = 0;
            } else {
                keyStart = ante;
            }
            final int keyLimit;
            if (post < 0) {
                keyLimit = text.length();
            } else {
                keyLimit = post;
            }
            text = text.substring(keyStart, keyLimit);

            // No context or anchors remain after trimming.
            post = -1;
            ante = -1;
            anchorEnd = false;
            anchorStart = false;
        }

        /**
         * Check whether this half is usable as rule output.
         *
         * @param parser the parser whose parse data classifies each code point
         * @return true if every code point of the text is accepted by {@code
         *     parseData.isReplacer}, i.e. the text holds no quantifiers or other input-only
         *     elements; true for empty text
         */
        public boolean isValidOutput(TransliteratorParser parser) {
            int index = 0;
            while (index < text.length()) {
                final int codePoint = UTF16.charAt(text, index);
                if (!parser.parseData.isReplacer(codePoint)) {
                    return false;
                }
                // Step by one or two code units depending on the code point.
                index += UTF16.getCharCount(codePoint);
            }
            return true;
        }

        /**
         * Check whether this half is usable as rule input.
         *
         * @param parser the parser whose parse data classifies each code point
         * @return true if every code point of the text is accepted by {@code
         *     parseData.isMatcher}, i.e. the text holds no functions or other output-only
         *     elements; true for empty text
         */
        public boolean isValidInput(TransliteratorParser parser) {
            int index = 0;
            while (index < text.length()) {
                final int codePoint = UTF16.charAt(text, index);
                if (!parser.parseData.isMatcher(codePoint)) {
                    return false;
                }
                // Step by one or two code units depending on the code point.
                index += UTF16.getCharCount(codePoint);
            }
            return true;
        }
    }

    // ----------------------------------------------------------------------
    // PUBLIC methods
    // ----------------------------------------------------------------------

    /**
     * Constructor. Does no work of its own; the public data members are filled in by a later call
     * to {@code parse()}.
     */
    public TransliteratorParser() {}

    /**
     * Parse the rules contained in a single string. Once this returns, the results are available
     * in the public data members of this parser.
     *
     * @param rules the rule text
     * @param dir the direction to parse for, passed through to {@code parseRules}
     */
    public void parse(String rules, int dir) {
        final String[] ruleLines = {rules};
        final RuleArray body = new RuleArray(ruleLines);
        parseRules(body, dir);
    }

    /*
     * Parse a set of rules.  After the parse completes, examine the public
     * data members for results.
     */
    /*    public void parse(ResourceReader rules, int direction) {
        parseRules(new RuleReader(rules), direction);
    }*/

    // ----------------------------------------------------------------------
    // PRIVATE methods
    // ----------------------------------------------------------------------

    /**
     * Parse an array of zero or more rules. The strings in the array are treated as if they were
     * concatenated together, with rule terminators inserted between array elements if not present
     * already.
     *
     * <p>Any previous rules are discarded. Typically this method is called exactly once, during
     * construction.
     *
     * <p>On return, {@code dataVector} holds one Data object per group of rules and {@code
     * idBlockVector} holds one string per block of ::IDs; both are empty if there are no rules.
     *
     * <p>Errors are collected rather than thrown immediately, so that several can be reported at
     * once. The collected exceptions are linked through their cause chains and the first is thrown.
     *
     * @param ruleArray source of rule lines; it is reset before reading
     * @param dir the direction to parse for, either Transliterator.FORWARD or
     *     Transliterator.REVERSE
     * @exception IllegalArgumentException (normally an IllegalIcuArgumentException) if there is a
     *     syntax error in the rules
     */
    void parseRules(RuleBody ruleArray, int dir) {
        boolean parsingIDs = true;
        int ruleCount = 0;

        dataVector = new ArrayList<>();
        idBlockVector = new ArrayList<>();
        curData = null;
        direction = dir;
        compoundFilter = null;
        variablesVector = new ArrayList<>();
        variableNames = new HashMap<>();
        parseData = new ParseData();

        List<RuntimeException> errors = new ArrayList<>();
        int errorCount = 0;

        ruleArray.reset();

        StringBuilder idBlockResult = new StringBuilder();

        // The compound filter offset records the value of ruleCount (the 1-based
        // ordinal of the rule or ID) at which the global filter was seen, or -1 if
        // none was seen.  It is checked after parsing: the filter must be the
        // first item when parsing forward and the last item when parsing in reverse.
        int compoundFilterOffset = -1;

        main:
        for (; ; ) {
            String rule = ruleArray.nextLine();
            if (rule == null) {
                break;
            }
            int pos = 0;
            int limit = rule.length();
            while (pos < limit) {
                char c = rule.charAt(pos++);
                if (PatternProps.isWhiteSpace(c)) {
                    continue;
                }
                // Skip lines starting with the comment character
                if (c == RULE_COMMENT_CHAR) {
                    pos = rule.indexOf("\n", pos) + 1;
                    if (pos == 0) {
                        break; // No "\n" found; rest of rule is a comment
                    }
                    continue; // Either fall out or restart with next line
                }

                // skip empty rules
                if (c == END_OF_RULE) continue;

                // Often a rule file contains multiple errors.  It's
                // convenient to the rule author if these are all reported
                // at once.  We keep parsing rules even after a failure, up
                // to a specified limit, and report all errors at once.
                try {
                    ++ruleCount;

                    // We've found the start of a rule or ID.  c is its first
                    // character, and pos points past c.
                    --pos;
                    // Look for an ID token.  Must have at least ID_TOKEN_LEN + 1
                    // chars left.
                    if ((pos + ID_TOKEN_LEN + 1) <= limit
                            && rule.regionMatches(pos, ID_TOKEN, 0, ID_TOKEN_LEN)) {
                        pos += ID_TOKEN_LEN;
                        // Skip whitespace after the ID token.  The bound is tested
                        // before each read so that a token followed only by
                        // whitespace is reported as an invalid ::ID below rather
                        // than running off the end of the string.
                        while (pos < limit && PatternProps.isWhiteSpace(rule.charAt(pos))) {
                            ++pos;
                        }
                        int[] p = new int[] {pos};

                        if (!parsingIDs) {
                            // Switching from rules back to IDs closes the current
                            // group of rules.
                            if (curData != null) {
                                if (direction == Transliterator.FORWARD) dataVector.add(curData);
                                else dataVector.add(0, curData);
                                curData = null;
                            }
                            parsingIDs = true;
                        }

                        TransliteratorIDParser.SingleID id =
                                TransliteratorIDParser.parseSingleID(rule, p, direction);
                        if (p[0] != pos && Utility.parseChar(rule, p, END_OF_RULE)) {
                            // Successful ::ID parse.

                            if (direction == Transliterator.FORWARD) {
                                idBlockResult.append(id.canonID).append(END_OF_RULE);
                            } else {
                                idBlockResult.insert(0, id.canonID + END_OF_RULE);
                            }

                        } else {
                            // Couldn't parse an ID.  Try to parse a global filter
                            int[] withParens = new int[] {-1};
                            UnicodeSet f =
                                    TransliteratorIDParser.parseGlobalFilter(
                                            rule, p, direction, withParens, null);
                            if (f != null && Utility.parseChar(rule, p, END_OF_RULE)) {
                                if ((direction == Transliterator.FORWARD) == (withParens[0] == 0)) {
                                    if (compoundFilter != null) {
                                        // Multiple compound filters
                                        syntaxError("Multiple global filters", rule, pos);
                                    }
                                    compoundFilter = f;
                                    compoundFilterOffset = ruleCount;
                                }
                            } else {
                                // Invalid ::id
                                // Can be parsed as neither an ID nor a global filter
                                syntaxError("Invalid ::ID", rule, pos);
                            }
                        }

                        pos = p[0];
                    } else {
                        if (parsingIDs) {
                            // Switching from IDs to rules closes the current ID
                            // block and opens a new group of rules.
                            if (direction == Transliterator.FORWARD)
                                idBlockVector.add(idBlockResult.toString());
                            else idBlockVector.add(0, idBlockResult.toString());
                            idBlockResult.delete(0, idBlockResult.length());
                            parsingIDs = false;
                            curData = new RuleBasedTransliterator.Data();

                            // By default, rules use part of the private use area
                            // E000..F8FF for variables and other stand-ins.  Currently
                            // the range F000..F8FF is typically sufficient.  The 'use
                            // variable range' pragma allows rule sets to modify this.
                            setVariableRange(0xF000, 0xF8FF);
                        }

                        if (resemblesPragma(rule, pos, limit)) {
                            int ppp = parsePragma(rule, pos, limit);
                            if (ppp < 0) {
                                syntaxError("Unrecognized pragma", rule, pos);
                            }
                            pos = ppp;
                            // Parse a rule
                        } else {
                            pos = parseRule(rule, pos, limit);
                        }
                    }
                } catch (IllegalArgumentException e) {
                    if (errorCount == 30) {
                        IllegalIcuArgumentException icuEx =
                                new IllegalIcuArgumentException(
                                        "\nMore than 30 errors; further messages squelched");
                        icuEx.initCause(e);
                        errors.add(icuEx);
                        break main;
                    }
                    e.fillInStackTrace();
                    errors.add(e);
                    ++errorCount;
                    pos = ruleEnd(rule, pos, limit) + 1; // +1 advances past ';'
                }
            }
        }
        // Close whichever block (IDs or rules) was open at end of input.
        if (parsingIDs && idBlockResult.length() > 0) {
            if (direction == Transliterator.FORWARD) idBlockVector.add(idBlockResult.toString());
            else idBlockVector.add(0, idBlockResult.toString());
        } else if (!parsingIDs && curData != null) {
            if (direction == Transliterator.FORWARD) dataVector.add(curData);
            else dataVector.add(0, curData);
        }

        // Convert the set vector to an array.  Every Data object receives its own
        // copy of the complete variables array and variable-name table.
        for (int i = 0; i < dataVector.size(); i++) {
            Data data = dataVector.get(i);
            data.variables = new Object[variablesVector.size()];
            variablesVector.toArray(data.variables);
            data.variableNames = new HashMap<>();
            data.variableNames.putAll(variableNames);
        }
        variablesVector = null;

        // Do more syntax checking and index the rules
        try {
            if (compoundFilter != null) {
                if ((direction == Transliterator.FORWARD && compoundFilterOffset != 1)
                        || (direction == Transliterator.REVERSE
                                && compoundFilterOffset != ruleCount)) {
                    throw new IllegalIcuArgumentException("Compound filters misplaced");
                }
            }

            for (int i = 0; i < dataVector.size(); i++) {
                Data data = dataVector.get(i);
                data.ruleSet.freeze();
            }

            if (idBlockVector.size() == 1 && (idBlockVector.get(0)).length() == 0)
                idBlockVector.remove(0);

        } catch (IllegalArgumentException e) {
            e.fillInStackTrace();
            errors.add(e);
        }

        if (!errors.isEmpty()) {
            // Link the errors into one chain: each error is attached as the cause
            // of the innermost cause of the error before it.  The chain is walked
            // as Throwable because a cause need not be a RuntimeException.
            for (int i = errors.size() - 1; i > 0; --i) {
                Throwable previous = errors.get(i - 1);
                while (previous.getCause() != null) {
                    previous = previous.getCause(); // chain specially
                }
                previous.initCause(errors.get(i));
            }
            throw errors.get(0);
            // if initCause not supported: throw new IllegalArgumentException(errors.toString());
        }
    }

    /**
     * MAIN PARSER. Parse the next rule in the given rule string, starting at pos. Return the index
     * after the last character parsed. Do not parse characters at or after limit.
     *
     * <p>Important: The character at pos must be a non-whitespace character that is not the comment
     * character.
     *
     * <p>This method handles quoting, escaping, and whitespace removal. It parses the end-of-rule
     * character. It recognizes context and cursor indicators. Once it does a lexical breakdown of
     * the rule at pos, it creates a rule object and adds it to our rule list.
     *
     * <p>A variable definition is recorded in variableNames and produces no rule. A one-directional
     * rule whose operator does not match the parse direction is checked for syntax and then
     * ignored.
     *
     * <p>This method is tightly coupled to the inner class RuleHalf.
     *
     * @param rule the rule string
     * @param pos index of the first character of the rule
     * @param limit index past the last character that may be parsed
     * @return the index after the last character parsed
     * @exception IllegalIcuArgumentException (via syntaxError) if the rule is malformed
     */
    private int parseRule(String rule, int pos, int limit) {
        // Locate the left side, operator, and right side
        int start = pos;
        char operator = 0;

        // Set up segments data
        segmentStandins = new StringBuilder();
        segmentObjects = new ArrayList<>();

        RuleHalf left = new RuleHalf();
        RuleHalf right = new RuleHalf();

        // Cleared per rule; appendVariableDef() sets it when it meets an
        // undefined variable name.
        undefinedVariableName = null;
        pos = left.parse(rule, pos, limit, this);

        // Back up one character to examine the character preceding the
        // returned position; it must be an operator.  Note that --pos also
        // takes effect when the test fails, hence the ++pos below.
        if (pos == limit || OPERATORS.indexOf(operator = rule.charAt(--pos)) < 0) {
            syntaxError("No operator pos=" + pos, rule, start);
        }
        ++pos;

        // Found an operator char.  Check for forward-reverse operator.
        if (operator == REVERSE_RULE_OP && (pos < limit && rule.charAt(pos) == FORWARD_RULE_OP)) {
            ++pos;
            operator = FWDREV_RULE_OP;
        }

        // Translate alternate op characters.
        switch (operator) {
            case ALT_FORWARD_RULE_OP:
                operator = FORWARD_RULE_OP;
                break;
            case ALT_REVERSE_RULE_OP:
                operator = REVERSE_RULE_OP;
                break;
            case ALT_FWDREV_RULE_OP:
                operator = FWDREV_RULE_OP;
                break;
        }

        pos = right.parse(rule, pos, limit, this);

        // If the right side stopped before limit, the character it stopped on
        // must be the rule terminator.  Reaching limit without a terminator is
        // accepted.
        if (pos < limit) {
            if (rule.charAt(--pos) == END_OF_RULE) {
                ++pos;
            } else {
                // RuleHalf parser must have terminated at an operator
                syntaxError("Unquoted operator", rule, start);
            }
        }

        if (operator == VARIABLE_DEF_OP) {
            // LHS is the name.  RHS is a single character, either a literal
            // or a set (already parsed).  If RHS is longer than one
            // character, it is either a multi-character string, or multiple
            // sets, or a mixture of chars and sets -- syntax error.

            // We expect to see a single undefined variable (the one being
            // defined).
            if (undefinedVariableName == null) {
                syntaxError("Missing '$' or duplicate definition", rule, start);
            }
            // The LHS must be exactly the placeholder that appendVariableDef()
            // emitted for the undefined name, i.e. the decremented variableLimit.
            if (left.text.length() != 1 || left.text.charAt(0) != variableLimit) {
                syntaxError("Malformed LHS", rule, start);
            }
            if (left.anchorStart || left.anchorEnd || right.anchorStart || right.anchorEnd) {
                syntaxError("Malformed variable def", rule, start);
            }
            // We allow anything on the right, including an empty string.
            int n = right.text.length();
            char[] value = new char[n];
            right.text.getChars(0, n, value, 0);
            variableNames.put(undefinedVariableName, value);

            // Undo the decrement made by appendVariableDef() for the placeholder.
            ++variableLimit;
            return pos;
        }

        // If this is not a variable definition rule, we shouldn't have
        // any undefined variable names.
        if (undefinedVariableName != null) {
            syntaxError("Undefined variable $" + undefinedVariableName, rule, start);
        }

        // Verify segments
        if (segmentStandins.length() > segmentObjects.size()) {
            syntaxError("Undefined segment reference", rule, start);
        }
        for (int i = 0; i < segmentStandins.length(); ++i) {
            if (segmentStandins.charAt(i) == 0) {
                syntaxError("Internal error", rule, start); // will never happen
            }
        }
        for (int i = 0; i < segmentObjects.size(); ++i) {
            if (segmentObjects.get(i) == null) {
                syntaxError("Internal error", rule, start); // will never happen
            }
        }

        // If the direction we want doesn't match the rule
        // direction, do nothing.
        if (operator != FWDREV_RULE_OP
                && ((direction == Transliterator.FORWARD) != (operator == FORWARD_RULE_OP))) {
            return pos;
        }

        // Transform the rule into a forward rule by swapping the
        // sides if necessary.
        if (direction == Transliterator.REVERSE) {
            RuleHalf temp = left;
            left = right;
            right = temp;
        }

        // Remove non-applicable elements in forward-reverse
        // rules.  Bidirectional rules ignore elements that do not
        // apply.
        if (operator == FWDREV_RULE_OP) {
            right.removeContext();
            left.cursor = -1;
            left.cursorOffset = 0;
        }

        // Normalize context
        if (left.ante < 0) {
            left.ante = 0;
        }
        if (left.post < 0) {
            left.post = left.text.length();
        }

        // Context is only allowed on the input side.  Cursors are only
        // allowed on the output side.  Segment delimiters can only appear
        // on the left, and references on the right.  Cursor offset
        // cannot appear without an explicit cursor.  Cursor offset
        // cannot place the cursor outside the limits of the context.
        // Anchors are only allowed on the input side.
        if (right.ante >= 0
                || right.post >= 0
                || left.cursor >= 0
                || (right.cursorOffset != 0 && right.cursor < 0)
                ||
                // - The following two checks were used to ensure that the
                // - the cursor offset stayed within the ante- or postcontext.
                // - However, with the addition of quantifiers, we have to
                // - allow arbitrary cursor offsets and do runtime checking.
                // (right.cursorOffset > (left.text.length() - left.post)) ||
                // (-right.cursorOffset > left.ante) ||
                right.anchorStart
                || right.anchorEnd
                || !left.isValidInput(this)
                || !right.isValidOutput(this)
                || left.ante > left.post) {
            syntaxError("Malformed rule", rule, start);
        }

        // Flatten segment objects vector to an array
        // (left null when the rule has no segments)
        UnicodeMatcher[] segmentsArray = null;
        if (segmentObjects.size() > 0) {
            segmentsArray = new UnicodeMatcher[segmentObjects.size()];
            segmentObjects.toArray(segmentsArray);
        }

        // Build the rule from the (possibly swapped) halves and add it to the
        // rule set of the current data object.
        curData.ruleSet.addRule(
                new TransliterationRule(
                        left.text,
                        left.ante,
                        left.post,
                        right.text,
                        right.cursor,
                        right.cursorOffset,
                        segmentsArray,
                        left.anchorStart,
                        left.anchorEnd,
                        curData));

        return pos;
    }

    /**
     * Set the variable range to [start, end] (inclusive).
     *
     * <p>The base of the current data object is always set to start. The allocation cursor and
     * limit are only set while no data object has been stored in dataVector yet.
     *
     * <p>The limit is held in a char as end + 1. For end == 0xFFFF that value does not fit and
     * would wrap around to 0, making the range appear exhausted at once, so the limit is capped at
     * 0xFFFF; in that one case the usable range is [start, 0xFFFE].
     *
     * @param start first character of the range
     * @param end last character of the range
     * @exception IllegalIcuArgumentException if start &gt; end or either value lies outside
     *     0..0xFFFF
     */
    private void setVariableRange(int start, int end) {
        if (start > end || start < 0 || end > 0xFFFF) {
            throw new IllegalIcuArgumentException("Invalid variable range " + start + ", " + end);
        }

        curData.variablesBase = (char) start; // first private use

        if (dataVector.size() == 0) {
            variableNext = (char) start;
            // Cap before narrowing so that end + 1 cannot wrap to 0.
            variableLimit = (char) Math.min(end + 1, 0xFFFF);
        }
    }

    /**
     * Signal a syntax error if the given character lies inside the variable range, that is, at or
     * above the current data's variablesBase and below variableLimit. Characters used literally in
     * a rule must not collide with the stand-in range.
     *
     * @param ch the character to check
     * @param rule the rule string, used for the error message
     * @param start index of the start of the current rule, used for the error message
     */
    private void checkVariableRange(int ch, String rule, int start) {
        final boolean atOrAboveBase = ch >= curData.variablesBase;
        final boolean belowLimit = ch < variableLimit;
        if (!atOrAboveBase || !belowLimit) {
            return;
        }
        syntaxError("Variable range character in rule", rule, start);
    }

    // (The following method is part of an unimplemented feature.
    // 2003-06-11 ICU 2.6 Alan)
    /**
     * Handler for the "use maximum backup" pragma. Not implemented: always throws.
     *
     * @param backup the requested maximum backup (currently unused)
     * @exception IllegalIcuArgumentException always
     */
    private void pragmaMaximumBackup(int backup) {
        // TODO Finish
        final String message = "use maximum backup pragma not implemented yet";
        throw new IllegalIcuArgumentException(message);
    }

    // (The following method is part of an unimplemented feature.
    // 2003-06-11 ICU 2.6 Alan)
    /**
     * Handler for the "use nfd rules" / "use nfc rules" pragmas. Not implemented: always throws.
     *
     * @param mode the requested normalization mode (currently unused)
     * @exception IllegalIcuArgumentException always
     */
    private void pragmaNormalizeRules(Normalizer.Mode mode) {
        // TODO Finish
        final String message = "use normalize rules pragma not implemented yet";
        throw new IllegalIcuArgumentException(message);
    }

    /**
     * Report whether the rule at pos looks like a pragma, i.e. whether it matches the pattern
     * "use " as interpreted by Utility.parsePattern.
     *
     * @param rule the rule string
     * @param pos offset to the first non-whitespace character of the rule.
     * @param limit pointer past the last character of the rule.
     * @return true if the pattern matched
     */
    static boolean resemblesPragma(String rule, int pos, int limit) {
        // Must start with /use\s/i
        final int matchLimit = Utility.parsePattern(rule, pos, limit, "use ", null);
        return matchLimit >= 0;
    }

    /**
     * Parse a pragma and act on it. This method assumes resemblesPragma() has already returned
     * true.
     *
     * <p>Recognized pragmas, tried in this order:
     *
     * <pre>
     * use variable range 0xE000 0xEFFF;
     * use maximum backup 16;
     * use nfd rules;
     * use nfc rules;
     * </pre>
     *
     * @param rule the rule string
     * @param pos offset to the first non-whitespace character of the rule.
     * @param limit pointer past the last character of the rule.
     * @return the position index after the final ';' of the pragma, or -1 on failure.
     */
    private int parsePragma(String rule, int pos, int limit) {
        // Receives the numeric arguments of a matched pattern.
        final int[] args = new int[2];

        // resemblesPragma() has already returned true, so the text at pos is
        // known to match /use\s/i; step over those 4 characters.
        final int body = pos + 4;

        int end = Utility.parsePattern(rule, body, limit, "~variable range # #~;", args);
        if (end >= 0) {
            setVariableRange(args[0], args[1]);
            return end;
        }

        end = Utility.parsePattern(rule, body, limit, "~maximum backup #~;", args);
        if (end >= 0) {
            pragmaMaximumBackup(args[0]);
            return end;
        }

        end = Utility.parsePattern(rule, body, limit, "~nfd rules~;", null);
        if (end >= 0) {
            pragmaNormalizeRules(Normalizer.NFD);
            return end;
        }

        end = Utility.parsePattern(rule, body, limit, "~nfc rules~;", null);
        if (end >= 0) {
            pragmaNormalizeRules(Normalizer.NFC);
            return end;
        }

        // None of the known pragmas matched.
        return -1;
    }

    /**
     * Report a syntax error by throwing. The message quotes the rule text from start up to the
     * probable end of the rule as located by ruleEnd(). If the rule has no end-of-rule marker, the
     * quoted text runs to the end of the string; the rule start is reported correctly either way.
     *
     * @param msg error description
     * @param rule pattern string
     * @param start position of first character of current rule
     * @exception IllegalIcuArgumentException always
     */
    static final void syntaxError(String msg, String rule, int start) {
        final int ruleLimit = ruleEnd(rule, start, rule.length());
        final String offendingRule = Utility.escape(rule.substring(start, ruleLimit));
        throw new IllegalIcuArgumentException(msg + " in \"" + offendingRule + '"');
    }

    /**
     * Locate the end of the rule that starts at start.
     *
     * @param rule the rule string
     * @param start index at which to begin searching
     * @param limit index at which to stop searching
     * @return the index Utility.quotedIndexOf reports for ";" within [start, limit), or limit if
     *     it reports none
     */
    static final int ruleEnd(String rule, int start, int limit) {
        final int terminator = Utility.quotedIndexOf(rule, start, limit, ";");
        return terminator < 0 ? limit : terminator;
    }

    /**
     * Parse a UnicodeSet from the rule at the given position, compact it, store it, and return
     * the stand-in character that represents it.
     *
     * @param rule the rule string
     * @param pos parse position handed to the UnicodeSet constructor
     * @return the stand-in character for the parsed set
     * @exception RuntimeException if no stand-in characters remain in the variable range
     */
    private final char parseSet(String rule, ParsePosition pos) {
        final UnicodeSet parsed = new UnicodeSet(rule, pos, parseData);
        final boolean rangeExhausted = variableNext >= variableLimit;
        if (rangeExhausted) {
            throw new RuntimeException("Private use variables exhausted");
        }
        parsed.compact();
        return generateStandInFor(parsed);
    }

    /**
     * Return the stand-in character for a UnicodeMatcher or UnicodeReplacer. If this exact object
     * (same reference) has been stored before, its existing stand-in is returned; otherwise the
     * object is stored and the next free stand-in is allocated for it.
     *
     * @param obj the object to represent
     * @return the stand-in character for obj
     * @exception RuntimeException if no stand-in characters remain in the variable range
     */
    char generateStandInFor(Object obj) {
        // assert(obj != null);

        // Search for an earlier registration of the same object.  The list is
        // short (typical n is 0, 1, or 2), so a linear scan is fine.
        int slot = 0;
        for (Object stored : variablesVector) {
            if (stored == obj) { // [sic] identity, not equals()
                return (char) (curData.variablesBase + slot);
            }
            ++slot;
        }

        // Not seen before: claim the next stand-in, if any are left.
        if (variableNext >= variableLimit) {
            throw new RuntimeException("Variable range exhausted");
        }
        variablesVector.add(obj);
        final char standIn = variableNext;
        ++variableNext;
        return standIn;
    }

    /**
     * Return the standin for segment seg (1-based), allocating one on first request.
     *
     * @param seg the 1-based segment number
     * @return the stand-in character for the segment
     * @exception RuntimeException if a new stand-in is needed and none remain in the variable
     *     range
     */
    public char getSegmentStandin(int seg) {
        final int slot = seg - 1;
        // Grow the table so that the slot exists; new slots hold 0.
        if (seg > segmentStandins.length()) {
            segmentStandins.setLength(seg);
        }
        final char existing = segmentStandins.charAt(slot);
        if (existing != 0) {
            return existing;
        }

        if (variableNext >= variableLimit) {
            throw new RuntimeException("Variable range exhausted");
        }
        final char allocated = variableNext++;
        // Reserve the matching entry in the primary variables vector with a
        // placeholder; setSegmentObject() fills it in later.  We are certain
        // to be called first because setSegmentObject() calls this method.
        variablesVector.add(null);
        segmentStandins.setCharAt(slot, allocated);
        return allocated;
    }

    /**
     * Set the object for segment seg (1-based), recording it both in the segment list and in the
     * primary variables vector under the segment's stand-in.
     *
     * @param seg the 1-based segment number
     * @param obj the matcher for the segment
     * @exception RuntimeException if an object was already set for this segment
     */
    public void setSegmentObject(int seg, StringMatcher obj) {
        // parseSection() recurses, so with nested segments segment i+1 is
        // parsed and stored before segment i.  Pad the list with empty
        // entries up to this segment before touching it.
        for (int have = segmentObjects.size(); have < seg; ++have) {
            segmentObjects.add(null);
        }
        final int slot = seg - 1;
        final int variableIndex = getSegmentStandin(seg) - curData.variablesBase;
        final boolean alreadySet =
                segmentObjects.get(slot) != null || variablesVector.get(variableIndex) != null;
        if (alreadySet) {
            throw new RuntimeException(); // should never happen
        }
        segmentObjects.set(slot, obj);
        variablesVector.set(variableIndex, obj);
    }

    /**
     * Return the stand-in for the dot set. The stand-in is created on the first call and the same
     * value is returned on later calls.
     *
     * @return the stand-in character for the dot set
     */
    char getDotStandIn() {
        if (dotStandIn != -1) {
            return (char) dotStandIn;
        }
        final UnicodeSet dotSet = new UnicodeSet(DOT_SET);
        dotStandIn = generateStandInFor(dotSet);
        return (char) dotStandIn;
    }

    /**
     * Append the value of the given variable name to the given StringBuilder.
     *
     * <p>One undefined variable is tolerated so that variable definition statements work: for the
     * first unknown name, the name is remembered in undefinedVariableName, variableLimit is
     * lowered by one, and that new limit value is appended as a placeholder.
     *
     * @param name the variable name, without the leading '$'
     * @param buf the buffer to append to
     * @exception IllegalIcuArgumentException if the name is unknown and another undefined
     *     variable has already been recorded
     * @exception RuntimeException if no character remains in the variable range for the
     *     placeholder
     */
    private void appendVariableDef(String name, StringBuilder buf) {
        final char[] value = variableNames.get(name);
        if (value != null) {
            buf.append(value);
            return;
        }

        // Unknown name: only the first one is allowed.
        if (undefinedVariableName != null) {
            throw new IllegalIcuArgumentException("Undefined variable $" + name);
        }
        undefinedVariableName = name;
        if (variableNext >= variableLimit) {
            throw new RuntimeException("Private use variables exhausted");
        }
        // Take the placeholder from the top of the variable range.
        --variableLimit;
        buf.append(variableLimit);
    }
}

// eof