RBBIRuleScanner.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/*
 *******************************************************************************
 * Copyright (C) 2003-2016, International Business Machines Corporation and others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.text;

import com.ibm.icu.impl.Assert;
import com.ibm.icu.impl.Utility;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import java.text.ParsePosition;
import java.util.HashMap;

/**
 * This class is part of the Rule Based Break Iterator rule compiler. It scans the rules and builds
 * the parse tree. There is no public API here.
 */
class RBBIRuleScanner {

    private static final int kStackSize = 100; // The size of the state stack for

    //   rules parsing.  Corresponds roughly
    //   to the depth of parentheses nesting
    //   that is allowed in the rules.

    static class RBBIRuleChar {
        int fChar;
        boolean fEscaped;
    }

    RBBIRuleBuilder fRB; // The rule builder that we are part of.

    int fScanIndex; // Index of current character being processed
    //   in the rule input string.
    int fNextIndex; // Index of the next character, which
    //   is the first character not yet scanned.
    boolean fQuoteMode; // Scan is in a 'quoted region'
    int fLineNum; // Line number in input file.
    int fCharNum; // Char position within the line.
    int fLastChar; // Previous char, needed to count CR-LF
    //   as a single line, not two.

    RBBIRuleChar fC = new RBBIRuleChar(); // Current char for parse state machine
    //   processing.

    short fStack[] = new short[kStackSize]; // State stack, holds state pushes
    int fStackPtr; //  and pops as specified in the state
    //  transition rules.

    RBBINode fNodeStack[] = new RBBINode[kStackSize]; // Node stack, holds nodes created
    //  during the parse of a rule
    int fNodeStackPtr;

    boolean fReverseRule; // True if the rule currently being scanned
    //  is a reverse direction rule (if it
    //  starts with a '!')

    boolean fLookAheadRule; // True if the rule includes a '/'
    //   somewhere within it.

    boolean fNoChainInRule; // True if the current rule starts with a '^'.

    RBBISymbolTable fSymbolTable; // symbol table, holds definitions of
    //   $variable symbols.

    HashMap<String, RBBISetTableEl> fSetTable =
            new HashMap<>(); // UnicocodeSet hash table, holds indexes to
    //   the sets created while parsing rules.
    //   The key is the string used for creating
    //   the set.

    UnicodeSet fRuleSets[] = new UnicodeSet[10]; // Unicode Sets that are needed during
    //  the scanning of RBBI rules.  The
    //  Indices for these are assigned by the
    //  perl script that builds the state tables.
    //  See rbbirpt.h.

    int fRuleNum; // Counts each rule as it is scanned.

    int fOptionStart; // Input index of start of a !!option
    //   keyword, while being scanned.

    // gRuleSet_rule_char_pattern is characters that may appear as literals in patterns without
    // escaping or quoting.
    private static String gRuleSet_rule_char_pattern =
            "[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";
    private static String gRuleSet_name_char_pattern = "[_\\p{L}\\p{N}]";
    private static String gRuleSet_digit_char_pattern = "[0-9]";
    private static String gRuleSet_name_start_char_pattern = "[_\\p{L}]";
    private static String gRuleSet_white_space_pattern = "[\\p{Pattern_White_Space}]";
    private static String kAny = "any";

    // ----------------------------------------------------------------------------------------
    //
    //  Constructor.
    //
    // ----------------------------------------------------------------------------------------
    RBBIRuleScanner(RBBIRuleBuilder rb) {
        fRB = rb;
        fLineNum = 1;

        //
        //  Set up the constant Unicode Sets.
        //     Note: These could be made static and shared among
        //            all instances of RBBIRuleScanners.
        fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128] =
                new UnicodeSet(gRuleSet_rule_char_pattern);
        fRuleSets[RBBIRuleParseTable.kRuleSet_white_space - 128] =
                new UnicodeSet(gRuleSet_white_space_pattern);
        fRuleSets[RBBIRuleParseTable.kRuleSet_name_char - 128] =
                new UnicodeSet(gRuleSet_name_char_pattern);
        fRuleSets[RBBIRuleParseTable.kRuleSet_name_start_char - 128] =
                new UnicodeSet(gRuleSet_name_start_char_pattern);
        fRuleSets[RBBIRuleParseTable.kRuleSet_digit_char - 128] =
                new UnicodeSet(gRuleSet_digit_char_pattern);

        fSymbolTable = new RBBISymbolTable(this);
    }

    // ----------------------------------------------------------------------------------------
    //
    //  doParseAction Do some action during rule parsing.
    //                       Called by the parse state machine.
    //                       Actions build the parse tree and Unicode Sets,
    //                       and maintain the parse stack for nested expressions.
    //
    // ----------------------------------------------------------------------------------------
    boolean doParseActions(int action) {
        RBBINode n = null;

        boolean returnVal = true;

        switch (action) {
            case RBBIRuleParseTable.doExprStart:
                pushNewNode(RBBINode.opStart);
                fRuleNum++;
                break;

            case RBBIRuleParseTable.doNoChain:
                // Scanned a '^' while on the rule start state.
                fNoChainInRule = true;
                break;

            case RBBIRuleParseTable.doExprOrOperator:
                {
                    fixOpStack(RBBINode.precOpCat);
                    RBBINode operandNode = fNodeStack[fNodeStackPtr--];
                    RBBINode orNode = pushNewNode(RBBINode.opOr);
                    orNode.fLeftChild = operandNode;
                    operandNode.fParent = orNode;
                }
                break;

            case RBBIRuleParseTable.doExprCatOperator:
                // concatenation operator.
                // For the implicit concatenation of adjacent terms in an expression
                // that are
                //   not separated by any other operator. Action is invoked between the
                //   actions for the two terms.
                {
                    fixOpStack(RBBINode.precOpCat);
                    RBBINode operandNode = fNodeStack[fNodeStackPtr--];
                    RBBINode catNode = pushNewNode(RBBINode.opCat);
                    catNode.fLeftChild = operandNode;
                    operandNode.fParent = catNode;
                }
                break;

            case RBBIRuleParseTable.doLParen:
                // Open Paren.
                //   The openParen node is a dummy operation type with a low
                // precedence,
                //     which has the affect of ensuring that any real binary op that
                //     follows within the parens binds more tightly to the operands than
                //     stuff outside of the parens.
                pushNewNode(RBBINode.opLParen);
                break;

            case RBBIRuleParseTable.doExprRParen:
                fixOpStack(RBBINode.precLParen);
                break;

            case RBBIRuleParseTable.doNOP:
                break;

            case RBBIRuleParseTable.doStartAssign:
                // We've just scanned "$variable = "
                // The top of the node stack has the $variable ref node.

                // Save the start position of the RHS text in the StartExpression
                // node
                //   that precedes the $variableReference node on the stack.
                //   This will eventually be used when saving the full $variable
                // replacement
                //   text as a string.
                n = fNodeStack[fNodeStackPtr - 1];
                n.fFirstPos = fNextIndex; // move past the '='

                // Push a new start-of-expression node; needed to keep parse of the
                //   RHS expression happy.
                pushNewNode(RBBINode.opStart);
                break;

            case RBBIRuleParseTable.doEndAssign:
                {
                    // We have reached the end of an assignment statement.
                    //   Current scan char is the ';' that terminates the assignment.

                    // Terminate expression, leaves expression parse tree rooted in TOS
                    // node.
                    fixOpStack(RBBINode.precStart);

                    RBBINode startExprNode = fNodeStack[fNodeStackPtr - 2];
                    RBBINode varRefNode = fNodeStack[fNodeStackPtr - 1];
                    RBBINode RHSExprNode = fNodeStack[fNodeStackPtr];

                    // Save original text of right side of assignment, excluding the
                    // terminating ';'
                    //  in the root of the node for the right-hand-side expression.
                    RHSExprNode.fFirstPos = startExprNode.fFirstPos;
                    RHSExprNode.fLastPos = fScanIndex;
                    // fRB.fRules.extractBetween(RHSExprNode.fFirstPos,
                    // RHSExprNode.fLastPos, RHSExprNode.fText);
                    RHSExprNode.fText =
                            fRB.fRules.substring(RHSExprNode.fFirstPos, RHSExprNode.fLastPos);

                    // Expression parse tree becomes l. child of the $variable reference
                    // node.
                    varRefNode.fLeftChild = RHSExprNode;
                    RHSExprNode.fParent = varRefNode;

                    // Make a symbol table entry for the $variableRef node.
                    fSymbolTable.addEntry(varRefNode.fText, varRefNode);

                    // Clean up the stack.
                    fNodeStackPtr -= 3;
                    break;
                }

            case RBBIRuleParseTable.doEndOfRule:
                {
                    fixOpStack(RBBINode.precStart); // Terminate expression, leaves
                    // expression

                    if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("rtree") >= 0) {
                        printNodeStack("end of rule");
                    }
                    Assert.assrt(fNodeStackPtr == 1);
                    RBBINode thisRule = fNodeStack[fNodeStackPtr];

                    // If this rule includes a look-ahead '/', add a endMark node to the
                    //   expression tree.
                    if (fLookAheadRule) {
                        RBBINode endNode = pushNewNode(RBBINode.endMark);
                        RBBINode catNode = pushNewNode(RBBINode.opCat);
                        fNodeStackPtr -= 2;
                        catNode.fLeftChild = thisRule;
                        catNode.fRightChild = endNode;
                        fNodeStack[fNodeStackPtr] = catNode;
                        endNode.fVal = fRuleNum;
                        endNode.fLookAheadEnd = true;
                        thisRule = catNode;

                        // TODO: Disable chaining out of look-ahead (hard break) rules.
                        //   The break on rule match is forced, so there is no point in building up
                        //   the state table to chain into another rule for a longer match.
                    }

                    // Mark this node as being the root of a rule.
                    thisRule.fRuleRoot = true;

                    // Flag if chaining into this rule is wanted.
                    //
                    if (fRB.fChainRules
                            && // If rule chaining is enabled globally via !!chain
                            !fNoChainInRule) { //     and no '^' chain-in inhibit was on this rule
                        thisRule.fChainIn = true;
                    }

                    // All rule expressions are ORed together.
                    // The ';' that terminates an expression really just functions as a
                    // '|' with
                    //   a low operator precedence.
                    //
                    // Each of the four sets of rules are collected separately.
                    //  (forward, reverse, safe_forward, safe_reverse)
                    //  OR this rule into the appropriate group of them.
                    //

                    int destRules =
                            (fReverseRule ? RBBIRuleBuilder.fSafeRevTree : fRB.fDefaultTree);

                    if (fRB.fTreeRoots[destRules] != null) {
                        // This is not the first rule encountered.
                        // OR previous stuff (from *destRules)
                        // with the current rule expression (on the Node Stack)
                        //  with the resulting OR expression going to *destRules
                        //
                        thisRule = fNodeStack[fNodeStackPtr];
                        RBBINode prevRules = fRB.fTreeRoots[destRules];
                        RBBINode orNode = pushNewNode(RBBINode.opOr);
                        orNode.fLeftChild = prevRules;
                        prevRules.fParent = orNode;
                        orNode.fRightChild = thisRule;
                        thisRule.fParent = orNode;
                        fRB.fTreeRoots[destRules] = orNode;
                    } else {
                        // This is the first rule encountered (for this direction).
                        // Just move its parse tree from the stack to *destRules.
                        fRB.fTreeRoots[destRules] = fNodeStack[fNodeStackPtr];
                    }
                    fReverseRule = false; // in preparation for the next rule.
                    fLookAheadRule = false;
                    fNoChainInRule = false;
                    fNodeStackPtr = 0;
                }
                break;

            case RBBIRuleParseTable.doRuleError:
                error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
                returnVal = false;
                break;

            case RBBIRuleParseTable.doVariableNameExpectedErr:
                error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
                break;

            //
            //  Unary operands + ? *
            //    These all appear after the operand to which they apply.
            //    When we hit one, the operand (may be a whole sub expression)
            //    will be on the top of the stack.
            //    Unary Operator becomes TOS, with the old TOS as its one child.
            case RBBIRuleParseTable.doUnaryOpPlus:
                {
                    RBBINode operandNode = fNodeStack[fNodeStackPtr--];
                    RBBINode plusNode = pushNewNode(RBBINode.opPlus);
                    plusNode.fLeftChild = operandNode;
                    operandNode.fParent = plusNode;
                }
                break;

            case RBBIRuleParseTable.doUnaryOpQuestion:
                {
                    RBBINode operandNode = fNodeStack[fNodeStackPtr--];
                    RBBINode qNode = pushNewNode(RBBINode.opQuestion);
                    qNode.fLeftChild = operandNode;
                    operandNode.fParent = qNode;
                }
                break;

            case RBBIRuleParseTable.doUnaryOpStar:
                {
                    RBBINode operandNode = fNodeStack[fNodeStackPtr--];
                    RBBINode starNode = pushNewNode(RBBINode.opStar);
                    starNode.fLeftChild = operandNode;
                    operandNode.fParent = starNode;
                }
                break;

            case RBBIRuleParseTable.doRuleChar:
                // A "Rule Character" is any single character that is a literal part
                // of the regular expression. Like a, b and c in the expression "(abc*)
                // | [:L:]"
                // These are pretty uncommon in break rules; the terms are more commonly
                //  sets. To keep things uniform, treat these characters like as
                // sets that just happen to contain only one character.
                {
                    n = pushNewNode(RBBINode.setRef);
                    String s = String.valueOf((char) fC.fChar);
                    findSetFor(s, n, null);
                    n.fFirstPos = fScanIndex;
                    n.fLastPos = fNextIndex;
                    n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
                    break;
                }

            case RBBIRuleParseTable.doDotAny:
                // scanned a ".", meaning match any single character.
                {
                    n = pushNewNode(RBBINode.setRef);
                    findSetFor(kAny, n, null);
                    n.fFirstPos = fScanIndex;
                    n.fLastPos = fNextIndex;
                    n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
                    break;
                }

            case RBBIRuleParseTable.doSlash:
                // Scanned a '/', which identifies a look-ahead break position in a
                // rule.
                n = pushNewNode(RBBINode.lookAhead);
                n.fVal = fRuleNum;
                n.fFirstPos = fScanIndex;
                n.fLastPos = fNextIndex;
                n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
                fLookAheadRule = true;
                break;

            case RBBIRuleParseTable.doStartTagValue:
                // Scanned a '{', the opening delimiter for a tag value within a
                // rule.
                n = pushNewNode(RBBINode.tag);
                n.fVal = 0;
                n.fFirstPos = fScanIndex;
                n.fLastPos = fNextIndex;
                break;

            case RBBIRuleParseTable.doTagDigit:
                // Just scanned a decimal digit that's part of a tag value
                {
                    n = fNodeStack[fNodeStackPtr];
                    int v = UCharacter.digit((char) fC.fChar, 10);
                    long update = (long) (n.fVal) * 10 + v;
                    if (update > Integer.MAX_VALUE) {
                        error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
                    }
                    n.fVal = (int) (update);
                    break;
                }

            case RBBIRuleParseTable.doTagValue:
                n = fNodeStack[fNodeStackPtr];
                n.fLastPos = fNextIndex;
                n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
                break;

            case RBBIRuleParseTable.doTagExpectedError:
                error(RBBIRuleBuilder.U_BRK_MALFORMED_RULE_TAG);
                returnVal = false;
                break;

            case RBBIRuleParseTable.doOptionStart:
                // Scanning a !!option. At the start of string.
                fOptionStart = fScanIndex;
                break;

            case RBBIRuleParseTable.doOptionEnd:
                {
                    String opt = fRB.fRules.substring(fOptionStart, fScanIndex);
                    if (opt.equals("chain")) {
                        fRB.fChainRules = true;
                    } else if (opt.equals("forward")) {
                        fRB.fDefaultTree = RBBIRuleBuilder.fForwardTree;
                    } else if (opt.equals("reverse")) {
                        fRB.fDefaultTree = RBBIRuleBuilder.fReverseTree;
                    } else if (opt.equals("safe_forward")) {
                        fRB.fDefaultTree = RBBIRuleBuilder.fSafeFwdTree;
                    } else if (opt.equals("safe_reverse")) {
                        fRB.fDefaultTree = RBBIRuleBuilder.fSafeRevTree;
                    } else if (opt.equals("lookAheadHardBreak")) {
                        fRB.fLookAheadHardBreak = true;
                    } else if (opt.equals("quoted_literals_only")) {
                        fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].clear();
                    } else if (opt.equals("unquoted_literals")) {
                        fRuleSets[RBBIRuleParseTable.kRuleSet_rule_char - 128].applyPattern(
                                gRuleSet_rule_char_pattern);
                    } else {
                        error(RBBIRuleBuilder.U_BRK_UNRECOGNIZED_OPTION);
                    }
                    break;
                }

            case RBBIRuleParseTable.doReverseDir:
                fReverseRule = true;
                break;

            case RBBIRuleParseTable.doStartVariableName:
                n = pushNewNode(RBBINode.varRef);
                n.fFirstPos = fScanIndex;
                break;

            case RBBIRuleParseTable.doEndVariableName:
                n = fNodeStack[fNodeStackPtr];
                if (n == null || n.fType != RBBINode.varRef) {
                    error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
                    break;
                }
                n.fLastPos = fScanIndex;
                n.fText = fRB.fRules.substring(n.fFirstPos + 1, n.fLastPos);
                // Look the newly scanned name up in the symbol table
                //   If there's an entry, set the l. child of the var ref to the
                // replacement expression.
                //   (We also pass through here when scanning assignments, but no harm
                // is done, other
                //    than a slight wasted effort that seems hard to avoid. Lookup will
                // be null)
                n.fLeftChild = fSymbolTable.lookupNode(n.fText);
                break;

            case RBBIRuleParseTable.doCheckVarDef:
                n = fNodeStack[fNodeStackPtr];
                if (n.fLeftChild == null) {
                    error(RBBIRuleBuilder.U_BRK_UNDEFINED_VARIABLE);
                    returnVal = false;
                }
                break;

            case RBBIRuleParseTable.doExprFinished:
                break;

            case RBBIRuleParseTable.doRuleErrorAssignExpr:
                error(RBBIRuleBuilder.U_BRK_ASSIGN_ERROR);
                returnVal = false;
                break;

            case RBBIRuleParseTable.doExit:
                returnVal = false;
                break;

            case RBBIRuleParseTable.doScanUnicodeSet:
                scanSet();
                break;

            default:
                error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
                returnVal = false;
                break;
        }
        return returnVal;
    }

    // ----------------------------------------------------------------------------------------
    //
    //  Error Throw and IllegalArgumentException in response to a rule parse
    // error.
    //
    // ----------------------------------------------------------------------------------------
    void error(int e) {
        String s = "Error " + e + " at line " + fLineNum + " column " + fCharNum;
        IllegalArgumentException ex = new IllegalArgumentException(s);
        throw ex;
    }

    // ----------------------------------------------------------------------------------------
    //
    //  fixOpStack The parse stack holds partially assembled chunks of the parse
    // tree.
    //               An entry on the stack may be as small as a single setRef node,
    //               or as large as the parse tree
    //               for an entire expression (this will be the one item left on the stack
    //               when the parsing of an RBBI rule completes.
    //
    //               This function is called when a binary operator is encountered.
    //               It looks back up the stack for operators that are not yet associated
    //               with a right operand, and if the precedence of the stacked operator >=
    //               the precedence of the current operator, binds the operand left,
    //               to the previously encountered operator.
    //
    // ----------------------------------------------------------------------------------------
    void fixOpStack(int p) {
        RBBINode n;
        // printNodeStack("entering fixOpStack()");
        for (; ; ) {
            n = fNodeStack[fNodeStackPtr - 1]; // an operator node
            if (n.fPrecedence == 0) {
                System.out.print("RBBIRuleScanner.fixOpStack, bad operator node");
                error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
                return;
            }

            if (n.fPrecedence < p || n.fPrecedence <= RBBINode.precLParen) {
                // The most recent operand goes with the current operator,
                //   not with the previously stacked one.
                break;
            }
            // Stack operator is a binary op ( '|' or concatenation)
            //   TOS operand becomes right child of this operator.
            //   Resulting subexpression becomes the TOS operand.
            n.fRightChild = fNodeStack[fNodeStackPtr];
            fNodeStack[fNodeStackPtr].fParent = n;
            fNodeStackPtr--;
            // printNodeStack("looping in fixOpStack() ");
        }

        if (p <= RBBINode.precLParen) {
            // Scan is at a right paren or end of expression.
            //  The scanned item must match the stack, or else there was an
            // error.
            //  Discard the left paren (or start expr) node from the stack,
            //  leaving the completed (sub)expression as TOS.
            if (n.fPrecedence != p) {
                // Right paren encountered matched start of expression node, or
                // end of expression matched with a left paren node.
                error(RBBIRuleBuilder.U_BRK_MISMATCHED_PAREN);
            }
            fNodeStack[fNodeStackPtr - 1] = fNodeStack[fNodeStackPtr];
            fNodeStackPtr--;
            // Delete the now-discarded LParen or Start node.
            // delete n;
        }
        // printNodeStack("leaving fixOpStack()");
    }

    // ----------------------------------------------------------------------------
    //
    //       RBBISetTableEl is an entry in the hash table of UnicodeSets that have
    //                        been encountered. The val Node will be of nodetype uset
    //                        and contain pointers to the actual UnicodeSets.
    //                        The Key is the source string for initializing the set.
    //
    //                        The hash table is used to avoid creating duplicate
    //                        unnamed (not $var references) UnicodeSets.
    //
    // ----------------------------------------------------------------------------
    static class RBBISetTableEl {
        String key;

        RBBINode val;
    }

    // ----------------------------------------------------------------------------------------
    //
    //   findSetFor given a String,
    //                  - find the corresponding Unicode Set (uset node)
    //                         (create one if necessary)
    //                  - Set fLeftChild of the caller's node (should be a setRef node)
    //                         to the uset node
    //                 Maintain a hash table of uset nodes, so the same one is always used
    //                    for the same string.
    //                 If a "to adopt" set is provided and we haven't seen this key before,
    //                    add the provided set to the hash table.
    //                 If the string is one (32 bit) char in length, the set contains
    //                    just one element which is the char in question.
    //                 If the string is "any", return a set containing all chars.
    //
    // ----------------------------------------------------------------------------------------
    void findSetFor(String s, RBBINode node, UnicodeSet setToAdopt) {

        RBBISetTableEl el;

        // First check whether we've already cached a set for this string.
        // If so, just use the cached set in the new node.
        //   delete any set provided by the caller, since we own it.
        el = fSetTable.get(s);
        if (el != null) {
            node.fLeftChild = el.val;
            Assert.assrt(node.fLeftChild.fType == RBBINode.uset);
            return;
        }

        // Haven't seen this set before.
        // If the caller didn't provide us with a prebuilt set,
        //   create a new UnicodeSet now.
        if (setToAdopt == null) {
            if (s.equals(kAny)) {
                setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
            } else {
                int c;
                c = UTF16.charAt(s, 0);
                setToAdopt = new UnicodeSet(c, c);
            }
        }

        //
        // Make a new uset node to refer to this UnicodeSet
        // This new uset node becomes the child of the caller's setReference
        // node.
        //
        RBBINode usetNode = new RBBINode(RBBINode.uset);
        usetNode.fInputSet = setToAdopt;
        usetNode.fParent = node;
        node.fLeftChild = usetNode;
        usetNode.fText = s;

        //
        // Add the new uset node to the list of all uset nodes.
        //
        fRB.fUSetNodes.add(usetNode);

        //
        // Add the new set to the set hash table.
        //
        el = new RBBISetTableEl();
        el.key = s;
        el.val = usetNode;
        fSetTable.put(el.key, el);

        return;
    }

    //
    //  Assorted Unicode character constants.
    //     Numeric because there is no portable way to enter them as literals.
    //     (Think EBCDIC).
    //
    static final int chNEL = 0x85; //    NEL newline variant

    static final int chLS = 0x2028; //    Unicode Line Separator

    // ----------------------------------------------------------------------------------------
    //
    //  stripRules    Return a rules string without unnecessary
    //                characters.
    //
    // ----------------------------------------------------------------------------------------
    static String stripRules(String rules) {
        StringBuilder strippedRules = new StringBuilder();
        int rulesLength = rules.length();

        for (int idx = 0; idx < rulesLength; idx = rules.offsetByCodePoints(idx, 1)) {
            int cp = rules.codePointAt(idx);
            boolean whiteSpace = UCharacter.hasBinaryProperty(cp, UProperty.PATTERN_WHITE_SPACE);
            if (whiteSpace) {
                continue;
            }
            strippedRules.appendCodePoint(cp);
        }
        return strippedRules.toString();
    }

    // ----------------------------------------------------------------------------------------
    //
    //  nextCharLL    Low Level Next Char from rule input source.
    //                Get a char from the input character iterator,
    //                keep track of input position for error reporting.
    //
    // ----------------------------------------------------------------------------------------
    int nextCharLL() {
        int ch;

        if (fNextIndex >= fRB.fRules.length()) {
            return -1;
        }
        ch = UTF16.charAt(fRB.fRules, fNextIndex);
        if (Character.isBmpCodePoint(ch) && Character.isSurrogate((char) ch)) {
            error(RBBIRuleBuilder.U_ILLEGAL_CHAR_FOUND);
        }
        fNextIndex = UTF16.moveCodePointOffset(fRB.fRules, fNextIndex, 1);

        if (ch == '\r' || ch == chNEL || ch == chLS || ch == '\n' && fLastChar != '\r') {
            // Character is starting a new line.  Bump up the line number, and
            //  reset the column to 0.
            fLineNum++;
            fCharNum = 0;
            if (fQuoteMode) {
                error(RBBIRuleBuilder.U_BRK_NEW_LINE_IN_QUOTED_STRING);
                fQuoteMode = false;
            }
        } else {
            // Character is not starting a new line.  Except in the case of a
            //   LF following a CR, increment the column position.
            if (ch != '\n') {
                fCharNum++;
            }
        }
        fLastChar = ch;
        return ch;
    }

    // ---------------------------------------------------------------------------------
    //
    //   nextChar     for rules scanning.  At this level, we handle stripping
    //                out comments and processing backslash character escapes.
    //                The rest of the rules grammar is handled at the next level up.
    //
    // ---------------------------------------------------------------------------------
    void nextChar(RBBIRuleChar c) {

        // Unicode Character constants needed for the processing done by nextChar(),
        //   in hex because literals wont work on EBCDIC machines.

        fScanIndex = fNextIndex;
        c.fChar = nextCharLL();
        c.fEscaped = false;

        //
        //  check for '' sequence.
        //  These are recognized in all contexts, whether in quoted text or not.
        //
        if (c.fChar == '\'') {
            if (fNextIndex < fRB.fRules.length() && UTF16.charAt(fRB.fRules, fNextIndex) == '\'') {
                c.fChar = nextCharLL(); // get nextChar officially so character counts
                c.fEscaped = true; //   stay correct.
            } else {
                // Single quote, by itself.
                //   Toggle quoting mode.
                //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
                fQuoteMode = !fQuoteMode;
                if (fQuoteMode == true) {
                    c.fChar = '(';
                } else {
                    c.fChar = ')';
                }
                c.fEscaped = false; // The paren that we return is not escaped.
                return;
            }
        }
        if (c.fChar == -1) {
            return;
        }

        if (fQuoteMode) {
            c.fEscaped = true;
        } else {
            // We are not in a 'quoted region' of the source.
            //
            if (c.fChar == '#') {
                // Start of a comment.  Consume the rest of it.
                //  The new-line char that terminates the comment is always returned.
                //  It will be treated as white-space, and serves to break up anything
                //    that might otherwise incorrectly clump together with a comment in
                //    the middle (a variable name, for example.)
                int commentStart = fScanIndex;
                for (; ; ) {
                    c.fChar = nextCharLL();
                    if (c.fChar == -1
                            || // EOF
                            c.fChar == '\r'
                            || c.fChar == '\n'
                            || c.fChar == chNEL
                            || c.fChar == chLS) {
                        break;
                    }
                }
                for (int i = commentStart; i < fNextIndex - 1; ++i) {
                    fRB.fStrippedRules.setCharAt(i, ' ');
                }
            }
            if (c.fChar == -1) {
                return;
            }

            //
            //  check for backslash escaped characters.
            //
            if (c.fChar == '\\') {
                c.fEscaped = true;
                int cpAndLength = Utility.unescapeAndLengthAt(fRB.fRules, fNextIndex);
                if (cpAndLength < 0) {
                    error(RBBIRuleBuilder.U_BRK_HEX_DIGITS_EXPECTED);
                }
                c.fChar = Utility.cpFromCodePointAndLength(cpAndLength);
                int length = Utility.lengthFromCodePointAndLength(cpAndLength);

                fCharNum += length;
                fNextIndex += length;
            }
        }
        // putc(c.fChar, stdout);
    }

    // ---------------------------------------------------------------------------------
    //
    //  Parse RBBI rules.   The state machine for rules parsing is here.
    //                      The state tables are hand-written in the file rbbirpt.txt,
    //                      and converted to the form used here by a perl
    //                      script rbbicst.pl
    //
    // ---------------------------------------------------------------------------------
    void parse() {
        int state;
        RBBIRuleParseTable.RBBIRuleTableElement tableEl;

        state = 1;
        nextChar(fC);
        //
        // Main loop for the rule parsing state machine.
        //   Runs once per state transition.
        //   Each time through optionally performs, depending on the state table,
        //      - an advance to the next input char
        //      - an action to be performed.
        //      - pushing or popping a state to/from the local state return stack.
        //
        for (; ; ) {
            // Quit if state == 0.  This is the normal way to exit the state machine.
            //
            if (state == 0) {
                break;
            }

            // Find the state table element that matches the input char from the rule, or the
            //    class of the input character.  Start with the first table row for this
            //    state, then linearly scan forward until we find a row that matches the
            //    character.  The last row for each state always matches all characters, so
            //    the search will stop there, if not before.
            //
            tableEl = RBBIRuleParseTable.gRuleParseStateTable[state];
            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
                System.out.println(
                        "char, line, col = (\'"
                                + (char) fC.fChar
                                + "\', "
                                + fLineNum
                                + ", "
                                + fCharNum
                                + "    state = "
                                + tableEl.fStateName);
            }

            for (int tableRow = state;
                    ;
                    tableRow++) { // loop over the state table rows associated with this state.
                tableEl = RBBIRuleParseTable.gRuleParseStateTable[tableRow];
                if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
                    System.out.print(".");
                }
                if (tableEl.fCharClass < 127
                        && fC.fEscaped == false
                        && tableEl.fCharClass == fC.fChar) {
                    // Table row specified an individual character, not a set, and
                    //   the input character is not escaped, and
                    //   the input character matched it.
                    break;
                }
                if (tableEl.fCharClass == 255) {
                    // Table row specified default, match anything character class.
                    break;
                }
                if (tableEl.fCharClass == 254 && fC.fEscaped) {
                    // Table row specified "escaped" and the char was escaped.
                    break;
                }
                if (tableEl.fCharClass == 253
                        && fC.fEscaped
                        && (fC.fChar == 0x50 || fC.fChar == 0x70)) {
                    // Table row specified "escaped P" and the char is either 'p' or 'P'.
                    break;
                }
                if (tableEl.fCharClass == 252 && fC.fChar == -1) {
                    // Table row specified eof and we hit eof on the input.
                    break;
                }

                if (tableEl.fCharClass >= 128
                        && tableEl.fCharClass < 240
                        && // Table specs a char class &&
                        fC.fEscaped == false
                        && //   char is not escaped &&
                        fC.fChar != -1) { //   char is not EOF
                    UnicodeSet uniset = fRuleSets[tableEl.fCharClass - 128];
                    if (uniset.contains(fC.fChar)) {
                        // Table row specified a character class, or set of characters,
                        //   and the current char matches it.
                        break;
                    }
                }
            }

            if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("scan") >= 0) {
                System.out.println("");
            }
            //
            // We've found the row of the state table that matches the current input
            //   character from the rules string.
            // Perform any action specified  by this row in the state table.
            if (doParseActions(tableEl.fAction) == false) {
                // Break out of the state machine loop if the
                //   the action signaled some kind of error, or
                //   the action was to exit, occurs on normal end-of-rules-input.
                break;
            }

            if (tableEl.fPushState != 0) {
                fStackPtr++;
                if (fStackPtr >= kStackSize) {
                    System.out.println("RBBIRuleScanner.parse() - state stack overflow.");
                    error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
                }
                fStack[fStackPtr] = tableEl.fPushState;
            }

            if (tableEl.fNextChar) {
                nextChar(fC);
            }

            // Get the next state from the table entry, or from the
            //   state stack if the next state was specified as "pop".
            if (tableEl.fNextState != 255) {
                state = tableEl.fNextState;
            } else {
                state = fStack[fStackPtr];
                fStackPtr--;
                if (fStackPtr < 0) {
                    System.out.println("RBBIRuleScanner.parse() - state stack underflow.");
                    error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
                }
            }
        }

        // If there are no forward rules throw an error.
        //
        if (fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree] == null) {
            error(RBBIRuleBuilder.U_BRK_RULE_SYNTAX);
        }

        //
        // Parsing of the input RBBI rules is complete.
        // We now have a parse tree for the rule expressions
        // and a list of all UnicodeSets that are referenced.
        //
        if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("symbols") >= 0) {
            fSymbolTable.rbbiSymtablePrint();
        }
        if (fRB.fDebugEnv != null && fRB.fDebugEnv.indexOf("ptree") >= 0) {
            System.out.println("Completed Forward Rules Parse Tree...");
            fRB.fTreeRoots[RBBIRuleBuilder.fForwardTree].printTree(true);
            System.out.println("\nCompleted Reverse Rules Parse Tree...");
            fRB.fTreeRoots[RBBIRuleBuilder.fReverseTree].printTree(true);
            System.out.println("\nCompleted Safe Point Forward Rules Parse Tree...");
            if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree] == null) {
                System.out.println("  -- null -- ");
            } else {
                fRB.fTreeRoots[RBBIRuleBuilder.fSafeFwdTree].printTree(true);
            }
            System.out.println("\nCompleted Safe Point Reverse Rules Parse Tree...");
            if (fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree] == null) {
                System.out.println("  -- null -- ");
            } else {
                fRB.fTreeRoots[RBBIRuleBuilder.fSafeRevTree].printTree(true);
            }
        }
    }

    // ---------------------------------------------------------------------------------
    //
    //  printNodeStack     for debugging...
    //
    // ---------------------------------------------------------------------------------
    /// CLOVER:OFF
    void printNodeStack(String title) {
        int i;
        System.out.println(title + ".  Dumping node stack...\n");
        for (i = fNodeStackPtr; i > 0; i--) {
            fNodeStack[i].printTree(true);
        }
    }

    /// CLOVER:ON

    // ---------------------------------------------------------------------------------
    //
    //  pushNewNode   create a new RBBINode of the specified type and push it
    //                onto the stack of nodes.
    //
    // ---------------------------------------------------------------------------------
    RBBINode pushNewNode(int nodeType) {
        fNodeStackPtr++;
        if (fNodeStackPtr >= kStackSize) {
            System.out.println("RBBIRuleScanner.pushNewNode - stack overflow.");
            error(RBBIRuleBuilder.U_BRK_INTERNAL_ERROR);
        }
        fNodeStack[fNodeStackPtr] = new RBBINode(nodeType);
        return fNodeStack[fNodeStackPtr];
    }

    // ---------------------------------------------------------------------------------
    //
    //  scanSet    Construct a UnicodeSet from the text at the current scan
    //             position.  Advance the scan position to the first character
    //             after the set.
    //
    //             A new RBBI setref node referring to the set is pushed onto the node
    //             stack.
    //
    //             The scan position is normally under the control of the state machine
    //             that controls rule parsing.  UnicodeSets, however, are parsed by
    //             the UnicodeSet constructor, not by the RBBI rule parser.
    //
    // ---------------------------------------------------------------------------------
    void scanSet() {
        UnicodeSet uset = null;
        int startPos;
        ParsePosition pos = new ParsePosition(fScanIndex);
        int i;

        startPos = fScanIndex;
        try {
            uset = new UnicodeSet(fRB.fRules, pos, fSymbolTable, UnicodeSet.IGNORE_SPACE);
        } catch (Exception e) { // TODO:  catch fewer exception types.
            // Repackage UnicodeSet errors as RBBI rule builder errors, with location info.
            error(RBBIRuleBuilder.U_BRK_MALFORMED_SET);
        }

        // Verify that the set contains at least one code point.
        //
        // Use tempSet to handle the case that the UnicodeSet contains
        // only string element, such as [{ab}] and treat it as empty set.
        UnicodeSet tempSet = new UnicodeSet(uset);
        tempSet.removeAllStrings();
        if (tempSet.isEmpty()) {
            // This set is empty.
            //  Make it an error, because it almost certainly is not what the user wanted.
            //  Also, avoids having to think about corner cases in the tree manipulation code
            //   that occurs later on.
            //  TODO:  this shouldn't be an error; it does happen.
            error(RBBIRuleBuilder.U_BRK_RULE_EMPTY_SET);
        }

        // Advance the RBBI parse position over the UnicodeSet pattern.
        //   Don't just set fScanIndex because the line/char positions maintained
        //   for error reporting would be thrown off.
        i = pos.getIndex();
        for (; ; ) {
            if (fNextIndex >= i) {
                break;
            }
            nextCharLL();
        }

        RBBINode n;

        n = pushNewNode(RBBINode.setRef);
        n.fFirstPos = startPos;
        n.fLastPos = fNextIndex;
        n.fText = fRB.fRules.substring(n.fFirstPos, n.fLastPos);
        //  findSetFor() serves several purposes here:
        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
        //     - Maintains collection of all sets in use, needed later for establishing
        //          character categories for run time engine.
        //     - Eliminates multiple instances of the same set.
        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
        findSetFor(n.fText, n, uset);
    }

    /**
     * @return the number of rules that have been seen.
     */
    int numRules() {
        return fRuleNum;
    }
}