diff options
Diffstat (limited to 'libjava/classpath/gnu/regexp/RE.java')
| -rw-r--r-- | libjava/classpath/gnu/regexp/RE.java | 778 |
1 files changed, 656 insertions, 122 deletions
diff --git a/libjava/classpath/gnu/regexp/RE.java b/libjava/classpath/gnu/regexp/RE.java index 9ac9b53d1a9..ef606a6d8a7 100644 --- a/libjava/classpath/gnu/regexp/RE.java +++ b/libjava/classpath/gnu/regexp/RE.java @@ -1,5 +1,5 @@ /* gnu/regexp/RE.java - Copyright (C) 1998-2001, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -136,12 +136,13 @@ public class RE extends REToken { /** Minimum length, in characters, of any possible match. */ private int minimumLength; + private int maximumLength; /** * Compilation flag. Do not differentiate case. Subsequent * searches using this RE will be case insensitive. */ - public static final int REG_ICASE = 2; + public static final int REG_ICASE = 0x02; /** * Compilation flag. The match-any-character operator (dot) @@ -149,14 +150,14 @@ public class RE extends REToken { * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to * the "/s" operator in Perl. */ - public static final int REG_DOT_NEWLINE = 4; + public static final int REG_DOT_NEWLINE = 0x04; /** * Compilation flag. Use multiline mode. In this mode, the ^ and $ * anchors will match based on newlines within the input. This is * equivalent to the "/m" operator in Perl. */ - public static final int REG_MULTILINE = 8; + public static final int REG_MULTILINE = 0x08; /** * Execution flag. @@ -185,14 +186,14 @@ public class RE extends REToken { * // m4.toString(): "fool"<BR> * </CODE> */ - public static final int REG_NOTBOL = 16; + public static final int REG_NOTBOL = 0x10; /** * Execution flag. * The match-end operator ($) does not match at the end * of the input string. Useful for matching on substrings. */ - public static final int REG_NOTEOL = 32; + public static final int REG_NOTEOL = 0x20; /** * Execution flag. @@ -206,7 +207,7 @@ public class RE extends REToken { * the example under REG_NOTBOL. It also affects the use of the \< * and \b operators. 
*/ - public static final int REG_ANCHORINDEX = 64; + public static final int REG_ANCHORINDEX = 0x40; /** * Execution flag. @@ -215,7 +216,24 @@ public class RE extends REToken { * the corresponding subexpressions. For example, you may want to * replace all matches of "one dollar" with "$1". */ - public static final int REG_NO_INTERPOLATE = 128; + public static final int REG_NO_INTERPOLATE = 0x80; + + /** + * Execution flag. + * Try to match the whole input string. An implicit match-end operator + * is added to this regexp. + */ + public static final int REG_TRY_ENTIRE_MATCH = 0x0100; + + /** + * Execution flag. + * The substitute and substituteAll methods will treat the + * character '\' in the replacement as an escape to a literal + * character. In this case "\n", "\$", "\\", "\x40" and "\012" + * will become "n", "$", "\", "x40" and "012" respectively. + * This flag has no effect if REG_NO_INTERPOLATE is set on. + */ + public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200; /** Returns a string representing the version of the gnu.regexp package. */ public static final String version() { @@ -273,12 +291,13 @@ public class RE extends REToken { } // internal constructor used for alternation - private RE(REToken first, REToken last,int subs, int subIndex, int minLength) { + private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) { super(subIndex); firstToken = first; lastToken = last; numSubs = subs; minimumLength = minLength; + maximumLength = maxLength; addToken(new RETokenEndSub(subIndex)); } @@ -333,6 +352,11 @@ public class RE extends REToken { char ch; boolean quot = false; + // Saved syntax and flags. 
+ RESyntax savedSyntax = null; + int savedCflags = 0; + boolean flagsSaved = false; + while (index < pLength) { // read the next character unit (including backslash escapes) index = getCharUnit(pattern,index,unit,quot); @@ -359,8 +383,9 @@ public class RE extends REToken { && !syntax.get(RESyntax.RE_LIMITED_OPS)) { // make everything up to here be a branch. create vector if nec. addToken(currentToken); - RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength); + RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength); minimumLength = 0; + maximumLength = 0; if (branches == null) { branches = new Vector(); } @@ -402,102 +427,12 @@ public class RE extends REToken { // [...] | [^...] else if ((unit.ch == '[') && !(unit.bk || quot)) { - Vector options = new Vector(); - boolean negative = false; - char lastChar = 0; - if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index); - - // Check for initial caret, negation - if ((ch = pattern[index]) == '^') { - negative = true; - if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); - ch = pattern[index]; - } - - // Check for leading right bracket literal - if (ch == ']') { - lastChar = ch; - if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); - } - - while ((ch = pattern[index++]) != ']') { - if ((ch == '-') && (lastChar != 0)) { - if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); - if ((ch = pattern[index]) == ']') { - options.addElement(new RETokenChar(subIndex,lastChar,insens)); - lastChar = '-'; - } else { - options.addElement(new RETokenRange(subIndex,lastChar,ch,insens)); - lastChar = 0; - index++; - } - } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) { - if (index == pLength) throw new 
REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); - int posixID = -1; - boolean negate = false; - char asciiEsc = 0; - if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) { - switch (pattern[index]) { - case 'D': - negate = true; - case 'd': - posixID = RETokenPOSIX.DIGIT; - break; - case 'S': - negate = true; - case 's': - posixID = RETokenPOSIX.SPACE; - break; - case 'W': - negate = true; - case 'w': - posixID = RETokenPOSIX.ALNUM; - break; - } - } - else if ("nrt".indexOf(pattern[index]) != -1) { - switch (pattern[index]) { - case 'n': - asciiEsc = '\n'; - break; - case 't': - asciiEsc = '\t'; - break; - case 'r': - asciiEsc = '\r'; - break; - } - } - if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); - - if (posixID != -1) { - options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate)); - } else if (asciiEsc != 0) { - lastChar = asciiEsc; - } else { - lastChar = pattern[index]; - } - ++index; - } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) { - StringBuffer posixSet = new StringBuffer(); - index = getPosixSet(pattern,index+1,posixSet); - int posixId = RETokenPOSIX.intValue(posixSet.toString()); - if (posixId != -1) - options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false)); - } else { - if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); - lastChar = ch; - } - if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); - } // while in list - // Out of list, index is one past ']' - - if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); - // Create a new RETokenOneOf + ParseCharClassResult result = parseCharClass( + subIndex, pattern, index, pLength, cflags, syntax, 0); addToken(currentToken); - options.trimToSize(); - currentToken = new 
RETokenOneOf(subIndex,options,negative); + currentToken = result.token; + index = result.index; } // SUBEXPRESSIONS @@ -507,7 +442,10 @@ public class RE extends REToken { boolean pure = false; boolean comment = false; boolean lookAhead = false; + boolean lookBehind = false; + boolean independent = false; boolean negativelh = false; + boolean negativelb = false; if ((index+1 < pLength) && (pattern[index] == '?')) { switch (pattern[index+1]) { case '!': @@ -525,6 +463,114 @@ public class RE extends REToken { index += 2; } break; + case '<': + // We assume that if the syntax supports look-ahead, + // it also supports look-behind. + if (syntax.get(RESyntax.RE_LOOKAHEAD)) { + index++; + switch (pattern[index +1]) { + case '!': + pure = true; + negativelb = true; + lookBehind = true; + index += 2; + break; + case '=': + pure = true; + lookBehind = true; + index += 2; + } + } + break; + case '>': + // We assume that if the syntax supports look-ahead, + // it also supports independent group. + if (syntax.get(RESyntax.RE_LOOKAHEAD)) { + pure = true; + independent = true; + index += 2; + } + break; + case 'i': + case 'd': + case 'm': + case 's': + // case 'u': not supported + // case 'x': not supported + case '-': + if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break; + // Set or reset syntax flags. 
+ int flagIndex = index + 1; + int endFlag = -1; + RESyntax newSyntax = new RESyntax(syntax); + int newCflags = cflags; + boolean negate = false; + while (flagIndex < pLength && endFlag < 0) { + switch(pattern[flagIndex]) { + case 'i': + if (negate) + newCflags &= ~REG_ICASE; + else + newCflags |= REG_ICASE; + flagIndex++; + break; + case 'd': + if (negate) + newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR); + else + newSyntax.setLineSeparator("\n"); + flagIndex++; + break; + case 'm': + if (negate) + newCflags &= ~REG_MULTILINE; + else + newCflags |= REG_MULTILINE; + flagIndex++; + break; + case 's': + if (negate) + newCflags &= ~REG_DOT_NEWLINE; + else + newCflags |= REG_DOT_NEWLINE; + flagIndex++; + break; + // case 'u': not supported + // case 'x': not supported + case '-': + negate = true; + flagIndex++; + break; + case ':': + case ')': + endFlag = pattern[flagIndex]; + break; + default: + throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index); + } + } + if (endFlag == ')') { + syntax = newSyntax; + cflags = newCflags; + insens = ((cflags & REG_ICASE) > 0); + // This can be treated as though it were a comment. + comment = true; + index = flagIndex - 1; + break; + } + if (endFlag == ':') { + savedSyntax = syntax; + savedCflags = cflags; + flagsSaved = true; + syntax = newSyntax; + cflags = newCflags; + insens = ((cflags & REG_ICASE) > 0); + index = flagIndex -1; + // Fall through to the next case. + } + else { + throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index); + } case ':': if (syntax.get(RESyntax.RE_PURE_GROUPING)) { pure = true; @@ -607,15 +653,28 @@ public class RE extends REToken { numSubs++; } - int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs; + int useIndex = (pure || lookAhead || lookBehind || independent) ? 
+ 0 : nextSub + numSubs; currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs); numSubs += ((RE) currentToken).getNumSubs(); if (lookAhead) { currentToken = new RETokenLookAhead(currentToken,negativelh); } + else if (lookBehind) { + currentToken = new RETokenLookBehind(currentToken,negativelb); + } + else if (independent) { + currentToken = new RETokenIndependent(currentToken); + } index = nextIndex; + if (flagsSaved) { + syntax = savedSyntax; + cflags = savedCflags; + insens = ((cflags & REG_ICASE) > 0); + flagsSaved = false; + } } // not a comment } // subexpression @@ -715,14 +774,45 @@ public class RE extends REToken { else currentToken = setRepeated(currentToken,0,1,index); } + + // OCTAL CHARACTER + // \0377 + else if (unit.bk && (unit.ch == '0') && syntax.get(RESyntax.RE_OCTAL_CHAR)) { + CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax); + if (ce == null) + throw new REException("invalid octal character", REException.REG_ESCAPE, index); + index = index - 2 + ce.len; + addToken(currentToken); + currentToken = new RETokenChar(subIndex,ce.ch,insens); + } + // BACKREFERENCE OPERATOR - // \1 \2 ... \9 + // \1 \2 ... \9 and \10 \11 \12 ... // not available if RE_NO_BK_REFS is set + // Perl recognizes \10, \11, and so on only if enough number of + // parentheses have opened before it, otherwise they are treated + // as aliases of \010, \011, ... (octal characters). In case of + // Sun's JDK, octal character expression must always begin with \0. + // We will do as JDK does. But FIXME, take a look at "(a)(b)\29". + // JDK treats \2 as a back reference to the 2nd group because + // there are only two groups. But in our poor implementation, + // we cannot help but treat \29 as a back reference to the 29th group. 
else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) { addToken(currentToken); - currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens); + int numBegin = index - 1; + int numEnd = pLength; + for (int i = index; i < pLength; i++) { + if (! Character.isDigit(pattern[i])) { + numEnd = i; + break; + } + } + int num = parseInt(pattern, numBegin, numEnd-numBegin, 10); + + currentToken = new RETokenBackRef(subIndex,num,insens); + index = numEnd; } // START OF STRING OPERATOR @@ -844,6 +934,32 @@ public class RE extends REToken { currentToken = new RETokenEnd(subIndex,null); } + // HEX CHARACTER, UNICODE CHARACTER + // \x1B, \u1234 + + else if ((unit.bk && (unit.ch == 'x') && syntax.get(RESyntax.RE_HEX_CHAR)) || + (unit.bk && (unit.ch == 'u') && syntax.get(RESyntax.RE_UNICODE_CHAR))) { + CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax); + if (ce == null) + throw new REException("invalid hex character", REException.REG_ESCAPE, index); + index = index - 2 + ce.len; + addToken(currentToken); + currentToken = new RETokenChar(subIndex,ce.ch,insens); + } + + // NAMED PROPERTY + // \p{prop}, \P{prop} + + else if ((unit.bk && (unit.ch == 'p') && syntax.get(RESyntax.RE_NAMED_PROPERTY)) || + (unit.bk && (unit.ch == 'P') && syntax.get(RESyntax.RE_NAMED_PROPERTY))) { + NamedProperty np = getNamedProperty(pattern, index - 2, pLength); + if (np == null) + throw new REException("invalid escape sequence", REException.REG_ESCAPE, index); + index = index - 2 + np.len; + addToken(currentToken); + currentToken = getRETokenNamedProperty(subIndex,np,insens,index); + } + // NON-SPECIAL CHARACTER (or escape to make literal) // c | \* for example @@ -857,9 +973,10 @@ public class RE extends REToken { addToken(currentToken); if (branches != null) { - branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength)); + branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, 
maximumLength)); branches.trimToSize(); // compact the Vector minimumLength = 0; + maximumLength = 0; firstToken = lastToken = null; addToken(new RETokenOneOf(subIndex,branches,false)); } @@ -867,6 +984,199 @@ public class RE extends REToken { } + private static class ParseCharClassResult { + RETokenOneOf token; + int index; + boolean returnAtAndOperator = false; + } + + /** + * Parse [...] or [^...] and make an RETokenOneOf instance. + * @param subIndex subIndex to be given to the created RETokenOneOf instance. + * @param pattern Input array of characters to be parsed. + * @param index Index pointing to the character next to the beginning '['. + * @param pLength Limit of the input array. + * @param cflags Compilation flags used to parse the pattern. + * @param pflags Flags that affect the behavior of this method. + * @param syntax Syntax used to parse the pattern. + */ + private static ParseCharClassResult parseCharClass(int subIndex, + char[] pattern, int index, + int pLength, int cflags, RESyntax syntax, int pflags) + throws REException { + + boolean insens = ((cflags & REG_ICASE) > 0); + Vector options = new Vector(); + Vector addition = new Vector(); + boolean additionAndAppeared = false; + final int RETURN_AT_AND = 0x01; + boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0); + boolean negative = false; + char ch; + + char lastChar = 0; + boolean lastCharIsSet = false; + if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index); + + // Check for initial caret, negation + if ((ch = pattern[index]) == '^') { + negative = true; + if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); + ch = pattern[index]; + } + + // Check for leading right bracket literal + if (ch == ']') { + lastChar = ch; lastCharIsSet = true; + if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); + } + + while 
((ch = pattern[index++]) != ']') { + if ((ch == '-') && (lastCharIsSet)) { + if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); + if ((ch = pattern[index]) == ']') { + options.addElement(new RETokenChar(subIndex,lastChar,insens)); + lastChar = '-'; + } else { + if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) { + CharExpression ce = getCharExpression(pattern, index, pLength, syntax); + if (ce == null) + throw new REException("invalid escape sequence", REException.REG_ESCAPE, index); + ch = ce.ch; + index = index + ce.len - 1; + } + options.addElement(new RETokenRange(subIndex,lastChar,ch,insens)); + lastChar = 0; lastCharIsSet = false; + index++; + } + } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) { + if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); + int posixID = -1; + boolean negate = false; + char asciiEsc = 0; + boolean asciiEscIsSet = false; + NamedProperty np = null; + if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) { + switch (pattern[index]) { + case 'D': + negate = true; + case 'd': + posixID = RETokenPOSIX.DIGIT; + break; + case 'S': + negate = true; + case 's': + posixID = RETokenPOSIX.SPACE; + break; + case 'W': + negate = true; + case 'w': + posixID = RETokenPOSIX.ALNUM; + break; + } + } + if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) { + np = getNamedProperty(pattern, index - 1, pLength); + if (np == null) + throw new REException("invalid escape sequence", REException.REG_ESCAPE, index); + index = index - 1 + np.len - 1; + } + else { + CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax); + if (ce == null) + throw new REException("invalid escape sequence", REException.REG_ESCAPE, index); + asciiEsc = ce.ch; asciiEscIsSet = true; + index = index - 1 + ce.len - 1; + } + if 
(lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens)); + + if (posixID != -1) { + options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate)); + } else if (np != null) { + options.addElement(getRETokenNamedProperty(subIndex,np,insens,index)); + } else if (asciiEscIsSet) { + lastChar = asciiEsc; lastCharIsSet = true; + } else { + lastChar = pattern[index]; lastCharIsSet = true; + } + ++index; + } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) { + StringBuffer posixSet = new StringBuffer(); + index = getPosixSet(pattern,index+1,posixSet); + int posixId = RETokenPOSIX.intValue(posixSet.toString()); + if (posixId != -1) + options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false)); + } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) { + ParseCharClassResult result = parseCharClass( + subIndex, pattern, index, pLength, cflags, syntax, 0); + addition.addElement(result.token); + addition.addElement("|"); + index = result.index; + } else if ((ch == '&') && + (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) && + (index < pLength) && (pattern[index] == '&')) { + if (returnAtAndOperator) { + ParseCharClassResult result = new ParseCharClassResult(); + options.trimToSize(); + if (additionAndAppeared) addition.addElement("&"); + if (addition.size() == 0) addition = null; + result.token = new RETokenOneOf(subIndex, + options, addition, negative); + result.index = index - 1; + result.returnAtAndOperator = true; + return result; + } + // The precedence of the operator "&&" is the lowest. + // So we postpone adding "&" until other elements + // are added. And we insert Boolean.FALSE at the + // beginning of the list of tokens following "&&". 
+ // So, "&&[a-b][k-m]" will be stored in the Vector + // addition in this order: + // Boolean.FALSE, [a-b], "|", [k-m], "|", "&" + if (additionAndAppeared) addition.addElement("&"); + addition.addElement(Boolean.FALSE); + additionAndAppeared = true; + + // The part on which "&&" operates may be either + // (1) explicitly enclosed by [] + // or + // (2) not enclosed by [] and terminated by the + // next "&&" or the end of the character list. + // Let the preceding else if block do the case (1). + // We must do something in case of (2). + if ((index + 1 < pLength) && (pattern[index + 1] != '[')) { + ParseCharClassResult result = parseCharClass( + subIndex, pattern, index+1, pLength, cflags, syntax, + RETURN_AT_AND); + addition.addElement(result.token); + addition.addElement("|"); + // If the method returned at the next "&&", it is OK. + // Otherwise we have eaten the mark of the end of this + // character list "]". In this case we must give back + // the end mark. + index = (result.returnAtAndOperator ? 
+ result.index: result.index - 1); + } + } else { + if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens)); + lastChar = ch; lastCharIsSet = true; + } + if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); + } // while in list + // Out of list, index is one past ']' + + if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens)); + + ParseCharClassResult result = new ParseCharClassResult(); + // Create a new RETokenOneOf + options.trimToSize(); + if (additionAndAppeared) addition.addElement("&"); + if (addition.size() == 0) addition = null; + result.token = new RETokenOneOf(subIndex,options, addition, negative); + result.index = index; + return result; + } + private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException { unit.ch = input[index++]; unit.bk = (unit.ch == '\\' @@ -878,6 +1188,176 @@ public class RE extends REToken { return index; } + private static int parseInt(char[] input, int pos, int len, int radix) { + int ret = 0; + for (int i = pos; i < pos + len; i++) { + ret = ret * radix + Character.digit(input[i], radix); + } + return ret; + } + + /** + * This class represents various expressions for a character. + * "a" : 'a' itself. 
+ * "\0123" : Octal char 0123 + * "\x1b" : Hex char 0x1b + * "\u1234" : Unicode char \u1234 + */ + private static class CharExpression { + /** character represented by this expression */ + char ch; + /** String expression */ + String expr; + /** length of this expression */ + int len; + public String toString() { return expr; } + } + + private static CharExpression getCharExpression(char[] input, int pos, int lim, + RESyntax syntax) { + CharExpression ce = new CharExpression(); + char c = input[pos]; + if (c == '\\') { + if (pos + 1 >= lim) return null; + c = input[pos + 1]; + switch(c) { + case 't': + ce.ch = '\t'; + ce.len = 2; + break; + case 'n': + ce.ch = '\n'; + ce.len = 2; + break; + case 'r': + ce.ch = '\r'; + ce.len = 2; + break; + case 'x': + case 'u': + if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) || + (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) { + int l = 0; + int expectedLength = (c == 'x' ? 2 : 4); + for (int i = pos + 2; i < pos + 2 + expectedLength; i++) { + if (i >= lim) break; + if (!((input[i] >= '0' && input[i] <= '9') || + (input[i] >= 'A' && input[i] <= 'F') || + (input[i] >= 'a' && input[i] <= 'f'))) + break; + l++; + } + if (l != expectedLength) return null; + ce.ch = (char)(parseInt(input, pos + 2, l, 16)); + ce.len = l + 2; + } + else { + ce.ch = c; + ce.len = 2; + } + break; + case '0': + if (syntax.get(RESyntax.RE_OCTAL_CHAR)) { + int l = 0; + for (int i = pos + 2; i < pos + 2 + 3; i++) { + if (i >= lim) break; + if (input[i] < '0' || input[i] > '7') break; + l++; + } + if (l == 3 && input[pos + 2] > '3') l--; + if (l <= 0) return null; + ce.ch = (char)(parseInt(input, pos + 2, l, 8)); + ce.len = l + 2; + } + else { + ce.ch = c; + ce.len = 2; + } + break; + default: + ce.ch = c; + ce.len = 2; + break; + } + } + else { + ce.ch = input[pos]; + ce.len = 1; + } + ce.expr = new String(input, pos, ce.len); + return ce; + } + + /** + * This class represents a substring in a pattern string expressing + * a named property. 
+ * "\pA" : Property named "A" + * "\p{prop}" : Property named "prop" + * "\PA" : Property named "A" (Negated) + * "\P{prop}" : Property named "prop" (Negated) + */ + private static class NamedProperty { + /** Property name */ + String name; + /** Negated or not */ + boolean negate; + /** length of this expression */ + int len; + } + + private static NamedProperty getNamedProperty(char[] input, int pos, int lim) { + NamedProperty np = new NamedProperty(); + char c = input[pos]; + if (c == '\\') { + if (++pos >= lim) return null; + c = input[pos++]; + switch(c) { + case 'p': + np.negate = false; + break; + case 'P': + np.negate = true; + break; + default: + return null; + } + c = input[pos++]; + if (c == '{') { + int p = -1; + for (int i = pos; i < lim; i++) { + if (input[i] == '}') { + p = i; + break; + } + } + if (p < 0) return null; + int len = p - pos; + np.name = new String(input, pos, len); + np.len = len + 4; + } + else { + np.name = new String(input, pos - 1, 1); + np.len = 3; + } + return np; + } + else return null; + } + + private static RETokenNamedProperty getRETokenNamedProperty( + int subIndex, NamedProperty np, boolean insens, int index) + throws REException { + try { + return new RETokenNamedProperty(subIndex, np.name, insens, np.negate); + } + catch (REException e) { + REException ree; + ree = new REException(e.getMessage(), REException.REG_ESCAPE, index); + ree.initCause(e); + throw ree; + } + } + /** * Checks if the regular expression matches the input in its entirety. * @@ -958,6 +1438,10 @@ public class RE extends REToken { return minimumLength; } + public int getMaximumLength() { + return maximumLength; + } + /** * Returns an array of all matches found in the input. 
* @@ -1025,7 +1509,9 @@ public class RE extends REToken { /* Implements abstract method REToken.match() */ boolean match(CharIndexed input, REMatch mymatch) { - if (firstToken == null) return next(input, mymatch); + if (firstToken == null) { + return next(input, mymatch); + } // Note the start of this subexpression mymatch.start[subIndex] = mymatch.index; @@ -1089,23 +1575,34 @@ public class RE extends REToken { } REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) { + boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0); + RE re = (tryEntireMatch ? (RE) this.clone() : this); + if (tryEntireMatch) { + re.chain(new RETokenEnd(0, null)); + } // Create a new REMatch to hold results REMatch mymatch = new REMatch(numSubs, anchor, eflags); do { // Optimization: check if anchor + minimumLength > length if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) { - if (match(input, mymatch)) { - // Find longest match of them all to observe leftmost longest - REMatch longest = mymatch; + if (re.match(input, mymatch)) { + REMatch best = mymatch; + // We assume that the match that comes first is the best. + // And the following "The longer, the better" rule has + // been commented out. The longest is not necessarily + // the best. For example, "a" out of "aaa" is the best + // match for /a+?/. 
+ /* + // Find best match of them all to observe leftmost longest while ((mymatch = mymatch.next) != null) { - if (mymatch.index > longest.index) { - longest = mymatch; + if (mymatch.index > best.index) { + best = mymatch; } } - - longest.end[0] = longest.index; - longest.finish(input); - return longest; + */ + best.end[0] = best.index; + best.finish(input); + return best; } } mymatch.clear(++anchor); @@ -1216,8 +1713,7 @@ public class RE extends REToken { StringBuffer buffer = new StringBuffer(); REMatch m = getMatchImpl(input,index,eflags,buffer); if (m==null) return buffer.toString(); - buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ? - replace : m.substituteInto(replace) ); + buffer.append(getReplacement(replace, m, eflags)); if (input.move(m.end[0])) { do { buffer.append(input.charAt(0)); @@ -1278,8 +1774,7 @@ public class RE extends REToken { StringBuffer buffer = new StringBuffer(); REMatch m; while ((m = getMatchImpl(input,index,eflags,buffer)) != null) { - buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ? - replace : m.substituteInto(replace) ); + buffer.append(getReplacement(replace, m, eflags)); index = m.getEndIndex(); if (m.end[0] == 0) { char ch = input.charAt(0); @@ -1294,11 +1789,50 @@ public class RE extends REToken { } return buffer.toString(); } + + public static String getReplacement(String replace, REMatch m, int eflags) { + if ((eflags & REG_NO_INTERPOLATE) > 0) + return replace; + else { + if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) { + StringBuffer sb = new StringBuffer(); + int l = replace.length(); + for (int i = 0; i < l; i++) { + char c = replace.charAt(i); + switch(c) { + case '\\': + i++; + // Let StringIndexOutOfBoundsException be thrown. 
+ sb.append(replace.charAt(i)); + break; + case '$': + int i1 = i + 1; + while (i1 < replace.length() && + Character.isDigit(replace.charAt(i1))) i1++; + sb.append(m.substituteInto(replace.substring(i, i1))); + i = i1 - 1; + break; + default: + sb.append(c); + } + } + return sb.toString(); + } + else + return m.substituteInto(replace); + } + } /* Helper function for constructor */ private void addToken(REToken next) { if (next == null) return; minimumLength += next.getMinimumLength(); + int nmax = next.getMaximumLength(); + if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE) + maximumLength += nmax; + else + maximumLength = Integer.MAX_VALUE; + if (firstToken == null) { lastToken = firstToken = next; } else { |

