summaryrefslogtreecommitdiffstats
path: root/libjava/classpath/gnu/regexp/RE.java
diff options
context:
space:
mode:
Diffstat (limited to 'libjava/classpath/gnu/regexp/RE.java')
-rw-r--r--libjava/classpath/gnu/regexp/RE.java778
1 files changed, 656 insertions, 122 deletions
diff --git a/libjava/classpath/gnu/regexp/RE.java b/libjava/classpath/gnu/regexp/RE.java
index 9ac9b53d1a9..ef606a6d8a7 100644
--- a/libjava/classpath/gnu/regexp/RE.java
+++ b/libjava/classpath/gnu/regexp/RE.java
@@ -1,5 +1,5 @@
/* gnu/regexp/RE.java
- Copyright (C) 1998-2001, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -136,12 +136,13 @@ public class RE extends REToken {
/** Minimum length, in characters, of any possible match. */
private int minimumLength;
+ private int maximumLength;
/**
* Compilation flag. Do not differentiate case. Subsequent
* searches using this RE will be case insensitive.
*/
- public static final int REG_ICASE = 2;
+ public static final int REG_ICASE = 0x02;
/**
* Compilation flag. The match-any-character operator (dot)
@@ -149,14 +150,14 @@ public class RE extends REToken {
* bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
* the "/s" operator in Perl.
*/
- public static final int REG_DOT_NEWLINE = 4;
+ public static final int REG_DOT_NEWLINE = 0x04;
/**
* Compilation flag. Use multiline mode. In this mode, the ^ and $
* anchors will match based on newlines within the input. This is
* equivalent to the "/m" operator in Perl.
*/
- public static final int REG_MULTILINE = 8;
+ public static final int REG_MULTILINE = 0x08;
/**
* Execution flag.
@@ -185,14 +186,14 @@ public class RE extends REToken {
* // m4.toString(): "fool"<BR>
* </CODE>
*/
- public static final int REG_NOTBOL = 16;
+ public static final int REG_NOTBOL = 0x10;
/**
* Execution flag.
* The match-end operator ($) does not match at the end
* of the input string. Useful for matching on substrings.
*/
- public static final int REG_NOTEOL = 32;
+ public static final int REG_NOTEOL = 0x20;
/**
* Execution flag.
@@ -206,7 +207,7 @@ public class RE extends REToken {
* the example under REG_NOTBOL. It also affects the use of the \&lt;
* and \b operators.
*/
- public static final int REG_ANCHORINDEX = 64;
+ public static final int REG_ANCHORINDEX = 0x40;
/**
* Execution flag.
@@ -215,7 +216,24 @@ public class RE extends REToken {
* the corresponding subexpressions. For example, you may want to
* replace all matches of "one dollar" with "$1".
*/
- public static final int REG_NO_INTERPOLATE = 128;
+ public static final int REG_NO_INTERPOLATE = 0x80;
+
+ /**
+ * Execution flag.
+ * Try to match the whole input string. An implicit match-end operator
+ * is added to this regexp.
+ */
+ public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
+
+ /**
+ * Execution flag.
+ * The substitute and substituteAll methods will treat the
+ * character '\' in the replacement as an escape to a literal
+ * character. In this case "\n", "\$", "\\", "\x40" and "\012"
+ * will become "n", "$", "\", "x40" and "012" respectively.
+ * This flag has no effect if REG_NO_INTERPOLATE is set on.
+ */
+ public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
/** Returns a string representing the version of the gnu.regexp package. */
public static final String version() {
@@ -273,12 +291,13 @@ public class RE extends REToken {
}
// internal constructor used for alternation
- private RE(REToken first, REToken last,int subs, int subIndex, int minLength) {
+ private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) {
super(subIndex);
firstToken = first;
lastToken = last;
numSubs = subs;
minimumLength = minLength;
+ maximumLength = maxLength;
addToken(new RETokenEndSub(subIndex));
}
@@ -333,6 +352,11 @@ public class RE extends REToken {
char ch;
boolean quot = false;
+ // Saved syntax and flags.
+ RESyntax savedSyntax = null;
+ int savedCflags = 0;
+ boolean flagsSaved = false;
+
while (index < pLength) {
// read the next character unit (including backslash escapes)
index = getCharUnit(pattern,index,unit,quot);
@@ -359,8 +383,9 @@ public class RE extends REToken {
&& !syntax.get(RESyntax.RE_LIMITED_OPS)) {
// make everything up to here be a branch. create vector if nec.
addToken(currentToken);
- RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength);
+ RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength);
minimumLength = 0;
+ maximumLength = 0;
if (branches == null) {
branches = new Vector();
}
@@ -402,102 +427,12 @@ public class RE extends REToken {
// [...] | [^...]
else if ((unit.ch == '[') && !(unit.bk || quot)) {
- Vector options = new Vector();
- boolean negative = false;
- char lastChar = 0;
- if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
-
- // Check for initial caret, negation
- if ((ch = pattern[index]) == '^') {
- negative = true;
- if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- ch = pattern[index];
- }
-
- // Check for leading right bracket literal
- if (ch == ']') {
- lastChar = ch;
- if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- }
-
- while ((ch = pattern[index++]) != ']') {
- if ((ch == '-') && (lastChar != 0)) {
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- if ((ch = pattern[index]) == ']') {
- options.addElement(new RETokenChar(subIndex,lastChar,insens));
- lastChar = '-';
- } else {
- options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
- lastChar = 0;
- index++;
- }
- } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- int posixID = -1;
- boolean negate = false;
- char asciiEsc = 0;
- if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
- switch (pattern[index]) {
- case 'D':
- negate = true;
- case 'd':
- posixID = RETokenPOSIX.DIGIT;
- break;
- case 'S':
- negate = true;
- case 's':
- posixID = RETokenPOSIX.SPACE;
- break;
- case 'W':
- negate = true;
- case 'w':
- posixID = RETokenPOSIX.ALNUM;
- break;
- }
- }
- else if ("nrt".indexOf(pattern[index]) != -1) {
- switch (pattern[index]) {
- case 'n':
- asciiEsc = '\n';
- break;
- case 't':
- asciiEsc = '\t';
- break;
- case 'r':
- asciiEsc = '\r';
- break;
- }
- }
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
-
- if (posixID != -1) {
- options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
- } else if (asciiEsc != 0) {
- lastChar = asciiEsc;
- } else {
- lastChar = pattern[index];
- }
- ++index;
- } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
- StringBuffer posixSet = new StringBuffer();
- index = getPosixSet(pattern,index+1,posixSet);
- int posixId = RETokenPOSIX.intValue(posixSet.toString());
- if (posixId != -1)
- options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
- } else {
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
- lastChar = ch;
- }
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- } // while in list
- // Out of list, index is one past ']'
-
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
-
// Create a new RETokenOneOf
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index, pLength, cflags, syntax, 0);
addToken(currentToken);
- options.trimToSize();
- currentToken = new RETokenOneOf(subIndex,options,negative);
+ currentToken = result.token;
+ index = result.index;
}
// SUBEXPRESSIONS
@@ -507,7 +442,10 @@ public class RE extends REToken {
boolean pure = false;
boolean comment = false;
boolean lookAhead = false;
+ boolean lookBehind = false;
+ boolean independent = false;
boolean negativelh = false;
+ boolean negativelb = false;
if ((index+1 < pLength) && (pattern[index] == '?')) {
switch (pattern[index+1]) {
case '!':
@@ -525,6 +463,114 @@ public class RE extends REToken {
index += 2;
}
break;
+ case '<':
+ // We assume that if the syntax supports look-ahead,
+ // it also supports look-behind.
+ if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
+ index++;
+ switch (pattern[index +1]) {
+ case '!':
+ pure = true;
+ negativelb = true;
+ lookBehind = true;
+ index += 2;
+ break;
+ case '=':
+ pure = true;
+ lookBehind = true;
+ index += 2;
+ }
+ }
+ break;
+ case '>':
+ // We assume that if the syntax supports look-ahead,
+ // it also supports independent group.
+ if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
+ pure = true;
+ independent = true;
+ index += 2;
+ }
+ break;
+ case 'i':
+ case 'd':
+ case 'm':
+ case 's':
+ // case 'u': not supported
+ // case 'x': not supported
+ case '-':
+ if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
+ // Set or reset syntax flags.
+ int flagIndex = index + 1;
+ int endFlag = -1;
+ RESyntax newSyntax = new RESyntax(syntax);
+ int newCflags = cflags;
+ boolean negate = false;
+ while (flagIndex < pLength && endFlag < 0) {
+ switch(pattern[flagIndex]) {
+ case 'i':
+ if (negate)
+ newCflags &= ~REG_ICASE;
+ else
+ newCflags |= REG_ICASE;
+ flagIndex++;
+ break;
+ case 'd':
+ if (negate)
+ newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR);
+ else
+ newSyntax.setLineSeparator("\n");
+ flagIndex++;
+ break;
+ case 'm':
+ if (negate)
+ newCflags &= ~REG_MULTILINE;
+ else
+ newCflags |= REG_MULTILINE;
+ flagIndex++;
+ break;
+ case 's':
+ if (negate)
+ newCflags &= ~REG_DOT_NEWLINE;
+ else
+ newCflags |= REG_DOT_NEWLINE;
+ flagIndex++;
+ break;
+ // case 'u': not supported
+ // case 'x': not supported
+ case '-':
+ negate = true;
+ flagIndex++;
+ break;
+ case ':':
+ case ')':
+ endFlag = pattern[flagIndex];
+ break;
+ default:
+ throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
+ }
+ }
+ if (endFlag == ')') {
+ syntax = newSyntax;
+ cflags = newCflags;
+ insens = ((cflags & REG_ICASE) > 0);
+ // This can be treated as though it were a comment.
+ comment = true;
+ index = flagIndex - 1;
+ break;
+ }
+ if (endFlag == ':') {
+ savedSyntax = syntax;
+ savedCflags = cflags;
+ flagsSaved = true;
+ syntax = newSyntax;
+ cflags = newCflags;
+ insens = ((cflags & REG_ICASE) > 0);
+ index = flagIndex -1;
+ // Fall through to the next case.
+ }
+ else {
+ throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
+ }
case ':':
if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
pure = true;
@@ -607,15 +653,28 @@ public class RE extends REToken {
numSubs++;
}
- int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs;
+ int useIndex = (pure || lookAhead || lookBehind || independent) ?
+ 0 : nextSub + numSubs;
currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
numSubs += ((RE) currentToken).getNumSubs();
if (lookAhead) {
currentToken = new RETokenLookAhead(currentToken,negativelh);
}
+ else if (lookBehind) {
+ currentToken = new RETokenLookBehind(currentToken,negativelb);
+ }
+ else if (independent) {
+ currentToken = new RETokenIndependent(currentToken);
+ }
index = nextIndex;
+ if (flagsSaved) {
+ syntax = savedSyntax;
+ cflags = savedCflags;
+ insens = ((cflags & REG_ICASE) > 0);
+ flagsSaved = false;
+ }
} // not a comment
} // subexpression
@@ -715,14 +774,45 @@ public class RE extends REToken {
else
currentToken = setRepeated(currentToken,0,1,index);
}
+
+ // OCTAL CHARACTER
+ // \0377
+ else if (unit.bk && (unit.ch == '0') && syntax.get(RESyntax.RE_OCTAL_CHAR)) {
+ CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid octal character", REException.REG_ESCAPE, index);
+ index = index - 2 + ce.len;
+ addToken(currentToken);
+ currentToken = new RETokenChar(subIndex,ce.ch,insens);
+ }
+
// BACKREFERENCE OPERATOR
- // \1 \2 ... \9
+ // \1 \2 ... \9 and \10 \11 \12 ...
// not available if RE_NO_BK_REFS is set
+ // Perl recognizes \10, \11, and so on only if enough number of
+ // parentheses have opened before it, otherwise they are treated
+ // as aliases of \010, \011, ... (octal characters). In case of
+ // Sun's JDK, octal character expression must always begin with \0.
+ // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
+ // JDK treats \2 as a back reference to the 2nd group because
+ // there are only two groups. But in our poor implementation,
+ // we cannot help but treat \29 as a back reference to the 29th group.
else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
addToken(currentToken);
- currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
+ int numBegin = index - 1;
+ int numEnd = pLength;
+ for (int i = index; i < pLength; i++) {
+ if (! Character.isDigit(pattern[i])) {
+ numEnd = i;
+ break;
+ }
+ }
+ int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
+
+ currentToken = new RETokenBackRef(subIndex,num,insens);
+ index = numEnd;
}
// START OF STRING OPERATOR
@@ -844,6 +934,32 @@ public class RE extends REToken {
currentToken = new RETokenEnd(subIndex,null);
}
+ // HEX CHARACTER, UNICODE CHARACTER
+ // \x1B, \u1234
+
+ else if ((unit.bk && (unit.ch == 'x') && syntax.get(RESyntax.RE_HEX_CHAR)) ||
+ (unit.bk && (unit.ch == 'u') && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
+ CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid hex character", REException.REG_ESCAPE, index);
+ index = index - 2 + ce.len;
+ addToken(currentToken);
+ currentToken = new RETokenChar(subIndex,ce.ch,insens);
+ }
+
+ // NAMED PROPERTY
+ // \p{prop}, \P{prop}
+
+ else if ((unit.bk && (unit.ch == 'p') && syntax.get(RESyntax.RE_NAMED_PROPERTY)) ||
+ (unit.bk && (unit.ch == 'P') && syntax.get(RESyntax.RE_NAMED_PROPERTY))) {
+ NamedProperty np = getNamedProperty(pattern, index - 2, pLength);
+ if (np == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ index = index - 2 + np.len;
+ addToken(currentToken);
+ currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
+ }
+
// NON-SPECIAL CHARACTER (or escape to make literal)
// c | \* for example
@@ -857,9 +973,10 @@ public class RE extends REToken {
addToken(currentToken);
if (branches != null) {
- branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength));
+ branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, maximumLength));
branches.trimToSize(); // compact the Vector
minimumLength = 0;
+ maximumLength = 0;
firstToken = lastToken = null;
addToken(new RETokenOneOf(subIndex,branches,false));
}
@@ -867,6 +984,199 @@ public class RE extends REToken {
}
+ private static class ParseCharClassResult {
+ RETokenOneOf token;
+ int index;
+ boolean returnAtAndOperator = false;
+ }
+
+ /**
+ * Parse [...] or [^...] and make an RETokenOneOf instance.
+ * @param subIndex subIndex to be given to the created RETokenOneOf instance.
+ * @param pattern Input array of characters to be parsed.
+ * @param index Index pointing to the character next to the beginning '['.
+ * @param pLength Limit of the input array.
+ * @param cflags Compilation flags used to parse the pattern.
+ * @param pflags Flags that affect the behavior of this method.
+ * @param syntax Syntax used to parse the pattern.
+ */
+ private static ParseCharClassResult parseCharClass(int subIndex,
+ char[] pattern, int index,
+ int pLength, int cflags, RESyntax syntax, int pflags)
+ throws REException {
+
+ boolean insens = ((cflags & REG_ICASE) > 0);
+ Vector options = new Vector();
+ Vector addition = new Vector();
+ boolean additionAndAppeared = false;
+ final int RETURN_AT_AND = 0x01;
+ boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
+ boolean negative = false;
+ char ch;
+
+ char lastChar = 0;
+ boolean lastCharIsSet = false;
+ if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
+
+ // Check for initial caret, negation
+ if ((ch = pattern[index]) == '^') {
+ negative = true;
+ if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ ch = pattern[index];
+ }
+
+ // Check for leading right bracket literal
+ if (ch == ']') {
+ lastChar = ch; lastCharIsSet = true;
+ if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ }
+
+ while ((ch = pattern[index++]) != ']') {
+ if ((ch == '-') && (lastCharIsSet)) {
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ if ((ch = pattern[index]) == ']') {
+ options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ lastChar = '-';
+ } else {
+ if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
+ CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ ch = ce.ch;
+ index = index + ce.len - 1;
+ }
+ options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
+ lastChar = 0; lastCharIsSet = false;
+ index++;
+ }
+ } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ int posixID = -1;
+ boolean negate = false;
+ char asciiEsc = 0;
+ boolean asciiEscIsSet = false;
+ NamedProperty np = null;
+ if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
+ switch (pattern[index]) {
+ case 'D':
+ negate = true;
+ case 'd':
+ posixID = RETokenPOSIX.DIGIT;
+ break;
+ case 'S':
+ negate = true;
+ case 's':
+ posixID = RETokenPOSIX.SPACE;
+ break;
+ case 'W':
+ negate = true;
+ case 'w':
+ posixID = RETokenPOSIX.ALNUM;
+ break;
+ }
+ }
+ if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
+ np = getNamedProperty(pattern, index - 1, pLength);
+ if (np == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ index = index - 1 + np.len - 1;
+ }
+ else {
+ CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ asciiEsc = ce.ch; asciiEscIsSet = true;
+ index = index - 1 + ce.len - 1;
+ }
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+
+ if (posixID != -1) {
+ options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
+ } else if (np != null) {
+ options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
+ } else if (asciiEscIsSet) {
+ lastChar = asciiEsc; lastCharIsSet = true;
+ } else {
+ lastChar = pattern[index]; lastCharIsSet = true;
+ }
+ ++index;
+ } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
+ StringBuffer posixSet = new StringBuffer();
+ index = getPosixSet(pattern,index+1,posixSet);
+ int posixId = RETokenPOSIX.intValue(posixSet.toString());
+ if (posixId != -1)
+ options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
+ } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index, pLength, cflags, syntax, 0);
+ addition.addElement(result.token);
+ addition.addElement("|");
+ index = result.index;
+ } else if ((ch == '&') &&
+ (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) &&
+ (index < pLength) && (pattern[index] == '&')) {
+ if (returnAtAndOperator) {
+ ParseCharClassResult result = new ParseCharClassResult();
+ options.trimToSize();
+ if (additionAndAppeared) addition.addElement("&");
+ if (addition.size() == 0) addition = null;
+ result.token = new RETokenOneOf(subIndex,
+ options, addition, negative);
+ result.index = index - 1;
+ result.returnAtAndOperator = true;
+ return result;
+ }
+ // The precedence of the operator "&&" is the lowest.
+ // So we postpone adding "&" until other elements
+ // are added. And we insert Boolean.FALSE at the
+ // beginning of the list of tokens following "&&".
+ // So, "&&[a-b][k-m]" will be stored in the Vecter
+ // addition in this order:
+ // Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
+ if (additionAndAppeared) addition.addElement("&");
+ addition.addElement(Boolean.FALSE);
+ additionAndAppeared = true;
+
+ // The part on which "&&" operates may be either
+ // (1) explicitly enclosed by []
+ // or
+ // (2) not enclosed by [] and terminated by the
+ // next "&&" or the end of the character list.
+ // Let the preceding else if block do the case (1).
+ // We must do something in case of (2).
+ if ((index + 1 < pLength) && (pattern[index + 1] != '[')) {
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index+1, pLength, cflags, syntax,
+ RETURN_AT_AND);
+ addition.addElement(result.token);
+ addition.addElement("|");
+ // If the method returned at the next "&&", it is OK.
+ // Otherwise we have eaten the mark of the end of this
+ // character list "]". In this case we must give back
+ // the end mark.
+ index = (result.returnAtAndOperator ?
+ result.index: result.index - 1);
+ }
+ } else {
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ lastChar = ch; lastCharIsSet = true;
+ }
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ } // while in list
+ // Out of list, index is one past ']'
+
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+
+ ParseCharClassResult result = new ParseCharClassResult();
+ // Create a new RETokenOneOf
+ options.trimToSize();
+ if (additionAndAppeared) addition.addElement("&");
+ if (addition.size() == 0) addition = null;
+ result.token = new RETokenOneOf(subIndex,options, addition, negative);
+ result.index = index;
+ return result;
+ }
+
private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
unit.ch = input[index++];
unit.bk = (unit.ch == '\\'
@@ -878,6 +1188,176 @@ public class RE extends REToken {
return index;
}
+ private static int parseInt(char[] input, int pos, int len, int radix) {
+ int ret = 0;
+ for (int i = pos; i < pos + len; i++) {
+ ret = ret * radix + Character.digit(input[i], radix);
+ }
+ return ret;
+ }
+
+ /**
+ * This class represents various expressions for a character.
+ * "a" : 'a' itself.
+ * "\0123" : Octal char 0123
+ * "\x1b" : Hex char 0x1b
+ * "\u1234" : Unicode char \u1234
+ */
+ private static class CharExpression {
+ /** character represented by this expression */
+ char ch;
+ /** String expression */
+ String expr;
+ /** length of this expression */
+ int len;
+ public String toString() { return expr; }
+ }
+
+ private static CharExpression getCharExpression(char[] input, int pos, int lim,
+ RESyntax syntax) {
+ CharExpression ce = new CharExpression();
+ char c = input[pos];
+ if (c == '\\') {
+ if (pos + 1 >= lim) return null;
+ c = input[pos + 1];
+ switch(c) {
+ case 't':
+ ce.ch = '\t';
+ ce.len = 2;
+ break;
+ case 'n':
+ ce.ch = '\n';
+ ce.len = 2;
+ break;
+ case 'r':
+ ce.ch = '\r';
+ ce.len = 2;
+ break;
+ case 'x':
+ case 'u':
+ if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) ||
+ (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
+ int l = 0;
+ int expectedLength = (c == 'x' ? 2 : 4);
+ for (int i = pos + 2; i < pos + 2 + expectedLength; i++) {
+ if (i >= lim) break;
+ if (!((input[i] >= '0' && input[i] <= '9') ||
+ (input[i] >= 'A' && input[i] <= 'F') ||
+ (input[i] >= 'a' && input[i] <= 'f')))
+ break;
+ l++;
+ }
+ if (l != expectedLength) return null;
+ ce.ch = (char)(parseInt(input, pos + 2, l, 16));
+ ce.len = l + 2;
+ }
+ else {
+ ce.ch = c;
+ ce.len = 2;
+ }
+ break;
+ case '0':
+ if (syntax.get(RESyntax.RE_OCTAL_CHAR)) {
+ int l = 0;
+ for (int i = pos + 2; i < pos + 2 + 3; i++) {
+ if (i >= lim) break;
+ if (input[i] < '0' || input[i] > '7') break;
+ l++;
+ }
+ if (l == 3 && input[pos + 2] > '3') l--;
+ if (l <= 0) return null;
+ ce.ch = (char)(parseInt(input, pos + 2, l, 8));
+ ce.len = l + 2;
+ }
+ else {
+ ce.ch = c;
+ ce.len = 2;
+ }
+ break;
+ default:
+ ce.ch = c;
+ ce.len = 2;
+ break;
+ }
+ }
+ else {
+ ce.ch = input[pos];
+ ce.len = 1;
+ }
+ ce.expr = new String(input, pos, ce.len);
+ return ce;
+ }
+
+ /**
+ * This class represents a substring in a pattern string expressing
+ * a named property.
+ * "\pA" : Property named "A"
+ * "\p{prop}" : Property named "prop"
+ * "\PA" : Property named "A" (Negated)
+ * "\P{prop}" : Property named "prop" (Negated)
+ */
+ private static class NamedProperty {
+ /** Property name */
+ String name;
+ /** Negated or not */
+ boolean negate;
+ /** length of this expression */
+ int len;
+ }
+
+ private static NamedProperty getNamedProperty(char[] input, int pos, int lim) {
+ NamedProperty np = new NamedProperty();
+ char c = input[pos];
+ if (c == '\\') {
+ if (++pos >= lim) return null;
+ c = input[pos++];
+ switch(c) {
+ case 'p':
+ np.negate = false;
+ break;
+ case 'P':
+ np.negate = true;
+ break;
+ default:
+ return null;
+ }
+ c = input[pos++];
+ if (c == '{') {
+ int p = -1;
+ for (int i = pos; i < lim; i++) {
+ if (input[i] == '}') {
+ p = i;
+ break;
+ }
+ }
+ if (p < 0) return null;
+ int len = p - pos;
+ np.name = new String(input, pos, len);
+ np.len = len + 4;
+ }
+ else {
+ np.name = new String(input, pos - 1, 1);
+ np.len = 3;
+ }
+ return np;
+ }
+ else return null;
+ }
+
+ private static RETokenNamedProperty getRETokenNamedProperty(
+ int subIndex, NamedProperty np, boolean insens, int index)
+ throws REException {
+ try {
+ return new RETokenNamedProperty(subIndex, np.name, insens, np.negate);
+ }
+ catch (REException e) {
+ REException ree;
+ ree = new REException(e.getMessage(), REException.REG_ESCAPE, index);
+ ree.initCause(e);
+ throw ree;
+ }
+ }
+
/**
* Checks if the regular expression matches the input in its entirety.
*
@@ -958,6 +1438,10 @@ public class RE extends REToken {
return minimumLength;
}
+ public int getMaximumLength() {
+ return maximumLength;
+ }
+
/**
* Returns an array of all matches found in the input.
*
@@ -1025,7 +1509,9 @@ public class RE extends REToken {
/* Implements abstract method REToken.match() */
boolean match(CharIndexed input, REMatch mymatch) {
- if (firstToken == null) return next(input, mymatch);
+ if (firstToken == null) {
+ return next(input, mymatch);
+ }
// Note the start of this subexpression
mymatch.start[subIndex] = mymatch.index;
@@ -1089,23 +1575,34 @@ public class RE extends REToken {
}
REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
+ boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
+ RE re = (tryEntireMatch ? (RE) this.clone() : this);
+ if (tryEntireMatch) {
+ re.chain(new RETokenEnd(0, null));
+ }
// Create a new REMatch to hold results
REMatch mymatch = new REMatch(numSubs, anchor, eflags);
do {
// Optimization: check if anchor + minimumLength > length
if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
- if (match(input, mymatch)) {
- // Find longest match of them all to observe leftmost longest
- REMatch longest = mymatch;
+ if (re.match(input, mymatch)) {
+ REMatch best = mymatch;
+ // We assume that the match that coms first is the best.
+ // And the following "The longer, the better" rule has
+ // been commented out. The longest is not neccesarily
+ // the best. For example, "a" out of "aaa" is the best
+ // match for /a+?/.
+ /*
+ // Find best match of them all to observe leftmost longest
while ((mymatch = mymatch.next) != null) {
- if (mymatch.index > longest.index) {
- longest = mymatch;
+ if (mymatch.index > best.index) {
+ best = mymatch;
}
}
-
- longest.end[0] = longest.index;
- longest.finish(input);
- return longest;
+ */
+ best.end[0] = best.index;
+ best.finish(input);
+ return best;
}
}
mymatch.clear(++anchor);
@@ -1216,8 +1713,7 @@ public class RE extends REToken {
StringBuffer buffer = new StringBuffer();
REMatch m = getMatchImpl(input,index,eflags,buffer);
if (m==null) return buffer.toString();
- buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
- replace : m.substituteInto(replace) );
+ buffer.append(getReplacement(replace, m, eflags));
if (input.move(m.end[0])) {
do {
buffer.append(input.charAt(0));
@@ -1278,8 +1774,7 @@ public class RE extends REToken {
StringBuffer buffer = new StringBuffer();
REMatch m;
while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
- buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
- replace : m.substituteInto(replace) );
+ buffer.append(getReplacement(replace, m, eflags));
index = m.getEndIndex();
if (m.end[0] == 0) {
char ch = input.charAt(0);
@@ -1294,11 +1789,50 @@ public class RE extends REToken {
}
return buffer.toString();
}
+
+ public static String getReplacement(String replace, REMatch m, int eflags) {
+ if ((eflags & REG_NO_INTERPOLATE) > 0)
+ return replace;
+ else {
+ if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) {
+ StringBuffer sb = new StringBuffer();
+ int l = replace.length();
+ for (int i = 0; i < l; i++) {
+ char c = replace.charAt(i);
+ switch(c) {
+ case '\\':
+ i++;
+ // Let StringIndexOutOfBoundsException be thrown.
+ sb.append(replace.charAt(i));
+ break;
+ case '$':
+ int i1 = i + 1;
+ while (i1 < replace.length() &&
+ Character.isDigit(replace.charAt(i1))) i1++;
+ sb.append(m.substituteInto(replace.substring(i, i1)));
+ i = i1 - 1;
+ break;
+ default:
+ sb.append(c);
+ }
+ }
+ return sb.toString();
+ }
+ else
+ return m.substituteInto(replace);
+ }
+ }
/* Helper function for constructor */
private void addToken(REToken next) {
if (next == null) return;
minimumLength += next.getMinimumLength();
+ int nmax = next.getMaximumLength();
+ if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
+ maximumLength += nmax;
+ else
+ maximumLength = Integer.MAX_VALUE;
+
if (firstToken == null) {
lastToken = firstToken = next;
} else {
OpenPOWER on IntegriCloud