summaryrefslogtreecommitdiffstats
path: root/libjava/classpath/gnu/regexp
diff options
context:
space:
mode:
Diffstat (limited to 'libjava/classpath/gnu/regexp')
-rw-r--r--libjava/classpath/gnu/regexp/CharIndexed.java14
-rw-r--r--libjava/classpath/gnu/regexp/CharIndexedCharArray.java11
-rw-r--r--libjava/classpath/gnu/regexp/CharIndexedInputStream.java12
-rw-r--r--libjava/classpath/gnu/regexp/CharIndexedString.java11
-rw-r--r--libjava/classpath/gnu/regexp/CharIndexedStringBuffer.java11
-rw-r--r--libjava/classpath/gnu/regexp/RE.java778
-rw-r--r--libjava/classpath/gnu/regexp/REMatch.java60
-rw-r--r--libjava/classpath/gnu/regexp/RESyntax.java40
-rw-r--r--libjava/classpath/gnu/regexp/REToken.java15
-rw-r--r--libjava/classpath/gnu/regexp/RETokenAny.java4
-rw-r--r--libjava/classpath/gnu/regexp/RETokenBackRef.java16
-rw-r--r--libjava/classpath/gnu/regexp/RETokenChar.java6
-rw-r--r--libjava/classpath/gnu/regexp/RETokenEnd.java4
-rw-r--r--libjava/classpath/gnu/regexp/RETokenEndSub.java4
-rw-r--r--libjava/classpath/gnu/regexp/RETokenIndependent.java76
-rw-r--r--libjava/classpath/gnu/regexp/RETokenLookAhead.java4
-rw-r--r--libjava/classpath/gnu/regexp/RETokenLookBehind.java116
-rw-r--r--libjava/classpath/gnu/regexp/RETokenNamedProperty.java301
-rw-r--r--libjava/classpath/gnu/regexp/RETokenOneOf.java181
-rw-r--r--libjava/classpath/gnu/regexp/RETokenPOSIX.java4
-rw-r--r--libjava/classpath/gnu/regexp/RETokenRange.java21
-rw-r--r--libjava/classpath/gnu/regexp/RETokenRepeated.java310
-rw-r--r--libjava/classpath/gnu/regexp/RETokenStart.java4
-rw-r--r--libjava/classpath/gnu/regexp/RETokenWordBoundary.java5
24 files changed, 1712 insertions, 296 deletions
diff --git a/libjava/classpath/gnu/regexp/CharIndexed.java b/libjava/classpath/gnu/regexp/CharIndexed.java
index a0d7106aefa..df1d8930c6b 100644
--- a/libjava/classpath/gnu/regexp/CharIndexed.java
+++ b/libjava/classpath/gnu/regexp/CharIndexed.java
@@ -1,5 +1,5 @@
/* gnu/regexp/CharIndexed.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 1998-2001, 2004, 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -81,4 +81,16 @@ public interface CharIndexed {
* position at a valid position in the input.
*/
boolean isValid();
+
+ /**
+ * Returns another CharIndexed containing length characters to the left
+ * of the given index. The given length is an expected maximum and
+ * the returned CharIndexed may not necessarily contain so many characters.
+ */
+ CharIndexed lookBehind(int index, int length);
+
+ /**
+ * Returns the effective length of this CharIndexed
+ */
+ int length();
}
diff --git a/libjava/classpath/gnu/regexp/CharIndexedCharArray.java b/libjava/classpath/gnu/regexp/CharIndexedCharArray.java
index 63d858c8709..1388d4729bf 100644
--- a/libjava/classpath/gnu/regexp/CharIndexedCharArray.java
+++ b/libjava/classpath/gnu/regexp/CharIndexedCharArray.java
@@ -1,5 +1,5 @@
/* gnu/regexp/CharIndexedCharArray.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 1998-2001, 2004, 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -59,4 +59,13 @@ class CharIndexedCharArray implements CharIndexed, Serializable {
public boolean move(int index) {
return ((anchor += index) < s.length);
}
+
+ public CharIndexed lookBehind(int index, int length) {
+ if (length > (anchor + index)) length = anchor + index;
+ return new CharIndexedCharArray(s, anchor + index - length);
+ }
+
+ public int length() {
+ return s.length - anchor;
+ }
}
diff --git a/libjava/classpath/gnu/regexp/CharIndexedInputStream.java b/libjava/classpath/gnu/regexp/CharIndexedInputStream.java
index 145fe11b135..d5225a79337 100644
--- a/libjava/classpath/gnu/regexp/CharIndexedInputStream.java
+++ b/libjava/classpath/gnu/regexp/CharIndexedInputStream.java
@@ -1,5 +1,5 @@
/* gnu/regexp/CharIndexedInputStream.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 1998-2001, 2004, 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -145,5 +145,15 @@ class CharIndexedInputStream implements CharIndexed {
public boolean isValid() {
return (cached != OUT_OF_BOUNDS);
}
+
+ public CharIndexed lookBehind(int index, int length) {
+ throw new UnsupportedOperationException(
+ "difficult to look behind for an input stream");
+ }
+
+ public int length() {
+ throw new UnsupportedOperationException(
+ "difficult to tell the length for an input stream");
+ }
}
diff --git a/libjava/classpath/gnu/regexp/CharIndexedString.java b/libjava/classpath/gnu/regexp/CharIndexedString.java
index 05be07ac68c..fe4fa8f7637 100644
--- a/libjava/classpath/gnu/regexp/CharIndexedString.java
+++ b/libjava/classpath/gnu/regexp/CharIndexedString.java
@@ -1,5 +1,5 @@
/* gnu/regexp/CharIndexedString.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 1998-2001, 2004, 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -61,4 +61,13 @@ class CharIndexedString implements CharIndexed, Serializable {
public boolean move(int index) {
return ((anchor += index) < len);
}
+
+ public CharIndexed lookBehind(int index, int length) {
+ if (length > (anchor + index)) length = anchor + index;
+ return new CharIndexedString(s, anchor + index - length);
+ }
+
+ public int length() {
+ return len - anchor;
+ }
}
diff --git a/libjava/classpath/gnu/regexp/CharIndexedStringBuffer.java b/libjava/classpath/gnu/regexp/CharIndexedStringBuffer.java
index 1b88e398571..9c9118dfee2 100644
--- a/libjava/classpath/gnu/regexp/CharIndexedStringBuffer.java
+++ b/libjava/classpath/gnu/regexp/CharIndexedStringBuffer.java
@@ -1,5 +1,5 @@
/* gnu/regexp/CharIndexedStringBuffer.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 1998-2001, 2004, 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -59,4 +59,13 @@ class CharIndexedStringBuffer implements CharIndexed, Serializable {
public boolean move(int index) {
return ((anchor += index) < s.length());
}
+
+ public CharIndexed lookBehind(int index, int length) {
+ if (length > (anchor + index)) length = anchor + index;
+ return new CharIndexedStringBuffer(s, anchor + index - length);
+ }
+
+ public int length() {
+ return s.length() - anchor;
+ }
}
diff --git a/libjava/classpath/gnu/regexp/RE.java b/libjava/classpath/gnu/regexp/RE.java
index 9ac9b53d1a9..ef606a6d8a7 100644
--- a/libjava/classpath/gnu/regexp/RE.java
+++ b/libjava/classpath/gnu/regexp/RE.java
@@ -1,5 +1,5 @@
/* gnu/regexp/RE.java
- Copyright (C) 1998-2001, 2004, 2005 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -136,12 +136,13 @@ public class RE extends REToken {
/** Minimum length, in characters, of any possible match. */
private int minimumLength;
+ private int maximumLength;
/**
* Compilation flag. Do not differentiate case. Subsequent
* searches using this RE will be case insensitive.
*/
- public static final int REG_ICASE = 2;
+ public static final int REG_ICASE = 0x02;
/**
* Compilation flag. The match-any-character operator (dot)
@@ -149,14 +150,14 @@ public class RE extends REToken {
* bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to
* the "/s" operator in Perl.
*/
- public static final int REG_DOT_NEWLINE = 4;
+ public static final int REG_DOT_NEWLINE = 0x04;
/**
* Compilation flag. Use multiline mode. In this mode, the ^ and $
* anchors will match based on newlines within the input. This is
* equivalent to the "/m" operator in Perl.
*/
- public static final int REG_MULTILINE = 8;
+ public static final int REG_MULTILINE = 0x08;
/**
* Execution flag.
@@ -185,14 +186,14 @@ public class RE extends REToken {
* // m4.toString(): "fool"<BR>
* </CODE>
*/
- public static final int REG_NOTBOL = 16;
+ public static final int REG_NOTBOL = 0x10;
/**
* Execution flag.
* The match-end operator ($) does not match at the end
* of the input string. Useful for matching on substrings.
*/
- public static final int REG_NOTEOL = 32;
+ public static final int REG_NOTEOL = 0x20;
/**
* Execution flag.
@@ -206,7 +207,7 @@ public class RE extends REToken {
* the example under REG_NOTBOL. It also affects the use of the \&lt;
* and \b operators.
*/
- public static final int REG_ANCHORINDEX = 64;
+ public static final int REG_ANCHORINDEX = 0x40;
/**
* Execution flag.
@@ -215,7 +216,24 @@ public class RE extends REToken {
* the corresponding subexpressions. For example, you may want to
* replace all matches of "one dollar" with "$1".
*/
- public static final int REG_NO_INTERPOLATE = 128;
+ public static final int REG_NO_INTERPOLATE = 0x80;
+
+ /**
+ * Execution flag.
+ * Try to match the whole input string. An implicit match-end operator
+ * is added to this regexp.
+ */
+ public static final int REG_TRY_ENTIRE_MATCH = 0x0100;
+
+ /**
+ * Execution flag.
+ * The substitute and substituteAll methods will treat the
+ * character '\' in the replacement as an escape to a literal
+ * character. In this case "\n", "\$", "\\", "\x40" and "\012"
+ * will become "n", "$", "\", "x40" and "012" respectively.
+ * This flag has no effect if REG_NO_INTERPOLATE is set on.
+ */
+ public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200;
/** Returns a string representing the version of the gnu.regexp package. */
public static final String version() {
@@ -273,12 +291,13 @@ public class RE extends REToken {
}
// internal constructor used for alternation
- private RE(REToken first, REToken last,int subs, int subIndex, int minLength) {
+ private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) {
super(subIndex);
firstToken = first;
lastToken = last;
numSubs = subs;
minimumLength = minLength;
+ maximumLength = maxLength;
addToken(new RETokenEndSub(subIndex));
}
@@ -333,6 +352,11 @@ public class RE extends REToken {
char ch;
boolean quot = false;
+ // Saved syntax and flags.
+ RESyntax savedSyntax = null;
+ int savedCflags = 0;
+ boolean flagsSaved = false;
+
while (index < pLength) {
// read the next character unit (including backslash escapes)
index = getCharUnit(pattern,index,unit,quot);
@@ -359,8 +383,9 @@ public class RE extends REToken {
&& !syntax.get(RESyntax.RE_LIMITED_OPS)) {
// make everything up to here be a branch. create vector if nec.
addToken(currentToken);
- RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength);
+ RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength);
minimumLength = 0;
+ maximumLength = 0;
if (branches == null) {
branches = new Vector();
}
@@ -402,102 +427,12 @@ public class RE extends REToken {
// [...] | [^...]
else if ((unit.ch == '[') && !(unit.bk || quot)) {
- Vector options = new Vector();
- boolean negative = false;
- char lastChar = 0;
- if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
-
- // Check for initial caret, negation
- if ((ch = pattern[index]) == '^') {
- negative = true;
- if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- ch = pattern[index];
- }
-
- // Check for leading right bracket literal
- if (ch == ']') {
- lastChar = ch;
- if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- }
-
- while ((ch = pattern[index++]) != ']') {
- if ((ch == '-') && (lastChar != 0)) {
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- if ((ch = pattern[index]) == ']') {
- options.addElement(new RETokenChar(subIndex,lastChar,insens));
- lastChar = '-';
- } else {
- options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
- lastChar = 0;
- index++;
- }
- } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- int posixID = -1;
- boolean negate = false;
- char asciiEsc = 0;
- if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
- switch (pattern[index]) {
- case 'D':
- negate = true;
- case 'd':
- posixID = RETokenPOSIX.DIGIT;
- break;
- case 'S':
- negate = true;
- case 's':
- posixID = RETokenPOSIX.SPACE;
- break;
- case 'W':
- negate = true;
- case 'w':
- posixID = RETokenPOSIX.ALNUM;
- break;
- }
- }
- else if ("nrt".indexOf(pattern[index]) != -1) {
- switch (pattern[index]) {
- case 'n':
- asciiEsc = '\n';
- break;
- case 't':
- asciiEsc = '\t';
- break;
- case 'r':
- asciiEsc = '\r';
- break;
- }
- }
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
-
- if (posixID != -1) {
- options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
- } else if (asciiEsc != 0) {
- lastChar = asciiEsc;
- } else {
- lastChar = pattern[index];
- }
- ++index;
- } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
- StringBuffer posixSet = new StringBuffer();
- index = getPosixSet(pattern,index+1,posixSet);
- int posixId = RETokenPOSIX.intValue(posixSet.toString());
- if (posixId != -1)
- options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
- } else {
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
- lastChar = ch;
- }
- if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
- } // while in list
- // Out of list, index is one past ']'
-
- if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens));
-
// Create a new RETokenOneOf
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index, pLength, cflags, syntax, 0);
addToken(currentToken);
- options.trimToSize();
- currentToken = new RETokenOneOf(subIndex,options,negative);
+ currentToken = result.token;
+ index = result.index;
}
// SUBEXPRESSIONS
@@ -507,7 +442,10 @@ public class RE extends REToken {
boolean pure = false;
boolean comment = false;
boolean lookAhead = false;
+ boolean lookBehind = false;
+ boolean independent = false;
boolean negativelh = false;
+ boolean negativelb = false;
if ((index+1 < pLength) && (pattern[index] == '?')) {
switch (pattern[index+1]) {
case '!':
@@ -525,6 +463,114 @@ public class RE extends REToken {
index += 2;
}
break;
+ case '<':
+ // We assume that if the syntax supports look-ahead,
+ // it also supports look-behind.
+ if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
+ index++;
+ switch (pattern[index +1]) {
+ case '!':
+ pure = true;
+ negativelb = true;
+ lookBehind = true;
+ index += 2;
+ break;
+ case '=':
+ pure = true;
+ lookBehind = true;
+ index += 2;
+ }
+ }
+ break;
+ case '>':
+ // We assume that if the syntax supports look-ahead,
+ // it also supports independent group.
+ if (syntax.get(RESyntax.RE_LOOKAHEAD)) {
+ pure = true;
+ independent = true;
+ index += 2;
+ }
+ break;
+ case 'i':
+ case 'd':
+ case 'm':
+ case 's':
+ // case 'u': not supported
+ // case 'x': not supported
+ case '-':
+ if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break;
+ // Set or reset syntax flags.
+ int flagIndex = index + 1;
+ int endFlag = -1;
+ RESyntax newSyntax = new RESyntax(syntax);
+ int newCflags = cflags;
+ boolean negate = false;
+ while (flagIndex < pLength && endFlag < 0) {
+ switch(pattern[flagIndex]) {
+ case 'i':
+ if (negate)
+ newCflags &= ~REG_ICASE;
+ else
+ newCflags |= REG_ICASE;
+ flagIndex++;
+ break;
+ case 'd':
+ if (negate)
+ newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR);
+ else
+ newSyntax.setLineSeparator("\n");
+ flagIndex++;
+ break;
+ case 'm':
+ if (negate)
+ newCflags &= ~REG_MULTILINE;
+ else
+ newCflags |= REG_MULTILINE;
+ flagIndex++;
+ break;
+ case 's':
+ if (negate)
+ newCflags &= ~REG_DOT_NEWLINE;
+ else
+ newCflags |= REG_DOT_NEWLINE;
+ flagIndex++;
+ break;
+ // case 'u': not supported
+ // case 'x': not supported
+ case '-':
+ negate = true;
+ flagIndex++;
+ break;
+ case ':':
+ case ')':
+ endFlag = pattern[flagIndex];
+ break;
+ default:
+ throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index);
+ }
+ }
+ if (endFlag == ')') {
+ syntax = newSyntax;
+ cflags = newCflags;
+ insens = ((cflags & REG_ICASE) > 0);
+ // This can be treated as though it were a comment.
+ comment = true;
+ index = flagIndex - 1;
+ break;
+ }
+ if (endFlag == ':') {
+ savedSyntax = syntax;
+ savedCflags = cflags;
+ flagsSaved = true;
+ syntax = newSyntax;
+ cflags = newCflags;
+ insens = ((cflags & REG_ICASE) > 0);
+ index = flagIndex -1;
+ // Fall through to the next case.
+ }
+ else {
+ throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index);
+ }
case ':':
if (syntax.get(RESyntax.RE_PURE_GROUPING)) {
pure = true;
@@ -607,15 +653,28 @@ public class RE extends REToken {
numSubs++;
}
- int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs;
+ int useIndex = (pure || lookAhead || lookBehind || independent) ?
+ 0 : nextSub + numSubs;
currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs);
numSubs += ((RE) currentToken).getNumSubs();
if (lookAhead) {
currentToken = new RETokenLookAhead(currentToken,negativelh);
}
+ else if (lookBehind) {
+ currentToken = new RETokenLookBehind(currentToken,negativelb);
+ }
+ else if (independent) {
+ currentToken = new RETokenIndependent(currentToken);
+ }
index = nextIndex;
+ if (flagsSaved) {
+ syntax = savedSyntax;
+ cflags = savedCflags;
+ insens = ((cflags & REG_ICASE) > 0);
+ flagsSaved = false;
+ }
} // not a comment
} // subexpression
@@ -715,14 +774,45 @@ public class RE extends REToken {
else
currentToken = setRepeated(currentToken,0,1,index);
}
+
+ // OCTAL CHARACTER
+ // \0377
+ else if (unit.bk && (unit.ch == '0') && syntax.get(RESyntax.RE_OCTAL_CHAR)) {
+ CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid octal character", REException.REG_ESCAPE, index);
+ index = index - 2 + ce.len;
+ addToken(currentToken);
+ currentToken = new RETokenChar(subIndex,ce.ch,insens);
+ }
+
// BACKREFERENCE OPERATOR
- // \1 \2 ... \9
+ // \1 \2 ... \9 and \10 \11 \12 ...
// not available if RE_NO_BK_REFS is set
+ // Perl recognizes \10, \11, and so on only if enough number of
+ // parentheses have opened before it, otherwise they are treated
+ // as aliases of \010, \011, ... (octal characters). In case of
+ // Sun's JDK, octal character expression must always begin with \0.
+ // We will do as JDK does. But FIXME, take a look at "(a)(b)\29".
+ // JDK treats \2 as a back reference to the 2nd group because
+ // there are only two groups. But in our poor implementation,
+ // we cannot help but treat \29 as a back reference to the 29th group.
else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) {
addToken(currentToken);
- currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens);
+ int numBegin = index - 1;
+ int numEnd = pLength;
+ for (int i = index; i < pLength; i++) {
+ if (! Character.isDigit(pattern[i])) {
+ numEnd = i;
+ break;
+ }
+ }
+ int num = parseInt(pattern, numBegin, numEnd-numBegin, 10);
+
+ currentToken = new RETokenBackRef(subIndex,num,insens);
+ index = numEnd;
}
// START OF STRING OPERATOR
@@ -844,6 +934,32 @@ public class RE extends REToken {
currentToken = new RETokenEnd(subIndex,null);
}
+ // HEX CHARACTER, UNICODE CHARACTER
+ // \x1B, \u1234
+
+ else if ((unit.bk && (unit.ch == 'x') && syntax.get(RESyntax.RE_HEX_CHAR)) ||
+ (unit.bk && (unit.ch == 'u') && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
+ CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid hex character", REException.REG_ESCAPE, index);
+ index = index - 2 + ce.len;
+ addToken(currentToken);
+ currentToken = new RETokenChar(subIndex,ce.ch,insens);
+ }
+
+ // NAMED PROPERTY
+ // \p{prop}, \P{prop}
+
+ else if ((unit.bk && (unit.ch == 'p') && syntax.get(RESyntax.RE_NAMED_PROPERTY)) ||
+ (unit.bk && (unit.ch == 'P') && syntax.get(RESyntax.RE_NAMED_PROPERTY))) {
+ NamedProperty np = getNamedProperty(pattern, index - 2, pLength);
+ if (np == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ index = index - 2 + np.len;
+ addToken(currentToken);
+ currentToken = getRETokenNamedProperty(subIndex,np,insens,index);
+ }
+
// NON-SPECIAL CHARACTER (or escape to make literal)
// c | \* for example
@@ -857,9 +973,10 @@ public class RE extends REToken {
addToken(currentToken);
if (branches != null) {
- branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength));
+ branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, maximumLength));
branches.trimToSize(); // compact the Vector
minimumLength = 0;
+ maximumLength = 0;
firstToken = lastToken = null;
addToken(new RETokenOneOf(subIndex,branches,false));
}
@@ -867,6 +984,199 @@ public class RE extends REToken {
}
+ private static class ParseCharClassResult {
+ RETokenOneOf token;
+ int index;
+ boolean returnAtAndOperator = false;
+ }
+
+ /**
+ * Parse [...] or [^...] and make an RETokenOneOf instance.
+ * @param subIndex subIndex to be given to the created RETokenOneOf instance.
+ * @param pattern Input array of characters to be parsed.
+ * @param index Index pointing to the character next to the beginning '['.
+ * @param pLength Limit of the input array.
+ * @param cflags Compilation flags used to parse the pattern.
+ * @param pflags Flags that affect the behavior of this method.
+ * @param syntax Syntax used to parse the pattern.
+ */
+ private static ParseCharClassResult parseCharClass(int subIndex,
+ char[] pattern, int index,
+ int pLength, int cflags, RESyntax syntax, int pflags)
+ throws REException {
+
+ boolean insens = ((cflags & REG_ICASE) > 0);
+ Vector options = new Vector();
+ Vector addition = new Vector();
+ boolean additionAndAppeared = false;
+ final int RETURN_AT_AND = 0x01;
+ boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0);
+ boolean negative = false;
+ char ch;
+
+ char lastChar = 0;
+ boolean lastCharIsSet = false;
+ if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index);
+
+ // Check for initial caret, negation
+ if ((ch = pattern[index]) == '^') {
+ negative = true;
+ if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ ch = pattern[index];
+ }
+
+ // Check for leading right bracket literal
+ if (ch == ']') {
+ lastChar = ch; lastCharIsSet = true;
+ if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ }
+
+ while ((ch = pattern[index++]) != ']') {
+ if ((ch == '-') && (lastCharIsSet)) {
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ if ((ch = pattern[index]) == ']') {
+ options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ lastChar = '-';
+ } else {
+ if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
+ CharExpression ce = getCharExpression(pattern, index, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ ch = ce.ch;
+ index = index + ce.len - 1;
+ }
+ options.addElement(new RETokenRange(subIndex,lastChar,ch,insens));
+ lastChar = 0; lastCharIsSet = false;
+ index++;
+ }
+ } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) {
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ int posixID = -1;
+ boolean negate = false;
+ char asciiEsc = 0;
+ boolean asciiEscIsSet = false;
+ NamedProperty np = null;
+ if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) {
+ switch (pattern[index]) {
+ case 'D':
+ negate = true;
+ case 'd':
+ posixID = RETokenPOSIX.DIGIT;
+ break;
+ case 'S':
+ negate = true;
+ case 's':
+ posixID = RETokenPOSIX.SPACE;
+ break;
+ case 'W':
+ negate = true;
+ case 'w':
+ posixID = RETokenPOSIX.ALNUM;
+ break;
+ }
+ }
+ if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) {
+ np = getNamedProperty(pattern, index - 1, pLength);
+ if (np == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ index = index - 1 + np.len - 1;
+ }
+ else {
+ CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax);
+ if (ce == null)
+ throw new REException("invalid escape sequence", REException.REG_ESCAPE, index);
+ asciiEsc = ce.ch; asciiEscIsSet = true;
+ index = index - 1 + ce.len - 1;
+ }
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+
+ if (posixID != -1) {
+ options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate));
+ } else if (np != null) {
+ options.addElement(getRETokenNamedProperty(subIndex,np,insens,index));
+ } else if (asciiEscIsSet) {
+ lastChar = asciiEsc; lastCharIsSet = true;
+ } else {
+ lastChar = pattern[index]; lastCharIsSet = true;
+ }
+ ++index;
+ } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) {
+ StringBuffer posixSet = new StringBuffer();
+ index = getPosixSet(pattern,index+1,posixSet);
+ int posixId = RETokenPOSIX.intValue(posixSet.toString());
+ if (posixId != -1)
+ options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false));
+ } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) {
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index, pLength, cflags, syntax, 0);
+ addition.addElement(result.token);
+ addition.addElement("|");
+ index = result.index;
+ } else if ((ch == '&') &&
+ (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) &&
+ (index < pLength) && (pattern[index] == '&')) {
+ if (returnAtAndOperator) {
+ ParseCharClassResult result = new ParseCharClassResult();
+ options.trimToSize();
+ if (additionAndAppeared) addition.addElement("&");
+ if (addition.size() == 0) addition = null;
+ result.token = new RETokenOneOf(subIndex,
+ options, addition, negative);
+ result.index = index - 1;
+ result.returnAtAndOperator = true;
+ return result;
+ }
+ // The precedence of the operator "&&" is the lowest.
+ // So we postpone adding "&" until other elements
+ // are added. And we insert Boolean.FALSE at the
+ // beginning of the list of tokens following "&&".
+ // So, "&&[a-b][k-m]" will be stored in the Vecter
+ // addition in this order:
+ // Boolean.FALSE, [a-b], "|", [k-m], "|", "&"
+ if (additionAndAppeared) addition.addElement("&");
+ addition.addElement(Boolean.FALSE);
+ additionAndAppeared = true;
+
+ // The part on which "&&" operates may be either
+ // (1) explicitly enclosed by []
+ // or
+ // (2) not enclosed by [] and terminated by the
+ // next "&&" or the end of the character list.
+ // Let the preceding else if block do the case (1).
+ // We must do something in case of (2).
+ if ((index + 1 < pLength) && (pattern[index + 1] != '[')) {
+ ParseCharClassResult result = parseCharClass(
+ subIndex, pattern, index+1, pLength, cflags, syntax,
+ RETURN_AT_AND);
+ addition.addElement(result.token);
+ addition.addElement("|");
+ // If the method returned at the next "&&", it is OK.
+ // Otherwise we have eaten the mark of the end of this
+ // character list "]". In this case we must give back
+ // the end mark.
+ index = (result.returnAtAndOperator ?
+ result.index: result.index - 1);
+ }
+ } else {
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+ lastChar = ch; lastCharIsSet = true;
+ }
+ if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index);
+ } // while in list
+ // Out of list, index is one past ']'
+
+ if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens));
+
+ ParseCharClassResult result = new ParseCharClassResult();
+ // Create a new RETokenOneOf
+ options.trimToSize();
+ if (additionAndAppeared) addition.addElement("&");
+ if (addition.size() == 0) addition = null;
+ result.token = new RETokenOneOf(subIndex,options, addition, negative);
+ result.index = index;
+ return result;
+ }
+
private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException {
unit.ch = input[index++];
unit.bk = (unit.ch == '\\'
@@ -878,6 +1188,176 @@ public class RE extends REToken {
return index;
}
+ private static int parseInt(char[] input, int pos, int len, int radix) {
+ int ret = 0;
+ for (int i = pos; i < pos + len; i++) {
+ ret = ret * radix + Character.digit(input[i], radix);
+ }
+ return ret;
+ }
+
+ /**
+ * This class represents various expressions for a character.
+ * "a" : 'a' itself.
+ * "\0123" : Octal char 0123
+ * "\x1b" : Hex char 0x1b
+ * "\u1234" : Unicode char \u1234
+ */
+ private static class CharExpression {
+ /** character represented by this expression */
+ char ch;
+ /** String expression */
+ String expr;
+ /** length of this expression */
+ int len;
+ public String toString() { return expr; }
+ }
+
+ private static CharExpression getCharExpression(char[] input, int pos, int lim,
+ RESyntax syntax) {
+ CharExpression ce = new CharExpression();
+ char c = input[pos];
+ if (c == '\\') {
+ if (pos + 1 >= lim) return null;
+ c = input[pos + 1];
+ switch(c) {
+ case 't':
+ ce.ch = '\t';
+ ce.len = 2;
+ break;
+ case 'n':
+ ce.ch = '\n';
+ ce.len = 2;
+ break;
+ case 'r':
+ ce.ch = '\r';
+ ce.len = 2;
+ break;
+ case 'x':
+ case 'u':
+ if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) ||
+ (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) {
+ int l = 0;
+ int expectedLength = (c == 'x' ? 2 : 4);
+ for (int i = pos + 2; i < pos + 2 + expectedLength; i++) {
+ if (i >= lim) break;
+ if (!((input[i] >= '0' && input[i] <= '9') ||
+ (input[i] >= 'A' && input[i] <= 'F') ||
+ (input[i] >= 'a' && input[i] <= 'f')))
+ break;
+ l++;
+ }
+ if (l != expectedLength) return null;
+ ce.ch = (char)(parseInt(input, pos + 2, l, 16));
+ ce.len = l + 2;
+ }
+ else {
+ ce.ch = c;
+ ce.len = 2;
+ }
+ break;
+ case '0':
+ if (syntax.get(RESyntax.RE_OCTAL_CHAR)) {
+ int l = 0;
+ for (int i = pos + 2; i < pos + 2 + 3; i++) {
+ if (i >= lim) break;
+ if (input[i] < '0' || input[i] > '7') break;
+ l++;
+ }
+ if (l == 3 && input[pos + 2] > '3') l--;
+ if (l <= 0) return null;
+ ce.ch = (char)(parseInt(input, pos + 2, l, 8));
+ ce.len = l + 2;
+ }
+ else {
+ ce.ch = c;
+ ce.len = 2;
+ }
+ break;
+ default:
+ ce.ch = c;
+ ce.len = 2;
+ break;
+ }
+ }
+ else {
+ ce.ch = input[pos];
+ ce.len = 1;
+ }
+ ce.expr = new String(input, pos, ce.len);
+ return ce;
+ }
+
+ /**
+ * This class represents a substring in a pattern string expressing
+ * a named property.
+ * "\pA" : Property named "A"
+ * "\p{prop}" : Property named "prop"
+ * "\PA" : Property named "A" (Negated)
+ * "\P{prop}" : Property named "prop" (Negated)
+ */
+ private static class NamedProperty {
+ /** Property name */
+ String name;
+ /** Negated or not */
+ boolean negate;
+ /** length of this expression */
+ int len;
+ }
+
+ private static NamedProperty getNamedProperty(char[] input, int pos, int lim) {
+ NamedProperty np = new NamedProperty();
+ char c = input[pos];
+ if (c == '\\') {
+ if (++pos >= lim) return null;
+ c = input[pos++];
+ switch(c) {
+ case 'p':
+ np.negate = false;
+ break;
+ case 'P':
+ np.negate = true;
+ break;
+ default:
+ return null;
+ }
+ c = input[pos++];
+ if (c == '{') {
+ int p = -1;
+ for (int i = pos; i < lim; i++) {
+ if (input[i] == '}') {
+ p = i;
+ break;
+ }
+ }
+ if (p < 0) return null;
+ int len = p - pos;
+ np.name = new String(input, pos, len);
+ np.len = len + 4;
+ }
+ else {
+ np.name = new String(input, pos - 1, 1);
+ np.len = 3;
+ }
+ return np;
+ }
+ else return null;
+ }
+
+ private static RETokenNamedProperty getRETokenNamedProperty(
+ int subIndex, NamedProperty np, boolean insens, int index)
+ throws REException {
+ try {
+ return new RETokenNamedProperty(subIndex, np.name, insens, np.negate);
+ }
+ catch (REException e) {
+ REException ree;
+ ree = new REException(e.getMessage(), REException.REG_ESCAPE, index);
+ ree.initCause(e);
+ throw ree;
+ }
+ }
+
/**
* Checks if the regular expression matches the input in its entirety.
*
@@ -958,6 +1438,10 @@ public class RE extends REToken {
return minimumLength;
}
+ public int getMaximumLength() {
+ return maximumLength;
+ }
+
/**
* Returns an array of all matches found in the input.
*
@@ -1025,7 +1509,9 @@ public class RE extends REToken {
/* Implements abstract method REToken.match() */
boolean match(CharIndexed input, REMatch mymatch) {
- if (firstToken == null) return next(input, mymatch);
+ if (firstToken == null) {
+ return next(input, mymatch);
+ }
// Note the start of this subexpression
mymatch.start[subIndex] = mymatch.index;
@@ -1089,23 +1575,34 @@ public class RE extends REToken {
}
REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) {
+ boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0);
+ RE re = (tryEntireMatch ? (RE) this.clone() : this);
+ if (tryEntireMatch) {
+ re.chain(new RETokenEnd(0, null));
+ }
// Create a new REMatch to hold results
REMatch mymatch = new REMatch(numSubs, anchor, eflags);
do {
// Optimization: check if anchor + minimumLength > length
if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) {
- if (match(input, mymatch)) {
- // Find longest match of them all to observe leftmost longest
- REMatch longest = mymatch;
+ if (re.match(input, mymatch)) {
+ REMatch best = mymatch;
+ // We assume that the match that coms first is the best.
+ // And the following "The longer, the better" rule has
+ // been commented out. The longest is not neccesarily
+ // the best. For example, "a" out of "aaa" is the best
+ // match for /a+?/.
+ /*
+ // Find best match of them all to observe leftmost longest
while ((mymatch = mymatch.next) != null) {
- if (mymatch.index > longest.index) {
- longest = mymatch;
+ if (mymatch.index > best.index) {
+ best = mymatch;
}
}
-
- longest.end[0] = longest.index;
- longest.finish(input);
- return longest;
+ */
+ best.end[0] = best.index;
+ best.finish(input);
+ return best;
}
}
mymatch.clear(++anchor);
@@ -1216,8 +1713,7 @@ public class RE extends REToken {
StringBuffer buffer = new StringBuffer();
REMatch m = getMatchImpl(input,index,eflags,buffer);
if (m==null) return buffer.toString();
- buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
- replace : m.substituteInto(replace) );
+ buffer.append(getReplacement(replace, m, eflags));
if (input.move(m.end[0])) {
do {
buffer.append(input.charAt(0));
@@ -1278,8 +1774,7 @@ public class RE extends REToken {
StringBuffer buffer = new StringBuffer();
REMatch m;
while ((m = getMatchImpl(input,index,eflags,buffer)) != null) {
- buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ?
- replace : m.substituteInto(replace) );
+ buffer.append(getReplacement(replace, m, eflags));
index = m.getEndIndex();
if (m.end[0] == 0) {
char ch = input.charAt(0);
@@ -1294,11 +1789,50 @@ public class RE extends REToken {
}
return buffer.toString();
}
+
+ public static String getReplacement(String replace, REMatch m, int eflags) {
+ if ((eflags & REG_NO_INTERPOLATE) > 0)
+ return replace;
+ else {
+ if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) {
+ StringBuffer sb = new StringBuffer();
+ int l = replace.length();
+ for (int i = 0; i < l; i++) {
+ char c = replace.charAt(i);
+ switch(c) {
+ case '\\':
+ i++;
+ // Let StringIndexOutOfBoundsException be thrown.
+ sb.append(replace.charAt(i));
+ break;
+ case '$':
+ int i1 = i + 1;
+ while (i1 < replace.length() &&
+ Character.isDigit(replace.charAt(i1))) i1++;
+ sb.append(m.substituteInto(replace.substring(i, i1)));
+ i = i1 - 1;
+ break;
+ default:
+ sb.append(c);
+ }
+ }
+ return sb.toString();
+ }
+ else
+ return m.substituteInto(replace);
+ }
+ }
/* Helper function for constructor */
private void addToken(REToken next) {
if (next == null) return;
minimumLength += next.getMinimumLength();
+ int nmax = next.getMaximumLength();
+ if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE)
+ maximumLength += nmax;
+ else
+ maximumLength = Integer.MAX_VALUE;
+
if (firstToken == null) {
lastToken = firstToken = next;
} else {
diff --git a/libjava/classpath/gnu/regexp/REMatch.java b/libjava/classpath/gnu/regexp/REMatch.java
index cf25bb331c5..91a3c0249c0 100644
--- a/libjava/classpath/gnu/regexp/REMatch.java
+++ b/libjava/classpath/gnu/regexp/REMatch.java
@@ -1,5 +1,5 @@
/* gnu/regexp/REMatch.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -67,6 +67,10 @@ public final class REMatch implements Serializable, Cloneable {
int[] start; // start positions (relative to offset) for each (sub)exp.
int[] end; // end positions for the same
REMatch next; // other possibility (to avoid having to use arrays)
+ boolean empty; // empty string matched. This flag is used only within
+ // RETokenRepeated.
+ int matchFlags; // flags passed to match methods
+ static final int MF_FIND_ALL = 0x01;
public Object clone() {
try {
@@ -177,7 +181,9 @@ public final class REMatch implements Serializable, Cloneable {
* @param sub Index of the subexpression.
*/
public String toString(int sub) {
- if ((sub >= start.length) || (start[sub] == -1)) return "";
+ if ((sub >= start.length) || sub < 0)
+ throw new IndexOutOfBoundsException("No group " + sub);
+ if (start[sub] == -1) return null;
return (matchedText.substring(start[sub],end[sub]));
}
@@ -242,6 +248,8 @@ public final class REMatch implements Serializable, Cloneable {
* <code>$0</code> through <code>$9</code>. <code>$0</code> matches
* the full substring matched; <code>$<i>n</i></code> matches
* subexpression number <i>n</i>.
+ * <code>$10, $11, ...</code> may match the 10th, 11th, ... subexpressions
+ * if such subexpressions exist.
*
* @param input A string consisting of literals and <code>$<i>n</i></code> tokens.
*/
@@ -252,6 +260,16 @@ public final class REMatch implements Serializable, Cloneable {
for (pos = 0; pos < input.length()-1; pos++) {
if ((input.charAt(pos) == '$') && (Character.isDigit(input.charAt(pos+1)))) {
int val = Character.digit(input.charAt(++pos),10);
+ int pos1 = pos + 1;
+ while (pos1 < input.length() &&
+ Character.isDigit(input.charAt(pos1))) {
+ int val1 = val*10 + Character.digit(input.charAt(pos1),10);
+ if (val1 >= start.length) break;
+ pos1++;
+ val = val1;
+ }
+ pos = pos1 - 1;
+
if (val < start.length) {
output.append(toString(val));
}
@@ -260,4 +278,42 @@ public final class REMatch implements Serializable, Cloneable {
if (pos < input.length()) output.append(input.charAt(pos));
return output.toString();
}
+
+ static class REMatchList {
+ REMatch head;
+ REMatch tail;
+ REMatchList() {
+ head = tail = null;
+ }
+ /* Not used now. But we may need this some day?
+ void addHead(REMatch newone) {
+ if (head == null) {
+ head = newone;
+ tail = newone;
+ while (tail.next != null) {
+ tail = tail.next;
+ }
+ }
+ else {
+ REMatch tmp = newone;
+ while (tmp.next != null) tmp = tmp.next;
+ tmp.next = head;
+ head = newone;
+ }
+ }
+ */
+ void addTail(REMatch newone) {
+ if (head == null) {
+ head = newone;
+ tail = newone;
+ }
+ else {
+ tail.next = newone;
+ }
+ while (tail.next != null) {
+ tail = tail.next;
+ }
+ }
+ }
+
}
diff --git a/libjava/classpath/gnu/regexp/RESyntax.java b/libjava/classpath/gnu/regexp/RESyntax.java
index 7272b03481b..81fd999bfcf 100644
--- a/libjava/classpath/gnu/regexp/RESyntax.java
+++ b/libjava/classpath/gnu/regexp/RESyntax.java
@@ -1,5 +1,5 @@
/* gnu/regexp/RESyntax.java
- Copyright (C) 1998-2002, 2004 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -202,7 +202,37 @@ public final class RESyntax implements Serializable {
*/
public static final int RE_POSSESSIVE_OPS = 25;
- private static final int BIT_TOTAL = 26;
+ /**
+ * Syntax bit. Allow embedded flags, (?is-x), as in Perl5.
+ */
+ public static final int RE_EMBEDDED_FLAGS = 26;
+
+ /**
+ * Syntax bit. Allow octal char (\0377), as in Perl5.
+ */
+ public static final int RE_OCTAL_CHAR = 27;
+
+ /**
+ * Syntax bit. Allow hex char (\x1b), as in Perl5.
+ */
+ public static final int RE_HEX_CHAR = 28;
+
+ /**
+ * Syntax bit. Allow Unicode char (\u1234), as in Java 1.4.
+ */
+ public static final int RE_UNICODE_CHAR = 29;
+
+ /**
+ * Syntax bit. Allow named property (\p{P}, \P{p}), as in Perl5.
+ */
+ public static final int RE_NAMED_PROPERTY = 30;
+
+ /**
+ * Syntax bit. Allow nested characterclass ([a-z&&[^p-r]]), as in Java 1.4.
+ */
+ public static final int RE_NESTED_CHARCLASS = 31;
+
+ private static final int BIT_TOTAL = 32;
/**
* Predefined syntax.
@@ -422,6 +452,10 @@ public final class RESyntax implements Serializable {
.set(RE_STRING_ANCHORS) // \A,\Z
.set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within []
.set(RE_COMMENTS) // (?#)
+ .set(RE_EMBEDDED_FLAGS) // (?imsx-imsx)
+ .set(RE_OCTAL_CHAR) // \0377
+ .set(RE_HEX_CHAR) // \x1b
+ .set(RE_NAMED_PROPERTY) // \p{prop}, \P{prop}
.makeFinal();
RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5)
@@ -431,6 +465,8 @@ public final class RESyntax implements Serializable {
RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5)
// XXX
.set(RE_POSSESSIVE_OPS) // *+,?+,++,{}+
+ .set(RE_UNICODE_CHAR) // \u1234
+ .set(RE_NESTED_CHARCLASS) // [a-z&&[^p-r]]
.makeFinal();
}
diff --git a/libjava/classpath/gnu/regexp/REToken.java b/libjava/classpath/gnu/regexp/REToken.java
index 4eae9ec473c..5f4659b21ac 100644
--- a/libjava/classpath/gnu/regexp/REToken.java
+++ b/libjava/classpath/gnu/regexp/REToken.java
@@ -38,12 +38,21 @@ exception statement from your version. */
package gnu.regexp;
import java.io.Serializable;
-abstract class REToken implements Serializable {
+abstract class REToken implements Serializable, Cloneable {
protected REToken next = null;
protected REToken uncle = null;
protected int subIndex;
+ public Object clone() {
+ try {
+ REToken copy = (REToken) super.clone();
+ return copy;
+ } catch (CloneNotSupportedException e) {
+ throw new Error(); // doesn't happen
+ }
+ }
+
protected REToken(int subIndex) {
this.subIndex = subIndex;
}
@@ -52,6 +61,10 @@ abstract class REToken implements Serializable {
return 0;
}
+ int getMaximumLength() {
+ return Integer.MAX_VALUE;
+ }
+
void setUncle(REToken anUncle) {
uncle = anUncle;
}
diff --git a/libjava/classpath/gnu/regexp/RETokenAny.java b/libjava/classpath/gnu/regexp/RETokenAny.java
index ac032dcb3bf..2b0967a79a1 100644
--- a/libjava/classpath/gnu/regexp/RETokenAny.java
+++ b/libjava/classpath/gnu/regexp/RETokenAny.java
@@ -55,6 +55,10 @@ final class RETokenAny extends REToken {
return 1;
}
+ int getMaximumLength() {
+ return 1;
+ }
+
boolean match(CharIndexed input, REMatch mymatch) {
char ch = input.charAt(mymatch.index);
if ((ch == CharIndexed.OUT_OF_BOUNDS)
diff --git a/libjava/classpath/gnu/regexp/RETokenBackRef.java b/libjava/classpath/gnu/regexp/RETokenBackRef.java
index 674822abd70..060a6cf7d20 100644
--- a/libjava/classpath/gnu/regexp/RETokenBackRef.java
+++ b/libjava/classpath/gnu/regexp/RETokenBackRef.java
@@ -51,13 +51,25 @@ final class RETokenBackRef extends REToken {
// should implement getMinimumLength() -- any ideas?
boolean match(CharIndexed input, REMatch mymatch) {
+ if (num >= mymatch.start.length) return false;
+ if (num >= mymatch.end.length) return false;
int b,e;
b = mymatch.start[num];
e = mymatch.end[num];
if ((b==-1)||(e==-1)) return false; // this shouldn't happen, but...
for (int i=b; i<e; i++) {
- if (input.charAt(mymatch.index+i-b) != input.charAt(i)) {
- return false;
+ char c1 = input.charAt(mymatch.index+i-b);
+ char c2 = input.charAt(i);
+ if (c1 != c2) {
+ if (insens) {
+ if (c1 != Character.toLowerCase(c2) &&
+ c1 != Character.toUpperCase(c2)) {
+ return false;
+ }
+ }
+ else {
+ return false;
+ }
}
}
mymatch.index += e-b;
diff --git a/libjava/classpath/gnu/regexp/RETokenChar.java b/libjava/classpath/gnu/regexp/RETokenChar.java
index a15449b2d96..5c087c68778 100644
--- a/libjava/classpath/gnu/regexp/RETokenChar.java
+++ b/libjava/classpath/gnu/regexp/RETokenChar.java
@@ -52,6 +52,10 @@ final class RETokenChar extends REToken {
return ch.length;
}
+ int getMaximumLength() {
+ return ch.length;
+ }
+
boolean match(CharIndexed input, REMatch mymatch) {
int z = ch.length;
char c;
@@ -68,7 +72,7 @@ final class RETokenChar extends REToken {
// Overrides REToken.chain() to optimize for strings
boolean chain(REToken next) {
- if (next instanceof RETokenChar) {
+ if (next instanceof RETokenChar && ((RETokenChar)next).insens == insens) {
RETokenChar cnext = (RETokenChar) next;
// assume for now that next can only be one character
int newsize = ch.length + cnext.ch.length;
diff --git a/libjava/classpath/gnu/regexp/RETokenEnd.java b/libjava/classpath/gnu/regexp/RETokenEnd.java
index 70483b746a9..788a964da41 100644
--- a/libjava/classpath/gnu/regexp/RETokenEnd.java
+++ b/libjava/classpath/gnu/regexp/RETokenEnd.java
@@ -49,6 +49,10 @@ final class RETokenEnd extends REToken {
this.newline = newline;
}
+ int getMaximumLength() {
+ return 0;
+ }
+
boolean match(CharIndexed input, REMatch mymatch) {
char ch = input.charAt(mymatch.index);
if (ch == CharIndexed.OUT_OF_BOUNDS)
diff --git a/libjava/classpath/gnu/regexp/RETokenEndSub.java b/libjava/classpath/gnu/regexp/RETokenEndSub.java
index f3bb4f2e131..fe2969d0592 100644
--- a/libjava/classpath/gnu/regexp/RETokenEndSub.java
+++ b/libjava/classpath/gnu/regexp/RETokenEndSub.java
@@ -41,6 +41,10 @@ final class RETokenEndSub extends REToken {
RETokenEndSub(int subIndex) {
super(subIndex);
}
+
+ int getMaximumLength() {
+ return 0;
+ }
boolean match(CharIndexed input, REMatch mymatch) {
mymatch.end[subIndex] = mymatch.index;
diff --git a/libjava/classpath/gnu/regexp/RETokenIndependent.java b/libjava/classpath/gnu/regexp/RETokenIndependent.java
new file mode 100644
index 00000000000..2eb14722361
--- /dev/null
+++ b/libjava/classpath/gnu/regexp/RETokenIndependent.java
@@ -0,0 +1,76 @@
+/* gnu/regexp/RETokenIndependent.java
+ Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Classpath.
+
+GNU Classpath is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Classpath is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Classpath; see the file COPYING. If not, write to the
+Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301 USA.
+
+Linking this library statically or dynamically with other modules is
+making a combined work based on this library. Thus, the terms and
+conditions of the GNU General Public License cover the whole
+combination.
+
+As a special exception, the copyright holders of this library give you
+permission to link this library with independent modules to produce an
+executable, regardless of the license terms of these independent
+modules, and to copy and distribute the resulting executable under
+terms of your choice, provided that you also meet, for each linked
+independent module, the terms and conditions of the license of that
+module. An independent module is a module which is not derived from
+or based on this library. If you modify this library, you may extend
+this exception to your version of the library, but you are not
+obligated to do so. If you do not wish to do so, delete this
+exception statement from your version. */
+
+package gnu.regexp;
+
+/**
+ * @author Ito Kazumitsu
+ */
+final class RETokenIndependent extends REToken
+{
+ REToken re;
+
+ RETokenIndependent(REToken re) throws REException {
+ super(0);
+ this.re = re;
+ }
+
+ int getMinimumLength() {
+ return re.getMinimumLength();
+ }
+
+ int getMaximumLength() {
+ return re.getMaximumLength();
+ }
+
+ boolean match(CharIndexed input, REMatch mymatch)
+ {
+ if (re.match(input, mymatch)) {
+ // Once we have found a match, we do not see other possible matches.
+ mymatch.next = null;
+ return next(input, mymatch);
+ }
+ return false;
+ }
+
+ void dump(StringBuffer os) {
+ os.append("(?>");
+ re.dumpAll(os);
+ os.append(')');
+ }
+}
+
diff --git a/libjava/classpath/gnu/regexp/RETokenLookAhead.java b/libjava/classpath/gnu/regexp/RETokenLookAhead.java
index 33eaec9fac1..b44dfa50c4f 100644
--- a/libjava/classpath/gnu/regexp/RETokenLookAhead.java
+++ b/libjava/classpath/gnu/regexp/RETokenLookAhead.java
@@ -52,6 +52,10 @@ final class RETokenLookAhead extends REToken
this.negative = negative;
}
+ int getMaximumLength() {
+ return 0;
+ }
+
boolean match(CharIndexed input, REMatch mymatch)
{
REMatch trymatch = (REMatch)mymatch.clone();
diff --git a/libjava/classpath/gnu/regexp/RETokenLookBehind.java b/libjava/classpath/gnu/regexp/RETokenLookBehind.java
new file mode 100644
index 00000000000..a6c1b34cb0b
--- /dev/null
+++ b/libjava/classpath/gnu/regexp/RETokenLookBehind.java
@@ -0,0 +1,116 @@
+/* gnu/regexp/RETokenLookBehind.java
+ Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Classpath.
+
+GNU Classpath is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Classpath is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Classpath; see the file COPYING. If not, write to the
+Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301 USA.
+
+Linking this library statically or dynamically with other modules is
+making a combined work based on this library. Thus, the terms and
+conditions of the GNU General Public License cover the whole
+combination.
+
+As a special exception, the copyright holders of this library give you
+permission to link this library with independent modules to produce an
+executable, regardless of the license terms of these independent
+modules, and to copy and distribute the resulting executable under
+terms of your choice, provided that you also meet, for each linked
+independent module, the terms and conditions of the license of that
+module. An independent module is a module which is not derived from
+or based on this library. If you modify this library, you may extend
+this exception to your version of the library, but you are not
+obligated to do so. If you do not wish to do so, delete this
+exception statement from your version. */
+
+package gnu.regexp;
+
+/**
+ * @author Ito Kazumitsu
+ */
+final class RETokenLookBehind extends REToken
+{
+ REToken re;
+ boolean negative;
+
+ RETokenLookBehind(REToken re, boolean negative) throws REException {
+ super(0);
+ this.re = re;
+ this.negative = negative;
+ }
+
+ int getMaximumLength() {
+ return 0;
+ }
+
+ boolean match(CharIndexed input, REMatch mymatch)
+ {
+ int max = re.getMaximumLength();
+ CharIndexed behind = input.lookBehind(mymatch.index, max);
+ REMatch trymatch = (REMatch)mymatch.clone();
+ REMatch trymatch1 = (REMatch)mymatch.clone();
+ REMatch newMatch = null;
+ int curIndex = trymatch.index + behind.length() - input.length();
+ trymatch.index = 0;
+ RETokenMatchHereOnly stopper = new RETokenMatchHereOnly(curIndex);
+ REToken re1 = (REToken) re.clone();
+ re1.chain(stopper);
+ if (re1.match(behind, trymatch)) {
+ if (negative) return false;
+ if (next(input, trymatch1))
+ newMatch = trymatch1;
+ }
+
+ if (newMatch != null) {
+ if (negative) return false;
+ //else
+ mymatch.assignFrom(newMatch);
+ return true;
+ }
+ else { // no match
+ if (negative)
+ return next(input, mymatch);
+ //else
+ return false;
+ }
+ }
+
+ void dump(StringBuffer os) {
+ os.append("(?<");
+ os.append(negative ? '!' : '=');
+ re.dumpAll(os);
+ os.append(')');
+ }
+
+ private static class RETokenMatchHereOnly extends REToken {
+
+ int getMaximumLength() { return 0; }
+
+ private int index;
+
+ RETokenMatchHereOnly(int index) {
+ super(0);
+ this.index = index;
+ }
+
+ boolean match(CharIndexed input, REMatch mymatch) {
+ return index == mymatch.index;
+ }
+
+ void dump(StringBuffer os) {}
+
+ }
+}
+
diff --git a/libjava/classpath/gnu/regexp/RETokenNamedProperty.java b/libjava/classpath/gnu/regexp/RETokenNamedProperty.java
new file mode 100644
index 00000000000..13c1e418a09
--- /dev/null
+++ b/libjava/classpath/gnu/regexp/RETokenNamedProperty.java
@@ -0,0 +1,301 @@
+/* gnu/regexp/RETokenNamedProperty.java
+ Copyright (C) 2006 Free Software Foundation, Inc.
+
+This file is part of GNU Classpath.
+
+GNU Classpath is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2, or (at your option)
+any later version.
+
+GNU Classpath is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with GNU Classpath; see the file COPYING. If not, write to the
+Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301 USA.
+
+Linking this library statically or dynamically with other modules is
+making a combined work based on this library. Thus, the terms and
+conditions of the GNU General Public License cover the whole
+combination.
+
+As a special exception, the copyright holders of this library give you
+permission to link this library with independent modules to produce an
+executable, regardless of the license terms of these independent
+modules, and to copy and distribute the resulting executable under
+terms of your choice, provided that you also meet, for each linked
+independent module, the terms and conditions of the license of that
+module. An independent module is a module which is not derived from
+or based on this library. If you modify this library, you may extend
+this exception to your version of the library, but you are not
+obligated to do so. If you do not wish to do so, delete this
+exception statement from your version. */
+
+
+package gnu.regexp;
+
+final class RETokenNamedProperty extends REToken {
+ String name;
+ boolean insens;
+ boolean negate;
+ Handler handler;
+
+ // Grouped properties
+ static final byte[] LETTER = new byte[]
+ { Character.LOWERCASE_LETTER,
+ Character.UPPERCASE_LETTER,
+ Character.TITLECASE_LETTER,
+ Character.MODIFIER_LETTER,
+ Character.OTHER_LETTER };
+
+ static final byte[] MARK = new byte[]
+ { Character.NON_SPACING_MARK,
+ Character.COMBINING_SPACING_MARK,
+ Character.ENCLOSING_MARK };
+
+ static final byte[] SEPARATOR = new byte[]
+ { Character.SPACE_SEPARATOR,
+ Character.LINE_SEPARATOR,
+ Character.PARAGRAPH_SEPARATOR };
+
+ static final byte[] SYMBOL = new byte[]
+ { Character.MATH_SYMBOL,
+ Character.CURRENCY_SYMBOL,
+ Character.MODIFIER_SYMBOL,
+ Character.OTHER_SYMBOL };
+
+ static final byte[] NUMBER = new byte[]
+ { Character.DECIMAL_DIGIT_NUMBER,
+ Character.LETTER_NUMBER,
+ Character.OTHER_NUMBER };
+
+ static final byte[] PUNCTUATION = new byte[]
+ { Character.DASH_PUNCTUATION,
+ Character.START_PUNCTUATION,
+ Character.END_PUNCTUATION,
+ Character.CONNECTOR_PUNCTUATION,
+ Character.OTHER_PUNCTUATION,
+ Character.INITIAL_QUOTE_PUNCTUATION,
+ Character.FINAL_QUOTE_PUNCTUATION};
+
+ static final byte[] OTHER = new byte[]
+ { Character.CONTROL,
+ Character.FORMAT,
+ Character.PRIVATE_USE,
+ Character.SURROGATE,
+ Character.UNASSIGNED };
+
+ RETokenNamedProperty(int subIndex, String name, boolean insens, boolean negate) throws REException {
+ super(subIndex);
+ this.name = name;
+ this.insens = insens;
+ this.negate = negate;
+ handler = getHandler(name);
+ }
+
+ int getMinimumLength() {
+ return 1;
+ }
+
+ int getMaximumLength() {
+ return 1;
+ }
+
+ boolean match(CharIndexed input, REMatch mymatch) {
+ char ch = input.charAt(mymatch.index);
+ if (ch == CharIndexed.OUT_OF_BOUNDS)
+ return false;
+
+ boolean retval = handler.includes(ch);
+ if (insens) {
+ retval = retval ||
+ handler.includes(Character.toUpperCase(ch)) ||
+ handler.includes(Character.toLowerCase(ch));
+ }
+
+ if (negate) retval = !retval;
+ if (retval) {
+ ++mymatch.index;
+ return next(input, mymatch);
+ }
+ else return false;
+ }
+
+ void dump(StringBuffer os) {
+ os.append("\\")
+ .append(negate ? "P" : "p")
+ .append("{" + name + "}");
+ }
+
+ private abstract static class Handler {
+ public abstract boolean includes(char c);
+ }
+
+ private Handler getHandler(String name) throws REException {
+ if (name.equals("Lower") ||
+ name.equals("Upper") ||
+ // name.equals("ASCII") ||
+ name.equals("Alpha") ||
+ name.equals("Digit") ||
+ name.equals("Alnum") ||
+ name.equals("Punct") ||
+ name.equals("Graph") ||
+ name.equals("Print") ||
+ name.equals("Blank") ||
+ name.equals("Cntrl") ||
+ name.equals("XDigit") ||
+ name.equals("Space") ) {
+ return new POSIXHandler(name);
+ }
+ if (name.startsWith("In")) {
+ try {
+ name = name.substring(2);
+ Character.UnicodeBlock block = Character.UnicodeBlock.forName(name);
+ return new UnicodeBlockHandler(block);
+ }
+ catch (IllegalArgumentException e) {
+ throw new REException("Invalid Unicode block name: " + name, REException.REG_ESCAPE, 0);
+ }
+ }
+ if (name.startsWith("Is")) {
+ name = name.substring(2);
+ }
+
+ // "grouped properties"
+ if (name.equals("L"))
+ return new UnicodeCategoriesHandler(LETTER);
+ if (name.equals("M"))
+ return new UnicodeCategoriesHandler(MARK);
+ if (name.equals("Z"))
+ return new UnicodeCategoriesHandler(SEPARATOR);
+ if (name.equals("S"))
+ return new UnicodeCategoriesHandler(SYMBOL);
+ if (name.equals("N"))
+ return new UnicodeCategoriesHandler(NUMBER);
+ if (name.equals("P"))
+ return new UnicodeCategoriesHandler(PUNCTUATION);
+ if (name.equals("C"))
+ return new UnicodeCategoriesHandler(OTHER);
+
+ if (name.equals("Mc"))
+ return new UnicodeCategoryHandler(Character.COMBINING_SPACING_MARK);
+ if (name.equals("Pc"))
+ return new UnicodeCategoryHandler(Character.CONNECTOR_PUNCTUATION);
+ if (name.equals("Cc"))
+ return new UnicodeCategoryHandler(Character.CONTROL);
+ if (name.equals("Sc"))
+ return new UnicodeCategoryHandler(Character.CURRENCY_SYMBOL);
+ if (name.equals("Pd"))
+ return new UnicodeCategoryHandler(Character.DASH_PUNCTUATION);
+ if (name.equals("Nd"))
+ return new UnicodeCategoryHandler(Character.DECIMAL_DIGIT_NUMBER);
+ if (name.equals("Me"))
+ return new UnicodeCategoryHandler(Character.ENCLOSING_MARK);
+ if (name.equals("Pe"))
+ return new UnicodeCategoryHandler(Character.END_PUNCTUATION);
+ if (name.equals("Pf"))
+ return new UnicodeCategoryHandler(Character.FINAL_QUOTE_PUNCTUATION);
+ if (name.equals("Cf"))
+ return new UnicodeCategoryHandler(Character.FORMAT);
+ if (name.equals("Pi"))
+ return new UnicodeCategoryHandler(Character.INITIAL_QUOTE_PUNCTUATION);
+ if (name.equals("Nl"))
+ return new UnicodeCategoryHandler(Character.LETTER_NUMBER);
+ if (name.equals("Zl"))
+ return new UnicodeCategoryHandler(Character.LINE_SEPARATOR);
+ if (name.equals("Ll"))
+ return new UnicodeCategoryHandler(Character.LOWERCASE_LETTER);
+ if (name.equals("Sm"))
+ return new UnicodeCategoryHandler(Character.MATH_SYMBOL);
+ if (name.equals("Lm"))
+ return new UnicodeCategoryHandler(Character.MODIFIER_LETTER);
+ if (name.equals("Sk"))
+ return new UnicodeCategoryHandler(Character.MODIFIER_SYMBOL);
+ if (name.equals("Mn"))
+ return new UnicodeCategoryHandler(Character.NON_SPACING_MARK);
+ if (name.equals("Lo"))
+ return new UnicodeCategoryHandler(Character.OTHER_LETTER);
+ if (name.equals("No"))
+ return new UnicodeCategoryHandler(Character.OTHER_NUMBER);
+ if (name.equals("Po"))
+ return new UnicodeCategoryHandler(Character.OTHER_PUNCTUATION);
+ if (name.equals("So"))
+ return new UnicodeCategoryHandler(Character.OTHER_SYMBOL);
+ if (name.equals("Zp"))
+ return new UnicodeCategoryHandler(Character.PARAGRAPH_SEPARATOR);
+ if (name.equals("Co"))
+ return new UnicodeCategoryHandler(Character.PRIVATE_USE);
+ if (name.equals("Zs"))
+ return new UnicodeCategoryHandler(Character.SPACE_SEPARATOR);
+ if (name.equals("Ps"))
+ return new UnicodeCategoryHandler(Character.START_PUNCTUATION);
+ if (name.equals("Cs"))
+ return new UnicodeCategoryHandler(Character.SURROGATE);
+ if (name.equals("Lt"))
+ return new UnicodeCategoryHandler(Character.TITLECASE_LETTER);
+ if (name.equals("Cn"))
+ return new UnicodeCategoryHandler(Character.UNASSIGNED);
+ if (name.equals("Lu"))
+ return new UnicodeCategoryHandler(Character.UPPERCASE_LETTER);
+ throw new REException("unsupported name " + name, REException.REG_ESCAPE, 0);
+ }
+
+ private static class POSIXHandler extends Handler {
+ private RETokenPOSIX retoken;
+ private REMatch mymatch = new REMatch(0,0,0);
+ private char[] chars = new char[1];
+ private CharIndexedCharArray ca = new CharIndexedCharArray(chars, 0);
+ public POSIXHandler(String name) {
+ int posixId = RETokenPOSIX.intValue(name.toLowerCase());
+ if (posixId != -1)
+ retoken = new RETokenPOSIX(0,posixId,false,false);
+ else
+ throw new RuntimeException("Unknown posix ID: " + name);
+ }
+ public boolean includes(char c) {
+ chars[0] = c;
+ mymatch.index = 0;
+ return retoken.match(ca, mymatch);
+ }
+ }
+
+ private static class UnicodeCategoryHandler extends Handler {
+ public UnicodeCategoryHandler(byte category) {
+ this.category = (int)category;
+ }
+ private int category;
+ public boolean includes(char c) {
+ return Character.getType(c) == category;
+ }
+ }
+
+ private static class UnicodeCategoriesHandler extends Handler {
+ public UnicodeCategoriesHandler(byte[] categories) {
+ this.categories = categories;
+ }
+ private byte[] categories;
+ public boolean includes(char c) {
+ int category = Character.getType(c);
+ for (int i = 0; i < categories.length; i++)
+ if (category == categories[i])
+ return true;
+ return false;
+ }
+ }
+
+ private static class UnicodeBlockHandler extends Handler {
+ public UnicodeBlockHandler(Character.UnicodeBlock block) {
+ this.block = block;
+ }
+ private Character.UnicodeBlock block;
+ public boolean includes(char c) {
+ Character.UnicodeBlock cblock = Character.UnicodeBlock.of(c);
+ return (cblock != null && cblock.equals(block));
+ }
+ }
+
+}
diff --git a/libjava/classpath/gnu/regexp/RETokenOneOf.java b/libjava/classpath/gnu/regexp/RETokenOneOf.java
index 3f6e89e2103..260bc4b8f67 100644
--- a/libjava/classpath/gnu/regexp/RETokenOneOf.java
+++ b/libjava/classpath/gnu/regexp/RETokenOneOf.java
@@ -1,5 +1,5 @@
/* gnu/regexp/RETokenOneOf.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -37,11 +37,35 @@ exception statement from your version. */
package gnu.regexp;
import java.util.Vector;
+import java.util.Stack;
final class RETokenOneOf extends REToken {
private Vector options;
private boolean negative;
+ private Vector addition;
+ // This Vector addition is used to store nested character classes.
+ // For example, if the original expression is
+ // [2-7a-c[f-k][m-z]&&[^p-v][st]]
+ // the basic part /2-7a-c/ is stored in the Vector options, and
+ // the additional part /[f-k][m-z]&&[^p-v][st]/ is stored in the
+ // Vector addition in the following order (Reverse Polish Notation):
+ // -- The matching result of the basic part is assumed here.
+ // [f-k] -- REToken
+ // "|" -- or
+ // [m-z] -- REToken
+ // "|" -- or
+ // false
+ // [^p-v] -- REToken
+ // "|" -- or
+ // [st] -- REToken
+ // "|" -- or
+ // "&" -- and
+ //
+ // As it is clear from the explanation above, the Vector addition is
+ // effective only when this REToken originates from a character class
+ // expression.
+
// This constructor is used for convenience when we know the set beforehand,
// e.g. \d --> new RETokenOneOf("0123456789",false, ..)
// \D --> new RETokenOneOf("0123456789",true, ..)
@@ -60,7 +84,17 @@ final class RETokenOneOf extends REToken {
this.negative = negative;
}
+ RETokenOneOf(int subIndex, Vector options, Vector addition, boolean negative) {
+ super(subIndex);
+ this.options = options;
+ this.addition = addition;
+ this.negative = negative;
+ }
+
int getMinimumLength() {
+ // (negative || addition != null) occurs when this token originates from
+ // character class expression.
+ if (negative || addition != null) return 1;
int min = Integer.MAX_VALUE;
int x;
for (int i=0; i < options.size(); i++) {
@@ -70,54 +104,123 @@ final class RETokenOneOf extends REToken {
return min;
}
+ int getMaximumLength() {
+ // (negative || addition != null) occurs when this token originates from
+ // character class expression.
+ if (negative || addition != null) return 1;
+ int max = 0;
+ int x;
+ for (int i=0; i < options.size(); i++) {
+ if ((x = ((REToken) options.elementAt(i)).getMaximumLength()) > max)
+ max = x;
+ }
+ return max;
+ }
+
boolean match(CharIndexed input, REMatch mymatch) {
- if (negative && (input.charAt(mymatch.index) == CharIndexed.OUT_OF_BOUNDS))
+ REMatch tryMatch;
+ boolean tryOnly;
+ if (addition == null) {
+ tryMatch = mymatch;
+ tryOnly = false;
+ }
+ else {
+ tryMatch = (REMatch) mymatch.clone();
+ tryOnly = true;
+ }
+ boolean b = negative ?
+ matchN(input, tryMatch, tryOnly) :
+ matchP(input, tryMatch, tryOnly);
+ if (addition == null) return b;
+
+ Stack stack = new Stack();
+ stack.push(new Boolean(b));
+ for (int i=0; i < addition.size(); i++) {
+ Object obj = addition.elementAt(i);
+ if (obj instanceof REToken) {
+ b = ((REToken)obj).match(input, (REMatch)mymatch.clone());
+ stack.push(new Boolean(b));
+ }
+ else if (obj instanceof Boolean) {
+ stack.push(obj);
+ }
+ else if (obj.equals("|")) {
+ b = ((Boolean)stack.pop()).booleanValue();
+ b = ((Boolean)stack.pop()).booleanValue() || b;
+ stack.push(new Boolean(b));
+ }
+ else if (obj.equals("&")) {
+ b = ((Boolean)stack.pop()).booleanValue();
+ b = ((Boolean)stack.pop()).booleanValue() && b;
+ stack.push(new Boolean(b));
+ }
+ else {
+ throw new RuntimeException("Invalid object found");
+ }
+ }
+ b = ((Boolean)stack.pop()).booleanValue();
+ if (b) {
+ ++mymatch.index;
+ return next(input, mymatch);
+ }
return false;
+ }
- REMatch newMatch = null;
- REMatch last = null;
- REToken tk;
- boolean isMatch;
- for (int i=0; i < options.size(); i++) {
+ private boolean matchN(CharIndexed input, REMatch mymatch, boolean tryOnly) {
+ if (input.charAt(mymatch.index) == CharIndexed.OUT_OF_BOUNDS)
+ return false;
+
+ REMatch newMatch = null;
+ REMatch last = null;
+ REToken tk;
+ for (int i=0; i < options.size(); i++) {
tk = (REToken) options.elementAt(i);
REMatch tryMatch = (REMatch) mymatch.clone();
if (tk.match(input, tryMatch)) { // match was successful
- if (negative) return false;
-
- if (next(input, tryMatch)) {
- // Add tryMatch to list of possibilities.
- if (last == null) {
- newMatch = tryMatch;
- last = tryMatch;
- } else {
- last.next = tryMatch;
- last = tryMatch;
- }
- } // next succeeds
- } // is a match
- } // try next option
-
- if (newMatch != null) {
- if (negative) {
return false;
- } else {
- // set contents of mymatch equal to newMatch
+ } // is a match
+ } // try next option
- // try each one that matched
- mymatch.assignFrom(newMatch);
- return true;
- }
- } else {
- if (negative) {
- ++mymatch.index;
- return next(input, mymatch);
- } else {
- return false;
- }
+ if (tryOnly) return true;
+ ++mymatch.index;
+ return next(input, mymatch);
}
- // index+1 works for [^abc] lists, not for generic lookahead (--> index)
- }
+ private boolean matchP(CharIndexed input, REMatch mymatch, boolean tryOnly) {
+ boolean stopMatchingIfSatisfied =
+ (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0;
+ REMatch.REMatchList newMatch = new REMatch.REMatchList();
+ REToken tk;
+ for (int i=0; i < options.size(); i++) {
+ // In order that the backtracking can work,
+ // each option must be chained to the next token.
+ // But the chain method has some side effect, so
+ // we use clones.
+ tk = (REToken)((REToken) options.elementAt(i)).clone();
+ if (! tryOnly) {
+ tk.chain(this.next);
+ tk.setUncle(this.uncle);
+ tk.subIndex = this.subIndex;
+ }
+ REMatch tryMatch = (REMatch) mymatch.clone();
+ if (tk.match(input, tryMatch)) { // match was successful
+ if (tryOnly) return true;
+ newMatch.addTail(tryMatch);
+ if (stopMatchingIfSatisfied) break;
+ } // is a match
+ } // try next option
+ if (tryOnly) return false;
+
+ if (newMatch.head != null) {
+ // set contents of mymatch equal to newMatch
+
+ // try each one that matched
+ mymatch.assignFrom(newMatch.head);
+ return true;
+ } else {
+ return false;
+ }
+ }
void dump(StringBuffer os) {
os.append(negative ? "[^" : "(?:");
diff --git a/libjava/classpath/gnu/regexp/RETokenPOSIX.java b/libjava/classpath/gnu/regexp/RETokenPOSIX.java
index bbb8066bca8..4182c6fab98 100644
--- a/libjava/classpath/gnu/regexp/RETokenPOSIX.java
+++ b/libjava/classpath/gnu/regexp/RETokenPOSIX.java
@@ -81,6 +81,10 @@ final class RETokenPOSIX extends REToken {
return 1;
}
+ int getMaximumLength() {
+ return 1;
+ }
+
boolean match(CharIndexed input, REMatch mymatch) {
char ch = input.charAt(mymatch.index);
if (ch == CharIndexed.OUT_OF_BOUNDS)
diff --git a/libjava/classpath/gnu/regexp/RETokenRange.java b/libjava/classpath/gnu/regexp/RETokenRange.java
index dadaf2d8072..8a1ac86b212 100644
--- a/libjava/classpath/gnu/regexp/RETokenRange.java
+++ b/libjava/classpath/gnu/regexp/RETokenRange.java
@@ -43,19 +43,32 @@ final class RETokenRange extends REToken {
RETokenRange(int subIndex, char lo, char hi, boolean ins) {
super(subIndex);
- this.lo = (insens = ins) ? Character.toLowerCase(lo) : lo;
- this.hi = ins ? Character.toLowerCase(hi) : hi;
+ insens = ins;
+ this.lo = lo;
+ this.hi = hi;
}
int getMinimumLength() {
return 1;
}
+ int getMaximumLength() {
+ return 1;
+ }
+
boolean match(CharIndexed input, REMatch mymatch) {
char c = input.charAt(mymatch.index);
if (c == CharIndexed.OUT_OF_BOUNDS) return false;
- if (insens) c = Character.toLowerCase(c);
- if ((c >= lo) && (c <= hi)) {
+ boolean matches = (c >= lo) && (c <= hi);
+ if (! matches && insens) {
+ char c1 = Character.toLowerCase(c);
+ matches = (c1 >= lo) && (c1 <= hi);
+ if (!matches) {
+ c1 = Character.toUpperCase(c);
+ matches = (c1 >= lo) && (c1 <= hi);
+ }
+ }
+ if (matches) {
++mymatch.index;
return next(input, mymatch);
}
diff --git a/libjava/classpath/gnu/regexp/RETokenRepeated.java b/libjava/classpath/gnu/regexp/RETokenRepeated.java
index 6291a3c3960..2d019c53cbd 100644
--- a/libjava/classpath/gnu/regexp/RETokenRepeated.java
+++ b/libjava/classpath/gnu/regexp/RETokenRepeated.java
@@ -1,5 +1,5 @@
/* gnu/regexp/RETokenRepeated.java
- Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc.
+ Copyright (C) 2006 Free Software Foundation, Inc.
This file is part of GNU Classpath.
@@ -39,6 +39,7 @@ exception statement from your version. */
package gnu.regexp;
import java.util.Vector;
+import java.util.Arrays;
final class RETokenRepeated extends REToken {
private REToken token;
@@ -82,6 +83,38 @@ final class RETokenRepeated extends REToken {
return (min * token.getMinimumLength());
}
+ int getMaximumLength() {
+ if (max == Integer.MAX_VALUE) return Integer.MAX_VALUE;
+ int tmax = token.getMaximumLength();
+ if (tmax == Integer.MAX_VALUE) return tmax;
+ return (max * tmax);
+ }
+
+ private static REMatch findDoables(REToken tk,
+ CharIndexed input, REMatch mymatch) {
+
+ REMatch.REMatchList doables = new REMatch.REMatchList();
+
+ // try next repeat at all possible positions
+ for (REMatch current = mymatch;
+ current != null; current = current.next) {
+ REMatch recurrent = (REMatch) current.clone();
+ int origin = recurrent.index;
+ tk = (REToken) tk.clone();
+ tk.next = tk.uncle = null;
+ recurrent.matchFlags |= REMatch.MF_FIND_ALL;
+ if (tk.match(input, recurrent)) {
+ for (REMatch m = recurrent; m != null; m = m.next) {
+ m.matchFlags &= ~REMatch.MF_FIND_ALL;
+ }
+ if (recurrent.index == origin) recurrent.empty = true;
+ // add all items in current to doables array
+ doables.addTail(recurrent);
+ }
+ }
+ return doables.head;
+ }
+
// We do need to save every possible point, but the number of clone()
// invocations here is really a killer for performance on non-stingy
// repeat operators. I'm open to suggestions...
@@ -91,59 +124,167 @@ final class RETokenRepeated extends REToken {
// the subexpression back-reference operator allow that?
boolean match(CharIndexed input, REMatch mymatch) {
- // number of times we've matched so far
- int numRepeats = 0;
-
- // Possible positions for the next repeat to match at
- REMatch newMatch = mymatch;
- REMatch last = null;
- REMatch current;
- // Add the '0-repeats' index
- // positions.elementAt(z) == position [] in input after <<z>> matches
- Vector positions = new Vector();
- positions.addElement(newMatch);
-
- // Declare variables used in loop
- REMatch doables;
- REMatch doablesLast;
- REMatch recurrent;
- int lastIndex = mymatch.index;
-
- do {
- // Check for stingy match for each possibility.
- if (stingy && (numRepeats >= min)) {
- REMatch result = matchRest(input, newMatch);
- if (result != null) {
- mymatch.assignFrom(result);
- return true;
- }
+ boolean stopMatchingIfSatisfied =
+ (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0;
+
+ REMatch newMatch = matchMinimum(input, mymatch);
+ if (newMatch == null) return false;
+
+ // Array of positions we have already visited
+ int[] visited = initVisited();
+ for (REMatch m = newMatch; m != null; m = m.next) {
+ visited = addVisited(m.index, visited);
+ }
+
+ int max1 = decreaseMax(max, min);
+
+ newMatch = _match(input, newMatch, max1,
+ stopMatchingIfSatisfied, visited);
+ if (newMatch != null) {
+ mymatch.assignFrom(newMatch);
+ return true;
+ }
+ return false;
+ }
+
+ private static int decreaseMax(int m, int n) {
+ if (m == Integer.MAX_VALUE) return m;
+ return m - n;
+ }
+
+ // Array visited is an array of character positions we have already
+ // visited. visited[0] is used to store the effective length of the
+ // array.
+ private static int[] initVisited() {
+ int[] visited = new int[32];
+ visited[0] = 0;
+ return visited;
+ }
+
+ private static boolean visitedContains(int n, int[] visited) {
+ // Experience tells that for a small array like this,
+ // simple linear search is faster than binary search.
+ for (int i = 1; i < visited[0]; i++) {
+ if (n == visited[i]) return true;
+ }
+ return false;
+ }
+
+ private static int[] addVisited(int n, int[] visited) {
+ if (visitedContains(n, visited)) return visited;
+ if (visited[0] >= visited.length - 1) {
+ int[] newvisited = new int[visited.length + 32];
+ System.arraycopy(visited, 0, newvisited, 0, visited.length);
+ visited = newvisited;
+ }
+ visited[0]++;
+ visited[visited[0]] = n;
+ return visited;
+ }
+
+ private REMatch _match(CharIndexed input, REMatch mymatch,
+ int max1, boolean stopMatchingIfSatisfied,
+ int[] visited) {
+
+ if (max1 == 0) {
+ return matchRest(input, mymatch);
+ }
+ max1 = decreaseMax(max1, 1);
+
+ REMatch.REMatchList allResults = new REMatch.REMatchList();
+
+ // Depth-first search
+
+ for (REMatch cur = mymatch; cur != null; cur = cur.next) {
+
+ REMatch cur1 = (REMatch) cur.clone();
+
+ if (stingy) {
+ REMatch results = matchRest(input, cur1);
+ if (results != null) {
+ if (stopMatchingIfSatisfied) {
+ return results;
+ }
+ allResults.addTail(results);
+ }
}
- doables = null;
- doablesLast = null;
+ DO_THIS:
+ do {
- // try next repeat at all possible positions
- for (current = newMatch; current != null; current = current.next) {
- recurrent = (REMatch) current.clone();
- if (token.match(input, recurrent)) {
- // add all items in current to doables array
- if (doables == null) {
- doables = recurrent;
- doablesLast = recurrent;
- } else {
- // Order these from longest to shortest
- // Start by assuming longest (more repeats)
- doablesLast.next = recurrent;
+ boolean emptyMatchFound = false;
+ REMatch doables = findDoables(token, input, cur1);
+ if (doables == null) break DO_THIS;
+ if (doables.empty) emptyMatchFound = true;
+
+ if (!emptyMatchFound) {
+ REMatch.REMatchList list = new REMatch.REMatchList();
+ for (REMatch m = doables; m != null; m = m.next) {
+ REMatch m1 = (REMatch) m.clone();
+ int n = m1.index;
+ if (! visitedContains(n, visited)) {
+ visited = addVisited(n, visited);
+ list.addTail(m1);
}
- // Find new doablesLast
- while (doablesLast.next != null) {
- doablesLast = doablesLast.next;
+ }
+ if (list.head == null) break DO_THIS;
+ doables = list.head;
+ }
+
+ for (REMatch m = doables; m != null; m = m.next) {
+ if (! emptyMatchFound) {
+ REMatch m1 = _match(input, m, max1,
+ stopMatchingIfSatisfied, visited);
+ if (possessive) return m1;
+ if (m1 != null) {
+ if (stopMatchingIfSatisfied) {
+ return m1;
+ }
+ allResults.addTail(m1);
+ }
+ }
+ else {
+ REMatch m1 = matchRest(input, m);
+ if (m1 != null) {
+ if (stopMatchingIfSatisfied) {
+ return m1;
+ }
+ allResults.addTail(m1);
}
}
}
- // if none of the possibilities worked out, break out of do/while
- if (doables == null) break;
+
+ } while (false); // DO_THIS only once;
+
+ // This point itself is a candidate.
+ if (!stingy) {
+ REMatch m2 = matchRest(input, cur1);
+ if (m2 != null) {
+ if (stopMatchingIfSatisfied) {
+ return m2;
+ }
+ allResults.addTail(m2);
+ }
+ }
+ }
+
+ return allResults.head;
+ }
+
+ private REMatch matchMinimum(CharIndexed input, final REMatch mymatch) {
+ // Possible positions for the next repeat to match at
+ REMatch newMatch = mymatch;
+
+ // number of times we've matched so far
+ int numRepeats = 0;
+
+ while (numRepeats < min) {
+ REMatch doables = findDoables(token, input, newMatch);
+
+ // if none of the possibilities worked out,
+ // it means that minimum number of repeats could not be found.
+ if (doables == null) return null;
// reassign where the next repeat can match
newMatch = doables;
@@ -151,91 +292,24 @@ final class RETokenRepeated extends REToken {
// increment how many repeats we've successfully found
++numRepeats;
- positions.addElement(newMatch);
-
- // doables.index == lastIndex means an empty string
- // was the longest that matched this token.
- // We break here, otherwise we will fall into an endless loop.
- if (doables.index == lastIndex) {
- if (numRepeats < min) numRepeats = min;
- break;
- }
- lastIndex = doables.index;
- } while (numRepeats < max);
-
- // If there aren't enough repeats, then fail
- if (numRepeats < min) return false;
-
- // We're greedy, but ease off until a true match is found
- int posIndex = positions.size();
-
- // At this point we've either got too many or just the right amount.
- // See if this numRepeats works with the rest of the regexp.
- REMatch allResults = null;
- REMatch allResultsLast = null;
-
- REMatch results = null;
- int indexCount = posIndex - min;
- if (indexCount <= 0) {
- // This case occurs when we exited the previous do loop before
- // numRepeats >= min because an empty string matched the token.
- // In this case, an empty string can match as many times as
- // desired.
- indexCount = 1;
- }
- while (indexCount-- > 0) {
- --posIndex;
- newMatch = (REMatch) positions.elementAt(posIndex);
- results = matchRest(input, newMatch);
- if (results != null) {
- if (allResults == null) {
- allResults = results;
- allResultsLast = results;
- } else {
- // Order these from longest to shortest
- // Start by assuming longest (more repeats)
- allResultsLast.next = results;
- }
- // Find new doablesLast
- while (allResultsLast.next != null) {
- allResultsLast = allResultsLast.next;
- }
- }
- // else did not match rest of the tokens, try again on smaller sample
- // or break out when performing possessive matching
- if (possessive) break;
+ if (newMatch.empty) break;
}
- if (allResults != null) {
- mymatch.assignFrom(allResults); // does this get all?
- return true;
- }
- // If we fall out, no matches.
- return false;
+ return newMatch;
}
private REMatch matchRest(CharIndexed input, final REMatch newMatch) {
REMatch current, single;
- REMatch doneIndex = null;
- REMatch doneIndexLast = null;
+ REMatch.REMatchList doneIndex = new REMatch.REMatchList();
// Test all possible matches for this number of repeats
for (current = newMatch; current != null; current = current.next) {
// clone() separates a single match from the chain
single = (REMatch) current.clone();
if (next(input, single)) {
// chain results to doneIndex
- if (doneIndex == null) {
- doneIndex = single;
- doneIndexLast = single;
- } else {
- doneIndexLast.next = single;
- }
- // Find new doneIndexLast
- while (doneIndexLast.next != null) {
- doneIndexLast = doneIndexLast.next;
- }
+ doneIndex.addTail(single);
}
}
- return doneIndex;
+ return doneIndex.head;
}
void dump(StringBuffer os) {
diff --git a/libjava/classpath/gnu/regexp/RETokenStart.java b/libjava/classpath/gnu/regexp/RETokenStart.java
index 8f7198237e1..42e3c0b2de0 100644
--- a/libjava/classpath/gnu/regexp/RETokenStart.java
+++ b/libjava/classpath/gnu/regexp/RETokenStart.java
@@ -44,6 +44,10 @@ class RETokenStart extends REToken {
super(subIndex);
this.newline = newline;
}
+
+ int getMaximumLength() {
+ return 0;
+ }
boolean match(CharIndexed input, REMatch mymatch) {
// charAt(index-n) may be unknown on a Reader/InputStream. FIXME
diff --git a/libjava/classpath/gnu/regexp/RETokenWordBoundary.java b/libjava/classpath/gnu/regexp/RETokenWordBoundary.java
index 6804151e261..f86214bbf68 100644
--- a/libjava/classpath/gnu/regexp/RETokenWordBoundary.java
+++ b/libjava/classpath/gnu/regexp/RETokenWordBoundary.java
@@ -52,6 +52,11 @@ final class RETokenWordBoundary extends REToken {
this.where = where;
this.negated = negated;
}
+
+ int getMaximumLength() {
+ return 0;
+ }
+
boolean match(CharIndexed input, REMatch mymatch) {
// Word boundary means input[index-1] was a word character
OpenPOWER on IntegriCloud