diff options
Diffstat (limited to 'libjava/classpath/gnu/regexp')
24 files changed, 1712 insertions, 296 deletions
diff --git a/libjava/classpath/gnu/regexp/CharIndexed.java b/libjava/classpath/gnu/regexp/CharIndexed.java index a0d7106aefa..df1d8930c6b 100644 --- a/libjava/classpath/gnu/regexp/CharIndexed.java +++ b/libjava/classpath/gnu/regexp/CharIndexed.java @@ -1,5 +1,5 @@ /* gnu/regexp/CharIndexed.java - Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc. + Copyright (C) 1998-2001, 2004, 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -81,4 +81,16 @@ public interface CharIndexed { * position at a valid position in the input. */ boolean isValid(); + + /** + * Returns another CharIndexed containing length characters to the left + * of the given index. The given length is an expected maximum and + * the returned CharIndexed may not necessarily contain so many characters. + */ + CharIndexed lookBehind(int index, int length); + + /** + * Returns the effective length of this CharIndexed + */ + int length(); } diff --git a/libjava/classpath/gnu/regexp/CharIndexedCharArray.java b/libjava/classpath/gnu/regexp/CharIndexedCharArray.java index 63d858c8709..1388d4729bf 100644 --- a/libjava/classpath/gnu/regexp/CharIndexedCharArray.java +++ b/libjava/classpath/gnu/regexp/CharIndexedCharArray.java @@ -1,5 +1,5 @@ /* gnu/regexp/CharIndexedCharArray.java - Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc. + Copyright (C) 1998-2001, 2004, 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -59,4 +59,13 @@ class CharIndexedCharArray implements CharIndexed, Serializable { public boolean move(int index) { return ((anchor += index) < s.length); } + + public CharIndexed lookBehind(int index, int length) { + if (length > (anchor + index)) length = anchor + index; + return new CharIndexedCharArray(s, anchor + index - length); + } + + public int length() { + return s.length - anchor; + } } diff --git a/libjava/classpath/gnu/regexp/CharIndexedInputStream.java b/libjava/classpath/gnu/regexp/CharIndexedInputStream.java index 145fe11b135..d5225a79337 100644 --- a/libjava/classpath/gnu/regexp/CharIndexedInputStream.java +++ b/libjava/classpath/gnu/regexp/CharIndexedInputStream.java @@ -1,5 +1,5 @@ /* gnu/regexp/CharIndexedInputStream.java - Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc. + Copyright (C) 1998-2001, 2004, 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -145,5 +145,15 @@ class CharIndexedInputStream implements CharIndexed { public boolean isValid() { return (cached != OUT_OF_BOUNDS); } + + public CharIndexed lookBehind(int index, int length) { + throw new UnsupportedOperationException( + "difficult to look behind for an input stream"); + } + + public int length() { + throw new UnsupportedOperationException( + "difficult to tell the length for an input stream"); + } } diff --git a/libjava/classpath/gnu/regexp/CharIndexedString.java b/libjava/classpath/gnu/regexp/CharIndexedString.java index 05be07ac68c..fe4fa8f7637 100644 --- a/libjava/classpath/gnu/regexp/CharIndexedString.java +++ b/libjava/classpath/gnu/regexp/CharIndexedString.java @@ -1,5 +1,5 @@ /* gnu/regexp/CharIndexedString.java - Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc. + Copyright (C) 1998-2001, 2004, 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -61,4 +61,13 @@ class CharIndexedString implements CharIndexed, Serializable { public boolean move(int index) { return ((anchor += index) < len); } + + public CharIndexed lookBehind(int index, int length) { + if (length > (anchor + index)) length = anchor + index; + return new CharIndexedString(s, anchor + index - length); + } + + public int length() { + return len - anchor; + } } diff --git a/libjava/classpath/gnu/regexp/CharIndexedStringBuffer.java b/libjava/classpath/gnu/regexp/CharIndexedStringBuffer.java index 1b88e398571..9c9118dfee2 100644 --- a/libjava/classpath/gnu/regexp/CharIndexedStringBuffer.java +++ b/libjava/classpath/gnu/regexp/CharIndexedStringBuffer.java @@ -1,5 +1,5 @@ /* gnu/regexp/CharIndexedStringBuffer.java - Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc. + Copyright (C) 1998-2001, 2004, 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -59,4 +59,13 @@ class CharIndexedStringBuffer implements CharIndexed, Serializable { public boolean move(int index) { return ((anchor += index) < s.length()); } + + public CharIndexed lookBehind(int index, int length) { + if (length > (anchor + index)) length = anchor + index; + return new CharIndexedStringBuffer(s, anchor + index - length); + } + + public int length() { + return s.length() - anchor; + } } diff --git a/libjava/classpath/gnu/regexp/RE.java b/libjava/classpath/gnu/regexp/RE.java index 9ac9b53d1a9..ef606a6d8a7 100644 --- a/libjava/classpath/gnu/regexp/RE.java +++ b/libjava/classpath/gnu/regexp/RE.java @@ -1,5 +1,5 @@ /* gnu/regexp/RE.java - Copyright (C) 1998-2001, 2004, 2005 Free Software Foundation, Inc. + Copyright (C) 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -136,12 +136,13 @@ public class RE extends REToken { /** Minimum length, in characters, of any possible match. */ private int minimumLength; + private int maximumLength; /** * Compilation flag. Do not differentiate case. Subsequent * searches using this RE will be case insensitive. */ - public static final int REG_ICASE = 2; + public static final int REG_ICASE = 0x02; /** * Compilation flag. The match-any-character operator (dot) @@ -149,14 +150,14 @@ public class RE extends REToken { * bit RE_DOT_NEWLINE (see RESyntax for details). This is equivalent to * the "/s" operator in Perl. */ - public static final int REG_DOT_NEWLINE = 4; + public static final int REG_DOT_NEWLINE = 0x04; /** * Compilation flag. Use multiline mode. In this mode, the ^ and $ * anchors will match based on newlines within the input. This is * equivalent to the "/m" operator in Perl. */ - public static final int REG_MULTILINE = 8; + public static final int REG_MULTILINE = 0x08; /** * Execution flag. @@ -185,14 +186,14 @@ public class RE extends REToken { * // m4.toString(): "fool"<BR> * </CODE> */ - public static final int REG_NOTBOL = 16; + public static final int REG_NOTBOL = 0x10; /** * Execution flag. * The match-end operator ($) does not match at the end * of the input string. Useful for matching on substrings. */ - public static final int REG_NOTEOL = 32; + public static final int REG_NOTEOL = 0x20; /** * Execution flag. @@ -206,7 +207,7 @@ public class RE extends REToken { * the example under REG_NOTBOL. It also affects the use of the \< * and \b operators. */ - public static final int REG_ANCHORINDEX = 64; + public static final int REG_ANCHORINDEX = 0x40; /** * Execution flag. @@ -215,7 +216,24 @@ public class RE extends REToken { * the corresponding subexpressions. For example, you may want to * replace all matches of "one dollar" with "$1". */ - public static final int REG_NO_INTERPOLATE = 128; + public static final int REG_NO_INTERPOLATE = 0x80; + + /** + * Execution flag. + * Try to match the whole input string. An implicit match-end operator + * is added to this regexp. + */ + public static final int REG_TRY_ENTIRE_MATCH = 0x0100; + + /** + * Execution flag. + * The substitute and substituteAll methods will treat the + * character '\' in the replacement as an escape to a literal + * character. In this case "\n", "\$", "\\", "\x40" and "\012" + * will become "n", "$", "\", "x40" and "012" respectively. + * This flag has no effect if REG_NO_INTERPOLATE is set on. + */ + public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200; /** Returns a string representing the version of the gnu.regexp package. */ public static final String version() { @@ -273,12 +291,13 @@ public class RE extends REToken { } // internal constructor used for alternation - private RE(REToken first, REToken last,int subs, int subIndex, int minLength) { + private RE(REToken first, REToken last,int subs, int subIndex, int minLength, int maxLength) { super(subIndex); firstToken = first; lastToken = last; numSubs = subs; minimumLength = minLength; + maximumLength = maxLength; addToken(new RETokenEndSub(subIndex)); } @@ -333,6 +352,11 @@ public class RE extends REToken { char ch; boolean quot = false; + // Saved syntax and flags. + RESyntax savedSyntax = null; + int savedCflags = 0; + boolean flagsSaved = false; + while (index < pLength) { // read the next character unit (including backslash escapes) index = getCharUnit(pattern,index,unit,quot); @@ -359,8 +383,9 @@ public class RE extends REToken { && !syntax.get(RESyntax.RE_LIMITED_OPS)) { // make everything up to here be a branch. create vector if nec. addToken(currentToken); - RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength); + RE theBranch = new RE(firstToken, lastToken, numSubs, subIndex, minimumLength, maximumLength); minimumLength = 0; + maximumLength = 0; if (branches == null) { branches = new Vector(); } @@ -402,102 +427,12 @@ public class RE extends REToken { // [...] | [^...] else if ((unit.ch == '[') && !(unit.bk || quot)) { - Vector options = new Vector(); - boolean negative = false; - char lastChar = 0; - if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index); - - // Check for initial caret, negation - if ((ch = pattern[index]) == '^') { - negative = true; - if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); - ch = pattern[index]; - } - - // Check for leading right bracket literal - if (ch == ']') { - lastChar = ch; - if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); - } - - while ((ch = pattern[index++]) != ']') { - if ((ch == '-') && (lastChar != 0)) { - if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); - if ((ch = pattern[index]) == ']') { - options.addElement(new RETokenChar(subIndex,lastChar,insens)); - lastChar = '-'; - } else { - options.addElement(new RETokenRange(subIndex,lastChar,ch,insens)); - lastChar = 0; - index++; - } - } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) { - if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); - int posixID = -1; - boolean negate = false; - char asciiEsc = 0; - if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) { - switch (pattern[index]) { - case 'D': - negate = true; - case 'd': - posixID = RETokenPOSIX.DIGIT; - break; - case 'S': - negate = true; - case 's': - posixID = RETokenPOSIX.SPACE; - break; - case 'W': - negate = true; - case 'w': - posixID = RETokenPOSIX.ALNUM; - break; - } - } - else if ("nrt".indexOf(pattern[index]) != -1) { - switch (pattern[index]) { - case 'n': - asciiEsc = '\n'; - break; - case 't': - asciiEsc = '\t'; - break; - case 'r': - asciiEsc = '\r'; - break; - } - } - if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); - - if (posixID != -1) { - options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate)); - } else if (asciiEsc != 0) { - lastChar = asciiEsc; - } else { - lastChar = pattern[index]; - } - ++index; - } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) { - StringBuffer posixSet = new StringBuffer(); - index = getPosixSet(pattern,index+1,posixSet); - int posixId = RETokenPOSIX.intValue(posixSet.toString()); - if (posixId != -1) - options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false)); - } else { - if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); - lastChar = ch; - } - if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); - } // while in list - // Out of list, index is one past ']' - - if (lastChar != 0) options.addElement(new RETokenChar(subIndex,lastChar,insens)); - // Create a new RETokenOneOf + ParseCharClassResult result = parseCharClass( + subIndex, pattern, index, pLength, cflags, syntax, 0); addToken(currentToken); - options.trimToSize(); - currentToken = new RETokenOneOf(subIndex,options,negative); + currentToken = result.token; + index = result.index; } // SUBEXPRESSIONS @@ -507,7 +442,10 @@ public class RE extends REToken { boolean pure = false; boolean comment = false; boolean lookAhead = false; + boolean lookBehind = false; + boolean independent = false; boolean negativelh = false; + boolean negativelb = false; if ((index+1 < pLength) && (pattern[index] == '?')) { switch (pattern[index+1]) { case '!': @@ -525,6 +463,114 @@ public class RE extends REToken { index += 2; } break; + case '<': + // We assume that if the syntax supports look-ahead, + // it also supports look-behind. + if (syntax.get(RESyntax.RE_LOOKAHEAD)) { + index++; + switch (pattern[index +1]) { + case '!': + pure = true; + negativelb = true; + lookBehind = true; + index += 2; + break; + case '=': + pure = true; + lookBehind = true; + index += 2; + } + } + break; + case '>': + // We assume that if the syntax supports look-ahead, + // it also supports independent group. + if (syntax.get(RESyntax.RE_LOOKAHEAD)) { + pure = true; + independent = true; + index += 2; + } + break; + case 'i': + case 'd': + case 'm': + case 's': + // case 'u': not supported + // case 'x': not supported + case '-': + if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break; + // Set or reset syntax flags. + int flagIndex = index + 1; + int endFlag = -1; + RESyntax newSyntax = new RESyntax(syntax); + int newCflags = cflags; + boolean negate = false; + while (flagIndex < pLength && endFlag < 0) { + switch(pattern[flagIndex]) { + case 'i': + if (negate) + newCflags &= ~REG_ICASE; + else + newCflags |= REG_ICASE; + flagIndex++; + break; + case 'd': + if (negate) + newSyntax.setLineSeparator(RESyntax.DEFAULT_LINE_SEPARATOR); + else + newSyntax.setLineSeparator("\n"); + flagIndex++; + break; + case 'm': + if (negate) + newCflags &= ~REG_MULTILINE; + else + newCflags |= REG_MULTILINE; + flagIndex++; + break; + case 's': + if (negate) + newCflags &= ~REG_DOT_NEWLINE; + else + newCflags |= REG_DOT_NEWLINE; + flagIndex++; + break; + // case 'u': not supported + // case 'x': not supported + case '-': + negate = true; + flagIndex++; + break; + case ':': + case ')': + endFlag = pattern[flagIndex]; + break; + default: + throw new REException(getLocalizedMessage("repeat.no.token"), REException.REG_BADRPT, index); + } + } + if (endFlag == ')') { + syntax = newSyntax; + cflags = newCflags; + insens = ((cflags & REG_ICASE) > 0); + // This can be treated as though it were a comment. + comment = true; + index = flagIndex - 1; + break; + } + if (endFlag == ':') { + savedSyntax = syntax; + savedCflags = cflags; + flagsSaved = true; + syntax = newSyntax; + cflags = newCflags; + insens = ((cflags & REG_ICASE) > 0); + index = flagIndex -1; + // Fall through to the next case. + } + else { + throw new REException(getLocalizedMessage("unmatched.paren"), REException.REG_ESUBREG,index); + } case ':': if (syntax.get(RESyntax.RE_PURE_GROUPING)) { pure = true; @@ -607,15 +653,28 @@ public class RE extends REToken { numSubs++; } - int useIndex = (pure || lookAhead) ? 0 : nextSub + numSubs; + int useIndex = (pure || lookAhead || lookBehind || independent) ? + 0 : nextSub + numSubs; currentToken = new RE(String.valueOf(pattern,index,endIndex-index).toCharArray(),cflags,syntax,useIndex,nextSub + numSubs); numSubs += ((RE) currentToken).getNumSubs(); if (lookAhead) { currentToken = new RETokenLookAhead(currentToken,negativelh); } + else if (lookBehind) { + currentToken = new RETokenLookBehind(currentToken,negativelb); + } + else if (independent) { + currentToken = new RETokenIndependent(currentToken); + } index = nextIndex; + if (flagsSaved) { + syntax = savedSyntax; + cflags = savedCflags; + insens = ((cflags & REG_ICASE) > 0); + flagsSaved = false; + } } // not a comment } // subexpression @@ -715,14 +774,45 @@ public class RE extends REToken { else currentToken = setRepeated(currentToken,0,1,index); } + + // OCTAL CHARACTER + // \0377 + else if (unit.bk && (unit.ch == '0') && syntax.get(RESyntax.RE_OCTAL_CHAR)) { + CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax); + if (ce == null) + throw new REException("invalid octal character", REException.REG_ESCAPE, index); + index = index - 2 + ce.len; + addToken(currentToken); + currentToken = new RETokenChar(subIndex,ce.ch,insens); + } + // BACKREFERENCE OPERATOR - // \1 \2 ... \9 + // \1 \2 ... \9 and \10 \11 \12 ... // not available if RE_NO_BK_REFS is set + // Perl recognizes \10, \11, and so on only if enough number of + // parentheses have opened before it, otherwise they are treated + // as aliases of \010, \011, ... (octal characters). In case of + // Sun's JDK, octal character expression must always begin with \0. + // We will do as JDK does. But FIXME, take a look at "(a)(b)\29". + // JDK treats \2 as a back reference to the 2nd group because + // there are only two groups. But in our poor implementation, + // we cannot help but treat \29 as a back reference to the 29th group. else if (unit.bk && Character.isDigit(unit.ch) && !syntax.get(RESyntax.RE_NO_BK_REFS)) { addToken(currentToken); - currentToken = new RETokenBackRef(subIndex,Character.digit(unit.ch,10),insens); + int numBegin = index - 1; + int numEnd = pLength; + for (int i = index; i < pLength; i++) { + if (! Character.isDigit(pattern[i])) { + numEnd = i; + break; + } + } + int num = parseInt(pattern, numBegin, numEnd-numBegin, 10); + + currentToken = new RETokenBackRef(subIndex,num,insens); + index = numEnd; } // START OF STRING OPERATOR @@ -844,6 +934,32 @@ public class RE extends REToken { currentToken = new RETokenEnd(subIndex,null); } + // HEX CHARACTER, UNICODE CHARACTER + // \x1B, \u1234 + + else if ((unit.bk && (unit.ch == 'x') && syntax.get(RESyntax.RE_HEX_CHAR)) || + (unit.bk && (unit.ch == 'u') && syntax.get(RESyntax.RE_UNICODE_CHAR))) { + CharExpression ce = getCharExpression(pattern, index - 2, pLength, syntax); + if (ce == null) + throw new REException("invalid hex character", REException.REG_ESCAPE, index); + index = index - 2 + ce.len; + addToken(currentToken); + currentToken = new RETokenChar(subIndex,ce.ch,insens); + } + + // NAMED PROPERTY + // \p{prop}, \P{prop} + + else if ((unit.bk && (unit.ch == 'p') && syntax.get(RESyntax.RE_NAMED_PROPERTY)) || + (unit.bk && (unit.ch == 'P') && syntax.get(RESyntax.RE_NAMED_PROPERTY))) { + NamedProperty np = getNamedProperty(pattern, index - 2, pLength); + if (np == null) + throw new REException("invalid escape sequence", REException.REG_ESCAPE, index); + index = index - 2 + np.len; + addToken(currentToken); + currentToken = getRETokenNamedProperty(subIndex,np,insens,index); + } + // NON-SPECIAL CHARACTER (or escape to make literal) // c | \* for example @@ -857,9 +973,10 @@ public class RE extends REToken { addToken(currentToken); if (branches != null) { - branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength)); + branches.addElement(new RE(firstToken,lastToken,numSubs,subIndex,minimumLength, maximumLength)); branches.trimToSize(); // compact the Vector minimumLength = 0; + maximumLength = 0; firstToken = lastToken = null; addToken(new RETokenOneOf(subIndex,branches,false)); } @@ -867,6 +984,199 @@ public class RE extends REToken { } + private static class ParseCharClassResult { + RETokenOneOf token; + int index; + boolean returnAtAndOperator = false; + } + + /** + * Parse [...] or [^...] and make an RETokenOneOf instance. + * @param subIndex subIndex to be given to the created RETokenOneOf instance. + * @param pattern Input array of characters to be parsed. + * @param index Index pointing to the character next to the beginning '['. + * @param pLength Limit of the input array. + * @param cflags Compilation flags used to parse the pattern. + * @param pflags Flags that affect the behavior of this method. + * @param syntax Syntax used to parse the pattern. + */ + private static ParseCharClassResult parseCharClass(int subIndex, + char[] pattern, int index, + int pLength, int cflags, RESyntax syntax, int pflags) + throws REException { + + boolean insens = ((cflags & REG_ICASE) > 0); + Vector options = new Vector(); + Vector addition = new Vector(); + boolean additionAndAppeared = false; + final int RETURN_AT_AND = 0x01; + boolean returnAtAndOperator = ((pflags & RETURN_AT_AND) != 0); + boolean negative = false; + char ch; + + char lastChar = 0; + boolean lastCharIsSet = false; + if (index == pLength) throw new REException(getLocalizedMessage("unmatched.bracket"),REException.REG_EBRACK,index); + + // Check for initial caret, negation + if ((ch = pattern[index]) == '^') { + negative = true; + if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); + ch = pattern[index]; + } + + // Check for leading right bracket literal + if (ch == ']') { + lastChar = ch; lastCharIsSet = true; + if (++index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); + } + + while ((ch = pattern[index++]) != ']') { + if ((ch == '-') && (lastCharIsSet)) { + if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); + if ((ch = pattern[index]) == ']') { + options.addElement(new RETokenChar(subIndex,lastChar,insens)); + lastChar = '-'; + } else { + if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) { + CharExpression ce = getCharExpression(pattern, index, pLength, syntax); + if (ce == null) + throw new REException("invalid escape sequence", REException.REG_ESCAPE, index); + ch = ce.ch; + index = index + ce.len - 1; + } + options.addElement(new RETokenRange(subIndex,lastChar,ch,insens)); + lastChar = 0; lastCharIsSet = false; + index++; + } + } else if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) { + if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); + int posixID = -1; + boolean negate = false; + char asciiEsc = 0; + boolean asciiEscIsSet = false; + NamedProperty np = null; + if (("dswDSW".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_CHAR_CLASS_ESC_IN_LISTS)) { + switch (pattern[index]) { + case 'D': + negate = true; + case 'd': + posixID = RETokenPOSIX.DIGIT; + break; + case 'S': + negate = true; + case 's': + posixID = RETokenPOSIX.SPACE; + break; + case 'W': + negate = true; + case 'w': + posixID = RETokenPOSIX.ALNUM; + break; + } + } + if (("pP".indexOf(pattern[index]) != -1) && syntax.get(RESyntax.RE_NAMED_PROPERTY)) { + np = getNamedProperty(pattern, index - 1, pLength); + if (np == null) + throw new REException("invalid escape sequence", REException.REG_ESCAPE, index); + index = index - 1 + np.len - 1; + } + else { + CharExpression ce = getCharExpression(pattern, index - 1, pLength, syntax); + if (ce == null) + throw new REException("invalid escape sequence", REException.REG_ESCAPE, index); + asciiEsc = ce.ch; asciiEscIsSet = true; + index = index - 1 + ce.len - 1; + } + if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens)); + + if (posixID != -1) { + options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate)); + } else if (np != null) { + options.addElement(getRETokenNamedProperty(subIndex,np,insens,index)); + } else if (asciiEscIsSet) { + lastChar = asciiEsc; lastCharIsSet = true; + } else { + lastChar = pattern[index]; lastCharIsSet = true; + } + ++index; + } else if ((ch == '[') && (syntax.get(RESyntax.RE_CHAR_CLASSES)) && (index < pLength) && (pattern[index] == ':')) { + StringBuffer posixSet = new StringBuffer(); + index = getPosixSet(pattern,index+1,posixSet); + int posixId = RETokenPOSIX.intValue(posixSet.toString()); + if (posixId != -1) + options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false)); + } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) { + ParseCharClassResult result = parseCharClass( + subIndex, pattern, index, pLength, cflags, syntax, 0); + addition.addElement(result.token); + addition.addElement("|"); + index = result.index; + } else if ((ch == '&') && + (syntax.get(RESyntax.RE_NESTED_CHARCLASS)) && + (index < pLength) && (pattern[index] == '&')) { + if (returnAtAndOperator) { + ParseCharClassResult result = new ParseCharClassResult(); + options.trimToSize(); + if (additionAndAppeared) addition.addElement("&"); + if (addition.size() == 0) addition = null; + result.token = new RETokenOneOf(subIndex, + options, addition, negative); + result.index = index - 1; + result.returnAtAndOperator = true; + return result; + } + // The precedence of the operator "&&" is the lowest. + // So we postpone adding "&" until other elements + // are added. And we insert Boolean.FALSE at the + // beginning of the list of tokens following "&&". + // So, "&&[a-b][k-m]" will be stored in the Vecter + // addition in this order: + // Boolean.FALSE, [a-b], "|", [k-m], "|", "&" + if (additionAndAppeared) addition.addElement("&"); + addition.addElement(Boolean.FALSE); + additionAndAppeared = true; + + // The part on which "&&" operates may be either + // (1) explicitly enclosed by [] + // or + // (2) not enclosed by [] and terminated by the + // next "&&" or the end of the character list. + // Let the preceding else if block do the case (1). + // We must do something in case of (2). + if ((index + 1 < pLength) && (pattern[index + 1] != '[')) { + ParseCharClassResult result = parseCharClass( + subIndex, pattern, index+1, pLength, cflags, syntax, + RETURN_AT_AND); + addition.addElement(result.token); + addition.addElement("|"); + // If the method returned at the next "&&", it is OK. + // Otherwise we have eaten the mark of the end of this + // character list "]". In this case we must give back + // the end mark. + index = (result.returnAtAndOperator ? + result.index: result.index - 1); + } + } else { + if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens)); + lastChar = ch; lastCharIsSet = true; + } + if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); + } // while in list + // Out of list, index is one past ']' + + if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens)); + + ParseCharClassResult result = new ParseCharClassResult(); + // Create a new RETokenOneOf + options.trimToSize(); + if (additionAndAppeared) addition.addElement("&"); + if (addition.size() == 0) addition = null; + result.token = new RETokenOneOf(subIndex,options, addition, negative); + result.index = index; + return result; + } + private static int getCharUnit(char[] input, int index, CharUnit unit, boolean quot) throws REException { unit.ch = input[index++]; unit.bk = (unit.ch == '\\' @@ -878,6 +1188,176 @@ public class RE extends REToken { return index; } + private static int parseInt(char[] input, int pos, int len, int radix) { + int ret = 0; + for (int i = pos; i < pos + len; i++) { + ret = ret * radix + Character.digit(input[i], radix); + } + return ret; + } + + /** + * This class represents various expressions for a character. + * "a" : 'a' itself. + * "\0123" : Octal char 0123 + * "\x1b" : Hex char 0x1b + * "\u1234" : Unicode char \u1234 + */ + private static class CharExpression { + /** character represented by this expression */ + char ch; + /** String expression */ + String expr; + /** length of this expression */ + int len; + public String toString() { return expr; } + } + + private static CharExpression getCharExpression(char[] input, int pos, int lim, + RESyntax syntax) { + CharExpression ce = new CharExpression(); + char c = input[pos]; + if (c == '\\') { + if (pos + 1 >= lim) return null; + c = input[pos + 1]; + switch(c) { + case 't': + ce.ch = '\t'; + ce.len = 2; + break; + case 'n': + ce.ch = '\n'; + ce.len = 2; + break; + case 'r': + ce.ch = '\r'; + ce.len = 2; + break; + case 'x': + case 'u': + if ((c == 'x' && syntax.get(RESyntax.RE_HEX_CHAR)) || + (c == 'u' && syntax.get(RESyntax.RE_UNICODE_CHAR))) { + int l = 0; + int expectedLength = (c == 'x' ? 2 : 4); + for (int i = pos + 2; i < pos + 2 + expectedLength; i++) { + if (i >= lim) break; + if (!((input[i] >= '0' && input[i] <= '9') || + (input[i] >= 'A' && input[i] <= 'F') || + (input[i] >= 'a' && input[i] <= 'f'))) + break; + l++; + } + if (l != expectedLength) return null; + ce.ch = (char)(parseInt(input, pos + 2, l, 16)); + ce.len = l + 2; + } + else { + ce.ch = c; + ce.len = 2; + } + break; + case '0': + if (syntax.get(RESyntax.RE_OCTAL_CHAR)) { + int l = 0; + for (int i = pos + 2; i < pos + 2 + 3; i++) { + if (i >= lim) break; + if (input[i] < '0' || input[i] > '7') break; + l++; + } + if (l == 3 && input[pos + 2] > '3') l--; + if (l <= 0) return null; + ce.ch = (char)(parseInt(input, pos + 2, l, 8)); + ce.len = l + 2; + } + else { + ce.ch = c; + ce.len = 2; + } + break; + default: + ce.ch = c; + ce.len = 2; + break; + } + } + else { + ce.ch = input[pos]; + ce.len = 1; + } + ce.expr = new String(input, pos, ce.len); + return ce; + } + + /** + * This class represents a substring in a pattern string expressing + * a named property. + * "\pA" : Property named "A" + * "\p{prop}" : Property named "prop" + * "\PA" : Property named "A" (Negated) + * "\P{prop}" : Property named "prop" (Negated) + */ + private static class NamedProperty { + /** Property name */ + String name; + /** Negated or not */ + boolean negate; + /** length of this expression */ + int len; + } + + private static NamedProperty getNamedProperty(char[] input, int pos, int lim) { + NamedProperty np = new NamedProperty(); + char c = input[pos]; + if (c == '\\') { + if (++pos >= lim) return null; + c = input[pos++]; + switch(c) { + case 'p': + np.negate = false; + break; + case 'P': + np.negate = true; + break; + default: + return null; + } + c = input[pos++]; + if (c == '{') { + int p = -1; + for (int i = pos; i < lim; i++) { + if (input[i] == '}') { + p = i; + break; + } + } + if (p < 0) return null; + int len = p - pos; + np.name = new String(input, pos, len); + np.len = len + 4; + } + else { + np.name = new String(input, pos - 1, 1); + np.len = 3; + } + return np; + } + else return null; + } + + private static RETokenNamedProperty getRETokenNamedProperty( + int subIndex, NamedProperty np, boolean insens, int index) + throws REException { + try { + return new RETokenNamedProperty(subIndex, np.name, insens, np.negate); + } + catch (REException e) { + REException ree; + ree = new REException(e.getMessage(), REException.REG_ESCAPE, index); + ree.initCause(e); + throw ree; + } + } + /** * Checks if the regular expression matches the input in its entirety. * @@ -958,6 +1438,10 @@ public class RE extends REToken { return minimumLength; } + public int getMaximumLength() { + return maximumLength; + } + /** * Returns an array of all matches found in the input. * @@ -1025,7 +1509,9 @@ public class RE extends REToken { /* Implements abstract method REToken.match() */ boolean match(CharIndexed input, REMatch mymatch) { - if (firstToken == null) return next(input, mymatch); + if (firstToken == null) { + return next(input, mymatch); + } // Note the start of this subexpression mymatch.start[subIndex] = mymatch.index; @@ -1089,23 +1575,34 @@ public class RE extends REToken { } REMatch getMatchImpl(CharIndexed input, int anchor, int eflags, StringBuffer buffer) { + boolean tryEntireMatch = ((eflags & REG_TRY_ENTIRE_MATCH) != 0); + RE re = (tryEntireMatch ? (RE) this.clone() : this); + if (tryEntireMatch) { + re.chain(new RETokenEnd(0, null)); + } // Create a new REMatch to hold results REMatch mymatch = new REMatch(numSubs, anchor, eflags); do { // Optimization: check if anchor + minimumLength > length if (minimumLength == 0 || input.charAt(minimumLength-1) != CharIndexed.OUT_OF_BOUNDS) { - if (match(input, mymatch)) { - // Find longest match of them all to observe leftmost longest - REMatch longest = mymatch; + if (re.match(input, mymatch)) { + REMatch best = mymatch; + // We assume that the match that coms first is the best. + // And the following "The longer, the better" rule has + // been commented out. The longest is not neccesarily + // the best. For example, "a" out of "aaa" is the best + // match for /a+?/. + /* + // Find best match of them all to observe leftmost longest while ((mymatch = mymatch.next) != null) { - if (mymatch.index > longest.index) { - longest = mymatch; + if (mymatch.index > best.index) { + best = mymatch; } } - - longest.end[0] = longest.index; - longest.finish(input); - return longest; + */ + best.end[0] = best.index; + best.finish(input); + return best; } } mymatch.clear(++anchor); @@ -1216,8 +1713,7 @@ public class RE extends REToken { StringBuffer buffer = new StringBuffer(); REMatch m = getMatchImpl(input,index,eflags,buffer); if (m==null) return buffer.toString(); - buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ? - replace : m.substituteInto(replace) ); + buffer.append(getReplacement(replace, m, eflags)); if (input.move(m.end[0])) { do { buffer.append(input.charAt(0)); @@ -1278,8 +1774,7 @@ public class RE extends REToken { StringBuffer buffer = new StringBuffer(); REMatch m; while ((m = getMatchImpl(input,index,eflags,buffer)) != null) { - buffer.append( ((eflags & REG_NO_INTERPOLATE) > 0) ? - replace : m.substituteInto(replace) ); + buffer.append(getReplacement(replace, m, eflags)); index = m.getEndIndex(); if (m.end[0] == 0) { char ch = input.charAt(0); @@ -1294,11 +1789,50 @@ public class RE extends REToken { } return buffer.toString(); } + + public static String getReplacement(String replace, REMatch m, int eflags) { + if ((eflags & REG_NO_INTERPOLATE) > 0) + return replace; + else { + if ((eflags & REG_REPLACE_USE_BACKSLASHESCAPE) > 0) { + StringBuffer sb = new StringBuffer(); + int l = replace.length(); + for (int i = 0; i < l; i++) { + char c = replace.charAt(i); + switch(c) { + case '\\': + i++; + // Let StringIndexOutOfBoundsException be thrown. + sb.append(replace.charAt(i)); + break; + case '$': + int i1 = i + 1; + while (i1 < replace.length() && + Character.isDigit(replace.charAt(i1))) i1++; + sb.append(m.substituteInto(replace.substring(i, i1))); + i = i1 - 1; + break; + default: + sb.append(c); + } + } + return sb.toString(); + } + else + return m.substituteInto(replace); + } + } /* Helper function for constructor */ private void addToken(REToken next) { if (next == null) return; minimumLength += next.getMinimumLength(); + int nmax = next.getMaximumLength(); + if (nmax < Integer.MAX_VALUE && maximumLength < Integer.MAX_VALUE) + maximumLength += nmax; + else + maximumLength = Integer.MAX_VALUE; + if (firstToken == null) { lastToken = firstToken = next; } else { diff --git a/libjava/classpath/gnu/regexp/REMatch.java b/libjava/classpath/gnu/regexp/REMatch.java index cf25bb331c5..91a3c0249c0 100644 --- a/libjava/classpath/gnu/regexp/REMatch.java +++ b/libjava/classpath/gnu/regexp/REMatch.java @@ -1,5 +1,5 @@ /* gnu/regexp/REMatch.java - Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc. + Copyright (C) 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -67,6 +67,10 @@ public final class REMatch implements Serializable, Cloneable { int[] start; // start positions (relative to offset) for each (sub)exp. int[] end; // end positions for the same REMatch next; // other possibility (to avoid having to use arrays) + boolean empty; // empty string matched. This flag is used only within + // RETokenRepeated. + int matchFlags; // flags passed to match methods + static final int MF_FIND_ALL = 0x01; public Object clone() { try { @@ -177,7 +181,9 @@ public final class REMatch implements Serializable, Cloneable { * @param sub Index of the subexpression. */ public String toString(int sub) { - if ((sub >= start.length) || (start[sub] == -1)) return ""; + if ((sub >= start.length) || sub < 0) + throw new IndexOutOfBoundsException("No group " + sub); + if (start[sub] == -1) return null; return (matchedText.substring(start[sub],end[sub])); } @@ -242,6 +248,8 @@ public final class REMatch implements Serializable, Cloneable { * <code>$0</code> through <code>$9</code>. <code>$0</code> matches * the full substring matched; <code>$<i>n</i></code> matches * subexpression number <i>n</i>. + * <code>$10, $11, ...</code> may match the 10th, 11th, ... subexpressions + * if such subexpressions exist. * * @param input A string consisting of literals and <code>$<i>n</i></code> tokens. */ @@ -252,6 +260,16 @@ public final class REMatch implements Serializable, Cloneable { for (pos = 0; pos < input.length()-1; pos++) { if ((input.charAt(pos) == '$') && (Character.isDigit(input.charAt(pos+1)))) { int val = Character.digit(input.charAt(++pos),10); + int pos1 = pos + 1; + while (pos1 < input.length() && + Character.isDigit(input.charAt(pos1))) { + int val1 = val*10 + Character.digit(input.charAt(pos1),10); + if (val1 >= start.length) break; + pos1++; + val = val1; + } + pos = pos1 - 1; + if (val < start.length) { output.append(toString(val)); } @@ -260,4 +278,42 @@ public final class REMatch implements Serializable, Cloneable { if (pos < input.length()) output.append(input.charAt(pos)); return output.toString(); } + + static class REMatchList { + REMatch head; + REMatch tail; + REMatchList() { + head = tail = null; + } + /* Not used now. But we may need this some day? + void addHead(REMatch newone) { + if (head == null) { + head = newone; + tail = newone; + while (tail.next != null) { + tail = tail.next; + } + } + else { + REMatch tmp = newone; + while (tmp.next != null) tmp = tmp.next; + tmp.next = head; + head = newone; + } + } + */ + void addTail(REMatch newone) { + if (head == null) { + head = newone; + tail = newone; + } + else { + tail.next = newone; + } + while (tail.next != null) { + tail = tail.next; + } + } + } + } diff --git a/libjava/classpath/gnu/regexp/RESyntax.java b/libjava/classpath/gnu/regexp/RESyntax.java index 7272b03481b..81fd999bfcf 100644 --- a/libjava/classpath/gnu/regexp/RESyntax.java +++ b/libjava/classpath/gnu/regexp/RESyntax.java @@ -1,5 +1,5 @@ /* gnu/regexp/RESyntax.java - Copyright (C) 1998-2002, 2004 Free Software Foundation, Inc. + Copyright (C) 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -202,7 +202,37 @@ public final class RESyntax implements Serializable { */ public static final int RE_POSSESSIVE_OPS = 25; - private static final int BIT_TOTAL = 26; + /** + * Syntax bit. Allow embedded flags, (?is-x), as in Perl5. + */ + public static final int RE_EMBEDDED_FLAGS = 26; + + /** + * Syntax bit. Allow octal char (\0377), as in Perl5. + */ + public static final int RE_OCTAL_CHAR = 27; + + /** + * Syntax bit. Allow hex char (\x1b), as in Perl5. + */ + public static final int RE_HEX_CHAR = 28; + + /** + * Syntax bit. Allow Unicode char (\u1234), as in Java 1.4. + */ + public static final int RE_UNICODE_CHAR = 29; + + /** + * Syntax bit. Allow named property (\p{P}, \P{p}), as in Perl5. + */ + public static final int RE_NAMED_PROPERTY = 30; + + /** + * Syntax bit. Allow nested characterclass ([a-z&&[^p-r]]), as in Java 1.4. + */ + public static final int RE_NESTED_CHARCLASS = 31; + + private static final int BIT_TOTAL = 32; /** * Predefined syntax. @@ -422,6 +452,10 @@ public final class RESyntax implements Serializable { .set(RE_STRING_ANCHORS) // \A,\Z .set(RE_CHAR_CLASS_ESC_IN_LISTS)// \d,\D,\w,\W,\s,\S within [] .set(RE_COMMENTS) // (?#) + .set(RE_EMBEDDED_FLAGS) // (?imsx-imsx) + .set(RE_OCTAL_CHAR) // \0377 + .set(RE_HEX_CHAR) // \x1b + .set(RE_NAMED_PROPERTY) // \p{prop}, \P{prop} .makeFinal(); RE_SYNTAX_PERL5_S = new RESyntax(RE_SYNTAX_PERL5) @@ -431,6 +465,8 @@ public final class RESyntax implements Serializable { RE_SYNTAX_JAVA_1_4 = new RESyntax(RE_SYNTAX_PERL5) // XXX .set(RE_POSSESSIVE_OPS) // *+,?+,++,{}+ + .set(RE_UNICODE_CHAR) // \u1234 + .set(RE_NESTED_CHARCLASS) // [a-z&&[^p-r]] .makeFinal(); } diff --git a/libjava/classpath/gnu/regexp/REToken.java b/libjava/classpath/gnu/regexp/REToken.java index 4eae9ec473c..5f4659b21ac 100644 --- a/libjava/classpath/gnu/regexp/REToken.java +++ b/libjava/classpath/gnu/regexp/REToken.java @@ -38,12 +38,21 @@ exception statement from your version. */ package gnu.regexp; import java.io.Serializable; -abstract class REToken implements Serializable { +abstract class REToken implements Serializable, Cloneable { protected REToken next = null; protected REToken uncle = null; protected int subIndex; + public Object clone() { + try { + REToken copy = (REToken) super.clone(); + return copy; + } catch (CloneNotSupportedException e) { + throw new Error(); // doesn't happen + } + } + protected REToken(int subIndex) { this.subIndex = subIndex; } @@ -52,6 +61,10 @@ abstract class REToken implements Serializable { return 0; } + int getMaximumLength() { + return Integer.MAX_VALUE; + } + void setUncle(REToken anUncle) { uncle = anUncle; } diff --git a/libjava/classpath/gnu/regexp/RETokenAny.java b/libjava/classpath/gnu/regexp/RETokenAny.java index ac032dcb3bf..2b0967a79a1 100644 --- a/libjava/classpath/gnu/regexp/RETokenAny.java +++ b/libjava/classpath/gnu/regexp/RETokenAny.java @@ -55,6 +55,10 @@ final class RETokenAny extends REToken { return 1; } + int getMaximumLength() { + return 1; + } + boolean match(CharIndexed input, REMatch mymatch) { char ch = input.charAt(mymatch.index); if ((ch == CharIndexed.OUT_OF_BOUNDS) diff --git a/libjava/classpath/gnu/regexp/RETokenBackRef.java b/libjava/classpath/gnu/regexp/RETokenBackRef.java index 674822abd70..060a6cf7d20 100644 --- a/libjava/classpath/gnu/regexp/RETokenBackRef.java +++ b/libjava/classpath/gnu/regexp/RETokenBackRef.java @@ -51,13 +51,25 @@ final class RETokenBackRef extends REToken { // should implement getMinimumLength() -- any ideas? boolean match(CharIndexed input, REMatch mymatch) { + if (num >= mymatch.start.length) return false; + if (num >= mymatch.end.length) return false; int b,e; b = mymatch.start[num]; e = mymatch.end[num]; if ((b==-1)||(e==-1)) return false; // this shouldn't happen, but... for (int i=b; i<e; i++) { - if (input.charAt(mymatch.index+i-b) != input.charAt(i)) { - return false; + char c1 = input.charAt(mymatch.index+i-b); + char c2 = input.charAt(i); + if (c1 != c2) { + if (insens) { + if (c1 != Character.toLowerCase(c2) && + c1 != Character.toUpperCase(c2)) { + return false; + } + } + else { + return false; + } } } mymatch.index += e-b; diff --git a/libjava/classpath/gnu/regexp/RETokenChar.java b/libjava/classpath/gnu/regexp/RETokenChar.java index a15449b2d96..5c087c68778 100644 --- a/libjava/classpath/gnu/regexp/RETokenChar.java +++ b/libjava/classpath/gnu/regexp/RETokenChar.java @@ -52,6 +52,10 @@ final class RETokenChar extends REToken { return ch.length; } + int getMaximumLength() { + return ch.length; + } + boolean match(CharIndexed input, REMatch mymatch) { int z = ch.length; char c; @@ -68,7 +72,7 @@ final class RETokenChar extends REToken { // Overrides REToken.chain() to optimize for strings boolean chain(REToken next) { - if (next instanceof RETokenChar) { + if (next instanceof RETokenChar && ((RETokenChar)next).insens == insens) { RETokenChar cnext = (RETokenChar) next; // assume for now that next can only be one character int newsize = ch.length + cnext.ch.length; diff --git a/libjava/classpath/gnu/regexp/RETokenEnd.java b/libjava/classpath/gnu/regexp/RETokenEnd.java index 70483b746a9..788a964da41 100644 --- a/libjava/classpath/gnu/regexp/RETokenEnd.java +++ b/libjava/classpath/gnu/regexp/RETokenEnd.java @@ -49,6 +49,10 @@ final class RETokenEnd extends REToken { this.newline = newline; } + int getMaximumLength() { + return 0; + } + boolean match(CharIndexed input, REMatch mymatch) { char ch = input.charAt(mymatch.index); if (ch == CharIndexed.OUT_OF_BOUNDS) diff --git a/libjava/classpath/gnu/regexp/RETokenEndSub.java b/libjava/classpath/gnu/regexp/RETokenEndSub.java index f3bb4f2e131..fe2969d0592 100644 --- a/libjava/classpath/gnu/regexp/RETokenEndSub.java +++ b/libjava/classpath/gnu/regexp/RETokenEndSub.java @@ -41,6 +41,10 @@ final class RETokenEndSub extends REToken { RETokenEndSub(int subIndex) { super(subIndex); } + + int getMaximumLength() { + return 0; + } boolean match(CharIndexed input, REMatch mymatch) { mymatch.end[subIndex] = mymatch.index; diff --git a/libjava/classpath/gnu/regexp/RETokenIndependent.java b/libjava/classpath/gnu/regexp/RETokenIndependent.java new file mode 100644 index 00000000000..2eb14722361 --- /dev/null +++ b/libjava/classpath/gnu/regexp/RETokenIndependent.java @@ -0,0 +1,76 @@ +/* gnu/regexp/RETokenIndependent.java + Copyright (C) 2006 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.regexp; + +/** + * @author Ito Kazumitsu + */ +final class RETokenIndependent extends REToken +{ + REToken re; + + RETokenIndependent(REToken re) throws REException { + super(0); + this.re = re; + } + + int getMinimumLength() { + return re.getMinimumLength(); + } + + int getMaximumLength() { + return re.getMaximumLength(); + } + + boolean match(CharIndexed input, REMatch mymatch) + { + if (re.match(input, mymatch)) { + // Once we have found a match, we do not see other possible matches. + mymatch.next = null; + return next(input, mymatch); + } + return false; + } + + void dump(StringBuffer os) { + os.append("(?>"); + re.dumpAll(os); + os.append(')'); + } +} + diff --git a/libjava/classpath/gnu/regexp/RETokenLookAhead.java b/libjava/classpath/gnu/regexp/RETokenLookAhead.java index 33eaec9fac1..b44dfa50c4f 100644 --- a/libjava/classpath/gnu/regexp/RETokenLookAhead.java +++ b/libjava/classpath/gnu/regexp/RETokenLookAhead.java @@ -52,6 +52,10 @@ final class RETokenLookAhead extends REToken this.negative = negative; } + int getMaximumLength() { + return 0; + } + boolean match(CharIndexed input, REMatch mymatch) { REMatch trymatch = (REMatch)mymatch.clone(); diff --git a/libjava/classpath/gnu/regexp/RETokenLookBehind.java b/libjava/classpath/gnu/regexp/RETokenLookBehind.java new file mode 100644 index 00000000000..a6c1b34cb0b --- /dev/null +++ b/libjava/classpath/gnu/regexp/RETokenLookBehind.java @@ -0,0 +1,116 @@ +/* gnu/regexp/RETokenLookBehind.java + Copyright (C) 2006 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + +package gnu.regexp; + +/** + * @author Ito Kazumitsu + */ +final class RETokenLookBehind extends REToken +{ + REToken re; + boolean negative; + + RETokenLookBehind(REToken re, boolean negative) throws REException { + super(0); + this.re = re; + this.negative = negative; + } + + int getMaximumLength() { + return 0; + } + + boolean match(CharIndexed input, REMatch mymatch) + { + int max = re.getMaximumLength(); + CharIndexed behind = input.lookBehind(mymatch.index, max); + REMatch trymatch = (REMatch)mymatch.clone(); + REMatch trymatch1 = (REMatch)mymatch.clone(); + REMatch newMatch = null; + int curIndex = trymatch.index + behind.length() - input.length(); + trymatch.index = 0; + RETokenMatchHereOnly stopper = new RETokenMatchHereOnly(curIndex); + REToken re1 = (REToken) re.clone(); + re1.chain(stopper); + if (re1.match(behind, trymatch)) { + if (negative) return false; + if (next(input, trymatch1)) + newMatch = trymatch1; + } + + if (newMatch != null) { + if (negative) return false; + //else + mymatch.assignFrom(newMatch); + return true; + } + else { // no match + if (negative) + return next(input, mymatch); + //else + return false; + } + } + + void dump(StringBuffer os) { + os.append("(?<"); + os.append(negative ? '!' : '='); + re.dumpAll(os); + os.append(')'); + } + + private static class RETokenMatchHereOnly extends REToken { + + int getMaximumLength() { return 0; } + + private int index; + + RETokenMatchHereOnly(int index) { + super(0); + this.index = index; + } + + boolean match(CharIndexed input, REMatch mymatch) { + return index == mymatch.index; + } + + void dump(StringBuffer os) {} + + } +} + diff --git a/libjava/classpath/gnu/regexp/RETokenNamedProperty.java b/libjava/classpath/gnu/regexp/RETokenNamedProperty.java new file mode 100644 index 00000000000..13c1e418a09 --- /dev/null +++ b/libjava/classpath/gnu/regexp/RETokenNamedProperty.java @@ -0,0 +1,301 @@ +/* gnu/regexp/RETokenNamedProperty.java + Copyright (C) 2006 Free Software Foundation, Inc. + +This file is part of GNU Classpath. + +GNU Classpath is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2, or (at your option) +any later version. + +GNU Classpath is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with GNU Classpath; see the file COPYING. If not, write to the +Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301 USA. + +Linking this library statically or dynamically with other modules is +making a combined work based on this library. Thus, the terms and +conditions of the GNU General Public License cover the whole +combination. + +As a special exception, the copyright holders of this library give you +permission to link this library with independent modules to produce an +executable, regardless of the license terms of these independent +modules, and to copy and distribute the resulting executable under +terms of your choice, provided that you also meet, for each linked +independent module, the terms and conditions of the license of that +module. An independent module is a module which is not derived from +or based on this library. If you modify this library, you may extend +this exception to your version of the library, but you are not +obligated to do so. If you do not wish to do so, delete this +exception statement from your version. */ + + +package gnu.regexp; + +final class RETokenNamedProperty extends REToken { + String name; + boolean insens; + boolean negate; + Handler handler; + + // Grouped properties + static final byte[] LETTER = new byte[] + { Character.LOWERCASE_LETTER, + Character.UPPERCASE_LETTER, + Character.TITLECASE_LETTER, + Character.MODIFIER_LETTER, + Character.OTHER_LETTER }; + + static final byte[] MARK = new byte[] + { Character.NON_SPACING_MARK, + Character.COMBINING_SPACING_MARK, + Character.ENCLOSING_MARK }; + + static final byte[] SEPARATOR = new byte[] + { Character.SPACE_SEPARATOR, + Character.LINE_SEPARATOR, + Character.PARAGRAPH_SEPARATOR }; + + static final byte[] SYMBOL = new byte[] + { Character.MATH_SYMBOL, + Character.CURRENCY_SYMBOL, + Character.MODIFIER_SYMBOL, + Character.OTHER_SYMBOL }; + + static final byte[] NUMBER = new byte[] + { Character.DECIMAL_DIGIT_NUMBER, + Character.LETTER_NUMBER, + Character.OTHER_NUMBER }; + + static final byte[] PUNCTUATION = new byte[] + { Character.DASH_PUNCTUATION, + Character.START_PUNCTUATION, + Character.END_PUNCTUATION, + Character.CONNECTOR_PUNCTUATION, + Character.OTHER_PUNCTUATION, + Character.INITIAL_QUOTE_PUNCTUATION, + Character.FINAL_QUOTE_PUNCTUATION}; + + static final byte[] OTHER = new byte[] + { Character.CONTROL, + Character.FORMAT, + Character.PRIVATE_USE, + Character.SURROGATE, + Character.UNASSIGNED }; + + RETokenNamedProperty(int subIndex, String name, boolean insens, boolean negate) throws REException { + super(subIndex); + this.name = name; + this.insens = insens; + this.negate = negate; + handler = getHandler(name); + } + + int getMinimumLength() { + return 1; + } + + int getMaximumLength() { + return 1; + } + + boolean match(CharIndexed input, REMatch mymatch) { + char ch = input.charAt(mymatch.index); + if (ch == CharIndexed.OUT_OF_BOUNDS) + return false; + + boolean retval = handler.includes(ch); + if (insens) { + retval = retval || + handler.includes(Character.toUpperCase(ch)) || + handler.includes(Character.toLowerCase(ch)); + } + + if (negate) retval = !retval; + if (retval) { + ++mymatch.index; + return next(input, mymatch); + } + else return false; + } + + void dump(StringBuffer os) { + os.append("\\") + .append(negate ? "P" : "p") + .append("{" + name + "}"); + } + + private abstract static class Handler { + public abstract boolean includes(char c); + } + + private Handler getHandler(String name) throws REException { + if (name.equals("Lower") || + name.equals("Upper") || + // name.equals("ASCII") || + name.equals("Alpha") || + name.equals("Digit") || + name.equals("Alnum") || + name.equals("Punct") || + name.equals("Graph") || + name.equals("Print") || + name.equals("Blank") || + name.equals("Cntrl") || + name.equals("XDigit") || + name.equals("Space") ) { + return new POSIXHandler(name); + } + if (name.startsWith("In")) { + try { + name = name.substring(2); + Character.UnicodeBlock block = Character.UnicodeBlock.forName(name); + return new UnicodeBlockHandler(block); + } + catch (IllegalArgumentException e) { + throw new REException("Invalid Unicode block name: " + name, REException.REG_ESCAPE, 0); + } + } + if (name.startsWith("Is")) { + name = name.substring(2); + } + + // "grouped properties" + if (name.equals("L")) + return new UnicodeCategoriesHandler(LETTER); + if (name.equals("M")) + return new UnicodeCategoriesHandler(MARK); + if (name.equals("Z")) + return new UnicodeCategoriesHandler(SEPARATOR); + if (name.equals("S")) + return new UnicodeCategoriesHandler(SYMBOL); + if (name.equals("N")) + return new UnicodeCategoriesHandler(NUMBER); + if (name.equals("P")) + return new UnicodeCategoriesHandler(PUNCTUATION); + if (name.equals("C")) + return new UnicodeCategoriesHandler(OTHER); + + if (name.equals("Mc")) + return new UnicodeCategoryHandler(Character.COMBINING_SPACING_MARK); + if (name.equals("Pc")) + return new UnicodeCategoryHandler(Character.CONNECTOR_PUNCTUATION); + if (name.equals("Cc")) + return new UnicodeCategoryHandler(Character.CONTROL); + if (name.equals("Sc")) + return new UnicodeCategoryHandler(Character.CURRENCY_SYMBOL); + if (name.equals("Pd")) + return new UnicodeCategoryHandler(Character.DASH_PUNCTUATION); + if (name.equals("Nd")) + return new UnicodeCategoryHandler(Character.DECIMAL_DIGIT_NUMBER); + if (name.equals("Me")) + return new UnicodeCategoryHandler(Character.ENCLOSING_MARK); + if (name.equals("Pe")) + return new UnicodeCategoryHandler(Character.END_PUNCTUATION); + if (name.equals("Pf")) + return new UnicodeCategoryHandler(Character.FINAL_QUOTE_PUNCTUATION); + if (name.equals("Cf")) + return new UnicodeCategoryHandler(Character.FORMAT); + if (name.equals("Pi")) + return new UnicodeCategoryHandler(Character.INITIAL_QUOTE_PUNCTUATION); + if (name.equals("Nl")) + return new UnicodeCategoryHandler(Character.LETTER_NUMBER); + if (name.equals("Zl")) + return new UnicodeCategoryHandler(Character.LINE_SEPARATOR); + if (name.equals("Ll")) + return new UnicodeCategoryHandler(Character.LOWERCASE_LETTER); + if (name.equals("Sm")) + return new UnicodeCategoryHandler(Character.MATH_SYMBOL); + if (name.equals("Lm")) + return new UnicodeCategoryHandler(Character.MODIFIER_LETTER); + if (name.equals("Sk")) + return new UnicodeCategoryHandler(Character.MODIFIER_SYMBOL); + if (name.equals("Mn")) + return new UnicodeCategoryHandler(Character.NON_SPACING_MARK); + if (name.equals("Lo")) + return new UnicodeCategoryHandler(Character.OTHER_LETTER); + if (name.equals("No")) + return new UnicodeCategoryHandler(Character.OTHER_NUMBER); + if (name.equals("Po")) + return new UnicodeCategoryHandler(Character.OTHER_PUNCTUATION); + if (name.equals("So")) + return new UnicodeCategoryHandler(Character.OTHER_SYMBOL); + if (name.equals("Zp")) + return new UnicodeCategoryHandler(Character.PARAGRAPH_SEPARATOR); + if (name.equals("Co")) + return new UnicodeCategoryHandler(Character.PRIVATE_USE); + if (name.equals("Zs")) + return new UnicodeCategoryHandler(Character.SPACE_SEPARATOR); + if (name.equals("Ps")) + return new UnicodeCategoryHandler(Character.START_PUNCTUATION); + if (name.equals("Cs")) + return new UnicodeCategoryHandler(Character.SURROGATE); + if (name.equals("Lt")) + return new UnicodeCategoryHandler(Character.TITLECASE_LETTER); + if (name.equals("Cn")) + return new UnicodeCategoryHandler(Character.UNASSIGNED); + if (name.equals("Lu")) + return new UnicodeCategoryHandler(Character.UPPERCASE_LETTER); + throw new REException("unsupported name " + name, REException.REG_ESCAPE, 0); + } + + private static class POSIXHandler extends Handler { + private RETokenPOSIX retoken; + private REMatch mymatch = new REMatch(0,0,0); + private char[] chars = new char[1]; + private CharIndexedCharArray ca = new CharIndexedCharArray(chars, 0); + public POSIXHandler(String name) { + int posixId = RETokenPOSIX.intValue(name.toLowerCase()); + if (posixId != -1) + retoken = new RETokenPOSIX(0,posixId,false,false); + else + throw new RuntimeException("Unknown posix ID: " + name); + } + public boolean includes(char c) { + chars[0] = c; + mymatch.index = 0; + return retoken.match(ca, mymatch); + } + } + + private static class UnicodeCategoryHandler extends Handler { + public UnicodeCategoryHandler(byte category) { + this.category = (int)category; + } + private int category; + public boolean includes(char c) { + return Character.getType(c) == category; + } + } + + private static class UnicodeCategoriesHandler extends Handler { + public UnicodeCategoriesHandler(byte[] categories) { + this.categories = categories; + } + private byte[] categories; + public boolean includes(char c) { + int category = Character.getType(c); + for (int i = 0; i < categories.length; i++) + if (category == categories[i]) + return true; + return false; + } + } + + private static class UnicodeBlockHandler extends Handler { + public UnicodeBlockHandler(Character.UnicodeBlock block) { + this.block = block; + } + private Character.UnicodeBlock block; + public boolean includes(char c) { + Character.UnicodeBlock cblock = Character.UnicodeBlock.of(c); + return (cblock != null && cblock.equals(block)); + } + } + +} diff --git a/libjava/classpath/gnu/regexp/RETokenOneOf.java b/libjava/classpath/gnu/regexp/RETokenOneOf.java index 3f6e89e2103..260bc4b8f67 100644 --- a/libjava/classpath/gnu/regexp/RETokenOneOf.java +++ b/libjava/classpath/gnu/regexp/RETokenOneOf.java @@ -1,5 +1,5 @@ /* gnu/regexp/RETokenOneOf.java - Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc. + Copyright (C) 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -37,11 +37,35 @@ exception statement from your version. */ package gnu.regexp; import java.util.Vector; +import java.util.Stack; final class RETokenOneOf extends REToken { private Vector options; private boolean negative; + private Vector addition; + // This Vector addition is used to store nested character classes. + // For example, if the original expression is + // [2-7a-c[f-k][m-z]&&[^p-v][st]] + // the basic part /2-7a-c/ is stored in the Vector options, and + // the additional part /[f-k][m-z]&&[^p-v][st]/ is stored in the + // Vector addition in the following order (Reverse Polish Notation): + // -- The matching result of the basic part is assumed here. + // [f-k] -- REToken + // "|" -- or + // [m-z] -- REToken + // "|" -- or + // false + // [^p-v] -- REToken + // "|" -- or + // [st] -- REToken + // "|" -- or + // "&" -- and + // + // As it is clear from the explanation above, the Vector addition is + // effective only when this REToken originates from a character class + // expression. + // This constructor is used for convenience when we know the set beforehand, // e.g. \d --> new RETokenOneOf("0123456789",false, ..) // \D --> new RETokenOneOf("0123456789",true, ..) @@ -60,7 +84,17 @@ final class RETokenOneOf extends REToken { this.negative = negative; } + RETokenOneOf(int subIndex, Vector options, Vector addition, boolean negative) { + super(subIndex); + this.options = options; + this.addition = addition; + this.negative = negative; + } + int getMinimumLength() { + // (negative || addition != null) occurs when this token originates from + // character class expression. + if (negative || addition != null) return 1; int min = Integer.MAX_VALUE; int x; for (int i=0; i < options.size(); i++) { @@ -70,54 +104,123 @@ final class RETokenOneOf extends REToken { return min; } + int getMaximumLength() { + // (negative || addition != null) occurs when this token originates from + // character class expression. + if (negative || addition != null) return 1; + int max = 0; + int x; + for (int i=0; i < options.size(); i++) { + if ((x = ((REToken) options.elementAt(i)).getMaximumLength()) > max) + max = x; + } + return max; + } + boolean match(CharIndexed input, REMatch mymatch) { - if (negative && (input.charAt(mymatch.index) == CharIndexed.OUT_OF_BOUNDS)) + REMatch tryMatch; + boolean tryOnly; + if (addition == null) { + tryMatch = mymatch; + tryOnly = false; + } + else { + tryMatch = (REMatch) mymatch.clone(); + tryOnly = true; + } + boolean b = negative ? + matchN(input, tryMatch, tryOnly) : + matchP(input, tryMatch, tryOnly); + if (addition == null) return b; + + Stack stack = new Stack(); + stack.push(new Boolean(b)); + for (int i=0; i < addition.size(); i++) { + Object obj = addition.elementAt(i); + if (obj instanceof REToken) { + b = ((REToken)obj).match(input, (REMatch)mymatch.clone()); + stack.push(new Boolean(b)); + } + else if (obj instanceof Boolean) { + stack.push(obj); + } + else if (obj.equals("|")) { + b = ((Boolean)stack.pop()).booleanValue(); + b = ((Boolean)stack.pop()).booleanValue() || b; + stack.push(new Boolean(b)); + } + else if (obj.equals("&")) { + b = ((Boolean)stack.pop()).booleanValue(); + b = ((Boolean)stack.pop()).booleanValue() && b; + stack.push(new Boolean(b)); + } + else { + throw new RuntimeException("Invalid object found"); + } + } + b = ((Boolean)stack.pop()).booleanValue(); + if (b) { + ++mymatch.index; + return next(input, mymatch); + } return false; + } - REMatch newMatch = null; - REMatch last = null; - REToken tk; - boolean isMatch; - for (int i=0; i < options.size(); i++) { + private boolean matchN(CharIndexed input, REMatch mymatch, boolean tryOnly) { + if (input.charAt(mymatch.index) == CharIndexed.OUT_OF_BOUNDS) + return false; + + REMatch newMatch = null; + REMatch last = null; + REToken tk; + for (int i=0; i < options.size(); i++) { tk = (REToken) options.elementAt(i); REMatch tryMatch = (REMatch) mymatch.clone(); if (tk.match(input, tryMatch)) { // match was successful - if (negative) return false; - - if (next(input, tryMatch)) { - // Add tryMatch to list of possibilities. - if (last == null) { - newMatch = tryMatch; - last = tryMatch; - } else { - last.next = tryMatch; - last = tryMatch; - } - } // next succeeds - } // is a match - } // try next option - - if (newMatch != null) { - if (negative) { return false; - } else { - // set contents of mymatch equal to newMatch + } // is a match + } // try next option - // try each one that matched - mymatch.assignFrom(newMatch); - return true; - } - } else { - if (negative) { - ++mymatch.index; - return next(input, mymatch); - } else { - return false; - } + if (tryOnly) return true; + ++mymatch.index; + return next(input, mymatch); } - // index+1 works for [^abc] lists, not for generic lookahead (--> index) - } + private boolean matchP(CharIndexed input, REMatch mymatch, boolean tryOnly) { + boolean stopMatchingIfSatisfied = + (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0; + REMatch.REMatchList newMatch = new REMatch.REMatchList(); + REToken tk; + for (int i=0; i < options.size(); i++) { + // In order that the backtracking can work, + // each option must be chained to the next token. + // But the chain method has some side effect, so + // we use clones. + tk = (REToken)((REToken) options.elementAt(i)).clone(); + if (! tryOnly) { + tk.chain(this.next); + tk.setUncle(this.uncle); + tk.subIndex = this.subIndex; + } + REMatch tryMatch = (REMatch) mymatch.clone(); + if (tk.match(input, tryMatch)) { // match was successful + if (tryOnly) return true; + newMatch.addTail(tryMatch); + if (stopMatchingIfSatisfied) break; + } // is a match + } // try next option + if (tryOnly) return false; + + if (newMatch.head != null) { + // set contents of mymatch equal to newMatch + + // try each one that matched + mymatch.assignFrom(newMatch.head); + return true; + } else { + return false; + } + } void dump(StringBuffer os) { os.append(negative ? "[^" : "(?:"); diff --git a/libjava/classpath/gnu/regexp/RETokenPOSIX.java b/libjava/classpath/gnu/regexp/RETokenPOSIX.java index bbb8066bca8..4182c6fab98 100644 --- a/libjava/classpath/gnu/regexp/RETokenPOSIX.java +++ b/libjava/classpath/gnu/regexp/RETokenPOSIX.java @@ -81,6 +81,10 @@ final class RETokenPOSIX extends REToken { return 1; } + int getMaximumLength() { + return 1; + } + boolean match(CharIndexed input, REMatch mymatch) { char ch = input.charAt(mymatch.index); if (ch == CharIndexed.OUT_OF_BOUNDS) diff --git a/libjava/classpath/gnu/regexp/RETokenRange.java b/libjava/classpath/gnu/regexp/RETokenRange.java index dadaf2d8072..8a1ac86b212 100644 --- a/libjava/classpath/gnu/regexp/RETokenRange.java +++ b/libjava/classpath/gnu/regexp/RETokenRange.java @@ -43,19 +43,32 @@ final class RETokenRange extends REToken { RETokenRange(int subIndex, char lo, char hi, boolean ins) { super(subIndex); - this.lo = (insens = ins) ? Character.toLowerCase(lo) : lo; - this.hi = ins ? Character.toLowerCase(hi) : hi; + insens = ins; + this.lo = lo; + this.hi = hi; } int getMinimumLength() { return 1; } + int getMaximumLength() { + return 1; + } + boolean match(CharIndexed input, REMatch mymatch) { char c = input.charAt(mymatch.index); if (c == CharIndexed.OUT_OF_BOUNDS) return false; - if (insens) c = Character.toLowerCase(c); - if ((c >= lo) && (c <= hi)) { + boolean matches = (c >= lo) && (c <= hi); + if (! matches && insens) { + char c1 = Character.toLowerCase(c); + matches = (c1 >= lo) && (c1 <= hi); + if (!matches) { + c1 = Character.toUpperCase(c); + matches = (c1 >= lo) && (c1 <= hi); + } + } + if (matches) { ++mymatch.index; return next(input, mymatch); } diff --git a/libjava/classpath/gnu/regexp/RETokenRepeated.java b/libjava/classpath/gnu/regexp/RETokenRepeated.java index 6291a3c3960..2d019c53cbd 100644 --- a/libjava/classpath/gnu/regexp/RETokenRepeated.java +++ b/libjava/classpath/gnu/regexp/RETokenRepeated.java @@ -1,5 +1,5 @@ /* gnu/regexp/RETokenRepeated.java - Copyright (C) 1998-2001, 2004 Free Software Foundation, Inc. + Copyright (C) 2006 Free Software Foundation, Inc. This file is part of GNU Classpath. @@ -39,6 +39,7 @@ exception statement from your version. */ package gnu.regexp; import java.util.Vector; +import java.util.Arrays; final class RETokenRepeated extends REToken { private REToken token; @@ -82,6 +83,38 @@ final class RETokenRepeated extends REToken { return (min * token.getMinimumLength()); } + int getMaximumLength() { + if (max == Integer.MAX_VALUE) return Integer.MAX_VALUE; + int tmax = token.getMaximumLength(); + if (tmax == Integer.MAX_VALUE) return tmax; + return (max * tmax); + } + + private static REMatch findDoables(REToken tk, + CharIndexed input, REMatch mymatch) { + + REMatch.REMatchList doables = new REMatch.REMatchList(); + + // try next repeat at all possible positions + for (REMatch current = mymatch; + current != null; current = current.next) { + REMatch recurrent = (REMatch) current.clone(); + int origin = recurrent.index; + tk = (REToken) tk.clone(); + tk.next = tk.uncle = null; + recurrent.matchFlags |= REMatch.MF_FIND_ALL; + if (tk.match(input, recurrent)) { + for (REMatch m = recurrent; m != null; m = m.next) { + m.matchFlags &= ~REMatch.MF_FIND_ALL; + } + if (recurrent.index == origin) recurrent.empty = true; + // add all items in current to doables array + doables.addTail(recurrent); + } + } + return doables.head; + } + // We do need to save every possible point, but the number of clone() // invocations here is really a killer for performance on non-stingy // repeat operators. I'm open to suggestions... @@ -91,59 +124,167 @@ final class RETokenRepeated extends REToken { // the subexpression back-reference operator allow that? boolean match(CharIndexed input, REMatch mymatch) { - // number of times we've matched so far - int numRepeats = 0; - - // Possible positions for the next repeat to match at - REMatch newMatch = mymatch; - REMatch last = null; - REMatch current; - // Add the '0-repeats' index - // positions.elementAt(z) == position [] in input after <<z>> matches - Vector positions = new Vector(); - positions.addElement(newMatch); - - // Declare variables used in loop - REMatch doables; - REMatch doablesLast; - REMatch recurrent; - int lastIndex = mymatch.index; - - do { - // Check for stingy match for each possibility. - if (stingy && (numRepeats >= min)) { - REMatch result = matchRest(input, newMatch); - if (result != null) { - mymatch.assignFrom(result); - return true; - } + boolean stopMatchingIfSatisfied = + (mymatch.matchFlags & REMatch.MF_FIND_ALL) == 0; + + REMatch newMatch = matchMinimum(input, mymatch); + if (newMatch == null) return false; + + // Array of positions we have already visited + int[] visited = initVisited(); + for (REMatch m = newMatch; m != null; m = m.next) { + visited = addVisited(m.index, visited); + } + + int max1 = decreaseMax(max, min); + + newMatch = _match(input, newMatch, max1, + stopMatchingIfSatisfied, visited); + if (newMatch != null) { + mymatch.assignFrom(newMatch); + return true; + } + return false; + } + + private static int decreaseMax(int m, int n) { + if (m == Integer.MAX_VALUE) return m; + return m - n; + } + + // Array visited is an array of character positions we have already + // visited. visited[0] is used to store the effective length of the + // array. + private static int[] initVisited() { + int[] visited = new int[32]; + visited[0] = 0; + return visited; + } + + private static boolean visitedContains(int n, int[] visited) { + // Experience tells that for a small array like this, + // simple linear search is faster than binary search. + for (int i = 1; i < visited[0]; i++) { + if (n == visited[i]) return true; + } + return false; + } + + private static int[] addVisited(int n, int[] visited) { + if (visitedContains(n, visited)) return visited; + if (visited[0] >= visited.length - 1) { + int[] newvisited = new int[visited.length + 32]; + System.arraycopy(visited, 0, newvisited, 0, visited.length); + visited = newvisited; + } + visited[0]++; + visited[visited[0]] = n; + return visited; + } + + private REMatch _match(CharIndexed input, REMatch mymatch, + int max1, boolean stopMatchingIfSatisfied, + int[] visited) { + + if (max1 == 0) { + return matchRest(input, mymatch); + } + max1 = decreaseMax(max1, 1); + + REMatch.REMatchList allResults = new REMatch.REMatchList(); + + // Depth-first search + + for (REMatch cur = mymatch; cur != null; cur = cur.next) { + + REMatch cur1 = (REMatch) cur.clone(); + + if (stingy) { + REMatch results = matchRest(input, cur1); + if (results != null) { + if (stopMatchingIfSatisfied) { + return results; + } + allResults.addTail(results); + } } - doables = null; - doablesLast = null; + DO_THIS: + do { - // try next repeat at all possible positions - for (current = newMatch; current != null; current = current.next) { - recurrent = (REMatch) current.clone(); - if (token.match(input, recurrent)) { - // add all items in current to doables array - if (doables == null) { - doables = recurrent; - doablesLast = recurrent; - } else { - // Order these from longest to shortest - // Start by assuming longest (more repeats) - doablesLast.next = recurrent; + boolean emptyMatchFound = false; + REMatch doables = findDoables(token, input, cur1); + if (doables == null) break DO_THIS; + if (doables.empty) emptyMatchFound = true; + + if (!emptyMatchFound) { + REMatch.REMatchList list = new REMatch.REMatchList(); + for (REMatch m = doables; m != null; m = m.next) { + REMatch m1 = (REMatch) m.clone(); + int n = m1.index; + if (! visitedContains(n, visited)) { + visited = addVisited(n, visited); + list.addTail(m1); } - // Find new doablesLast - while (doablesLast.next != null) { - doablesLast = doablesLast.next; + } + if (list.head == null) break DO_THIS; + doables = list.head; + } + + for (REMatch m = doables; m != null; m = m.next) { + if (! emptyMatchFound) { + REMatch m1 = _match(input, m, max1, + stopMatchingIfSatisfied, visited); + if (possessive) return m1; + if (m1 != null) { + if (stopMatchingIfSatisfied) { + return m1; + } + allResults.addTail(m1); + } + } + else { + REMatch m1 = matchRest(input, m); + if (m1 != null) { + if (stopMatchingIfSatisfied) { + return m1; + } + allResults.addTail(m1); } } } - // if none of the possibilities worked out, break out of do/while - if (doables == null) break; + + } while (false); // DO_THIS only once; + + // This point itself is a candidate. + if (!stingy) { + REMatch m2 = matchRest(input, cur1); + if (m2 != null) { + if (stopMatchingIfSatisfied) { + return m2; + } + allResults.addTail(m2); + } + } + } + + return allResults.head; + } + + private REMatch matchMinimum(CharIndexed input, final REMatch mymatch) { + // Possible positions for the next repeat to match at + REMatch newMatch = mymatch; + + // number of times we've matched so far + int numRepeats = 0; + + while (numRepeats < min) { + REMatch doables = findDoables(token, input, newMatch); + + // if none of the possibilities worked out, + // it means that minimum number of repeats could not be found. + if (doables == null) return null; // reassign where the next repeat can match newMatch = doables; @@ -151,91 +292,24 @@ final class RETokenRepeated extends REToken { // increment how many repeats we've successfully found ++numRepeats; - positions.addElement(newMatch); - - // doables.index == lastIndex means an empty string - // was the longest that matched this token. - // We break here, otherwise we will fall into an endless loop. - if (doables.index == lastIndex) { - if (numRepeats < min) numRepeats = min; - break; - } - lastIndex = doables.index; - } while (numRepeats < max); - - // If there aren't enough repeats, then fail - if (numRepeats < min) return false; - - // We're greedy, but ease off until a true match is found - int posIndex = positions.size(); - - // At this point we've either got too many or just the right amount. - // See if this numRepeats works with the rest of the regexp. - REMatch allResults = null; - REMatch allResultsLast = null; - - REMatch results = null; - int indexCount = posIndex - min; - if (indexCount <= 0) { - // This case occurs when we exited the previous do loop before - // numRepeats >= min because an empty string matched the token. - // In this case, an empty string can match as many times as - // desired. - indexCount = 1; - } - while (indexCount-- > 0) { - --posIndex; - newMatch = (REMatch) positions.elementAt(posIndex); - results = matchRest(input, newMatch); - if (results != null) { - if (allResults == null) { - allResults = results; - allResultsLast = results; - } else { - // Order these from longest to shortest - // Start by assuming longest (more repeats) - allResultsLast.next = results; - } - // Find new doablesLast - while (allResultsLast.next != null) { - allResultsLast = allResultsLast.next; - } - } - // else did not match rest of the tokens, try again on smaller sample - // or break out when performing possessive matching - if (possessive) break; + if (newMatch.empty) break; } - if (allResults != null) { - mymatch.assignFrom(allResults); // does this get all? - return true; - } - // If we fall out, no matches. - return false; + return newMatch; } private REMatch matchRest(CharIndexed input, final REMatch newMatch) { REMatch current, single; - REMatch doneIndex = null; - REMatch doneIndexLast = null; + REMatch.REMatchList doneIndex = new REMatch.REMatchList(); // Test all possible matches for this number of repeats for (current = newMatch; current != null; current = current.next) { // clone() separates a single match from the chain single = (REMatch) current.clone(); if (next(input, single)) { // chain results to doneIndex - if (doneIndex == null) { - doneIndex = single; - doneIndexLast = single; - } else { - doneIndexLast.next = single; - } - // Find new doneIndexLast - while (doneIndexLast.next != null) { - doneIndexLast = doneIndexLast.next; - } + doneIndex.addTail(single); } } - return doneIndex; + return doneIndex.head; } void dump(StringBuffer os) { diff --git a/libjava/classpath/gnu/regexp/RETokenStart.java b/libjava/classpath/gnu/regexp/RETokenStart.java index 8f7198237e1..42e3c0b2de0 100644 --- a/libjava/classpath/gnu/regexp/RETokenStart.java +++ b/libjava/classpath/gnu/regexp/RETokenStart.java @@ -44,6 +44,10 @@ class RETokenStart extends REToken { super(subIndex); this.newline = newline; } + + int getMaximumLength() { + return 0; + } boolean match(CharIndexed input, REMatch mymatch) { // charAt(index-n) may be unknown on a Reader/InputStream. FIXME diff --git a/libjava/classpath/gnu/regexp/RETokenWordBoundary.java b/libjava/classpath/gnu/regexp/RETokenWordBoundary.java index 6804151e261..f86214bbf68 100644 --- a/libjava/classpath/gnu/regexp/RETokenWordBoundary.java +++ b/libjava/classpath/gnu/regexp/RETokenWordBoundary.java @@ -52,6 +52,11 @@ final class RETokenWordBoundary extends REToken { this.where = where; this.negated = negated; } + + int getMaximumLength() { + return 0; + } + boolean match(CharIndexed input, REMatch mymatch) { // Word boundary means input[index-1] was a word character |

