/* URI.java -- An URI class Copyright (C) 2002, 2004, 2005, 2006, 2008 Free Software Foundation, Inc. This file is part of GNU Classpath. GNU Classpath is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2, or (at your option) any later version. GNU Classpath is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with GNU Classpath; see the file COPYING. If not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. Linking this library statically or dynamically with other modules is making a combined work based on this library. Thus, the terms and conditions of the GNU General Public License cover the whole combination. As a special exception, the copyright holders of this library give you permission to link this library with independent modules to produce an executable, regardless of the license terms of these independent modules, and to copy and distribute the resulting executable under terms of your choice, provided that you also meet, for each linked independent module, the terms and conditions of the license of that module. An independent module is a module which is not derived from or based on this library. If you modify this library, you may extend this exception to your version of the library, but you are not obligated to do so. If you do not wish to do so, delete this exception statement from your version. */ package java.net; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; import java.util.regex.Matcher; import java.util.regex.Pattern; /** *
* A URI instance represents that defined by * RFC3986, * with some deviations. *
** At its highest level, a URI consists of: *
*[scheme:]scheme-specific-part
 * [#fragment]
 * 
 * * where # and : are literal characters, * and those parts enclosed in square brackets are optional. *
** There are two main types of URI. An opaque URI is one * which just consists of the above three parts, and is not further * defined. An example of such a URI would be a mailto: URI. * In contrast, hierarchical URIs give further definition * to the scheme-specific part, so as to represent some part of a hierarchical * structure. *
*
 * [//authority][path]
 * [?query]
 * 
* with / and ? being literal characters. * When server-based, the authority section is further subdivided into: *
*
 * [user-info@]host
 * [:port]
 * 
* with @ and : as literal characters. * Authority sections that are not server-based are said to be registry-based. *
** Hierarchical URIs can be either relative or absolute. Absolute URIs * always start with a `/', while relative URIs don't * specify a scheme. Opaque URIs are always absolute. *
*
 * Each part of the URI may have one of three states: undefined, empty
 * or containing some content.  The former two of these are represented
 * by null and the empty string in Java, respectively.
 * The scheme-specific part may never be undefined.  It also follows from
 * this that the path sub-part may also not be undefined, so as to ensure
 * the former.
 * 
* The characters that can be used within a valid URI are restricted. * There are two main classes of characters which can't be used as is * within the URI: *
** The set of valid characters differs depending on the section of the URI: *
** These definitions reference the following sets of characters: *
*
 * The constructors and accessor methods allow the use and retrieval of
 * URI components which contain non-US-ASCII characters directly.
 * They are only escaped when the toASCIIString() method
 * is used.  In contrast, illegal characters are always quoted, with the
 * exception of the return values of the non-raw accessors.
 * 
   * Returns the string content of the specified group of the supplied
   * matcher.  The returned value is modified according to the following:
   *
   * - If the group matched a non-empty string, that string is returned
   *   unchanged.
   * - If the group is null or empty, the preceding group (group - 1) is
   *   examined.  If that is also null or empty, null is returned to
   *   indicate an undefined value.  Otherwise, the value is truly the
   *   empty string and this is the returned value.
   *
   * This method is used for matching against all parts of the URI
   * that may be either undefined or empty (i.e. all those but the
   * scheme-specific part and the path).  In each case, the preceding
   * group is the content of the original group, along with some
   * additional distinguishing feature.  For example, the preceding
   * group for the query includes the preceding question mark,
   * while that of the fragment includes the hash symbol.  The presence
   * of these features enables disambiguation between the two cases
   * of a completely unspecified (undefined) value and a value that is
   * specified but empty.
   * The scheme differs in that it will never return an empty string;
   * the delimiter follows the scheme rather than preceding it, so
   * it becomes part of the following section.  The same is true
   * of the user information.
   *
   * @param match the matcher, which contains the results of the URI
   *              matched against the URI regular expression.
   * @param group the index of the group whose content is wanted; the
   *              group with index group - 1 is consulted to tell
   *              undefined from empty.
   * @return either the matched content, null for undefined
   *         values, or an empty string for a URI part with empty content.
   */
  private static String getURIGroup(Matcher match, int group)
  {
    String content = match.group(group);

    // A non-empty match needs no interpretation.
    if (content != null && content.length() > 0)
      return content;

    // The group itself is absent or empty.  The preceding group also
    // carries the delimiter, so if that one matched something the part
    // is present but empty; otherwise the part is undefined.
    String enclosing = match.group(group - 1);
    boolean enclosingPresent = enclosing != null && enclosing.length() > 0;
    return enclosingPresent ? "" : null;
  }
  /**
   * Parses the given string and stores its components in the fields
   * of this URI.  The raw (still quoted) components are taken from the
   * groups of the URI regular expression; the unquoted variants are
   * derived from them immediately afterwards.
   *
   * @param str The string to parse
   *
   * @exception URISyntaxException If the string does not match the URI
   * regular expression, if its authority cannot be parsed, or if it
   * contains an invalid escape sequence
   */
  private void parseURI(String str) throws URISyntaxException
  {
    Matcher uriMatcher = URI_PATTERN.matcher(str);

    // Fail fast: nothing is assigned unless the whole string matches.
    if (! uriMatcher.matches())
      throw new URISyntaxException(str,
                                   "doesn't match URI regular expression");

    scheme = getURIGroup(uriMatcher, SCHEME_GROUP);
    rawSchemeSpecificPart = uriMatcher.group(SCHEME_SPEC_PART_GROUP);
    schemeSpecificPart = unquote(rawSchemeSpecificPart);

    // Only hierarchical URIs have an authority, path and query.
    if (! isOpaque())
      {
        rawAuthority = getURIGroup(uriMatcher, AUTHORITY_GROUP);
        rawPath = uriMatcher.group(PATH_GROUP);
        rawQuery = getURIGroup(uriMatcher, QUERY_GROUP);
      }
    rawFragment = getURIGroup(uriMatcher, FRAGMENT_GROUP);

    // Splits the raw authority (if any) into user info, host and port.
    parseServerAuthority();

    // The parts are unquoted eagerly, because this is the only point
    // at which an exception may be thrown.
    authority = unquote(rawAuthority);
    userInfo = unquote(rawUserInfo);
    host = unquote(rawHost);
    path = unquote(rawPath);
    query = unquote(rawQuery);
    fragment = unquote(rawFragment);
  }
  /**
   * Replaces every "%" + two hex digit escape in the given string by
   * the octet it denotes and decodes the resulting octet sequence as
   * UTF-8.  Characters that are not part of an escape are kept:
   * US-ASCII characters contribute a single octet, while any other
   * character contributes its own UTF-8 encoding, so that unescaped
   * non-US-ASCII characters survive unchanged.
   *
   * @param str The string to unquote or null.
   *
   * @return The unquoted string or null if str was null.
   *
   * @exception URISyntaxException If the given string contains invalid
   * escape sequences, i.e. a '%' that is not followed by two
   * hexadecimal digits.
   */
  private static String unquote(String str) throws URISyntaxException
  {
    if (str == null)
      return null;
    final int length = str.length();
    // A byte stream rather than a fixed byte[str.length()] is needed,
    // because a non-ASCII character takes more than one octet in UTF-8.
    java.io.ByteArrayOutputStream octets =
      new java.io.ByteArrayOutputStream(length);
    try
      {
        int i = 0;
        while (i < length)
          {
            char c = str.charAt(i);
            if (c == '%')
              {
                // Both hex digits must be present.
                if (i + 2 >= length)
                  throw new URISyntaxException(str,
                                               "Invalid quoted character");
                int hi = Character.digit(str.charAt(i + 1), 16);
                int lo = Character.digit(str.charAt(i + 2), 16);
                if (lo < 0 || hi < 0)
                  throw new URISyntaxException(str,
                                               "Invalid quoted character");
                octets.write(hi * 16 + lo);
                i += 3;
              }
            else if (c <= 127)
              {
                octets.write(c);
                i++;
              }
            else
              {
                // Encode a whole run of non-ASCII characters at once so
                // that surrogate pairs are never split.
                int end = i + 1;
                while (end < length && str.charAt(end) > 127)
                  end++;
                byte[] encoded = str.substring(i, end).getBytes("utf-8");
                octets.write(encoded, 0, encoded.length);
                i = end;
              }
          }
        return octets.toString("utf-8");
      }
    catch (java.io.UnsupportedEncodingException x2)
      {
        throw (Error) new InternalError().initCause(x2);
      }
  }
  /**
   * Quotes the given string using the character set that is legal in
   * a scheme-specific part (RFC3986_SSP).
   *
   * The work is delegated to the two-argument variant of this method,
   * which replaces each US-ASCII character outside the legal set by
   * "%" followed by its two-digit hexadecimal code and leaves
   * non-US-ASCII characters as they are.
   *
   * @param str The string to quote
   *
   * @return The quoted string.
   */
  private static String quote(String str)
  {
    final String legal = RFC3986_SSP;
    return quote(str, legal);
  }
  /**
   * Quotes the given string for use as the authority of a URI.
   *
   * The work is delegated to the two-argument quote method with the
   * RFC3986_REG_NAME character set; US-ASCII characters outside that
   * set become "%" followed by their two-digit hexadecimal code, and
   * non-US-ASCII characters are left as they are.
   *
   * @param str The string to quote
   *
   * @return The quoted string.
   */
  private static String quoteAuthority(String str)
  {
    // Technically, we should be using RFC2396_AUTHORITY, but
    // it contains no additional characters.
    final String legal = RFC3986_REG_NAME;
    return quote(str, legal);
  }
  /**
   * Quotes the characters in the supplied string that are not part of
   * the specified set of legal characters.  Only US-ASCII characters
   * (code 127 or below) are ever quoted; each one that is not legal is
   * replaced by "%" followed by two hexadecimal digits taken from HEX.
   * Characters above 127 are copied unchanged.
   *
   * @param str the string to quote
   * @param legalCharacters the set of legal characters
   *
   * @return the quoted string.
   */
  private static String quote(String str, String legalCharacters)
  {
    final int length = str.length();
    StringBuilder quoted = new StringBuilder(length);
    for (int idx = 0; idx < length; idx++)
      {
        char ch = str.charAt(idx);
        boolean passThrough = ch > 127 || legalCharacters.indexOf(ch) != -1;
        if (passThrough)
          {
            quoted.append(ch);
            continue;
          }
        // Illegal US-ASCII character: emit its code as two hex digits.
        quoted.append('%')
          .append(HEX.charAt(ch / 16))
          .append(HEX.charAt(ch % 16));
      }
    return quoted.toString();
  }
  /**
   * Quotes the given string for use as the host of a URI.
   *
   * The work is delegated to the two-argument quote method with the
   * RFC3986_HOST character set; US-ASCII characters outside that set
   * become "%" followed by their two-digit hexadecimal code, and
   * non-US-ASCII characters are left as they are.
   *
   * @param str The string to quote
   *
   * @return The quoted string.
   */
  private static String quoteHost(String str)
  {
    final String legal = RFC3986_HOST;
    return quote(str, legal);
  }
  /**
   * Quotes the given string for use as the path of a URI.
   *
   * The work is delegated to the two-argument quote method with the
   * RFC3986_PATH_SEGMENTS character set; US-ASCII characters outside
   * that set become "%" followed by their two-digit hexadecimal code,
   * and non-US-ASCII characters are left as they are.
   *
   * @param str The string to quote
   *
   * @return The quoted string.
   */
  private static String quotePath(String str)
  {
    // Technically, we should be using RFC2396_PATH, but
    // it contains no additional characters.
    final String legal = RFC3986_PATH_SEGMENTS;
    return quote(str, legal);
  }
  /**
   * Quotes the given string for use as the user information of a URI.
   *
   * The work is delegated to the two-argument quote method with the
   * RFC3986_USERINFO character set; US-ASCII characters outside that
   * set become "%" followed by their two-digit hexadecimal code, and
   * non-US-ASCII characters are left as they are.
   *
   * @param str The string to quote
   *
   * @return The quoted string.
   */
  private static String quoteUserInfo(String str)
  {
    final String legal = RFC3986_USERINFO;
    return quote(str, legal);
  }
  /**
   * Creates a URI by parsing the given string.  The string is kept
   * as given and then handed to the parser, which fills in the
   * individual components.
   *
   * @param str The string to create the URI from
   *
   * @exception URISyntaxException If the given string violates RFC 2396
   * @exception NullPointerException If str is null
   */
  public URI(String str) throws URISyntaxException
  {
    string = str;
    parseURI(str);
  }
  /**
   * Creates a URI from the given components.  The components are
   * quoted where necessary, joined into a single string and that
   * string is then parsed like any other URI string.  A component
   * that is null (or -1 for the port) is left out; the "//" prefix
   * is only emitted when at least one of user info, host and port is
   * given.
   *
   * @param scheme The scheme name
   * @param userInfo The username and authorization info
   * @param host The hostname
   * @param port The port number, or -1 for none
   * @param path The path
   * @param query The query
   * @param fragment The fragment
   *
   * @exception URISyntaxException If the given string violates RFC 2396
   */
  public URI(String scheme, String userInfo, String host, int port,
             String path, String query, String fragment)
    throws URISyntaxException
  {
    this((scheme != null ? scheme + ":" : "")
         + (userInfo != null || host != null || port != -1 ? "//" : "")
         + (userInfo != null ? quoteUserInfo(userInfo) + "@" : "")
         + (host != null ? quoteHost(host) : "")
         + (port != -1 ? ":" + port : "")
         + (path != null ? quotePath(path) : "")
         + (query != null ? "?" + quote(query) : "")
         + (fragment != null ? "#" + quote(fragment) : ""));
  }
  /**
   * Creates a URI from the given components.  The components are
   * quoted where necessary, joined into a single string and that
   * string is then parsed like any other URI string.  A component
   * that is null is left out.
   *
   * @param scheme The scheme name
   * @param authority The authority
   * @param path The path
   * @param query The query
   * @param fragment The fragment
   *
   * @exception URISyntaxException If the given string violates RFC 2396
   */
  public URI(String scheme, String authority, String path, String query,
             String fragment) throws URISyntaxException
  {
    this((scheme != null ? scheme + ":" : "")
         + (authority != null ? "//" + quoteAuthority(authority) : "")
         + (path != null ? quotePath(path) : "")
         + (query != null ? "?" + quote(query) : "")
         + (fragment != null ? "#" + quote(fragment) : ""));
  }
  /**
   * Creates a URI from the given components.  This is a shorthand
   * for the seven-argument constructor with no user information, no
   * port (-1) and no query.
   *
   * @param scheme The scheme name
   * @param host The hostname
   * @param path The path
   * @param fragment The fragment
   *
   * @exception URISyntaxException If the given string violates RFC 2396
   */
  public URI(String scheme, String host, String path, String fragment)
    throws URISyntaxException
  {
    this(scheme,
         null,      // no user information
         host,
         -1,        // no port
         path,
         null,      // no query
         fragment);
  }
  /**
   * Creates a URI from the given components.  The scheme-specific
   * part and the fragment are quoted, joined with the scheme into a
   * single string, and that string is then parsed like any other URI
   * string.  A component that is null is left out.
   *
   * @param scheme The scheme name
   * @param ssp The scheme specific part
   * @param fragment The fragment
   *
   * @exception URISyntaxException If the given string violates RFC 2396
   */
  public URI(String scheme, String ssp, String fragment)
    throws URISyntaxException
  {
    this((scheme != null ? scheme + ":" : "")
         + (ssp != null ? quote(ssp) : "")
         + (fragment != null ? "#" + quote(fragment) : ""));
  }
  /**
   * Creates a URI from the given string.  This is a convenience
   * wrapper around the single-argument constructor that reports a
   * syntax error as an unchecked exception.
   *
   * @param str The string to create the URI from
   *
   * @return The URI parsed from the string
   *
   * @exception IllegalArgumentException If the given string violates
   * RFC 2396; the exception carries the message of the underlying
   * URISyntaxException and has it as its cause
   * @exception NullPointerException If str is null
   */
  public static URI create(String str)
  {
    try
      {
        return new URI(str);
      }
    catch (URISyntaxException e)
      {
        // Propagate the parser's message as well as the cause, so that
        // the failure is understandable without unwrapping the cause.
        throw new IllegalArgumentException(e.getMessage(), e);
      }
  }
  /**
   * Attempts to parse this URI's authority component, if defined,
   * into user-information, host, and port components.  The purpose
   * of this method was to disambiguate between some authority sections,
   * which form invalid server-based authorities, but valid registry
   * based authorities.  In the updated RFC 3986, the authority section
   * is defined differently, with registry-based authorities part of
   * the host section.  Thus, this method is now simply an explicit
   * way of parsing any authority section.
   *
   * If no authority is defined, nothing is changed.  The port is only
   * assigned when the authority contains a non-empty port.
   *
   * @return this URI, with the authority section parsed into user
   *         information, host and port components.
   * @throws URISyntaxException if the authority does not match the
   *         authority regular expression, or if its port is not a
   *         parsable integer
   */
  public URI parseServerAuthority() throws URISyntaxException
  {
    // Without an authority there is nothing to split up.
    if (rawAuthority == null)
      return this;

    Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(rawAuthority);
    if (! authorityMatcher.matches())
      throw new URISyntaxException(string,
                                   "doesn't match URI regular expression");

    rawUserInfo = getURIGroup(authorityMatcher, AUTHORITY_USERINFO_GROUP);
    rawHost = getURIGroup(authorityMatcher, AUTHORITY_HOST_GROUP);

    String portText = getURIGroup(authorityMatcher, AUTHORITY_PORT_GROUP);
    if (portText == null || portText.isEmpty())
      return this;

    try
      {
        port = Integer.parseInt(portText);
      }
    catch (NumberFormatException nfe)
      {
        URISyntaxException syntaxError =
          new URISyntaxException(string,
                                 "doesn't match URI regular expression");
        syntaxError.initCause(nfe);
        throw syntaxError;
      }
    return this;
  }
  /**
   * Returns a normalized version of the URI.  If the URI is opaque,
   * or its path is already in normal form (here: it contains neither
   * "/./" nor "/../"), then this URI is simply returned.  Otherwise,
   * a new URI is built from the components of this one, with the path
   * replaced by its normalized form.
   *
   * The resulting URI will be free of `.' and `..' segments, barring those
   * that were prepended or which couldn't be paired, respectively.
   *
   * @return the normalized URI.
   */
  public URI normalize()
  {
    if (isOpaque()
        || path.indexOf("/./") == -1 && path.indexOf("/../") == -1)
      return this;
    try
      {
        return new URI(scheme, authority, normalizePath(path), query,
                       fragment);
      }
    catch (URISyntaxException e)
      {
        throw (Error) new InternalError("Normalized URI variant could not "+
                                        "be constructed").initCause(e);
      }
  }

  /**
   * Normalize the given path by removing its dot segments.
   *
   * This follows the algorithm in section 5.2.4. of RFC3986, reading
   * the input from a moving offset rather than modifying it.  Only a
   * complete `.' or `..' segment is treated specially; a segment that
   * merely starts with a dot (such as `.hidden' or `..x') is copied
   * like any other segment.
   *
   * @param relativePath the relative path to be normalized.
   * @return the normalized path.
   */
  private String normalizePath(String relativePath)
  {
    final int length = relativePath.length();
    StringBuilder output = new StringBuilder(length);
    int start = 0;
    while (start < length)
      {
        int remaining = length - start;
        /* A: a leading "../" or "./" is dropped. */
        if (relativePath.startsWith("../", start))
          {
            start += 3;
            continue;
          }
        if (relativePath.startsWith("./", start))
          {
            start += 2;
            continue;
          }
        /* B: "/./" becomes "/"; so does a final "/.". */
        if (relativePath.startsWith("/./", start))
          {
            start += 2;
            continue;
          }
        if (remaining == 2 && relativePath.startsWith("/.", start))
          {
            output.append('/');
            break;
          }
        /* C: "/../" becomes "/" and the last output segment is removed;
           the same holds for a final "/..". */
        if (relativePath.startsWith("/../", start))
          {
            start += 3;
            removeLastSegment(output);
            continue;
          }
        if (remaining == 3 && relativePath.startsWith("/..", start))
          {
            removeLastSegment(output);
            output.append('/');
            break;
          }
        /* D: if only "." or ".." is left, it is dropped. */
        if (remaining == 1 && relativePath.charAt(start) == '.')
          break;
        if (remaining == 2 && relativePath.startsWith("..", start))
          break;
        /* E: move the next segment, including its leading slash(es),
           to the output. */
        int indexOfSlash = relativePath.indexOf('/', start);
        while (indexOfSlash == start)
          {
            output.append('/');
            ++start;
            indexOfSlash = relativePath.indexOf('/', start);
          }
        if (indexOfSlash == -1)
          indexOfSlash = length;
        output.append(relativePath.substring(start, indexOfSlash));
        start = indexOfSlash;
      }
    return output.toString();
  }

  /**
   * Removes the last segment of the path from the specified buffer,
   * i.e. everything from the last slash onwards.  If the buffer
   * contains no slash, it is emptied.
   *
   * @param buffer the buffer containing the path.
   */
  private void removeLastSegment(StringBuilder buffer)
  {
    int lastSlash = buffer.lastIndexOf("/");
    if (lastSlash == -1)
      buffer.setLength(0);
    else
      buffer.setLength(lastSlash);
  }

  /**
   * Resolves the given URI against this URI.  The given URI is
   * returned unchanged if it is absolute or opaque, or if this URI is
   * opaque (an opaque URI has no path to resolve against).
   *
   * @param uri The URI to resolve against this URI
   *
   * @return The resulting URI
   *
   * @throws NullPointerException if uri is null
   */
  public URI resolve(URI uri)
  {
    if (uri.isAbsolute())
      return uri;
    if (uri.isOpaque())
      return uri;
    if (isOpaque())
      return uri;

    String scheme = uri.getScheme();
    String authority = uri.getAuthority();
    String path = uri.getPath();
    String query = uri.getQuery();
    String fragment = uri.getFragment();

    try
      {
        // A reference consisting of nothing but a fragment.
        if (fragment != null && path != null && path.equals("")
            && scheme == null && authority == null && query == null)
          return new URI(this.scheme, this.schemeSpecificPart, fragment);

        if (authority == null)
          {
            authority = this.authority;
            if (path == null)
              path = "";
            if (! (path.startsWith("/")))
              {
                // Merge as described in section 5.2.3 of RFC3986.
                String merged;
                if (this.authority != null && this.path.length() == 0)
                  merged = "/" + path;
                else
                  {
                    // Keep the base path up to and including its last
                    // slash; lastIndexOf yields -1 if there is none,
                    // in which case nothing of the base path is kept.
                    int keep = this.path.lastIndexOf('/') + 1;
                    merged = this.path.substring(0, keep) + path;
                  }
                path = normalizePath(merged);
              }
          }
        return new URI(this.scheme, authority, path, query, fragment);
      }
    catch (URISyntaxException e)
      {
        throw (Error) new InternalError("Resolved URI variant could not "+
                                        "be constructed").initCause(e);
      }
  }

  /**
   * Resolves the given URI string against this URI.  The string is
   * first turned into a URI with create(String).
   *
   * @param str The URI as string to resolve against this URI
   *
   * @return The resulting URI
   *
   * @throws IllegalArgumentException If the given URI string
   *                                  violates RFC 2396
   * @throws NullPointerException If str is null
   */
  public URI resolve(String str)
    throws IllegalArgumentException
  {
    return resolve(create(str));
  }

  /*
   * Relativizes the given URI against this URI.  The following
   * algorithm is used:
   *
   * NOTE(review): the rest of this description, and the method it
   * documents, are not present at this point in the file.
   */

  /**
   * Compares the URI with the given object for equality.  If the
   * object is not a URI, then the method returns false.
   * Otherwise, the following criteria are observed:
   *
   * - The schemes must both be undefined, or be equal ignoring case.
   * - The raw fragments must both be undefined, or be equal ignoring
   *   case.
   * - Either both URIs are opaque or both are hierarchical.
   * - Two opaque URIs must have equal raw scheme-specific parts.
   * - Two hierarchical URIs must have raw paths that are equal
   *   ignoring case, and raw queries that are both undefined or equal
   *   ignoring case.  Their authorities must both be undefined, or,
   *   if this URI has no host, their raw authorities must be equal
   *   ignoring case; otherwise the hosts must be equal ignoring case,
   *   the ports must be equal, and the raw user information must both
   *   be undefined or be equal ignoring case.
   *
   * @param obj the object to compare this URI with.
   * @return true if the objects are equal, according to
   *         the specification above.
   */
  public boolean equals(Object obj)
  {
    if (!(obj instanceof URI))
      return false;
    URI uriObj = (URI) obj;

    // Scheme and fragment: both undefined, or equal ignoring case.
    if (! bothNullOrEqualIgnoringCase(scheme, uriObj.getScheme()))
      return false;
    if (! bothNullOrEqualIgnoringCase(rawFragment, uriObj.getRawFragment()))
      return false;

    // An opaque URI never equals a hierarchical one.
    boolean opaqueThis = isOpaque();
    boolean opaqueObj = uriObj.isOpaque();
    if (opaqueThis != opaqueObj)
      return false;
    if (opaqueThis)
      return rawSchemeSpecificPart.equals(uriObj.getRawSchemeSpecificPart());

    // Both hierarchical: path and query first.  The query comparison is
    // null-safe, so a URI without a query can be compared with one that
    // has a query.
    if (! rawPath.equalsIgnoreCase(uriObj.getRawPath())
        || ! bothNullOrEqualIgnoringCase(rawQuery, uriObj.getRawQuery()))
      return false;

    // If either authority is undefined, both must be.
    String otherAuthority = uriObj.getRawAuthority();
    if (rawAuthority == null || otherAuthority == null)
      return rawAuthority == null && otherAuthority == null;

    // No parsed host: compare the authorities as a whole.
    if (host == null)
      return rawAuthority.equalsIgnoreCase(otherAuthority);

    return host.equalsIgnoreCase(uriObj.getHost())
      && port == uriObj.getPort()
      && bothNullOrEqualIgnoringCase(rawUserInfo, uriObj.getRawUserInfo());
  }

  /**
   * Tells whether two possibly undefined strings match: either both
   * are null, or neither is and they are equal ignoring case.
   *
   * @param first the first string, or null.
   * @param second the second string, or null.
   * @return true if both are null or both are equal ignoring case.
   */
  private static boolean bothNullOrEqualIgnoringCase(String first,
                                                     String second)
  {
    if (first == null)
      return second == null;
    return first.equalsIgnoreCase(second);
  }
  /**
   * Computes the hashcode of the URI from its scheme, raw
   * scheme-specific part and raw fragment.
   *
   * The components are hashed with their letter case folded, so that
   * two URIs which differ only in letter case hash identically.
   * NOTE(review): this assumes equals() compares these components
   * ignoring case -- confirm against equals().
   *
   * @return the hashcode of this URI.
   */
  public int hashCode()
  {
    String schemePart = getScheme();
    String fragmentPart = getRawFragment();
    return (schemePart == null ? 0 : 13 * caseFoldedHash(schemePart))
      + 17 * caseFoldedHash(getRawSchemeSpecificPart())
      + (fragmentPart == null ? 0 : 21 + caseFoldedHash(fragmentPart));
  }

  /**
   * Computes a hash of the given string that is insensitive to letter
   * case.  Each character is folded the same way
   * String.equalsIgnoreCase does (upper-cased, then lower-cased), so
   * strings equal under that method yield the same hash.
   *
   * @param str the string to hash; must not be null.
   * @return the case-insensitive hash of the string.
   */
  private static int caseFoldedHash(String str)
  {
    int hash = 0;
    for (int i = 0; i < str.length(); i++)
      {
        char folded = Character.toLowerCase(Character.toUpperCase(str.charAt(i)));
        hash = 31 * hash + folded;
      }
    return hash;
  }
  /**
   * Compare the URI with another URI.
   * Undefined components are taken to be less than any other component.
   * The following criteria are observed:
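   * (The list of criteria is missing from this comment.)
   */
  /**
   * Returns the URI as a US-ASCII string.  This is the same as the result of
   * toString() for URIs that don't contain any non-US-ASCII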
   * characters.  Otherwise, the non-US-ASCII characters are replaced
   * by their percent-encoded representations.
   *
   * @return a string representation of the URI, containing only US-ASCII
   *         characters.
   */
  public String toASCIIString()
  {
    String strRep = toString();
    final int length = strRep.length();
    StringBuilder buffer = new StringBuilder(length);
    int i = 0;
    while (i < length)
      {
        char c = strRep.charAt(i);
        if (c <= 127)
          {
            // US-ASCII characters are copied as they are.
            buffer.append(c);
            i++;
            continue;
          }
        // Collect the whole run of non-ASCII characters and escape it
        // in one go, so that surrogate pairs are encoded together.  The
        // run is escaped as soon as it ends, which also covers a run
        // that reaches the end of the string.
        int end = i + 1;
        while (end < length && strRep.charAt(end) > 127)
          end++;
        buffer.append(escapeCharacters(strRep.substring(i, end)));
        i = end;
      }
    return buffer.toString();
  }
  /**
   * Converts the supplied string to its percent-encoded representation.
   * The string is encoded as UTF-8 and every resulting octet is written
   * as "%" followed by two hexadecimal digits taken from HEX.  Note that
   * every character of the string is escaped, so callers are expected to
   * pass only the characters that need escaping.
   *
   * @param str a string including non-ASCII characters.
   * @return the string with its characters converted to their
   *         percent-encoded representations.
   */
  private static String escapeCharacters(String str)
  {
    byte[] encoded;
    try
      {
        encoded = str.getBytes("utf-8");
      }
    catch (java.io.UnsupportedEncodingException x)
      {
        throw (Error) new InternalError("Escaping error").initCause(x);
      }

    // this is far from optimal, but it works
    StringBuilder escaped = new StringBuilder();
    for (byte b : encoded)
      {
        int octet = b & 0xff;
        escaped.append('%');
        escaped.append(HEX.charAt(octet / 16));
        escaped.append(HEX.charAt(octet % 16));
      }
    return escaped.toString();
  }
}