1 files changed, 178 insertions, 101 deletions
diff --git a/libgo/go/exp/locale/collate/collate.go b/libgo/go/exp/locale/collate/collate.go
index 8a5c9dc7a86..2cb29f24b74 100644
--- a/libgo/go/exp/locale/collate/collate.go
+++ b/libgo/go/exp/locale/collate/collate.go
@@ -12,22 +12,6 @@ import (
 	"exp/norm"
 )
 
-// Level identifies the collation comparison level.
-// The primary level corresponds to the basic sorting of text.
-// The secondary level corresponds to accents and related linguistic elements.
-// The tertiary level corresponds to casing and related concepts.
-// The quaternary level is derived from the other levels by the
-// various algorithms for handling variable elements.
-type Level int
-
-const (
-	Primary Level = iota
-	Secondary
-	Tertiary
-	Quaternary
-	Identity
-)
-
 // AlternateHandling identifies the various ways in which variables are handled.
 // A rune with a primary weight lower than the variable top is considered a
 // variable.
@@ -55,6 +39,12 @@ const (
 // Collator provides functionality for comparing strings for a given
 // collation order.
 type Collator struct {
+	// TODO: hide most of these options. Low-level options are set through the locale
+	// identifier (as defined by LDML) while high-level options are set through SetOptions.
+	// Using high-level options allows us to be more flexible (such as not ignoring
+	// Thai vowels for IgnoreDiacriticals) and more user-friendly (such as allowing
+	// diacritical marks to be ignored but not case without having to fiddle with levels).
+
 	// Strength sets the maximum level to use in comparison.
 	Strength Level
 
@@ -80,13 +70,39 @@ type Collator struct {
 	// at a primary level with its numeric value.  For example, "A-21" < "A-123".
 	Numeric bool
 
+	// The largest primary value that is considered to be variable.
+	variableTop uint32
+
 	f norm.Form
 
-	t *table
+	t Weigher
+
+	sorter sorter
 
 	_iter [2]iter
 }
 
+// An Option is used to change the behavior of Collator.  They override the
+// settings passed through the locale identifier.
+type Option int
+
+const (
+	Numeric          Option = 1 << iota // Sort numbers numerically ("2" < "12").
+	IgnoreCase                          // Case-insensitive search.
+	IgnoreDiacritics                    // Ignore diacritical marks. ("o" == "ö").
+	IgnoreWidth                         // Ignore full versus normal width.
+	UpperFirst                          // Sort upper case before lower case.
+	LowerFirst                          // Sort lower case before upper case.
+	Force                               // Force ordering if strings are equivalent but not equal.
+
+	Loose = IgnoreDiacritics | IgnoreWidth | IgnoreCase
+)
+
+// SetOptions accepts a Options or-ed together.  All previous calls to SetOptions are ignored.
+func (c *Collator) SetOptions(o Option) {
+	// TODO: implement
+}
+
 func (c *Collator) iter(i int) *iter {
 	// TODO: evaluate performance for making the second iterator optional.
 	return &c._iter[i]
@@ -101,18 +117,20 @@ func Locales() []string {
 // New returns a new Collator initialized for the given locale.
 func New(loc string) *Collator {
 	// TODO: handle locale selection according to spec.
-	t := &mainTable
+	var t tableIndex
 	if loc != "" {
 		if idx, ok := locales[loc]; ok {
-			t = mainTable.indexedTable(idx)
+			t = idx
+		} else {
+			t = locales["root"]
 		}
 	}
-	return newCollator(t)
+	return NewFromTable(Init(t))
 }
 
-func newCollator(t *table) *Collator {
+func NewFromTable(t Weigher) *Collator {
 	c := &Collator{
-		Strength: Quaternary,
+		Strength: Tertiary,
 		f:        norm.NFD,
 		t:        t,
 	}
@@ -121,12 +139,6 @@ func newCollator(t *table) *Collator {
 	return c
 }
 
-// SetVariableTop sets all runes with primary strength less than the primary
-// strength of r to be variable and thus affected by alternate handling.
-func (c *Collator) SetVariableTop(r rune) {
-	// TODO: implement
-}
-
 // Buffer holds keys generated by Key and KeyString.
 type Buffer struct {
 	buf [4096]byte
@@ -149,8 +161,8 @@ func (b *Buffer) Reset() {
 func (c *Collator) Compare(a, b []byte) int {
 	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
 	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
-	c.iter(0).setInput(c, a)
-	c.iter(1).setInput(c, b)
+	c.iter(0).setInput(a)
+	c.iter(1).setInput(b)
 	if res := c.compare(); res != 0 {
 		return res
 	}
@@ -165,8 +177,8 @@ func (c *Collator) Compare(a, b []byte) int {
 func (c *Collator) CompareString(a, b string) int {
 	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
 	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
-	c.iter(0).setInputString(c, a)
-	c.iter(1).setInputString(c, b)
+	c.iter(0).setInputString(a)
+	c.iter(1).setInputString(b)
 	if res := c.compare(); res != 0 {
 		return res
 	}
@@ -234,11 +246,6 @@ func (c *Collator) compare() int {
 	return 0
 }
 
-func (c *Collator) Prefix(s, prefix []byte) int {
-	// iterate over s, track bytes consumed.
-	return 0
-}
-
 // Key returns the collation key for str.
 // Passing the buffer buf may avoid memory allocations.
 // The returned slice will point to an allocation in Buffer and will remain
@@ -259,114 +266,184 @@ func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
 	return c.key(buf, c.getColElemsString(str))
 }
 
-func (c *Collator) key(buf *Buffer, w []colElem) []byte {
-	processWeights(c.Alternate, c.t.variableTop, w)
+func (c *Collator) key(buf *Buffer, w []Elem) []byte {
+	processWeights(c.Alternate, c.variableTop, w)
 	kn := len(buf.key)
 	c.keyFromElems(buf, w)
 	return buf.key[kn:]
 }
 
-func (c *Collator) getColElems(str []byte) []colElem {
+func (c *Collator) getColElems(str []byte) []Elem {
 	i := c.iter(0)
-	i.setInput(c, str)
-	for !i.done() {
-		i.next()
+	i.setInput(str)
+	for i.next() {
 	}
 	return i.ce
 }
 
-func (c *Collator) getColElemsString(str string) []colElem {
+func (c *Collator) getColElemsString(str string) []Elem {
 	i := c.iter(0)
-	i.setInputString(c, str)
-	for !i.done() {
-		i.next()
+	i.setInputString(str)
+	for i.next() {
 	}
 	return i.ce
 }
 
 type iter struct {
-	src        norm.Iter
-	norm       [1024]byte
-	buf        []byte
-	p          int
-	minBufSize int
-
-	wa  [512]colElem
-	ce  []colElem
+	bytes []byte
+	str   string
+
+	wa  [512]Elem
+	ce  []Elem
 	pce int
+	nce int // nce <= len(nce)
 
-	t          *table
-	_done, eof bool
+	prevCCC  uint8
+	pStarter int
+
+	t Weigher
 }
 
 func (i *iter) init(c *Collator) {
 	i.t = c.t
-	i.minBufSize = c.t.maxContractLen
 	i.ce = i.wa[:0]
-	i.buf = i.norm[:0]
 }
 
 func (i *iter) reset() {
 	i.ce = i.ce[:0]
-	i.buf = i.buf[:0]
-	i.p = 0
-	i.eof = i.src.Done()
-	i._done = i.eof
+	i.nce = 0
+	i.prevCCC = 0
+	i.pStarter = 0
 }
 
-func (i *iter) setInput(c *Collator, s []byte) *iter {
-	i.src.SetInput(c.f, s)
+func (i *iter) setInput(s []byte) *iter {
+	i.bytes = s
+	i.str = ""
 	i.reset()
 	return i
 }
 
-func (i *iter) setInputString(c *Collator, s string) *iter {
-	i.src.SetInputString(c.f, s)
+func (i *iter) setInputString(s string) *iter {
+	i.str = s
+	i.bytes = nil
 	i.reset()
 	return i
 }
 
 func (i *iter) done() bool {
-	return i._done
+	return len(i.str) == 0 && len(i.bytes) == 0
 }
 
-func (i *iter) next() {
-	if !i.eof && len(i.buf)-i.p < i.minBufSize {
-		// replenish buffer
-		n := copy(i.buf, i.buf[i.p:])
-		n += i.src.Next(i.buf[n:cap(i.buf)])
-		i.buf = i.buf[:n]
-		i.p = 0
-		i.eof = i.src.Done()
+func (i *iter) tail(n int) {
+	if i.bytes == nil {
+		i.str = i.str[n:]
+	} else {
+		i.bytes = i.bytes[n:]
 	}
-	if i.p == len(i.buf) {
-		i._done = true
+}
+
+func (i *iter) appendNext() int {
+	var sz int
+	if i.bytes == nil {
+		i.ce, sz = i.t.AppendNextString(i.ce, i.str)
+	} else {
+		i.ce, sz = i.t.AppendNext(i.ce, i.bytes)
+	}
+	return sz
+}
+
+// next appends Elems to the internal array until it adds an element with CCC=0.
+// In the majority of cases, a Elem with a primary value > 0 will have
+// a CCC of 0. The CCC values of colation elements are also used to detect if the
+// input string was not normalized and to adjust the result accordingly.
+func (i *iter) next() bool {
+	for !i.done() {
+		p0 := len(i.ce)
+		sz := i.appendNext()
+		i.tail(sz)
+		last := len(i.ce) - 1
+		if ccc := i.ce[last].CCC(); ccc == 0 {
+			i.nce = len(i.ce)
+			i.pStarter = last
+			i.prevCCC = 0
+			return true
+		} else if p0 < last && i.ce[p0].CCC() == 0 {
+			// set i.nce to only cover part of i.ce for which ccc == 0 and
+			// use rest the next call to next.
+			for p0++; p0 < last && i.ce[p0].CCC() == 0; p0++ {
+			}
+			i.nce = p0
+			i.pStarter = p0 - 1
+			i.prevCCC = ccc
+			return true
+		} else if ccc < i.prevCCC {
+			i.doNorm(p0, ccc) // should be rare for most common cases
+		} else {
+			i.prevCCC = ccc
+		}
+	}
+	if len(i.ce) != i.nce {
+		i.nce = len(i.ce)
+		return true
+	}
+	return false
+}
+
+// nextPlain is the same as next, but does not "normalize" the collation
+// elements.
+// TODO: remove this function. Using this instead of next does not seem
+// to improve performance in any significant way. We retain this until
+// later for evaluation purposes.
+func (i *iter) nextPlain() bool {
+	if i.done() {
+		return false
+	}
+	sz := i.appendNext()
+	i.tail(sz)
+	i.nce = len(i.ce)
+	return true
+}
+
+const maxCombiningCharacters = 30
+
+// doNorm reorders the collation elements in i.ce.
+// It assumes that blocks of collation elements added with appendNext
+// either start and end with the same CCC or start with CCC == 0.
+// This allows for a single insertion point for the entire block.
+// The correctness of this assumption is verified in builder.go.
+func (i *iter) doNorm(p int, ccc uint8) {
+	if p-i.pStarter > maxCombiningCharacters {
+		i.prevCCC = i.ce[len(i.ce)-1].CCC()
+		i.pStarter = len(i.ce) - 1
 		return
 	}
-	sz := 0
-	i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:])
-	i.p += sz
+	n := len(i.ce)
+	k := p
+	for p--; p > i.pStarter && ccc < i.ce[p-1].CCC(); p-- {
+	}
+	i.ce = append(i.ce, i.ce[p:k]...)
+	copy(i.ce[p:], i.ce[k:])
+	i.ce = i.ce[:n]
 }
 
 func (i *iter) nextPrimary() int {
 	for {
-		for ; i.pce < len(i.ce); i.pce++ {
-			if v := i.ce[i.pce].primary(); v != 0 {
+		for ; i.pce < i.nce; i.pce++ {
+			if v := i.ce[i.pce].Primary(); v != 0 {
 				i.pce++
 				return v
 			}
 		}
-		if i.done() {
+		if !i.next() {
 			return 0
 		}
-		i.next()
 	}
 	panic("should not reach here")
 }
 
 func (i *iter) nextSecondary() int {
 	for ; i.pce < len(i.ce); i.pce++ {
-		if v := i.ce[i.pce].secondary(); v != 0 {
+		if v := i.ce[i.pce].Secondary(); v != 0 {
 			i.pce++
 			return v
 		}
@@ -376,7 +453,7 @@ func (i *iter) nextSecondary() int {
 
 func (i *iter) prevSecondary() int {
 	for ; i.pce < len(i.ce); i.pce++ {
-		if v := i.ce[len(i.ce)-i.pce-1].secondary(); v != 0 {
+		if v := i.ce[len(i.ce)-i.pce-1].Secondary(); v != 0 {
 			i.pce++
 			return v
 		}
@@ -386,7 +463,7 @@ func (i *iter) prevSecondary() int {
 
 func (i *iter) nextTertiary() int {
 	for ; i.pce < len(i.ce); i.pce++ {
-		if v := i.ce[i.pce].tertiary(); v != 0 {
+		if v := i.ce[i.pce].Tertiary(); v != 0 {
 			i.pce++
 			return int(v)
 		}
@@ -396,7 +473,7 @@ func (i *iter) nextTertiary() int {
 
 func (i *iter) nextQuaternary() int {
 	for ; i.pce < len(i.ce); i.pce++ {
-		if v := i.ce[i.pce].quaternary(); v != 0 {
+		if v := i.ce[i.pce].Quaternary(); v != 0 {
 			i.pce++
 			return v
 		}
@@ -416,9 +493,9 @@ func appendPrimary(key []byte, p int) []byte {
 
 // keyFromElems converts the weights ws to a compact sequence of bytes.
 // The result will be appended to the byte buffer in buf.
-func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) {
+func (c *Collator) keyFromElems(buf *Buffer, ws []Elem) {
 	for _, v := range ws {
-		if w := v.primary(); w > 0 {
+		if w := v.Primary(); w > 0 {
 			buf.key = appendPrimary(buf.key, w)
 		}
 	}
@@ -427,13 +504,13 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) {
 		// TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
 		if !c.Backwards {
 			for _, v := range ws {
-				if w := v.secondary(); w > 0 {
+				if w := v.Secondary(); w > 0 {
 					buf.key = append(buf.key, uint8(w>>8), uint8(w))
 				}
 			}
 		} else {
 			for i := len(ws) - 1; i >= 0; i-- {
-				if w := ws[i].secondary(); w > 0 {
+				if w := ws[i].Secondary(); w > 0 {
 					buf.key = append(buf.key, uint8(w>>8), uint8(w))
 				}
 			}
@@ -444,12 +521,12 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) {
 	if Tertiary <= c.Strength || c.CaseLevel {
 		buf.key = append(buf.key, 0, 0)
 		for _, v := range ws {
-			if w := v.tertiary(); w > 0 {
+			if w := v.Tertiary(); w > 0 {
 				buf.key = append(buf.key, uint8(w))
 			}
 		}
 		// Derive the quaternary weights from the options and other levels.
-		// Note that we represent maxQuaternary as 0xFF. The first byte of the
+		// Note that we represent MaxQuaternary as 0xFF. The first byte of the
 		// representation of a primary weight is always smaller than 0xFF,
 		// so using this single byte value will compare correctly.
 		if Quaternary <= c.Strength && c.Alternate >= AltShifted {
@@ -457,7 +534,7 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) {
 				lastNonFFFF := len(buf.key)
 				buf.key = append(buf.key, 0)
 				for _, v := range ws {
-					if w := v.quaternary(); w == maxQuaternary {
+					if w := v.Quaternary(); w == MaxQuaternary {
 						buf.key = append(buf.key, 0xFF)
 					} else if w > 0 {
 						buf.key = appendPrimary(buf.key, w)
@@ -468,7 +545,7 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) {
 			} else {
 				buf.key = append(buf.key, 0)
 				for _, v := range ws {
-					if w := v.quaternary(); w == maxQuaternary {
+					if w := v.Quaternary(); w == MaxQuaternary {
 						buf.key = append(buf.key, 0xFF)
 					} else if w > 0 {
 						buf.key = appendPrimary(buf.key, w)
@@ -479,14 +556,14 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) {
 	}
 }
 
-func processWeights(vw AlternateHandling, top uint32, wa []colElem) {
+func processWeights(vw AlternateHandling, top uint32, wa []Elem) {
 	ignore := false
 	vtop := int(top)
 	switch vw {
 	case AltShifted, AltShiftTrimmed:
 		for i := range wa {
-			if p := wa[i].primary(); p <= vtop && p != 0 {
-				wa[i] = makeQuaternary(p)
+			if p := wa[i].Primary(); p <= vtop && p != 0 {
+				wa[i] = MakeQuaternary(p)
 				ignore = true
 			} else if p == 0 {
 				if ignore {
@@ -498,7 +575,7 @@ func processWeights(vw AlternateHandling, top uint32, wa []colElem) {
 		}
 	case AltBlanked:
 		for i := range wa {
-			if p := wa[i].primary(); p <= vtop && (ignore || p != 0) {
+			if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) {
 				wa[i] = ceIgnore
 				ignore = true
 			} else {