diff options
Diffstat (limited to 'libgo/go/exp/locale/collate/collate.go')
-rw-r--r-- | libgo/go/exp/locale/collate/collate.go | 279 |
1 files changed, 178 insertions, 101 deletions
diff --git a/libgo/go/exp/locale/collate/collate.go b/libgo/go/exp/locale/collate/collate.go index 8a5c9dc7a86..2cb29f24b74 100644 --- a/libgo/go/exp/locale/collate/collate.go +++ b/libgo/go/exp/locale/collate/collate.go @@ -12,22 +12,6 @@ import ( "exp/norm" ) -// Level identifies the collation comparison level. -// The primary level corresponds to the basic sorting of text. -// The secondary level corresponds to accents and related linguistic elements. -// The tertiary level corresponds to casing and related concepts. -// The quaternary level is derived from the other levels by the -// various algorithms for handling variable elements. -type Level int - -const ( - Primary Level = iota - Secondary - Tertiary - Quaternary - Identity -) - // AlternateHandling identifies the various ways in which variables are handled. // A rune with a primary weight lower than the variable top is considered a // variable. @@ -55,6 +39,12 @@ const ( // Collator provides functionality for comparing strings for a given // collation order. type Collator struct { + // TODO: hide most of these options. Low-level options are set through the locale + // identifier (as defined by LDML) while high-level options are set through SetOptions. + // Using high-level options allows us to be more flexible (such as not ignoring + // Thai vowels for IgnoreDiacriticals) and more user-friendly (such as allowing + // diacritical marks to be ignored but not case without having to fiddle with levels). + // Strength sets the maximum level to use in comparison. Strength Level @@ -80,13 +70,39 @@ type Collator struct { // at a primary level with its numeric value. For example, "A-21" < "A-123". Numeric bool + // The largest primary value that is considered to be variable. + variableTop uint32 + f norm.Form - t *table + t Weigher + + sorter sorter _iter [2]iter } +// An Option is used to change the behavior of Collator. They override the +// settings passed through the locale identifier. +type Option int + +const ( + Numeric Option = 1 << iota // Sort numbers numerically ("2" < "12"). + IgnoreCase // Case-insensitive search. + IgnoreDiacritics // Ignore diacritical marks. ("o" == "รถ"). + IgnoreWidth // Ignore full versus normal width. + UpperFirst // Sort upper case before lower case. + LowerFirst // Sort lower case before upper case. + Force // Force ordering if strings are equivalent but not equal. + + Loose = IgnoreDiacritics | IgnoreWidth | IgnoreCase +) + +// SetOptions accepts a Options or-ed together. All previous calls to SetOptions are ignored. +func (c *Collator) SetOptions(o Option) { + // TODO: implement +} + func (c *Collator) iter(i int) *iter { // TODO: evaluate performance for making the second iterator optional. return &c._iter[i] @@ -101,18 +117,20 @@ func Locales() []string { // New returns a new Collator initialized for the given locale. func New(loc string) *Collator { // TODO: handle locale selection according to spec. - t := &mainTable + var t tableIndex if loc != "" { if idx, ok := locales[loc]; ok { - t = mainTable.indexedTable(idx) + t = idx + } else { + t = locales["root"] } } - return newCollator(t) + return NewFromTable(Init(t)) } -func newCollator(t *table) *Collator { +func NewFromTable(t Weigher) *Collator { c := &Collator{ - Strength: Quaternary, + Strength: Tertiary, f: norm.NFD, t: t, } @@ -121,12 +139,6 @@ func newCollator(t *table) *Collator { return c } -// SetVariableTop sets all runes with primary strength less than the primary -// strength of r to be variable and thus affected by alternate handling. -func (c *Collator) SetVariableTop(r rune) { - // TODO: implement -} - // Buffer holds keys generated by Key and KeyString. type Buffer struct { buf [4096]byte @@ -149,8 +161,8 @@ func (b *Buffer) Reset() { func (c *Collator) Compare(a, b []byte) int { // TODO: skip identical prefixes once we have a fast way to detect if a rune is // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. - c.iter(0).setInput(c, a) - c.iter(1).setInput(c, b) + c.iter(0).setInput(a) + c.iter(1).setInput(b) if res := c.compare(); res != 0 { return res } @@ -165,8 +177,8 @@ func (c *Collator) Compare(a, b []byte) int { func (c *Collator) CompareString(a, b string) int { // TODO: skip identical prefixes once we have a fast way to detect if a rune is // part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest. - c.iter(0).setInputString(c, a) - c.iter(1).setInputString(c, b) + c.iter(0).setInputString(a) + c.iter(1).setInputString(b) if res := c.compare(); res != 0 { return res } @@ -234,11 +246,6 @@ func (c *Collator) compare() int { return 0 } -func (c *Collator) Prefix(s, prefix []byte) int { - // iterate over s, track bytes consumed. - return 0 -} - // Key returns the collation key for str. // Passing the buffer buf may avoid memory allocations. // The returned slice will point to an allocation in Buffer and will remain @@ -259,114 +266,184 @@ func (c *Collator) KeyFromString(buf *Buffer, str string) []byte { return c.key(buf, c.getColElemsString(str)) } -func (c *Collator) key(buf *Buffer, w []colElem) []byte { - processWeights(c.Alternate, c.t.variableTop, w) +func (c *Collator) key(buf *Buffer, w []Elem) []byte { + processWeights(c.Alternate, c.variableTop, w) kn := len(buf.key) c.keyFromElems(buf, w) return buf.key[kn:] } -func (c *Collator) getColElems(str []byte) []colElem { +func (c *Collator) getColElems(str []byte) []Elem { i := c.iter(0) - i.setInput(c, str) - for !i.done() { - i.next() + i.setInput(str) + for i.next() { } return i.ce } -func (c *Collator) getColElemsString(str string) []colElem { +func (c *Collator) getColElemsString(str string) []Elem { i := c.iter(0) - i.setInputString(c, str) - for !i.done() { - i.next() + i.setInputString(str) + for i.next() { } return i.ce } type iter struct { - src norm.Iter - norm [1024]byte - buf []byte - p int - minBufSize int - - wa [512]colElem - ce []colElem + bytes []byte + str string + + wa [512]Elem + ce []Elem pce int + nce int // nce <= len(nce) - t *table - _done, eof bool + prevCCC uint8 + pStarter int + + t Weigher } func (i *iter) init(c *Collator) { i.t = c.t - i.minBufSize = c.t.maxContractLen i.ce = i.wa[:0] - i.buf = i.norm[:0] } func (i *iter) reset() { i.ce = i.ce[:0] - i.buf = i.buf[:0] - i.p = 0 - i.eof = i.src.Done() - i._done = i.eof + i.nce = 0 + i.prevCCC = 0 + i.pStarter = 0 } -func (i *iter) setInput(c *Collator, s []byte) *iter { - i.src.SetInput(c.f, s) +func (i *iter) setInput(s []byte) *iter { + i.bytes = s + i.str = "" i.reset() return i } -func (i *iter) setInputString(c *Collator, s string) *iter { - i.src.SetInputString(c.f, s) +func (i *iter) setInputString(s string) *iter { + i.str = s + i.bytes = nil i.reset() return i } func (i *iter) done() bool { - return i._done + return len(i.str) == 0 && len(i.bytes) == 0 } -func (i *iter) next() { - if !i.eof && len(i.buf)-i.p < i.minBufSize { - // replenish buffer - n := copy(i.buf, i.buf[i.p:]) - n += i.src.Next(i.buf[n:cap(i.buf)]) - i.buf = i.buf[:n] - i.p = 0 - i.eof = i.src.Done() +func (i *iter) tail(n int) { + if i.bytes == nil { + i.str = i.str[n:] + } else { + i.bytes = i.bytes[n:] } - if i.p == len(i.buf) { - i._done = true +} + +func (i *iter) appendNext() int { + var sz int + if i.bytes == nil { + i.ce, sz = i.t.AppendNextString(i.ce, i.str) + } else { + i.ce, sz = i.t.AppendNext(i.ce, i.bytes) + } + return sz +} + +// next appends Elems to the internal array until it adds an element with CCC=0. +// In the majority of cases, a Elem with a primary value > 0 will have +// a CCC of 0. The CCC values of colation elements are also used to detect if the +// input string was not normalized and to adjust the result accordingly. +func (i *iter) next() bool { + for !i.done() { + p0 := len(i.ce) + sz := i.appendNext() + i.tail(sz) + last := len(i.ce) - 1 + if ccc := i.ce[last].CCC(); ccc == 0 { + i.nce = len(i.ce) + i.pStarter = last + i.prevCCC = 0 + return true + } else if p0 < last && i.ce[p0].CCC() == 0 { + // set i.nce to only cover part of i.ce for which ccc == 0 and + // use rest the next call to next. + for p0++; p0 < last && i.ce[p0].CCC() == 0; p0++ { + } + i.nce = p0 + i.pStarter = p0 - 1 + i.prevCCC = ccc + return true + } else if ccc < i.prevCCC { + i.doNorm(p0, ccc) // should be rare for most common cases + } else { + i.prevCCC = ccc + } + } + if len(i.ce) != i.nce { + i.nce = len(i.ce) + return true + } + return false +} + +// nextPlain is the same as next, but does not "normalize" the collation +// elements. +// TODO: remove this function. Using this instead of next does not seem +// to improve performance in any significant way. We retain this until +// later for evaluation purposes. +func (i *iter) nextPlain() bool { + if i.done() { + return false + } + sz := i.appendNext() + i.tail(sz) + i.nce = len(i.ce) + return true +} + +const maxCombiningCharacters = 30 + +// doNorm reorders the collation elements in i.ce. +// It assumes that blocks of collation elements added with appendNext +// either start and end with the same CCC or start with CCC == 0. +// This allows for a single insertion point for the entire block. +// The correctness of this assumption is verified in builder.go. +func (i *iter) doNorm(p int, ccc uint8) { + if p-i.pStarter > maxCombiningCharacters { + i.prevCCC = i.ce[len(i.ce)-1].CCC() + i.pStarter = len(i.ce) - 1 return } - sz := 0 - i.ce, sz = i.t.appendNext(i.ce, i.buf[i.p:]) - i.p += sz + n := len(i.ce) + k := p + for p--; p > i.pStarter && ccc < i.ce[p-1].CCC(); p-- { + } + i.ce = append(i.ce, i.ce[p:k]...) + copy(i.ce[p:], i.ce[k:]) + i.ce = i.ce[:n] } func (i *iter) nextPrimary() int { for { - for ; i.pce < len(i.ce); i.pce++ { - if v := i.ce[i.pce].primary(); v != 0 { + for ; i.pce < i.nce; i.pce++ { + if v := i.ce[i.pce].Primary(); v != 0 { i.pce++ return v } } - if i.done() { + if !i.next() { return 0 } - i.next() } panic("should not reach here") } func (i *iter) nextSecondary() int { for ; i.pce < len(i.ce); i.pce++ { - if v := i.ce[i.pce].secondary(); v != 0 { + if v := i.ce[i.pce].Secondary(); v != 0 { i.pce++ return v } @@ -376,7 +453,7 @@ func (i *iter) nextSecondary() int { func (i *iter) prevSecondary() int { for ; i.pce < len(i.ce); i.pce++ { - if v := i.ce[len(i.ce)-i.pce-1].secondary(); v != 0 { + if v := i.ce[len(i.ce)-i.pce-1].Secondary(); v != 0 { i.pce++ return v } @@ -386,7 +463,7 @@ func (i *iter) prevSecondary() int { func (i *iter) nextTertiary() int { for ; i.pce < len(i.ce); i.pce++ { - if v := i.ce[i.pce].tertiary(); v != 0 { + if v := i.ce[i.pce].Tertiary(); v != 0 { i.pce++ return int(v) } @@ -396,7 +473,7 @@ func (i *iter) nextTertiary() int { func (i *iter) nextQuaternary() int { for ; i.pce < len(i.ce); i.pce++ { - if v := i.ce[i.pce].quaternary(); v != 0 { + if v := i.ce[i.pce].Quaternary(); v != 0 { i.pce++ return v } @@ -416,9 +493,9 @@ func appendPrimary(key []byte, p int) []byte { // keyFromElems converts the weights ws to a compact sequence of bytes. // The result will be appended to the byte buffer in buf. -func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) { +func (c *Collator) keyFromElems(buf *Buffer, ws []Elem) { for _, v := range ws { - if w := v.primary(); w > 0 { + if w := v.Primary(); w > 0 { buf.key = appendPrimary(buf.key, w) } } @@ -427,13 +504,13 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) { // TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF. if !c.Backwards { for _, v := range ws { - if w := v.secondary(); w > 0 { + if w := v.Secondary(); w > 0 { buf.key = append(buf.key, uint8(w>>8), uint8(w)) } } } else { for i := len(ws) - 1; i >= 0; i-- { - if w := ws[i].secondary(); w > 0 { + if w := ws[i].Secondary(); w > 0 { buf.key = append(buf.key, uint8(w>>8), uint8(w)) } } @@ -444,12 +521,12 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) { if Tertiary <= c.Strength || c.CaseLevel { buf.key = append(buf.key, 0, 0) for _, v := range ws { - if w := v.tertiary(); w > 0 { + if w := v.Tertiary(); w > 0 { buf.key = append(buf.key, uint8(w)) } } // Derive the quaternary weights from the options and other levels. - // Note that we represent maxQuaternary as 0xFF. The first byte of the + // Note that we represent MaxQuaternary as 0xFF. The first byte of the // representation of a primary weight is always smaller than 0xFF, // so using this single byte value will compare correctly. if Quaternary <= c.Strength && c.Alternate >= AltShifted { @@ -457,7 +534,7 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) { lastNonFFFF := len(buf.key) buf.key = append(buf.key, 0) for _, v := range ws { - if w := v.quaternary(); w == maxQuaternary { + if w := v.Quaternary(); w == MaxQuaternary { buf.key = append(buf.key, 0xFF) } else if w > 0 { buf.key = appendPrimary(buf.key, w) @@ -468,7 +545,7 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) { } else { buf.key = append(buf.key, 0) for _, v := range ws { - if w := v.quaternary(); w == maxQuaternary { + if w := v.Quaternary(); w == MaxQuaternary { buf.key = append(buf.key, 0xFF) } else if w > 0 { buf.key = appendPrimary(buf.key, w) @@ -479,14 +556,14 @@ func (c *Collator) keyFromElems(buf *Buffer, ws []colElem) { } } -func processWeights(vw AlternateHandling, top uint32, wa []colElem) { +func processWeights(vw AlternateHandling, top uint32, wa []Elem) { ignore := false vtop := int(top) switch vw { case AltShifted, AltShiftTrimmed: for i := range wa { - if p := wa[i].primary(); p <= vtop && p != 0 { - wa[i] = makeQuaternary(p) + if p := wa[i].Primary(); p <= vtop && p != 0 { + wa[i] = MakeQuaternary(p) ignore = true } else if p == 0 { if ignore { @@ -498,7 +575,7 @@ func processWeights(vw AlternateHandling, top uint32, wa []colElem) { } case AltBlanked: for i := range wa { - if p := wa[i].primary(); p <= vtop && (ignore || p != 0) { + if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) { wa[i] = ceIgnore ignore = true } else { |