diff options
Diffstat (limited to 'libgo/go/exp/norm/composition.go')
-rw-r--r-- | libgo/go/exp/norm/composition.go | 115 |
1 files changed, 57 insertions, 58 deletions
diff --git a/libgo/go/exp/norm/composition.go b/libgo/go/exp/norm/composition.go index b2d2abaf63b..1d722230d6f 100644 --- a/libgo/go/exp/norm/composition.go +++ b/libgo/go/exp/norm/composition.go @@ -7,27 +7,46 @@ package norm import "utf8" const ( - maxCombiningChars = 30 + 2 // +2 to hold CGJ and Hangul overflow. + maxCombiningChars = 30 + maxBufferSize = maxCombiningChars + 2 // +1 to hold starter +1 to hold CGJ maxBackRunes = maxCombiningChars - 1 maxNFCExpansion = 3 // NFC(0x1D160) maxNFKCExpansion = 18 // NFKC(0xFDFA) - maxRuneSizeInDecomp = 4 - // Need to multiply by 2 as we don't reuse byte buffer space for recombining. - maxByteBufferSize = 2 * maxRuneSizeInDecomp * maxCombiningChars // 256 + maxByteBufferSize = utf8.UTFMax * maxBufferSize // 128 ) // reorderBuffer is used to normalize a single segment. Characters inserted with -// insert() are decomposed and reordered based on CCC. The compose() method can +// insert are decomposed and reordered based on CCC. The compose method can // be used to recombine characters. Note that the byte buffer does not hold // the UTF-8 characters in order. Only the rune array is maintained in sorted -// order. flush() writes the resulting segment to a byte array. +// order. flush writes the resulting segment to a byte array. type reorderBuffer struct { - rune [maxCombiningChars]runeInfo // Per character info. - byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos. - nrune int // Number of runeInfos. - nbyte uint8 // Number or bytes. + rune [maxBufferSize]runeInfo // Per character info. + byte [maxByteBufferSize]byte // UTF-8 buffer. Referenced by runeInfo.pos. + nrune int // Number of runeInfos. + nbyte uint8 // Number or bytes. f formInfo + + src input + nsrc int + srcBytes inputBytes + srcString inputString + tmpBytes inputBytes +} + +func (rb *reorderBuffer) init(f Form, src []byte) { + rb.f = *formTable[f] + rb.srcBytes = inputBytes(src) + rb.src = &rb.srcBytes + rb.nsrc = len(src) +} + +func (rb *reorderBuffer) initString(f Form, src string) { + rb.f = *formTable[f] + rb.srcString = inputString(src) + rb.src = &rb.srcString + rb.nsrc = len(src) } // reset discards all characters from the buffer. @@ -49,10 +68,10 @@ func (rb *reorderBuffer) flush(out []byte) []byte { // insertOrdered inserts a rune in the buffer, ordered by Canonical Combining Class. // It returns false if the buffer is not large enough to hold the rune. -// It is used internally by insert. +// It is used internally by insert and insertString only. func (rb *reorderBuffer) insertOrdered(info runeInfo) bool { n := rb.nrune - if n >= maxCombiningChars { + if n >= maxCombiningChars+1 { return false } b := rb.rune[:] @@ -68,7 +87,7 @@ func (rb *reorderBuffer) insertOrdered(info runeInfo) bool { } rb.nrune += 1 pos := uint8(rb.nbyte) - rb.nbyte += info.size + rb.nbyte += utf8.UTFMax info.pos = pos b[n] = info return true @@ -76,53 +95,32 @@ func (rb *reorderBuffer) insertOrdered(info runeInfo) bool { // insert inserts the given rune in the buffer ordered by CCC. // It returns true if the buffer was large enough to hold the decomposed rune. -func (rb *reorderBuffer) insert(src []byte, info runeInfo) bool { - if info.size == 3 && isHangul(src) { - rune, _ := utf8.DecodeRune(src) - return rb.decomposeHangul(uint32(rune)) - } - pos := rb.nbyte - if info.flags.hasDecomposition() { - dcomp := rb.f.decompose(src) - for i := 0; i < len(dcomp); i += int(info.size) { - info = rb.f.info(dcomp[i:]) - if !rb.insertOrdered(info) { - return false - } - } - copy(rb.byte[pos:], dcomp) - } else { - if !rb.insertOrdered(info) { - return false +func (rb *reorderBuffer) insert(src input, i int, info runeInfo) bool { + if info.size == 3 { + if rune := src.hangul(i); rune != 0 { + return rb.decomposeHangul(uint32(rune)) } - copy(rb.byte[pos:], src[:info.size]) } - return true -} - -// insertString inserts the given rune in the buffer ordered by CCC. -// It returns true if the buffer was large enough to hold the decomposed rune. -func (rb *reorderBuffer) insertString(src string, info runeInfo) bool { - if info.size == 3 && isHangulString(src) { - rune, _ := utf8.DecodeRuneInString(src) - return rb.decomposeHangul(uint32(rune)) - } - pos := rb.nbyte - dcomp := rb.f.decomposeString(src) - dn := len(dcomp) - if dn != 0 { - for i := 0; i < dn; i += int(info.size) { - info = rb.f.info(dcomp[i:]) + if info.flags.hasDecomposition() { + dcomp := rb.f.decompose(src, i) + rb.tmpBytes = inputBytes(dcomp) + for i := 0; i < len(dcomp); { + info = rb.f.info(&rb.tmpBytes, i) + pos := rb.nbyte if !rb.insertOrdered(info) { return false } + end := i + int(info.size) + copy(rb.byte[pos:], dcomp[i:end]) + i = end } - copy(rb.byte[pos:], dcomp) } else { + // insertOrder changes nbyte + pos := rb.nbyte if !rb.insertOrdered(info) { return false } - copy(rb.byte[pos:], src[:info.size]) + src.copySlice(rb.byte[pos:], i, i+int(info.size)) } return true } @@ -131,17 +129,16 @@ func (rb *reorderBuffer) insertString(src string, info runeInfo) bool { func (rb *reorderBuffer) appendRune(rune uint32) { bn := rb.nbyte sz := utf8.EncodeRune(rb.byte[bn:], int(rune)) - rb.nbyte += uint8(sz) + rb.nbyte += utf8.UTFMax rb.rune[rb.nrune] = runeInfo{bn, uint8(sz), 0, 0} rb.nrune++ } // assignRune sets a rune at position pos. It is used for Hangul and recomposition. func (rb *reorderBuffer) assignRune(pos int, rune uint32) { - bn := rb.nbyte + bn := rb.rune[pos].pos sz := utf8.EncodeRune(rb.byte[bn:], int(rune)) rb.rune[pos] = runeInfo{bn, uint8(sz), 0, 0} - rb.nbyte += uint8(sz) } // runeAt returns the rune at position n. It is used for Hangul and recomposition. @@ -259,11 +256,10 @@ func (rb *reorderBuffer) decomposeHangul(rune uint32) bool { // combineHangul algorithmically combines Jamo character components into Hangul. // See http://unicode.org/reports/tr15/#Hangul for details on combining Hangul. -func (rb *reorderBuffer) combineHangul() { - k := 1 +func (rb *reorderBuffer) combineHangul(s, i, k int) { b := rb.rune[:] bn := rb.nrune - for s, i := 0, 1; i < bn; i++ { + for ; i < bn; i++ { cccB := b[k-1].ccc cccC := b[i].ccc if cccB == 0 { @@ -305,14 +301,17 @@ func (rb *reorderBuffer) compose() { // blocked from S if and only if there is some character B between S // and C, and either B is a starter or it has the same or higher // combining class as C." + bn := rb.nrune + if bn == 0 { + return + } k := 1 b := rb.rune[:] - bn := rb.nrune for s, i := 0, 1; i < bn; i++ { if isJamoVT(rb.bytesAt(i)) { // Redo from start in Hangul mode. Necessary to support // U+320E..U+321E in NFKC mode. - rb.combineHangul() + rb.combineHangul(s, i, k) return } ii := b[i] |