diff options
author | ian <ian@138bc75d-0d04-0410-961f-82ee72b054a4> | 2012-01-25 21:54:22 +0000 |
---|---|---|
committer | ian <ian@138bc75d-0d04-0410-961f-82ee72b054a4> | 2012-01-25 21:54:22 +0000 |
commit | 476d22b59304cb6f5820c2644763a2ce43874a41 (patch) | |
tree | c8e8990a2197e33f6fe50a28a16714aafe982102 /libgo/go/exp/utf8string | |
parent | 422eaae5fe0038ad189b8fd28cfd6a7094d67ae1 (diff) | |
download | ppe42-gcc-476d22b59304cb6f5820c2644763a2ce43874a41.tar.gz ppe42-gcc-476d22b59304cb6f5820c2644763a2ce43874a41.zip |
libgo: Update to weekly.2012-01-20.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@183540 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libgo/go/exp/utf8string')
-rw-r--r-- | libgo/go/exp/utf8string/string.go | 203 | ||||
-rw-r--r-- | libgo/go/exp/utf8string/string_test.go | 123 |
2 files changed, 326 insertions, 0 deletions
diff --git a/libgo/go/exp/utf8string/string.go b/libgo/go/exp/utf8string/string.go new file mode 100644 index 00000000000..da1e2de1ea2 --- /dev/null +++ b/libgo/go/exp/utf8string/string.go @@ -0,0 +1,203 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package utf8string provides an efficient way to index strings by rune rather than by byte. +package utf8string + +import ( + "errors" + "unicode/utf8" +) + +// String wraps a regular string with a small structure that provides more +// efficient indexing by code point index, as opposed to byte index. +// Scanning incrementally forwards or backwards is O(1) per index operation +// (although not as fast a range clause going forwards). Random access is +// O(N) in the length of the string, but the overhead is less than always +// scanning from the beginning. +// If the string is ASCII, random access is O(1). +// Unlike the built-in string type, String has internal mutable state and +// is not thread-safe. +type String struct { + str string + numRunes int + // If width > 0, the rune at runePos starts at bytePos and has the specified width. + width int + bytePos int + runePos int + nonASCII int // byte index of the first non-ASCII rune. +} + +// NewString returns a new UTF-8 string with the provided contents. +func NewString(contents string) *String { + return new(String).Init(contents) +} + +// Init initializes an existing String to hold the provided contents. +// It returns a pointer to the initialized String. +func (s *String) Init(contents string) *String { + s.str = contents + s.bytePos = 0 + s.runePos = 0 + for i := 0; i < len(contents); i++ { + if contents[i] >= utf8.RuneSelf { + // Not ASCII. + s.numRunes = utf8.RuneCountInString(contents) + _, s.width = utf8.DecodeRuneInString(contents) + s.nonASCII = i + return s + } + } + // ASCII is simple. Also, the empty string is ASCII. + s.numRunes = len(contents) + s.width = 0 + s.nonASCII = len(contents) + return s +} + +// String returns the contents of the String. This method also means the +// String is directly printable by fmt.Print. +func (s *String) String() string { + return s.str +} + +// RuneCount returns the number of runes (Unicode code points) in the String. +func (s *String) RuneCount() int { + return s.numRunes +} + +// IsASCII returns a boolean indicating whether the String contains only ASCII bytes. +func (s *String) IsASCII() bool { + return s.width == 0 +} + +// Slice returns the string sliced at rune positions [i:j]. +func (s *String) Slice(i, j int) string { + // ASCII is easy. Let the compiler catch the indexing error if there is one. + if j < s.nonASCII { + return s.str[i:j] + } + if i < 0 || j > s.numRunes || i > j { + panic(sliceOutOfRange) + } + if i == j { + return "" + } + // For non-ASCII, after At(i), bytePos is always the position of the indexed character. + var low, high int + switch { + case i < s.nonASCII: + low = i + case i == s.numRunes: + low = len(s.str) + default: + s.At(i) + low = s.bytePos + } + switch { + case j == s.numRunes: + high = len(s.str) + default: + s.At(j) + high = s.bytePos + } + return s.str[low:high] +} + +// At returns the rune with index i in the String. The sequence of runes is the same +// as iterating over the contents with a "for range" clause. +func (s *String) At(i int) rune { + // ASCII is easy. Let the compiler catch the indexing error if there is one. + if i < s.nonASCII { + return rune(s.str[i]) + } + + // Now we do need to know the index is valid. + if i < 0 || i >= s.numRunes { + panic(outOfRange) + } + + var r rune + + // Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end. + // With these cases, all scans from beginning or end work in O(1) time per rune. + switch { + + case i == s.runePos-1: // backing up one rune + r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos]) + s.runePos = i + s.bytePos -= s.width + return r + case i == s.runePos+1: // moving ahead one rune + s.runePos = i + s.bytePos += s.width + fallthrough + case i == s.runePos: + r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:]) + return r + case i == 0: // start of string + r, s.width = utf8.DecodeRuneInString(s.str) + s.runePos = 0 + s.bytePos = 0 + return r + + case i == s.numRunes-1: // last rune in string + r, s.width = utf8.DecodeLastRuneInString(s.str) + s.runePos = i + s.bytePos = len(s.str) - s.width + return r + } + + // We need to do a linear scan. There are three places to start from: + // 1) The beginning + // 2) bytePos/runePos. + // 3) The end + // Choose the closest in rune count, scanning backwards if necessary. + forward := true + if i < s.runePos { + // Between beginning and pos. Which is closer? + // Since both i and runePos are guaranteed >= nonASCII, that's the + // lowest location we need to start from. + if i < (s.runePos-s.nonASCII)/2 { + // Scan forward from beginning + s.bytePos, s.runePos = s.nonASCII, s.nonASCII + } else { + // Scan backwards from where we are + forward = false + } + } else { + // Between pos and end. Which is closer? + if i-s.runePos < (s.numRunes-s.runePos)/2 { + // Scan forward from pos + } else { + // Scan backwards from end + s.bytePos, s.runePos = len(s.str), s.numRunes + forward = false + } + } + if forward { + // TODO: Is it much faster to use a range loop for this scan? + for { + r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:]) + if s.runePos == i { + break + } + s.runePos++ + s.bytePos += s.width + } + } else { + for { + r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos]) + s.runePos-- + s.bytePos -= s.width + if s.runePos == i { + break + } + } + } + return r +} + +var outOfRange = errors.New("utf8.String: index out of range") +var sliceOutOfRange = errors.New("utf8.String: slice index out of range") diff --git a/libgo/go/exp/utf8string/string_test.go b/libgo/go/exp/utf8string/string_test.go new file mode 100644 index 00000000000..28511b2f5f1 --- /dev/null +++ b/libgo/go/exp/utf8string/string_test.go @@ -0,0 +1,123 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package utf8string + +import ( + "math/rand" + "testing" + "unicode/utf8" +) + +var testStrings = []string{ + "", + "abcd", + "☺☻☹", + "日a本b語ç日ð本Ê語þ日¥本¼語i日©", + "日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©", + "\x80\x80\x80\x80", +} + +func TestScanForwards(t *testing.T) { + for _, s := range testStrings { + runes := []rune(s) + str := NewString(s) + if str.RuneCount() != len(runes) { + t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount()) + break + } + for i, expect := range runes { + got := str.At(i) + if got != expect { + t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, expect, expect, got, got) + } + } + } +} + +func TestScanBackwards(t *testing.T) { + for _, s := range testStrings { + runes := []rune(s) + str := NewString(s) + if str.RuneCount() != len(runes) { + t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount()) + break + } + for i := len(runes) - 1; i >= 0; i-- { + expect := runes[i] + got := str.At(i) + if got != expect { + t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, expect, expect, got, got) + } + } + } +} + +func randCount() int { + if testing.Short() { + return 100 + } + return 100000 +} + +func TestRandomAccess(t *testing.T) { + for _, s := range testStrings { + if len(s) == 0 { + continue + } + runes := []rune(s) + str := NewString(s) + if str.RuneCount() != len(runes) { + t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount()) + break + } + for j := 0; j < randCount(); j++ { + i := rand.Intn(len(runes)) + expect := runes[i] + got := str.At(i) + if got != expect { + t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, expect, expect, got, got) + } + } + } +} + +func TestRandomSliceAccess(t *testing.T) { + for _, s := range testStrings { + if len(s) == 0 || s[0] == '\x80' { // the bad-UTF-8 string fools this simple test + continue + } + runes := []rune(s) + str := NewString(s) + if str.RuneCount() != len(runes) { + t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount()) + break + } + for k := 0; k < randCount(); k++ { + i := rand.Intn(len(runes)) + j := rand.Intn(len(runes) + 1) + if i > j { // include empty strings + continue + } + expect := string(runes[i:j]) + got := str.Slice(i, j) + if got != expect { + t.Errorf("%s[%d:%d]: expected %q got %q", s, i, j, expect, got) + } + } + } +} + +func TestLimitSliceAccess(t *testing.T) { + for _, s := range testStrings { + str := NewString(s) + if str.Slice(0, 0) != "" { + t.Error("failure with empty slice at beginning") + } + nr := utf8.RuneCountInString(s) + if str.Slice(nr, nr) != "" { + t.Error("failure with empty slice at end") + } + } +} |