summaryrefslogtreecommitdiffstats
path: root/libgo/go/exp/utf8string
diff options
context:
space:
mode:
authorian <ian@138bc75d-0d04-0410-961f-82ee72b054a4>2012-01-25 21:54:22 +0000
committerian <ian@138bc75d-0d04-0410-961f-82ee72b054a4>2012-01-25 21:54:22 +0000
commit476d22b59304cb6f5820c2644763a2ce43874a41 (patch)
treec8e8990a2197e33f6fe50a28a16714aafe982102 /libgo/go/exp/utf8string
parent422eaae5fe0038ad189b8fd28cfd6a7094d67ae1 (diff)
downloadppe42-gcc-476d22b59304cb6f5820c2644763a2ce43874a41.tar.gz
ppe42-gcc-476d22b59304cb6f5820c2644763a2ce43874a41.zip
libgo: Update to weekly.2012-01-20.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@183540 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'libgo/go/exp/utf8string')
-rw-r--r--libgo/go/exp/utf8string/string.go203
-rw-r--r--libgo/go/exp/utf8string/string_test.go123
2 files changed, 326 insertions, 0 deletions
diff --git a/libgo/go/exp/utf8string/string.go b/libgo/go/exp/utf8string/string.go
new file mode 100644
index 00000000000..da1e2de1ea2
--- /dev/null
+++ b/libgo/go/exp/utf8string/string.go
@@ -0,0 +1,203 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package utf8string provides an efficient way to index strings by rune rather than by byte.
+package utf8string
+
+import (
+ "errors"
+ "unicode/utf8"
+)
+
+// String wraps a regular string with a small structure that provides more
+// efficient indexing by code point index, as opposed to byte index.
+// Scanning incrementally forwards or backwards is O(1) per index operation
+// (although not as fast a range clause going forwards). Random access is
+// O(N) in the length of the string, but the overhead is less than always
+// scanning from the beginning.
+// If the string is ASCII, random access is O(1).
+// Unlike the built-in string type, String has internal mutable state and
+// is not thread-safe.
+type String struct {
+ str string
+ numRunes int
+ // If width > 0, the rune at runePos starts at bytePos and has the specified width.
+ width int
+ bytePos int
+ runePos int
+ nonASCII int // byte index of the first non-ASCII rune.
+}
+
+// NewString returns a new UTF-8 string with the provided contents.
+func NewString(contents string) *String {
+ return new(String).Init(contents)
+}
+
+// Init initializes an existing String to hold the provided contents.
+// It returns a pointer to the initialized String.
+func (s *String) Init(contents string) *String {
+ s.str = contents
+ s.bytePos = 0
+ s.runePos = 0
+ for i := 0; i < len(contents); i++ {
+ if contents[i] >= utf8.RuneSelf {
+ // Not ASCII.
+ s.numRunes = utf8.RuneCountInString(contents)
+ _, s.width = utf8.DecodeRuneInString(contents)
+ s.nonASCII = i
+ return s
+ }
+ }
+ // ASCII is simple. Also, the empty string is ASCII.
+ s.numRunes = len(contents)
+ s.width = 0
+ s.nonASCII = len(contents)
+ return s
+}
+
+// String returns the contents of the String. This method also means the
+// String is directly printable by fmt.Print.
+func (s *String) String() string {
+ return s.str
+}
+
+// RuneCount returns the number of runes (Unicode code points) in the String.
+func (s *String) RuneCount() int {
+ return s.numRunes
+}
+
+// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
+func (s *String) IsASCII() bool {
+ return s.width == 0
+}
+
+// Slice returns the string sliced at rune positions [i:j].
+func (s *String) Slice(i, j int) string {
+ // ASCII is easy. Let the compiler catch the indexing error if there is one.
+ if j < s.nonASCII {
+ return s.str[i:j]
+ }
+ if i < 0 || j > s.numRunes || i > j {
+ panic(sliceOutOfRange)
+ }
+ if i == j {
+ return ""
+ }
+ // For non-ASCII, after At(i), bytePos is always the position of the indexed character.
+ var low, high int
+ switch {
+ case i < s.nonASCII:
+ low = i
+ case i == s.numRunes:
+ low = len(s.str)
+ default:
+ s.At(i)
+ low = s.bytePos
+ }
+ switch {
+ case j == s.numRunes:
+ high = len(s.str)
+ default:
+ s.At(j)
+ high = s.bytePos
+ }
+ return s.str[low:high]
+}
+
+// At returns the rune with index i in the String. The sequence of runes is the same
+// as iterating over the contents with a "for range" clause.
+func (s *String) At(i int) rune {
+ // ASCII is easy. Let the compiler catch the indexing error if there is one.
+ if i < s.nonASCII {
+ return rune(s.str[i])
+ }
+
+ // Now we do need to know the index is valid.
+ if i < 0 || i >= s.numRunes {
+ panic(outOfRange)
+ }
+
+ var r rune
+
+ // Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
+ // With these cases, all scans from beginning or end work in O(1) time per rune.
+ switch {
+
+ case i == s.runePos-1: // backing up one rune
+ r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
+ s.runePos = i
+ s.bytePos -= s.width
+ return r
+ case i == s.runePos+1: // moving ahead one rune
+ s.runePos = i
+ s.bytePos += s.width
+ fallthrough
+ case i == s.runePos:
+ r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
+ return r
+ case i == 0: // start of string
+ r, s.width = utf8.DecodeRuneInString(s.str)
+ s.runePos = 0
+ s.bytePos = 0
+ return r
+
+ case i == s.numRunes-1: // last rune in string
+ r, s.width = utf8.DecodeLastRuneInString(s.str)
+ s.runePos = i
+ s.bytePos = len(s.str) - s.width
+ return r
+ }
+
+ // We need to do a linear scan. There are three places to start from:
+ // 1) The beginning
+ // 2) bytePos/runePos.
+ // 3) The end
+ // Choose the closest in rune count, scanning backwards if necessary.
+ forward := true
+ if i < s.runePos {
+ // Between beginning and pos. Which is closer?
+ // Since both i and runePos are guaranteed >= nonASCII, that's the
+ // lowest location we need to start from.
+ if i < (s.runePos-s.nonASCII)/2 {
+ // Scan forward from beginning
+ s.bytePos, s.runePos = s.nonASCII, s.nonASCII
+ } else {
+ // Scan backwards from where we are
+ forward = false
+ }
+ } else {
+ // Between pos and end. Which is closer?
+ if i-s.runePos < (s.numRunes-s.runePos)/2 {
+ // Scan forward from pos
+ } else {
+ // Scan backwards from end
+ s.bytePos, s.runePos = len(s.str), s.numRunes
+ forward = false
+ }
+ }
+ if forward {
+ // TODO: Is it much faster to use a range loop for this scan?
+ for {
+ r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
+ if s.runePos == i {
+ break
+ }
+ s.runePos++
+ s.bytePos += s.width
+ }
+ } else {
+ for {
+ r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
+ s.runePos--
+ s.bytePos -= s.width
+ if s.runePos == i {
+ break
+ }
+ }
+ }
+ return r
+}
+
+var outOfRange = errors.New("utf8.String: index out of range")
+var sliceOutOfRange = errors.New("utf8.String: slice index out of range")
diff --git a/libgo/go/exp/utf8string/string_test.go b/libgo/go/exp/utf8string/string_test.go
new file mode 100644
index 00000000000..28511b2f5f1
--- /dev/null
+++ b/libgo/go/exp/utf8string/string_test.go
@@ -0,0 +1,123 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package utf8string
+
+import (
+ "math/rand"
+ "testing"
+ "unicode/utf8"
+)
+
+var testStrings = []string{
+ "",
+ "abcd",
+ "☺☻☹",
+ "日a本b語ç日ð本Ê語þ日¥本¼語i日©",
+ "日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©日a本b語ç日ð本Ê語þ日¥本¼語i日©",
+ "\x80\x80\x80\x80",
+}
+
+func TestScanForwards(t *testing.T) {
+ for _, s := range testStrings {
+ runes := []rune(s)
+ str := NewString(s)
+ if str.RuneCount() != len(runes) {
+ t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
+ break
+ }
+ for i, expect := range runes {
+ got := str.At(i)
+ if got != expect {
+ t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, expect, expect, got, got)
+ }
+ }
+ }
+}
+
+func TestScanBackwards(t *testing.T) {
+ for _, s := range testStrings {
+ runes := []rune(s)
+ str := NewString(s)
+ if str.RuneCount() != len(runes) {
+ t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
+ break
+ }
+ for i := len(runes) - 1; i >= 0; i-- {
+ expect := runes[i]
+ got := str.At(i)
+ if got != expect {
+ t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, expect, expect, got, got)
+ }
+ }
+ }
+}
+
+func randCount() int {
+ if testing.Short() {
+ return 100
+ }
+ return 100000
+}
+
+func TestRandomAccess(t *testing.T) {
+ for _, s := range testStrings {
+ if len(s) == 0 {
+ continue
+ }
+ runes := []rune(s)
+ str := NewString(s)
+ if str.RuneCount() != len(runes) {
+ t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
+ break
+ }
+ for j := 0; j < randCount(); j++ {
+ i := rand.Intn(len(runes))
+ expect := runes[i]
+ got := str.At(i)
+ if got != expect {
+ t.Errorf("%s[%d]: expected %c (%U); got %c (%U)", s, i, expect, expect, got, got)
+ }
+ }
+ }
+}
+
+func TestRandomSliceAccess(t *testing.T) {
+ for _, s := range testStrings {
+ if len(s) == 0 || s[0] == '\x80' { // the bad-UTF-8 string fools this simple test
+ continue
+ }
+ runes := []rune(s)
+ str := NewString(s)
+ if str.RuneCount() != len(runes) {
+ t.Errorf("%s: expected %d runes; got %d", s, len(runes), str.RuneCount())
+ break
+ }
+ for k := 0; k < randCount(); k++ {
+ i := rand.Intn(len(runes))
+ j := rand.Intn(len(runes) + 1)
+ if i > j { // include empty strings
+ continue
+ }
+ expect := string(runes[i:j])
+ got := str.Slice(i, j)
+ if got != expect {
+ t.Errorf("%s[%d:%d]: expected %q got %q", s, i, j, expect, got)
+ }
+ }
+ }
+}
+
+func TestLimitSliceAccess(t *testing.T) {
+ for _, s := range testStrings {
+ str := NewString(s)
+ if str.Slice(0, 0) != "" {
+ t.Error("failure with empty slice at beginning")
+ }
+ nr := utf8.RuneCountInString(s)
+ if str.Slice(nr, nr) != "" {
+ t.Error("failure with empty slice at end")
+ }
+ }
+}
OpenPOWER on IntegriCloud