Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve string encoding by following json approach #1350

Merged
merged 9 commits into from Sep 9, 2023
5 changes: 5 additions & 0 deletions buffer/buffer.go
Expand Up @@ -42,6 +42,11 @@ func (b *Buffer) AppendByte(v byte) {
b.bs = append(b.bs, v)
}

// AppendBytes writes a single byte to the Buffer.
Copy link

@makkes makkes Sep 18, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just came here from the 1.26.0 release notes. Great job! This comment, though, doesn't seem to be right in that the method writes all bytes to the buffer. /cc @abhinav

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oops, yes, you're right. I missed this. Thanks!

func (b *Buffer) AppendBytes(v []byte) {
b.bs = append(b.bs, v...)
}

// AppendString writes a string to the Buffer.
func (b *Buffer) AppendString(s string) {
b.bs = append(b.bs, s...)
Expand Down
143 changes: 84 additions & 59 deletions zapcore/json_encoder.go
Expand Up @@ -486,73 +486,98 @@ func (enc *jsonEncoder) appendFloat(val float64, bitSize int) {
// Unlike the standard library's encoder, it doesn't attempt to protect the
// user from browser vulnerabilities or JSONP-related problems.
func (enc *jsonEncoder) safeAddString(s string) {
for i := 0; i < len(s); {
if enc.tryAddRuneSelf(s[i]) {
i++
continue
}
r, size := utf8.DecodeRuneInString(s[i:])
if enc.tryAddRuneError(r, size) {
i++
continue
}
enc.buf.AppendString(s[i : i+size])
i += size
}
safeAppendStringLike(
(*buffer.Buffer).AppendString,
utf8.DecodeRuneInString,
enc.buf,
s,
)
}

// safeAddByteString is no-alloc equivalent of safeAddString(string(s)) for s []byte.
func (enc *jsonEncoder) safeAddByteString(s []byte) {
safeAppendStringLike(
(*buffer.Buffer).AppendBytes,
utf8.DecodeRune,
enc.buf,
s,
)
}

// safeAppendStringLike is a generic implementation of safeAddString and safeAddByteString.
// It appends a string or byte slice to the buffer, escaping all special characters.
func safeAppendStringLike[S []byte | string](
// appendTo appends this string-like object to the buffer.
appendTo func(*buffer.Buffer, S),
// decodeRune decodes the next rune from the string-like object
// and returns its value and width in bytes.
decodeRune func(S) (rune, int),
buf *buffer.Buffer,
s S,
) {
// The encoding logic below works by skipping over characters
// that can be safely copied as-is,
// until a character is found that needs special handling.
// At that point, we copy everything we've seen so far,
// and then handle that special character.
//
// last is the index of the last byte that was copied to the buffer.
last := 0
for i := 0; i < len(s); {
if enc.tryAddRuneSelf(s[i]) {
if s[i] >= utf8.RuneSelf {
// Character >= RuneSelf may be part of a multi-byte rune.
// They need to be decoded before we can decide how to handle them.
r, size := decodeRune(s[i:])
if r != utf8.RuneError || size != 1 {
// No special handling required.
// Skip over this rune and continue.
i += size
continue
}

// Invalid UTF-8 sequence.
// Replace it with the Unicode replacement character.
appendTo(buf, s[last:i])
buf.AppendString(`\ufffd`)

i++
continue
}
r, size := utf8.DecodeRune(s[i:])
if enc.tryAddRuneError(r, size) {
last = i
} else {
// Character < RuneSelf is a single-byte UTF-8 rune.
if s[i] >= 0x20 && s[i] != '\\' && s[i] != '"' {
// No escaping necessary.
// Skip over this character and continue.
i++
continue
}

// This character needs to be escaped.
appendTo(buf, s[last:i])
switch s[i] {
case '\\', '"':
buf.AppendByte('\\')
buf.AppendByte(s[i])
case '\n':
buf.AppendByte('\\')
buf.AppendByte('n')
case '\r':
buf.AppendByte('\\')
buf.AppendByte('r')
case '\t':
buf.AppendByte('\\')
buf.AppendByte('t')
default:
// Encode bytes < 0x20, except for the escape sequences above.
buf.AppendString(`\u00`)
buf.AppendByte(_hex[s[i]>>4])
buf.AppendByte(_hex[s[i]&0xF])
}

i++
continue
last = i
}
enc.buf.Write(s[i : i+size])
i += size
}
}

// tryAddRuneSelf appends b if it is valid UTF-8 character represented in a single byte.
func (enc *jsonEncoder) tryAddRuneSelf(b byte) bool {
if b >= utf8.RuneSelf {
return false
}
if b >= 0x20 && b != '\\' && b != '"' {
enc.buf.AppendByte(b)
return true
}
switch b {
case '\\', '"':
enc.buf.AppendByte('\\')
enc.buf.AppendByte(b)
case '\n':
enc.buf.AppendByte('\\')
enc.buf.AppendByte('n')
case '\r':
enc.buf.AppendByte('\\')
enc.buf.AppendByte('r')
case '\t':
enc.buf.AppendByte('\\')
enc.buf.AppendByte('t')
default:
// Encode bytes < 0x20, except for the escape sequences above.
enc.buf.AppendString(`\u00`)
enc.buf.AppendByte(_hex[b>>4])
enc.buf.AppendByte(_hex[b&0xF])
}
return true
}

func (enc *jsonEncoder) tryAddRuneError(r rune, size int) bool {
if r == utf8.RuneError && size == 1 {
enc.buf.AppendString(`\ufffd`)
return true
}
return false
// add remaining
appendTo(buf, s[last:])
}
33 changes: 29 additions & 4 deletions zapcore/json_encoder_bench_test.go
Expand Up @@ -22,6 +22,7 @@ package zapcore_test

import (
"encoding/json"
"fmt"
"testing"
"time"

Expand Down Expand Up @@ -51,7 +52,28 @@ func BenchmarkZapJSONFloat32AndComplex64(b *testing.B) {
})
}

const _sliceSize = 5000

type StringSlice []string

func (s StringSlice) MarshalLogArray(encoder ArrayEncoder) error {
for _, str := range s {
encoder.AppendString(str)
}
return nil
}

func generateStringSlice(n int) StringSlice {
output := make(StringSlice, 0, n)
for i := 0; i < n; i++ {
output = append(output, fmt.Sprint("00000000-0000-0000-0000-0000000000", i))
}
return output
}

func BenchmarkZapJSON(b *testing.B) {
additional := generateStringSlice(_sliceSize)
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
for pb.Next() {
enc := NewJSONEncoder(testEncoderConfig())
Expand All @@ -64,6 +86,7 @@ func BenchmarkZapJSON(b *testing.B) {
enc.AddString("string3", "🤔")
enc.AddString("string4", "🙊")
enc.AddBool("bool", true)
_ = enc.AddArray("test", additional)
buf, _ := enc.EncodeEntry(Entry{
Message: "fake",
Level: DebugLevel,
Expand All @@ -75,10 +98,11 @@ func BenchmarkZapJSON(b *testing.B) {

func BenchmarkStandardJSON(b *testing.B) {
record := struct {
Level string `json:"level"`
Message string `json:"msg"`
Time time.Time `json:"ts"`
Fields map[string]interface{} `json:"fields"`
Level string `json:"level"`
Message string `json:"msg"`
Time time.Time `json:"ts"`
Fields map[string]interface{} `json:"fields"`
Additional StringSlice
}{
Level: "debug",
Message: "fake",
Expand All @@ -94,6 +118,7 @@ func BenchmarkStandardJSON(b *testing.B) {
"string4": "🙊",
"bool": true,
},
Additional: generateStringSlice(_sliceSize),
}
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
Expand Down
72 changes: 72 additions & 0 deletions zapcore/json_encoder_impl_test.go
Expand Up @@ -29,10 +29,13 @@ import (
"testing"
"testing/quick"
"time"
"unicode/utf8"

"go.uber.org/zap/buffer"
"go.uber.org/zap/internal/bufferpool"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/multierr"
)

Expand Down Expand Up @@ -662,3 +665,72 @@ func TestJSONQuick(t *testing.T) {
check(asciiRoundTripsCorrectlyString)
check(asciiRoundTripsCorrectlyByteString)
}

var _stringLikeCorpus = []string{
"",
"foo",
"bar",
"a\nb",
"a\tb",
"a\\b",
`a"b`,
}

func FuzzSafeAppendStringLike_bytes(f *testing.F) {
for _, s := range _stringLikeCorpus {
f.Add([]byte(s))
}
f.Fuzz(func(t *testing.T, b []byte) {
if !utf8.Valid(b) {
t.Skip()
}

fuzzSafeAppendStringLike(t, string(b), func(buf *buffer.Buffer) {
safeAppendStringLike(
(*buffer.Buffer).AppendBytes,
utf8.DecodeRune,
buf,
b,
)
})
})
}

func FuzzSafeAppendStringLike_string(f *testing.F) {
for _, s := range _stringLikeCorpus {
f.Add(s)
}
f.Fuzz(func(t *testing.T, s string) {
if !utf8.ValidString(s) {
t.Skip()
}

fuzzSafeAppendStringLike(t, s, func(buf *buffer.Buffer) {
safeAppendStringLike(
(*buffer.Buffer).AppendString,
utf8.DecodeRuneInString,
buf,
s,
)
})
})
}

func fuzzSafeAppendStringLike(
t *testing.T,
want string,
writeString func(*buffer.Buffer),
) {
t.Helper()

buf := bufferpool.Get()
defer buf.Free()

buf.AppendByte('"')
writeString(buf)
buf.AppendByte('"')

var got string
require.NoError(t, json.Unmarshal(buf.Bytes(), &got))
assert.Equal(t, want, got)
}