Commit fe8b5334 by Emil Tullstedt, committed by GitHub

Modules: Add patched goavro dependency for extensions (#21027)

parent 32c9d558
@@ -4,7 +4,9 @@ go 1.13
require (
github.com/BurntSushi/toml v0.3.1
github.com/DataDog/zstd v1.4.4 // indirect
github.com/VividCortex/mysqlerr v0.0.0-20170204212430-6c6b55f8796f
github.com/apache/thrift v0.13.0 // indirect
github.com/aws/aws-sdk-go v1.25.48
github.com/beevik/etree v1.1.0 // indirect
github.com/benbjohnson/clock v0.0.0-20161215174838-7dc76406b6d3
@@ -42,6 +44,7 @@ require (
github.com/klauspost/compress v1.4.1 // indirect
github.com/klauspost/cpuid v1.2.0 // indirect
github.com/lib/pq v1.2.0
github.com/linkedin/goavro/v2 v2.9.7
github.com/mattn/go-isatty v0.0.10
github.com/mattn/go-sqlite3 v1.11.0
github.com/opentracing/opentracing-go v1.1.0
@@ -62,6 +65,8 @@ require (
github.com/uber/jaeger-client-go v2.16.0+incompatible
github.com/uber/jaeger-lib v2.0.0+incompatible // indirect
github.com/unknwon/com v1.0.1
github.com/xitongsys/parquet-go v1.4.0 // indirect
github.com/xitongsys/parquet-go-source v0.0.0-20191104003508-ecfa341356a6 // indirect
github.com/yudai/gojsondiff v1.0.0
github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82 // indirect
github.com/yudai/pp v2.0.1+incompatible // indirect
@@ -3,12 +3,16 @@ cloud.google.com/go v0.34.0 h1:eOI3/cP2VTU6uZLDYAoic+eyzzB9YyGmJ7eIjl8rOPg=
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/DataDog/zstd v1.4.4 h1:+IawcoXhCBylN7ccwdwf8LOH2jKq7NavGpEPanrlTzE=
github.com/DataDog/zstd v1.4.4/go.mod h1:1jcaCB/ufaK+sKp1NBhlGmpz41jOoPQ35bpF36t7BBo=
github.com/VividCortex/mysqlerr v0.0.0-20170204212430-6c6b55f8796f h1:HR5nRmUQgXrwqZOwZ2DAc/aCi3Bu3xENpspW935vxu0=
github.com/VividCortex/mysqlerr v0.0.0-20170204212430-6c6b55f8796f/go.mod h1:f3HiCrHjHBdcm6E83vGaXh1KomZMA2P6aeo3hKx/wg0=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/apache/arrow/go/arrow v0.0.0-20190716210558-5f564424c71c h1:iHUHzx3S1TU5xt+D7vLb0PAk3e+RfayF9IhR6+hyO/k=
github.com/apache/arrow/go/arrow v0.0.0-20190716210558-5f564424c71c/go.mod h1:VTxUBvSJ3s3eHAg65PNgrsn5BtqCRPdmyXh6rAfdxN0=
github.com/apache/thrift v0.13.0 h1:5hryIiq9gtn+MiLVn0wP37kb/uTeRZgN08WoCsAhIhI=
github.com/apache/thrift v0.13.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ=
github.com/aws/aws-sdk-go v1.25.48 h1:J82DYDGZHOKHdhx6hD24Tm30c2C3GchYGfN0mf9iKUk=
github.com/aws/aws-sdk-go v1.25.48/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
github.com/beevik/etree v1.0.1/go.mod h1:r8Aw8JqVegEf0w2fDnATrX9VpkMcyFeM0FhwO62wh+A=
@@ -97,6 +101,7 @@ github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y
github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/flatbuffers v1.11.0 h1:O7CEyB8Cb3/DmtxODGtLHcEvpr81Jm5qLg/hsHnxA2A=
github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
@@ -163,6 +168,8 @@ github.com/lib/pq v1.0.0 h1:X5PMW56eZitiTeO7tKzZxFCSpbFZJtkMMooicw2us9A=
github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/lib/pq v1.2.0 h1:LXpIM/LZ5xGFhOpXAQUIMM1HdyqzVYM13zNdjCEEcA0=
github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/linkedin/goavro/v2 v2.9.7 h1:Vd++Rb/RKcmNJjM0HP/JJFMEWa21eUBVKPYlKehOGrM=
github.com/linkedin/goavro/v2 v2.9.7/go.mod h1:UgQUb2N/pmueQYH9bfqFioWxzYCZXSfF8Jw03O5sjqA=
github.com/lunny/log v0.0.0-20160921050905-7887c61bf0de/go.mod h1:3q8WtuPQsoRbatJuy3nvq/hRSvuBJrHHr+ybPPiNvHQ=
github.com/lunny/nodb v0.0.0-20160621015157-fc1ef06ad4af/go.mod h1:Cqz6pqow14VObJ7peltM+2n3PWOz7yTrfUuGbVFkzN0=
github.com/mattetti/filebuffer v1.0.0 h1:ixTvQ0JjBTwWbdpDZ98lLrydo7KRi8xNRIi5RFszsbY=
@@ -283,6 +290,10 @@ github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e h1:GSGeB9EAKY2spCABz6x
github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
github.com/unknwon/com v1.0.1 h1:3d1LTxD+Lnf3soQiD4Cp/0BRB+Rsa/+RTvz8GMMzIXs=
github.com/unknwon/com v1.0.1/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
github.com/xitongsys/parquet-go v1.4.0 h1:+3+QFRRwAilhTdNcJU2hPxslLCAKJ+Tn8C2OhnCVWDo=
github.com/xitongsys/parquet-go v1.4.0/go.mod h1:on8bl2K/PEouGNEJqxht0t3K4IyN/ABeFu84Hh3lzrE=
github.com/xitongsys/parquet-go-source v0.0.0-20191104003508-ecfa341356a6 h1:KPDKkdchSII+K5KS7iMpE062MVh2OucaM31599ER4U0=
github.com/xitongsys/parquet-go-source v0.0.0-20191104003508-ecfa341356a6/go.mod h1:xxCx7Wpym/3QCo6JhujJX51dzSXrwmb0oH6FQb39SEA=
github.com/yudai/gojsondiff v1.0.0 h1:27cbfqXLVEJ1o8I6v3y9lg8Ydm53EKqHXAOMxEGlCOA=
github.com/yudai/gojsondiff v1.0.0/go.mod h1:AY32+k2cwILAkW1fbgxQ5mUmMiZFgLIV+FBNExI05xg=
github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82 h1:BHyfKlQyqbsFN5p3IfnEUduWvb9is428/nNb5L3U01M=
@@ -9,6 +9,7 @@ import (
"github.com/grafana/grafana/pkg/models"
"github.com/grafana/grafana/pkg/registry"
_ "github.com/jung-kurt/gofpdf"
_ "github.com/linkedin/goavro/v2"
_ "github.com/pkg/errors"
_ "github.com/robfig/cron"
_ "github.com/robfig/cron/v3"
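For context, here is a minimal sketch (not part of this commit) of how an extension might use the goavro/v2 codec pulled in above; the schema and field names are illustrative assumptions, not taken from Grafana:

package main

import (
	"fmt"

	"github.com/linkedin/goavro/v2"
)

func main() {
	// Compile an Avro schema into a codec.
	codec, err := goavro.NewCodec(`{
		"type": "record", "name": "Metric",
		"fields": [
			{"name": "name", "type": "string"},
			{"name": "value", "type": "double"}
		]}`)
	if err != nil {
		panic(err)
	}

	// Encode a native Go value to Avro binary...
	bin, err := codec.BinaryFromNative(nil, map[string]interface{}{
		"name": "cpu", "value": 0.42,
	})
	if err != nil {
		panic(err)
	}

	// ...and decode it back to a native value.
	native, _, err := codec.NativeFromBinary(bin)
	if err != nil {
		panic(err)
	}
	fmt.Println(native) // map[name:cpu value:0.42]
}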
cmd/snappytool/snappytool
testdata/bench
# These explicitly listed benchmark data files are for an obsolete version of
# snappy_test.go.
testdata/alice29.txt
testdata/asyoulik.txt
testdata/fireworks.jpeg
testdata/geo.protodata
testdata/html
testdata/html_x_4
testdata/kppkn.gtb
testdata/lcet10.txt
testdata/paper-100k.pdf
testdata/plrabn12.txt
testdata/urls.10K
# This is the official list of Snappy-Go authors for copyright purposes.
# This file is distinct from the CONTRIBUTORS files.
# See the latter for an explanation.
# Names should be added to this file as
# Name or Organization <email address>
# The email address is not required for organizations.
# Please keep the list sorted.
Damian Gryski <dgryski@gmail.com>
Google Inc.
Jan Mercl <0xjnml@gmail.com>
Rodolfo Carvalho <rhcarvalho@gmail.com>
Sebastien Binet <seb.binet@gmail.com>
# This is the official list of people who can contribute
# (and typically have contributed) code to the Snappy-Go repository.
# The AUTHORS file lists the copyright holders; this file
# lists people. For example, Google employees are listed here
# but not in AUTHORS, because Google holds the copyright.
#
# The submission process automatically checks to make sure
# that people submitting code are listed in this file (by email address).
#
# Names should be added to this file only after verifying that
# the individual or the individual's organization has agreed to
# the appropriate Contributor License Agreement, found here:
#
# http://code.google.com/legal/individual-cla-v1.0.html
# http://code.google.com/legal/corporate-cla-v1.0.html
#
# The agreement for individuals can be filled out on the web.
#
# When adding J Random Contributor's name to this file,
# either J's name or J's organization's name should be
# added to the AUTHORS file, depending on whether the
# individual or corporate CLA was used.
# Names should be added to this file like so:
# Name <email address>
# Please keep the list sorted.
Damian Gryski <dgryski@gmail.com>
Jan Mercl <0xjnml@gmail.com>
Kai Backman <kaib@golang.org>
Marc-Antoine Ruel <maruel@chromium.org>
Nigel Tao <nigeltao@golang.org>
Rob Pike <r@golang.org>
Rodolfo Carvalho <rhcarvalho@gmail.com>
Russ Cox <rsc@golang.org>
Sebastien Binet <seb.binet@gmail.com>
Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The Snappy compression format in the Go programming language.
To download and install from source:
$ go get github.com/golang/snappy
Unless otherwise noted, the Snappy-Go source files are distributed
under the BSD-style license found in the LICENSE file.
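As a quick usage sketch (illustrative, not part of this README), the block
format round-trips via Encode and Decode:

package main

import (
	"fmt"

	"github.com/golang/snappy"
)

func main() {
	src := []byte("hello hello hello hello")
	enc := snappy.Encode(nil, src)      // compress a single block
	dec, err := snappy.Decode(nil, enc) // nil dst: Decode allocates
	if err != nil {
		panic(err)
	}
	fmt.Printf("%d -> %d bytes, ok=%v\n", len(src), len(enc), string(dec) == string(src))
}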
Benchmarks.
The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
or so files, the same set used by the C++ Snappy code (github.com/google/snappy
and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29:
"go test -test.bench=."
_UFlat0-8 2.19GB/s ± 0% html
_UFlat1-8 1.41GB/s ± 0% urls
_UFlat2-8 23.5GB/s ± 2% jpg
_UFlat3-8 1.91GB/s ± 0% jpg_200
_UFlat4-8 14.0GB/s ± 1% pdf
_UFlat5-8 1.97GB/s ± 0% html4
_UFlat6-8 814MB/s ± 0% txt1
_UFlat7-8 785MB/s ± 0% txt2
_UFlat8-8 857MB/s ± 0% txt3
_UFlat9-8 719MB/s ± 1% txt4
_UFlat10-8 2.84GB/s ± 0% pb
_UFlat11-8 1.05GB/s ± 0% gaviota
_ZFlat0-8 1.04GB/s ± 0% html
_ZFlat1-8 534MB/s ± 0% urls
_ZFlat2-8 15.7GB/s ± 1% jpg
_ZFlat3-8 740MB/s ± 3% jpg_200
_ZFlat4-8 9.20GB/s ± 1% pdf
_ZFlat5-8 991MB/s ± 0% html4
_ZFlat6-8 379MB/s ± 0% txt1
_ZFlat7-8 352MB/s ± 0% txt2
_ZFlat8-8 396MB/s ± 1% txt3
_ZFlat9-8 327MB/s ± 1% txt4
_ZFlat10-8 1.33GB/s ± 1% pb
_ZFlat11-8 605MB/s ± 1% gaviota
"go test -test.bench=. -tags=noasm"
_UFlat0-8 621MB/s ± 2% html
_UFlat1-8 494MB/s ± 1% urls
_UFlat2-8 23.2GB/s ± 1% jpg
_UFlat3-8 1.12GB/s ± 1% jpg_200
_UFlat4-8 4.35GB/s ± 1% pdf
_UFlat5-8 609MB/s ± 0% html4
_UFlat6-8 296MB/s ± 0% txt1
_UFlat7-8 288MB/s ± 0% txt2
_UFlat8-8 309MB/s ± 1% txt3
_UFlat9-8 280MB/s ± 1% txt4
_UFlat10-8 753MB/s ± 0% pb
_UFlat11-8 400MB/s ± 0% gaviota
_ZFlat0-8 409MB/s ± 1% html
_ZFlat1-8 250MB/s ± 1% urls
_ZFlat2-8 12.3GB/s ± 1% jpg
_ZFlat3-8 132MB/s ± 0% jpg_200
_ZFlat4-8 2.92GB/s ± 0% pdf
_ZFlat5-8 405MB/s ± 1% html4
_ZFlat6-8 179MB/s ± 1% txt1
_ZFlat7-8 170MB/s ± 1% txt2
_ZFlat8-8 189MB/s ± 1% txt3
_ZFlat9-8 164MB/s ± 1% txt4
_ZFlat10-8 479MB/s ± 1% pb
_ZFlat11-8 270MB/s ± 1% gaviota
For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
are the numbers from C++ Snappy's
make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
BM_UFlat/0 2.4GB/s html
BM_UFlat/1 1.4GB/s urls
BM_UFlat/2 21.8GB/s jpg
BM_UFlat/3 1.5GB/s jpg_200
BM_UFlat/4 13.3GB/s pdf
BM_UFlat/5 2.1GB/s html4
BM_UFlat/6 1.0GB/s txt1
BM_UFlat/7 959.4MB/s txt2
BM_UFlat/8 1.0GB/s txt3
BM_UFlat/9 864.5MB/s txt4
BM_UFlat/10 2.9GB/s pb
BM_UFlat/11 1.2GB/s gaviota
BM_ZFlat/0 944.3MB/s html (22.31 %)
BM_ZFlat/1 501.6MB/s urls (47.78 %)
BM_ZFlat/2 14.3GB/s jpg (99.95 %)
BM_ZFlat/3 538.3MB/s jpg_200 (73.00 %)
BM_ZFlat/4 8.3GB/s pdf (83.30 %)
BM_ZFlat/5 903.5MB/s html4 (22.52 %)
BM_ZFlat/6 336.0MB/s txt1 (57.88 %)
BM_ZFlat/7 312.3MB/s txt2 (61.91 %)
BM_ZFlat/8 353.1MB/s txt3 (54.99 %)
BM_ZFlat/9 289.9MB/s txt4 (66.26 %)
BM_ZFlat/10 1.2GB/s pb (19.68 %)
BM_ZFlat/11 527.4MB/s gaviota (37.72 %)
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package snappy
import (
"encoding/binary"
"errors"
"io"
)
var (
// ErrCorrupt reports that the input is invalid.
ErrCorrupt = errors.New("snappy: corrupt input")
// ErrTooLarge reports that the uncompressed length is too large.
ErrTooLarge = errors.New("snappy: decoded block is too large")
// ErrUnsupported reports that the input isn't supported.
ErrUnsupported = errors.New("snappy: unsupported input")
errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
)
// DecodedLen returns the length of the decoded block.
func DecodedLen(src []byte) (int, error) {
v, _, err := decodedLen(src)
return v, err
}
// decodedLen returns the length of the decoded block and the number of bytes
// that the length header occupied.
func decodedLen(src []byte) (blockLen, headerLen int, err error) {
v, n := binary.Uvarint(src)
if n <= 0 || v > 0xffffffff {
return 0, 0, ErrCorrupt
}
const wordSize = 32 << (^uint(0) >> 32 & 1)
if wordSize == 32 && v > 0x7fffffff {
return 0, 0, ErrTooLarge
}
return int(v), n, nil
}
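// As a worked illustration (not part of the original source): the length
// header is a standard uvarint, so an uncompressed length of 300 is encoded
// as the two bytes 0xac 0x02, and decodedLen on a block starting with those
// bytes returns (300, 2, nil).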
const (
decodeErrCodeCorrupt = 1
decodeErrCodeUnsupportedLiteralLength = 2
)
// Decode returns the decoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire decoded block.
// Otherwise, a newly allocated slice will be returned.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
func Decode(dst, src []byte) ([]byte, error) {
dLen, s, err := decodedLen(src)
if err != nil {
return nil, err
}
if dLen <= len(dst) {
dst = dst[:dLen]
} else {
dst = make([]byte, dLen)
}
switch decode(dst, src[s:]) {
case 0:
return dst, nil
case decodeErrCodeUnsupportedLiteralLength:
return nil, errUnsupportedLiteralLength
}
return nil, ErrCorrupt
}
// NewReader returns a new Reader that decompresses from r, using the framing
// format described at
// https://github.com/google/snappy/blob/master/framing_format.txt
func NewReader(r io.Reader) *Reader {
return &Reader{
r: r,
decoded: make([]byte, maxBlockSize),
buf: make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
}
}
// Reader is an io.Reader that can read Snappy-compressed bytes.
type Reader struct {
r io.Reader
err error
decoded []byte
buf []byte
// decoded[i:j] contains decoded bytes that have not yet been passed on.
i, j int
readHeader bool
}
// Reset discards any buffered data, resets all state, and switches the Snappy
// reader to read from r. This permits reusing a Reader rather than allocating
// a new one.
func (r *Reader) Reset(reader io.Reader) {
r.r = reader
r.err = nil
r.i = 0
r.j = 0
r.readHeader = false
}
func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
if _, r.err = io.ReadFull(r.r, p); r.err != nil {
if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
r.err = ErrCorrupt
}
return false
}
return true
}
// Read satisfies the io.Reader interface.
func (r *Reader) Read(p []byte) (int, error) {
if r.err != nil {
return 0, r.err
}
for {
if r.i < r.j {
n := copy(p, r.decoded[r.i:r.j])
r.i += n
return n, nil
}
if !r.readFull(r.buf[:4], true) {
return 0, r.err
}
chunkType := r.buf[0]
if !r.readHeader {
if chunkType != chunkTypeStreamIdentifier {
r.err = ErrCorrupt
return 0, r.err
}
r.readHeader = true
}
chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
if chunkLen > len(r.buf) {
r.err = ErrUnsupported
return 0, r.err
}
// The chunk types are specified at
// https://github.com/google/snappy/blob/master/framing_format.txt
switch chunkType {
case chunkTypeCompressedData:
// Section 4.2. Compressed data (chunk type 0x00).
if chunkLen < checksumSize {
r.err = ErrCorrupt
return 0, r.err
}
buf := r.buf[:chunkLen]
if !r.readFull(buf, false) {
return 0, r.err
}
checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
buf = buf[checksumSize:]
n, err := DecodedLen(buf)
if err != nil {
r.err = err
return 0, r.err
}
if n > len(r.decoded) {
r.err = ErrCorrupt
return 0, r.err
}
if _, err := Decode(r.decoded, buf); err != nil {
r.err = err
return 0, r.err
}
if crc(r.decoded[:n]) != checksum {
r.err = ErrCorrupt
return 0, r.err
}
r.i, r.j = 0, n
continue
case chunkTypeUncompressedData:
// Section 4.3. Uncompressed data (chunk type 0x01).
if chunkLen < checksumSize {
r.err = ErrCorrupt
return 0, r.err
}
buf := r.buf[:checksumSize]
if !r.readFull(buf, false) {
return 0, r.err
}
checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
// Read directly into r.decoded instead of via r.buf.
n := chunkLen - checksumSize
if n > len(r.decoded) {
r.err = ErrCorrupt
return 0, r.err
}
if !r.readFull(r.decoded[:n], false) {
return 0, r.err
}
if crc(r.decoded[:n]) != checksum {
r.err = ErrCorrupt
return 0, r.err
}
r.i, r.j = 0, n
continue
case chunkTypeStreamIdentifier:
// Section 4.1. Stream identifier (chunk type 0xff).
if chunkLen != len(magicBody) {
r.err = ErrCorrupt
return 0, r.err
}
if !r.readFull(r.buf[:len(magicBody)], false) {
return 0, r.err
}
for i := 0; i < len(magicBody); i++ {
if r.buf[i] != magicBody[i] {
r.err = ErrCorrupt
return 0, r.err
}
}
continue
}
if chunkType <= 0x7f {
// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
r.err = ErrUnsupported
return 0, r.err
}
// Section 4.4 Padding (chunk type 0xfe).
// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
if !r.readFull(r.buf[:chunkLen], false) {
return 0, r.err
}
}
}
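A minimal sketch of the framing format in use (illustrative, not from this file); NewBufferedWriter produces the stream that NewReader consumes:

package main

import (
	"bytes"
	"fmt"
	"io"

	"github.com/golang/snappy"
)

func main() {
	// Compress into the framing format...
	var stream bytes.Buffer
	w := snappy.NewBufferedWriter(&stream)
	if _, err := w.Write([]byte("framed snappy data")); err != nil {
		panic(err)
	}
	if err := w.Close(); err != nil { // Close flushes the last chunk.
		panic(err)
	}

	// ...and decompress it with a Reader.
	var out bytes.Buffer
	if _, err := io.Copy(&out, snappy.NewReader(&stream)); err != nil {
		panic(err)
	}
	fmt.Println(out.String()) // framed snappy data
}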
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine
// +build gc
// +build !noasm
package snappy
// decode has the same semantics as in decode_other.go.
//
//go:noescape
func decode(dst, src []byte) int
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
// The asm code generally follows the pure Go code in decode_other.go, except
// where marked with a "!!!".
// func decode(dst, src []byte) int
//
// All local variables fit into registers. The non-zero stack size is only to
// spill registers and push args when issuing a CALL. The register allocation:
// - AX scratch
// - BX scratch
// - CX length or x
// - DX offset
// - SI &src[s]
// - DI &dst[d]
// + R8 dst_base
// + R9 dst_len
// + R10 dst_base + dst_len
// + R11 src_base
// + R12 src_len
// + R13 src_base + src_len
// - R14 used by doCopy
// - R15 used by doCopy
//
// The registers R8-R13 (marked with a "+") are set at the start of the
// function, and after a CALL returns, and are not otherwise modified.
//
// The d variable is implicitly DI - R8, and len(dst)-d is R10 - DI.
// The s variable is implicitly SI - R11, and len(src)-s is R13 - SI.
TEXT ·decode(SB), NOSPLIT, $48-56
// Initialize SI, DI and R8-R13.
MOVQ dst_base+0(FP), R8
MOVQ dst_len+8(FP), R9
MOVQ R8, DI
MOVQ R8, R10
ADDQ R9, R10
MOVQ src_base+24(FP), R11
MOVQ src_len+32(FP), R12
MOVQ R11, SI
MOVQ R11, R13
ADDQ R12, R13
loop:
// for s < len(src)
CMPQ SI, R13
JEQ end
// CX = uint32(src[s])
//
// switch src[s] & 0x03
MOVBLZX (SI), CX
MOVL CX, BX
ANDL $3, BX
CMPL BX, $1
JAE tagCopy
// ----------------------------------------
// The code below handles literal tags.
// case tagLiteral:
// x := uint32(src[s] >> 2)
// switch
SHRL $2, CX
CMPL CX, $60
JAE tagLit60Plus
// case x < 60:
// s++
INCQ SI
doLit:
// This is the end of the inner "switch", when we have a literal tag.
//
// We assume that CX == x and x fits in a uint32, where x is the variable
// used in the pure Go decode_other.go code.
// length = int(x) + 1
//
// Unlike the pure Go code, we don't need to check if length <= 0 because
// CX can hold 64 bits, so the increment cannot overflow.
INCQ CX
// Prepare to check if copying length bytes will run past the end of dst or
// src.
//
// AX = len(dst) - d
// BX = len(src) - s
MOVQ R10, AX
SUBQ DI, AX
MOVQ R13, BX
SUBQ SI, BX
// !!! Try a faster technique for short (16 or fewer bytes) copies.
//
// if length > 16 || len(dst)-d < 16 || len(src)-s < 16 {
// goto callMemmove // Fall back on calling runtime·memmove.
// }
//
// The C++ snappy code calls this TryFastAppend. It also checks len(src)-s
// against 21 instead of 16, because it cannot assume that all of its input
// is contiguous in memory and so it needs to leave enough source bytes to
// read the next tag without refilling buffers, but Go's Decode assumes
// contiguousness (the src argument is a []byte).
CMPQ CX, $16
JGT callMemmove
CMPQ AX, $16
JLT callMemmove
CMPQ BX, $16
JLT callMemmove
// !!! Implement the copy from src to dst as a 16-byte load and store.
// (Decode's documentation says that dst and src must not overlap.)
//
// This always copies 16 bytes, instead of only length bytes, but that's
// OK. If the input is a valid Snappy encoding then subsequent iterations
// will fix up the overrun. Otherwise, Decode returns a nil []byte (and a
// non-nil error), so the overrun will be ignored.
//
// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
// 16-byte loads and stores. This technique probably wouldn't be as
// effective on architectures that are fussier about alignment.
MOVOU 0(SI), X0
MOVOU X0, 0(DI)
// d += length
// s += length
ADDQ CX, DI
ADDQ CX, SI
JMP loop
callMemmove:
// if length > len(dst)-d || length > len(src)-s { etc }
CMPQ CX, AX
JGT errCorrupt
CMPQ CX, BX
JGT errCorrupt
// copy(dst[d:], src[s:s+length])
//
// This means calling runtime·memmove(&dst[d], &src[s], length), so we push
// DI, SI and CX as arguments. Coincidentally, we also need to spill those
// three registers to the stack, to save local variables across the CALL.
MOVQ DI, 0(SP)
MOVQ SI, 8(SP)
MOVQ CX, 16(SP)
MOVQ DI, 24(SP)
MOVQ SI, 32(SP)
MOVQ CX, 40(SP)
CALL runtime·memmove(SB)
// Restore local variables: unspill registers from the stack and
// re-calculate R8-R13.
MOVQ 24(SP), DI
MOVQ 32(SP), SI
MOVQ 40(SP), CX
MOVQ dst_base+0(FP), R8
MOVQ dst_len+8(FP), R9
MOVQ R8, R10
ADDQ R9, R10
MOVQ src_base+24(FP), R11
MOVQ src_len+32(FP), R12
MOVQ R11, R13
ADDQ R12, R13
// d += length
// s += length
ADDQ CX, DI
ADDQ CX, SI
JMP loop
tagLit60Plus:
// !!! This fragment does the
//
// s += x - 58; if uint(s) > uint(len(src)) { etc }
//
// checks. In the asm version, we code it once instead of once per switch case.
ADDQ CX, SI
SUBQ $58, SI
MOVQ SI, BX
SUBQ R11, BX
CMPQ BX, R12
JA errCorrupt
// case x == 60:
CMPL CX, $61
JEQ tagLit61
JA tagLit62Plus
// x = uint32(src[s-1])
MOVBLZX -1(SI), CX
JMP doLit
tagLit61:
// case x == 61:
// x = uint32(src[s-2]) | uint32(src[s-1])<<8
MOVWLZX -2(SI), CX
JMP doLit
tagLit62Plus:
CMPL CX, $62
JA tagLit63
// case x == 62:
// x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
MOVWLZX -3(SI), CX
MOVBLZX -1(SI), BX
SHLL $16, BX
ORL BX, CX
JMP doLit
tagLit63:
// case x == 63:
// x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
MOVL -4(SI), CX
JMP doLit
// The code above handles literal tags.
// ----------------------------------------
// The code below handles copy tags.
tagCopy4:
// case tagCopy4:
// s += 5
ADDQ $5, SI
// if uint(s) > uint(len(src)) { etc }
MOVQ SI, BX
SUBQ R11, BX
CMPQ BX, R12
JA errCorrupt
// length = 1 + int(src[s-5])>>2
SHRQ $2, CX
INCQ CX
// offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
MOVLQZX -4(SI), DX
JMP doCopy
tagCopy2:
// case tagCopy2:
// s += 3
ADDQ $3, SI
// if uint(s) > uint(len(src)) { etc }
MOVQ SI, BX
SUBQ R11, BX
CMPQ BX, R12
JA errCorrupt
// length = 1 + int(src[s-3])>>2
SHRQ $2, CX
INCQ CX
// offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
MOVWQZX -2(SI), DX
JMP doCopy
tagCopy:
// We have a copy tag. We assume that:
// - BX == src[s] & 0x03
// - CX == src[s]
CMPQ BX, $2
JEQ tagCopy2
JA tagCopy4
// case tagCopy1:
// s += 2
ADDQ $2, SI
// if uint(s) > uint(len(src)) { etc }
MOVQ SI, BX
SUBQ R11, BX
CMPQ BX, R12
JA errCorrupt
// offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
MOVQ CX, DX
ANDQ $0xe0, DX
SHLQ $3, DX
MOVBQZX -1(SI), BX
ORQ BX, DX
// length = 4 + int(src[s-2])>>2&0x7
SHRQ $2, CX
ANDQ $7, CX
ADDQ $4, CX
doCopy:
// This is the end of the outer "switch", when we have a copy tag.
//
// We assume that:
// - CX == length && CX > 0
// - DX == offset
// if offset <= 0 { etc }
CMPQ DX, $0
JLE errCorrupt
// if d < offset { etc }
MOVQ DI, BX
SUBQ R8, BX
CMPQ BX, DX
JLT errCorrupt
// if length > len(dst)-d { etc }
MOVQ R10, BX
SUBQ DI, BX
CMPQ CX, BX
JGT errCorrupt
// forwardCopy(dst[d:d+length], dst[d-offset:]); d += length
//
// Set:
// - R14 = len(dst)-d
// - R15 = &dst[d-offset]
MOVQ R10, R14
SUBQ DI, R14
MOVQ DI, R15
SUBQ DX, R15
// !!! Try a faster technique for short (16 or fewer bytes) forward copies.
//
// First, try using two 8-byte load/stores, similar to the doLit technique
// above. Even if dst[d:d+length] and dst[d-offset:] can overlap, this is
// still OK if offset >= 8. Note that this has to be two 8-byte load/stores
// and not one 16-byte load/store, and the first store has to be before the
// second load, due to the overlap if offset is in the range [8, 16).
//
// if length > 16 || offset < 8 || len(dst)-d < 16 {
// goto slowForwardCopy
// }
// copy 16 bytes
// d += length
CMPQ CX, $16
JGT slowForwardCopy
CMPQ DX, $8
JLT slowForwardCopy
CMPQ R14, $16
JLT slowForwardCopy
MOVQ 0(R15), AX
MOVQ AX, 0(DI)
MOVQ 8(R15), BX
MOVQ BX, 8(DI)
ADDQ CX, DI
JMP loop
slowForwardCopy:
// !!! If the forward copy is longer than 16 bytes, or if offset < 8, we
// can still try 8-byte load stores, provided we can overrun up to 10 extra
// bytes. As above, the overrun will be fixed up by subsequent iterations
// of the outermost loop.
//
// The C++ snappy code calls this technique IncrementalCopyFastPath. Its
// commentary says:
//
// ----
//
// The main part of this loop is a simple copy of eight bytes at a time
// until we've copied (at least) the requested amount of bytes. However,
// if d and d-offset are less than eight bytes apart (indicating a
// repeating pattern of length < 8), we first need to expand the pattern in
// order to get the correct results. For instance, if the buffer looks like
// this, with the eight-byte <d-offset> and <d> patterns marked as
// intervals:
//
// abxxxxxxxxxxxx
// [------] d-offset
// [------] d
//
// a single eight-byte copy from <d-offset> to <d> will repeat the pattern
// once, after which we can move <d> two bytes without moving <d-offset>:
//
// ababxxxxxxxxxx
// [------] d-offset
// [------] d
//
// and repeat the exercise until the two no longer overlap.
//
// This allows us to do very well in the special case of one single byte
// repeated many times, without taking a big hit for more general cases.
//
// The worst case of extra writing past the end of the match occurs when
// offset == 1 and length == 1; the last copy will read from byte positions
// [0..7] and write to [4..11], whereas it was only supposed to write to
// position 1. Thus, ten excess bytes.
//
// ----
//
// That "10 byte overrun" worst case is confirmed by Go's
// TestSlowForwardCopyOverrun, which also tests the fixUpSlowForwardCopy
// and finishSlowForwardCopy algorithm.
//
// if length > len(dst)-d-10 {
// goto verySlowForwardCopy
// }
SUBQ $10, R14
CMPQ CX, R14
JGT verySlowForwardCopy
makeOffsetAtLeast8:
// !!! As above, expand the pattern so that offset >= 8 and we can use
// 8-byte load/stores.
//
// for offset < 8 {
// copy 8 bytes from dst[d-offset:] to dst[d:]
// length -= offset
// d += offset
// offset += offset
// // The two previous lines together means that d-offset, and therefore
// // R15, is unchanged.
// }
CMPQ DX, $8
JGE fixUpSlowForwardCopy
MOVQ (R15), BX
MOVQ BX, (DI)
SUBQ DX, CX
ADDQ DX, DI
ADDQ DX, DX
JMP makeOffsetAtLeast8
fixUpSlowForwardCopy:
// !!! Add length (which might be negative now) to d (implied by DI being
// &dst[d]) so that d ends up at the right place when we jump back to the
// top of the loop. Before we do that, though, we save DI to AX so that, if
// length is positive, copying the remaining length bytes will write to the
// right place.
MOVQ DI, AX
ADDQ CX, DI
finishSlowForwardCopy:
// !!! Repeat 8-byte load/stores until length <= 0. Ending with a negative
// length means that we overrun, but as above, that will be fixed up by
// subsequent iterations of the outermost loop.
CMPQ CX, $0
JLE loop
MOVQ (R15), BX
MOVQ BX, (AX)
ADDQ $8, R15
ADDQ $8, AX
SUBQ $8, CX
JMP finishSlowForwardCopy
verySlowForwardCopy:
// verySlowForwardCopy is a simple implementation of forward copy. In C
// parlance, this is a do/while loop instead of a while loop, since we know
// that length > 0. In Go syntax:
//
// for {
// dst[d] = dst[d - offset]
// d++
// length--
// if length == 0 {
// break
// }
// }
MOVB (R15), BX
MOVB BX, (DI)
INCQ R15
INCQ DI
DECQ CX
JNZ verySlowForwardCopy
JMP loop
// The code above handles copy tags.
// ----------------------------------------
end:
// This is the end of the "for s < len(src)".
//
// if d != len(dst) { etc }
CMPQ DI, R10
JNE errCorrupt
// return 0
MOVQ $0, ret+48(FP)
RET
errCorrupt:
// return decodeErrCodeCorrupt
MOVQ $1, ret+48(FP)
RET
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64 appengine !gc noasm
package snappy
// decode writes the decoding of src to dst. It assumes that the varint-encoded
// length of the decompressed bytes has already been read, and that len(dst)
// equals that length.
//
// It returns 0 on success or a decodeErrCodeXxx error code on failure.
func decode(dst, src []byte) int {
var d, s, offset, length int
for s < len(src) {
switch src[s] & 0x03 {
case tagLiteral:
x := uint32(src[s] >> 2)
switch {
case x < 60:
s++
case x == 60:
s += 2
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-1])
case x == 61:
s += 3
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-2]) | uint32(src[s-1])<<8
case x == 62:
s += 4
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
case x == 63:
s += 5
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
}
length = int(x) + 1
if length <= 0 {
return decodeErrCodeUnsupportedLiteralLength
}
if length > len(dst)-d || length > len(src)-s {
return decodeErrCodeCorrupt
}
copy(dst[d:], src[s:s+length])
d += length
s += length
continue
case tagCopy1:
s += 2
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = 4 + int(src[s-2])>>2&0x7
offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
case tagCopy2:
s += 3
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = 1 + int(src[s-3])>>2
offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
case tagCopy4:
s += 5
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = 1 + int(src[s-5])>>2
offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
}
if offset <= 0 || d < offset || length > len(dst)-d {
return decodeErrCodeCorrupt
}
// Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
// the built-in copy function, this byte-by-byte copy always runs
// forwards, even if the slices overlap. Conceptually, this is:
//
// d += forwardCopy(dst[d:d+length], dst[d-offset:])
for end := d + length; d != end; d++ {
dst[d] = dst[d-offset]
}
}
if d != len(dst) {
return decodeErrCodeCorrupt
}
return 0
}
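As a worked example of the block format that decode consumes (a sketch, hand-assembled rather than taken from the source):

package main

import (
	"fmt"

	"github.com/golang/snappy"
)

func main() {
	// 0x03: uvarint header, decoded length 3.
	// 0x08: literal tag, (3-1)<<2 | tagLiteral.
	// Then the three literal bytes themselves.
	block := []byte{0x03, 0x08, 'a', 'b', 'c'}
	out, err := snappy.Decode(nil, block)
	fmt.Println(string(out), err) // abc <nil>
}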
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package snappy
import (
"encoding/binary"
"errors"
"io"
)
// Encode returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
func Encode(dst, src []byte) []byte {
if n := MaxEncodedLen(len(src)); n < 0 {
panic(ErrTooLarge)
} else if len(dst) < n {
dst = make([]byte, n)
}
// The block starts with the varint-encoded length of the decompressed bytes.
d := binary.PutUvarint(dst, uint64(len(src)))
for len(src) > 0 {
p := src
src = nil
if len(p) > maxBlockSize {
p, src = p[:maxBlockSize], p[maxBlockSize:]
}
if len(p) < minNonLiteralBlockSize {
d += emitLiteral(dst[d:], p)
} else {
d += encodeBlock(dst[d:], p)
}
}
return dst[:d]
}
// inputMargin is the minimum number of extra input bytes to keep, inside
// encodeBlock's inner loop. On some architectures, this margin lets us
// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
// literals can be implemented as a single load to and store from a 16-byte
// register. That literal's actual length can be as short as 1 byte, so this
// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
// the encoding loop will fix up the copy overrun, and this inputMargin ensures
// that we don't overrun the dst and src buffers.
const inputMargin = 16 - 1
// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
// could be encoded with a copy tag. This is the minimum with respect to the
// algorithm used by encodeBlock, not a minimum enforced by the file format.
//
// The encoded output must start with at least a 1 byte literal, as there are
// no previous bytes to copy. A minimal (1 byte) copy after that, generated
// from an emitCopy call in encodeBlock's main loop, would require at least
// another inputMargin bytes, for the reason above: we want any emitLiteral
// calls inside encodeBlock's main loop to use the fast path if possible, which
// requires being able to overrun by inputMargin bytes. Thus,
// minNonLiteralBlockSize equals 1 + 1 + inputMargin.
//
// The C++ code doesn't use this exact threshold, but it could, as discussed at
// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
// optimization. It should not affect the encoded form. This is tested by
// TestSameEncodingAsCppShortCopies.
const minNonLiteralBlockSize = 1 + 1 + inputMargin
// MaxEncodedLen returns the maximum length of a snappy block, given its
// uncompressed length.
//
// It will return a negative value if srcLen is too large to encode.
func MaxEncodedLen(srcLen int) int {
n := uint64(srcLen)
if n > 0xffffffff {
return -1
}
// Compressed data can be defined as:
// compressed := item* literal*
// item := literal* copy
//
// The trailing literal sequence has a space blowup of at most 62/60
// since a literal of length 60 needs one tag byte + one extra byte
// for length information.
//
// Item blowup is trickier to measure. Suppose the "copy" op copies
// 4 bytes of data. Because of a special check in the encoding code,
// we produce a 4-byte copy only if the offset is < 65536. Therefore
// the copy op takes 3 bytes to encode, and this type of item leads
// to at most the 62/60 blowup for representing literals.
//
// Suppose the "copy" op copies 5 bytes of data. If the offset is big
// enough, it will take 5 bytes to encode the copy op. Therefore the
// worst case here is a one-byte literal followed by a five-byte copy.
// That is, 6 bytes of input turn into 7 bytes of "compressed" data.
//
// This last factor dominates the blowup, so the final estimate is:
n = 32 + n + n/6
if n > 0xffffffff {
return -1
}
return int(n)
}
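// A worked check of the bound above (illustrative): for srcLen = 1<<20,
// n = 32 + 1048576 + 1048576/6 = 32 + 1048576 + 174762 = 1223370, so
// MaxEncodedLen(1<<20) returns 1223370.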
var errClosed = errors.New("snappy: Writer is closed")
// NewWriter returns a new Writer that compresses to w.
//
// The Writer returned does not buffer writes. There is no need to Flush or
// Close such a Writer.
//
// Deprecated: the Writer returned is not suitable for many small writes, only
// for few large writes. Use NewBufferedWriter instead, which is efficient
// regardless of the frequency and shape of the writes, and remember to Close
// that Writer when done.
func NewWriter(w io.Writer) *Writer {
return &Writer{
w: w,
obuf: make([]byte, obufLen),
}
}
// NewBufferedWriter returns a new Writer that compresses to w, using the
// framing format described at
// https://github.com/google/snappy/blob/master/framing_format.txt
//
// The Writer returned buffers writes. Users must call Close to guarantee all
// data has been forwarded to the underlying io.Writer. They may also call
// Flush zero or more times before calling Close.
func NewBufferedWriter(w io.Writer) *Writer {
return &Writer{
w: w,
ibuf: make([]byte, 0, maxBlockSize),
obuf: make([]byte, obufLen),
}
}
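// A typical use (a sketch; f is any io.Writer you already have):
//
//	w := snappy.NewBufferedWriter(f)
//	defer w.Close() // Close flushes buffered data as a final chunk.
//	if _, err := w.Write(data); err != nil {
//		// handle err
//	}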
// Writer is an io.Writer that can write Snappy-compressed bytes.
type Writer struct {
w io.Writer
err error
// ibuf is a buffer for the incoming (uncompressed) bytes.
//
// Its use is optional. For backwards compatibility, Writers created by the
// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
// therefore do not need to be Flush'ed or Close'd.
ibuf []byte
// obuf is a buffer for the outgoing (compressed) bytes.
obuf []byte
// wroteStreamHeader is whether we have written the stream header.
wroteStreamHeader bool
}
// Reset discards the writer's state and switches the Snappy writer to write to
// w. This permits reusing a Writer rather than allocating a new one.
func (w *Writer) Reset(writer io.Writer) {
w.w = writer
w.err = nil
if w.ibuf != nil {
w.ibuf = w.ibuf[:0]
}
w.wroteStreamHeader = false
}
// Write satisfies the io.Writer interface.
func (w *Writer) Write(p []byte) (nRet int, errRet error) {
if w.ibuf == nil {
// Do not buffer incoming bytes. This does not perform or compress well
// if the caller of Writer.Write writes many small slices. This
// behavior is therefore deprecated, but still supported for backwards
// compatibility with code that doesn't explicitly Flush or Close.
return w.write(p)
}
// The remainder of this method is based on bufio.Writer.Write from the
// standard library.
for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
var n int
if len(w.ibuf) == 0 {
// Large write, empty buffer.
// Write directly from p to avoid copy.
n, _ = w.write(p)
} else {
n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
w.ibuf = w.ibuf[:len(w.ibuf)+n]
w.Flush()
}
nRet += n
p = p[n:]
}
if w.err != nil {
return nRet, w.err
}
n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
w.ibuf = w.ibuf[:len(w.ibuf)+n]
nRet += n
return nRet, nil
}
func (w *Writer) write(p []byte) (nRet int, errRet error) {
if w.err != nil {
return 0, w.err
}
for len(p) > 0 {
obufStart := len(magicChunk)
if !w.wroteStreamHeader {
w.wroteStreamHeader = true
copy(w.obuf, magicChunk)
obufStart = 0
}
var uncompressed []byte
if len(p) > maxBlockSize {
uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
} else {
uncompressed, p = p, nil
}
checksum := crc(uncompressed)
// Compress the buffer, discarding the result if the improvement
// isn't at least 12.5%.
compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
chunkType := uint8(chunkTypeCompressedData)
chunkLen := 4 + len(compressed)
obufEnd := obufHeaderLen + len(compressed)
if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
chunkType = chunkTypeUncompressedData
chunkLen = 4 + len(uncompressed)
obufEnd = obufHeaderLen
}
// Fill in the per-chunk header that comes before the body.
w.obuf[len(magicChunk)+0] = chunkType
w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0)
w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
w.obuf[len(magicChunk)+4] = uint8(checksum >> 0)
w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)
if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
w.err = err
return nRet, err
}
if chunkType == chunkTypeUncompressedData {
if _, err := w.w.Write(uncompressed); err != nil {
w.err = err
return nRet, err
}
}
nRet += len(uncompressed)
}
return nRet, nil
}
// Flush flushes the Writer to its underlying io.Writer.
func (w *Writer) Flush() error {
if w.err != nil {
return w.err
}
if len(w.ibuf) == 0 {
return nil
}
w.write(w.ibuf)
w.ibuf = w.ibuf[:0]
return w.err
}
// Close calls Flush and then closes the Writer.
func (w *Writer) Close() error {
w.Flush()
ret := w.err
if w.err == nil {
w.err = errClosed
}
return ret
}
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine
// +build gc
// +build !noasm
package snappy
// emitLiteral has the same semantics as in encode_other.go.
//
//go:noescape
func emitLiteral(dst, lit []byte) int
// emitCopy has the same semantics as in encode_other.go.
//
//go:noescape
func emitCopy(dst []byte, offset, length int) int
// extendMatch has the same semantics as in encode_other.go.
//
//go:noescape
func extendMatch(src []byte, i, j int) int
// encodeBlock has the same semantics as in encode_other.go.
//
//go:noescape
func encodeBlock(dst, src []byte) (d int)
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
// The XXX lines assemble on Go 1.4, 1.5 and 1.7, but not 1.6, due to a
// Go toolchain regression. See https://github.com/golang/go/issues/15426 and
// https://github.com/golang/snappy/issues/29
//
// As a workaround, the package was built with a known good assembler, and
// those instructions were disassembled by "objdump -d" to yield the
// 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
// style comments, in AT&T asm syntax. Note that rsp here is a physical
// register, not Go/asm's SP pseudo-register (see https://golang.org/doc/asm).
// The instructions were then encoded as "BYTE $0x.." sequences, which assemble
// fine on Go 1.6.
// The asm code generally follows the pure Go code in encode_other.go, except
// where marked with a "!!!".
// ----------------------------------------------------------------------------
// func emitLiteral(dst, lit []byte) int
//
// All local variables fit into registers. The register allocation:
// - AX len(lit)
// - BX n
// - DX return value
// - DI &dst[i]
// - R10 &lit[0]
//
// The 24 bytes of stack space is to call runtime·memmove.
//
// The unusual register allocation of local variables, such as R10 for the
// source pointer, matches the allocation used at the call site in encodeBlock,
// which makes it easier to manually inline this function.
TEXT ·emitLiteral(SB), NOSPLIT, $24-56
MOVQ dst_base+0(FP), DI
MOVQ lit_base+24(FP), R10
MOVQ lit_len+32(FP), AX
MOVQ AX, DX
MOVL AX, BX
SUBL $1, BX
CMPL BX, $60
JLT oneByte
CMPL BX, $256
JLT twoBytes
threeBytes:
MOVB $0xf4, 0(DI)
MOVW BX, 1(DI)
ADDQ $3, DI
ADDQ $3, DX
JMP memmove
twoBytes:
MOVB $0xf0, 0(DI)
MOVB BX, 1(DI)
ADDQ $2, DI
ADDQ $2, DX
JMP memmove
oneByte:
SHLB $2, BX
MOVB BX, 0(DI)
ADDQ $1, DI
ADDQ $1, DX
memmove:
MOVQ DX, ret+48(FP)
// copy(dst[i:], lit)
//
// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
// DI, R10 and AX as arguments.
MOVQ DI, 0(SP)
MOVQ R10, 8(SP)
MOVQ AX, 16(SP)
CALL runtime·memmove(SB)
RET
// ----------------------------------------------------------------------------
// func emitCopy(dst []byte, offset, length int) int
//
// All local variables fit into registers. The register allocation:
// - AX length
// - SI &dst[0]
// - DI &dst[i]
// - R11 offset
//
// The unusual register allocation of local variables, such as R11 for the
// offset, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
MOVQ dst_base+0(FP), DI
MOVQ DI, SI
MOVQ offset+24(FP), R11
MOVQ length+32(FP), AX
loop0:
// for length >= 68 { etc }
CMPL AX, $68
JLT step1
// Emit a length 64 copy, encoded as 3 bytes.
MOVB $0xfe, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
SUBL $64, AX
JMP loop0
step1:
// if length > 64 { etc }
CMPL AX, $64
JLE step2
// Emit a length 60 copy, encoded as 3 bytes.
MOVB $0xee, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
SUBL $60, AX
step2:
// if length >= 12 || offset >= 2048 { goto step3 }
CMPL AX, $12
JGE step3
CMPL R11, $2048
JGE step3
// Emit the remaining copy, encoded as 2 bytes.
MOVB R11, 1(DI)
SHRL $8, R11
SHLB $5, R11
SUBB $4, AX
SHLB $2, AX
ORB AX, R11
ORB $1, R11
MOVB R11, 0(DI)
ADDQ $2, DI
// Return the number of bytes written.
SUBQ SI, DI
MOVQ DI, ret+40(FP)
RET
step3:
// Emit the remaining copy, encoded as 3 bytes.
SUBL $1, AX
SHLB $2, AX
ORB $2, AX
MOVB AX, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
// Return the number of bytes written.
SUBQ SI, DI
MOVQ DI, ret+40(FP)
RET
// ----------------------------------------------------------------------------
// func extendMatch(src []byte, i, j int) int
//
// All local variables fit into registers. The register allocation:
// - DX &src[0]
// - SI &src[j]
// - R13 &src[len(src) - 8]
// - R14 &src[len(src)]
// - R15 &src[i]
//
// The unusual register allocation of local variables, such as R15 for a source
// pointer, matches the allocation used at the call site in encodeBlock, which
// makes it easier to manually inline this function.
TEXT ·extendMatch(SB), NOSPLIT, $0-48
MOVQ src_base+0(FP), DX
MOVQ src_len+8(FP), R14
MOVQ i+24(FP), R15
MOVQ j+32(FP), SI
ADDQ DX, R14
ADDQ DX, R15
ADDQ DX, SI
MOVQ R14, R13
SUBQ $8, R13
cmp8:
// As long as we are 8 or more bytes before the end of src, we can load and
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
CMPQ SI, R13
JA cmp1
MOVQ (R15), AX
MOVQ (SI), BX
CMPQ AX, BX
JNE bsf
ADDQ $8, R15
ADDQ $8, SI
JMP cmp8
bsf:
// If those 8 bytes were not equal, XOR the two 8 byte values, and return
// the index of the first byte that differs. The BSF instruction finds the
// least significant 1 bit, the amd64 architecture is little-endian, and
// the shift by 3 converts a bit index to a byte index.
XORQ AX, BX
BSFQ BX, BX
SHRQ $3, BX
ADDQ BX, SI
// Convert from &src[ret] to ret.
SUBQ DX, SI
MOVQ SI, ret+40(FP)
RET
cmp1:
// In src's tail, compare 1 byte at a time.
CMPQ SI, R14
JAE extendMatchEnd
MOVB (R15), AX
MOVB (SI), BX
CMPB AX, BX
JNE extendMatchEnd
ADDQ $1, R15
ADDQ $1, SI
JMP cmp1
extendMatchEnd:
// Convert from &src[ret] to ret.
SUBQ DX, SI
MOVQ SI, ret+40(FP)
RET
// ----------------------------------------------------------------------------
// func encodeBlock(dst, src []byte) (d int)
//
// All local variables fit into registers, other than "var table". The register
// allocation:
// - AX . .
// - BX . .
// - CX 56 shift (note that amd64 shifts by non-immediates must use CX).
// - DX 64 &src[0], tableSize
// - SI 72 &src[s]
// - DI 80 &dst[d]
// - R9 88 sLimit
// - R10 . &src[nextEmit]
// - R11 96 prevHash, currHash, nextHash, offset
// - R12 104 &src[base], skip
// - R13 . &src[nextS], &src[len(src) - 8]
// - R14 . len(src), bytesBetweenHashLookups, &src[len(src)], x
// - R15 112 candidate
//
// The second column (56, 64, etc) is the stack offset to spill the registers
// when calling other functions. We could pack this slightly tighter, but it's
// simpler to have a dedicated spill map independent of the function called.
//
// "var table [maxTableSize]uint16" takes up 32768 bytes of stack space. An
// extra 56 bytes, to call other functions, and an extra 64 bytes, to spill
// local variables (registers) during calls gives 32768 + 56 + 64 = 32888.
TEXT ·encodeBlock(SB), 0, $32888-56
MOVQ dst_base+0(FP), DI
MOVQ src_base+24(FP), SI
MOVQ src_len+32(FP), R14
// shift, tableSize := uint32(32-8), 1<<8
MOVQ $24, CX
MOVQ $256, DX
calcShift:
// for ; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
// shift--
// }
CMPQ DX, $16384
JGE varTable
CMPQ DX, R14
JGE varTable
SUBQ $1, CX
SHLQ $1, DX
JMP calcShift
varTable:
// var table [maxTableSize]uint16
//
// In the asm code, unlike the Go code, we can zero-initialize only the
// first tableSize elements. Each uint16 element is 2 bytes and each MOVOU
// writes 16 bytes, so we can do only tableSize/8 writes instead of the
// 2048 writes that would zero-initialize all of table's 32768 bytes.
SHRQ $3, DX
LEAQ table-32768(SP), BX
PXOR X0, X0
memclr:
MOVOU X0, 0(BX)
ADDQ $16, BX
SUBQ $1, DX
JNZ memclr
// !!! DX = &src[0]
MOVQ SI, DX
// sLimit := len(src) - inputMargin
MOVQ R14, R9
SUBQ $15, R9
// !!! Pre-emptively spill CX, DX and R9 to the stack. Their values don't
// change for the rest of the function.
MOVQ CX, 56(SP)
MOVQ DX, 64(SP)
MOVQ R9, 88(SP)
// nextEmit := 0
MOVQ DX, R10
// s := 1
ADDQ $1, SI
// nextHash := hash(load32(src, s), shift)
MOVL 0(SI), R11
IMULL $0x1e35a7bd, R11
SHRL CX, R11
outer:
// for { etc }
// skip := 32
MOVQ $32, R12
// nextS := s
MOVQ SI, R13
// candidate := 0
MOVQ $0, R15
inner0:
// for { etc }
// s := nextS
MOVQ R13, SI
// bytesBetweenHashLookups := skip >> 5
MOVQ R12, R14
SHRQ $5, R14
// nextS = s + bytesBetweenHashLookups
ADDQ R14, R13
// skip += bytesBetweenHashLookups
ADDQ R14, R12
// if nextS > sLimit { goto emitRemainder }
MOVQ R13, AX
SUBQ DX, AX
CMPQ AX, R9
JA emitRemainder
// candidate = int(table[nextHash])
// XXX: MOVWQZX table-32768(SP)(R11*2), R15
// XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
BYTE $0x4e
BYTE $0x0f
BYTE $0xb7
BYTE $0x7c
BYTE $0x5c
BYTE $0x78
// table[nextHash] = uint16(s)
MOVQ SI, AX
SUBQ DX, AX
// XXX: MOVW AX, table-32768(SP)(R11*2)
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
BYTE $0x66
BYTE $0x42
BYTE $0x89
BYTE $0x44
BYTE $0x5c
BYTE $0x78
// nextHash = hash(load32(src, nextS), shift)
MOVL 0(R13), R11
IMULL $0x1e35a7bd, R11
SHRL CX, R11
// if load32(src, s) != load32(src, candidate) { continue } break
MOVL 0(SI), AX
MOVL (DX)(R15*1), BX
CMPL AX, BX
JNE inner0
fourByteMatch:
// As per the encode_other.go code:
//
// A 4-byte match has been found. We'll later see etc.
// !!! Jump to a fast path for short (<= 16 byte) literals. See the comment
// on inputMargin in encode.go.
MOVQ SI, AX
SUBQ R10, AX
CMPQ AX, $16
JLE emitLiteralFastPath
// ----------------------------------------
// Begin inline of the emitLiteral call.
//
// d += emitLiteral(dst[d:], src[nextEmit:s])
MOVL AX, BX
SUBL $1, BX
CMPL BX, $60
JLT inlineEmitLiteralOneByte
CMPL BX, $256
JLT inlineEmitLiteralTwoBytes
inlineEmitLiteralThreeBytes:
MOVB $0xf4, 0(DI)
MOVW BX, 1(DI)
ADDQ $3, DI
JMP inlineEmitLiteralMemmove
inlineEmitLiteralTwoBytes:
MOVB $0xf0, 0(DI)
MOVB BX, 1(DI)
ADDQ $2, DI
JMP inlineEmitLiteralMemmove
inlineEmitLiteralOneByte:
SHLB $2, BX
MOVB BX, 0(DI)
ADDQ $1, DI
inlineEmitLiteralMemmove:
// Spill local variables (registers) onto the stack; call; unspill.
//
// copy(dst[i:], lit)
//
// This means calling runtime·memmove(&dst[i], &lit[0], len(lit)), so we push
// DI, R10 and AX as arguments.
MOVQ DI, 0(SP)
MOVQ R10, 8(SP)
MOVQ AX, 16(SP)
ADDQ AX, DI // Finish the "d +=" part of "d += emitLiteral(etc)".
MOVQ SI, 72(SP)
MOVQ DI, 80(SP)
MOVQ R15, 112(SP)
CALL runtime·memmove(SB)
MOVQ 56(SP), CX
MOVQ 64(SP), DX
MOVQ 72(SP), SI
MOVQ 80(SP), DI
MOVQ 88(SP), R9
MOVQ 112(SP), R15
JMP inner1
inlineEmitLiteralEnd:
// End inline of the emitLiteral call.
// ----------------------------------------
emitLiteralFastPath:
// !!! Emit the 1-byte encoding "uint8(len(lit)-1)<<2".
MOVB AX, BX
SUBB $1, BX
SHLB $2, BX
MOVB BX, (DI)
ADDQ $1, DI
// !!! Implement the copy from lit to dst as a 16-byte load and store.
// (Encode's documentation says that dst and src must not overlap.)
//
// This always copies 16 bytes, instead of only len(lit) bytes, but that's
// OK. Subsequent iterations will fix up the overrun.
//
// Note that on amd64, it is legal and cheap to issue unaligned 8-byte or
// 16-byte loads and stores. This technique probably wouldn't be as
// effective on architectures that are fussier about alignment.
MOVOU 0(R10), X0
MOVOU X0, 0(DI)
ADDQ AX, DI
inner1:
// for { etc }
// base := s
MOVQ SI, R12
// !!! offset := base - candidate
MOVQ R12, R11
SUBQ R15, R11
SUBQ DX, R11
// ----------------------------------------
// Begin inline of the extendMatch call.
//
// s = extendMatch(src, candidate+4, s+4)
// !!! R14 = &src[len(src)]
MOVQ src_len+32(FP), R14
ADDQ DX, R14
// !!! R13 = &src[len(src) - 8]
MOVQ R14, R13
SUBQ $8, R13
// !!! R15 = &src[candidate + 4]
ADDQ $4, R15
ADDQ DX, R15
// !!! s += 4
ADDQ $4, SI
inlineExtendMatchCmp8:
// As long as we are 8 or more bytes before the end of src, we can load and
// compare 8 bytes at a time. If those 8 bytes are equal, repeat.
CMPQ SI, R13
JA inlineExtendMatchCmp1
MOVQ (R15), AX
MOVQ (SI), BX
CMPQ AX, BX
JNE inlineExtendMatchBSF
ADDQ $8, R15
ADDQ $8, SI
JMP inlineExtendMatchCmp8
inlineExtendMatchBSF:
// If those 8 bytes were not equal, XOR the two 8 byte values, and return
// the index of the first byte that differs. The BSF instruction finds the
// least significant 1 bit, the amd64 architecture is little-endian, and
// the shift by 3 converts a bit index to a byte index.
XORQ AX, BX
BSFQ BX, BX
SHRQ $3, BX
ADDQ BX, SI
JMP inlineExtendMatchEnd
inlineExtendMatchCmp1:
// In src's tail, compare 1 byte at a time.
CMPQ SI, R14
JAE inlineExtendMatchEnd
MOVB (R15), AX
MOVB (SI), BX
CMPB AX, BX
JNE inlineExtendMatchEnd
ADDQ $1, R15
ADDQ $1, SI
JMP inlineExtendMatchCmp1
inlineExtendMatchEnd:
// End inline of the extendMatch call.
// ----------------------------------------
// ----------------------------------------
// Begin inline of the emitCopy call.
//
// d += emitCopy(dst[d:], base-candidate, s-base)
// !!! length := s - base
MOVQ SI, AX
SUBQ R12, AX
inlineEmitCopyLoop0:
// for length >= 68 { etc }
CMPL AX, $68
JLT inlineEmitCopyStep1
// Emit a length 64 copy, encoded as 3 bytes.
MOVB $0xfe, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
SUBL $64, AX
JMP inlineEmitCopyLoop0
inlineEmitCopyStep1:
// if length > 64 { etc }
CMPL AX, $64
JLE inlineEmitCopyStep2
// Emit a length 60 copy, encoded as 3 bytes.
MOVB $0xee, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
SUBL $60, AX
inlineEmitCopyStep2:
// if length >= 12 || offset >= 2048 { goto inlineEmitCopyStep3 }
CMPL AX, $12
JGE inlineEmitCopyStep3
CMPL R11, $2048
JGE inlineEmitCopyStep3
// Emit the remaining copy, encoded as 2 bytes.
MOVB R11, 1(DI)
SHRL $8, R11
SHLB $5, R11
SUBB $4, AX
SHLB $2, AX
ORB AX, R11
ORB $1, R11
MOVB R11, 0(DI)
ADDQ $2, DI
JMP inlineEmitCopyEnd
inlineEmitCopyStep3:
// Emit the remaining copy, encoded as 3 bytes.
SUBL $1, AX
SHLB $2, AX
ORB $2, AX
MOVB AX, 0(DI)
MOVW R11, 1(DI)
ADDQ $3, DI
inlineEmitCopyEnd:
// End inline of the emitCopy call.
// ----------------------------------------
// nextEmit = s
MOVQ SI, R10
// if s >= sLimit { goto emitRemainder }
MOVQ SI, AX
SUBQ DX, AX
CMPQ AX, R9
JAE emitRemainder
// As per the encode_other.go code:
//
// We could immediately etc.
// x := load64(src, s-1)
MOVQ -1(SI), R14
// prevHash := hash(uint32(x>>0), shift)
MOVL R14, R11
IMULL $0x1e35a7bd, R11
SHRL CX, R11
// table[prevHash] = uint16(s-1)
MOVQ SI, AX
SUBQ DX, AX
SUBQ $1, AX
// XXX: MOVW AX, table-32768(SP)(R11*2)
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
BYTE $0x66
BYTE $0x42
BYTE $0x89
BYTE $0x44
BYTE $0x5c
BYTE $0x78
// currHash := hash(uint32(x>>8), shift)
SHRQ $8, R14
MOVL R14, R11
IMULL $0x1e35a7bd, R11
SHRL CX, R11
// candidate = int(table[currHash])
// XXX: MOVWQZX table-32768(SP)(R11*2), R15
// XXX: 4e 0f b7 7c 5c 78 movzwq 0x78(%rsp,%r11,2),%r15
BYTE $0x4e
BYTE $0x0f
BYTE $0xb7
BYTE $0x7c
BYTE $0x5c
BYTE $0x78
// table[currHash] = uint16(s)
ADDQ $1, AX
// XXX: MOVW AX, table-32768(SP)(R11*2)
// XXX: 66 42 89 44 5c 78 mov %ax,0x78(%rsp,%r11,2)
BYTE $0x66
BYTE $0x42
BYTE $0x89
BYTE $0x44
BYTE $0x5c
BYTE $0x78
// if uint32(x>>8) == load32(src, candidate) { continue }
MOVL (DX)(R15*1), BX
CMPL R14, BX
JEQ inner1
// nextHash = hash(uint32(x>>16), shift)
SHRQ $8, R14
MOVL R14, R11
IMULL $0x1e35a7bd, R11
SHRL CX, R11
// s++
ADDQ $1, SI
// break out of the inner1 for loop, i.e. continue the outer loop.
JMP outer
emitRemainder:
// if nextEmit < len(src) { etc }
MOVQ src_len+32(FP), AX
ADDQ DX, AX
CMPQ R10, AX
JEQ encodeBlockEnd
// d += emitLiteral(dst[d:], src[nextEmit:])
//
// Push args.
MOVQ DI, 0(SP)
MOVQ $0, 8(SP) // Unnecessary, as the callee ignores it, but conservative.
MOVQ $0, 16(SP) // Unnecessary, as the callee ignores it, but conservative.
MOVQ R10, 24(SP)
SUBQ R10, AX
MOVQ AX, 32(SP)
MOVQ AX, 40(SP) // Unnecessary, as the callee ignores it, but conservative.
// Spill local variables (registers) onto the stack; call; unspill.
MOVQ DI, 80(SP)
CALL ·emitLiteral(SB)
MOVQ 80(SP), DI
// Finish the "d +=" part of "d += emitLiteral(etc)".
ADDQ 48(SP), DI
encodeBlockEnd:
MOVQ dst_base+0(FP), AX
SUBQ AX, DI
MOVQ DI, d+48(FP)
RET
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64 appengine !gc noasm
package snappy
func load32(b []byte, i int) uint32 {
b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}
func load64(b []byte, i int) uint64 {
b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
// dst is long enough to hold the encoded bytes
// 1 <= len(lit) && len(lit) <= 65536
func emitLiteral(dst, lit []byte) int {
i, n := 0, uint(len(lit)-1)
switch {
case n < 60:
dst[0] = uint8(n)<<2 | tagLiteral
i = 1
case n < 1<<8:
dst[0] = 60<<2 | tagLiteral
dst[1] = uint8(n)
i = 2
default:
dst[0] = 61<<2 | tagLiteral
dst[1] = uint8(n)
dst[2] = uint8(n >> 8)
i = 3
}
return i + copy(dst[i:], lit)
}
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= 65535
// 4 <= length && length <= 65535
func emitCopy(dst []byte, offset, length int) int {
i := 0
// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
// threshold for this loop is a little higher (at 68 = 64 + 4), and the
// length emitted down below is a little lower (at 60 = 64 - 4), because
// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
for length >= 68 {
// Emit a length 64 copy, encoded as 3 bytes.
dst[i+0] = 63<<2 | tagCopy2
dst[i+1] = uint8(offset)
dst[i+2] = uint8(offset >> 8)
i += 3
length -= 64
}
if length > 64 {
// Emit a length 60 copy, encoded as 3 bytes.
dst[i+0] = 59<<2 | tagCopy2
dst[i+1] = uint8(offset)
dst[i+2] = uint8(offset >> 8)
i += 3
length -= 60
}
if length >= 12 || offset >= 2048 {
// Emit the remaining copy, encoded as 3 bytes.
dst[i+0] = uint8(length-1)<<2 | tagCopy2
dst[i+1] = uint8(offset)
dst[i+2] = uint8(offset >> 8)
return i + 3
}
// Emit the remaining copy, encoded as 2 bytes.
dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
dst[i+1] = uint8(offset)
return i + 2
}
// extendMatch returns the largest k such that k <= len(src) and that
// src[i:i+k-j] and src[j:k] have the same contents.
//
// It assumes that:
// 0 <= i && i < j && j <= len(src)
func extendMatch(src []byte, i, j int) int {
for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
}
return j
}
func hash(u, shift uint32) uint32 {
return (u * 0x1e35a7bd) >> shift
}
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(dst, src []byte) (d int) {
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
// The table element type is uint16, as s < sLimit and sLimit < len(src)
// and len(src) <= maxBlockSize and maxBlockSize == 65536.
const (
maxTableSize = 1 << 14
// tableMask is redundant, but helps the compiler eliminate bounds
// checks.
tableMask = maxTableSize - 1
)
shift := uint32(32 - 8)
for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
shift--
}
// In Go, all array elements are zero-initialized, so there is no advantage
// to a smaller tableSize per se. However, it matches the C++ algorithm,
// and in the asm versions of this code, we can get away with zeroing only
// the first tableSize elements.
var table [maxTableSize]uint16
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
nextHash := hash(load32(src, s), shift)
for {
// Copied from the C++ snappy implementation:
//
// Heuristic match skipping: If 32 bytes are scanned with no matches
// found, start looking only at every other byte. If 32 more bytes are
// scanned (or skipped), look at every third byte, etc.. When a match
// is found, immediately go back to looking at every byte. This is a
// small loss (~5% performance, ~0.1% density) for compressible data
// due to more bookkeeping, but for non-compressible data (such as
// JPEG) it's a huge win since the compressor quickly "realizes" the
// data is incompressible and doesn't bother looking for matches
// everywhere.
//
// The "skip" variable keeps track of how many bytes there are since
// the last match; dividing it by 32 (ie. right-shifting by five) gives
// the number of bytes to move ahead for each iteration.
skip := 32
nextS := s
candidate := 0
for {
s = nextS
bytesBetweenHashLookups := skip >> 5
nextS = s + bytesBetweenHashLookups
skip += bytesBetweenHashLookups
if nextS > sLimit {
goto emitRemainder
}
candidate = int(table[nextHash&tableMask])
table[nextHash&tableMask] = uint16(s)
nextHash = hash(load32(src, nextS), shift)
if load32(src, s) == load32(src, candidate) {
break
}
}
// A 4-byte match has been found. We'll later see if more than 4 bytes
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
// them as literal bytes.
d += emitLiteral(dst[d:], src[nextEmit:s])
// Call emitCopy, and then see if another emitCopy could be our next
// move. Repeat until we find no match for the input immediately after
// what was consumed by the last emitCopy call.
//
// If we exit this loop normally then we need to call emitLiteral next,
// though we don't yet know how big the literal will be. We handle that
// by proceeding to the next iteration of the main loop. We also can
// exit this loop via goto if we get close to exhausting the input.
for {
// Invariant: we have a 4-byte match at s, and no need to emit any
// literal bytes prior to s.
base := s
// Extend the 4-byte match as long as possible.
//
// This is an inlined version of:
// s = extendMatch(src, candidate+4, s+4)
s += 4
for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
}
d += emitCopy(dst[d:], base-candidate, s-base)
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
// We could immediately start working at s now, but to improve
// compression we first update the hash table at s-1 and at s. If
// another emitCopy is not our next move, also calculate nextHash
// at s+1. At least on GOARCH=amd64, these three hash calculations
// are faster as one load64 call (with some shifts) instead of
// three load32 calls.
x := load64(src, s-1)
prevHash := hash(uint32(x>>0), shift)
table[prevHash&tableMask] = uint16(s - 1)
currHash := hash(uint32(x>>8), shift)
candidate = int(table[currHash&tableMask])
table[currHash&tableMask] = uint16(s)
if uint32(x>>8) != load32(src, candidate) {
nextHash = hash(uint32(x>>16), shift)
s++
break
}
}
}
emitRemainder:
if nextEmit < len(src) {
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
}
module github.com/golang/snappy
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package snappy implements the Snappy compression format. It aims for very
// high speeds and reasonable compression.
//
// There are actually two Snappy formats: block and stream. They are related,
// but different: trying to decompress block-compressed data as a Snappy stream
// will fail, and vice versa. The block format is the Decode and Encode
// functions and the stream format is the Reader and Writer types.
//
// The block format, the more common case, is used when the complete size (the
// number of bytes) of the original data is known upfront, at the time
// compression starts. The stream format, also known as the framing format, is
// for when that isn't always true.
//
// The canonical, C++ implementation is at https://github.com/google/snappy and
// it only implements the block format.
package snappy // import "github.com/golang/snappy"
import (
"hash/crc32"
)
/*
Each encoded block begins with the varint-encoded length of the decoded data,
followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
first byte of each chunk is broken into its 2 least and 6 most significant bits
called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
Zero means a literal tag. All other values mean a copy tag.
For literal tags:
- If m < 60, the next 1 + m bytes are literal bytes.
- Otherwise, let n be the little-endian unsigned integer denoted by the next
m - 59 bytes. The next 1 + n bytes after that are literal bytes.
For copy tags, length bytes are copied from offset bytes ago, in the style of
Lempel-Ziv compression algorithms. In particular:
- For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
of the offset. The next byte is bits 0-7 of the offset.
- For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
The length is 1 + m. The offset is the little-endian unsigned integer
denoted by the next 2 bytes.
- For l == 3, this tag is a legacy format that is no longer issued by most
encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
[1, 65). The length is 1 + m. The offset is the little-endian unsigned
integer denoted by the next 4 bytes.
*/
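// An illustrative sketch (not part of the original file) of splitting a
// chunk's first byte into the l and m values described above:
//
//	l := first & 0x03 // low 2 bits: the chunk tag
//	m := first >> 2   // high 6 bits: the tag's parameter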
const (
tagLiteral = 0x00
tagCopy1 = 0x01
tagCopy2 = 0x02
tagCopy4 = 0x03
)
const (
checksumSize = 4
chunkHeaderSize = 4
magicChunk = "\xff\x06\x00\x00" + magicBody
magicBody = "sNaPpY"
// maxBlockSize is the maximum size of the input to encodeBlock. It is not
// part of the wire format per se, but some parts of the encoder assume
// that an offset fits into a uint16.
//
// Also, for the framing format (Writer type instead of Encode function),
// https://github.com/google/snappy/blob/master/framing_format.txt says
// that "the uncompressed data in a chunk must be no longer than 65536
// bytes".
maxBlockSize = 65536
// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
// hard coded to be a const instead of a variable, so that obufLen can also
// be a const. Their equivalence is confirmed by
// TestMaxEncodedLenOfMaxBlockSize.
maxEncodedLenOfMaxBlockSize = 76490
obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
obufLen = obufHeaderLen + maxEncodedLenOfMaxBlockSize
)
const (
chunkTypeCompressedData = 0x00
chunkTypeUncompressedData = 0x01
chunkTypePadding = 0xfe
chunkTypeStreamIdentifier = 0xff
)
var crcTable = crc32.MakeTable(crc32.Castagnoli)
// crc implements the checksum specified in section 3 of
// https://github.com/google/snappy/blob/master/framing_format.txt
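// Per that section, checksums are stored "masked": the CRC-32C value is
// rotated right by 15 bits and then offset by 0xa282ead8.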
func crc(b []byte) uint32 {
c := crc32.Update(0, crcTable, b)
return uint32(c>>15|c<<17) + 0xa282ead8
}
Goavro was originally created during the Fall of 2014 at LinkedIn,
Corp., in New York City, New York, USA.
The following persons, listed in alphabetical order, have participated
with goavro development by contributing code and test cases.
Alan Gardner <alanctgardner@gmail.com>
Billy Hand <bhand@mediamath.com>
Christian Blades <christian.blades@careerbuilder.com>
Corey Scott <corey.scott@gmail.com>
Darshan Shaligram <scintilla@gmail.com>
Dylan Wen <hhkbp2@gmail.com>
Enrico Candino <enrico.candino@gmail.com>
Fellyn Silliman <fsilliman@linkedin.com>
James Crasta <jcrasta@underarmour.com>
Jeff Haynie <jhaynie@gmail.com>
Joe Roth <joseph_roth@cable.comcast.com>
Karrick S. McDermott <kmcdermott@linkedin.com>
Kasey Klipsch <kklipsch@mediamath.com>
Michael Johnson <mijohnson@linkedin.com>
Murray Resinski <murray.resinski@octanner.com>
Nicolas Kaiser <nikai@nikai.net>
Sebastien Launay <sebastien@opendns.com>
Thomas Desrosiers <thomasdesr@gmail.com>
kklipsch <junk@klipsch.net>
seborama <sebastien.chatal@sainsburys.co.uk>
A big thank you to these persons who provided testing and amazing
feedback to goavro during its initial implementation:
Dennis Ordanov <dordanov@linkedin.com>
Thomas Desrosiers <thomasdesr@gmail.com>
Also a big thank you is extended to our supervisors who supported our
efforts to bring goavro to the open source community:
Greg Leffler <gleffler@linkedin.com>
Nick Berry <niberry@linkedin.com>
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright [yyyy] [name of copyright owner]
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
# goavro
Goavro is a library that encodes and decodes Avro data.
## Description
* Encodes to and decodes from both binary and textual JSON Avro data.
* `Codec` is stateless and is safe to use by multiple goroutines.
With the exception of features not yet supported, goavro attempts to
be fully compliant with the most recent version of the
[Avro specification](http://avro.apache.org/docs/1.8.2/spec.html).
## Dependency Notice
All usage of `gopkg.in` has been removed in favor of Go modules.
Please update your import paths to `github.com/linkedin/goavro/v2`. v1
users can still pin an old version of goavro by adding a constraint to
their `go.mod` or `Gopkg.toml` file:
```
require (
github.com/linkedin/goavro v1.0.5
)
```
```toml
[[constraint]]
name = "github.com/linkedin/goavro"
version = "=1.0.5"
```
## Major Improvements in v2 over v1
### Avro namespaces
The original version of this library was written prior to my really
understanding how Avro namespaces ought to work. After using Avro for
a long time now, and after a lot of research, I think I grok Avro
namespaces properly, and the library now correctly handles every test
case the Apache Avro distribution has for namespaces, including being
able to refer to a previously defined data type later on in the same
schema.
### Getting Data into and out of Records
The original version of this library required creating `goavro.Record`
instances and using getters and setters to access a record's fields.
When schemas were complex, this required a lot of work to debug and get
right. The original version also required users to break schemas into
chunks and keep a different schema for each record type. This was
cumbersome, annoying, and error prone.
The new version of this library eliminates the `goavro.Record` type,
and accepts a native Go map for all records to be encoded. Keys are
the field names, and values are the field values. Nothing could be
simpler. Conversely, decoding Avro data yields a native Go map for
the upstream client to pull data back out of.
Furthermore, there is never a reason to break your schema down into
separate record schemas. Merely feed the entire schema into the
`NewCodec` function once when you create the `Codec`, then use
it. This library knows how to parse the data provided to it and ensure
data values for records and their fields are properly encoded and
decoded.
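As a minimal sketch of this workflow (the `Person` schema, its fields,
and the values below are illustrative, not taken from the goavro
documentation):

```Go
func ExampleRecordAsMap() {
	codec, err := goavro.NewCodec(`
	{
	  "type": "record",
	  "name": "Person",
	  "fields": [
	    {"name": "name", "type": "string"},
	    {"name": "age", "type": "int"}
	  ]
	}`)
	if err != nil {
		fmt.Println(err)
	}
	// Records are plain Go maps: keys are field names, values are field values.
	binary, err := codec.BinaryFromNative(nil, map[string]interface{}{
		"name": "Alice",
		"age":  30,
	})
	if err != nil {
		fmt.Println(err)
	}
	// Decoding hands a map[string]interface{} back to the caller.
	native, _, err := codec.NativeFromBinary(binary)
	if err != nil {
		fmt.Println(err)
	}
	fmt.Println(native.(map[string]interface{})["name"])
	// Output: Alice
}
```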
### 3x--4x Performance Improvement
The original version of this library was truly written with Go's idea
of `io.Reader` and `io.Writer` composition in mind. Although
composition is a powerful tool, the original library had to pull bytes
off the `io.Reader`--often one byte at a time--check for read errors,
decode the bytes, and repeat. Because this version operates on a native
Go byte slice instead, both decoding and encoding complex Avro data here
at LinkedIn are between three and four times faster than before.
### Avro JSON Support
The original version of this library did not support JSON encoding or
decoding, because it wasn't deemed useful for our internal use at the
time. When writing the new version of the library I decided to tackle
this issue once and for all, because so many engineers needed this
functionality for their work.
### Better Handling of Record Field Default Values
The original version of this library did not handle default values for
record fields well. This version of the library uses a record field's
default value when encoding from native Go data to Avro data and that
field is not specified. Additionally, when decoding
from Avro JSON data to native Go data, and a field is not specified,
the default value will be used to populate the field.
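As a hedged sketch of the encoding half (the schema, its `nickname`
default, and the printed field order are illustrative assumptions):

```Go
func ExampleFieldDefault() {
	codec, err := goavro.NewCodec(`
	{
	  "type": "record",
	  "name": "Person",
	  "fields": [
	    {"name": "name", "type": "string"},
	    {"name": "nickname", "type": "string", "default": "none"}
	  ]
	}`)
	if err != nil {
		fmt.Println(err)
	}
	// "nickname" is omitted from the datum, so its default value is encoded.
	textual, err := codec.TextualFromNative(nil, map[string]interface{}{"name": "Alice"})
	if err != nil {
		fmt.Println(err)
	}
	fmt.Println(string(textual))
	// Output: {"name":"Alice","nickname":"none"}
}
```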
## Contrast With Code Generation Tools
If you have the ability to rebuild and redeploy your software whenever
data schemas change, code generation tools might be the best solution
for your application.
There are numerous excellent tools for generating source code to
translate data between native and Avro binary or textual data. One
such tool is linked below. If a particular application is designed to
work with a rarely changing schema, programs that use code generated
functions can potentially be more performant than a program that uses
goavro to create a `Codec` dynamically at run time.
* [gogen-avro](https://github.com/alanctgardner/gogen-avro)
I recommend benchmarking the resultant programs on typical data, using
both the code-generated functions and goavro, to see which performs
better. Not all code-generated functions will outperform goavro for all
data corpuses.
If you don't have the ability to rebuild and redeploy software updates
whenever a data schema change occurs, goavro could be a great fit for
your needs. With goavro, your program can be given a new schema while
running, compile it into a `Codec` on the fly, and immediately start
encoding or decoding data using that `Codec`. Because Avro encoding
specifies that encoded data always be accompanied by a schema, this is
not usually a problem. If the schema change is backwards compatible, and
the portion of your program that handles the decoded data can still
reference the decoded fields, then nothing needs to be done when your
program detects a schema change while using goavro `Codec` instances to
encode or decode data.
## Resources
* [Avro CLI Examples](https://github.com/miguno/avro-cli-examples)
* [Avro](https://avro.apache.org/)
* [Google Snappy](https://google.github.io/snappy/)
* [JavaScript Object Notation, JSON](https://www.json.org/)
* [Kafka](https://kafka.apache.org)
## Usage
Documentation is available via
[![GoDoc](https://godoc.org/github.com/linkedin/goavro?status.svg)](https://godoc.org/github.com/linkedin/goavro).
```Go
package main
import (
"fmt"
"github.com/linkedin/goavro/v2"
)
func main() {
codec, err := goavro.NewCodec(`
{
"type": "record",
"name": "LongList",
"fields" : [
{"name": "next", "type": ["null", "LongList"], "default": null}
]
}`)
if err != nil {
fmt.Println(err)
}
// NOTE: May omit fields when using default value
textual := []byte(`{"next":{"LongList":{}}}`)
// Convert textual Avro data (in Avro JSON format) to native Go form
native, _, err := codec.NativeFromTextual(textual)
if err != nil {
fmt.Println(err)
}
// Convert native Go form to binary Avro data
binary, err := codec.BinaryFromNative(nil, native)
if err != nil {
fmt.Println(err)
}
// Convert binary Avro data back to native Go form
native, _, err = codec.NativeFromBinary(binary)
if err != nil {
fmt.Println(err)
}
// Convert native Go form to textual Avro data
textual, err = codec.TextualFromNative(nil, native)
if err != nil {
fmt.Println(err)
}
// NOTE: Textual encoding will show all fields, even those with values that
// match their default values
fmt.Println(string(textual))
// Output: {"next":{"LongList":{"next":null}}}
}
```
Also please see the example programs in the `examples` directory for
reference.
### ab2t
The `ab2t` program is similar to the reference standard
`avrocat` program and converts Avro OCF files to Avro JSON
encoding.
### arw
The Avro-ReWrite program, `arw`, can be used to rewrite an
Avro OCF file while optionally changing the block counts and the
compression algorithm. `arw` can also upgrade the schema, provided the
existing datum values can be encoded with the newly provided schema.
### avroheader
The Avro Header program, `avroheader`, can be used to print various
header information from an OCF file.
### splice
The `splice` program can be used to splice together an OCF file from
an Avro schema file and a raw Avro binary data file.
### Translating Data
A `Codec` provides four methods for translating between a byte slice
of either binary or textual Avro data and native Go data.
The following methods convert data between native Go data and byte
slices of the binary Avro representation:
BinaryFromNative
NativeFromBinary
The following methods convert data between native Go data and byte
slices of the textual Avro representation:
NativeFromTextual
TextualFromNative
Each `Codec` also exposes the `Schema` method to return a simplified
version of the JSON schema string used to create the `Codec`.
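For instance, a small sketch of `Schema` on a trivial codec (error
handling kept minimal):

```Go
codec, err := goavro.NewCodec(`{"type":"array","items":"string"}`)
if err != nil {
	fmt.Println(err)
}
// Prints a simplified form of the schema the Codec was created with.
fmt.Println(codec.Schema())
```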
#### Translating From Avro to Go Data
Goavro does not use Go's structure tags to translate data between
native Go types and Avro encoded data.
When translating from either binary or textual Avro to native Go data,
goavro returns primitive Go data values for corresponding Avro data
values. The table below shows how goavro translates Avro types to Go
types.
| Avro               | Go                       |
| ------------------ | ------------------------ |
| `null`             | `nil`                    |
| `boolean`          | `bool`                   |
| `bytes`            | `[]byte`                 |
| `float`            | `float32`                |
| `double`           | `float64`                |
| `long`             | `int64`                  |
| `int`              | `int32`                  |
| `string`           | `string`                 |
| `array`            | `[]interface{}`          |
| `enum`             | `string`                 |
| `fixed`            | `[]byte`                 |
| `map` and `record` | `map[string]interface{}` |
| `union`            | *see below*              |
Because of encoding rules for Avro unions, when a union's value is
`null`, a simple Go `nil` is returned. However, when a union's value is
non-`nil`, a Go `map[string]interface{}` with a single key is
returned for the union. The map's single key is the Avro type name and
its value is the datum's value.
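A hedged sketch of that decoded shape, using a trivial union schema:

```Go
func ExampleUnionDecode() {
	codec, err := goavro.NewCodec(`["null","string"]`)
	if err != nil {
		fmt.Println(err)
	}
	native, _, err := codec.NativeFromTextual([]byte(`{"string":"hello"}`))
	if err != nil {
		fmt.Println(err)
	}
	// A non-null union decodes to a single-key map: Avro type name -> value.
	m := native.(map[string]interface{})
	fmt.Println(m["string"])
	// Output: hello
}
```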
#### Translating From Go to Avro Data
Goavro does not use Go's structure tags to translate data between
native Go types and Avro encoded data.
When translating from native Go to either binary or textual Avro data,
goavro generally requires the same native Go data types as the decoder
would provide, with some exceptions for programmer convenience. Goavro
will accept any numerical data type provided there is no precision
lost when encoding the value. For instance, providing `float64(3.0)`
to an encoder expecting an Avro `int` would succeed, while sending
`float64(3.5)` to the same encoder would return an error.
When providing a slice of items for an encoder, the encoder will
accept either `[]interface{}`, or any slice of the required type. For
instance, when the Avro schema specifies:
`{"type":"array","items":"string"}`, the encoder will accept either
`[]interface{}`, or `[]string`. If given `[]int`, the encoder will
return an error when it attempts to encode the first non-string array
value using the string encoder.
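A minimal sketch of both conveniences together (the schemas here are
trivial ones chosen only for illustration):

```Go
func ExampleEncoderConveniences() {
	intCodec, err := goavro.NewCodec(`"int"`)
	if err != nil {
		fmt.Println(err)
	}
	// float64(3.0) loses no precision as an Avro int, so this succeeds...
	if _, err := intCodec.BinaryFromNative(nil, float64(3.0)); err != nil {
		fmt.Println(err)
	}
	// ...while float64(3.5) cannot be encoded without losing precision.
	if _, err := intCodec.BinaryFromNative(nil, float64(3.5)); err != nil {
		fmt.Println("error as expected")
	}
	arrayCodec, err := goavro.NewCodec(`{"type":"array","items":"string"}`)
	if err != nil {
		fmt.Println(err)
	}
	// Either []interface{} or []string is accepted for a string array.
	if _, err := arrayCodec.BinaryFromNative(nil, []string{"a", "b"}); err != nil {
		fmt.Println(err)
	}
	// Output: error as expected
}
```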
When providing a value for an Avro union, the encoder will accept
`nil` for a `null` value. If the value is non-`nil`, it must be a
`map[string]interface{}` with a single key-value pair, where the key
is the Avro type name and the value is the datum's value. As a
convenience, the `Union` function wraps any datum value in a map as
specified above.
```Go
func ExampleUnion() {
codec, err := goavro.NewCodec(`["null","string","int"]`)
if err != nil {
fmt.Println(err)
}
buf, err := codec.TextualFromNative(nil, goavro.Union("string", "some string"))
if err != nil {
fmt.Println(err)
}
fmt.Println(string(buf))
// Output: {"string":"some string"}
}
```
## Limitations
Goavro is a fully featured encoder and decoder of binary and textual
JSON Avro data. It fully supports recursive data structures, unions,
and namespacing. It does have a few limitations that have yet to be
implemented.
### Aliases
The Avro specification allows an implementation to optionally map a
writer's schema to a reader's schema using aliases. Although goavro
can compile schemas with aliases, it does not yet implement this
feature.
### Kafka Streams
[Kafka](http://kafka.apache.org) is the reason goavro was
written. Just as Avro Object Container Files are a layer of abstraction
above the Avro Data Serialization format, Kafka's use of Avro is a layer
of abstraction that also sits above the Avro Data Serialization format,
but has its own schema. Like Avro Object
has been implemented but removed until the API can be improved.
### Default Maximum Block Counts, and Block Sizes
When decoding arrays, maps, and OCF files, the Avro specification
states that the binary includes block counts and block sizes that
specify how many items are in the next block, and how many bytes are
in the next block. To prevent possible denial-of-service attacks on
clients that use this library caused by attempting to decode
maliciously crafted data, decoded block counts and sizes are compared
against the public library variables `MaxBlockCount` and
`MaxBlockSize`. When the decoded values exceed these limits, the decoder
returns an error. Because not every upstream client is the same, we've
chosen some sane defaults for these values, but left them as mutable
variables so that clients are able to override them if deemed necessary
for their purposes. Their initial default values are both
`math.MaxInt32` (~2.2GB).
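A minimal sketch of overriding them (the caps below are arbitrary
numbers chosen for illustration, not recommendations):

```Go
package main

import "github.com/linkedin/goavro/v2"

func init() {
	// Tighter limits for a client that knows its blocks are small.
	goavro.MaxBlockCount = 1_000_000  // max items per decoded block
	goavro.MaxBlockSize = 10_000_000 // max bytes per decoded block
}
```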
### Schema Evolution
Please see [my reasons why schema evolution is broken for Avro
1.x](https://github.com/linkedin/goavro/blob/master/SCHEMA-EVOLUTION.md).
## License
### Goavro license
Copyright 2017 LinkedIn Corp. Licensed under the Apache License,
Version 2.0 (the "License"); you may not use this file except in
compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied.
### Google Snappy license
Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
## Third Party Dependencies
### Google Snappy
Goavro links with [Google Snappy](http://google.github.io/snappy/)
to provide Snappy compression and decompression support.
From the Avro specification:

> default: A default value for this field, used when reading instances
> that lack this field (optional). Permitted values depend on the
> field's schema type, according to the table below. Default values for
> union fields correspond to the first schema in the union. Default
> values for bytes and fixed fields are JSON strings, where Unicode code
> points 0-255 are mapped to unsigned 8-bit byte values 0-255.

I read the above to mean that the purpose of default values is to allow
reading Avro data that was written without the fields, not necessarily
the augmentation of data being serialized. So in general I agree with
you in terms of purpose.
One very important aspect of Avro is that the schema used to serialize
the data should always remain with the data, so that a reader would
always be able to read the schema and then be able to consume the
data. I think most people still agree so far.
However, this is where things get messy. Schema evolution is
frequently cited when folks want to use a new version of the schema to
read data that was once written using an older version of that schema.
I do not believe the Avro specification properly handles schema
evolution. Here's a simple example:
```
Record v0:
name: string
nickname: string, default: ""
```
```
Record v1:
name: string
nickname: string, default: ""
title: string, default: ""
```
Okay, now a binary stream of records is just a bunch of strings. (Avro
encodes each string as a zigzag-varint length followed by its bytes, so
0x0A below introduces a 5-byte string.) Let's write one out now.
```
0x0A, A, l, i, c, e, 0x06, B, o, b, 0x0A, B, r, u, c, e, 0x0A, S, a, l, l, y, 0x06, A, n, n
```
How many records is that? It could be as many as 5 records, each
holding a single name and no nickname. It could be as few as 2 records:
one with a nickname and a title, and one with only a nickname or a
title.
Now, to drive the point home that Avro schema evolution is broken: even
if each record had a header that indicated how many bytes it would
consume, we could know where one record began and ended, and how many
records there are. But if we were to read a record with two strings
in it, is the second string the nickname or the title?
The Avro specification has no answer to that question, so neither do I.
Effectively, Avro could be a great tool for serializing complex data,
but it's broken in its current form, and to fix it would require it to
break compatibility with itself, effectively rendering any binary data
serialized in a previous version of Avro unreadable by new versions,
unless it had some sort of version marker on the data so a library
could branch.
One great solution would be augmenting the binary encoding with a
simple field number identifier. Let's imagine an Avro 2.x that had
this feature, and would support schema evolution. Here's an example
stream of bytes that could be unambiguously decoded using the new
schema:
```
0x02, 0x0A, A, l, i, c, e, 0x02, 0x06, B, o, b, 0x04, 0x0A, B, r, u, c, e, 0x02, 0x0C, C, h, a, r, l, i, e, 0x06, 0x04, M, r
```
In the above example of my fake Avro 2.0, this can be
deterministically decoded because 0x02 indicates the following is
field number 1 (name), followed by string length 5, followed by
Alice.
Then the decoder would see 0x02, marking field number 1 again,
which means, "next record", followed by string length 3, followed by
Bob, followed by 0x04, which means field number 2 (nickname), followed
by string length 5, followed by Bruce.
Followed by field number 1 (next record), followed by string length 6,
followed by Charlie, followed by field number 3 (title), followed by
string length 2, followed by Mr.
In my hypothetical version of Avro 2, Avro can cope with schema
evolution using record defaults and such. Sadly, Avro 1.x cannot and
thus we should avoid using it if your use-case requires schema
evolution.
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"io"
"math"
"reflect"
)
func makeArrayCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
// array type must have items
itemSchema, ok := schemaMap["items"]
if !ok {
return nil, fmt.Errorf("Array ought to have items key")
}
itemCodec, err := buildCodec(st, enclosingNamespace, itemSchema)
if err != nil {
return nil, fmt.Errorf("Array items ought to be valid Avro type: %s", err)
}
return &Codec{
typeName: &name{"array", nullNamespace},
nativeFromBinary: func(buf []byte) (interface{}, []byte, error) {
var value interface{}
var err error
// block count and block size
if value, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary array block count: %s", err)
}
blockCount := value.(int64)
if blockCount < 0 {
// NOTE: A negative block count implies there is a long encoded
// block size following the negative block count. We have no use
// for the block size in this decoder, so we read and discard
// the value.
if blockCount == math.MinInt64 {
// The minimum number for any signed numerical type can never be made positive
return nil, nil, fmt.Errorf("cannot decode binary array with block count: %d", blockCount)
}
blockCount = -blockCount // convert to its positive equivalent
if _, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary array block size: %s", err)
}
}
// Ensure block count does not exceed some sane value.
if blockCount > MaxBlockCount {
return nil, nil, fmt.Errorf("cannot decode binary array when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
// NOTE: While the RAM optimization shown below is not strictly
// necessary, many encoders will encode all items in a single block.
// We can reduce the RAM the runtime allocates for the array by
// initializing the slice with capacity for that number of items.
arrayValues := make([]interface{}, 0, blockCount)
for blockCount != 0 {
// Decode `blockCount` datum values from buffer
for i := int64(0); i < blockCount; i++ {
if value, buf, err = itemCodec.nativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary array item %d: %s", i+1, err)
}
arrayValues = append(arrayValues, value)
}
// Decode next blockCount from buffer, because there may be more blocks
if value, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary array block count: %s", err)
}
blockCount = value.(int64)
if blockCount < 0 {
// NOTE: A negative block count implies there is a long
// encoded block size following the negative block count. We
// have no use for the block size in this decoder, so we
// read and discard the value.
if blockCount == math.MinInt64 {
// The minimum number for any signed numerical type can
// never be made positive
return nil, nil, fmt.Errorf("cannot decode binary array with block count: %d", blockCount)
}
blockCount = -blockCount // convert to its positive equivalent
if _, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary array block size: %s", err)
}
}
// Ensure block count does not exceed some sane value.
if blockCount > MaxBlockCount {
return nil, nil, fmt.Errorf("cannot decode binary array when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
}
return arrayValues, buf, nil
},
binaryFromNative: func(buf []byte, datum interface{}) ([]byte, error) {
arrayValues, err := convertArray(datum)
if err != nil {
return nil, fmt.Errorf("cannot encode binary array: %s", err)
}
arrayLength := int64(len(arrayValues))
var alreadyEncoded, remainingInBlock int64
for i, item := range arrayValues {
if remainingInBlock == 0 { // start a new block
remainingInBlock = arrayLength - alreadyEncoded
if remainingInBlock > MaxBlockCount {
// limit block count to MaxBlockCount
remainingInBlock = MaxBlockCount
}
buf, _ = longBinaryFromNative(buf, remainingInBlock)
}
if buf, err = itemCodec.binaryFromNative(buf, item); err != nil {
return nil, fmt.Errorf("cannot encode binary array item %d: %v: %s", i+1, item, err)
}
remainingInBlock--
alreadyEncoded++
}
return longBinaryFromNative(buf, 0) // append trailing 0 block count to signal end of Array
},
nativeFromTextual: func(buf []byte) (interface{}, []byte, error) {
var arrayValues []interface{}
var value interface{}
var err error
var b byte
if buf, err = advanceAndConsume(buf, '['); err != nil {
return nil, nil, fmt.Errorf("cannot decode textual array: %s", err)
}
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 {
return nil, nil, fmt.Errorf("cannot decode textual array: %s", io.ErrShortBuffer)
}
// NOTE: Special case for empty array
if buf[0] == ']' {
return arrayValues, buf[1:], nil
}
// NOTE: Also terminates when a ']' byte is read.
for len(buf) > 0 {
// decode value
value, buf, err = itemCodec.nativeFromTextual(buf)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual array: %s", err)
}
arrayValues = append(arrayValues, value)
// either comma or closing square bracket
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 {
return nil, nil, fmt.Errorf("cannot decode textual array: %s", io.ErrShortBuffer)
}
switch b = buf[0]; b {
case ']':
return arrayValues, buf[1:], nil
case ',':
// no-op
default:
return nil, nil, fmt.Errorf("cannot decode textual array: expected ',' or ']'; received: %q", b)
}
// NOTE: consume comma from above
if buf, _ = advanceToNonWhitespace(buf[1:]); len(buf) == 0 {
return nil, nil, fmt.Errorf("cannot decode textual array: %s", io.ErrShortBuffer)
}
}
return nil, buf, io.ErrShortBuffer
},
textualFromNative: func(buf []byte, datum interface{}) ([]byte, error) {
arrayValues, err := convertArray(datum)
if err != nil {
return nil, fmt.Errorf("cannot encode textual array: %s", err)
}
var atLeastOne bool
buf = append(buf, '[')
for i, item := range arrayValues {
atLeastOne = true
// Encode value
buf, err = itemCodec.textualFromNative(buf, item)
if err != nil {
// field was specified in datum; therefore its value was invalid
return nil, fmt.Errorf("cannot encode textual array item %d; %v: %s", i+1, item, err)
}
buf = append(buf, ',')
}
if atLeastOne {
return append(buf[:len(buf)-1], ']'), nil
}
return append(buf, ']'), nil
},
}, nil
}
// convertArray converts interface{} to []interface{} if possible.
func convertArray(datum interface{}) ([]interface{}, error) {
arrayValues, ok := datum.([]interface{})
if ok {
return arrayValues, nil
}
// NOTE: When given a slice of any other type, copy its values into
// a []interface{} as a convenience to the client.
v := reflect.ValueOf(datum)
if v.Kind() != reflect.Slice {
return nil, fmt.Errorf("cannot create []interface{}: expected slice; received: %T", datum)
}
// NOTE: Two better alternatives to the current algorithm are:
// (1) mutate the reflection tuple underneath to convert the
// []int, for example, to []interface{}, with O(1) complexity
// (2) use copy builtin to zip the data items over with O(n) complexity,
// but more efficient than what's below.
// Suggestions?
arrayValues = make([]interface{}, v.Len())
for idx := 0; idx < v.Len(); idx++ {
arrayValues[idx] = v.Index(idx).Interface()
}
return arrayValues, nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"io"
"math"
)
// bytesBinaryReader reads bytes from io.Reader and returns a byte slice of
// the specified size, or the error encountered while trying to read those bytes.
func bytesBinaryReader(ior io.Reader) ([]byte, error) {
size, err := longBinaryReader(ior)
if err != nil {
return nil, fmt.Errorf("cannot read bytes: cannot read size: %s", err)
}
if size < 0 {
return nil, fmt.Errorf("cannot read bytes: size is negative: %d", size)
}
if size > MaxBlockSize {
return nil, fmt.Errorf("cannot read bytes: size exceeds MaxBlockSize: %d > %d", size, MaxBlockSize)
}
buf := make([]byte, size)
_, err = io.ReadAtLeast(ior, buf, int(size))
if err != nil {
return nil, fmt.Errorf("cannot read bytes: %s", err)
}
return buf, nil
}
// longBinaryReader reads bytes from io.Reader until it has a complete long
// value, or a read error occurs.
func longBinaryReader(ior io.Reader) (int64, error) {
var value uint64
var shift uint
var err error
var b byte
// NOTE: While benchmarks show it's more performant to invoke ReadByte when
// available, testing whether a variable's data type implements a particular
// method is quite slow too. So perform the test once, and branch to the
// appropriate loop based on the results.
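// Each byte contributes its low 7 bits (b&intMask) to the accumulating
// value, and its high bit (b&intFlag) is set while more bytes follow; the
// return expression then reverses Avro's zigzag encoding: (n>>1) ^ -(n&1).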
if byteReader, ok := ior.(io.ByteReader); ok {
for {
if b, err = byteReader.ReadByte(); err != nil {
return 0, err // NOTE: must send back unaltered error to detect io.EOF
}
value |= uint64(b&intMask) << shift
if b&intFlag == 0 {
return (int64(value>>1) ^ -int64(value&1)), nil
}
shift += 7
}
}
// NOTE: ior does not also implement io.ByteReader, so we must allocate a
// byte slice with a single byte, and read each byte into the slice.
buf := make([]byte, 1)
for {
if _, err = ior.Read(buf); err != nil {
return 0, err // NOTE: must send back unaltered error to detect io.EOF
}
b = buf[0]
value |= uint64(b&intMask) << shift
if b&intFlag == 0 {
return (int64(value>>1) ^ -int64(value&1)), nil
}
shift += 7
}
}
// metadataBinaryReader reads bytes from io.Reader until it has the entire map
// value, or a read error occurs.
func metadataBinaryReader(ior io.Reader) (map[string][]byte, error) {
var err error
var value interface{}
// block count and block size
if value, err = longBinaryReader(ior); err != nil {
return nil, fmt.Errorf("cannot read map block count: %s", err)
}
blockCount := value.(int64)
if blockCount < 0 {
if blockCount == math.MinInt64 {
// The minimum number for any signed numerical type can never be
// made positive
return nil, fmt.Errorf("cannot read map with block count: %d", blockCount)
}
// NOTE: A negative block count implies there is a long encoded block
// size following the negative block count. We have no use for the block
// size in this decoder, so we read and discard the value.
blockCount = -blockCount // convert to its positive equivalent
if _, err = longBinaryReader(ior); err != nil {
return nil, fmt.Errorf("cannot read map block size: %s", err)
}
}
// Ensure block count does not exceed some sane value.
if blockCount > MaxBlockCount {
return nil, fmt.Errorf("cannot read map when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
// NOTE: While the RAM optimization shown below is not strictly necessary,
// many encoders will encode all items in a single block. We can reduce the
// RAM the runtime allocates for the map by initializing it with capacity
// for that number of items.
mapValues := make(map[string][]byte, blockCount)
for blockCount != 0 {
// Decode `blockCount` datum values from buffer
for i := int64(0); i < blockCount; i++ {
// first decode the key string
keyBytes, err := bytesBinaryReader(ior)
if err != nil {
return nil, fmt.Errorf("cannot read map key: %s", err)
}
key := string(keyBytes)
if _, ok := mapValues[key]; ok {
return nil, fmt.Errorf("cannot read map: duplicate key: %q", key)
}
// metadata values are always bytes
buf, err := bytesBinaryReader(ior)
if err != nil {
return nil, fmt.Errorf("cannot read map value for key %q: %s", key, err)
}
mapValues[key] = buf
}
// Decode next blockCount from buffer, because there may be more blocks
if value, err = longBinaryReader(ior); err != nil {
return nil, fmt.Errorf("cannot read map block count: %s", err)
}
blockCount = value.(int64)
if blockCount < 0 {
if blockCount == math.MinInt64 {
// The minimum number for any signed numerical type can never be
// made positive
return nil, fmt.Errorf("cannot read map with block count: %d", blockCount)
}
// NOTE: A negative block count implies there is a long encoded
// block size following the negative block count. We have no use for
// the block size in this decoder, so we read and discard the value.
blockCount = -blockCount // convert to its positive equivalent
if _, err = longBinaryReader(ior); err != nil {
return nil, fmt.Errorf("cannot read map block size: %s", err)
}
}
// Ensure block count does not exceed some sane value.
if blockCount > MaxBlockCount {
return nil, fmt.Errorf("cannot read map when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
}
return mapValues, nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"errors"
"fmt"
"io"
)
func booleanNativeFromBinary(buf []byte) (interface{}, []byte, error) {
if len(buf) < 1 {
return nil, nil, io.ErrShortBuffer
}
var b byte
b, buf = buf[0], buf[1:]
switch b {
case byte(0):
return false, buf, nil
case byte(1):
return true, buf, nil
default:
return nil, nil, fmt.Errorf("cannot decode binary boolean: expected: Go byte(0) or byte(1); received: byte(%d)", b)
}
}
func booleanBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
value, ok := datum.(bool)
if !ok {
return nil, fmt.Errorf("cannot encode binary boolean: expected: Go bool; received: %T", datum)
}
var b byte
if value {
b = 1
}
return append(buf, b), nil
}
func booleanNativeFromTextual(buf []byte) (interface{}, []byte, error) {
if len(buf) < 4 {
return nil, nil, fmt.Errorf("cannot decode textual boolean: %s", io.ErrShortBuffer)
}
if bytes.Equal(buf[:4], []byte("true")) {
return true, buf[4:], nil
}
if len(buf) < 5 {
return nil, nil, fmt.Errorf("cannot decode textual boolean: %s", io.ErrShortBuffer)
}
if bytes.Equal(buf[:5], []byte("false")) {
return false, buf[5:], nil
}
	return nil, nil, errors.New("cannot decode textual boolean: expected false or true")
}
func booleanTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
value, ok := datum.(bool)
if !ok {
return nil, fmt.Errorf("boolean: expected: Go bool; received: %T", datum)
}
if value {
return append(buf, "true"...), nil
}
return append(buf, "false"...), nil
}
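// exampleBooleanRoundTrip is a minimal sketch (the name is illustrative) of
// how the helpers above compose: the binary form of a boolean is a single
// byte, 0 for false and 1 for true, and the decoder returns any remaining
// undecoded bytes alongside the value.
func exampleBooleanRoundTrip() (interface{}, error) {
	buf, err := booleanBinaryFromNative(nil, true) // buf is []byte{0x01}
	if err != nil {
		return nil, err
	}
	datum, _, err := booleanNativeFromBinary(buf) // true, with an empty remainder
	return datum, err
}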
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"encoding/hex"
"errors"
"fmt"
"io"
"os"
"unicode"
"unicode/utf16"
"unicode/utf8"
)
////////////////////////////////////////
// Binary Decode
////////////////////////////////////////
func bytesNativeFromBinary(buf []byte) (interface{}, []byte, error) {
if len(buf) < 1 {
return nil, nil, fmt.Errorf("cannot decode binary bytes: %s", io.ErrShortBuffer)
}
var decoded interface{}
var err error
if decoded, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary bytes: %s", err)
}
size := decoded.(int64) // always returns int64
if size < 0 {
return nil, nil, fmt.Errorf("cannot decode binary bytes: negative size: %d", size)
}
if size > int64(len(buf)) {
return nil, nil, fmt.Errorf("cannot decode binary bytes: %s", io.ErrShortBuffer)
}
return buf[:size], buf[size:], nil
}
func stringNativeFromBinary(buf []byte) (interface{}, []byte, error) {
d, b, err := bytesNativeFromBinary(buf)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode binary string: %s", err)
}
return string(d.([]byte)), b, nil
}
////////////////////////////////////////
// Binary Encode
////////////////////////////////////////
func bytesBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
var someBytes []byte
switch d := datum.(type) {
case []byte:
someBytes = d
case string:
someBytes = []byte(d)
default:
return nil, fmt.Errorf("cannot encode binary bytes: expected: []byte or string; received: %T", datum)
}
buf, _ = longBinaryFromNative(buf, len(someBytes)) // only fails when given non integer
return append(buf, someBytes...), nil // append datum bytes
}
func stringBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
var someBytes []byte
switch d := datum.(type) {
case []byte:
someBytes = d
case string:
someBytes = []byte(d)
default:
return nil, fmt.Errorf("cannot encode binary bytes: expected: string; received: %T", datum)
}
buf, _ = longBinaryFromNative(buf, len(someBytes)) // only fails when given non integer
return append(buf, someBytes...), nil // append datum bytes
}
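// exampleBytesEncode is a minimal sketch (the name is illustrative) of the
// binary encoding implemented above: the byte count is written first as a
// zig-zag encoded Avro long, followed by the raw bytes, so "abc" encodes to
// []byte{0x06, 'a', 'b', 'c'}.
func exampleBytesEncode() ([]byte, error) {
	return bytesBinaryFromNative(nil, "abc")
}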
////////////////////////////////////////
// Text Decode
////////////////////////////////////////
func bytesNativeFromTextual(buf []byte) (interface{}, []byte, error) {
buflen := len(buf)
if buflen < 2 {
return nil, nil, fmt.Errorf("cannot decode textual bytes: %s", io.ErrShortBuffer)
}
if buf[0] != '"' {
return nil, nil, fmt.Errorf("cannot decode textual bytes: expected initial \"; found: %#U", buf[0])
}
var newBytes []byte
var escaped bool
	// Loop through the bytes following the initial double quote; we return
	// immediately upon finding an unescaped closing double quote.
for i := 1; i < buflen; i++ {
b := buf[i]
if escaped {
escaped = false
if b2, ok := unescapeSpecialJSON(b); ok {
newBytes = append(newBytes, b2)
continue
}
if b == 'u' {
// NOTE: Need at least 4 more bytes to read uint16, but subtract
// 1 because do not want to count the trailing quote and
// subtract another 1 because already consumed u but have yet to
// increment i.
if i > buflen-6 {
return nil, nil, fmt.Errorf("cannot decode textual bytes: %s", io.ErrShortBuffer)
}
// NOTE: Avro bytes represent binary data, and do not
// necessarily represent text. Therefore, Avro bytes are not
// encoded in UTF-16. Each \u is followed by 4 hexadecimal
// digits, the first and second of which must be 0.
v, err := parseUint64FromHexSlice(buf[i+3 : i+5])
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual bytes: %s", err)
}
i += 4 // absorb 4 characters: one 'u' and three of the digits
newBytes = append(newBytes, byte(v))
continue
}
newBytes = append(newBytes, b)
continue
}
if b == '\\' {
escaped = true
continue
}
if b == '"' {
return newBytes, buf[i+1:], nil
}
newBytes = append(newBytes, b)
}
return nil, nil, fmt.Errorf("cannot decode textual bytes: expected final \"; found: %#U", buf[buflen-1])
}
func stringNativeFromTextual(buf []byte) (interface{}, []byte, error) {
buflen := len(buf)
if buflen < 2 {
return nil, nil, fmt.Errorf("cannot decode textual string: %s", io.ErrShortBuffer)
}
if buf[0] != '"' {
return nil, nil, fmt.Errorf("cannot decode textual string: expected initial \"; found: %#U", buf[0])
}
var newBytes []byte
var escaped bool
	// Loop through the bytes following the initial double quote; we return
	// immediately upon finding an unescaped closing double quote.
for i := 1; i < buflen; i++ {
b := buf[i]
if escaped {
escaped = false
if b2, ok := unescapeSpecialJSON(b); ok {
newBytes = append(newBytes, b2)
continue
}
if b == 'u' {
// NOTE: Need at least 4 more bytes to read uint16, but subtract
// 1 because do not want to count the trailing quote and
// subtract another 1 because already consumed u but have yet to
// increment i.
if i > buflen-6 {
return nil, nil, fmt.Errorf("cannot decode textual string: %s", io.ErrShortBuffer)
}
v, err := parseUint64FromHexSlice(buf[i+1 : i+5])
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual string: %s", err)
}
i += 4 // absorb 4 characters: one 'u' and three of the digits
nbl := len(newBytes)
newBytes = append(newBytes, []byte{0, 0, 0, 0}...) // grow to make room for UTF-8 encoded rune
r := rune(v)
if utf16.IsSurrogate(r) {
i++ // absorb final hexadecimal digit from previous value
// Expect second half of surrogate pair
if i > buflen-6 || buf[i] != '\\' || buf[i+1] != 'u' {
return nil, nil, errors.New("cannot decode textual string: missing second half of surrogate pair")
}
v, err = parseUint64FromHexSlice(buf[i+2 : i+6])
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual string: %s", err)
}
i += 5 // absorb 5 characters: two for '\u', and 3 of the 4 digits
// Get code point by combining high and low surrogate bits
r = utf16.DecodeRune(r, rune(v))
}
width := utf8.EncodeRune(newBytes[nbl:], r) // append UTF-8 encoded version of code point
newBytes = newBytes[:nbl+width] // trim off excess bytes
continue
}
newBytes = append(newBytes, b)
continue
}
if b == '\\' {
escaped = true
continue
}
if b == '"' {
return string(newBytes), buf[i+1:], nil
}
newBytes = append(newBytes, b)
}
if escaped {
return nil, nil, fmt.Errorf("cannot decode textual string: %s", io.ErrShortBuffer)
}
return nil, nil, fmt.Errorf("cannot decode textual string: expected final \"; found: %x", buf[buflen-1])
}
func unescapeUnicodeString(some string) (string, error) {
if some == "" {
return "", nil
}
buf := []byte(some)
buflen := len(buf)
var i int
var newBytes []byte
var escaped bool
	// Loop through all bytes, replacing \u escape sequences with their UTF-8
	// encoded equivalents; for any other escape the backslash is dropped and
	// the following byte is kept as-is.
for i = 0; i < buflen; i++ {
b := buf[i]
if escaped {
escaped = false
if b == 'u' {
// NOTE: Need at least 4 more bytes to read uint16, but subtract
// 1 because do not want to count the trailing quote and
// subtract another 1 because already consumed u but have yet to
// increment i.
if i > buflen-6 {
return "", fmt.Errorf("cannot replace escaped characters with UTF-8 equivalent: %s", io.ErrShortBuffer)
}
v, err := parseUint64FromHexSlice(buf[i+1 : i+5])
if err != nil {
return "", fmt.Errorf("cannot replace escaped characters with UTF-8 equivalent: %s", err)
}
i += 4 // absorb 4 characters: one 'u' and three of the digits
nbl := len(newBytes)
newBytes = append(newBytes, []byte{0, 0, 0, 0}...) // grow to make room for UTF-8 encoded rune
r := rune(v)
if utf16.IsSurrogate(r) {
i++ // absorb final hexadecimal digit from previous value
// Expect second half of surrogate pair
if i > buflen-6 || buf[i] != '\\' || buf[i+1] != 'u' {
return "", errors.New("cannot replace escaped characters with UTF-8 equivalent: missing second half of surrogate pair")
}
v, err = parseUint64FromHexSlice(buf[i+2 : i+6])
if err != nil {
return "", fmt.Errorf("cannot replace escaped characters with UTF-8 equivalents: %s", err)
}
i += 5 // absorb 5 characters: two for '\u', and 3 of the 4 digits
// Get code point by combining high and low surrogate bits
r = utf16.DecodeRune(r, rune(v))
}
width := utf8.EncodeRune(newBytes[nbl:], r) // append UTF-8 encoded version of code point
newBytes = newBytes[:nbl+width] // trim off excess bytes
continue
}
newBytes = append(newBytes, b)
continue
}
if b == '\\' {
escaped = true
continue
}
newBytes = append(newBytes, b)
}
if escaped {
return "", fmt.Errorf("cannot replace escaped characters with UTF-8 equivalents: %s", io.ErrShortBuffer)
}
return string(newBytes), nil
}
func parseUint64FromHexSlice(buf []byte) (uint64, error) {
var value uint64
for _, b := range buf {
diff := uint64(b - '0')
if diff < 10 {
value = (value << 4) | diff
continue
}
b10 := b + 10
diff = uint64(b10 - 'A')
if diff < 10 {
return 0, hex.InvalidByteError(b)
}
if diff < 16 {
value = (value << 4) | diff
continue
}
diff = uint64(b10 - 'a')
if diff < 10 {
return 0, hex.InvalidByteError(b)
}
if diff < 16 {
value = (value << 4) | diff
continue
}
return 0, hex.InvalidByteError(b)
}
return value, nil
}
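// exampleParseHex is a minimal sketch (the name is illustrative) of
// parseUint64FromHexSlice: each byte must be an ASCII hexadecimal digit in
// either case, and each digit contributes four bits, so "00E9" parses to
// 0xE9 (233).
func exampleParseHex() (uint64, error) {
	return parseUint64FromHexSlice([]byte("00E9"))
}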
func unescapeSpecialJSON(b byte) (byte, bool) {
// NOTE: The following 8 special JSON characters must be escaped:
switch b {
case '"', '\\', '/':
return b, true
case 'b':
return '\b', true
case 'f':
return '\f', true
case 'n':
return '\n', true
case 'r':
return '\r', true
case 't':
return '\t', true
}
return b, false
}
////////////////////////////////////////
// Text Encode
////////////////////////////////////////
func bytesTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
var someBytes []byte
switch d := datum.(type) {
case []byte:
someBytes = d
case string:
someBytes = []byte(d)
default:
return nil, fmt.Errorf("cannot encode textual bytes: expected: []byte or string; received: %T", datum)
}
buf = append(buf, '"') // prefix buffer with double quote
for _, b := range someBytes {
if escaped, ok := escapeSpecialJSON(b); ok {
buf = append(buf, escaped...)
continue
}
if r := rune(b); r < utf8.RuneSelf && unicode.IsPrint(r) {
buf = append(buf, b)
continue
}
		// This code point _could_ be encoded as a single byte, but it is
		// either above the standard ASCII range (b > 127) or not printable,
		// so it must be encoded using its four-character hexadecimal
		// equivalent, which will always start with the high byte 00
buf = appendUnicodeHex(buf, uint16(b))
}
return append(buf, '"'), nil // postfix buffer with double quote
}
func stringTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
var someString string
switch d := datum.(type) {
case []byte:
someString = string(d)
case string:
someString = d
default:
return nil, fmt.Errorf("cannot encode textual string: expected: []byte or string; received: %T", datum)
}
buf = append(buf, '"') // prefix buffer with double quote
for _, r := range someString {
if r < utf8.RuneSelf {
if escaped, ok := escapeSpecialJSON(byte(r)); ok {
buf = append(buf, escaped...)
continue
}
if unicode.IsPrint(r) {
buf = append(buf, byte(r))
continue
}
}
// NOTE: Attempt to encode code point as UTF-16 surrogate pair
r1, r2 := utf16.EncodeRune(r)
if r1 != unicode.ReplacementChar || r2 != unicode.ReplacementChar {
// code point does require surrogate pair, and thus two uint16 values
buf = appendUnicodeHex(buf, uint16(r1))
buf = appendUnicodeHex(buf, uint16(r2))
continue
}
// Code Point does not require surrogate pair.
buf = appendUnicodeHex(buf, uint16(r))
}
return append(buf, '"'), nil // postfix buffer with double quote
}
func appendUnicodeHex(buf []byte, v uint16) []byte {
// Start with '\u' prefix:
buf = append(buf, sliceUnicode...)
// And tack on 4 hexadecimal digits:
buf = append(buf, hexDigits[(v&0xF000)>>12])
buf = append(buf, hexDigits[(v&0xF00)>>8])
buf = append(buf, hexDigits[(v&0xF0)>>4])
buf = append(buf, hexDigits[(v&0xF)])
return buf
}
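// exampleAppendUnicodeHex is a minimal sketch (the name is illustrative) of
// appendUnicodeHex: the value is emitted as a JSON \uXXXX escape using the
// upper-case hexadecimal digits defined below, so 0x00E9 yields the six
// bytes `\u00E9`.
func exampleAppendUnicodeHex() []byte {
	return appendUnicodeHex(nil, 0x00E9)
}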
const hexDigits = "0123456789ABCDEF"
func escapeSpecialJSON(b byte) ([]byte, bool) {
// NOTE: The following 8 special JSON characters must be escaped:
switch b {
case '"':
return sliceQuote, true
case '\\':
return sliceBackslash, true
case '/':
return sliceSlash, true
case '\b':
return sliceBackspace, true
case '\f':
return sliceFormfeed, true
case '\n':
return sliceNewline, true
case '\r':
return sliceCarriageReturn, true
case '\t':
return sliceTab, true
}
return nil, false
}
// While slices in Go are never constants, we can initialize them once and reuse
// them many times. We define these slices at library load time and reuse them
// when encoding JSON.
var (
sliceQuote = []byte("\\\"")
sliceBackslash = []byte("\\\\")
sliceSlash = []byte("\\/")
sliceBackspace = []byte("\\b")
sliceFormfeed = []byte("\\f")
sliceNewline = []byte("\\n")
sliceCarriageReturn = []byte("\\r")
sliceTab = []byte("\\t")
sliceUnicode = []byte("\\u")
)
// DEBUG -- remove function prior to committing
func decodedStringFromJSON(buf []byte) (string, []byte, error) {
fmt.Fprintf(os.Stderr, "decodedStringFromJSON(%v)\n", buf)
buflen := len(buf)
if buflen < 2 {
return "", buf, fmt.Errorf("cannot decode string: %s", io.ErrShortBuffer)
}
if buf[0] != '"' {
return "", buf, fmt.Errorf("cannot decode string: expected initial '\"'; found: %#U", buf[0])
}
var newBytes []byte
var escaped, ok bool
	// Loop through the bytes following the initial double quote; we return
	// immediately upon finding an unescaped closing double quote.
for i := 1; i < buflen; i++ {
b := buf[i]
if escaped {
escaped = false
if b, ok = unescapeSpecialJSON(b); ok {
newBytes = append(newBytes, b)
continue
}
if b == 'u' {
// NOTE: Need at least 4 more bytes to read uint16, but subtract
// 1 because do not want to count the trailing quote and
// subtract another 1 because already consumed u but have yet to
// increment i.
if i > buflen-6 {
return "", buf[i+1:], fmt.Errorf("cannot decode string: %s", io.ErrShortBuffer)
}
v, err := parseUint64FromHexSlice(buf[i+1 : i+5])
if err != nil {
return "", buf[i+1:], fmt.Errorf("cannot decode string: %s", err)
}
i += 4 // absorb 4 characters: one 'u' and three of the digits
nbl := len(newBytes)
newBytes = append(newBytes, 0, 0, 0, 0) // grow to make room for UTF-8 encoded rune
r := rune(v)
if utf16.IsSurrogate(r) {
					i++ // absorb final hexadecimal digit from previous value
// Expect second half of surrogate pair
if i > buflen-6 || buf[i] != '\\' || buf[i+1] != 'u' {
return "", buf[i+1:], errors.New("cannot decode string: missing second half of surrogate pair")
}
v, err = parseUint64FromHexSlice(buf[i+2 : i+6])
if err != nil {
return "", buf[i+1:], fmt.Errorf("cannot decode string: cannot decode second half of surrogate pair: %s", err)
}
i += 5 // absorb 5 characters: two for '\u', and 3 of the 4 digits
// Get code point by combining high and low surrogate bits
r = utf16.DecodeRune(r, rune(v))
}
width := utf8.EncodeRune(newBytes[nbl:], r) // append UTF-8 encoded version of code point
newBytes = newBytes[:nbl+width] // trim off excess bytes
continue
}
newBytes = append(newBytes, b)
continue
}
if b == '\\' {
escaped = true
continue
}
if b == '"' {
return string(newBytes), buf[i+1:], nil
}
newBytes = append(newBytes, b)
}
return "", buf, fmt.Errorf("cannot decode string: expected final '\"'; found: %#U", buf[buflen-1])
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"sort"
"strconv"
"strings"
)
// pcfProcessor is a function type that given a parsed JSON object, returns its
// Parsing Canonical Form according to the Avro specification.
type pcfProcessor func(s interface{}) (string, error)
// parsingCanonicalForm returns the "Parsing Canonical Form" (pcf) for a parsed
// JSON structure of a valid Avro schema, or an error describing the schema
// error.
func parsingCanonicalForm(schema interface{}, parentNamespace string, typeLookup map[string]string) (string, error) {
switch val := schema.(type) {
case map[string]interface{}:
// JSON objects are decoded as a map of strings to empty interfaces
return pcfObject(val, parentNamespace, typeLookup)
case []interface{}:
// JSON arrays are decoded as a slice of empty interfaces
return pcfArray(val, parentNamespace, typeLookup)
case string:
// JSON string values are decoded as a Go string
return pcfString(val, typeLookup)
case float64:
// JSON numerical values are decoded as Go float64
return pcfNumber(val)
default:
return "", fmt.Errorf("cannot parse schema with invalid schema type; ought to be map[string]interface{}, []interface{}, string, or float64; received: %T: %v", schema, schema)
}
}
// pcfNumber returns the parsing canonical form for a numerical value.
func pcfNumber(val float64) (string, error) {
return strconv.FormatFloat(val, 'g', -1, 64), nil
}
// pcfString returns the parsing canonical form for a string value.
func pcfString(val string, typeLookup map[string]string) (string, error) {
if canonicalName, ok := typeLookup[val]; ok {
return `"` + canonicalName + `"`, nil
}
return `"` + val + `"`, nil
}
// pcfArray returns the parsing canonical form for a JSON array.
func pcfArray(val []interface{}, parentNamespace string, typeLookup map[string]string) (string, error) {
items := make([]string, len(val))
for i, el := range val {
p, err := parsingCanonicalForm(el, parentNamespace, typeLookup)
if err != nil {
return "", err
}
items[i] = p
}
return "[" + strings.Join(items, ",") + "]", nil
}
// pcfObject returns the parsing canonical form for a JSON object.
func pcfObject(jsonMap map[string]interface{}, parentNamespace string, typeLookup map[string]string) (string, error) {
pairs := make(stringPairs, 0, len(jsonMap))
// Remember the namespace to fully qualify names later
var namespace string
if namespaceJSON, ok := jsonMap["namespace"]; ok {
if namespaceStr, ok := namespaceJSON.(string); ok {
			// and its value is a string (otherwise the schema is invalid)
if parentNamespace == "" {
namespace = namespaceStr
} else {
namespace = parentNamespace + "." + namespaceStr
}
parentNamespace = namespace
}
} else if objectType, ok := jsonMap["type"]; ok && objectType == "record" {
namespace = parentNamespace
}
for k, v := range jsonMap {
// Reduce primitive schemas to their simple form.
if len(jsonMap) == 1 && k == "type" {
if t, ok := v.(string); ok {
return "\"" + t + "\"", nil
}
}
		// Only keep relevant attributes (strip 'doc', 'aliases', 'namespace')
if _, ok := fieldOrder[k]; !ok {
continue
}
// Add namespace to a non-qualified name.
if k == "name" && namespace != "" {
// Check if the name isn't already qualified.
if t, ok := v.(string); ok && !strings.ContainsRune(t, '.') {
v = namespace + "." + t
typeLookup[t] = v.(string)
}
}
// Only fixed type allows size, and we must convert a string size to a
// float.
if k == "size" {
if s, ok := v.(string); ok {
s, err := strconv.ParseUint(s, 10, 0)
if err != nil {
// should never get here because already validated schema
return "", fmt.Errorf("Fixed size ought to be number greater than zero: %v", s)
}
v = float64(s)
}
}
pk, err := parsingCanonicalForm(k, parentNamespace, typeLookup)
if err != nil {
return "", err
}
pv, err := parsingCanonicalForm(v, parentNamespace, typeLookup)
if err != nil {
return "", err
}
pairs = append(pairs, stringPair{k, pk + ":" + pv})
}
// Sort keys by their order in specification.
sort.Sort(byAvroFieldOrder(pairs))
return "{" + strings.Join(pairs.Bs(), ",") + "}", nil
}
// stringPair represents a pair of string values.
type stringPair struct {
A string
B string
}
// stringPairs is a sortable slice of pairs of strings.
type stringPairs []stringPair
// Bs returns a slice containing the second value of each pair.
func (sp *stringPairs) Bs() []string {
items := make([]string, len(*sp))
for i, el := range *sp {
items[i] = el.B
}
return items
}
// fieldOrder defines fields that show up in canonical schema and specifies
// their precedence.
var fieldOrder = map[string]int{
"name": 1,
"type": 2,
"fields": 3,
"symbols": 4,
"items": 5,
"values": 6,
"size": 7,
}
// byAvroFieldOrder sorts fields according to the precedence given in the
// specification.
type byAvroFieldOrder []stringPair
func (s byAvroFieldOrder) Len() int {
return len(s)
}
func (s byAvroFieldOrder) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
func (s byAvroFieldOrder) Less(i, j int) bool {
return fieldOrder[s[i].A] < fieldOrder[s[j].A]
}
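// examplePCF is a minimal sketch (the name is illustrative) of the reduction
// implemented above: attributes outside fieldOrder, such as "doc", are
// stripped; names are fully qualified with their namespace; and keys are
// emitted in specification order.
func examplePCF() (string, error) {
	schema := map[string]interface{}{
		"type":      "record",
		"name":      "R",
		"namespace": "ns",
		"doc":       "stripped from the canonical form",
		"fields": []interface{}{
			map[string]interface{}{"name": "f", "type": "long"},
		},
	}
	// Expected: {"name":"ns.R","type":"record","fields":[{"name":"f","type":"long"}]}
	return parsingCanonicalForm(schema, "", make(map[string]string))
}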
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"encoding/binary"
"encoding/json"
"fmt"
"math"
"strconv"
)
var (
// MaxBlockCount is the maximum number of data items allowed in a single
// block that will be decoded from a binary stream, whether when reading
// blocks to decode an array or a map, or when reading blocks from an OCF
// stream. This check is to ensure decoding binary data will not cause the
// library to over allocate RAM, potentially creating a denial of service on
// the system.
//
// If a particular application needs to decode binary Avro data that
// potentially has more data items in a single block, then this variable may
// be modified at your discretion.
MaxBlockCount = int64(math.MaxInt32)
// MaxBlockSize is the maximum number of bytes that will be allocated for a
// single block of data items when decoding from a binary stream. This check
	// is to ensure decoding binary data will not cause the library to
	// over-allocate RAM, potentially creating a denial of service on the system.
//
// If a particular application needs to decode binary Avro data that
// potentially has more bytes in a single block, then this variable may be
// modified at your discretion.
MaxBlockSize = int64(math.MaxInt32)
)
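// raiseDecodeLimits is a minimal sketch (the name is illustrative) of tuning
// these limits for a trusted data source; both variables are package level,
// so a change affects every Codec in the process.
func raiseDecodeLimits() {
	MaxBlockCount = int64(math.MaxInt64)
	MaxBlockSize = int64(math.MaxInt64)
}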
// Codec supports decoding binary and text Avro data to Go native data types,
// and conversely encoding Go native data types to binary or text Avro data. A
// Codec is created as a stateless structure that can be safely used in multiple
// goroutines simultaneously.
type Codec struct {
soeHeader []byte // single-object-encoding header
schemaOriginal string
schemaCanonical string
typeName *name
nativeFromTextual func([]byte) (interface{}, []byte, error)
binaryFromNative func([]byte, interface{}) ([]byte, error)
nativeFromBinary func([]byte) (interface{}, []byte, error)
textualFromNative func([]byte, interface{}) ([]byte, error)
Rabin uint64
}
// NewCodec returns a Codec used to translate between a byte slice of either
// binary or textual Avro data and native Go data.
//
// Creating a `Codec` is fast, but ought to be performed exactly once per Avro
// schema to process. Once a `Codec` is created, it may be used multiple times
// to convert data between native form and binary Avro representation, or
// between native form and textual Avro representation.
//
// A particular `Codec` can work with only one Avro schema. However,
// there is no practical limit to how many `Codec`s may be created and
// used in a program. Internally a `Codec` is merely a named tuple of
// four function pointers, and maintains no runtime state that is mutated
// after instantiation. In other words, `Codec`s may be safely used by
// many goroutines simultaneously, as your program requires.
//
// codec, err := goavro.NewCodec(`
// {
// "type": "record",
// "name": "LongList",
// "fields" : [
// {"name": "next", "type": ["null", "LongList"], "default": null}
// ]
// }`)
// if err != nil {
// fmt.Println(err)
// }
func NewCodec(schemaSpecification string) (*Codec, error) {
var schema interface{}
if err := json.Unmarshal([]byte(schemaSpecification), &schema); err != nil {
return nil, fmt.Errorf("cannot unmarshal schema JSON: %s", err)
}
// bootstrap a symbol table with primitive type codecs for the new codec
st := newSymbolTable()
c, err := buildCodec(st, nullNamespace, schema)
if err != nil {
return nil, err
}
c.schemaCanonical, err = parsingCanonicalForm(schema, "", make(map[string]string))
if err != nil {
return nil, err // should not get here because schema was validated above
}
c.Rabin = rabin([]byte(c.schemaCanonical))
c.soeHeader = []byte{0xC3, 0x01, 0, 0, 0, 0, 0, 0, 0, 0}
binary.LittleEndian.PutUint64(c.soeHeader[2:], c.Rabin)
c.schemaOriginal = schemaSpecification
return c, nil
}
func newSymbolTable() map[string]*Codec {
return map[string]*Codec{
"boolean": {
typeName: &name{"boolean", nullNamespace},
schemaOriginal: "boolean",
schemaCanonical: "boolean",
binaryFromNative: booleanBinaryFromNative,
nativeFromBinary: booleanNativeFromBinary,
nativeFromTextual: booleanNativeFromTextual,
textualFromNative: booleanTextualFromNative,
},
"bytes": {
typeName: &name{"bytes", nullNamespace},
schemaOriginal: "bytes",
schemaCanonical: "bytes",
binaryFromNative: bytesBinaryFromNative,
nativeFromBinary: bytesNativeFromBinary,
nativeFromTextual: bytesNativeFromTextual,
textualFromNative: bytesTextualFromNative,
},
"double": {
typeName: &name{"double", nullNamespace},
schemaOriginal: "double",
schemaCanonical: "double",
binaryFromNative: doubleBinaryFromNative,
nativeFromBinary: doubleNativeFromBinary,
nativeFromTextual: doubleNativeFromTextual,
textualFromNative: doubleTextualFromNative,
},
"float": {
typeName: &name{"float", nullNamespace},
schemaOriginal: "float",
schemaCanonical: "float",
binaryFromNative: floatBinaryFromNative,
nativeFromBinary: floatNativeFromBinary,
nativeFromTextual: floatNativeFromTextual,
textualFromNative: floatTextualFromNative,
},
"int": {
typeName: &name{"int", nullNamespace},
schemaOriginal: "int",
schemaCanonical: "int",
binaryFromNative: intBinaryFromNative,
nativeFromBinary: intNativeFromBinary,
nativeFromTextual: intNativeFromTextual,
textualFromNative: intTextualFromNative,
},
"long": {
typeName: &name{"long", nullNamespace},
schemaOriginal: "long",
schemaCanonical: "long",
binaryFromNative: longBinaryFromNative,
nativeFromBinary: longNativeFromBinary,
nativeFromTextual: longNativeFromTextual,
textualFromNative: longTextualFromNative,
},
"null": {
typeName: &name{"null", nullNamespace},
schemaOriginal: "null",
schemaCanonical: "null",
binaryFromNative: nullBinaryFromNative,
nativeFromBinary: nullNativeFromBinary,
nativeFromTextual: nullNativeFromTextual,
textualFromNative: nullTextualFromNative,
},
"string": {
typeName: &name{"string", nullNamespace},
schemaOriginal: "string",
schemaCanonical: "string",
binaryFromNative: stringBinaryFromNative,
nativeFromBinary: stringNativeFromBinary,
nativeFromTextual: stringNativeFromTextual,
textualFromNative: stringTextualFromNative,
},
		// Logical types that do not depend on additional schema attributes are
		// registered here under keys of the form typeName.logicalType.
"long.timestamp-millis": {
typeName: &name{"long.timestamp-millis", nullNamespace},
schemaOriginal: "long",
schemaCanonical: "long",
nativeFromTextual: nativeFromTimeStampMillis(longNativeFromTextual),
binaryFromNative: timeStampMillisFromNative(longBinaryFromNative),
nativeFromBinary: nativeFromTimeStampMillis(longNativeFromBinary),
textualFromNative: timeStampMillisFromNative(longTextualFromNative),
},
"long.timestamp-micros": {
typeName: &name{"long.timestamp-micros", nullNamespace},
schemaOriginal: "long",
schemaCanonical: "long",
nativeFromTextual: nativeFromTimeStampMicros(longNativeFromTextual),
binaryFromNative: timeStampMicrosFromNative(longBinaryFromNative),
nativeFromBinary: nativeFromTimeStampMicros(longNativeFromBinary),
textualFromNative: timeStampMicrosFromNative(longTextualFromNative),
},
"int.time-millis": {
typeName: &name{"int.time-millis", nullNamespace},
schemaOriginal: "int",
schemaCanonical: "int",
nativeFromTextual: nativeFromTimeMillis(intNativeFromTextual),
binaryFromNative: timeMillisFromNative(intBinaryFromNative),
nativeFromBinary: nativeFromTimeMillis(intNativeFromBinary),
textualFromNative: timeMillisFromNative(intTextualFromNative),
},
"long.time-micros": {
typeName: &name{"long.time-micros", nullNamespace},
schemaOriginal: "long",
schemaCanonical: "long",
nativeFromTextual: nativeFromTimeMicros(longNativeFromTextual),
binaryFromNative: timeMicrosFromNative(longBinaryFromNative),
nativeFromBinary: nativeFromTimeMicros(longNativeFromBinary),
textualFromNative: timeMicrosFromNative(longTextualFromNative),
},
"int.date": {
typeName: &name{"int.date", nullNamespace},
schemaOriginal: "int",
schemaCanonical: "int",
nativeFromTextual: nativeFromDate(intNativeFromTextual),
binaryFromNative: dateFromNative(intBinaryFromNative),
nativeFromBinary: nativeFromDate(intNativeFromBinary),
textualFromNative: dateFromNative(intTextualFromNative),
},
}
}
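// exampleSymbolTableLookup is a minimal sketch (the name is illustrative) of
// how the bootstrap table above is keyed: primitive types by bare name, and
// schema-independent logical types by "typeName.logicalType".
func exampleSymbolTableLookup() (*Codec, bool) {
	st := newSymbolTable()
	c, ok := st["long.timestamp-millis"]
	return c, ok
}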
// BinaryFromNative appends the binary encoded byte slice representation of the
// provided native datum value to the provided byte slice in accordance with the
// Avro schema supplied when creating the Codec. It is supplied a byte slice to
// which to append the binary encoded data along with the actual data to encode.
// On success, it returns a new byte slice with the encoded bytes appended, and
// a nil error value. On error, it returns the original byte slice, and the
// error message.
//
// func ExampleBinaryFromNative() {
// codec, err := goavro.NewCodec(`
// {
// "type": "record",
// "name": "LongList",
// "fields" : [
// {"name": "next", "type": ["null", "LongList"], "default": null}
// ]
// }`)
// if err != nil {
// fmt.Println(err)
// }
//
// // Convert native Go form to binary Avro data
// binary, err := codec.BinaryFromNative(nil, map[string]interface{}{
// "next": map[string]interface{}{
// "LongList": map[string]interface{}{
// "next": map[string]interface{}{
// "LongList": map[string]interface{}{
// // NOTE: May omit fields when using default value
// },
// },
// },
// },
// })
// if err != nil {
// fmt.Println(err)
// }
//
// fmt.Printf("%#v", binary)
// // Output: []byte{0x2, 0x2, 0x0}
// }
func (c *Codec) BinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
newBuf, err := c.binaryFromNative(buf, datum)
if err != nil {
return buf, err // if error, return original byte slice
}
return newBuf, nil
}
// NativeFromBinary returns a native datum value from the binary encoded byte
// slice in accordance with the Avro schema supplied when creating the Codec. On
// success, it returns the decoded datum, a byte slice containing the remaining
// undecoded bytes, and a nil error value. On error, it returns nil for
// the datum value, the original byte slice, and the error message.
//
// func ExampleNativeFromBinary() {
// codec, err := goavro.NewCodec(`
// {
// "type": "record",
// "name": "LongList",
// "fields" : [
// {"name": "next", "type": ["null", "LongList"], "default": null}
// ]
// }`)
// if err != nil {
// fmt.Println(err)
// }
//
// // Convert native Go form to binary Avro data
// binary := []byte{0x2, 0x2, 0x0}
//
// native, _, err := codec.NativeFromBinary(binary)
// if err != nil {
// fmt.Println(err)
// }
//
// fmt.Printf("%v", native)
// // Output: map[next:map[LongList:map[next:map[LongList:map[next:<nil>]]]]]
// }
func (c *Codec) NativeFromBinary(buf []byte) (interface{}, []byte, error) {
value, newBuf, err := c.nativeFromBinary(buf)
if err != nil {
return nil, buf, err // if error, return original byte slice
}
return value, newBuf, nil
}
// NativeFromSingle converts Avro data from Single-Object-Encoded format from
// the provided byte slice to Go native data types in accordance with the Avro
// schema supplied when creating the Codec. On success, it returns the decoded
// datum, along with a new byte slice with the decoded bytes consumed, and a nil
// error value. On error, it returns nil for the datum value, the original byte
// slice, and the error message.
//
// func decode(codec *goavro.Codec, buf []byte) error {
// datum, _, err := codec.NativeFromSingle(buf)
// if err != nil {
// return err
// }
// _, err = fmt.Println(datum)
// return err
// }
func (c *Codec) NativeFromSingle(buf []byte) (interface{}, []byte, error) {
fingerprint, newBuf, err := FingerprintFromSOE(buf)
if err != nil {
return nil, buf, err
}
if !bytes.Equal(buf[:len(c.soeHeader)], c.soeHeader) {
return nil, buf, ErrWrongCodec(fingerprint)
}
value, newBuf, err := c.nativeFromBinary(newBuf)
if err != nil {
return nil, buf, err // if error, return original byte slice
}
return value, newBuf, nil
}
// NativeFromTextual converts Avro data in JSON text format from the provided byte
// slice to Go native data types in accordance with the Avro schema supplied
// when creating the Codec. On success, it returns the decoded datum, along with
// a new byte slice with the decoded bytes consumed, and a nil error value. On
// error, it returns nil for the datum value, the original byte slice, and the
// error message.
//
// func ExampleNativeFromTextual() {
// codec, err := goavro.NewCodec(`
// {
// "type": "record",
// "name": "LongList",
// "fields" : [
// {"name": "next", "type": ["null", "LongList"], "default": null}
// ]
// }`)
// if err != nil {
// fmt.Println(err)
// }
//
// // Convert native Go form to text Avro data
// text := []byte(`{"next":{"LongList":{"next":{"LongList":{"next":null}}}}}`)
//
// native, _, err := codec.NativeFromTextual(text)
// if err != nil {
// fmt.Println(err)
// }
//
// fmt.Printf("%v", native)
// // Output: map[next:map[LongList:map[next:map[LongList:map[next:<nil>]]]]]
// }
func (c *Codec) NativeFromTextual(buf []byte) (interface{}, []byte, error) {
value, newBuf, err := c.nativeFromTextual(buf)
if err != nil {
return nil, buf, err // if error, return original byte slice
}
return value, newBuf, nil
}
// SingleFromNative appends the single-object-encoding byte slice representation
// of the provided native datum value to the provided byte slice in accordance
// with the Avro schema supplied when creating the Codec. It is supplied a byte
// slice to which to append the header and binary encoded data, along with the
// actual data to encode. On success, it returns a new byte slice with the
// encoded bytes appended, and a nil error value. On error, it returns the
// original byte slice, and the error message.
//
// func ExampleSingleItemEncoding() {
// codec, err := goavro.NewCodec(`"int"`)
// if err != nil {
// fmt.Fprintf(os.Stderr, "%s\n", err)
// return
// }
//
// buf, err := codec.SingleFromNative(nil, 3)
// if err != nil {
// fmt.Fprintf(os.Stderr, "%s\n", err)
// return
// }
//
// fmt.Println(buf)
// // Output: [195 1 143 92 57 63 26 213 117 114 6]
// }
func (c *Codec) SingleFromNative(buf []byte, datum interface{}) ([]byte, error) {
newBuf, err := c.binaryFromNative(append(buf, c.soeHeader...), datum)
if err != nil {
return buf, err
}
return newBuf, nil
}
// TextualFromNative converts Go native data types to Avro data in JSON text format in
// accordance with the Avro schema supplied when creating the Codec. It is
// supplied a byte slice to which to append the encoded data and the actual data
// to encode. On success, it returns a new byte slice with the encoded bytes
// appended, and a nil error value. On error, it returns the original byte
// slice, and the error message.
//
// func ExampleTextualFromNative() {
// codec, err := goavro.NewCodec(`
// {
// "type": "record",
// "name": "LongList",
// "fields" : [
// {"name": "next", "type": ["null", "LongList"], "default": null}
// ]
// }`)
// if err != nil {
// fmt.Println(err)
// }
//
// // Convert native Go form to text Avro data
// text, err := codec.TextualFromNative(nil, map[string]interface{}{
// "next": map[string]interface{}{
// "LongList": map[string]interface{}{
// "next": map[string]interface{}{
// "LongList": map[string]interface{}{
// // NOTE: May omit fields when using default value
// },
// },
// },
// },
// })
// if err != nil {
// fmt.Println(err)
// }
//
// fmt.Printf("%s", text)
// // Output: {"next":{"LongList":{"next":{"LongList":{"next":null}}}}}
// }
func (c *Codec) TextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
newBuf, err := c.textualFromNative(buf, datum)
if err != nil {
return buf, err // if error, return original byte slice
}
return newBuf, nil
}
// Schema returns the original schema used to create the Codec.
func (c *Codec) Schema() string {
return c.schemaOriginal
}
// CanonicalSchema returns the Parsing Canonical Form of the schema according to
// the Avro specification.
func (c *Codec) CanonicalSchema() string {
return c.schemaCanonical
}
// SchemaCRC64Avro returns a signed 64-bit integer Rabin fingerprint for the
// canonical schema. This method returns the signed 64-bit cast of the unsigned
// 64-bit schema Rabin fingerprint.
//
// DEPRECATED: This method has been replaced by the Rabin structure Codec field
// and is provided for backward compatibility only.
func (c *Codec) SchemaCRC64Avro() int64 {
return int64(c.Rabin)
}
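// exampleSOEHeader is a minimal sketch (the name is illustrative) of how the
// Rabin fingerprint relates to the single-object-encoding header built in
// NewCodec: the two magic bytes 0xC3 0x01 followed by the fingerprint in
// little-endian byte order.
func exampleSOEHeader(c *Codec) []byte {
	header := []byte{0xC3, 0x01, 0, 0, 0, 0, 0, 0, 0, 0}
	binary.LittleEndian.PutUint64(header[2:], c.Rabin)
	return header // identical to c.soeHeader
}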
// convert a schema data structure to a codec, prefixing with specified
// namespace
func buildCodec(st map[string]*Codec, enclosingNamespace string, schema interface{}) (*Codec, error) {
switch schemaType := schema.(type) {
case map[string]interface{}:
return buildCodecForTypeDescribedByMap(st, enclosingNamespace, schemaType)
case string:
return buildCodecForTypeDescribedByString(st, enclosingNamespace, schemaType, nil)
case []interface{}:
return buildCodecForTypeDescribedBySlice(st, enclosingNamespace, schemaType)
default:
return nil, fmt.Errorf("unknown schema type: %T", schema)
}
}
// Reach into the map, grabbing its "type". Use that to create the codec.
func buildCodecForTypeDescribedByMap(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
t, ok := schemaMap["type"]
if !ok {
return nil, fmt.Errorf("missing type: %v", schemaMap)
}
switch v := t.(type) {
case string:
		// Already defined types may be abbreviated with their string names.
// EXAMPLE: "type":"array"
// EXAMPLE: "type":"enum"
// EXAMPLE: "type":"fixed"
// EXAMPLE: "type":"int"
// EXAMPLE: "type":"record"
// EXAMPLE: "type":"somePreviouslyDefinedCustomTypeString"
return buildCodecForTypeDescribedByString(st, enclosingNamespace, v, schemaMap)
case map[string]interface{}:
return buildCodecForTypeDescribedByMap(st, enclosingNamespace, v)
case []interface{}:
return buildCodecForTypeDescribedBySlice(st, enclosingNamespace, v)
default:
return nil, fmt.Errorf("type ought to be either string, map[string]interface{}, or []interface{}; received: %T", t)
}
}
func buildCodecForTypeDescribedByString(st map[string]*Codec, enclosingNamespace string, typeName string, schemaMap map[string]interface{}) (*Codec, error) {
isLogicalType := false
searchType := typeName
	// logicalType will be non-nil for those fields with a logicalType property set
if lt := schemaMap["logicalType"]; lt != nil {
isLogicalType = true
searchType = fmt.Sprintf("%s.%s", typeName, lt)
}
// NOTE: When codec already exists, return it. This includes both primitive and
// logicalType codecs added in NewCodec, and user-defined types, added while
// building the codec.
if cd, ok := st[searchType]; ok {
return cd, nil
}
// Avro specification allows abbreviation of type name inside a namespace.
if enclosingNamespace != "" {
if cd, ok := st[enclosingNamespace+"."+typeName]; ok {
return cd, nil
}
}
// There are only a small handful of complex Avro data types.
switch searchType {
case "array":
return makeArrayCodec(st, enclosingNamespace, schemaMap)
case "enum":
return makeEnumCodec(st, enclosingNamespace, schemaMap)
case "fixed":
return makeFixedCodec(st, enclosingNamespace, schemaMap)
case "map":
return makeMapCodec(st, enclosingNamespace, schemaMap)
case "record":
return makeRecordCodec(st, enclosingNamespace, schemaMap)
case "bytes.decimal":
return makeDecimalBytesCodec(st, enclosingNamespace, schemaMap)
case "fixed.decimal":
return makeDecimalFixedCodec(st, enclosingNamespace, schemaMap)
default:
if isLogicalType {
delete(schemaMap, "logicalType")
return buildCodecForTypeDescribedByString(st, enclosingNamespace, typeName, schemaMap)
}
return nil, fmt.Errorf("unknown type name: %q", searchType)
}
}
// The notion of enclosing namespace changes when a record, enum, or fixed
// type creates a new namespace for its child objects.
func registerNewCodec(st map[string]*Codec, schemaMap map[string]interface{}, enclosingNamespace string) (*Codec, error) {
n, err := newNameFromSchemaMap(enclosingNamespace, schemaMap)
if err != nil {
return nil, err
}
c := &Codec{typeName: n}
st[n.fullName] = c
return c, nil
}
// ErrWrongCodec is returned when an attempt is made to decode a single-object
// encoded value using the wrong codec.
type ErrWrongCodec uint64
func (e ErrWrongCodec) Error() string { return "wrong codec: " + strconv.FormatUint(uint64(e), 10) }
// ErrNotSingleObjectEncoded is returned when an attempt is made to decode a
// single-object encoded value from a buffer that does not have the correct
// magic prefix.
type ErrNotSingleObjectEncoded string
func (e ErrNotSingleObjectEncoded) Error() string {
return "cannot decode buffer as single-object encoding: " + string(e)
}
// +build goavro_debug
package goavro
import (
"fmt"
"os"
)
// debug formats and prints arguments to stderr for development builds
func debug(f string, a ...interface{}) {
os.Stderr.Write([]byte("goavro: " + fmt.Sprintf(f, a...)))
}
// +build !goavro_debug
package goavro
// debug is a no-op for release builds, and the function call is optimized out
// by the compiler.
func debug(_ string, _ ...interface{}) {}
/*
Package goavro is a library that encodes and decodes Avro data.
Goavro provides methods to encode native Go data into both binary and textual
JSON Avro data, and methods to decode both binary and textual JSON Avro data to
native Go data.
Goavro also provides methods to read and write Object Container File (OCF)
formatted files, and the library contains example programs to read and write OCF
files.
Usage Example:
package main
import (
"fmt"
"github.com/linkedin/goavro"
)
func main() {
codec, err := goavro.NewCodec(`
{
"type": "record",
"name": "LongList",
"fields" : [
{"name": "next", "type": ["null", "LongList", {"type": "long", "logicalType": "timestamp-millis"}], "default": null}
]
}`)
if err != nil {
fmt.Println(err)
}
// NOTE: May omit fields when using default value
textual := []byte(`{"next":{"LongList":{}}}`)
// Convert textual Avro data (in Avro JSON format) to native Go form
native, _, err := codec.NativeFromTextual(textual)
if err != nil {
fmt.Println(err)
}
// Convert native Go form to binary Avro data
binary, err := codec.BinaryFromNative(nil, native)
if err != nil {
fmt.Println(err)
}
// Convert binary Avro data back to native Go form
native, _, err = codec.NativeFromBinary(binary)
if err != nil {
fmt.Println(err)
}
// Convert native Go form to textual Avro data
textual, err = codec.TextualFromNative(nil, native)
if err != nil {
fmt.Println(err)
}
// NOTE: Textual encoding will show all fields, even those with values that
// match their default values
fmt.Println(string(textual))
// Output: {"next":{"LongList":{"next":null}}}
}
*/
package goavro
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"io"
)
// An enum has no child objects, so whatever namespace it defines is used
// only to store its name in the symbol table.
func makeEnumCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
c, err := registerNewCodec(st, schemaMap, enclosingNamespace)
if err != nil {
return nil, fmt.Errorf("Enum ought to have valid name: %s", err)
}
// enum type must have symbols
s1, ok := schemaMap["symbols"]
if !ok {
return nil, fmt.Errorf("Enum %q ought to have symbols key", c.typeName)
}
s2, ok := s1.([]interface{})
if !ok || len(s2) == 0 {
return nil, fmt.Errorf("Enum %q symbols ought to be non-empty array of strings: %v", c.typeName, s1)
}
symbols := make([]string, len(s2))
for i, s := range s2 {
symbol, ok := s.(string)
if !ok {
return nil, fmt.Errorf("Enum %q symbol %d ought to be non-empty string; received: %T", c.typeName, i+1, symbol)
}
if err := checkString(symbol); err != nil {
return nil, fmt.Errorf("Enum %q symbol %d ought to %s", c.typeName, i+1, err)
}
symbols[i] = symbol
}
c.nativeFromBinary = func(buf []byte) (interface{}, []byte, error) {
var value interface{}
var err error
var index int64
if value, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary enum %q index: %s", c.typeName, err)
}
index = value.(int64)
if index < 0 || index >= int64(len(symbols)) {
return nil, nil, fmt.Errorf("cannot decode binary enum %q: index ought to be between 0 and %d; read index: %d", c.typeName, len(symbols)-1, index)
}
return symbols[index], buf, nil
}
c.binaryFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
someString, ok := datum.(string)
if !ok {
return nil, fmt.Errorf("cannot encode binary enum %q: expected string; received: %T", c.typeName, datum)
}
for i, symbol := range symbols {
if symbol == someString {
return longBinaryFromNative(buf, i)
}
}
return nil, fmt.Errorf("cannot encode binary enum %q: value ought to be member of symbols: %v; %q", c.typeName, symbols, someString)
}
c.nativeFromTextual = func(buf []byte) (interface{}, []byte, error) {
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 {
return nil, nil, fmt.Errorf("cannot decode textual enum: %s", io.ErrShortBuffer)
}
// decode enum string
var value interface{}
var err error
value, buf, err = stringNativeFromTextual(buf)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual enum: expected key: %s", err)
}
someString := value.(string)
for _, symbol := range symbols {
if symbol == someString {
return someString, buf, nil
}
}
return nil, nil, fmt.Errorf("cannot decode textual enum %q: value ought to be member of symbols: %v; %q", c.typeName, symbols, someString)
}
c.textualFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
someString, ok := datum.(string)
if !ok {
return nil, fmt.Errorf("cannot encode textual enum %q: expected string; received: %T", c.typeName, datum)
}
for _, symbol := range symbols {
if symbol == someString {
return stringTextualFromNative(buf, someString)
}
}
return nil, fmt.Errorf("cannot encode textual enum %q: value ought to be member of symbols: %v; %q", c.typeName, symbols, someString)
}
return c, nil
}
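// exampleEnumEncode is a minimal sketch (the name and schema are
// illustrative) of the binary enum encoding implemented above: a symbol is
// written as its zero-based index, zig-zag encoded as an Avro long, so "B"
// below encodes to the single byte 0x02.
func exampleEnumEncode() ([]byte, error) {
	schemaMap := map[string]interface{}{
		"type":    "enum",
		"name":    "Suit",
		"symbols": []interface{}{"A", "B", "C"},
	}
	c, err := makeEnumCodec(newSymbolTable(), nullNamespace, schemaMap)
	if err != nil {
		return nil, err
	}
	return c.binaryFromNative(nil, "B")
}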
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"strconv"
)
// A fixed type has no child objects, so whatever namespace it defines is
// used only to store its name in the symbol table.
func makeFixedCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
c, err := registerNewCodec(st, schemaMap, enclosingNamespace)
if err != nil {
return nil, fmt.Errorf("Fixed ought to have valid name: %s", err)
}
size, err := sizeFromSchemaMap(c.typeName, schemaMap)
if err != nil {
return nil, err
}
c.nativeFromBinary = func(buf []byte) (interface{}, []byte, error) {
if buflen := uint(len(buf)); size > buflen {
return nil, nil, fmt.Errorf("cannot decode binary fixed %q: schema size exceeds remaining buffer size: %d > %d (short buffer)", c.typeName, size, buflen)
}
return buf[:size], buf[size:], nil
}
c.binaryFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
var someBytes []byte
switch d := datum.(type) {
case []byte:
someBytes = d
case string:
someBytes = []byte(d)
default:
return nil, fmt.Errorf("cannot encode binary fixed %q: expected []byte or string; received: %T", c.typeName, datum)
}
if count := uint(len(someBytes)); count != size {
return nil, fmt.Errorf("cannot encode binary fixed %q: datum size ought to equal schema size: %d != %d", c.typeName, count, size)
}
return append(buf, someBytes...), nil
}
c.nativeFromTextual = func(buf []byte) (interface{}, []byte, error) {
if buflen := uint(len(buf)); size > buflen {
return nil, nil, fmt.Errorf("cannot decode textual fixed %q: schema size exceeds remaining buffer size: %d > %d (short buffer)", c.typeName, size, buflen)
}
var datum interface{}
var err error
datum, buf, err = bytesNativeFromTextual(buf)
if err != nil {
return nil, buf, err
}
datumBytes := datum.([]byte)
if count := uint(len(datumBytes)); count != size {
return nil, nil, fmt.Errorf("cannot decode textual fixed %q: datum size ought to equal schema size: %d != %d", c.typeName, count, size)
}
return datum, buf, err
}
c.textualFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
var someBytes []byte
switch d := datum.(type) {
case []byte:
someBytes = d
case string:
someBytes = []byte(d)
default:
return nil, fmt.Errorf("cannot encode textual fixed %q: expected []byte or string; received: %T", c.typeName, datum)
}
if count := uint(len(someBytes)); count != size {
return nil, fmt.Errorf("cannot encode textual fixed %q: datum size ought to equal schema size: %d != %d", c.typeName, count, size)
}
return bytesTextualFromNative(buf, someBytes)
}
return c, nil
}
func sizeFromSchemaMap(typeName *name, schemaMap map[string]interface{}) (uint, error) {
// Fixed type must have size
sizeRaw, ok := schemaMap["size"]
if !ok {
return 0, fmt.Errorf("Fixed %q ought to have size key", typeName)
}
var size uint
switch val := sizeRaw.(type) {
case string:
s, err := strconv.ParseUint(val, 10, 0)
if err != nil {
return 0, fmt.Errorf("Fixed %q size ought to be number greater than zero: %v", typeName, sizeRaw)
}
size = uint(s)
case float64:
if val <= 0 {
return 0, fmt.Errorf("Fixed %q size ought to be number greater than zero: %v", typeName, sizeRaw)
}
size = uint(val)
default:
return 0, fmt.Errorf("Fixed %q size ought to be number greater than zero: %v", typeName, sizeRaw)
}
return size, nil
}
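// exampleFixedEncode is a minimal sketch (the name and schema are
// illustrative) of the fixed codec built above: a fixed value is written as
// exactly "size" raw bytes with no length prefix.
func exampleFixedEncode() ([]byte, error) {
	schemaMap := map[string]interface{}{
		"type": "fixed",
		"name": "quad",
		"size": float64(4),
	}
	c, err := makeFixedCodec(newSymbolTable(), nullNamespace, schemaMap)
	if err != nil {
		return nil, err
	}
	return c.binaryFromNative(nil, []byte{0xDE, 0xAD, 0xBE, 0xEF})
}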
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"math"
"strconv"
)
const (
doubleEncodedLength = 8 // double requires 8 bytes
floatEncodedLength = 4 // float requires 4 bytes
)
////////////////////////////////////////
// Binary Decode
////////////////////////////////////////
func doubleNativeFromBinary(buf []byte) (interface{}, []byte, error) {
if len(buf) < doubleEncodedLength {
return nil, nil, fmt.Errorf("cannot decode binary double: %s", io.ErrShortBuffer)
}
return math.Float64frombits(binary.LittleEndian.Uint64(buf[:doubleEncodedLength])), buf[doubleEncodedLength:], nil
}
func floatNativeFromBinary(buf []byte) (interface{}, []byte, error) {
if len(buf) < floatEncodedLength {
return nil, nil, fmt.Errorf("cannot decode binary float: %s", io.ErrShortBuffer)
}
return math.Float32frombits(binary.LittleEndian.Uint32(buf[:floatEncodedLength])), buf[floatEncodedLength:], nil
}
////////////////////////////////////////
// Binary Encode
////////////////////////////////////////
func doubleBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
var value float64
switch v := datum.(type) {
case float64:
value = v
case float32:
value = float64(v)
case int:
if value = float64(v); int(value) != v {
return nil, fmt.Errorf("cannot encode binary double: provided Go int would lose precision: %d", v)
}
case int64:
if value = float64(v); int64(value) != v {
return nil, fmt.Errorf("cannot encode binary double: provided Go int64 would lose precision: %d", v)
}
case int32:
if value = float64(v); int32(value) != v {
return nil, fmt.Errorf("cannot encode binary double: provided Go int32 would lose precision: %d", v)
}
default:
return nil, fmt.Errorf("cannot encode binary double: expected: Go numeric; received: %T", datum)
}
buf = append(buf, 0, 0, 0, 0, 0, 0, 0, 0)
binary.LittleEndian.PutUint64(buf[len(buf)-doubleEncodedLength:], math.Float64bits(value))
return buf, nil
}
func floatBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
var value float32
switch v := datum.(type) {
case float32:
value = v
case float64:
		// Assume the runtime can cast special floats correctly, and if there
		// is a loss of precision from float64 to float32, that should be
		// expected or at least understood by the client.
value = float32(v)
case int:
if value = float32(v); int(value) != v {
return nil, fmt.Errorf("cannot encode binary float: provided Go int would lose precision: %d", v)
}
case int64:
if value = float32(v); int64(value) != v {
return nil, fmt.Errorf("cannot encode binary float: provided Go int64 would lose precision: %d", v)
}
case int32:
if value = float32(v); int32(value) != v {
return nil, fmt.Errorf("cannot encode binary float: provided Go int32 would lose precision: %d", v)
}
default:
return nil, fmt.Errorf("cannot encode binary float: expected: Go numeric; received: %T", datum)
}
buf = append(buf, 0, 0, 0, 0)
	binary.LittleEndian.PutUint32(buf[len(buf)-floatEncodedLength:], math.Float32bits(value))
return buf, nil
}
////////////////////////////////////////
// Text Decode
////////////////////////////////////////
func doubleNativeFromTextual(buf []byte) (interface{}, []byte, error) {
return floatingTextDecoder(buf, 64)
}
func floatNativeFromTextual(buf []byte) (interface{}, []byte, error) {
return floatingTextDecoder(buf, 32)
}
func floatingTextDecoder(buf []byte, bitSize int) (interface{}, []byte, error) {
buflen := len(buf)
if buflen >= 4 {
if bytes.Equal(buf[:4], []byte("null")) {
return math.NaN(), buf[4:], nil
}
if buflen >= 5 {
if bytes.Equal(buf[:5], []byte("1e999")) {
return math.Inf(1), buf[5:], nil
}
if buflen >= 6 {
if bytes.Equal(buf[:6], []byte("-1e999")) {
return math.Inf(-1), buf[6:], nil
}
}
}
}
index, err := numberLength(buf, true) // NOTE: floatAllowed = true
if err != nil {
return nil, nil, err
}
datum, err := strconv.ParseFloat(string(buf[:index]), bitSize)
if err != nil {
return nil, nil, err
}
if bitSize == 32 {
return float32(datum), buf[index:], nil
}
return datum, buf[index:], nil
}
func numberLength(buf []byte, floatAllowed bool) (int, error) {
// ALGORITHM: increment index as long as bytes are valid for number state engine.
var index, buflen, count int
var b byte
// STATE 0: begin, optional: -
if buflen = len(buf); index == buflen {
return 0, io.ErrShortBuffer
}
if buf[index] == '-' {
if index++; index == buflen {
return 0, io.ErrShortBuffer
}
}
// STATE 1: if 0, goto 2; otherwise if 1-9, goto 3; otherwise bail
if b = buf[index]; b == '0' {
if index++; index == buflen {
return index, nil // valid number
}
} else if b >= '1' && b <= '9' {
if index++; index == buflen {
return index, nil // valid number
}
// STATE 3: absorb zero or more digits
for {
if b = buf[index]; b < '0' || b > '9' {
break
}
if index++; index == buflen {
return index, nil // valid number
}
}
} else {
return 0, fmt.Errorf("unexpected byte: %q", b)
}
if floatAllowed {
// STATE 2: if ., goto 4; otherwise goto 5
if buf[index] == '.' {
if index++; index == buflen {
return 0, io.ErrShortBuffer
}
// STATE 4: absorb one or more digits
for {
if b = buf[index]; b < '0' || b > '9' {
break
}
count++
if index++; index == buflen {
return index, nil // valid number
}
}
if count == 0 {
// did not get at least one digit
return 0, fmt.Errorf("unexpected byte: %q", b)
}
}
// STATE 5: if e|E, goto 6; otherwise goto 7
if b = buf[index]; b == 'e' || b == 'E' {
if index++; index == buflen {
return 0, io.ErrShortBuffer
}
// STATE 6: optionally consume -|+, then goto 8
if b = buf[index]; b == '+' || b == '-' {
if index++; index == buflen {
return 0, io.ErrShortBuffer
}
}
// STATE 8: absorb one or more digits
count = 0
for {
if b = buf[index]; b < '0' || b > '9' {
break
}
count++
if index++; index == buflen {
return index, nil // valid number
}
}
if count == 0 {
// did not get at least one digit
return 0, fmt.Errorf("unexpected byte: %q", b)
}
}
}
// STATE 7: end
return index, nil
}
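// NOTE (editor's illustrative sketch): numberLength walks the JSON number
// grammar as a small state machine: an optional '-', an integer part ('0' or
// [1-9][0-9]*), and, when floatAllowed is true, an optional fraction and
// exponent. It reports how many leading bytes of buf form a valid number:
//
// n, _ := numberLength([]byte("-12.5e3,"), true)
// // n == 7; scanning stops at the ',' byte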
////////////////////////////////////////
// Text Encode
////////////////////////////////////////
func floatTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
return floatingTextEncoder(buf, datum, 32)
}
func doubleTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
return floatingTextEncoder(buf, datum, 64)
}
func floatingTextEncoder(buf []byte, datum interface{}, bitSize int) ([]byte, error) {
var isFloat bool
var someFloat64 float64
var someInt64 int64
switch v := datum.(type) {
case float32:
isFloat = true
someFloat64 = float64(v)
case float64:
isFloat = true
someFloat64 = v
case int:
if someInt64 = int64(v); int(someInt64) != v {
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual double: provided Go int would lose precision: %d", v)
}
return nil, fmt.Errorf("cannot encode textual float: provided Go int would lose precision: %d", v)
}
case int64:
someInt64 = v
case int32:
if someInt64 = int64(v); int32(someInt64) != v {
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual double: provided Go int32 would lose precision: %d", v)
}
return nil, fmt.Errorf("cannot encode textual float: provided Go int32 would lose precision: %d", v)
}
default:
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual double: expected: Go numeric; received: %T", datum)
}
return nil, fmt.Errorf("cannot encode textual float: expected: Go numeric; received: %T", datum)
}
if isFloat {
if math.IsNaN(someFloat64) {
return append(buf, "null"...), nil
}
if math.IsInf(someFloat64, 1) {
return append(buf, "1e999"...), nil
}
if math.IsInf(someFloat64, -1) {
return append(buf, "-1e999"...), nil
}
return strconv.AppendFloat(buf, someFloat64, 'g', -1, bitSize), nil
}
return strconv.AppendInt(buf, someInt64, 10), nil
}
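// NOTE (editor's illustrative sketch): JSON has no literals for NaN or the
// infinities, so the encoder above emits "null", "1e999", and "-1e999" for
// them, the same tokens floatingTextDecoder maps back to the special values:
//
// buf, _ := doubleTextualFromNative(nil, math.Inf(1))
// // string(buf) == "1e999"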
module github.com/linkedin/goavro/v2
go 1.12
require github.com/golang/snappy v0.0.1
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"io"
"strconv"
)
const (
intDownShift = uint32(31)
intFlag = byte(128)
intMask = byte(127)
longDownShift = uint32(63)
)
////////////////////////////////////////
// Binary Decode
////////////////////////////////////////
func intNativeFromBinary(buf []byte) (interface{}, []byte, error) {
var offset, value int
var shift uint
for offset = 0; offset < len(buf); offset++ {
b := buf[offset]
value |= int(b&intMask) << shift
if b&intFlag == 0 {
return (int32(value>>1) ^ -int32(value&1)), buf[offset+1:], nil
}
shift += 7
}
return nil, nil, io.ErrShortBuffer
}
func longNativeFromBinary(buf []byte) (interface{}, []byte, error) {
var offset int
var value uint64
var shift uint
for offset = 0; offset < len(buf); offset++ {
b := buf[offset]
value |= uint64(b&intMask) << shift
if b&intFlag == 0 {
return (int64(value>>1) ^ -int64(value&1)), buf[offset+1:], nil
}
shift += 7
}
return nil, nil, io.ErrShortBuffer
}
////////////////////////////////////////
// Binary Encode
////////////////////////////////////////
func intBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
var value int32
switch v := datum.(type) {
case int32:
value = v
case int:
if value = int32(v); int(value) != v {
return nil, fmt.Errorf("cannot encode binary int: provided Go int would lose precision: %d", v)
}
case int64:
if value = int32(v); int64(value) != v {
return nil, fmt.Errorf("cannot encode binary int: provided Go int64 would lose precision: %d", v)
}
case float64:
if value = int32(v); float64(value) != v {
return nil, fmt.Errorf("cannot encode binary int: provided Go float64 would lose precision: %f", v)
}
case float32:
if value = int32(v); float32(value) != v {
return nil, fmt.Errorf("cannot encode binary int: provided Go float32 would lose precision: %f", v)
}
default:
return nil, fmt.Errorf("cannot encode binary int: expected: Go numeric; received: %T", datum)
}
encoded := uint64((uint32(value) << 1) ^ uint32(value>>intDownShift))
return integerBinaryEncoder(buf, encoded)
}
func longBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
var value int64
switch v := datum.(type) {
case int64:
value = v
case int:
value = int64(v)
case int32:
value = int64(v)
case float64:
if value = int64(v); float64(value) != v {
return nil, fmt.Errorf("cannot encode binary long: provided Go float64 would lose precision: %f", v)
}
case float32:
if value = int64(v); float32(value) != v {
return nil, fmt.Errorf("cannot encode binary long: provided Go float32 would lose precision: %f", v)
}
default:
return nil, fmt.Errorf("long: expected: Go numeric; received: %T", datum)
}
encoded := (uint64(value) << 1) ^ uint64(value>>longDownShift)
return integerBinaryEncoder(buf, encoded)
}
func integerBinaryEncoder(buf []byte, encoded uint64) ([]byte, error) {
// used by both intBinaryFromNative and longBinaryFromNative
if encoded == 0 {
return append(buf, 0), nil
}
for encoded > 0 {
b := byte(encoded) & intMask
encoded = encoded >> 7
if encoded != 0 {
b |= intFlag // set high bit; we have more bytes
}
buf = append(buf, b)
}
return buf, nil
}
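// NOTE (editor's illustrative sketch): the encoders above use Avro's
// zigzag-then-varint scheme. Zigzag maps signed integers to unsigned ones so
// values of small magnitude stay small, and the varint loop then emits seven
// bits per byte, setting the high bit while more bytes follow:
//
// buf, _ := longBinaryFromNative(nil, int64(-3))
// // zigzag(-3) == 5, so buf == []byte{0x05}
// datum, _, _ := longNativeFromBinary(buf)
// // datum == int64(-3)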
////////////////////////////////////////
// Text Decode
////////////////////////////////////////
func longNativeFromTextual(buf []byte) (interface{}, []byte, error) {
return integerTextDecoder(buf, 64)
}
func intNativeFromTextual(buf []byte) (interface{}, []byte, error) {
return integerTextDecoder(buf, 32)
}
func integerTextDecoder(buf []byte, bitSize int) (interface{}, []byte, error) {
index, err := numberLength(buf, false) // NOTE: floatAllowed = false
if err != nil {
return nil, nil, err
}
datum, err := strconv.ParseInt(string(buf[:index]), 10, bitSize)
if err != nil {
return nil, nil, err
}
if bitSize == 32 {
return int32(datum), buf[index:], nil
}
return datum, buf[index:], nil
}
////////////////////////////////////////
// Text Encode
////////////////////////////////////////
func longTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
return integerTextEncoder(buf, datum, 64)
}
func intTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
return integerTextEncoder(buf, datum, 32)
}
func integerTextEncoder(buf []byte, datum interface{}, bitSize int) ([]byte, error) {
var someInt64 int64
switch v := datum.(type) {
case int:
someInt64 = int64(v)
case int32:
someInt64 = int64(v)
case int64:
someInt64 = v
case float32:
if someInt64 = int64(v); float32(someInt64) != v {
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual long: provided Go float32 would lose precision: %f", v)
}
return nil, fmt.Errorf("cannot encode textual int: provided Go float32 would lose precision: %f", v)
}
case float64:
if someInt64 = int64(v); float64(someInt64) != v {
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual long: provided Go float64 would lose precision: %f", v)
}
return nil, fmt.Errorf("cannot encode textual int: provided Go float64 would lose precision: %f", v)
}
default:
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual long: expected: Go numeric; received: %T", datum)
}
return nil, fmt.Errorf("cannot encode textual int: expected: Go numeric; received: %T", datum)
}
return strconv.AppendInt(buf, someInt64, 10), nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"errors"
"fmt"
"math"
"math/big"
"time"
)
type toNativeFn func([]byte) (interface{}, []byte, error)
type fromNativeFn func([]byte, interface{}) ([]byte, error)
//////////////////////////////////////////////////////////////////////////////////////////////
// date logical type - to/from time.Time, time.UTC location
//////////////////////////////////////////////////////////////////////////////////////////////
func nativeFromDate(fn toNativeFn) toNativeFn {
return func(bytes []byte) (interface{}, []byte, error) {
l, b, err := fn(bytes)
if err != nil {
return l, b, err
}
i, ok := l.(int32)
if !ok {
return l, b, fmt.Errorf("cannot transform to native date, expected int, received %T", l)
}
t := time.Date(1970, 1, 1, 0, 0, 0, 0, time.UTC).AddDate(0, 0, int(i)).UTC()
return t, b, nil
}
}
func dateFromNative(fn fromNativeFn) fromNativeFn {
return func(b []byte, d interface{}) ([]byte, error) {
t, ok := d.(time.Time)
if !ok {
return nil, fmt.Errorf("cannot transform to binary date, expected time.Time, received %T", d)
}
// The number of days calculation is incredibly naive: we take the
// time.Duration between the given time and the Unix epoch and divide
// that by (24 * time.Hour). This accuracy seems acceptable given the
// relation to the Unix epoch for now.
// TODO: replace with a better method
numDays := t.UnixNano() / int64(24*time.Hour)
return fn(b, numDays)
}
}
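// NOTE (editor's illustrative sketch): the date logical type stores the number
// of days since the Unix epoch as an Avro int, so with the division above,
// 1970-01-02T00:00:00Z encodes as day 1:
//
// t := time.Date(1970, 1, 2, 0, 0, 0, 0, time.UTC)
// numDays := t.UnixNano() / int64(24*time.Hour)
// // numDays == 1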
//////////////////////////////////////////////////////////////////////////////////////////////
// time-millis logical type - to/from time.Time, time.UTC location
//////////////////////////////////////////////////////////////////////////////////////////////
func nativeFromTimeMillis(fn toNativeFn) toNativeFn {
return func(bytes []byte) (interface{}, []byte, error) {
l, b, err := fn(bytes)
if err != nil {
return l, b, err
}
i, ok := l.(int32)
if !ok {
return l, b, fmt.Errorf("cannot transform to native time.Duration, expected int, received %T", l)
}
t := time.Duration(i) * time.Millisecond
return t, b, nil
}
}
func timeMillisFromNative(fn fromNativeFn) fromNativeFn {
return func(b []byte, d interface{}) ([]byte, error) {
t, ok := d.(time.Duration)
if !ok {
return nil, fmt.Errorf("cannot transform to binary time-millis, expected time.Duration, received %T", d)
}
duration := int32(t.Nanoseconds() / int64(time.Millisecond))
return fn(b, duration)
}
}
//////////////////////////////////////////////////////////////////////////////////////////////
// time-micros logical type - to/from time.Time, time.UTC location
//////////////////////////////////////////////////////////////////////////////////////////////
func nativeFromTimeMicros(fn toNativeFn) toNativeFn {
return func(bytes []byte) (interface{}, []byte, error) {
l, b, err := fn(bytes)
if err != nil {
return l, b, err
}
i, ok := l.(int64)
if !ok {
return l, b, fmt.Errorf("cannot transform to native time.Duration, expected long, received %T", l)
}
t := time.Duration(i) * time.Microsecond
return t, b, nil
}
}
func timeMicrosFromNative(fn fromNativeFn) fromNativeFn {
return func(b []byte, d interface{}) ([]byte, error) {
t, ok := d.(time.Duration)
if !ok {
return nil, fmt.Errorf("cannot transform to binary time-micros, expected time.Duration, received %T", d)
}
duration := t.Nanoseconds() / int64(time.Microsecond)
return fn(b, duration)
}
}
//////////////////////////////////////////////////////////////////////////////////////////////
// timestamp-millis logical type - to/from time.Time, time.UTC location
//////////////////////////////////////////////////////////////////////////////////////////////
func nativeFromTimeStampMillis(fn toNativeFn) toNativeFn {
return func(bytes []byte) (interface{}, []byte, error) {
l, b, err := fn(bytes)
if err != nil {
return l, b, err
}
i, ok := l.(int64)
if !ok {
return l, b, fmt.Errorf("cannot transform native timestamp-millis, expected int64, received %T", l)
}
secs := i / 1e3
nanosecs := (i - secs*1e3) * 1e6
return time.Unix(secs, nanosecs).UTC(), b, nil
}
}
func timeStampMillisFromNative(fn fromNativeFn) fromNativeFn {
return func(b []byte, d interface{}) ([]byte, error) {
t, ok := d.(time.Time)
if !ok {
return nil, fmt.Errorf("cannot transform binary timestamp-millis, expected time.Time, received %T", d)
}
millisecs := t.UnixNano() / int64(time.Millisecond)
return fn(b, millisecs)
}
}
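// NOTE (editor's illustrative sketch): timestamp-millis stores milliseconds
// since the Unix epoch in an Avro long, so encoding truncates any
// sub-millisecond precision:
//
// t := time.Date(1970, 1, 1, 0, 0, 1, 500000000, time.UTC)
// millisecs := t.UnixNano() / int64(time.Millisecond)
// // millisecs == 1500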
//////////////////////////////////////////////////////////////////////////////////////////////
// timestamp-micros logical type - to/from time.Time, time.UTC location
//////////////////////////////////////////////////////////////////////////////////////////////
func nativeFromTimeStampMicros(fn toNativeFn) toNativeFn {
return func(bytes []byte) (interface{}, []byte, error) {
l, b, err := fn(bytes)
if err != nil {
return l, b, err
}
microseconds, ok := l.(int64)
if !ok {
return l, b, fmt.Errorf("cannot transform native timestamp-micros, expected int64, received %T", l)
}
// While this code performs a few more steps than seem required, it is
// written this way to allow the best time resolution on UNIX and
// Windows without overflowing the int64 value. Windows has a zero-time
// value of 1601-01-01 UTC, and the number of nanoseconds since that
// zero-time overflows 64-bit integers.
seconds := microseconds / 1e6
nanoseconds := (microseconds - (seconds * 1e6)) * 1e3
return time.Unix(seconds, nanoseconds).UTC(), b, nil
}
}
func timeStampMicrosFromNative(fn fromNativeFn) fromNativeFn {
return func(b []byte, d interface{}) ([]byte, error) {
t, ok := d.(time.Time)
if !ok {
return nil, fmt.Errorf("cannot transform binary timestamp-micros, expected time.Time, received %T", d)
}
// While this code performs a few more steps than seem required, it is
// written this way to allow the best time resolution on UNIX and
// Windows without overflowing the int64 value. Windows has a zero-time
// value of 1601-01-01 UTC, and the number of nanoseconds since that
// zero-time overflows 64-bit integers.
return fn(b, t.Unix()*1e6+int64(t.Nanosecond()/1e3))
}
}
/////////////////////////////////////////////////////////////////////////////////////////////
// decimal logical-type - byte/fixed - to/from math/big.Rat
// two's complement algorithm taken from:
// https://groups.google.com/d/msg/golang-nuts/TV4bRVrHZUw/UcQt7S4IYlcJ by rog
/////////////////////////////////////////////////////////////////////////////////////////////
type makeCodecFn func(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error)
func precisionAndScaleFromSchemaMap(schemaMap map[string]interface{}) (int, int, error) {
p1, ok := schemaMap["precision"]
if !ok {
return 0, 0, errors.New("cannot create decimal logical type without precision")
}
p2, ok := p1.(float64)
if !ok {
return 0, 0, fmt.Errorf("cannot create decimal logical type with wrong precision type; expected: float64; received: %T", p1)
}
p3 := int(p2)
if p3 < 1 {
return 0, 0, fmt.Errorf("cannot create decimal logical type when precision is less than one: %d", p3)
}
var s3 int // scale defaults to 0 if not set
if s1, ok := schemaMap["scale"]; ok {
s2, ok := s1.(float64)
if !ok {
return 0, 0, fmt.Errorf("cannot create decimal logical type with wrong precision type; expected: float64; received: %T", p1)
}
s3 = int(s2)
if s3 < 0 {
return 0, 0, fmt.Errorf("cannot create decimal logical type when scale is less than zero: %d", s3)
}
if s3 > p3 {
return 0, 0, fmt.Errorf("cannot create decimal logical type when scale is larger than precision: %d > %d", s3, p3)
}
}
return p3, s3, nil
}
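// NOTE (editor's illustrative sketch): precision and scale arrive as JSON
// numbers, which the schema-map layer represents as float64 values:
//
// schemaMap := map[string]interface{}{
// 	"type":        "bytes",
// 	"logicalType": "decimal",
// 	"precision":   float64(4),
// 	"scale":       float64(2),
// }
// p, s, _ := precisionAndScaleFromSchemaMap(schemaMap)
// // p == 4, s == 2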
var one = big.NewInt(1)
func makeDecimalBytesCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
precision, scale, err := precisionAndScaleFromSchemaMap(schemaMap)
if err != nil {
return nil, err
}
if _, ok := schemaMap["name"]; !ok {
schemaMap["name"] = "bytes.decimal"
}
c, err := registerNewCodec(st, schemaMap, enclosingNamespace)
if err != nil {
return nil, fmt.Errorf("Bytes ought to have valid name: %s", err)
}
c.binaryFromNative = decimalBytesFromNative(bytesBinaryFromNative, toSignedBytes, precision, scale)
c.textualFromNative = decimalBytesFromNative(bytesTextualFromNative, toSignedBytes, precision, scale)
c.nativeFromBinary = nativeFromDecimalBytes(bytesNativeFromBinary, precision, scale)
c.nativeFromTextual = nativeFromDecimalBytes(bytesNativeFromTextual, precision, scale)
return c, nil
}
func nativeFromDecimalBytes(fn toNativeFn, precision, scale int) toNativeFn {
return func(bytes []byte) (interface{}, []byte, error) {
d, b, err := fn(bytes)
if err != nil {
return d, b, err
}
bs, ok := d.([]byte)
if !ok {
return nil, bytes, fmt.Errorf("cannot transform to native decimal, expected []byte, received %T", d)
}
i := big.NewInt(0)
fromSignedBytes(i, bs)
if i.BitLen() > 64 {
// The Avro spec specifies we return the underlying type if the logicalType is invalid
return d, b, err
}
r := big.NewRat(i.Int64(), int64(math.Pow10(scale)))
return r, b, nil
}
}
func decimalBytesFromNative(fromNativeFn fromNativeFn, toBytesFn toBytesFn, precision, scale int) fromNativeFn {
return func(b []byte, d interface{}) ([]byte, error) {
r, ok := d.(*big.Rat)
if !ok {
return nil, fmt.Errorf("cannot transform to bytes, expected *big.Rat, received %T", d)
}
// Convert the rational to its scaled integer representation: multiply by
// 10^scale, then divide by the denominator, truncating any extra precision.
num := big.NewInt(0).Set(r.Num())
denom := big.NewInt(0).Set(r.Denom())
// we get the scaled decimal representation
i := new(big.Int).Mul(num, big.NewInt(int64(math.Pow10(scale))))
// divide that by the denominator
precnum := new(big.Int).Div(i, denom)
bout, err := toBytesFn(precnum)
if err != nil {
return nil, err
}
return fromNativeFn(b, bout)
}
}
func makeDecimalFixedCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
precision, scale, err := precisionAndScaleFromSchemaMap(schemaMap)
if err != nil {
return nil, err
}
if _, ok := schemaMap["name"]; !ok {
schemaMap["name"] = "fixed.decimal"
}
c, err := makeFixedCodec(st, enclosingNamespace, schemaMap)
if err != nil {
return nil, err
}
size, err := sizeFromSchemaMap(c.typeName, schemaMap)
if err != nil {
return nil, err
}
c.binaryFromNative = decimalBytesFromNative(c.binaryFromNative, toSignedFixedBytes(size), precision, scale)
c.textualFromNative = decimalBytesFromNative(c.textualFromNative, toSignedFixedBytes(size), precision, scale)
c.nativeFromBinary = nativeFromDecimalBytes(c.nativeFromBinary, precision, scale)
c.nativeFromTextual = nativeFromDecimalBytes(c.nativeFromTextual, precision, scale)
return c, nil
}
func padBytes(bytes []byte, fixedSize uint) []byte {
s := int(fixedSize)
padded := make([]byte, s)
if s >= len(bytes) {
copy(padded[s-len(bytes):], bytes)
}
return padded
}
type toBytesFn func(n *big.Int) ([]byte, error)
// fromSignedBytes sets the value of n to the big-endian two's complement
// value stored in the given data. If data[0]&0x80 != 0, the number
// is negative. If data is empty, the result will be 0.
func fromSignedBytes(n *big.Int, data []byte) {
n.SetBytes(data)
if len(data) > 0 && data[0]&0x80 > 0 {
n.Sub(n, new(big.Int).Lsh(one, uint(len(data))*8))
}
}
// toSignedBytes returns the big-endian two's complement
// form of n.
func toSignedBytes(n *big.Int) ([]byte, error) {
switch n.Sign() {
case 0:
return []byte{0}, nil
case 1:
b := n.Bytes()
if b[0]&0x80 > 0 {
b = append([]byte{0}, b...)
}
return b, nil
case -1:
length := uint(n.BitLen()/8+1) * 8
b := new(big.Int).Add(n, new(big.Int).Lsh(one, length)).Bytes()
// When the most significant bit is on a byte
// boundary, we can get some extra significant
// bits, so strip them off when that happens.
if len(b) >= 2 && b[0] == 0xff && b[1]&0x80 != 0 {
b = b[1:]
}
return b, nil
}
return nil, fmt.Errorf("toSignedBytes: error big.Int.Sign() returned unexpected value")
}
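// NOTE (editor's illustrative sketch): toSignedBytes and fromSignedBytes
// round-trip big-endian two's complement, adding a leading 0x00 when a
// positive value's high bit would otherwise read as a sign bit:
//
// b, _ := toSignedBytes(big.NewInt(-1))  // b == []byte{0xff}
// b, _ = toSignedBytes(big.NewInt(128))  // b == []byte{0x00, 0x80}
// var n big.Int
// fromSignedBytes(&n, []byte{0xff})      // n == -1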
// toSignedFixedBytes returns the big-endian two's complement
// form of n for a given length of bytes.
func toSignedFixedBytes(size uint) func(*big.Int) ([]byte, error) {
return func(n *big.Int) ([]byte, error) {
switch n.Sign() {
case 0:
return []byte{0}, nil
case 1:
b := n.Bytes()
if b[0]&0x80 > 0 {
b = append([]byte{0}, b...)
}
return padBytes(b, size), nil
case -1:
length := size * 8
b := new(big.Int).Add(n, new(big.Int).Lsh(one, length)).Bytes()
// Unlike the variable-length encoding above, we keep the extra leading
// bits so the result fills the fixed byte length.
return b, nil
}
return nil, fmt.Errorf("toSignedBytes: error big.Int.Sign() returned unexpected value")
}
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"errors"
"fmt"
"io"
"math"
"reflect"
)
func makeMapCodec(st map[string]*Codec, namespace string, schemaMap map[string]interface{}) (*Codec, error) {
// map type must have values
valueSchema, ok := schemaMap["values"]
if !ok {
return nil, errors.New("Map ought to have values key")
}
valueCodec, err := buildCodec(st, namespace, valueSchema)
if err != nil {
return nil, fmt.Errorf("Map values ought to be valid Avro type: %s", err)
}
return &Codec{
typeName: &name{"map", nullNamespace},
nativeFromBinary: func(buf []byte) (interface{}, []byte, error) {
var err error
var value interface{}
// block count and block size
if value, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary map block count: %s", err)
}
blockCount := value.(int64)
if blockCount < 0 {
// NOTE: A negative block count implies there is a long encoded
// block size following the negative block count. We have no use
// for the block size in this decoder, so we read and discard
// the value.
if blockCount == math.MinInt64 {
// The minimum number for any signed numerical type can
// never be made positive
return nil, nil, fmt.Errorf("cannot decode binary map with block count: %d", blockCount)
}
blockCount = -blockCount // convert to its positive equivalent
if _, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary map block size: %s", err)
}
}
// Ensure block count does not exceed some sane value.
if blockCount > MaxBlockCount {
return nil, nil, fmt.Errorf("cannot decode binary map when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
// NOTE: While the RAM optimization below is not strictly necessary, many
// encoders will encode all items in a single block. We can reduce the
// amount of RAM the runtime allocates by sizing the map for that number
// of items up front.
mapValues := make(map[string]interface{}, blockCount)
for blockCount != 0 {
// Decode `blockCount` datum values from buffer
for i := int64(0); i < blockCount; i++ {
// first decode the key string
if value, buf, err = stringNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary map key: %s", err)
}
key := value.(string) // string decoder always returns a string
if _, ok := mapValues[key]; ok {
return nil, nil, fmt.Errorf("cannot decode binary map: duplicate key: %q", key)
}
// then decode the value
if value, buf, err = valueCodec.nativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary map value for key %q: %s", key, err)
}
mapValues[key] = value
}
// Decode next blockCount from buffer, because there may be more blocks
if value, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary map block count: %s", err)
}
blockCount = value.(int64)
if blockCount < 0 {
// NOTE: A negative block count implies there is a long
// encoded block size following the negative block count. We
// have no use for the block size in this decoder, so we
// read and discard the value.
if blockCount == math.MinInt64 {
// The minimum number for any signed numerical type can
// never be made positive
return nil, nil, fmt.Errorf("cannot decode binary map with block count: %d", blockCount)
}
blockCount = -blockCount // convert to its positive equivalent
if _, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary map block size: %s", err)
}
}
// Ensure block count does not exceed some sane value.
if blockCount > MaxBlockCount {
return nil, nil, fmt.Errorf("cannot decode binary map when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
}
return mapValues, buf, nil
},
binaryFromNative: func(buf []byte, datum interface{}) ([]byte, error) {
mapValues, err := convertMap(datum)
if err != nil {
return nil, fmt.Errorf("cannot encode binary map: %s", err)
}
keyCount := int64(len(mapValues))
var alreadyEncoded, remainingInBlock int64
for k, v := range mapValues {
if remainingInBlock == 0 { // start a new block
remainingInBlock = keyCount - alreadyEncoded
if remainingInBlock > MaxBlockCount {
// limit block count to MaxBlockCount
remainingInBlock = MaxBlockCount
}
buf, _ = longBinaryFromNative(buf, remainingInBlock)
}
// only fails when given a non-string, so elide error checking
buf, _ = stringBinaryFromNative(buf, k)
// encode the value
if buf, err = valueCodec.binaryFromNative(buf, v); err != nil {
return nil, fmt.Errorf("cannot encode binary map value for key %q: %v: %s", k, v, err)
}
remainingInBlock--
alreadyEncoded++
}
return longBinaryFromNative(buf, 0) // append trailing 0 block count to signal end of map
},
nativeFromTextual: func(buf []byte) (interface{}, []byte, error) {
return genericMapTextDecoder(buf, valueCodec, nil) // codecFromKey == nil
},
textualFromNative: func(buf []byte, datum interface{}) ([]byte, error) {
return genericMapTextEncoder(buf, datum, valueCodec, nil)
},
}, nil
}
// genericMapTextDecoder decodes a JSON text blob to a native Go map, using the
// codecs from codecFromKey, and if a key is not found in that map, from
// defaultCodec if provided. If defaultCodec is nil, this function returns an
// error if it encounters a map key that is not present in codecFromKey. If
// codecFromKey is nil, every map value will be decoded using defaultCodec, if
// possible.
func genericMapTextDecoder(buf []byte, defaultCodec *Codec, codecFromKey map[string]*Codec) (map[string]interface{}, []byte, error) {
var value interface{}
var err error
var b byte
lencodec := len(codecFromKey)
mapValues := make(map[string]interface{}, lencodec)
if buf, err = advanceAndConsume(buf, '{'); err != nil {
return nil, nil, err
}
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 {
return nil, nil, io.ErrShortBuffer
}
// NOTE: Special case empty map
if buf[0] == '}' {
return mapValues, buf[1:], nil
}
// NOTE: Also terminates when a '}' byte is read.
for len(buf) > 0 {
// decode key string
value, buf, err = stringNativeFromTextual(buf)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual map: expected key: %s", err)
}
key := value.(string)
// Is key already used?
if _, ok := mapValues[key]; ok {
return nil, nil, fmt.Errorf("cannot decode textual map: duplicate key: %q", key)
}
// Find a codec for the key
fieldCodec := codecFromKey[key]
if fieldCodec == nil {
fieldCodec = defaultCodec
}
if fieldCodec == nil {
return nil, nil, fmt.Errorf("cannot decode textual map: cannot determine codec: %q", key)
}
// decode colon
if buf, err = advanceAndConsume(buf, ':'); err != nil {
return nil, nil, err
}
// decode value
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 {
return nil, nil, io.ErrShortBuffer
}
value, buf, err = fieldCodec.nativeFromTextual(buf)
if err != nil {
return nil, nil, err
}
// set map value for key
mapValues[key] = value
// either comma or closing curly brace
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 {
return nil, nil, io.ErrShortBuffer
}
switch b = buf[0]; b {
case '}':
return mapValues, buf[1:], nil
case ',':
// no-op
default:
return nil, nil, fmt.Errorf("cannot decode textual map: expected ',' or '}'; received: %q", b)
}
// NOTE: consume comma from above
if buf, _ = advanceToNonWhitespace(buf[1:]); len(buf) == 0 {
return nil, nil, io.ErrShortBuffer
}
}
return nil, nil, io.ErrShortBuffer
}
// genericMapTextEncoder encodes a native Go map to a JSON text blob, using the
// codecs from codecFromKey, and if a key is not found in that map, from
// defaultCodec if provided. If defaultCodec is nil, this function returns an
// error if it encounters a map key that is not present in codecFromKey. If
// codecFromKey is nil, every map value will be encoded using defaultCodec, if
// possible.
func genericMapTextEncoder(buf []byte, datum interface{}, defaultCodec *Codec, codecFromKey map[string]*Codec) ([]byte, error) {
mapValues, err := convertMap(datum)
if err != nil {
return nil, fmt.Errorf("cannot encode textual map: %s", err)
}
var atLeastOne bool
buf = append(buf, '{')
for key, value := range mapValues {
atLeastOne = true
// Find a codec for the key
fieldCodec := codecFromKey[key]
if fieldCodec == nil {
fieldCodec = defaultCodec
}
if fieldCodec == nil {
return nil, fmt.Errorf("cannot encode textual map: cannot determine codec: %q", key)
}
// Encode key string
buf, err = stringTextualFromNative(buf, key)
if err != nil {
return nil, err
}
buf = append(buf, ':')
// Encode value
buf, err = fieldCodec.textualFromNative(buf, value)
if err != nil {
// field was specified in datum; therefore its value was invalid
return nil, fmt.Errorf("cannot encode textual map: value for %q does not match its schema: %s", key, err)
}
buf = append(buf, ',')
}
if atLeastOne {
return append(buf[:len(buf)-1], '}'), nil
}
return append(buf, '}'), nil
}
// convertMap converts datum to map[string]interface{} if possible.
func convertMap(datum interface{}) (map[string]interface{}, error) {
mapValues, ok := datum.(map[string]interface{})
if ok {
return mapValues, nil
}
// NOTE: When given a map of any other type, copy the values over as a
// convenience to the client.
v := reflect.ValueOf(datum)
if v.Kind() != reflect.Map {
return nil, fmt.Errorf("cannot create map[string]interface{}: expected map[string]...; received: %T", datum)
}
// NOTE: Two better alternatives to the current algorithm are:
//     (1) mutate the reflection tuple underneath to convert the
//         map[string]int, for example, to map[string]interface{}, with
//         O(1) complexity.
//     (2) use the copy builtin to move the data items over with O(n)
//         complexity, but still more efficiently than the loop below.
mapValues = make(map[string]interface{}, v.Len())
for _, key := range v.MapKeys() {
k, ok := key.Interface().(string)
if !ok {
// bail when map key type is not string
return nil, fmt.Errorf("cannot create map[string]interface{}: expected map[string]...; received: %T", datum)
}
mapValues[k] = v.MapIndex(key).Interface()
}
return mapValues, nil
}
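// NOTE (editor's illustrative sketch): convertMap accepts any map keyed by
// string and copies its values into a map[string]interface{} via reflection:
//
// m, _ := convertMap(map[string]int{"a": 1})
// // m == map[string]interface{}{"a": 1}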
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"errors"
"fmt"
"strings"
)
const nullNamespace = ""
// ErrInvalidName is the error returned when one or more parts of an Avro name
// is invalid.
type ErrInvalidName struct {
Message string
}
func (e ErrInvalidName) Error() string {
return "schema name ought to " + e.Message
}
// NOTE: This function designed to work with name components, after they have
// been split on the period rune.
func isRuneInvalidForFirstCharacter(r rune) bool {
return (r < 'A' || r > 'Z') && (r < 'a' || r > 'z') && r != '_'
}
func isRuneInvalidForOtherCharacters(r rune) bool {
return isRuneInvalidForFirstCharacter(r) && (r < '0' || r > '9')
}
func checkNameComponent(s string) error {
err := checkString(s)
if err != nil {
return &ErrInvalidName{err.Error()}
}
return err
}
func checkString(s string) error {
if len(s) == 0 {
return errors.New("be non-empty string")
}
if strings.IndexFunc(s[:1], isRuneInvalidForFirstCharacter) != -1 {
return errors.New("start with [A-Za-z_]: " + s)
}
if strings.IndexFunc(s[1:], isRuneInvalidForOtherCharacters) != -1 {
return errors.New("have second and remaining characters contain only [A-Za-z0-9_]: " + s)
}
return nil
}
// name describes an Avro name in terms of its full name and namespace.
type name struct {
fullName string // the instance's Avro name
namespace string // for use when building new name from existing one
}
// newName returns a new Name instance after first ensuring the arguments do not
// violate any of the Avro naming rules.
func newName(n, ns, ens string) (*name, error) {
var nn name
if index := strings.LastIndexByte(n, '.'); index > -1 {
// inputName does contain a dot, so ignore everything else and use it as the full name
nn.fullName = n
nn.namespace = n[:index]
} else {
// inputName does not contain a dot, therefore is not the full name
if ns != nullNamespace {
// if namespace provided in the schema in the same schema level, use it
nn.fullName = ns + "." + n
nn.namespace = ns
} else if ens != nullNamespace {
// otherwise if enclosing namespace provided, use it
nn.fullName = ens + "." + n
nn.namespace = ens
} else {
// otherwise no namespace, so use null namespace, the empty string
nn.fullName = n
}
}
// verify all components of the full name for adherence to Avro naming rules
for i, component := range strings.Split(nn.fullName, ".") {
if i == 0 && RelaxedNameValidation && component == "" {
continue
}
if err := checkNameComponent(component); err != nil {
return nil, err
}
}
return &nn, nil
}
var (
// RelaxedNameValidation causes name validation to allow the first component
// of an Avro namespace to be the empty string.
RelaxedNameValidation bool
)
func newNameFromSchemaMap(enclosingNamespace string, schemaMap map[string]interface{}) (*name, error) {
var nameString, namespaceString string
name, ok := schemaMap["name"]
if !ok {
return nil, errors.New("schema ought to have name key")
}
nameString, ok = name.(string)
if !ok || nameString == nullNamespace {
return nil, fmt.Errorf("schema name ought to be non-empty string; received: %T: %v", name, name)
}
if namespace, ok := schemaMap["namespace"]; ok {
namespaceString, ok = namespace.(string)
if !ok {
return nil, fmt.Errorf("schema namespace, if provided, ought to be a string; received: %T: %v", namespace, namespace)
}
}
return newName(nameString, namespaceString, enclosingNamespace)
}
func (n *name) String() string {
return n.fullName
}
// short returns the name without the prefixed namespace.
func (n *name) short() string {
if index := strings.LastIndexByte(n.fullName, '.'); index > -1 {
return n.fullName[index+1:]
}
return n.fullName
}
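// NOTE (editor's illustrative sketch): newName resolves the full name in the
// order described above: a dotted name is used verbatim, otherwise the
// schema's own namespace wins over the enclosing namespace:
//
// n, _ := newName("Rec", "com.example", "enclosing.ns")
// // n.fullName == "com.example.Rec", n.short() == "Rec"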
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"errors"
"fmt"
"io"
)
var nullBytes = []byte("null")
func nullNativeFromBinary(buf []byte) (interface{}, []byte, error) { return nil, buf, nil }
func nullBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
if datum != nil {
return nil, fmt.Errorf("cannot encode binary null: expected: Go nil; received: %T", datum)
}
return buf, nil
}
func nullNativeFromTextual(buf []byte) (interface{}, []byte, error) {
if len(buf) < 4 {
return nil, nil, fmt.Errorf("cannot decode textual null: %s", io.ErrShortBuffer)
}
if bytes.Equal(buf[:4], nullBytes) {
return nil, buf[4:], nil
}
return nil, nil, errors.New("cannot decode textual null: expected: null")
}
func nullTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
if datum != nil {
return nil, fmt.Errorf("cannot encode textual null: expected: Go nil; received: %T", datum)
}
return append(buf, nullBytes...), nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"crypto/rand"
"errors"
"fmt"
"io"
)
const (
// CompressionNullLabel is used when OCF blocks are not compressed.
CompressionNullLabel = "null"
// CompressionDeflateLabel is used when OCF blocks are compressed using the
// deflate algorithm.
CompressionDeflateLabel = "deflate"
// CompressionSnappyLabel is used when OCF blocks are compressed using the
// snappy algorithm.
CompressionSnappyLabel = "snappy"
)
// compressionID are values used to specify compression algorithm used to compress
// and decompress Avro Object Container File (OCF) streams.
type compressionID uint8
const (
compressionNull compressionID = iota
compressionDeflate
compressionSnappy
)
const (
ocfBlockConst = 24 // Each OCF block is prefixed with two longs and suffixed with the sync marker
ocfHeaderSizeConst = 48 // OCF header is usually about 48 bytes longer than its compressed schema
ocfMagicString = "Obj\x01"
ocfMetadataSchema = `{"type":"map","values":"bytes"}`
ocfSyncLength = 16
)
var (
ocfMagicBytes = []byte(ocfMagicString)
ocfMetadataCodec *Codec
)
func init() {
ocfMetadataCodec, _ = NewCodec(ocfMetadataSchema)
}
type ocfHeader struct {
codec *Codec
compressionID compressionID
syncMarker [ocfSyncLength]byte
metadata map[string][]byte
}
func newOCFHeader(config OCFConfig) (*ocfHeader, error) {
var err error
header := new(ocfHeader)
//
// avro.codec
//
switch config.CompressionName {
case "":
header.compressionID = compressionNull
case CompressionNullLabel:
header.compressionID = compressionNull
case CompressionDeflateLabel:
header.compressionID = compressionDeflate
case CompressionSnappyLabel:
header.compressionID = compressionSnappy
default:
return nil, fmt.Errorf("cannot create OCF header using unrecognized compression algorithm: %q", config.CompressionName)
}
//
// avro.schema
//
if config.Codec != nil {
header.codec = config.Codec
} else if config.Schema == "" {
return nil, fmt.Errorf("cannot create OCF header without either Codec or Schema specified")
} else {
if header.codec, err = NewCodec(config.Schema); err != nil {
return nil, fmt.Errorf("cannot create OCF header: %s", err)
}
}
header.metadata = config.MetaData
//
// The 16-byte, randomly-generated sync marker for this file.
//
_, err = rand.Read(header.syncMarker[:])
if err != nil {
return nil, err
}
return header, nil
}
func readOCFHeader(ior io.Reader) (*ocfHeader, error) {
//
// magic bytes
//
magic := make([]byte, 4)
_, err := io.ReadFull(ior, magic)
if err != nil {
return nil, fmt.Errorf("cannot read OCF header magic bytes: %s", err)
}
if !bytes.Equal(magic, ocfMagicBytes) {
return nil, fmt.Errorf("cannot read OCF header with invalid magic bytes: %#q", magic)
}
//
// metadata
//
metadata, err := metadataBinaryReader(ior)
if err != nil {
return nil, fmt.Errorf("cannot read OCF header metadata: %s", err)
}
//
// avro.codec
//
// NOTE: The Avro specification states that the `null` codec is used by
// default when "avro.codec" was not included in the metadata header. The
// specification does not talk about the case when "avro.codec" was included
// with the empty string as its value. I believe it is an error for an OCF
// file to provide the empty string as the codec algorithm. While it
// is trivially easy to gracefully handle here, I'm not sure whether this
// happens a lot, and don't want to accept bad input unless we have
// significant reason to do so.
var cID compressionID
value, ok := metadata["avro.codec"]
if ok {
switch avroCodec := string(value); avroCodec {
case CompressionNullLabel:
cID = compressionNull
case CompressionDeflateLabel:
cID = compressionDeflate
case CompressionSnappyLabel:
cID = compressionSnappy
default:
return nil, fmt.Errorf("cannot read OCF header using unrecognized compression algorithm from avro.codec: %q", avroCodec)
}
}
//
// create goavro.Codec from specified avro.schema
//
value, ok = metadata["avro.schema"]
if !ok {
return nil, errors.New("cannot read OCF header without avro.schema")
}
codec, err := NewCodec(string(value))
if err != nil {
return nil, fmt.Errorf("cannot read OCF header with invalid avro.schema: %s", err)
}
header := &ocfHeader{codec: codec, compressionID: cID, metadata: metadata}
//
// read and store sync marker
//
if n, err := io.ReadFull(ior, header.syncMarker[:]); err != nil {
return nil, fmt.Errorf("cannot read OCF header without sync marker: only read %d of %d bytes: %s", n, ocfSyncLength, err)
}
//
// header is valid
//
return header, nil
}
func writeOCFHeader(header *ocfHeader, iow io.Writer) (err error) {
//
// avro.codec
//
var avroCodec string
switch header.compressionID {
case compressionNull:
avroCodec = CompressionNullLabel
case compressionDeflate:
avroCodec = CompressionDeflateLabel
case compressionSnappy:
avroCodec = CompressionSnappyLabel
default:
return fmt.Errorf("should not get here: cannot write OCF header using unrecognized compression algorithm: %d", header.compressionID)
}
//
// avro.schema
//
// Create buffer for OCF header. The first four bytes are magic, and we'll
// use copy to fill them in, so initialize buffer's length with 4, and its
// capacity equal to length of avro schema plus a constant.
schema := header.codec.Schema()
buf := make([]byte, 4, len(schema)+ocfHeaderSizeConst)
_ = copy(buf, ocfMagicBytes)
//
// file metadata, including the schema
//
meta := make(map[string]interface{})
for k, v := range header.metadata {
meta[k] = v
}
meta["avro.schema"] = []byte(schema)
meta["avro.codec"] = []byte(avroCodec)
buf, err = ocfMetadataCodec.BinaryFromNative(buf, meta)
if err != nil {
return fmt.Errorf("should not get here: cannot write OCF header: %s", err)
}
//
// 16-byte sync marker
//
buf = append(buf, header.syncMarker[:]...)
// emit OCF header
_, err = iow.Write(buf)
if err != nil {
return fmt.Errorf("cannot write OCF header: %s", err)
}
return nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"compress/flate"
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"io"
"io/ioutil"
"github.com/golang/snappy"
)
// OCFReader structure is used to read Object Container Files (OCF).
type OCFReader struct {
header *ocfHeader
block []byte // buffer from which decoding takes place
rerr error // most recent error that took place while reading bytes (unrecoverable)
ior io.Reader
readReady bool // true after Scan and before Read
remainingBlockItems int64 // count of encoded data items remaining in block buffer to be decoded
}
// NewOCFReader initializes and returns a new structure used to read an Avro
// Object Container File (OCF).
//
// func example(ior io.Reader) error {
// // NOTE: Wrap provided io.Reader in a buffered reader, which improves the
// // performance of streaming file data.
// br := bufio.NewReader(ior)
// ocfr, err := goavro.NewOCFReader(br)
// if err != nil {
// return err
// }
// for ocfr.Scan() {
// datum, err := ocfr.Read()
// if err != nil {
// return err
// }
// fmt.Println(datum)
// }
// return ocfr.Err()
// }
func NewOCFReader(ior io.Reader) (*OCFReader, error) {
header, err := readOCFHeader(ior)
if err != nil {
return nil, fmt.Errorf("cannot create OCFReader: %s", err)
}
return &OCFReader{header: header, ior: ior}, nil
}
// MetaData returns the file metadata map found within the OCF file.
func (ocfr *OCFReader) MetaData() map[string][]byte {
return ocfr.header.metadata
}
// Codec returns the codec found within the OCF file.
func (ocfr *OCFReader) Codec() *Codec {
return ocfr.header.codec
}
// CompressionName returns the name of the compression algorithm found within
// the OCF file.
func (ocfr *OCFReader) CompressionName() string {
switch ocfr.header.compressionID {
case compressionNull:
return CompressionNullLabel
case compressionDeflate:
return CompressionDeflateLabel
case compressionSnappy:
return CompressionSnappyLabel
default:
return "should not get here: unrecognized compression algorithm"
}
}
// Err returns the last error encountered while reading the OCF file. See
// `NewOCFReader` documentation for an example.
func (ocfr *OCFReader) Err() error {
return ocfr.rerr
}
// Read consumes one datum value from the Avro OCF stream and returns it. Read
// is designed to be called only once after each invocation of the Scan method.
// See `NewOCFReader` documentation for an example.
func (ocfr *OCFReader) Read() (interface{}, error) {
// NOTE: Test previous error before testing readReady to prevent overwriting
// previous error.
if ocfr.rerr != nil {
return nil, ocfr.rerr
}
if !ocfr.readReady {
ocfr.rerr = errors.New("Read called without successful Scan")
return nil, ocfr.rerr
}
ocfr.readReady = false
// decode one datum value from block
var datum interface{}
datum, ocfr.block, ocfr.rerr = ocfr.header.codec.NativeFromBinary(ocfr.block)
if ocfr.rerr != nil {
return nil, ocfr.rerr
}
ocfr.remainingBlockItems--
return datum, nil
}
// RemainingBlockItems returns the number of items remaining in the block being
// processed.
func (ocfr *OCFReader) RemainingBlockItems() int64 {
return ocfr.remainingBlockItems
}
// Scan returns true when there is at least one more data item to be read from
// the Avro OCF. Scan ought to be called prior to calling the Read method each
// time the Read method is invoked. See `NewOCFReader` documentation for an
// example.
func (ocfr *OCFReader) Scan() bool {
ocfr.readReady = false
if ocfr.rerr != nil {
return false
}
// NOTE: If there are no more remaining data items from the existing block,
// then attempt to slurp in the next block.
if ocfr.remainingBlockItems <= 0 {
if count := len(ocfr.block); count != 0 {
ocfr.rerr = fmt.Errorf("extra bytes between final datum in previous block and block sync marker: %d", count)
return false
}
// Read the block count and update the number of remaining items for
// this block
ocfr.remainingBlockItems, ocfr.rerr = longBinaryReader(ocfr.ior)
if ocfr.rerr != nil {
if ocfr.rerr == io.EOF {
ocfr.rerr = nil // merely end of file, rather than error
} else {
ocfr.rerr = fmt.Errorf("cannot read block count: %s", ocfr.rerr)
}
return false
}
if ocfr.remainingBlockItems <= 0 {
ocfr.rerr = fmt.Errorf("cannot decode when block count is not greater than 0: %d", ocfr.remainingBlockItems)
return false
}
if ocfr.remainingBlockItems > MaxBlockCount {
ocfr.rerr = fmt.Errorf("cannot decode when block count exceeds MaxBlockCount: %d > %d", ocfr.remainingBlockItems, MaxBlockCount)
return false
}
var blockSize int64
blockSize, ocfr.rerr = longBinaryReader(ocfr.ior)
if ocfr.rerr != nil {
ocfr.rerr = fmt.Errorf("cannot read block size: %s", ocfr.rerr)
return false
}
if blockSize <= 0 {
ocfr.rerr = fmt.Errorf("cannot decode when block size is not greater than 0: %d", blockSize)
return false
}
if blockSize > MaxBlockSize {
ocfr.rerr = fmt.Errorf("cannot decode when block size exceeds MaxBlockSize: %d > %d", blockSize, MaxBlockSize)
return false
}
// read entire block into buffer
ocfr.block = make([]byte, blockSize)
_, ocfr.rerr = io.ReadFull(ocfr.ior, ocfr.block)
if ocfr.rerr != nil {
ocfr.rerr = fmt.Errorf("cannot read block: %s", ocfr.rerr)
return false
}
switch ocfr.header.compressionID {
case compressionNull:
// no-op
case compressionDeflate:
// NOTE: flate.NewReader wraps with io.ByteReader if argument does
// not implement that interface.
rc := flate.NewReader(bytes.NewBuffer(ocfr.block))
ocfr.block, ocfr.rerr = ioutil.ReadAll(rc)
if ocfr.rerr != nil {
_ = rc.Close()
return false
}
if ocfr.rerr = rc.Close(); ocfr.rerr != nil {
return false
}
case compressionSnappy:
index := len(ocfr.block) - 4 // last 4 bytes is crc32 of decoded block
if index <= 0 {
ocfr.rerr = fmt.Errorf("cannot decompress snappy without CRC32 checksum: %d", len(ocfr.block))
return false
}
decoded, err := snappy.Decode(nil, ocfr.block[:index])
if err != nil {
ocfr.rerr = fmt.Errorf("cannot decompress: %s", err)
return false
}
actualCRC := crc32.ChecksumIEEE(decoded)
expectedCRC := binary.BigEndian.Uint32(ocfr.block[index : index+4])
if actualCRC != expectedCRC {
ocfr.rerr = fmt.Errorf("snappy CRC32 checksum mismatch: %x != %x", actualCRC, expectedCRC)
return false
}
ocfr.block = decoded
default:
ocfr.rerr = fmt.Errorf("should not get here: cannot compress block using unrecognized compression: %d", ocfr.header.compressionID)
return false
}
// read and ensure sync marker matches
sync := make([]byte, ocfSyncLength)
var n int
if n, ocfr.rerr = io.ReadFull(ocfr.ior, sync); ocfr.rerr != nil {
ocfr.rerr = fmt.Errorf("cannot read sync marker: read %d out of %d bytes: %s", n, ocfSyncLength, ocfr.rerr)
return false
}
if !bytes.Equal(sync, ocfr.header.syncMarker[:]) {
ocfr.rerr = fmt.Errorf("sync marker mismatch: %v != %v", sync, ocfr.header.syncMarker)
return false
}
}
ocfr.readReady = true
return true
}
// SkipThisBlockAndReset can be called after an error occurs while reading or
// decoding datum values from an OCF stream. OCF specifies that each OCF stream
// contains one or more blocks of data. Each block consists of a block count,
// the number of bytes for the block, followed by the possibly compressed
// block. Inside each decompressed block, all of the binary encoded datum
// values are concatenated together. In other words, OCF framing is at the
// block level rather than the datum level. If there is an error while reading
// or decoding a datum, the reader is not able to skip to the next datum value,
// because OCF does not have any markers for where each datum ends and the
// next one begins. Therefore, the reader is only able to skip this datum value
// and all subsequent datum values in the current block, move to the next
// block, and start decoding datum values there.
func (ocfr *OCFReader) SkipThisBlockAndReset() {
// TODO: decide whether it should be an error to call this method when the
// reader has not encountered an error.
ocfr.remainingBlockItems = 0
ocfr.block = ocfr.block[:0]
ocfr.rerr = nil
}
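// NOTE (editor's illustrative sketch): a read loop that tolerates a corrupt
// datum might skip the remainder of the offending block and continue with the
// next one; handleDatum and the logging below are placeholders:
//
// for ocfr.Scan() {
// 	datum, err := ocfr.Read()
// 	if err != nil {
// 		log.Printf("skipping rest of block: %s", err)
// 		ocfr.SkipThisBlockAndReset()
// 		continue
// 	}
// 	handleDatum(datum)
// }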
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"compress/flate"
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"io"
"io/ioutil"
"os"
"github.com/golang/snappy"
)
// OCFConfig is used to specify creation parameters for OCFWriter.
type OCFConfig struct {
// W specifies the `io.Writer` to which to send the encoded data,
// (required). If W is `*os.File`, then creating an OCF for writing will
// attempt to read any existing OCF header and use the schema and
// compression codec specified by the existing header, then advance the file
// position to the tail end of the file for appending.
W io.Writer
// Codec specifies the Codec to use for the new OCFWriter, (optional). If
// the W parameter above is an `*os.File` which contains a Codec, the Codec
// in the existing file will be used instead. Otherwise if this Codec
// parameter is specified, it will be used. If neither the W parameter above
// is an `*os.File` with an existing Codec, nor this Codec parameter is
// specified, the OCFWriter will create a new Codec from the schema string
// specified by the Schema parameter below.
Codec *Codec
// Schema specifies the Avro schema for the data to be encoded, (optional).
// If neither the W parameter above is an `*os.File` with an existing Codec,
// nor the Codec parameter above is specified, the OCFWriter will create a
// new Codec from the schema string specified by this Schema parameter.
Schema string
// CompressionName specifies the compression codec used, (optional). If
// omitted, defaults to "null" codec. When appending to an existing OCF,
// this field is ignored.
CompressionName string
// MetaData specifies application-specific metadata to be added to the
// OCF file. When appending to an existing OCF, this field is ignored.
MetaData map[string][]byte
}
// OCFWriter is used to create a new or append to an existing Avro Object
// Container File (OCF).
type OCFWriter struct {
header *ocfHeader
iow io.Writer
}
// NewOCFWriter returns a new OCFWriter instance that may be used for appending
// binary Avro data, either by appending to an existing OCF file or creating a
// new OCF file.
func NewOCFWriter(config OCFConfig) (*OCFWriter, error) {
var err error
ocf := &OCFWriter{iow: config.W}
switch config.W.(type) {
case nil:
return nil, errors.New("cannot create OCFWriter when W is nil")
case *os.File:
file := config.W.(*os.File)
stat, err := file.Stat()
if err != nil {
return nil, fmt.Errorf("cannot create OCFWriter: %s", err)
}
// NOTE: When upstream provides a new file, it will already exist but
// have a size of 0 bytes.
if stat.Size() > 0 {
// attempt to read existing OCF header
if ocf.header, err = readOCFHeader(file); err != nil {
return nil, fmt.Errorf("cannot create OCFWriter: %s", err)
}
// prepare for appending data to existing OCF
if err = ocf.quickScanToTail(file); err != nil {
return nil, fmt.Errorf("cannot create OCFWriter: %s", err)
}
return ocf, nil // happy case for appending to existing OCF
}
}
// create new OCF header based on configuration parameters
if ocf.header, err = newOCFHeader(config); err != nil {
return nil, fmt.Errorf("cannot create OCFWriter: %s", err)
}
if err = writeOCFHeader(ocf.header, config.W); err != nil {
return nil, fmt.Errorf("cannot create OCFWriter: %s", err)
}
return ocf, nil // another happy case for creation of new OCF
}
// quickScanToTail advances the stream reader to the tail end of the
// file. Rather than reading each encoded block, optionally decompressing it,
// and then decoding it, this method reads the block count, ignoring it, then
// reads the block size, then skips ahead to the following block. It does this
// repeatedly until attempts to read the file return io.EOF.
func (ocfw *OCFWriter) quickScanToTail(ior io.Reader) error {
sync := make([]byte, ocfSyncLength)
for {
// Read and validate block count
blockCount, err := longBinaryReader(ior)
if err != nil {
if err == io.EOF {
return nil // merely end of file, rather than error
}
return fmt.Errorf("cannot read block count: %s", err)
}
if blockCount <= 0 {
return fmt.Errorf("cannot read when block count is not greater than 0: %d", blockCount)
}
if blockCount > MaxBlockCount {
return fmt.Errorf("cannot read when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
// Read block size
blockSize, err := longBinaryReader(ior)
if err != nil {
return fmt.Errorf("cannot read block size: %s", err)
}
if blockSize <= 0 {
return fmt.Errorf("cannot read when block size is not greater than 0: %d", blockSize)
}
if blockSize > MaxBlockSize {
return fmt.Errorf("cannot read when block size exceeds MaxBlockSize: %d > %d", blockSize, MaxBlockSize)
}
// Advance reader to end of block
if _, err = io.CopyN(ioutil.Discard, ior, blockSize); err != nil {
return fmt.Errorf("cannot seek to next block: %s", err)
}
// Read and validate sync marker
var n int
if n, err = io.ReadFull(ior, sync); err != nil {
return fmt.Errorf("cannot read sync marker: read %d out of %d bytes: %s", n, ocfSyncLength, err)
}
if !bytes.Equal(sync, ocfw.header.syncMarker[:]) {
return fmt.Errorf("sync marker mismatch: %v != %v", sync, ocfw.header.syncMarker)
}
}
}
// Append appends one or more data items to an OCF file in a block. If there are
// more data items in the slice than MaxBlockCount allows, the data slice will
// be chunked into multiple blocks, each not having more than MaxBlockCount
// items.
func (ocfw *OCFWriter) Append(data interface{}) error {
arrayValues, err := convertArray(data)
if err != nil {
return err
}
// Chunk data so no block has more than MaxBlockCount items.
for int64(len(arrayValues)) > MaxBlockCount {
if err := ocfw.appendDataIntoBlock(arrayValues[:MaxBlockCount]); err != nil {
return err
}
arrayValues = arrayValues[MaxBlockCount:]
}
return ocfw.appendDataIntoBlock(arrayValues)
}
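// A usage sketch (assumes the writer from the NewOCFWriter example above and
// a record schema with a single long field named "x"):
//
// err := ocfw.Append([]interface{}{
//     map[string]interface{}{"x": int64(1)},
//     map[string]interface{}{"x": int64(2)},
// })
//
// Passing a slice with more than MaxBlockCount items simply produces several
// consecutive blocks rather than an error.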
func (ocfw *OCFWriter) appendDataIntoBlock(data []interface{}) error {
var block []byte // working buffer for encoding data values
var err error
// Encode and concatenate each data item into the block
for _, datum := range data {
if block, err = ocfw.header.codec.BinaryFromNative(block, datum); err != nil {
return fmt.Errorf("cannot translate datum to binary: %v; %s", datum, err)
}
}
switch ocfw.header.compressionID {
case compressionNull:
// no-op
case compressionDeflate:
// compress into new bytes buffer.
bb := bytes.NewBuffer(make([]byte, 0, len(block)))
cw, _ := flate.NewWriter(bb, flate.DefaultCompression)
// writing bytes to cw will compress bytes and send to bb.
if _, err := cw.Write(block); err != nil {
return err
}
if err := cw.Close(); err != nil {
return err
}
block = bb.Bytes()
case compressionSnappy:
compressed := snappy.Encode(nil, block)
// The OCF format requires each snappy-compressed block to be followed by the
// big-endian CRC32 checksum of the uncompressed data.
compressed = append(compressed, 0, 0, 0, 0) // expand slice by 4 bytes so checksum will fit
binary.BigEndian.PutUint32(compressed[len(compressed)-4:], crc32.ChecksumIEEE(block)) // checksum of decompressed block
block = compressed
default:
return fmt.Errorf("should not get here: cannot compress block using unrecognized compression: %d", ocfw.header.compressionID)
}
// create file data block
buf := make([]byte, 0, len(block)+ocfBlockConst) // pre-allocate block bytes
buf, _ = longBinaryFromNative(buf, len(data)) // block count (number of data items)
buf, _ = longBinaryFromNative(buf, len(block)) // block size (number of bytes in block)
buf = append(buf, block...) // serialized objects
buf = append(buf, ocfw.header.syncMarker[:]...) // sync marker
_, err = ocfw.iow.Write(buf)
return err
}
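// The bytes written above therefore follow the standard OCF block layout:
//
// long: count of objects in this block
// long: size in bytes of the (possibly compressed) serialized objects
// bytes: the serialized objects themselves
// ocfSyncLength (16) bytes: the sync marker copied from the file header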
// Codec returns the codec used by OCFWriter. This function is provided
// because upstream may be appending to an existing OCF which uses a
// different schema than requested during instantiation.
func (ocfw *OCFWriter) Codec() *Codec {
return ocfw.header.codec
}
// CompressionName returns the name of the compression algorithm used by
// OCFWriter. This function is provided because upstream may be appending to
// an existing OCF which uses a different compression algorithm than
// requested during instantiation.
func (ocfw *OCFWriter) CompressionName() string {
switch ocfw.header.compressionID {
case compressionNull:
return CompressionNullLabel
case compressionDeflate:
return CompressionDeflateLabel
case compressionSnappy:
return CompressionSnappyLabel
default:
return "should not get here: unrecognized compression algorithm"
}
}
package goavro
import (
"encoding/binary"
"fmt"
"io"
)
// rabinEmpty is the CRC-64-AVRO fingerprint of an empty buffer, and is used
// as the seed value when computing the fingerprint of every object schema.
const rabinEmpty = uint64(0xc15d213aa4d7a795)
// rabinTable is never modified after initialization; its values are read to
// compute the CRC-64-AVRO fingerprint of every schema it is given.
var rabinTable = [256]uint64{
0,
3238593523956797946,
6477187047913595892,
8435907220062204430,
12954374095827191784,
11472609148414072338,
16871814440124408860,
14327483619285186022,
16515860097293205755,
14539261057490653441,
13607494391182877455,
10387063993012335349,
6265406319754774291,
8791864835633305321,
1085550678754862311,
2585467722461443357,
5247393906202824413,
7215812591205457703,
1239030555549527337,
4449591751341063379,
18092457712352332085,
15556728100436498639,
11742789833002527425,
10234164645493242683,
12530812639509548582,
9302088354573213660,
17583729671266610642,
15633189885995973672,
2171101357509724622,
3661574416647526452,
5170935444922886714,
7724537325157989312,
10494787812405648826,
13642865964979244096,
14431625182410915406,
16480541316673728436,
2478061111099054674,
1049933365183482792,
8899183502682126758,
6300970840149272668,
8399466921467862337,
6368420890995002555,
3275086581351513781,
108854135608684367,
14364169659802000041,
16980263386864569171,
11435870349096892765,
12845837170396948647,
15669858317114364775,
17692196227407282845,
9265331945857609875,
12422293323479818601,
7688114635962061967,
5062151678603773301,
3698085083440658299,
2279937883717887617,
4342202715019449244,
1203395666939462246,
7323148833295052904,
5282940851558637970,
10341870889845773428,
11778178981837571470,
15449074650315978624,
18057156506771531386,
11669866394404287583,
10160817855121008037,
17874829710049597355,
15339802717267265105,
1311848476550706103,
4523114428088083021,
5464845951130112067,
7432843562972398009,
4956122222198109348,
7509300761534850398,
2099866730366965584,
3591042414950500010,
17798367005364253516,
15848531969535615670,
12601941680298545336,
9372796311334617410,
16798933842935724674,
14253900473960229752,
12736841781990005110,
11255500115345754252,
6550173162703027562,
8509314479008689296,
217708271217368734,
3455596968422674276,
870833084869474937,
2370047569572014979,
6194214610827729293,
8721096401170761847,
13822387873690697105,
10602378625989962859,
16587157392570359397,
14609853536892473247,
3483332339477899749,
2064482512161650719,
7616958077116566033,
4991418462803860459,
9480190278288059917,
12637572737790640119,
15741190762473065977,
17762823925471730691,
15376229271924123934,
17983608511393921252,
10124303357207546602,
11561034798826117904,
7396170166881316598,
5356383260452470540,
4559875767435775234,
1420363961462201592,
8684405430038898488,
6085769495188764354,
2406791333878924492,
979366144819647798,
14646297666590105808,
16695918618875998506,
10565881703117275940,
13713538703073841886,
11362911691697612739,
12772455230081578553,
14146576876296094775,
16763373153642681805,
3347869283551649835,
182341662412566993,
8616954185191982047,
6585487012709290533,
13933329357911598997,
17126321439046432367,
11006435164953838689,
12992741788688209307,
8257930048646602877,
6803747195591438727,
3132703159877387145,
542775339377431155,
2623696953101412206,
619515277774763668,
9046228856176166042,
5871394916501263712,
10929691902260224134,
13501751302614184316,
14865687125944796018,
16338017159720129160,
9912244444396218696,
11925134239902742706,
15018601523069700796,
18202706530865158982,
4199733460733931168,
1637543290675756890,
7182084829901000020,
5717935174548446382,
7834929158557182387,
4632665972928804937,
3844057317981030983,
1849042541720329149,
16103865201353027163,
17549867708331900833,
9700748483321744815,
12280807109898935381,
5834933197202143791,
8937414855024798677,
655924238275353051,
2732422975565056033,
16374796089197559239,
14974255385173568573,
13465025131935292979,
10821211621719183305,
13100346325406055124,
11041713811386575662,
17018628958017378592,
13897997918303815898,
435416542434737468,
3097107305413864646,
6911193936845348552,
8293578696285179698,
1741666169738949874,
3808479038558283016,
4740095139144029958,
7870595381236532988,
12388429221655458586,
9736009554713699040,
17442192802341523694,
16068516186704462100,
18239503069743100937,
15127152172900050419,
11888425678624364541,
9803746554456753671,
5681455845848806369,
7073288438148047387,
1673934641775824917,
4308477092595991023,
6966664678955799498,
5503217582476919344,
4128965024323301438,
1566351579938693572,
15233916154233132066,
18417600011429070296,
9982836925607720918,
11996431537128302124,
9627165335515697969,
12207926510359495371,
15886756170769674437,
17332335396841578815,
3917464579278591193,
1922028658990515491,
8051932600676513581,
4850374241660872407,
2917466598601071895,
327962119137676525,
8187398044598779619,
6732512565967646489,
11221777246008269567,
13207379120439233285,
14004037317153847563,
17197450482186430705,
14792340333762633196,
16265093719173729302,
10712766520904941080,
13284123302255603682,
9119751534871550468,
5944212839312182270,
2840727922924403184,
836967320887912458,
17368810860077796976,
15995557527495450506,
12171538990377528708,
9518416773021940862,
4813582667757848984,
7943378085384837218,
1958732289639295596,
4025966300338256790,
1458733299300535947,
4093699022299389809,
5610888623004134783,
7002018658576923781,
12103802978479819107,
10018419036150929561,
18310175810188503703,
15198246066092718957,
13391477134206599341,
10748366240846565719,
16157651908532642649,
14756687855020634787,
729366649650267973,
2805444311502067391,
6051901489239909553,
9155087905094251851,
6695738567103299670,
8078825954266321324,
364683324825133986,
3025950744619954776,
17233908370383964094,
14112856248920397380,
13170974025418581066,
11113046258555286960,
}
// rabin returns an unsigned 64-bit integer Rabin fingerprint for buf. NOTE:
// This is only used during Codec instantiation to calculate the Rabin
// fingerprint of the canonical schema.
func rabin(buf []byte) uint64 {
fp := rabinEmpty
for i := 0; i < len(buf); i++ {
fp = (fp >> 8) ^ rabinTable[(byte(fp)^buf[i])&0xff] // >> on uint64 is already a logical (unsigned) shift
}
return fp
}
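// As a sanity check, rabin(nil) == rabinEmpty: with no input bytes the loop
// never runs and the seed value is returned unchanged. A minimal sketch of
// building a fingerprint-keyed codex for use with FingerprintFromSOE below
// (hedged: assumes the canonical form of a bare primitive schema is simply
// its quoted name):
//
// codec, err := NewCodec(`"string"`)
// if err != nil { /* handle error */ }
// codex := map[uint64]*Codec{rabin([]byte(`"string"`)): codec}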
const soeMagicPrefix = 2 // 2-byte prefix for SOE encoded data
const soeHeaderLen = soeMagicPrefix + 8 // 2-byte prefix plus 8-byte fingerprint
// FingerprintFromSOE returns the unsigned 64-bit Rabin fingerprint from the
// header of a buffer that encodes a Single-Object Encoded datum. This function
// is designed to be used to look up a Codec that can decode the contents of
// the buffer. Once a Codec with the matching Rabin fingerprint is found, its
// NativeFromBinary method may be used to decode the remaining bytes, which are
// returned as the second return value. On failure this function returns an
// ErrNotSingleObjectEncoded error.
//
// func decode(codex map[uint64]*goavro.Codec, buf []byte) error {
// // Perform a sanity check on the buffer, then return the Rabin fingerprint
// // of the schema used to encode the data.
// fingerprint, newBuf, err := goavro.FingerprintFromSOE(buf)
// if err != nil {
// return err
// }
//
// // Get a previously stored Codec from the codex map.
// codec, ok := codex[fingerprint]
// if !ok {
// return fmt.Errorf("unknown codec: %#x", fingerprint)
// }
//
// // Use the fetched Codec to decode the buffer as a SOE.
// //
// // Faster because the SOE magic prefix and schema fingerprint have
// // already been checked and used to fetch the Codec. Only the binary
// // bytes remaining after the header was removed need to be decoded.
// datum, _, err := codec.NativeFromBinary(newBuf)
// if err != nil {
// return err
// }
//
// _, err = fmt.Println(datum)
// return err
// }
func FingerprintFromSOE(buf []byte) (uint64, []byte, error) {
if len(buf) < soeHeaderLen {
// Not enough bytes to contain the SOE header and schema fingerprint.
return 0, nil, ErrNotSingleObjectEncoded(io.ErrShortBuffer.Error())
}
if buf[0] != 0xC3 || buf[1] != 0x01 {
// Currently only one SOE prefix is recognized.
return 0, nil, ErrNotSingleObjectEncoded(fmt.Sprintf("unknown SOE prefix: %#x", buf[:soeMagicPrefix]))
}
// Only version 1 of the single-object encoding format is recognized.
return binary.LittleEndian.Uint64(buf[soeMagicPrefix:]), buf[soeHeaderLen:], nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
)
func makeRecordCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
// NOTE: To support recursive data types, create the codec and register it
// using the specified name, and fill in the codec functions later.
c, err := registerNewCodec(st, schemaMap, enclosingNamespace)
if err != nil {
return nil, fmt.Errorf("Record ought to have valid name: %s", err)
}
fields, ok := schemaMap["fields"]
if !ok {
return nil, fmt.Errorf("Record %q ought to have fields key", c.typeName)
}
fieldSchemas, ok := fields.([]interface{})
if !ok || fieldSchemas == nil {
return nil, fmt.Errorf("Record %q fields ought to be non-nil array: %v", c.typeName, fields)
}
codecFromFieldName := make(map[string]*Codec)
codecFromIndex := make([]*Codec, len(fieldSchemas))
nameFromIndex := make([]string, len(fieldSchemas))
defaultValueFromName := make(map[string]interface{}, len(fieldSchemas))
for i, fieldSchema := range fieldSchemas {
fieldSchemaMap, ok := fieldSchema.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("Record %q field %d ought to be valid Avro named type; received: %v", c.typeName, i+1, fieldSchema)
}
// NOTE: field names are not registered in the symbol table, because
// field names are not individually addressable codecs.
fieldCodec, err := buildCodecForTypeDescribedByMap(st, c.typeName.namespace, fieldSchemaMap)
if err != nil {
return nil, fmt.Errorf("Record %q field %d ought to be valid Avro named type: %s", c.typeName, i+1, err)
}
// However, when creating a full name for the field name, be sure to use
// record's namespace
n, err := newNameFromSchemaMap(c.typeName.namespace, fieldSchemaMap)
if err != nil {
return nil, fmt.Errorf("Record %q field %d ought to have valid name: %v", c.typeName, i+1, fieldSchemaMap)
}
fieldName := n.short()
if _, ok := codecFromFieldName[fieldName]; ok {
return nil, fmt.Errorf("Record %q field %d ought to have unique name: %q", c.typeName, i+1, fieldName)
}
if defaultValue, ok := fieldSchemaMap["default"]; ok {
typeNameShort := fieldCodec.typeName.short()
switch typeNameShort {
case "boolean":
v, ok := defaultValue.(bool)
if !ok {
// NOTE: err is nil on this path, so report the offending value rather
// than formatting a nil error.
return nil, fmt.Errorf("Record %q field %q: default value ought to be a boolean; received: %T(%#v)", c.typeName, fieldName, defaultValue, defaultValue)
}
defaultValue = v
case "bytes":
v, ok := defaultValue.(string)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be a string; received: %T(%#v)", c.typeName, fieldName, defaultValue, defaultValue)
}
defaultValue = []byte(v)
case "double":
v, ok := defaultValue.(float64)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be a number; received: %T(%#v)", c.typeName, fieldName, defaultValue, defaultValue)
}
defaultValue = v
case "float":
v, ok := defaultValue.(float64)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be a number; received: %T(%#v)", c.typeName, fieldName, defaultValue, defaultValue)
}
defaultValue = float32(v)
case "int":
v, ok := defaultValue.(float64)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be a number; received: %T(%#v)", c.typeName, fieldName, defaultValue, defaultValue)
}
defaultValue = int32(v)
case "long":
v, ok := defaultValue.(float64)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be a number; received: %T(%#v)", c.typeName, fieldName, defaultValue, defaultValue)
}
defaultValue = int64(v)
case "string":
v, ok := defaultValue.(string)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be a string; received: %T(%#v)", c.typeName, fieldName, defaultValue, defaultValue)
}
defaultValue = v
case "union":
// When codec is union, then default value ought to encode using
// first schema in union. NOTE: To support a null default
// value, the string literal "null" must be coerced to a `nil`
if defaultValue == "null" {
defaultValue = nil
}
// NOTE: To support record field default values, union schema
// set to the type name of first member
// TODO: change to schemaCanonical below
defaultValue = Union(fieldCodec.schemaOriginal, defaultValue)
default:
debug("fieldName: %q; type: %q; defaultValue: %T(%#v)\n", fieldName, c.typeName, defaultValue, defaultValue)
}
// attempt to encode default value using codec
_, err = fieldCodec.binaryFromNative(nil, defaultValue)
if err != nil {
return nil, fmt.Errorf("Record %q field %q: default value ought to encode using field schema: %s", c.typeName, fieldName, err)
}
defaultValueFromName[fieldName] = defaultValue
}
nameFromIndex[i] = fieldName
codecFromIndex[i] = fieldCodec
codecFromFieldName[fieldName] = fieldCodec
}
c.binaryFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
valueMap, ok := datum.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("cannot encode binary record %q: expected map[string]interface{}; received: %T", c.typeName, datum)
}
// records encoded in order fields were defined in schema
for i, fieldCodec := range codecFromIndex {
fieldName := nameFromIndex[i]
// NOTE: If field value was not specified in map, then set
// fieldValue to its default value (which may or may not have been
// specified).
fieldValue, ok := valueMap[fieldName]
if !ok {
if fieldValue, ok = defaultValueFromName[fieldName]; !ok {
return nil, fmt.Errorf("cannot encode binary record %q field %q: schema does not specify default value and no value provided", c.typeName, fieldName)
}
}
var err error
buf, err = fieldCodec.binaryFromNative(buf, fieldValue)
if err != nil {
return nil, fmt.Errorf("cannot encode binary record %q field %q: value does not match its schema: %s", c.typeName, fieldName, err)
}
}
return buf, nil
}
c.nativeFromBinary = func(buf []byte) (interface{}, []byte, error) {
recordMap := make(map[string]interface{}, len(codecFromIndex))
for i, fieldCodec := range codecFromIndex {
name := nameFromIndex[i]
var value interface{}
var err error
value, buf, err = fieldCodec.nativeFromBinary(buf)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode binary record %q field %q: %s", c.typeName, name, err)
}
recordMap[name] = value
}
return recordMap, buf, nil
}
c.nativeFromTextual = func(buf []byte) (interface{}, []byte, error) {
var mapValues map[string]interface{}
var err error
// NOTE: Setting `defaultCodec == nil` instructs genericMapTextDecoder
// to return an error when a field name is not found in the
// codecFromFieldName map.
mapValues, buf, err = genericMapTextDecoder(buf, nil, codecFromFieldName)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual record %q: %s", c.typeName, err)
}
if actual, expected := len(mapValues), len(codecFromFieldName); actual != expected {
// set missing field keys to their respective default values, then
// re-check number of keys
for fieldName, defaultValue := range defaultValueFromName {
if _, ok := mapValues[fieldName]; !ok {
mapValues[fieldName] = defaultValue
}
}
if actual, expected = len(mapValues), len(codecFromFieldName); actual != expected {
return nil, nil, fmt.Errorf("cannot decode textual record %q: only found %d of %d fields", c.typeName, actual, expected)
}
}
return mapValues, buf, nil
}
c.textualFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
// NOTE: Ensure only schema defined field names are encoded; and if
// missing in datum, either use the provided field default value or
// return an error.
sourceMap, ok := datum.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("cannot encode textual record %q: expected map[string]interface{}; received: %T", c.typeName, datum)
}
destMap := make(map[string]interface{}, len(codecFromIndex))
for fieldName := range codecFromFieldName {
fieldValue, ok := sourceMap[fieldName]
if !ok {
defaultValue, ok := defaultValueFromName[fieldName]
if !ok {
return nil, fmt.Errorf("cannot encode textual record %q field %q: schema does not specify default value and no value provided", c.typeName, fieldName)
}
fieldValue = defaultValue
}
destMap[fieldName] = fieldValue
}
datum = destMap
// NOTE: Setting `defaultCodec == nil` instructs genericMapTextEncoder
// to return an error when a field name is not found in the
// codecFromFieldName map.
return genericMapTextEncoder(buf, datum, nil, codecFromFieldName)
}
return c, nil
}
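// A behavior sketch for the default-value path above (the schema and values
// are illustrative):
//
// codec, _ := NewCodec(`{"type":"record","name":"r","fields":[
//     {"name":"a","type":"long"},
//     {"name":"b","type":"string","default":"n/a"}]}`)
// buf, _ := codec.BinaryFromNative(nil, map[string]interface{}{"a": int64(1)})
//
// The missing "b" is filled from defaultValueFromName, so buf encodes the
// long 1 followed by the string "n/a".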
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"io"
"unicode"
)
// advanceAndConsume advances to the next non-whitespace byte and returns an
// error if that byte is not the one expected.
func advanceAndConsume(buf []byte, expected byte) ([]byte, error) {
var err error
if buf, err = advanceToNonWhitespace(buf); err != nil {
return nil, err
}
if actual := buf[0]; actual != expected {
return nil, fmt.Errorf("expected: %q; actual: %q", expected, actual)
}
return buf[1:], nil
}
// advanceToNonWhitespace consumes bytes from buf until a non-whitespace
// character is found. It returns an error when no more bytes remain, because
// its purpose is to scan ahead to the next non-whitespace character.
func advanceToNonWhitespace(buf []byte) ([]byte, error) {
for i, b := range buf {
if !unicode.IsSpace(rune(b)) {
return buf[i:], nil
}
}
return nil, io.ErrShortBuffer
}
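// For example, advanceAndConsume([]byte("  :1"), ':') returns []byte("1"),
// advanceAndConsume([]byte("  ,1"), ':') returns the `expected: ':'; actual:
// ','` error, and advanceToNonWhitespace([]byte("   ")) returns
// io.ErrShortBuffer.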
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"errors"
"fmt"
)
// Union wraps a datum value in a map for encoding as a Union, as required by
// Union encoder.
//
// When providing a value for an Avro union, the encoder will accept `nil` for a
// `null` value. If the value is non-`nil`, it must be a
// `map[string]interface{}` with a single key-value pair, where the key is the
// Avro type name and the value is the datum's value. As a convenience, the
// `Union` function wraps any datum value in a map as specified above.
//
// func ExampleUnion() {
// codec, err := goavro.NewCodec(`["null","string","int"]`)
// if err != nil {
// fmt.Println(err)
// }
// buf, err := codec.TextualFromNative(nil, goavro.Union("string", "some string"))
// if err != nil {
// fmt.Println(err)
// }
// fmt.Println(string(buf))
// // Output: {"string":"some string"}
// }
func Union(name string, datum interface{}) interface{} {
if datum == nil && name == "null" {
return nil
}
return map[string]interface{}{name: datum}
}
func buildCodecForTypeDescribedBySlice(st map[string]*Codec, enclosingNamespace string, schemaArray []interface{}) (*Codec, error) {
if len(schemaArray) == 0 {
return nil, errors.New("Union ought to have one or more members")
}
allowedTypes := make([]string, len(schemaArray)) // used for error reporting when encoder receives invalid datum type
codecFromIndex := make([]*Codec, len(schemaArray))
codecFromName := make(map[string]*Codec, len(schemaArray))
indexFromName := make(map[string]int, len(schemaArray))
for i, unionMemberSchema := range schemaArray {
unionMemberCodec, err := buildCodec(st, enclosingNamespace, unionMemberSchema)
if err != nil {
return nil, fmt.Errorf("Union item %d ought to be valid Avro type: %s", i+1, err)
}
fullName := unionMemberCodec.typeName.fullName
if _, ok := indexFromName[fullName]; ok {
return nil, fmt.Errorf("Union item %d ought to be unique type: %s", i+1, unionMemberCodec.typeName)
}
allowedTypes[i] = fullName
codecFromIndex[i] = unionMemberCodec
codecFromName[fullName] = unionMemberCodec
indexFromName[fullName] = i
}
return &Codec{
// NOTE: To support record field default values, union schema set to the
// type name of first member
// TODO: add/change to schemaCanonical below
schemaOriginal: codecFromIndex[0].typeName.fullName,
typeName: &name{"union", nullNamespace},
nativeFromBinary: func(buf []byte) (interface{}, []byte, error) {
var decoded interface{}
var err error
decoded, buf, err = longNativeFromBinary(buf)
if err != nil {
return nil, nil, err
}
index := decoded.(int64) // longDecoder always returns int64, so elide error checking
if index < 0 || index >= int64(len(codecFromIndex)) {
return nil, nil, fmt.Errorf("cannot decode binary union: index ought to be between 0 and %d; read index: %d", len(codecFromIndex)-1, index)
}
c := codecFromIndex[index]
decoded, buf, err = c.nativeFromBinary(buf)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode binary union item %d: %s", index+1, err)
}
if decoded == nil {
// do not wrap a nil value in a map
return nil, buf, nil
}
// Non-nil values are wrapped in a map with single key set to type name of value
return Union(allowedTypes[index], decoded), buf, nil
},
binaryFromNative: func(buf []byte, datum interface{}) ([]byte, error) {
switch v := datum.(type) {
case nil:
index, ok := indexFromName["null"]
if !ok {
return nil, fmt.Errorf("cannot encode binary union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum)
}
return longBinaryFromNative(buf, index)
case map[string]interface{}:
if len(v) != 1 {
return nil, fmt.Errorf("cannot encode binary union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum)
}
// will execute exactly once
for key, value := range v {
index, ok := indexFromName[key]
if !ok {
return nil, fmt.Errorf("cannot encode binary union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum)
}
c := codecFromIndex[index]
buf, _ = longBinaryFromNative(buf, index)
return c.binaryFromNative(buf, value)
}
}
return nil, fmt.Errorf("cannot encode binary union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum)
},
nativeFromTextual: func(buf []byte) (interface{}, []byte, error) {
if len(buf) >= 4 && bytes.Equal(buf[:4], []byte("null")) {
if _, ok := indexFromName["null"]; ok {
return nil, buf[4:], nil
}
}
var datum interface{}
var err error
datum, buf, err = genericMapTextDecoder(buf, nil, codecFromName)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual union: %s", err)
}
return datum, buf, nil
},
textualFromNative: func(buf []byte, datum interface{}) ([]byte, error) {
switch v := datum.(type) {
case nil:
_, ok := indexFromName["null"]
if !ok {
return nil, fmt.Errorf("cannot encode textual union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum)
}
return append(buf, "null"...), nil
case map[string]interface{}:
if len(v) != 1 {
return nil, fmt.Errorf("cannot encode textual union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum)
}
// will execute exactly once
for key, value := range v {
index, ok := indexFromName[key]
if !ok {
return nil, fmt.Errorf("cannot encode textual union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum)
}
buf = append(buf, '{')
var err error
buf, err = stringTextualFromNative(buf, key)
if err != nil {
return nil, fmt.Errorf("cannot encode textual union: %s", err)
}
buf = append(buf, ':')
c := codecFromIndex[index]
buf, err = c.textualFromNative(buf, value)
if err != nil {
return nil, fmt.Errorf("cannot encode textual union: %s", err)
}
return append(buf, '}'), nil
}
}
return nil, fmt.Errorf("cannot encode textual union: non-nil values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum)
},
}, nil
}
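// A round-trip sketch of the union codec built above (values illustrative):
//
// codec, _ := NewCodec(`["null","long"]`)
// buf, _ := codec.BinaryFromNative(nil, Union("long", int64(3)))
// datum, _, _ := codec.NativeFromBinary(buf)
//
// buf holds the zig-zag encoded union index (1) followed by the encoded long,
// and datum decodes back to map[string]interface{}{"long": int64(3)}; a nil
// datum would encode the "null" index and decode to plain nil.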
......@@ -122,6 +122,8 @@ github.com/golang/protobuf/ptypes
github.com/golang/protobuf/ptypes/any
github.com/golang/protobuf/ptypes/duration
github.com/golang/protobuf/ptypes/timestamp
# github.com/golang/snappy v0.0.1
github.com/golang/snappy
# github.com/google/flatbuffers v1.11.0
github.com/google/flatbuffers/go
# github.com/gopherjs/gopherjs v0.0.0-20190430165422-3e4dfb77656c
......@@ -167,6 +169,8 @@ github.com/klauspost/cpuid
github.com/lib/pq
github.com/lib/pq/oid
github.com/lib/pq/scram
# github.com/linkedin/goavro/v2 v2.9.7
github.com/linkedin/goavro/v2
# github.com/mattetti/filebuffer v1.0.0
github.com/mattetti/filebuffer
# github.com/mattn/go-colorable v0.1.4
......