Commit fe8b5334 by Emil Tullstedt (committed by GitHub)

Modules: Add patched goavro dependency for extensions (#21027)

parent 32c9d558
@@ -4,7 +4,9 @@ go 1.13
require (
github.com/BurntSushi/toml v0.3.1
github.com/DataDog/zstd v1.4.4 // indirect
github.com/VividCortex/mysqlerr v0.0.0-20170204212430-6c6b55f8796f
github.com/apache/thrift v0.13.0 // indirect
github.com/aws/aws-sdk-go v1.25.48
github.com/beevik/etree v1.1.0 // indirect
github.com/benbjohnson/clock v0.0.0-20161215174838-7dc76406b6d3
@@ -42,6 +44,7 @@ require (
github.com/klauspost/compress v1.4.1 // indirect
github.com/klauspost/cpuid v1.2.0 // indirect
github.com/lib/pq v1.2.0
github.com/linkedin/goavro/v2 v2.9.7
github.com/mattn/go-isatty v0.0.10
github.com/mattn/go-sqlite3 v1.11.0
github.com/opentracing/opentracing-go v1.1.0
@@ -62,6 +65,8 @@ require (
github.com/uber/jaeger-client-go v2.16.0+incompatible
github.com/uber/jaeger-lib v2.0.0+incompatible // indirect
github.com/unknwon/com v1.0.1
github.com/xitongsys/parquet-go v1.4.0 // indirect
github.com/xitongsys/parquet-go-source v0.0.0-20191104003508-ecfa341356a6 // indirect
github.com/yudai/gojsondiff v1.0.0
github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82 // indirect
github.com/yudai/pp v2.0.1+incompatible // indirect
......
@@ -3,12 +3,16 @@ cloud.google.com/go v0.34.0 h1:eOI3/cP2VTU6uZLDYAoic+eyzzB9YyGmJ7eIjl8rOPg=
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/DataDog/zstd v1.4.4 h1:+IawcoXhCBylN7ccwdwf8LOH2jKq7NavGpEPanrlTzE=
github.com/DataDog/zstd v1.4.4/go.mod h1:1jcaCB/ufaK+sKp1NBhlGmpz41jOoPQ35bpF36t7BBo=
github.com/VividCortex/mysqlerr v0.0.0-20170204212430-6c6b55f8796f h1:HR5nRmUQgXrwqZOwZ2DAc/aCi3Bu3xENpspW935vxu0=
github.com/VividCortex/mysqlerr v0.0.0-20170204212430-6c6b55f8796f/go.mod h1:f3HiCrHjHBdcm6E83vGaXh1KomZMA2P6aeo3hKx/wg0=
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
github.com/apache/arrow/go/arrow v0.0.0-20190716210558-5f564424c71c h1:iHUHzx3S1TU5xt+D7vLb0PAk3e+RfayF9IhR6+hyO/k=
github.com/apache/arrow/go/arrow v0.0.0-20190716210558-5f564424c71c/go.mod h1:VTxUBvSJ3s3eHAg65PNgrsn5BtqCRPdmyXh6rAfdxN0=
github.com/apache/thrift v0.13.0 h1:5hryIiq9gtn+MiLVn0wP37kb/uTeRZgN08WoCsAhIhI=
github.com/apache/thrift v0.13.0/go.mod h1:cp2SuWMxlEZw2r+iP2GNCdIi4C1qmUzdZFSVb+bacwQ=
github.com/aws/aws-sdk-go v1.25.48 h1:J82DYDGZHOKHdhx6hD24Tm30c2C3GchYGfN0mf9iKUk=
github.com/aws/aws-sdk-go v1.25.48/go.mod h1:KmX6BPdI08NWTb3/sm4ZGu5ShLoqVDhKgpiN924inxo=
github.com/beevik/etree v1.0.1/go.mod h1:r8Aw8JqVegEf0w2fDnATrX9VpkMcyFeM0FhwO62wh+A=
@@ -97,6 +101,7 @@ github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5y
github.com/golang/protobuf v1.3.2 h1:6nsPYzhq5kReh6QImI3k5qWzO4PEbvbIW2cwSfR/6xs=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/snappy v0.0.0-20180518054509-2e65f85255db/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
github.com/google/flatbuffers v1.11.0 h1:O7CEyB8Cb3/DmtxODGtLHcEvpr81Jm5qLg/hsHnxA2A=
github.com/google/flatbuffers v1.11.0/go.mod h1:1AeVuKshWv4vARoZatz6mlQ0JxURH0Kv5+zNeJKJCa8=
@@ -163,6 +168,8 @@ github.com/lib/pq v1.0.0 h1:X5PMW56eZitiTeO7tKzZxFCSpbFZJtkMMooicw2us9A=
github.com/lib/pq v1.0.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/lib/pq v1.2.0 h1:LXpIM/LZ5xGFhOpXAQUIMM1HdyqzVYM13zNdjCEEcA0=
github.com/lib/pq v1.2.0/go.mod h1:5WUZQaWbwv1U+lTReE5YruASi9Al49XbQIvNi/34Woo=
github.com/linkedin/goavro/v2 v2.9.7 h1:Vd++Rb/RKcmNJjM0HP/JJFMEWa21eUBVKPYlKehOGrM=
github.com/linkedin/goavro/v2 v2.9.7/go.mod h1:UgQUb2N/pmueQYH9bfqFioWxzYCZXSfF8Jw03O5sjqA=
github.com/lunny/log v0.0.0-20160921050905-7887c61bf0de/go.mod h1:3q8WtuPQsoRbatJuy3nvq/hRSvuBJrHHr+ybPPiNvHQ=
github.com/lunny/nodb v0.0.0-20160621015157-fc1ef06ad4af/go.mod h1:Cqz6pqow14VObJ7peltM+2n3PWOz7yTrfUuGbVFkzN0=
github.com/mattetti/filebuffer v1.0.0 h1:ixTvQ0JjBTwWbdpDZ98lLrydo7KRi8xNRIi5RFszsbY=
@@ -283,6 +290,10 @@ github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e h1:GSGeB9EAKY2spCABz6x
github.com/unknwon/com v0.0.0-20190804042917-757f69c95f3e/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
github.com/unknwon/com v1.0.1 h1:3d1LTxD+Lnf3soQiD4Cp/0BRB+Rsa/+RTvz8GMMzIXs=
github.com/unknwon/com v1.0.1/go.mod h1:tOOxU81rwgoCLoOVVPHb6T/wt8HZygqH5id+GNnlCXM=
github.com/xitongsys/parquet-go v1.4.0 h1:+3+QFRRwAilhTdNcJU2hPxslLCAKJ+Tn8C2OhnCVWDo=
github.com/xitongsys/parquet-go v1.4.0/go.mod h1:on8bl2K/PEouGNEJqxht0t3K4IyN/ABeFu84Hh3lzrE=
github.com/xitongsys/parquet-go-source v0.0.0-20191104003508-ecfa341356a6 h1:KPDKkdchSII+K5KS7iMpE062MVh2OucaM31599ER4U0=
github.com/xitongsys/parquet-go-source v0.0.0-20191104003508-ecfa341356a6/go.mod h1:xxCx7Wpym/3QCo6JhujJX51dzSXrwmb0oH6FQb39SEA=
github.com/yudai/gojsondiff v1.0.0 h1:27cbfqXLVEJ1o8I6v3y9lg8Ydm53EKqHXAOMxEGlCOA=
github.com/yudai/gojsondiff v1.0.0/go.mod h1:AY32+k2cwILAkW1fbgxQ5mUmMiZFgLIV+FBNExI05xg=
github.com/yudai/golcs v0.0.0-20170316035057-ecda9a501e82 h1:BHyfKlQyqbsFN5p3IfnEUduWvb9is428/nNb5L3U01M=
......
@@ -9,6 +9,7 @@ import (
"github.com/grafana/grafana/pkg/models"
"github.com/grafana/grafana/pkg/registry"
_ "github.com/jung-kurt/gofpdf"
_ "github.com/linkedin/goavro/v2"
_ "github.com/pkg/errors"
_ "github.com/robfig/cron"
_ "github.com/robfig/cron/v3"
......
cmd/snappytool/snappytool
testdata/bench
# These explicitly listed benchmark data files are for an obsolete version of
# snappy_test.go.
testdata/alice29.txt
testdata/asyoulik.txt
testdata/fireworks.jpeg
testdata/geo.protodata
testdata/html
testdata/html_x_4
testdata/kppkn.gtb
testdata/lcet10.txt
testdata/paper-100k.pdf
testdata/plrabn12.txt
testdata/urls.10K
# This is the official list of Snappy-Go authors for copyright purposes.
# This file is distinct from the CONTRIBUTORS files.
# See the latter for an explanation.
# Names should be added to this file as
# Name or Organization <email address>
# The email address is not required for organizations.
# Please keep the list sorted.
Damian Gryski <dgryski@gmail.com>
Google Inc.
Jan Mercl <0xjnml@gmail.com>
Rodolfo Carvalho <rhcarvalho@gmail.com>
Sebastien Binet <seb.binet@gmail.com>
# This is the official list of people who can contribute
# (and typically have contributed) code to the Snappy-Go repository.
# The AUTHORS file lists the copyright holders; this file
# lists people. For example, Google employees are listed here
# but not in AUTHORS, because Google holds the copyright.
#
# The submission process automatically checks to make sure
# that people submitting code are listed in this file (by email address).
#
# Names should be added to this file only after verifying that
# the individual or the individual's organization has agreed to
# the appropriate Contributor License Agreement, found here:
#
# http://code.google.com/legal/individual-cla-v1.0.html
# http://code.google.com/legal/corporate-cla-v1.0.html
#
# The agreement for individuals can be filled out on the web.
#
# When adding J Random Contributor's name to this file,
# either J's name or J's organization's name should be
# added to the AUTHORS file, depending on whether the
# individual or corporate CLA was used.
# Names should be added to this file like so:
# Name <email address>
# Please keep the list sorted.
Damian Gryski <dgryski@gmail.com>
Jan Mercl <0xjnml@gmail.com>
Kai Backman <kaib@golang.org>
Marc-Antoine Ruel <maruel@chromium.org>
Nigel Tao <nigeltao@golang.org>
Rob Pike <r@golang.org>
Rodolfo Carvalho <rhcarvalho@gmail.com>
Russ Cox <rsc@golang.org>
Sebastien Binet <seb.binet@gmail.com>
Copyright (c) 2011 The Snappy-Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
The Snappy compression format in the Go programming language.
To download and install from source:
$ go get github.com/golang/snappy
Unless otherwise noted, the Snappy-Go source files are distributed
under the BSD-style license found in the LICENSE file.
Benchmarks.
The golang/snappy benchmarks include compressing (Z) and decompressing (U) ten
or so files, the same set used by the C++ Snappy code (github.com/google/snappy
and note the "google", not "golang"). On an "Intel(R) Core(TM) i7-3770 CPU @
3.40GHz", Go's GOARCH=amd64 numbers as of 2016-05-29:
"go test -test.bench=."
_UFlat0-8 2.19GB/s ± 0% html
_UFlat1-8 1.41GB/s ± 0% urls
_UFlat2-8 23.5GB/s ± 2% jpg
_UFlat3-8 1.91GB/s ± 0% jpg_200
_UFlat4-8 14.0GB/s ± 1% pdf
_UFlat5-8 1.97GB/s ± 0% html4
_UFlat6-8 814MB/s ± 0% txt1
_UFlat7-8 785MB/s ± 0% txt2
_UFlat8-8 857MB/s ± 0% txt3
_UFlat9-8 719MB/s ± 1% txt4
_UFlat10-8 2.84GB/s ± 0% pb
_UFlat11-8 1.05GB/s ± 0% gaviota
_ZFlat0-8 1.04GB/s ± 0% html
_ZFlat1-8 534MB/s ± 0% urls
_ZFlat2-8 15.7GB/s ± 1% jpg
_ZFlat3-8 740MB/s ± 3% jpg_200
_ZFlat4-8 9.20GB/s ± 1% pdf
_ZFlat5-8 991MB/s ± 0% html4
_ZFlat6-8 379MB/s ± 0% txt1
_ZFlat7-8 352MB/s ± 0% txt2
_ZFlat8-8 396MB/s ± 1% txt3
_ZFlat9-8 327MB/s ± 1% txt4
_ZFlat10-8 1.33GB/s ± 1% pb
_ZFlat11-8 605MB/s ± 1% gaviota
"go test -test.bench=. -tags=noasm"
_UFlat0-8 621MB/s ± 2% html
_UFlat1-8 494MB/s ± 1% urls
_UFlat2-8 23.2GB/s ± 1% jpg
_UFlat3-8 1.12GB/s ± 1% jpg_200
_UFlat4-8 4.35GB/s ± 1% pdf
_UFlat5-8 609MB/s ± 0% html4
_UFlat6-8 296MB/s ± 0% txt1
_UFlat7-8 288MB/s ± 0% txt2
_UFlat8-8 309MB/s ± 1% txt3
_UFlat9-8 280MB/s ± 1% txt4
_UFlat10-8 753MB/s ± 0% pb
_UFlat11-8 400MB/s ± 0% gaviota
_ZFlat0-8 409MB/s ± 1% html
_ZFlat1-8 250MB/s ± 1% urls
_ZFlat2-8 12.3GB/s ± 1% jpg
_ZFlat3-8 132MB/s ± 0% jpg_200
_ZFlat4-8 2.92GB/s ± 0% pdf
_ZFlat5-8 405MB/s ± 1% html4
_ZFlat6-8 179MB/s ± 1% txt1
_ZFlat7-8 170MB/s ± 1% txt2
_ZFlat8-8 189MB/s ± 1% txt3
_ZFlat9-8 164MB/s ± 1% txt4
_ZFlat10-8 479MB/s ± 1% pb
_ZFlat11-8 270MB/s ± 1% gaviota
For comparison (Go's encoded output is byte-for-byte identical to C++'s), here
are the numbers from C++ Snappy's
make CXXFLAGS="-O2 -DNDEBUG -g" clean snappy_unittest.log && cat snappy_unittest.log
BM_UFlat/0 2.4GB/s html
BM_UFlat/1 1.4GB/s urls
BM_UFlat/2 21.8GB/s jpg
BM_UFlat/3 1.5GB/s jpg_200
BM_UFlat/4 13.3GB/s pdf
BM_UFlat/5 2.1GB/s html4
BM_UFlat/6 1.0GB/s txt1
BM_UFlat/7 959.4MB/s txt2
BM_UFlat/8 1.0GB/s txt3
BM_UFlat/9 864.5MB/s txt4
BM_UFlat/10 2.9GB/s pb
BM_UFlat/11 1.2GB/s gaviota
BM_ZFlat/0 944.3MB/s html (22.31 %)
BM_ZFlat/1 501.6MB/s urls (47.78 %)
BM_ZFlat/2 14.3GB/s jpg (99.95 %)
BM_ZFlat/3 538.3MB/s jpg_200 (73.00 %)
BM_ZFlat/4 8.3GB/s pdf (83.30 %)
BM_ZFlat/5 903.5MB/s html4 (22.52 %)
BM_ZFlat/6 336.0MB/s txt1 (57.88 %)
BM_ZFlat/7 312.3MB/s txt2 (61.91 %)
BM_ZFlat/8 353.1MB/s txt3 (54.99 %)
BM_ZFlat/9 289.9MB/s txt4 (66.26 %)
BM_ZFlat/10 1.2GB/s pb (19.68 %)
BM_ZFlat/11 527.4MB/s gaviota (37.72 %)
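Before the source files that follow, here is a minimal sketch of a block-format
round trip using this package's Encode and Decode (passing a nil dst is valid
for both, per the doc comments below):

```
package main

import (
	"bytes"
	"fmt"

	"github.com/golang/snappy"
)

func main() {
	src := []byte("hello, hello, hello, snappy")

	// Block format: the whole input is compressed as a single block.
	compressed := snappy.Encode(nil, src)

	decompressed, err := snappy.Decode(nil, compressed)
	if err != nil {
		panic(err)
	}
	fmt.Println(bytes.Equal(src, decompressed)) // true
}
```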
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package snappy
import (
"encoding/binary"
"errors"
"io"
)
var (
// ErrCorrupt reports that the input is invalid.
ErrCorrupt = errors.New("snappy: corrupt input")
// ErrTooLarge reports that the uncompressed length is too large.
ErrTooLarge = errors.New("snappy: decoded block is too large")
// ErrUnsupported reports that the input isn't supported.
ErrUnsupported = errors.New("snappy: unsupported input")
errUnsupportedLiteralLength = errors.New("snappy: unsupported literal length")
)
// DecodedLen returns the length of the decoded block.
func DecodedLen(src []byte) (int, error) {
v, _, err := decodedLen(src)
return v, err
}
// decodedLen returns the length of the decoded block and the number of bytes
// that the length header occupied.
func decodedLen(src []byte) (blockLen, headerLen int, err error) {
v, n := binary.Uvarint(src)
if n <= 0 || v > 0xffffffff {
return 0, 0, ErrCorrupt
}
const wordSize = 32 << (^uint(0) >> 32 & 1)
if wordSize == 32 && v > 0x7fffffff {
return 0, 0, ErrTooLarge
}
return int(v), n, nil
}
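// Worked example (editor's note, not upstream code): the length header is a
// standard unsigned varint. A decoded length of 300 is encoded as the two
// bytes 0xAC 0x02, so decodedLen returns (300, 2, nil) for such a block.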
const (
decodeErrCodeCorrupt = 1
decodeErrCodeUnsupportedLiteralLength = 2
)
// Decode returns the decoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire decoded block.
// Otherwise, a newly allocated slice will be returned.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
func Decode(dst, src []byte) ([]byte, error) {
dLen, s, err := decodedLen(src)
if err != nil {
return nil, err
}
if dLen <= len(dst) {
dst = dst[:dLen]
} else {
dst = make([]byte, dLen)
}
switch decode(dst, src[s:]) {
case 0:
return dst, nil
case decodeErrCodeUnsupportedLiteralLength:
return nil, errUnsupportedLiteralLength
}
return nil, ErrCorrupt
}
// NewReader returns a new Reader that decompresses from r, using the framing
// format described at
// https://github.com/google/snappy/blob/master/framing_format.txt
func NewReader(r io.Reader) *Reader {
return &Reader{
r: r,
decoded: make([]byte, maxBlockSize),
buf: make([]byte, maxEncodedLenOfMaxBlockSize+checksumSize),
}
}
// Reader is an io.Reader that can read Snappy-compressed bytes.
type Reader struct {
r io.Reader
err error
decoded []byte
buf []byte
// decoded[i:j] contains decoded bytes that have not yet been passed on.
i, j int
readHeader bool
}
// Reset discards any buffered data, resets all state, and switches the Snappy
// reader to read from r. This permits reusing a Reader rather than allocating
// a new one.
func (r *Reader) Reset(reader io.Reader) {
r.r = reader
r.err = nil
r.i = 0
r.j = 0
r.readHeader = false
}
func (r *Reader) readFull(p []byte, allowEOF bool) (ok bool) {
if _, r.err = io.ReadFull(r.r, p); r.err != nil {
if r.err == io.ErrUnexpectedEOF || (r.err == io.EOF && !allowEOF) {
r.err = ErrCorrupt
}
return false
}
return true
}
// Read satisfies the io.Reader interface.
func (r *Reader) Read(p []byte) (int, error) {
if r.err != nil {
return 0, r.err
}
for {
if r.i < r.j {
n := copy(p, r.decoded[r.i:r.j])
r.i += n
return n, nil
}
if !r.readFull(r.buf[:4], true) {
return 0, r.err
}
chunkType := r.buf[0]
if !r.readHeader {
if chunkType != chunkTypeStreamIdentifier {
r.err = ErrCorrupt
return 0, r.err
}
r.readHeader = true
}
chunkLen := int(r.buf[1]) | int(r.buf[2])<<8 | int(r.buf[3])<<16
if chunkLen > len(r.buf) {
r.err = ErrUnsupported
return 0, r.err
}
// The chunk types are specified at
// https://github.com/google/snappy/blob/master/framing_format.txt
switch chunkType {
case chunkTypeCompressedData:
// Section 4.2. Compressed data (chunk type 0x00).
if chunkLen < checksumSize {
r.err = ErrCorrupt
return 0, r.err
}
buf := r.buf[:chunkLen]
if !r.readFull(buf, false) {
return 0, r.err
}
checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
buf = buf[checksumSize:]
n, err := DecodedLen(buf)
if err != nil {
r.err = err
return 0, r.err
}
if n > len(r.decoded) {
r.err = ErrCorrupt
return 0, r.err
}
if _, err := Decode(r.decoded, buf); err != nil {
r.err = err
return 0, r.err
}
if crc(r.decoded[:n]) != checksum {
r.err = ErrCorrupt
return 0, r.err
}
r.i, r.j = 0, n
continue
case chunkTypeUncompressedData:
// Section 4.3. Uncompressed data (chunk type 0x01).
if chunkLen < checksumSize {
r.err = ErrCorrupt
return 0, r.err
}
buf := r.buf[:checksumSize]
if !r.readFull(buf, false) {
return 0, r.err
}
checksum := uint32(buf[0]) | uint32(buf[1])<<8 | uint32(buf[2])<<16 | uint32(buf[3])<<24
// Read directly into r.decoded instead of via r.buf.
n := chunkLen - checksumSize
if n > len(r.decoded) {
r.err = ErrCorrupt
return 0, r.err
}
if !r.readFull(r.decoded[:n], false) {
return 0, r.err
}
if crc(r.decoded[:n]) != checksum {
r.err = ErrCorrupt
return 0, r.err
}
r.i, r.j = 0, n
continue
case chunkTypeStreamIdentifier:
// Section 4.1. Stream identifier (chunk type 0xff).
if chunkLen != len(magicBody) {
r.err = ErrCorrupt
return 0, r.err
}
if !r.readFull(r.buf[:len(magicBody)], false) {
return 0, r.err
}
for i := 0; i < len(magicBody); i++ {
if r.buf[i] != magicBody[i] {
r.err = ErrCorrupt
return 0, r.err
}
}
continue
}
if chunkType <= 0x7f {
// Section 4.5. Reserved unskippable chunks (chunk types 0x02-0x7f).
r.err = ErrUnsupported
return 0, r.err
}
// Section 4.4 Padding (chunk type 0xfe).
// Section 4.6. Reserved skippable chunks (chunk types 0x80-0xfd).
if !r.readFull(r.buf[:chunkLen], false) {
return 0, r.err
}
}
}
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine
// +build gc
// +build !noasm
package snappy
// decode has the same semantics as in decode_other.go.
//
//go:noescape
func decode(dst, src []byte) int
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64 appengine !gc noasm
package snappy
// decode writes the decoding of src to dst. It assumes that the varint-encoded
// length of the decompressed bytes has already been read, and that len(dst)
// equals that length.
//
// It returns 0 on success or a decodeErrCodeXxx error code on failure.
func decode(dst, src []byte) int {
var d, s, offset, length int
for s < len(src) {
switch src[s] & 0x03 {
case tagLiteral:
x := uint32(src[s] >> 2)
switch {
case x < 60:
s++
case x == 60:
s += 2
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-1])
case x == 61:
s += 3
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-2]) | uint32(src[s-1])<<8
case x == 62:
s += 4
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-3]) | uint32(src[s-2])<<8 | uint32(src[s-1])<<16
case x == 63:
s += 5
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
x = uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24
}
length = int(x) + 1
if length <= 0 {
return decodeErrCodeUnsupportedLiteralLength
}
if length > len(dst)-d || length > len(src)-s {
return decodeErrCodeCorrupt
}
copy(dst[d:], src[s:s+length])
d += length
s += length
continue
case tagCopy1:
s += 2
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = 4 + int(src[s-2])>>2&0x7
offset = int(uint32(src[s-2])&0xe0<<3 | uint32(src[s-1]))
case tagCopy2:
s += 3
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = 1 + int(src[s-3])>>2
offset = int(uint32(src[s-2]) | uint32(src[s-1])<<8)
case tagCopy4:
s += 5
if uint(s) > uint(len(src)) { // The uint conversions catch overflow from the previous line.
return decodeErrCodeCorrupt
}
length = 1 + int(src[s-5])>>2
offset = int(uint32(src[s-4]) | uint32(src[s-3])<<8 | uint32(src[s-2])<<16 | uint32(src[s-1])<<24)
}
if offset <= 0 || d < offset || length > len(dst)-d {
return decodeErrCodeCorrupt
}
// Copy from an earlier sub-slice of dst to a later sub-slice. Unlike
// the built-in copy function, this byte-by-byte copy always runs
// forwards, even if the slices overlap. Conceptually, this is:
//
// d += forwardCopy(dst[d:d+length], dst[d-offset:])
for end := d + length; d != end; d++ {
dst[d] = dst[d-offset]
}
}
if d != len(dst) {
return decodeErrCodeCorrupt
}
return 0
}
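// Worked example (editor's note, not upstream code): the block body
// 0x10 'h' 'e' 'l' 'l' 'o' is a single literal chunk: 0x10&0x03 == tagLiteral,
// x = 0x10>>2 = 4 < 60, so length = x+1 = 5 and the five literal bytes follow.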
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package snappy
import (
"encoding/binary"
"errors"
"io"
)
// Encode returns the encoded form of src. The returned slice may be a sub-
// slice of dst if dst was large enough to hold the entire encoded block.
// Otherwise, a newly allocated slice will be returned.
//
// The dst and src must not overlap. It is valid to pass a nil dst.
func Encode(dst, src []byte) []byte {
if n := MaxEncodedLen(len(src)); n < 0 {
panic(ErrTooLarge)
} else if len(dst) < n {
dst = make([]byte, n)
}
// The block starts with the varint-encoded length of the decompressed bytes.
d := binary.PutUvarint(dst, uint64(len(src)))
for len(src) > 0 {
p := src
src = nil
if len(p) > maxBlockSize {
p, src = p[:maxBlockSize], p[maxBlockSize:]
}
if len(p) < minNonLiteralBlockSize {
d += emitLiteral(dst[d:], p)
} else {
d += encodeBlock(dst[d:], p)
}
}
return dst[:d]
}
// inputMargin is the minimum number of extra input bytes to keep, inside
// encodeBlock's inner loop. On some architectures, this margin lets us
// implement a fast path for emitLiteral, where the copy of short (<= 16 byte)
// literals can be implemented as a single load to and store from a 16-byte
// register. That literal's actual length can be as short as 1 byte, so this
// can copy up to 15 bytes too much, but that's OK as subsequent iterations of
// the encoding loop will fix up the copy overrun, and this inputMargin ensures
// that we don't overrun the dst and src buffers.
const inputMargin = 16 - 1
// minNonLiteralBlockSize is the minimum size of the input to encodeBlock that
// could be encoded with a copy tag. This is the minimum with respect to the
// algorithm used by encodeBlock, not a minimum enforced by the file format.
//
// The encoded output must start with at least a 1 byte literal, as there are
// no previous bytes to copy. A minimal (1 byte) copy after that, generated
// from an emitCopy call in encodeBlock's main loop, would require at least
// another inputMargin bytes, for the reason above: we want any emitLiteral
// calls inside encodeBlock's main loop to use the fast path if possible, which
// requires being able to overrun by inputMargin bytes. Thus,
// minNonLiteralBlockSize equals 1 + 1 + inputMargin.
//
// The C++ code doesn't use this exact threshold, but it could, as discussed at
// https://groups.google.com/d/topic/snappy-compression/oGbhsdIJSJ8/discussion
// The difference between Go (2+inputMargin) and C++ (inputMargin) is purely an
// optimization. It should not affect the encoded form. This is tested by
// TestSameEncodingAsCppShortCopies.
const minNonLiteralBlockSize = 1 + 1 + inputMargin
// MaxEncodedLen returns the maximum length of a snappy block, given its
// uncompressed length.
//
// It will return a negative value if srcLen is too large to encode.
func MaxEncodedLen(srcLen int) int {
n := uint64(srcLen)
if n > 0xffffffff {
return -1
}
// Compressed data can be defined as:
// compressed := item* literal*
// item := literal* copy
//
// The trailing literal sequence has a space blowup of at most 62/60
// since a literal of length 60 needs one tag byte + one extra byte
// for length information.
//
// Item blowup is trickier to measure. Suppose the "copy" op copies
// 4 bytes of data. Because of a special check in the encoding code,
// we produce a 4-byte copy only if the offset is < 65536. Therefore
// the copy op takes 3 bytes to encode, and this type of item leads
// to at most the 62/60 blowup for representing literals.
//
// Suppose the "copy" op copies 5 bytes of data. If the offset is big
// enough, it will take 5 bytes to encode the copy op. Therefore the
// worst case here is a one-byte literal followed by a five-byte copy.
// That is, 6 bytes of input turn into 7 bytes of "compressed" data.
//
// This last factor dominates the blowup, so the final estimate is:
n = 32 + n + n/6
if n > 0xffffffff {
return -1
}
return int(n)
}
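// Worked check of the bound above (editor's note, not upstream code): for
// srcLen equal to maxBlockSize (65536), n = 32 + 65536 + 65536/6 =
// 32 + 65536 + 10922 = 76490, which matches the hard-coded
// maxEncodedLenOfMaxBlockSize constant in snappy.go.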
var errClosed = errors.New("snappy: Writer is closed")
// NewWriter returns a new Writer that compresses to w.
//
// The Writer returned does not buffer writes. There is no need to Flush or
// Close such a Writer.
//
// Deprecated: the Writer returned is not suitable for many small writes, only
// for few large writes. Use NewBufferedWriter instead, which is efficient
// regardless of the frequency and shape of the writes, and remember to Close
// that Writer when done.
func NewWriter(w io.Writer) *Writer {
return &Writer{
w: w,
obuf: make([]byte, obufLen),
}
}
// NewBufferedWriter returns a new Writer that compresses to w, using the
// framing format described at
// https://github.com/google/snappy/blob/master/framing_format.txt
//
// The Writer returned buffers writes. Users must call Close to guarantee all
// data has been forwarded to the underlying io.Writer. They may also call
// Flush zero or more times before calling Close.
func NewBufferedWriter(w io.Writer) *Writer {
return &Writer{
w: w,
ibuf: make([]byte, 0, maxBlockSize),
obuf: make([]byte, obufLen),
}
}
// Writer is an io.Writer that can write Snappy-compressed bytes.
type Writer struct {
w io.Writer
err error
// ibuf is a buffer for the incoming (uncompressed) bytes.
//
// Its use is optional. For backwards compatibility, Writers created by the
// NewWriter function have ibuf == nil, do not buffer incoming bytes, and
// therefore do not need to be Flush'ed or Close'd.
ibuf []byte
// obuf is a buffer for the outgoing (compressed) bytes.
obuf []byte
// wroteStreamHeader is whether we have written the stream header.
wroteStreamHeader bool
}
// Reset discards the writer's state and switches the Snappy writer to write to
// w. This permits reusing a Writer rather than allocating a new one.
func (w *Writer) Reset(writer io.Writer) {
w.w = writer
w.err = nil
if w.ibuf != nil {
w.ibuf = w.ibuf[:0]
}
w.wroteStreamHeader = false
}
// Write satisfies the io.Writer interface.
func (w *Writer) Write(p []byte) (nRet int, errRet error) {
if w.ibuf == nil {
// Do not buffer incoming bytes. This does not perform or compress well
// if the caller of Writer.Write writes many small slices. This
// behavior is therefore deprecated, but still supported for backwards
// compatibility with code that doesn't explicitly Flush or Close.
return w.write(p)
}
// The remainder of this method is based on bufio.Writer.Write from the
// standard library.
for len(p) > (cap(w.ibuf)-len(w.ibuf)) && w.err == nil {
var n int
if len(w.ibuf) == 0 {
// Large write, empty buffer.
// Write directly from p to avoid copy.
n, _ = w.write(p)
} else {
n = copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
w.ibuf = w.ibuf[:len(w.ibuf)+n]
w.Flush()
}
nRet += n
p = p[n:]
}
if w.err != nil {
return nRet, w.err
}
n := copy(w.ibuf[len(w.ibuf):cap(w.ibuf)], p)
w.ibuf = w.ibuf[:len(w.ibuf)+n]
nRet += n
return nRet, nil
}
func (w *Writer) write(p []byte) (nRet int, errRet error) {
if w.err != nil {
return 0, w.err
}
for len(p) > 0 {
obufStart := len(magicChunk)
if !w.wroteStreamHeader {
w.wroteStreamHeader = true
copy(w.obuf, magicChunk)
obufStart = 0
}
var uncompressed []byte
if len(p) > maxBlockSize {
uncompressed, p = p[:maxBlockSize], p[maxBlockSize:]
} else {
uncompressed, p = p, nil
}
checksum := crc(uncompressed)
// Compress the buffer, discarding the result if the improvement
// isn't at least 12.5%.
compressed := Encode(w.obuf[obufHeaderLen:], uncompressed)
chunkType := uint8(chunkTypeCompressedData)
chunkLen := 4 + len(compressed)
obufEnd := obufHeaderLen + len(compressed)
if len(compressed) >= len(uncompressed)-len(uncompressed)/8 {
chunkType = chunkTypeUncompressedData
chunkLen = 4 + len(uncompressed)
obufEnd = obufHeaderLen
}
// Fill in the per-chunk header that comes before the body.
w.obuf[len(magicChunk)+0] = chunkType
w.obuf[len(magicChunk)+1] = uint8(chunkLen >> 0)
w.obuf[len(magicChunk)+2] = uint8(chunkLen >> 8)
w.obuf[len(magicChunk)+3] = uint8(chunkLen >> 16)
w.obuf[len(magicChunk)+4] = uint8(checksum >> 0)
w.obuf[len(magicChunk)+5] = uint8(checksum >> 8)
w.obuf[len(magicChunk)+6] = uint8(checksum >> 16)
w.obuf[len(magicChunk)+7] = uint8(checksum >> 24)
if _, err := w.w.Write(w.obuf[obufStart:obufEnd]); err != nil {
w.err = err
return nRet, err
}
if chunkType == chunkTypeUncompressedData {
if _, err := w.w.Write(uncompressed); err != nil {
w.err = err
return nRet, err
}
}
nRet += len(uncompressed)
}
return nRet, nil
}
// Flush flushes the Writer to its underlying io.Writer.
func (w *Writer) Flush() error {
if w.err != nil {
return w.err
}
if len(w.ibuf) == 0 {
return nil
}
w.write(w.ibuf)
w.ibuf = w.ibuf[:0]
return w.err
}
// Close calls Flush and then closes the Writer.
func (w *Writer) Close() error {
w.Flush()
ret := w.err
if w.err == nil {
w.err = errClosed
}
return ret
}
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !appengine
// +build gc
// +build !noasm
package snappy
// emitLiteral has the same semantics as in encode_other.go.
//
//go:noescape
func emitLiteral(dst, lit []byte) int
// emitCopy has the same semantics as in encode_other.go.
//
//go:noescape
func emitCopy(dst []byte, offset, length int) int
// extendMatch has the same semantics as in encode_other.go.
//
//go:noescape
func extendMatch(src []byte, i, j int) int
// encodeBlock has the same semantics as in encode_other.go.
//
//go:noescape
func encodeBlock(dst, src []byte) (d int)
// Copyright 2016 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64 appengine !gc noasm
package snappy
func load32(b []byte, i int) uint32 {
b = b[i : i+4 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
}
func load64(b []byte, i int) uint64 {
b = b[i : i+8 : len(b)] // Help the compiler eliminate bounds checks on the next line.
return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 |
uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
}
// emitLiteral writes a literal chunk and returns the number of bytes written.
//
// It assumes that:
// dst is long enough to hold the encoded bytes
// 1 <= len(lit) && len(lit) <= 65536
func emitLiteral(dst, lit []byte) int {
i, n := 0, uint(len(lit)-1)
switch {
case n < 60:
dst[0] = uint8(n)<<2 | tagLiteral
i = 1
case n < 1<<8:
dst[0] = 60<<2 | tagLiteral
dst[1] = uint8(n)
i = 2
default:
dst[0] = 61<<2 | tagLiteral
dst[1] = uint8(n)
dst[2] = uint8(n >> 8)
i = 3
}
return i + copy(dst[i:], lit)
}
// emitCopy writes a copy chunk and returns the number of bytes written.
//
// It assumes that:
// dst is long enough to hold the encoded bytes
// 1 <= offset && offset <= 65535
// 4 <= length && length <= 65535
func emitCopy(dst []byte, offset, length int) int {
i := 0
// The maximum length for a single tagCopy1 or tagCopy2 op is 64 bytes. The
// threshold for this loop is a little higher (at 68 = 64 + 4), and the
// length emitted down below is a little lower (at 60 = 64 - 4), because
// it's shorter to encode a length 67 copy as a length 60 tagCopy2 followed
// by a length 7 tagCopy1 (which encodes as 3+2 bytes) than to encode it as
// a length 64 tagCopy2 followed by a length 3 tagCopy2 (which encodes as
// 3+3 bytes). The magic 4 in the 64±4 is because the minimum length for a
// tagCopy1 op is 4 bytes, which is why a length 3 copy has to be an
// encodes-as-3-bytes tagCopy2 instead of an encodes-as-2-bytes tagCopy1.
for length >= 68 {
// Emit a length 64 copy, encoded as 3 bytes.
dst[i+0] = 63<<2 | tagCopy2
dst[i+1] = uint8(offset)
dst[i+2] = uint8(offset >> 8)
i += 3
length -= 64
}
if length > 64 {
// Emit a length 60 copy, encoded as 3 bytes.
dst[i+0] = 59<<2 | tagCopy2
dst[i+1] = uint8(offset)
dst[i+2] = uint8(offset >> 8)
i += 3
length -= 60
}
if length >= 12 || offset >= 2048 {
// Emit the remaining copy, encoded as 3 bytes.
dst[i+0] = uint8(length-1)<<2 | tagCopy2
dst[i+1] = uint8(offset)
dst[i+2] = uint8(offset >> 8)
return i + 3
}
// Emit the remaining copy, encoded as 2 bytes.
dst[i+0] = uint8(offset>>8)<<5 | uint8(length-4)<<2 | tagCopy1
dst[i+1] = uint8(offset)
return i + 2
}
// extendMatch returns the largest k such that k <= len(src) and that
// src[i:i+k-j] and src[j:k] have the same contents.
//
// It assumes that:
// 0 <= i && i < j && j <= len(src)
func extendMatch(src []byte, i, j int) int {
for ; j < len(src) && src[i] == src[j]; i, j = i+1, j+1 {
}
return j
}
func hash(u, shift uint32) uint32 {
return (u * 0x1e35a7bd) >> shift
}
// encodeBlock encodes a non-empty src to a guaranteed-large-enough dst. It
// assumes that the varint-encoded length of the decompressed bytes has already
// been written.
//
// It also assumes that:
// len(dst) >= MaxEncodedLen(len(src)) &&
// minNonLiteralBlockSize <= len(src) && len(src) <= maxBlockSize
func encodeBlock(dst, src []byte) (d int) {
// Initialize the hash table. Its size ranges from 1<<8 to 1<<14 inclusive.
// The table element type is uint16, as s < sLimit and sLimit < len(src)
// and len(src) <= maxBlockSize and maxBlockSize == 65536.
const (
maxTableSize = 1 << 14
// tableMask is redundant, but helps the compiler eliminate bounds
// checks.
tableMask = maxTableSize - 1
)
shift := uint32(32 - 8)
for tableSize := 1 << 8; tableSize < maxTableSize && tableSize < len(src); tableSize *= 2 {
shift--
}
// In Go, all array elements are zero-initialized, so there is no advantage
// to a smaller tableSize per se. However, it matches the C++ algorithm,
// and in the asm versions of this code, we can get away with zeroing only
// the first tableSize elements.
var table [maxTableSize]uint16
// sLimit is when to stop looking for offset/length copies. The inputMargin
// lets us use a fast path for emitLiteral in the main loop, while we are
// looking for copies.
sLimit := len(src) - inputMargin
// nextEmit is where in src the next emitLiteral should start from.
nextEmit := 0
// The encoded form must start with a literal, as there are no previous
// bytes to copy, so we start looking for hash matches at s == 1.
s := 1
nextHash := hash(load32(src, s), shift)
for {
// Copied from the C++ snappy implementation:
//
// Heuristic match skipping: If 32 bytes are scanned with no matches
// found, start looking only at every other byte. If 32 more bytes are
// scanned (or skipped), look at every third byte, etc.. When a match
// is found, immediately go back to looking at every byte. This is a
// small loss (~5% performance, ~0.1% density) for compressible data
// due to more bookkeeping, but for non-compressible data (such as
// JPEG) it's a huge win since the compressor quickly "realizes" the
// data is incompressible and doesn't bother looking for matches
// everywhere.
//
// The "skip" variable keeps track of how many bytes there are since
// the last match; dividing it by 32 (ie. right-shifting by five) gives
// the number of bytes to move ahead for each iteration.
skip := 32
nextS := s
candidate := 0
for {
s = nextS
bytesBetweenHashLookups := skip >> 5
nextS = s + bytesBetweenHashLookups
skip += bytesBetweenHashLookups
if nextS > sLimit {
goto emitRemainder
}
candidate = int(table[nextHash&tableMask])
table[nextHash&tableMask] = uint16(s)
nextHash = hash(load32(src, nextS), shift)
if load32(src, s) == load32(src, candidate) {
break
}
}
// A 4-byte match has been found. We'll later see if more than 4 bytes
// match. But, prior to the match, src[nextEmit:s] are unmatched. Emit
// them as literal bytes.
d += emitLiteral(dst[d:], src[nextEmit:s])
// Call emitCopy, and then see if another emitCopy could be our next
// move. Repeat until we find no match for the input immediately after
// what was consumed by the last emitCopy call.
//
// If we exit this loop normally then we need to call emitLiteral next,
// though we don't yet know how big the literal will be. We handle that
// by proceeding to the next iteration of the main loop. We also can
// exit this loop via goto if we get close to exhausting the input.
for {
// Invariant: we have a 4-byte match at s, and no need to emit any
// literal bytes prior to s.
base := s
// Extend the 4-byte match as long as possible.
//
// This is an inlined version of:
// s = extendMatch(src, candidate+4, s+4)
s += 4
for i := candidate + 4; s < len(src) && src[i] == src[s]; i, s = i+1, s+1 {
}
d += emitCopy(dst[d:], base-candidate, s-base)
nextEmit = s
if s >= sLimit {
goto emitRemainder
}
// We could immediately start working at s now, but to improve
// compression we first update the hash table at s-1 and at s. If
// another emitCopy is not our next move, also calculate nextHash
// at s+1. At least on GOARCH=amd64, these three hash calculations
// are faster as one load64 call (with some shifts) instead of
// three load32 calls.
x := load64(src, s-1)
prevHash := hash(uint32(x>>0), shift)
table[prevHash&tableMask] = uint16(s - 1)
currHash := hash(uint32(x>>8), shift)
candidate = int(table[currHash&tableMask])
table[currHash&tableMask] = uint16(s)
if uint32(x>>8) != load32(src, candidate) {
nextHash = hash(uint32(x>>16), shift)
s++
break
}
}
}
emitRemainder:
if nextEmit < len(src) {
d += emitLiteral(dst[d:], src[nextEmit:])
}
return d
}
module github.com/golang/snappy
// Copyright 2011 The Snappy-Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package snappy implements the Snappy compression format. It aims for very
// high speeds and reasonable compression.
//
// There are actually two Snappy formats: block and stream. They are related,
// but different: trying to decompress block-compressed data as a Snappy stream
// will fail, and vice versa. The block format is the Decode and Encode
// functions and the stream format is the Reader and Writer types.
//
// The block format, the more common case, is used when the complete size (the
// number of bytes) of the original data is known upfront, at the time
// compression starts. The stream format, also known as the framing format, is
// for when that isn't always true.
//
// The canonical, C++ implementation is at https://github.com/google/snappy and
// it only implements the block format.
package snappy // import "github.com/golang/snappy"
import (
"hash/crc32"
)
/*
Each encoded block begins with the varint-encoded length of the decoded data,
followed by a sequence of chunks. Chunks begin and end on byte boundaries. The
first byte of each chunk is broken into its 2 least and 6 most significant bits
called l and m: l ranges in [0, 4) and m ranges in [0, 64). l is the chunk tag.
Zero means a literal tag. All other values mean a copy tag.
For literal tags:
- If m < 60, the next 1 + m bytes are literal bytes.
- Otherwise, let n be the little-endian unsigned integer denoted by the next
m - 59 bytes. The next 1 + n bytes after that are literal bytes.
For copy tags, length bytes are copied from offset bytes ago, in the style of
Lempel-Ziv compression algorithms. In particular:
- For l == 1, the offset ranges in [0, 1<<11) and the length in [4, 12).
The length is 4 + the low 3 bits of m. The high 3 bits of m form bits 8-10
of the offset. The next byte is bits 0-7 of the offset.
- For l == 2, the offset ranges in [0, 1<<16) and the length in [1, 65).
The length is 1 + m. The offset is the little-endian unsigned integer
denoted by the next 2 bytes.
- For l == 3, this tag is a legacy format that is no longer issued by most
encoders. Nonetheless, the offset ranges in [0, 1<<32) and the length in
[1, 65). The length is 1 + m. The offset is the little-endian unsigned
integer denoted by the next 4 bytes.
*/
const (
tagLiteral = 0x00
tagCopy1 = 0x01
tagCopy2 = 0x02
tagCopy4 = 0x03
)
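// Worked example of the tag encoding described above (editor's note, not
// upstream code): the two bytes 0x05 0x0A form a tagCopy1 op. In 0x05 =
// 0b000_001_01, l = 0b01 (tagCopy1); the low 3 bits of m are 0b001, so
// length = 4 + 1 = 5; the high 3 bits of m (0b000) plus the next byte (0x0A)
// give offset = 10: "copy 5 bytes from 10 bytes ago".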
const (
checksumSize = 4
chunkHeaderSize = 4
magicChunk = "\xff\x06\x00\x00" + magicBody
magicBody = "sNaPpY"
// maxBlockSize is the maximum size of the input to encodeBlock. It is not
// part of the wire format per se, but some parts of the encoder assume
// that an offset fits into a uint16.
//
// Also, for the framing format (Writer type instead of Encode function),
// https://github.com/google/snappy/blob/master/framing_format.txt says
// that "the uncompressed data in a chunk must be no longer than 65536
// bytes".
maxBlockSize = 65536
// maxEncodedLenOfMaxBlockSize equals MaxEncodedLen(maxBlockSize), but is
// hard coded to be a const instead of a variable, so that obufLen can also
// be a const. Their equivalence is confirmed by
// TestMaxEncodedLenOfMaxBlockSize.
maxEncodedLenOfMaxBlockSize = 76490
obufHeaderLen = len(magicChunk) + checksumSize + chunkHeaderSize
obufLen = obufHeaderLen + maxEncodedLenOfMaxBlockSize
)
const (
chunkTypeCompressedData = 0x00
chunkTypeUncompressedData = 0x01
chunkTypePadding = 0xfe
chunkTypeStreamIdentifier = 0xff
)
var crcTable = crc32.MakeTable(crc32.Castagnoli)
// crc implements the checksum specified in section 3 of
// https://github.com/google/snappy/blob/master/framing_format.txt
func crc(b []byte) uint32 {
c := crc32.Update(0, crcTable, b)
return uint32(c>>15|c<<17) + 0xa282ead8
}
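The package comment above distinguishes the block format (Encode and Decode)
from the stream format (the Reader and Writer types). A minimal sketch of the
stream side, using NewBufferedWriter and NewReader from the sources above:

```
package main

import (
	"bytes"
	"fmt"
	"io/ioutil"

	"github.com/golang/snappy"
)

func main() {
	var framed bytes.Buffer

	// Stream (framing) format: Close flushes the final buffered chunk.
	w := snappy.NewBufferedWriter(&framed)
	if _, err := w.Write([]byte("framed snappy data")); err != nil {
		panic(err)
	}
	if err := w.Close(); err != nil {
		panic(err)
	}

	r := snappy.NewReader(&framed)
	out, err := ioutil.ReadAll(r)
	if err != nil {
		panic(err)
	}
	fmt.Printf("%s\n", out) // framed snappy data
}
```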
Goavro was originally created during the Fall of 2014 at LinkedIn,
Corp., in New York City, New York, USA.
The following persons, listed in alphabetical order, have participated
with goavro development by contributing code and test cases.
Alan Gardner <alanctgardner@gmail.com>
Billy Hand <bhand@mediamath.com>
Christian Blades <christian.blades@careerbuilder.com>
Corey Scott <corey.scott@gmail.com>
Darshan Shaligram <scintilla@gmail.com>
Dylan Wen <hhkbp2@gmail.com>
Enrico Candino <enrico.candino@gmail.com>
Fellyn Silliman <fsilliman@linkedin.com>
James Crasta <jcrasta@underarmour.com>
Jeff Haynie <jhaynie@gmail.com>
Joe Roth <joseph_roth@cable.comcast.com>
Karrick S. McDermott <kmcdermott@linkedin.com>
Kasey Klipsch <kklipsch@mediamath.com>
Michael Johnson <mijohnson@linkedin.com>
Murray Resinski <murray.resinski@octanner.com>
Nicolas Kaiser <nikai@nikai.net>
Sebastien Launay <sebastien@opendns.com>
Thomas Desrosiers <thomasdesr@gmail.com>
kklipsch <junk@klipsch.net>
seborama <sebastien.chatal@sainsburys.co.uk>
A big thank you to these persons who provided testing and amazing
feedback to goavro during its initial implementation:
Dennis Ordanov <dordanov@linkedin.com>
Thomas Desrosiers <thomasdesr@gmail.com>
Also a big thank you is extended to our supervisors who supported our
efforts to bring goavro to the open source community:
Greg Leffler <gleffler@linkedin.com>
Nick Berry <niberry@linkedin.com>
From the Avro specification:
default: A default value for this field, used when reading instances
that lack this field (optional). Permitted values depend on the
field's schema type, according to the table below. Default values for
union fields correspond to the first schema in the union. Default
values for bytes and fixed fields are JSON strings, where Unicode code
points 0-255 are mapped to unsigned 8-bit byte values 0-255.
I read the above to mean that the purpose of default values is to allow
reading Avro data that was written without those fields, and not
necessarily augmentation of data being serialized. So in general I
agree with you in terms of purpose.
One very important aspect of Avro is that the schema used to serialize
the data should always remain with the data, so that a reader would
always be able to read the schema and then be able to consume the
data. I think most people still agree so far.
However, this is where things get messy. Schema evolution is
frequently cited when folks want to use a new version of the schema to
read data that was once written using an older version of that schema.
I do not believe the Avro specification properly handles schema
evolution. Here's a simple example:
```
Record v0:
name: string
nickname: string, default: ""
```
```
Record v1:
name: string
nickname: string, default: ""
title: string, default: ""
```
Okay, now a binary stream of records is just a bunch of strings. Let's
do that now.
```
0x0A, A, l, i, c, e, 0x06, B, o, b, 0x0A, B, r, u, c, e, 0x0A, S, a, l, l, y, 0x06, A, n, n
```
How many records is that? It could be as many as 5 records, each with a
single name and no nickname. It could be as few as 2 records, one of
them with a nickname and a title, and one with only a nickname or a
title.
Now to drive home the nail that Avro schema evolution is broken, even
if each record had a header that indicated how many bytes it would
consume, we could know where one record began and ended, and how many
records there are. But if we were to read a record with two strings
in it, is the second string the nickname or the title?
The Avro specification has no answer to that question, so neither do I.
Effectively, Avro could be a great tool for serializing complex data,
but it's broken in its current form, and to fix it would require it to
break compatibility with itself, effectively rendering any binary data
serialized in a previous version of Avro unreadable by new versions,
unless it had some sort of version marker on the data so a library
could branch.
One great solution would be augmenting the binary encoding with a
simple field number identifier. Let's imagine an Avro 2.x that had
this feature, and would support schema evolution. Here's an example
stream of bytes that could be unambiguously decoded using the new
schema:
```
0x02, 0x0A, A, l, i, c, e, 0x02, 0x06, B, o, b, 0x04, 0x0A, B, r, u, c, e, 0x02, 0x0E, C, h, a, r, l, i, e, 0x06, 0x04, M, r
```
In the above example of my fake Avro 2.0, this can be
deterministically decoded because 0x02 indicates the following is
field number 1 (name), followed by string length 5, followed by
Alice.
Then the decoder would see 0x02, marking field number 1 again,
which means, "next record", followed by string length 3, followed by
Bob, followed by 0x04, which means field number 2 (nickname), followed
by string length 5, followed by Bruce.
Followed by field number 1 (next record), followed by string length 7,
followed by Charlie, followed by field number 3 (title), followed by
string length 2, followed by Mr.
In my hypothetical version of Avro 2, Avro can cope with schema
evolution using record defaults and such. Sadly, Avro 1.x cannot, and
thus it should be avoided if your use case requires schema evolution.
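To make the byte streams above concrete: Avro writes a string as a
zigzag-varint length followed by the raw bytes, which is why "Alice" becomes
0x0A, A, l, i, c, e, and why nothing in the stream marks record or field
boundaries. A minimal sketch of that encoding (plain Go for illustration, not
part of goavro):

```
package main

import "fmt"

// zigzag maps a signed value to an unsigned one the way Avro's long type does.
func zigzag(n int64) uint64 { return uint64(n<<1) ^ uint64(n>>63) }

// encodeString appends an Avro string: a zigzag-varint length, then the bytes.
func encodeString(buf []byte, s string) []byte {
	v := zigzag(int64(len(s)))
	for v >= 0x80 {
		buf = append(buf, byte(v)|0x80)
		v >>= 7
	}
	buf = append(buf, byte(v))
	return append(buf, s...)
}

func main() {
	var buf []byte
	for _, s := range []string{"Alice", "Bob", "Bruce", "Sally", "Ann"} {
		buf = encodeString(buf, s)
	}
	// Prints 0A 41 6C 69 63 65 06 42 6F 62 ... : the ambiguous stream above.
	fmt.Printf("% X\n", buf)
}
```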
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"io"
"math"
"reflect"
)
func makeArrayCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
// array type must have items
itemSchema, ok := schemaMap["items"]
if !ok {
return nil, fmt.Errorf("Array ought to have items key")
}
itemCodec, err := buildCodec(st, enclosingNamespace, itemSchema)
if err != nil {
return nil, fmt.Errorf("Array items ought to be valid Avro type: %s", err)
}
return &Codec{
typeName: &name{"array", nullNamespace},
nativeFromBinary: func(buf []byte) (interface{}, []byte, error) {
var value interface{}
var err error
// block count and block size
if value, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary array block count: %s", err)
}
blockCount := value.(int64)
if blockCount < 0 {
// NOTE: A negative block count implies there is a long encoded
// block size following the negative block count. We have no use
// for the block size in this decoder, so we read and discard
// the value.
if blockCount == math.MinInt64 {
// The minimum number for any signed numerical type can never be made positive
return nil, nil, fmt.Errorf("cannot decode binary array with block count: %d", blockCount)
}
blockCount = -blockCount // convert to its positive equivalent
if _, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary array block size: %s", err)
}
}
// Ensure block count does not exceed some sane value.
if blockCount > MaxBlockCount {
return nil, nil, fmt.Errorf("cannot decode binary array when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
// NOTE: While the RAM optimization shown below is not strictly
// necessary, many encoders will encode all items in a single block.
// We can reduce the amount of RAM the runtime allocates for the
// array by initializing it with capacity for that number of items.
arrayValues := make([]interface{}, 0, blockCount)
for blockCount != 0 {
// Decode `blockCount` datum values from buffer
for i := int64(0); i < blockCount; i++ {
if value, buf, err = itemCodec.nativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary array item %d: %s", i+1, err)
}
arrayValues = append(arrayValues, value)
}
// Decode next blockCount from buffer, because there may be more blocks
if value, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary array block count: %s", err)
}
blockCount = value.(int64)
if blockCount < 0 {
// NOTE: A negative block count implies there is a long
// encoded block size following the negative block count. We
// have no use for the block size in this decoder, so we
// read and discard the value.
if blockCount == math.MinInt64 {
// The minimum number for any signed numerical type can
// never be made positive
return nil, nil, fmt.Errorf("cannot decode binary array with block count: %d", blockCount)
}
blockCount = -blockCount // convert to its positive equivalent
if _, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary array block size: %s", err)
}
}
// Ensure block count does not exceed some sane value.
if blockCount > MaxBlockCount {
return nil, nil, fmt.Errorf("cannot decode binary array when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
}
return arrayValues, buf, nil
},
binaryFromNative: func(buf []byte, datum interface{}) ([]byte, error) {
arrayValues, err := convertArray(datum)
if err != nil {
return nil, fmt.Errorf("cannot encode binary array: %s", err)
}
arrayLength := int64(len(arrayValues))
var alreadyEncoded, remainingInBlock int64
for i, item := range arrayValues {
if remainingInBlock == 0 { // start a new block
remainingInBlock = arrayLength - alreadyEncoded
if remainingInBlock > MaxBlockCount {
// limit block count to MaxBlockCount
remainingInBlock = MaxBlockCount
}
buf, _ = longBinaryFromNative(buf, remainingInBlock)
}
if buf, err = itemCodec.binaryFromNative(buf, item); err != nil {
return nil, fmt.Errorf("cannot encode binary array item %d: %v: %s", i+1, item, err)
}
remainingInBlock--
alreadyEncoded++
}
return longBinaryFromNative(buf, 0) // append trailing 0 block count to signal end of Array
},
nativeFromTextual: func(buf []byte) (interface{}, []byte, error) {
var arrayValues []interface{}
var value interface{}
var err error
var b byte
if buf, err = advanceAndConsume(buf, '['); err != nil {
return nil, nil, fmt.Errorf("cannot decode textual array: %s", err)
}
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 {
return nil, nil, fmt.Errorf("cannot decode textual array: %s", io.ErrShortBuffer)
}
// NOTE: Special case for empty array
if buf[0] == ']' {
return arrayValues, buf[1:], nil
}
// NOTE: Also terminates when a ']' byte is read.
for len(buf) > 0 {
// decode value
value, buf, err = itemCodec.nativeFromTextual(buf)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual array: %s", err)
}
arrayValues = append(arrayValues, value)
// either comma or closing square bracket
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 {
return nil, nil, fmt.Errorf("cannot decode textual array: %s", io.ErrShortBuffer)
}
switch b = buf[0]; b {
case ']':
return arrayValues, buf[1:], nil
case ',':
// no-op
default:
return nil, nil, fmt.Errorf("cannot decode textual array: expected ',' or ']'; received: %q", b)
}
// NOTE: consume comma from above
if buf, _ = advanceToNonWhitespace(buf[1:]); len(buf) == 0 {
return nil, nil, fmt.Errorf("cannot decode textual array: %s", io.ErrShortBuffer)
}
}
return nil, buf, io.ErrShortBuffer
},
textualFromNative: func(buf []byte, datum interface{}) ([]byte, error) {
arrayValues, err := convertArray(datum)
if err != nil {
return nil, fmt.Errorf("cannot encode textual array: %s", err)
}
var atLeastOne bool
buf = append(buf, '[')
for i, item := range arrayValues {
atLeastOne = true
// Encode value
buf, err = itemCodec.textualFromNative(buf, item)
if err != nil {
// field was specified in datum; therefore its value was invalid
return nil, fmt.Errorf("cannot encode textual array item %d; %v: %s", i+1, item, err)
}
buf = append(buf, ',')
}
if atLeastOne {
return append(buf[:len(buf)-1], ']'), nil
}
return append(buf, ']'), nil
},
}, nil
}
// convertArray converts interface{} to []interface{} if possible.
func convertArray(datum interface{}) ([]interface{}, error) {
arrayValues, ok := datum.([]interface{})
if ok {
return arrayValues, nil
}
// NOTE: When given a slice of any other type, zip values to
// items as a convenience to client.
v := reflect.ValueOf(datum)
if v.Kind() != reflect.Slice {
return nil, fmt.Errorf("cannot create []interface{}: expected slice; received: %T", datum)
}
// NOTE: Two better alternatives to the current algorithm are:
// (1) mutate the reflection tuple underneath to convert the
// []int, for example, to []interface{}, with O(1) complexity
// (2) use copy builtin to zip the data items over with O(n) complexity,
// but more efficient than what's below.
// Suggestions?
arrayValues = make([]interface{}, v.Len())
for idx := 0; idx < v.Len(); idx++ {
arrayValues[idx] = v.Index(idx).Interface()
}
return arrayValues, nil
}
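For context, the codec built by makeArrayCodec above is reached through
goavro's public API. A minimal sketch of encoding and decoding an array of
strings with NewCodec, BinaryFromNative, and NativeFromBinary:

```
package main

import (
	"fmt"

	"github.com/linkedin/goavro/v2"
)

func main() {
	codec, err := goavro.NewCodec(`{"type": "array", "items": "string"}`)
	if err != nil {
		panic(err)
	}

	// binaryFromNative writes a block count, the items, then a 0 terminator.
	buf, err := codec.BinaryFromNative(nil, []interface{}{"alice", "bob"})
	if err != nil {
		panic(err)
	}

	// nativeFromBinary reads blocks until the trailing 0 block count.
	native, _, err := codec.NativeFromBinary(buf)
	if err != nil {
		panic(err)
	}
	fmt.Println(native) // [alice bob]
}
```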
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"io"
"math"
)
// bytesBinaryReader reads bytes from io.Reader and returns a byte slice of the
// specified size, or the error encountered while trying to read those bytes.
func bytesBinaryReader(ior io.Reader) ([]byte, error) {
size, err := longBinaryReader(ior)
if err != nil {
return nil, fmt.Errorf("cannot read bytes: cannot read size: %s", err)
}
if size < 0 {
return nil, fmt.Errorf("cannot read bytes: size is negative: %d", size)
}
if size > MaxBlockSize {
return nil, fmt.Errorf("cannot read bytes: size exceeds MaxBlockSize: %d > %d", size, MaxBlockSize)
}
buf := make([]byte, size)
_, err = io.ReadAtLeast(ior, buf, int(size))
if err != nil {
return nil, fmt.Errorf("cannot read bytes: %s", err)
}
return buf, nil
}
// longBinaryReader reads bytes from io.Reader until it has a complete long
// value, or until a read error occurs.
func longBinaryReader(ior io.Reader) (int64, error) {
var value uint64
var shift uint
var err error
var b byte
// NOTE: While benchmarks show it's more performant to invoke ReadByte when
// available, testing whether a variable's data type implements a particular
// method is quite slow too. So perform the test once, and branch to the
// appropriate loop based on the results.
if byteReader, ok := ior.(io.ByteReader); ok {
for {
if b, err = byteReader.ReadByte(); err != nil {
return 0, err // NOTE: must send back unaltered error to detect io.EOF
}
value |= uint64(b&intMask) << shift
if b&intFlag == 0 {
return (int64(value>>1) ^ -int64(value&1)), nil
}
shift += 7
}
}
// NOTE: ior does not also implement io.ByteReader, so we must allocate a
// byte slice with a single byte, and read each byte into the slice.
buf := make([]byte, 1)
for {
if _, err = ior.Read(buf); err != nil {
return 0, err // NOTE: must send back unaltered error to detect io.EOF
}
b = buf[0]
value |= uint64(b&intMask) << shift
if b&intFlag == 0 {
return (int64(value>>1) ^ -int64(value&1)), nil
}
shift += 7
}
}
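// For example, the Avro long 64 is zig-zag encoded to 128 and emitted as the
// variable-length bytes 0x80 0x01; feeding those two bytes back through
// longBinaryReader yields 64. A minimal sketch, assuming a bytes.Reader as
// the input stream:
//
//	value, err := longBinaryReader(bytes.NewReader([]byte{0x80, 0x01}))
//	// value == int64(64), err == nil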
// metadataBinaryReader reads bytes from io.Reader until it has an entire map
// value, or a read error occurs.
func metadataBinaryReader(ior io.Reader) (map[string][]byte, error) {
var err error
var value interface{}
// block count and block size
if value, err = longBinaryReader(ior); err != nil {
return nil, fmt.Errorf("cannot read map block count: %s", err)
}
blockCount := value.(int64)
if blockCount < 0 {
if blockCount == math.MinInt64 {
// The minimum number for any signed numerical type can never be
// made positive
return nil, fmt.Errorf("cannot read map with block count: %d", blockCount)
}
// NOTE: A negative block count implies there is a long encoded block
// size following the negative block count. We have no use for the block
// size in this decoder, so we read and discard the value.
blockCount = -blockCount // convert to its positive equivalent
if _, err = longBinaryReader(ior); err != nil {
return nil, fmt.Errorf("cannot read map block size: %s", err)
}
}
// Ensure block count does not exceed some sane value.
if blockCount > MaxBlockCount {
return nil, fmt.Errorf("cannot read map when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
// NOTE: While the RAM optimization below is not strictly necessary, many
// encoders will encode all items in a single block. We can reduce the RAM
// the runtime allocates for the map by initializing it for that number of
// items.
mapValues := make(map[string][]byte, blockCount)
for blockCount != 0 {
// Decode `blockCount` datum values from buffer
for i := int64(0); i < blockCount; i++ {
// first decode the key string
keyBytes, err := bytesBinaryReader(ior)
if err != nil {
return nil, fmt.Errorf("cannot read map key: %s", err)
}
key := string(keyBytes)
if _, ok := mapValues[key]; ok {
return nil, fmt.Errorf("cannot read map: duplicate key: %q", key)
}
// metadata values are always bytes
buf, err := bytesBinaryReader(ior)
if err != nil {
return nil, fmt.Errorf("cannot read map value for key %q: %s", key, err)
}
mapValues[key] = buf
}
// Decode next blockCount from buffer, because there may be more blocks
if value, err = longBinaryReader(ior); err != nil {
return nil, fmt.Errorf("cannot read map block count: %s", err)
}
blockCount = value.(int64)
if blockCount < 0 {
if blockCount == math.MinInt64 {
// The minimum number for any signed numerical type can never be
// made positive
return nil, fmt.Errorf("cannot read map with block count: %d", blockCount)
}
// NOTE: A negative block count implies there is a long encoded
// block size following the negative block count. We have no use for
// the block size in this decoder, so we read and discard the value.
blockCount = -blockCount // convert to its positive equivalent
if _, err = longBinaryReader(ior); err != nil {
return nil, fmt.Errorf("cannot read map block size: %s", err)
}
}
// Ensure block count does not exceed some sane value.
if blockCount > MaxBlockCount {
return nil, fmt.Errorf("cannot read map when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
}
return mapValues, nil
}
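// For example, a metadata map with the single entry "a" -> []byte{0x2a} is
// framed as: block count 1 (0x02), key length 1 (0x02) and key byte 'a'
// (0x61), value length 1 (0x02) and value byte 0x2a, then the terminating
// block count 0 (0x00). A minimal sketch, assuming a bytes.Reader as input:
//
//	m, err := metadataBinaryReader(bytes.NewReader([]byte{0x02, 0x02, 0x61, 0x02, 0x2a, 0x00}))
//	// m == map[string][]byte{"a": {0x2a}}, err == nil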
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"errors"
"fmt"
"io"
)
func booleanNativeFromBinary(buf []byte) (interface{}, []byte, error) {
if len(buf) < 1 {
return nil, nil, io.ErrShortBuffer
}
var b byte
b, buf = buf[0], buf[1:]
switch b {
case byte(0):
return false, buf, nil
case byte(1):
return true, buf, nil
default:
return nil, nil, fmt.Errorf("cannot decode binary boolean: expected: Go byte(0) or byte(1); received: byte(%d)", b)
}
}
func booleanBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
value, ok := datum.(bool)
if !ok {
return nil, fmt.Errorf("cannot encode binary boolean: expected: Go bool; received: %T", datum)
}
var b byte
if value {
b = 1
}
return append(buf, b), nil
}
func booleanNativeFromTextual(buf []byte) (interface{}, []byte, error) {
if len(buf) < 4 {
return nil, nil, fmt.Errorf("cannot decode textual boolean: %s", io.ErrShortBuffer)
}
if bytes.Equal(buf[:4], []byte("true")) {
return true, buf[4:], nil
}
if len(buf) < 5 {
return nil, nil, fmt.Errorf("cannot decode textual boolean: %s", io.ErrShortBuffer)
}
if bytes.Equal(buf[:5], []byte("false")) {
return false, buf[5:], nil
}
return nil, nil, errors.New("expected false or true")
}
func booleanTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
value, ok := datum.(bool)
if !ok {
return nil, fmt.Errorf("boolean: expected: Go bool; received: %T", datum)
}
if value {
return append(buf, "true"...), nil
}
return append(buf, "false"...), nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"sort"
"strconv"
"strings"
)
// pcfProcessor is a function type that given a parsed JSON object, returns its
// Parsing Canonical Form according to the Avro specification.
type pcfProcessor func(s interface{}) (string, error)
// parsingCanonicalForm returns the "Parsing Canonical Form" (pcf) for a
// parsed JSON structure of a valid Avro schema, or an error describing the
// schema error.
func parsingCanonicalForm(schema interface{}, parentNamespace string, typeLookup map[string]string) (string, error) {
switch val := schema.(type) {
case map[string]interface{}:
// JSON objects are decoded as a map of strings to empty interfaces
return pcfObject(val, parentNamespace, typeLookup)
case []interface{}:
// JSON arrays are decoded as a slice of empty interfaces
return pcfArray(val, parentNamespace, typeLookup)
case string:
// JSON string values are decoded as a Go string
return pcfString(val, typeLookup)
case float64:
// JSON numerical values are decoded as Go float64
return pcfNumber(val)
default:
return "", fmt.Errorf("cannot parse schema with invalid schema type; ought to be map[string]interface{}, []interface{}, string, or float64; received: %T: %v", schema, schema)
}
}
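// For example, given the parsed form of the schema
// {"namespace":"ns","type":"record","name":"X","doc":"d","fields":[{"name":"f","type":"int"}]}
// this function produces the canonical string
// {"name":"ns.X","type":"record","fields":[{"name":"f","type":"int"}]}
// with the namespace folded into the record name, the "doc" attribute
// stripped, and keys sorted into specification order.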
// pcfNumber returns the parsing canonical form for a numerical value.
func pcfNumber(val float64) (string, error) {
return strconv.FormatFloat(val, 'g', -1, 64), nil
}
// pcfString returns the parsing canonical form for a string value.
func pcfString(val string, typeLookup map[string]string) (string, error) {
if canonicalName, ok := typeLookup[val]; ok {
return `"` + canonicalName + `"`, nil
}
return `"` + val + `"`, nil
}
// pcfArray returns the parsing canonical form for a JSON array.
func pcfArray(val []interface{}, parentNamespace string, typeLookup map[string]string) (string, error) {
items := make([]string, len(val))
for i, el := range val {
p, err := parsingCanonicalForm(el, parentNamespace, typeLookup)
if err != nil {
return "", err
}
items[i] = p
}
return "[" + strings.Join(items, ",") + "]", nil
}
// pcfObject returns the parsing canonical form for a JSON object.
func pcfObject(jsonMap map[string]interface{}, parentNamespace string, typeLookup map[string]string) (string, error) {
pairs := make(stringPairs, 0, len(jsonMap))
// Remember the namespace to fully qualify names later
var namespace string
if namespaceJSON, ok := jsonMap["namespace"]; ok {
if namespaceStr, ok := namespaceJSON.(string); ok {
// and its value is a string (otherwise the schema is invalid)
if parentNamespace == "" {
namespace = namespaceStr
} else {
namespace = parentNamespace + "." + namespaceStr
}
parentNamespace = namespace
}
} else if objectType, ok := jsonMap["type"]; ok && objectType == "record" {
namespace = parentNamespace
}
for k, v := range jsonMap {
// Reduce primitive schemas to their simple form.
if len(jsonMap) == 1 && k == "type" {
if t, ok := v.(string); ok {
return "\"" + t + "\"", nil
}
}
// Only keep relevant attributes (strip 'doc', 'alias', 'namespace')
if _, ok := fieldOrder[k]; !ok {
continue
}
// Add namespace to a non-qualified name.
if k == "name" && namespace != "" {
// Check if the name isn't already qualified.
if t, ok := v.(string); ok && !strings.ContainsRune(t, '.') {
v = namespace + "." + t
typeLookup[t] = v.(string)
}
}
// Only fixed type allows size, and we must convert a string size to a
// float.
if k == "size" {
if s, ok := v.(string); ok {
s, err := strconv.ParseUint(s, 10, 0)
if err != nil {
// should never get here because already validated schema
return "", fmt.Errorf("Fixed size ought to be number greater than zero: %v", s)
}
v = float64(s)
}
}
pk, err := parsingCanonicalForm(k, parentNamespace, typeLookup)
if err != nil {
return "", err
}
pv, err := parsingCanonicalForm(v, parentNamespace, typeLookup)
if err != nil {
return "", err
}
pairs = append(pairs, stringPair{k, pk + ":" + pv})
}
// Sort keys by their order in specification.
sort.Sort(byAvroFieldOrder(pairs))
return "{" + strings.Join(pairs.Bs(), ",") + "}", nil
}
// stringPair represents a pair of string values.
type stringPair struct {
A string
B string
}
// stringPairs is a sortable slice of pairs of strings.
type stringPairs []stringPair
// Bs returns a slice of the second values of a slice of pairs.
func (sp *stringPairs) Bs() []string {
items := make([]string, len(*sp))
for i, el := range *sp {
items[i] = el.B
}
return items
}
// fieldOrder defines fields that show up in canonical schema and specifies
// their precedence.
var fieldOrder = map[string]int{
"name": 1,
"type": 2,
"fields": 3,
"symbols": 4,
"items": 5,
"values": 6,
"size": 7,
}
// byAvroFieldOrder is equipped with a sort order of fields according to the
// specification.
type byAvroFieldOrder []stringPair
func (s byAvroFieldOrder) Len() int {
return len(s)
}
func (s byAvroFieldOrder) Swap(i, j int) {
s[i], s[j] = s[j], s[i]
}
func (s byAvroFieldOrder) Less(i, j int) bool {
return fieldOrder[s[i].A] < fieldOrder[s[j].A]
}
// +build goavro_debug
package goavro
import (
"fmt"
"os"
)
// debug formats and prints arguments to stderr for development builds
func debug(f string, a ...interface{}) {
os.Stderr.Write([]byte("goavro: " + fmt.Sprintf(f, a...)))
}
// +build !goavro_debug
package goavro
// debug is a no-op for release builds, and the function call is optimized out
// by the compiler.
func debug(_ string, _ ...interface{}) {}
/*
Package goavro is a library that encodes and decodes Avro data.
Goavro provides methods to encode native Go data into both binary and textual
JSON Avro data, and methods to decode both binary and textual JSON Avro data to
native Go data.
Goavro also provides methods to read and write Object Container File (OCF)
formatted files, and the library contains example programs to read and write OCF
files.
Usage Example:
package main
import (
"fmt"
"github.com/linkedin/goavro"
)
func main() {
codec, err := goavro.NewCodec(`
{
"type": "record",
"name": "LongList",
"fields" : [
{"name": "next", "type": ["null", "LongList", {"type": "long", "logicalType": "timestamp-millis"}], "default": null}
]
}`)
if err != nil {
fmt.Println(err)
}
// NOTE: May omit fields when using default value
textual := []byte(`{"next":{"LongList":{}}}`)
// Convert textual Avro data (in Avro JSON format) to native Go form
native, _, err := codec.NativeFromTextual(textual)
if err != nil {
fmt.Println(err)
}
// Convert native Go form to binary Avro data
binary, err := codec.BinaryFromNative(nil, native)
if err != nil {
fmt.Println(err)
}
// Convert binary Avro data back to native Go form
native, _, err = codec.NativeFromBinary(binary)
if err != nil {
fmt.Println(err)
}
// Convert native Go form to textual Avro data
textual, err = codec.TextualFromNative(nil, native)
if err != nil {
fmt.Println(err)
}
// NOTE: Textual encoding will show all fields, even those with values that
// match their default values
fmt.Println(string(textual))
// Output: {"next":{"LongList":{"next":null}}}
}
*/
package goavro
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"io"
)
// enum does not have child objects, therefore whatever namespace it defines
// is used only to store its name in the symbol table.
func makeEnumCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
c, err := registerNewCodec(st, schemaMap, enclosingNamespace)
if err != nil {
return nil, fmt.Errorf("Enum ought to have valid name: %s", err)
}
// enum type must have symbols
s1, ok := schemaMap["symbols"]
if !ok {
return nil, fmt.Errorf("Enum %q ought to have symbols key", c.typeName)
}
s2, ok := s1.([]interface{})
if !ok || len(s2) == 0 {
return nil, fmt.Errorf("Enum %q symbols ought to be non-empty array of strings: %v", c.typeName, s1)
}
symbols := make([]string, len(s2))
for i, s := range s2 {
symbol, ok := s.(string)
if !ok {
return nil, fmt.Errorf("Enum %q symbol %d ought to be non-empty string; received: %T", c.typeName, i+1, symbol)
}
if err := checkString(symbol); err != nil {
return nil, fmt.Errorf("Enum %q symbol %d ought to %s", c.typeName, i+1, err)
}
symbols[i] = symbol
}
c.nativeFromBinary = func(buf []byte) (interface{}, []byte, error) {
var value interface{}
var err error
var index int64
if value, buf, err = longNativeFromBinary(buf); err != nil {
return nil, nil, fmt.Errorf("cannot decode binary enum %q index: %s", c.typeName, err)
}
index = value.(int64)
if index < 0 || index >= int64(len(symbols)) {
return nil, nil, fmt.Errorf("cannot decode binary enum %q: index ought to be between 0 and %d; read index: %d", c.typeName, len(symbols)-1, index)
}
return symbols[index], buf, nil
}
c.binaryFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
someString, ok := datum.(string)
if !ok {
return nil, fmt.Errorf("cannot encode binary enum %q: expected string; received: %T", c.typeName, datum)
}
for i, symbol := range symbols {
if symbol == someString {
return longBinaryFromNative(buf, i)
}
}
return nil, fmt.Errorf("cannot encode binary enum %q: value ought to be member of symbols: %v; %q", c.typeName, symbols, someString)
}
c.nativeFromTextual = func(buf []byte) (interface{}, []byte, error) {
if buf, _ = advanceToNonWhitespace(buf); len(buf) == 0 {
return nil, nil, fmt.Errorf("cannot decode textual enum: %s", io.ErrShortBuffer)
}
// decode enum string
var value interface{}
var err error
value, buf, err = stringNativeFromTextual(buf)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual enum: expected key: %s", err)
}
someString := value.(string)
for _, symbol := range symbols {
if symbol == someString {
return someString, buf, nil
}
}
return nil, nil, fmt.Errorf("cannot decode textual enum %q: value ought to be member of symbols: %v; %q", c.typeName, symbols, someString)
}
c.textualFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
someString, ok := datum.(string)
if !ok {
return nil, fmt.Errorf("cannot encode textual enum %q: expected string; received: %T", c.typeName, datum)
}
for _, symbol := range symbols {
if symbol == someString {
return stringTextualFromNative(buf, someString)
}
}
return nil, fmt.Errorf("cannot encode textual enum %q: value ought to be member of symbols: %v; %q", c.typeName, symbols, someString)
}
return c, nil
}
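// For example, with the schema
// {"type":"enum","name":"suit","symbols":["SPADES","HEARTS"]}, the datum
// "HEARTS" is binary encoded as its symbol index 1, which zig-zag encodes to
// the single byte 0x02; decoding that byte returns "HEARTS".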
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"strconv"
)
// Fixed does not have child objects, therefore whatever namespace it defines
// is used only to store its name in the symbol table.
func makeFixedCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
c, err := registerNewCodec(st, schemaMap, enclosingNamespace)
if err != nil {
return nil, fmt.Errorf("Fixed ought to have valid name: %s", err)
}
size, err := sizeFromSchemaMap(c.typeName, schemaMap)
if err != nil {
return nil, err
}
c.nativeFromBinary = func(buf []byte) (interface{}, []byte, error) {
if buflen := uint(len(buf)); size > buflen {
return nil, nil, fmt.Errorf("cannot decode binary fixed %q: schema size exceeds remaining buffer size: %d > %d (short buffer)", c.typeName, size, buflen)
}
return buf[:size], buf[size:], nil
}
c.binaryFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
var someBytes []byte
switch d := datum.(type) {
case []byte:
someBytes = d
case string:
someBytes = []byte(d)
default:
return nil, fmt.Errorf("cannot encode binary fixed %q: expected []byte or string; received: %T", c.typeName, datum)
}
if count := uint(len(someBytes)); count != size {
return nil, fmt.Errorf("cannot encode binary fixed %q: datum size ought to equal schema size: %d != %d", c.typeName, count, size)
}
return append(buf, someBytes...), nil
}
c.nativeFromTextual = func(buf []byte) (interface{}, []byte, error) {
if buflen := uint(len(buf)); size > buflen {
return nil, nil, fmt.Errorf("cannot decode textual fixed %q: schema size exceeds remaining buffer size: %d > %d (short buffer)", c.typeName, size, buflen)
}
var datum interface{}
var err error
datum, buf, err = bytesNativeFromTextual(buf)
if err != nil {
return nil, buf, err
}
datumBytes := datum.([]byte)
if count := uint(len(datumBytes)); count != size {
return nil, nil, fmt.Errorf("cannot decode textual fixed %q: datum size ought to equal schema size: %d != %d", c.typeName, count, size)
}
return datum, buf, err
}
c.textualFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
var someBytes []byte
switch d := datum.(type) {
case []byte:
someBytes = d
case string:
someBytes = []byte(d)
default:
return nil, fmt.Errorf("cannot encode textual fixed %q: expected []byte or string; received: %T", c.typeName, datum)
}
if count := uint(len(someBytes)); count != size {
return nil, fmt.Errorf("cannot encode textual fixed %q: datum size ought to equal schema size: %d != %d", c.typeName, count, size)
}
return bytesTextualFromNative(buf, someBytes)
}
return c, nil
}
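// For example, with the schema {"type":"fixed","name":"pair","size":2}, the
// datum []byte{0xde, 0xad} is binary encoded as those two bytes verbatim,
// with no length prefix; decoding consumes exactly two bytes from the buffer.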
func sizeFromSchemaMap(typeName *name, schemaMap map[string]interface{}) (uint, error) {
// Fixed type must have size
sizeRaw, ok := schemaMap["size"]
if !ok {
return 0, fmt.Errorf("Fixed %q ought to have size key", typeName)
}
var size uint
switch val := sizeRaw.(type) {
case string:
s, err := strconv.ParseUint(val, 10, 0)
if err != nil {
return 0, fmt.Errorf("Fixed %q size ought to be number greater than zero: %v", typeName, sizeRaw)
}
size = uint(s)
case float64:
if val <= 0 {
return 0, fmt.Errorf("Fixed %q size ought to be number greater than zero: %v", typeName, sizeRaw)
}
size = uint(val)
default:
return 0, fmt.Errorf("Fixed %q size ought to be number greater than zero: %v", typeName, sizeRaw)
}
return size, nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"math"
"strconv"
)
const (
doubleEncodedLength = 8 // double requires 8 bytes
floatEncodedLength = 4 // float requires 4 bytes
)
////////////////////////////////////////
// Binary Decode
////////////////////////////////////////
func doubleNativeFromBinary(buf []byte) (interface{}, []byte, error) {
if len(buf) < doubleEncodedLength {
return nil, nil, fmt.Errorf("cannot decode binary double: %s", io.ErrShortBuffer)
}
return math.Float64frombits(binary.LittleEndian.Uint64(buf[:doubleEncodedLength])), buf[doubleEncodedLength:], nil
}
func floatNativeFromBinary(buf []byte) (interface{}, []byte, error) {
if len(buf) < floatEncodedLength {
return nil, nil, fmt.Errorf("cannot decode binary float: %s", io.ErrShortBuffer)
}
return math.Float32frombits(binary.LittleEndian.Uint32(buf[:floatEncodedLength])), buf[floatEncodedLength:], nil
}
////////////////////////////////////////
// Binary Encode
////////////////////////////////////////
func doubleBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
var value float64
switch v := datum.(type) {
case float64:
value = v
case float32:
value = float64(v)
case int:
if value = float64(v); int(value) != v {
return nil, fmt.Errorf("cannot encode binary double: provided Go int would lose precision: %d", v)
}
case int64:
if value = float64(v); int64(value) != v {
return nil, fmt.Errorf("cannot encode binary double: provided Go int64 would lose precision: %d", v)
}
case int32:
if value = float64(v); int32(value) != v {
return nil, fmt.Errorf("cannot encode binary double: provided Go int32 would lose precision: %d", v)
}
default:
return nil, fmt.Errorf("cannot encode binary double: expected: Go numeric; received: %T", datum)
}
buf = append(buf, 0, 0, 0, 0, 0, 0, 0, 0)
binary.LittleEndian.PutUint64(buf[len(buf)-doubleEncodedLength:], math.Float64bits(value))
return buf, nil
}
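// For example, the float64 value 1.0 has IEEE 754 bits 0x3ff0000000000000
// and is binary encoded little-endian. A minimal sketch:
//
//	buf, err := doubleBinaryFromNative(nil, 1.0)
//	// buf == []byte{0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xf0, 0x3f}, err == nil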
func floatBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
var value float32
switch v := datum.(type) {
case float32:
value = v
case float64:
// Assume the runtime casts special floats correctly, and that any loss of
// precision from float64 to float32 is expected, or at least understood, by
// the client.
value = float32(v)
case int:
if value = float32(v); int(value) != v {
return nil, fmt.Errorf("cannot encode binary float: provided Go int would lose precision: %d", v)
}
case int64:
if value = float32(v); int64(value) != v {
return nil, fmt.Errorf("cannot encode binary float: provided Go int64 would lose precision: %d", v)
}
case int32:
if value = float32(v); int32(value) != v {
return nil, fmt.Errorf("cannot encode binary float: provided Go int32 would lose precision: %d", v)
}
default:
return nil, fmt.Errorf("cannot encode binary float: expected: Go numeric; received: %T", datum)
}
buf = append(buf, 0, 0, 0, 0)
binary.LittleEndian.PutUint32(buf[len(buf)-floatEncodedLength:], uint32(math.Float32bits(value)))
return buf, nil
}
////////////////////////////////////////
// Text Decode
////////////////////////////////////////
func doubleNativeFromTextual(buf []byte) (interface{}, []byte, error) {
return floatingTextDecoder(buf, 64)
}
func floatNativeFromTextual(buf []byte) (interface{}, []byte, error) {
return floatingTextDecoder(buf, 32)
}
func floatingTextDecoder(buf []byte, bitSize int) (interface{}, []byte, error) {
buflen := len(buf)
if buflen >= 4 {
if bytes.Equal(buf[:4], []byte("null")) {
return math.NaN(), buf[4:], nil
}
if buflen >= 5 {
if bytes.Equal(buf[:5], []byte("1e999")) {
return math.Inf(1), buf[5:], nil
}
if buflen >= 6 {
if bytes.Equal(buf[:6], []byte("-1e999")) {
return math.Inf(-1), buf[6:], nil
}
}
}
}
index, err := numberLength(buf, true) // NOTE: floatAllowed = true
if err != nil {
return nil, nil, err
}
datum, err := strconv.ParseFloat(string(buf[:index]), bitSize)
if err != nil {
return nil, nil, err
}
if bitSize == 32 {
return float32(datum), buf[index:], nil
}
return datum, buf[index:], nil
}
func numberLength(buf []byte, floatAllowed bool) (int, error) {
// ALGORITHM: increment index as long as bytes are valid for number state engine.
var index, buflen, count int
var b byte
// STATE 0: begin, optional: -
if buflen = len(buf); index == buflen {
return 0, io.ErrShortBuffer
}
if buf[index] == '-' {
if index++; index == buflen {
return 0, io.ErrShortBuffer
}
}
// STATE 1: if 0, goto 2; otherwise if 1-9, goto 3; otherwise bail
if b = buf[index]; b == '0' {
if index++; index == buflen {
return index, nil // valid number
}
} else if b >= '1' && b <= '9' {
if index++; index == buflen {
return index, nil // valid number
}
// STATE 3: absorb zero or more digits
for {
if b = buf[index]; b < '0' || b > '9' {
break
}
if index++; index == buflen {
return index, nil // valid number
}
}
} else {
return 0, fmt.Errorf("unexpected byte: %q", b)
}
if floatAllowed {
// STATE 2: if ., goto 4; otherwise goto 5
if buf[index] == '.' {
if index++; index == buflen {
return 0, io.ErrShortBuffer
}
// STATE 4: absorb one or more digits
for {
if b = buf[index]; b < '0' || b > '9' {
break
}
count++
if index++; index == buflen {
return index, nil // valid number
}
}
if count == 0 {
// did not get at least one digit
return 0, fmt.Errorf("unexpected byte: %q", b)
}
}
// STATE 5: if e|E, goto 6; otherwise goto 7
if b = buf[index]; b == 'e' || b == 'E' {
if index++; index == buflen {
return 0, io.ErrShortBuffer
}
// STATE 6: absorb optional - or +, then goto 8
if b = buf[index]; b == '+' || b == '-' {
if index++; index == buflen {
return 0, io.ErrShortBuffer
}
}
// STATE 8: absorb one or more digits
count = 0
for {
if b = buf[index]; b < '0' || b > '9' {
break
}
count++
if index++; index == buflen {
return index, nil // valid number
}
}
if count == 0 {
// did not get at least one digit
return 0, fmt.Errorf("unexpected byte: %q", b)
}
}
}
// STATE 7: end
return index, nil
}
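// For example, numberLength([]byte(`3.14}`), true) returns 4, the length of
// the leading number, and numberLength([]byte(`-12,`), false) returns 3,
// stopping at the ','. When a complete number extends to the end of the
// buffer, the full buffer length is returned.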
////////////////////////////////////////
// Text Encode
////////////////////////////////////////
func floatTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
return floatingTextEncoder(buf, datum, 32)
}
func doubleTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
return floatingTextEncoder(buf, datum, 64)
}
func floatingTextEncoder(buf []byte, datum interface{}, bitSize int) ([]byte, error) {
var isFloat bool
var someFloat64 float64
var someInt64 int64
switch v := datum.(type) {
case float32:
isFloat = true
someFloat64 = float64(v)
case float64:
isFloat = true
someFloat64 = v
case int:
if someInt64 = int64(v); int(someInt64) != v {
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual double: provided Go int would lose precision: %d", v)
}
return nil, fmt.Errorf("cannot encode textual float: provided Go int would lose precision: %d", v)
}
case int64:
someInt64 = v
case int32:
if someInt64 = int64(v); int32(someInt64) != v {
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual double: provided Go int32 would lose precision: %d", v)
}
return nil, fmt.Errorf("cannot encode textual float: provided Go int32 would lose precision: %d", v)
}
default:
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual double: expected: Go numeric; received: %T", datum)
}
return nil, fmt.Errorf("cannot encode textual float: expected: Go numeric; received: %T", datum)
}
if isFloat {
if math.IsNaN(someFloat64) {
return append(buf, "null"...), nil
}
if math.IsInf(someFloat64, 1) {
return append(buf, "1e999"...), nil
}
if math.IsInf(someFloat64, -1) {
return append(buf, "-1e999"...), nil
}
return strconv.AppendFloat(buf, someFloat64, 'g', -1, bitSize), nil
}
return strconv.AppendInt(buf, someInt64, 10), nil
}
module github.com/linkedin/goavro/v2
go 1.12
require github.com/golang/snappy v0.0.1
github.com/golang/snappy v0.0.1 h1:Qgr9rKW7uDUkrbSmQeiDsGa8SjGyCOGtuasMWwvp2P4=
github.com/golang/snappy v0.0.1/go.mod h1:/XxbfmMg8lxefKM7IXC3fBNl/7bRcc72aCRzEWrmP2Q=
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"io"
"strconv"
)
const (
intDownShift = uint32(31)
intFlag = byte(128)
intMask = byte(127)
longDownShift = uint32(63)
)
////////////////////////////////////////
// Binary Decode
////////////////////////////////////////
func intNativeFromBinary(buf []byte) (interface{}, []byte, error) {
var offset, value int
var shift uint
for offset = 0; offset < len(buf); offset++ {
b := buf[offset]
value |= int(b&intMask) << shift
if b&intFlag == 0 {
return (int32(value>>1) ^ -int32(value&1)), buf[offset+1:], nil
}
shift += 7
}
return nil, nil, io.ErrShortBuffer
}
func longNativeFromBinary(buf []byte) (interface{}, []byte, error) {
var offset int
var value uint64
var shift uint
for offset = 0; offset < len(buf); offset++ {
b := buf[offset]
value |= uint64(b&intMask) << shift
if b&intFlag == 0 {
return (int64(value>>1) ^ -int64(value&1)), buf[offset+1:], nil
}
shift += 7
}
return nil, nil, io.ErrShortBuffer
}
////////////////////////////////////////
// Binary Encode
////////////////////////////////////////
func intBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
var value int32
switch v := datum.(type) {
case int32:
value = v
case int:
if value = int32(v); int(value) != v {
return nil, fmt.Errorf("cannot encode binary int: provided Go int would lose precision: %d", v)
}
case int64:
if value = int32(v); int64(value) != v {
return nil, fmt.Errorf("cannot encode binary int: provided Go int64 would lose precision: %d", v)
}
case float64:
if value = int32(v); float64(value) != v {
return nil, fmt.Errorf("cannot encode binary int: provided Go float64 would lose precision: %f", v)
}
case float32:
if value = int32(v); float32(value) != v {
return nil, fmt.Errorf("cannot encode binary int: provided Go float32 would lose precision: %f", v)
}
default:
return nil, fmt.Errorf("cannot encode binary int: expected: Go numeric; received: %T", datum)
}
encoded := uint64((uint32(value) << 1) ^ uint32(value>>intDownShift))
return integerBinaryEncoder(buf, encoded)
}
func longBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
var value int64
switch v := datum.(type) {
case int64:
value = v
case int:
value = int64(v)
case int32:
value = int64(v)
case float64:
if value = int64(v); float64(value) != v {
return nil, fmt.Errorf("cannot encode binary long: provided Go float64 would lose precision: %f", v)
}
case float32:
if value = int64(v); float32(value) != v {
return nil, fmt.Errorf("cannot encode binary long: provided Go float32 would lose precision: %f", v)
}
default:
return nil, fmt.Errorf("long: expected: Go numeric; received: %T", datum)
}
encoded := (uint64(value) << 1) ^ uint64(value>>longDownShift)
return integerBinaryEncoder(buf, encoded)
}
func integerBinaryEncoder(buf []byte, encoded uint64) ([]byte, error) {
// used by both intBinaryEncoder and longBinaryEncoder
if encoded == 0 {
return append(buf, 0), nil
}
for encoded > 0 {
b := byte(encoded) & intMask
encoded = encoded >> 7
if encoded != 0 {
b |= intFlag // set high bit; we have more bytes
}
buf = append(buf, b)
}
return buf, nil
}
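// For example, the int value -3 zig-zag encodes to 5 and is emitted as the
// single byte 0x05, while 300 zig-zag encodes to 600 and is emitted as the
// two bytes 0xd8 0x04 (low seven bits first, high bit set on all but the
// final byte). A minimal sketch:
//
//	buf, err := intBinaryFromNative(nil, -3)
//	// buf == []byte{0x05}, err == nil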
////////////////////////////////////////
// Text Decode
////////////////////////////////////////
func longNativeFromTextual(buf []byte) (interface{}, []byte, error) {
return integerTextDecoder(buf, 64)
}
func intNativeFromTextual(buf []byte) (interface{}, []byte, error) {
return integerTextDecoder(buf, 32)
}
func integerTextDecoder(buf []byte, bitSize int) (interface{}, []byte, error) {
index, err := numberLength(buf, false) // NOTE: floatAllowed = false
if err != nil {
return nil, nil, err
}
datum, err := strconv.ParseInt(string(buf[:index]), 10, bitSize)
if err != nil {
return nil, nil, err
}
if bitSize == 32 {
return int32(datum), buf[index:], nil
}
return datum, buf[index:], nil
}
////////////////////////////////////////
// Text Encode
////////////////////////////////////////
func longTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
return integerTextEncoder(buf, datum, 64)
}
func intTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
return integerTextEncoder(buf, datum, 32)
}
func integerTextEncoder(buf []byte, datum interface{}, bitSize int) ([]byte, error) {
var someInt64 int64
switch v := datum.(type) {
case int:
someInt64 = int64(v)
case int32:
someInt64 = int64(v)
case int64:
someInt64 = v
case float32:
if someInt64 = int64(v); float32(someInt64) != v {
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual long: provided Go float32 would lose precision: %f", v)
}
return nil, fmt.Errorf("cannot encode textual int: provided Go float32 would lose precision: %f", v)
}
case float64:
if someInt64 = int64(v); float64(someInt64) != v {
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual long: provided Go float64 would lose precision: %f", v)
}
return nil, fmt.Errorf("cannot encode textual int: provided Go float64 would lose precision: %f", v)
}
default:
if bitSize == 64 {
return nil, fmt.Errorf("cannot encode textual long: expected: Go numeric; received: %T", datum)
}
return nil, fmt.Errorf("cannot encode textual int: expected: Go numeric; received: %T", datum)
}
return strconv.AppendInt(buf, someInt64, 10), nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"errors"
"fmt"
"strings"
)
const nullNamespace = ""
// ErrInvalidName is the error returned when one or more parts of an Avro name
// is invalid.
type ErrInvalidName struct {
Message string
}
func (e ErrInvalidName) Error() string {
return "schema name ought to " + e.Message
}
// NOTE: This function is designed to work with name components, after they
// have been split on the period rune.
func isRuneInvalidForFirstCharacter(r rune) bool {
return (r < 'A' || r > 'Z') && (r < 'a' || r > 'z') && r != '_'
}
func isRuneInvalidForOtherCharacters(r rune) bool {
return isRuneInvalidForFirstCharacter(r) && (r < '0' || r > '9')
}
func checkNameComponent(s string) error {
err := checkString(s)
if err != nil {
return &ErrInvalidName{err.Error()}
}
return err
}
func checkString(s string) error {
if len(s) == 0 {
return errors.New("be non-empty string")
}
if strings.IndexFunc(s[:1], isRuneInvalidForFirstCharacter) != -1 {
return errors.New("start with [A-Za-z_]: " + s)
}
if strings.IndexFunc(s[1:], isRuneInvalidForOtherCharacters) != -1 {
return errors.New("have second and remaining characters contain only [A-Za-z0-9_]: " + s)
}
return nil
}
// name describes an Avro name in terms of its full name and namespace.
type name struct {
fullName string // the instance's Avro name
namespace string // for use when building new name from existing one
}
// newName returns a new Name instance after first ensuring the arguments do not
// violate any of the Avro naming rules.
func newName(n, ns, ens string) (*name, error) {
var nn name
if index := strings.LastIndexByte(n, '.'); index > -1 {
// inputName does contain a dot, so ignore everything else and use it as the full name
nn.fullName = n
nn.namespace = n[:index]
} else {
// inputName does not contain a dot, therefore is not the full name
if ns != nullNamespace {
// if a namespace is provided at the same schema level, use it
nn.fullName = ns + "." + n
nn.namespace = ns
} else if ens != nullNamespace {
// otherwise if enclosing namespace provided, use it
nn.fullName = ens + "." + n
nn.namespace = ens
} else {
// otherwise no namespace, so use null namespace, the empty string
nn.fullName = n
}
}
// verify all components of the full name for adherence to Avro naming rules
for i, component := range strings.Split(nn.fullName, ".") {
if i == 0 && RelaxedNameValidation && component == "" {
continue
}
if err := checkNameComponent(component); err != nil {
return nil, err
}
}
return &nn, nil
}
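// For example, newName("Foo", "com.example", "") yields the full name
// "com.example.Foo" in namespace "com.example"; newName("Baz", "", "enc.ns")
// falls back to the enclosing namespace and yields "enc.ns.Baz"; and
// newName("a.b.C", "ignored", "also.ignored") already contains dots, so both
// namespace arguments are ignored and the full name is "a.b.C" in namespace
// "a.b".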
var (
// RelaxedNameValidation causes name validation to allow the first component
// of an Avro namespace to be the empty string.
RelaxedNameValidation bool
)
func newNameFromSchemaMap(enclosingNamespace string, schemaMap map[string]interface{}) (*name, error) {
var nameString, namespaceString string
name, ok := schemaMap["name"]
if !ok {
return nil, errors.New("schema ought to have name key")
}
nameString, ok = name.(string)
if !ok || nameString == nullNamespace {
return nil, fmt.Errorf("schema name ought to be non-empty string; received: %T: %v", name, name)
}
if namespace, ok := schemaMap["namespace"]; ok {
namespaceString, ok = namespace.(string)
if !ok {
return nil, fmt.Errorf("schema namespace, if provided, ought to be a string; received: %T: %v", namespace, namespace)
}
}
return newName(nameString, namespaceString, enclosingNamespace)
}
func (n *name) String() string {
return n.fullName
}
// short returns the name without the prefixed namespace.
func (n *name) short() string {
if index := strings.LastIndexByte(n.fullName, '.'); index > -1 {
return n.fullName[index+1:]
}
return n.fullName
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"errors"
"fmt"
"io"
)
var nullBytes = []byte("null")
func nullNativeFromBinary(buf []byte) (interface{}, []byte, error) { return nil, buf, nil }
func nullBinaryFromNative(buf []byte, datum interface{}) ([]byte, error) {
if datum != nil {
return nil, fmt.Errorf("cannot encode binary null: expected: Go nil; received: %T", datum)
}
return buf, nil
}
func nullNativeFromTextual(buf []byte) (interface{}, []byte, error) {
if len(buf) < 4 {
return nil, nil, fmt.Errorf("cannot decode textual null: %s", io.ErrShortBuffer)
}
if bytes.Equal(buf[:4], nullBytes) {
return nil, buf[4:], nil
}
return nil, nil, errors.New("cannot decode textual null: expected: null")
}
func nullTextualFromNative(buf []byte, datum interface{}) ([]byte, error) {
if datum != nil {
return nil, fmt.Errorf("cannot encode textual null: expected: Go nil; received: %T", datum)
}
return append(buf, nullBytes...), nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"crypto/rand"
"errors"
"fmt"
"io"
)
const (
// CompressionNullLabel is used when OCF blocks are not compressed.
CompressionNullLabel = "null"
// CompressionDeflateLabel is used when OCF blocks are compressed using the
// deflate algorithm.
CompressionDeflateLabel = "deflate"
// CompressionSnappyLabel is used when OCF blocks are compressed using the
// snappy algorithm.
CompressionSnappyLabel = "snappy"
)
// compressionID values specify the compression algorithm used to compress
// and decompress Avro Object Container File (OCF) streams.
type compressionID uint8
const (
compressionNull compressionID = iota
compressionDeflate
compressionSnappy
)
const (
ocfBlockConst = 24 // Each OCF block is prefixed with two longs and suffixed with the sync marker
ocfHeaderSizeConst = 48 // OCF header is usually about 48 bytes longer than its compressed schema
ocfMagicString = "Obj\x01"
ocfMetadataSchema = `{"type":"map","values":"bytes"}`
ocfSyncLength = 16
)
var (
ocfMagicBytes = []byte(ocfMagicString)
ocfMetadataCodec *Codec
)
func init() {
ocfMetadataCodec, _ = NewCodec(ocfMetadataSchema)
}
type ocfHeader struct {
codec *Codec
compressionID compressionID
syncMarker [ocfSyncLength]byte
metadata map[string][]byte
}
func newOCFHeader(config OCFConfig) (*ocfHeader, error) {
var err error
header := new(ocfHeader)
//
// avro.codec
//
switch config.CompressionName {
case "":
header.compressionID = compressionNull
case CompressionNullLabel:
header.compressionID = compressionNull
case CompressionDeflateLabel:
header.compressionID = compressionDeflate
case CompressionSnappyLabel:
header.compressionID = compressionSnappy
default:
return nil, fmt.Errorf("cannot create OCF header using unrecognized compression algorithm: %q", config.CompressionName)
}
//
// avro.schema
//
if config.Codec != nil {
header.codec = config.Codec
} else if config.Schema == "" {
return nil, fmt.Errorf("cannot create OCF header without either Codec or Schema specified")
} else {
if header.codec, err = NewCodec(config.Schema); err != nil {
return nil, fmt.Errorf("cannot create OCF header: %s", err)
}
}
header.metadata = config.MetaData
//
// The 16-byte, randomly-generated sync marker for this file.
//
_, err = rand.Read(header.syncMarker[:])
if err != nil {
return nil, err
}
return header, nil
}
func readOCFHeader(ior io.Reader) (*ocfHeader, error) {
//
// magic bytes
//
magic := make([]byte, 4)
_, err := io.ReadFull(ior, magic)
if err != nil {
return nil, fmt.Errorf("cannot read OCF header magic bytes: %s", err)
}
if !bytes.Equal(magic, ocfMagicBytes) {
return nil, fmt.Errorf("cannot read OCF header with invalid magic bytes: %#q", magic)
}
//
// metadata
//
metadata, err := metadataBinaryReader(ior)
if err != nil {
return nil, fmt.Errorf("cannot read OCF header metadata: %s", err)
}
//
// avro.codec
//
// NOTE: The Avro specification states that the null codec is used by
// default when "avro.codec" is not included in the metadata header. The
// specification does not address the case when "avro.codec" is included
// with the empty string as its value. I believe it is an error for an OCF
// file to provide the empty string as the compression codec. While it is
// trivially easy to gracefully handle here, I'm not sure whether this
// happens a lot, and don't want to accept bad input unless we have
// significant reason to do so.
var cID compressionID
value, ok := metadata["avro.codec"]
if ok {
switch avroCodec := string(value); avroCodec {
case CompressionNullLabel:
cID = compressionNull
case CompressionDeflateLabel:
cID = compressionDeflate
case CompressionSnappyLabel:
cID = compressionSnappy
default:
return nil, fmt.Errorf("cannot read OCF header using unrecognized compression algorithm from avro.codec: %q", avroCodec)
}
}
//
// create goavro.Codec from specified avro.schema
//
value, ok = metadata["avro.schema"]
if !ok {
return nil, errors.New("cannot read OCF header without avro.schema")
}
codec, err := NewCodec(string(value))
if err != nil {
return nil, fmt.Errorf("cannot read OCF header with invalid avro.schema: %s", err)
}
header := &ocfHeader{codec: codec, compressionID: cID, metadata: metadata}
//
// read and store sync marker
//
if n, err := io.ReadFull(ior, header.syncMarker[:]); err != nil {
return nil, fmt.Errorf("cannot read OCF header without sync marker: only read %d of %d bytes: %s", n, ocfSyncLength, err)
}
//
// header is valid
//
return header, nil
}
func writeOCFHeader(header *ocfHeader, iow io.Writer) (err error) {
//
// avro.codec
//
var avroCodec string
switch header.compressionID {
case compressionNull:
avroCodec = CompressionNullLabel
case compressionDeflate:
avroCodec = CompressionDeflateLabel
case compressionSnappy:
avroCodec = CompressionSnappyLabel
default:
return fmt.Errorf("should not get here: cannot write OCF header using unrecognized compression algorithm: %d", header.compressionID)
}
//
// avro.schema
//
// Create buffer for OCF header. The first four bytes are magic, and we'll
// use copy to fill them in, so initialize buffer's length with 4, and its
// capacity equal to length of avro schema plus a constant.
schema := header.codec.Schema()
buf := make([]byte, 4, len(schema)+ocfHeaderSizeConst)
_ = copy(buf, ocfMagicBytes)
//
// file metadata, including the schema
//
meta := make(map[string]interface{})
for k, v := range header.metadata {
meta[k] = v
}
meta["avro.schema"] = []byte(schema)
meta["avro.codec"] = []byte(avroCodec)
buf, err = ocfMetadataCodec.BinaryFromNative(buf, meta)
if err != nil {
return fmt.Errorf("should not get here: cannot write OCF header: %s", err)
}
//
// 16-byte sync marker
//
buf = append(buf, header.syncMarker[:]...)
// emit OCF header
_, err = iow.Write(buf)
if err != nil {
return fmt.Errorf("cannot write OCF header: %s", err)
}
return nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"compress/flate"
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"io"
"io/ioutil"
"github.com/golang/snappy"
)
// OCFReader structure is used to read Object Container Files (OCF).
type OCFReader struct {
header *ocfHeader
block []byte // buffer from which decoding takes place
rerr error // most recent error that took place while reading bytes (unrecoverable)
ior io.Reader
readReady bool // true after Scan and before Read
remainingBlockItems int64 // count of encoded data items remaining in block buffer to be decoded
}
// NewOCFReader initializes and returns a new structure used to read an Avro
// Object Container File (OCF).
//
// func example(ior io.Reader) error {
// // NOTE: Wrap provided io.Reader in a buffered reader, which improves the
// // performance of streaming file data.
// br := bufio.NewReader(ior)
// ocfr, err := goavro.NewOCFReader(br)
// if err != nil {
// return err
// }
// for ocfr.Scan() {
// datum, err := ocfr.Read()
// if err != nil {
// return err
// }
// fmt.Println(datum)
// }
// return ocfr.Err()
// }
func NewOCFReader(ior io.Reader) (*OCFReader, error) {
header, err := readOCFHeader(ior)
if err != nil {
return nil, fmt.Errorf("cannot create OCFReader: %s", err)
}
return &OCFReader{header: header, ior: ior}, nil
}
// MetaData returns the file metadata map found within the OCF file.
func (ocfr *OCFReader) MetaData() map[string][]byte {
return ocfr.header.metadata
}
// Codec returns the codec found within the OCF file.
func (ocfr *OCFReader) Codec() *Codec {
return ocfr.header.codec
}
// CompressionName returns the name of the compression algorithm found within
// the OCF file.
func (ocfr *OCFReader) CompressionName() string {
switch ocfr.header.compressionID {
case compressionNull:
return CompressionNullLabel
case compressionDeflate:
return CompressionDeflateLabel
case compressionSnappy:
return CompressionSnappyLabel
default:
return "should not get here: unrecognized compression algorithm"
}
}
// Err returns the last error encountered while reading the OCF file. See
// `NewOCFReader` documentation for an example.
func (ocfr *OCFReader) Err() error {
return ocfr.rerr
}
// Read consumes one datum value from the Avro OCF stream and returns it. Read
// is designed to be called only once after each invocation of the Scan method.
// See `NewOCFReader` documentation for an example.
func (ocfr *OCFReader) Read() (interface{}, error) {
// NOTE: Test previous error before testing readReady to prevent overwriting
// previous error.
if ocfr.rerr != nil {
return nil, ocfr.rerr
}
if !ocfr.readReady {
ocfr.rerr = errors.New("Read called without successful Scan")
return nil, ocfr.rerr
}
ocfr.readReady = false
// decode one datum value from block
var datum interface{}
datum, ocfr.block, ocfr.rerr = ocfr.header.codec.NativeFromBinary(ocfr.block)
if ocfr.rerr != nil {
return nil, ocfr.rerr
}
ocfr.remainingBlockItems--
return datum, nil
}
// RemainingBlockItems returns the number of items remaining in the block being
// processed.
func (ocfr *OCFReader) RemainingBlockItems() int64 {
return ocfr.remainingBlockItems
}
// Scan returns true when there is at least one more data item to be read from
// the Avro OCF. Scan ought to be called prior to calling the Read method each
// time the Read method is invoked. See `NewOCFReader` documentation for an
// example.
func (ocfr *OCFReader) Scan() bool {
ocfr.readReady = false
if ocfr.rerr != nil {
return false
}
// NOTE: If there are no more remaining data items from the existing block,
// then attempt to slurp in the next block.
if ocfr.remainingBlockItems <= 0 {
if count := len(ocfr.block); count != 0 {
ocfr.rerr = fmt.Errorf("extra bytes between final datum in previous block and block sync marker: %d", count)
return false
}
// Read the block count and update the number of remaining items for
// this block
ocfr.remainingBlockItems, ocfr.rerr = longBinaryReader(ocfr.ior)
if ocfr.rerr != nil {
if ocfr.rerr == io.EOF {
ocfr.rerr = nil // merely end of file, rather than error
} else {
ocfr.rerr = fmt.Errorf("cannot read block count: %s", ocfr.rerr)
}
return false
}
if ocfr.remainingBlockItems <= 0 {
ocfr.rerr = fmt.Errorf("cannot decode when block count is not greater than 0: %d", ocfr.remainingBlockItems)
return false
}
if ocfr.remainingBlockItems > MaxBlockCount {
ocfr.rerr = fmt.Errorf("cannot decode when block count exceeds MaxBlockCount: %d > %d", ocfr.remainingBlockItems, MaxBlockCount)
return false
}
var blockSize int64
blockSize, ocfr.rerr = longBinaryReader(ocfr.ior)
if ocfr.rerr != nil {
ocfr.rerr = fmt.Errorf("cannot read block size: %s", ocfr.rerr)
return false
}
if blockSize <= 0 {
ocfr.rerr = fmt.Errorf("cannot decode when block size is not greater than 0: %d", blockSize)
return false
}
if blockSize > MaxBlockSize {
ocfr.rerr = fmt.Errorf("cannot decode when block size exceeds MaxBlockSize: %d > %d", blockSize, MaxBlockSize)
return false
}
// read entire block into buffer
ocfr.block = make([]byte, blockSize)
_, ocfr.rerr = io.ReadFull(ocfr.ior, ocfr.block)
if ocfr.rerr != nil {
ocfr.rerr = fmt.Errorf("cannot read block: %s", ocfr.rerr)
return false
}
switch ocfr.header.compressionID {
case compressionNull:
// no-op
case compressionDeflate:
// NOTE: flate.NewReader wraps with io.ByteReader if argument does
// not implement that interface.
rc := flate.NewReader(bytes.NewBuffer(ocfr.block))
ocfr.block, ocfr.rerr = ioutil.ReadAll(rc)
if ocfr.rerr != nil {
_ = rc.Close()
return false
}
if ocfr.rerr = rc.Close(); ocfr.rerr != nil {
return false
}
case compressionSnappy:
index := len(ocfr.block) - 4 // last 4 bytes is crc32 of decoded block
if index <= 0 {
ocfr.rerr = fmt.Errorf("cannot decompress snappy without CRC32 checksum: %d", len(ocfr.block))
return false
}
decoded, err := snappy.Decode(nil, ocfr.block[:index])
if err != nil {
ocfr.rerr = fmt.Errorf("cannot decompress: %s", err)
return false
}
actualCRC := crc32.ChecksumIEEE(decoded)
expectedCRC := binary.BigEndian.Uint32(ocfr.block[index : index+4])
if actualCRC != expectedCRC {
ocfr.rerr = fmt.Errorf("snappy CRC32 checksum mismatch: %x != %x", actualCRC, expectedCRC)
return false
}
ocfr.block = decoded
default:
ocfr.rerr = fmt.Errorf("should not get here: cannot compress block using unrecognized compression: %d", ocfr.header.compressionID)
return false
}
// read and ensure sync marker matches
sync := make([]byte, ocfSyncLength)
var n int
if n, ocfr.rerr = io.ReadFull(ocfr.ior, sync); ocfr.rerr != nil {
ocfr.rerr = fmt.Errorf("cannot read sync marker: read %d out of %d bytes: %s", n, ocfSyncLength, ocfr.rerr)
return false
}
if !bytes.Equal(sync, ocfr.header.syncMarker[:]) {
ocfr.rerr = fmt.Errorf("sync marker mismatch: %v != %v", sync, ocfr.header.syncMarker)
return false
}
}
ocfr.readReady = true
return true
}
// SkipThisBlockAndReset can be called after an error occurs while reading or
// decoding datum values from an OCF stream. OCF specifies that each OCF
// stream contains one or more blocks of data. Each block consists of a block
// count, the number of bytes for the block, followed by the possibly
// compressed block. Inside each decompressed block, all of the binary
// encoded datum values are concatenated together. In other words, OCF
// framing is at a block level rather than a datum level. If there is an
// error while reading or decoding a datum, the reader is not able to skip to
// the next datum value, because OCF does not have any markers for where each
// datum ends and the next one begins. Therefore, the reader is only able to
// skip this datum value and all subsequent datum values in the current
// block, move to the next block and start decoding datum values there.
func (ocfr *OCFReader) SkipThisBlockAndReset() {
// ??? is it an error to call method unless the reader has had an error
ocfr.remainingBlockItems = 0
ocfr.block = ocfr.block[:0]
ocfr.rerr = nil
}
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"compress/flate"
"encoding/binary"
"errors"
"fmt"
"hash/crc32"
"io"
"io/ioutil"
"os"
"github.com/golang/snappy"
)
// OCFConfig is used to specify creation parameters for OCFWriter.
type OCFConfig struct {
// W specifies the `io.Writer` to which to send the encoded data,
// (required). If W is `*os.File`, then creating an OCF for writing will
// attempt to read any existing OCF header and use the schema and
// compression codec specified by the existing header, then advance the file
// position to the tail end of the file for appending.
W io.Writer
// Codec specifies the Codec to use for the new OCFWriter, (optional). If
// the W parameter above is an `*os.File` which contains a Codec, the Codec
// in the existing file will be used instead. Otherwise if this Codec
// parameter is specified, it will be used. If neither the W parameter above
// is an `*os.File` with an existing Codec, nor this Codec parameter is
// specified, the OCFWriter will create a new Codec from the schema string
// specified by the Schema parameter below.
Codec *Codec
// Schema specifies the Avro schema for the data to be encoded, (optional).
// If neither the W parameter above is an `*os.File` with an existing Codec,
// nor the Codec parameter above is specified, the OCFWriter will create a
// new Codec from the schema string specified by this Schema parameter.
Schema string
// CompressionName specifies the compression codec used, (optional). If
// omitted, defaults to "null" codec. When appending to an existing OCF,
// this field is ignored.
CompressionName string
// MetaData specifies application-specific metadata to be added to the OCF
// file. When appending to an existing OCF, this field is ignored.
MetaData map[string][]byte
}
// OCFWriter is used to create a new or append to an existing Avro Object
// Container File (OCF).
type OCFWriter struct {
header *ocfHeader
iow io.Writer
}
// NewOCFWriter returns a new OCFWriter instance that may be used for appending
// binary Avro data, either by appending to an existing OCF file or creating a
// new OCF file.
func NewOCFWriter(config OCFConfig) (*OCFWriter, error) {
var err error
ocf := &OCFWriter{iow: config.W}
switch file := config.W.(type) {
case nil:
return nil, errors.New("cannot create OCFWriter when W is nil")
case *os.File:
stat, err := file.Stat()
stat, err := file.Stat()
if err != nil {
return nil, fmt.Errorf("cannot create OCFWriter: %s", err)
}
// NOTE: When upstream provides a new file, it will already exist but
// have a size of 0 bytes.
if stat.Size() > 0 {
// attempt to read existing OCF header
if ocf.header, err = readOCFHeader(file); err != nil {
return nil, fmt.Errorf("cannot create OCFWriter: %s", err)
}
// prepare for appending data to existing OCF
if err = ocf.quickScanToTail(file); err != nil {
return nil, fmt.Errorf("cannot create OCFWriter: %s", err)
}
return ocf, nil // happy case for appending to existing OCF
}
}
// create new OCF header based on configuration parameters
if ocf.header, err = newOCFHeader(config); err != nil {
return nil, fmt.Errorf("cannot create OCFWriter: %s", err)
}
if err = writeOCFHeader(ocf.header, config.W); err != nil {
return nil, fmt.Errorf("cannot create OCFWriter: %s", err)
}
return ocf, nil // another happy case for creation of new OCF
}
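// A hedged creation sketch, assuming f is an *os.File opened for writing; the
// compression label constants are exported by this package:
//
// ocfw, err := goavro.NewOCFWriter(goavro.OCFConfig{
// W: f,
// Schema: `"string"`,
// CompressionName: goavro.CompressionSnappyLabel,
// })
// if err != nil {
// return err
// }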
// quickScanToTail advances the stream reader to the tail end of the
// file. Rather than reading each encoded block, optionally decompressing it,
// and then decoding it, this method reads the block count, ignoring it, then
// reads the block size, then skips ahead to the following block. It does this
// repeatedly until attempts to read the file return io.EOF.
func (ocfw *OCFWriter) quickScanToTail(ior io.Reader) error {
sync := make([]byte, ocfSyncLength)
for {
// Read and validate block count
blockCount, err := longBinaryReader(ior)
if err != nil {
if err == io.EOF {
return nil // merely end of file, rather than error
}
return fmt.Errorf("cannot read block count: %s", err)
}
if blockCount <= 0 {
return fmt.Errorf("cannot read when block count is not greater than 0: %d", blockCount)
}
if blockCount > MaxBlockCount {
return fmt.Errorf("cannot read when block count exceeds MaxBlockCount: %d > %d", blockCount, MaxBlockCount)
}
// Read block size
blockSize, err := longBinaryReader(ior)
if err != nil {
return fmt.Errorf("cannot read block size: %s", err)
}
if blockSize <= 0 {
return fmt.Errorf("cannot read when block size is not greater than 0: %d", blockSize)
}
if blockSize > MaxBlockSize {
return fmt.Errorf("cannot read when block size exceeds MaxBlockSize: %d > %d", blockSize, MaxBlockSize)
}
// Advance reader to end of block
if _, err = io.CopyN(ioutil.Discard, ior, blockSize); err != nil {
return fmt.Errorf("cannot seek to next block: %s", err)
}
// Read and validate sync marker
var n int
if n, err = io.ReadFull(ior, sync); err != nil {
return fmt.Errorf("cannot read sync marker: read %d out of %d bytes: %s", n, ocfSyncLength, err)
}
if !bytes.Equal(sync, ocfw.header.syncMarker[:]) {
return fmt.Errorf("sync marker mismatch: %v != %v", sync, ocfw.header.syncMarker)
}
}
}
// Append appends one or more data items to an OCF file in a block. If there are
// more data items in the slice than MaxBlockCount allows, the data slice will
// be chunked into multiple blocks, each containing no more than MaxBlockCount
// items.
func (ocfw *OCFWriter) Append(data interface{}) error {
arrayValues, err := convertArray(data)
if err != nil {
return err
}
// Chunk data so no block has more than MaxBlockCount items.
for int64(len(arrayValues)) > MaxBlockCount {
if err := ocfw.appendDataIntoBlock(arrayValues[:MaxBlockCount]); err != nil {
return err
}
arrayValues = arrayValues[MaxBlockCount:]
}
return ocfw.appendDataIntoBlock(arrayValues)
}
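// Continuing the writer sketch above (hedged; the values must conform to the
// writer's schema), appending a short slice emits a single block:
//
// if err := ocfw.Append([]interface{}{"alpha", "bravo"}); err != nil {
// return err
// }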
func (ocfw *OCFWriter) appendDataIntoBlock(data []interface{}) error {
var block []byte // working buffer for encoding data values
var err error
// Encode and concatenate each data item into the block
for _, datum := range data {
if block, err = ocfw.header.codec.BinaryFromNative(block, datum); err != nil {
return fmt.Errorf("cannot translate datum to binary: %v; %s", datum, err)
}
}
switch ocfw.header.compressionID {
case compressionNull:
// no-op
case compressionDeflate:
// compress into new bytes buffer.
bb := bytes.NewBuffer(make([]byte, 0, len(block)))
cw, _ := flate.NewWriter(bb, flate.DefaultCompression)
// writing bytes to cw will compress bytes and send to bb.
if _, err := cw.Write(block); err != nil {
return err
}
if err := cw.Close(); err != nil {
return err
}
block = bb.Bytes()
case compressionSnappy:
compressed := snappy.Encode(nil, block)
// OCF requires each snappy-compressed block to be followed by a big-endian CRC32 checksum of the uncompressed data
compressed = append(compressed, 0, 0, 0, 0) // expand slice by 4 bytes so checksum will fit
binary.BigEndian.PutUint32(compressed[len(compressed)-4:], crc32.ChecksumIEEE(block)) // checksum of decompressed block
block = compressed
default:
return fmt.Errorf("should not get here: cannot compress block using unrecognized compression: %d", ocfw.header.compressionID)
}
// create file data block
buf := make([]byte, 0, len(block)+ocfBlockConst) // pre-allocate block bytes
buf, _ = longBinaryFromNative(buf, len(data)) // block count (number of data items)
buf, _ = longBinaryFromNative(buf, len(block)) // block size (number of bytes in block)
buf = append(buf, block...) // serialized objects
buf = append(buf, ocfw.header.syncMarker[:]...) // sync marker
_, err = ocfw.iow.Write(buf)
return err
}
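// A hedged sketch of the reader-side counterpart to the snappy branch above,
// verifying the trailing checksum before accepting a block (compressed is an
// assumed variable holding the stored block, including its 4-byte CRC suffix):
//
// sum := binary.BigEndian.Uint32(compressed[len(compressed)-4:])
// decompressed, err := snappy.Decode(nil, compressed[:len(compressed)-4])
// if err != nil {
// return err
// }
// if crc32.ChecksumIEEE(decompressed) != sum {
// return fmt.Errorf("snappy checksum mismatch")
// }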
// Codec returns the codec used by OCFWriter. This function is provided because
// upstream may be appending to an existing OCF which uses a different schema
// than the one requested during instantiation.
func (ocfw *OCFWriter) Codec() *Codec {
return ocfw.header.codec
}
// CompressionName returns the name of the compression algorithm used by
// OCFWriter. This function is provided because upstream may be appending to an
// existing OCF which uses a different compression algorithm than the one
// requested during instantiation.
func (ocfw *OCFWriter) CompressionName() string {
switch ocfw.header.compressionID {
case compressionNull:
return CompressionNullLabel
case compressionDeflate:
return CompressionDeflateLabel
case compressionSnappy:
return CompressionSnappyLabel
default:
return "should not get here: unrecognized compression algorithm"
}
}
package goavro
import (
"encoding/binary"
"fmt"
"io"
)
// rabinEmpty is a constant used to initialize rabinTable, and to compute
// the CRC-64-AVRO fingerprint of every object schema.
const rabinEmpty = uint64(0xc15d213aa4d7a795)
// rabinTable is never modified after initialization but its values are read to
// compute the CRC-64-AVRO fingerprint of every schema it is given.
var rabinTable = [256]uint64{
0,
3238593523956797946,
6477187047913595892,
8435907220062204430,
12954374095827191784,
11472609148414072338,
16871814440124408860,
14327483619285186022,
16515860097293205755,
14539261057490653441,
13607494391182877455,
10387063993012335349,
6265406319754774291,
8791864835633305321,
1085550678754862311,
2585467722461443357,
5247393906202824413,
7215812591205457703,
1239030555549527337,
4449591751341063379,
18092457712352332085,
15556728100436498639,
11742789833002527425,
10234164645493242683,
12530812639509548582,
9302088354573213660,
17583729671266610642,
15633189885995973672,
2171101357509724622,
3661574416647526452,
5170935444922886714,
7724537325157989312,
10494787812405648826,
13642865964979244096,
14431625182410915406,
16480541316673728436,
2478061111099054674,
1049933365183482792,
8899183502682126758,
6300970840149272668,
8399466921467862337,
6368420890995002555,
3275086581351513781,
108854135608684367,
14364169659802000041,
16980263386864569171,
11435870349096892765,
12845837170396948647,
15669858317114364775,
17692196227407282845,
9265331945857609875,
12422293323479818601,
7688114635962061967,
5062151678603773301,
3698085083440658299,
2279937883717887617,
4342202715019449244,
1203395666939462246,
7323148833295052904,
5282940851558637970,
10341870889845773428,
11778178981837571470,
15449074650315978624,
18057156506771531386,
11669866394404287583,
10160817855121008037,
17874829710049597355,
15339802717267265105,
1311848476550706103,
4523114428088083021,
5464845951130112067,
7432843562972398009,
4956122222198109348,
7509300761534850398,
2099866730366965584,
3591042414950500010,
17798367005364253516,
15848531969535615670,
12601941680298545336,
9372796311334617410,
16798933842935724674,
14253900473960229752,
12736841781990005110,
11255500115345754252,
6550173162703027562,
8509314479008689296,
217708271217368734,
3455596968422674276,
870833084869474937,
2370047569572014979,
6194214610827729293,
8721096401170761847,
13822387873690697105,
10602378625989962859,
16587157392570359397,
14609853536892473247,
3483332339477899749,
2064482512161650719,
7616958077116566033,
4991418462803860459,
9480190278288059917,
12637572737790640119,
15741190762473065977,
17762823925471730691,
15376229271924123934,
17983608511393921252,
10124303357207546602,
11561034798826117904,
7396170166881316598,
5356383260452470540,
4559875767435775234,
1420363961462201592,
8684405430038898488,
6085769495188764354,
2406791333878924492,
979366144819647798,
14646297666590105808,
16695918618875998506,
10565881703117275940,
13713538703073841886,
11362911691697612739,
12772455230081578553,
14146576876296094775,
16763373153642681805,
3347869283551649835,
182341662412566993,
8616954185191982047,
6585487012709290533,
13933329357911598997,
17126321439046432367,
11006435164953838689,
12992741788688209307,
8257930048646602877,
6803747195591438727,
3132703159877387145,
542775339377431155,
2623696953101412206,
619515277774763668,
9046228856176166042,
5871394916501263712,
10929691902260224134,
13501751302614184316,
14865687125944796018,
16338017159720129160,
9912244444396218696,
11925134239902742706,
15018601523069700796,
18202706530865158982,
4199733460733931168,
1637543290675756890,
7182084829901000020,
5717935174548446382,
7834929158557182387,
4632665972928804937,
3844057317981030983,
1849042541720329149,
16103865201353027163,
17549867708331900833,
9700748483321744815,
12280807109898935381,
5834933197202143791,
8937414855024798677,
655924238275353051,
2732422975565056033,
16374796089197559239,
14974255385173568573,
13465025131935292979,
10821211621719183305,
13100346325406055124,
11041713811386575662,
17018628958017378592,
13897997918303815898,
435416542434737468,
3097107305413864646,
6911193936845348552,
8293578696285179698,
1741666169738949874,
3808479038558283016,
4740095139144029958,
7870595381236532988,
12388429221655458586,
9736009554713699040,
17442192802341523694,
16068516186704462100,
18239503069743100937,
15127152172900050419,
11888425678624364541,
9803746554456753671,
5681455845848806369,
7073288438148047387,
1673934641775824917,
4308477092595991023,
6966664678955799498,
5503217582476919344,
4128965024323301438,
1566351579938693572,
15233916154233132066,
18417600011429070296,
9982836925607720918,
11996431537128302124,
9627165335515697969,
12207926510359495371,
15886756170769674437,
17332335396841578815,
3917464579278591193,
1922028658990515491,
8051932600676513581,
4850374241660872407,
2917466598601071895,
327962119137676525,
8187398044598779619,
6732512565967646489,
11221777246008269567,
13207379120439233285,
14004037317153847563,
17197450482186430705,
14792340333762633196,
16265093719173729302,
10712766520904941080,
13284123302255603682,
9119751534871550468,
5944212839312182270,
2840727922924403184,
836967320887912458,
17368810860077796976,
15995557527495450506,
12171538990377528708,
9518416773021940862,
4813582667757848984,
7943378085384837218,
1958732289639295596,
4025966300338256790,
1458733299300535947,
4093699022299389809,
5610888623004134783,
7002018658576923781,
12103802978479819107,
10018419036150929561,
18310175810188503703,
15198246066092718957,
13391477134206599341,
10748366240846565719,
16157651908532642649,
14756687855020634787,
729366649650267973,
2805444311502067391,
6051901489239909553,
9155087905094251851,
6695738567103299670,
8078825954266321324,
364683324825133986,
3025950744619954776,
17233908370383964094,
14112856248920397380,
13170974025418581066,
11113046258555286960,
}
// rabin returns an unsigned 64-bit integer Rabin fingerprint for buf. NOTE:
// This is only used during Codec instantiation to calculate the Rabin
// fingerprint of the canonical schema.
func rabin(buf []byte) uint64 {
fp := rabinEmpty
for i := 0; i < len(buf); i++ {
fp = (fp >> 8) ^ rabinTable[(byte(fp)^buf[i])&0xff] // unsigned right shift >>>
}
return fp
}
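// The 256 entries of rabinTable follow the CRC-64-AVRO recurrence given in
// the Avro specification; a hedged sketch of how the table could be
// regenerated (buildRabinTable is a hypothetical helper, not part of this
// package):
//
// func buildRabinTable() (table [256]uint64) {
// for i := range table {
// fp := uint64(i)
// for j := 0; j < 8; j++ {
// fp = (fp >> 1) ^ (rabinEmpty & -(fp & 1))
// }
// table[i] = fp
// }
// return table
// }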
const soeMagicPrefix = 2 // length of the 2-byte marker prefix for single-object encoded data
const soeHeaderLen = soeMagicPrefix + 8 // 2-byte prefix plus 8-byte fingerprint
// FingerprintFromSOE returns the unsigned 64-bit Rabin fingerprint from the
// header of a buffer that encodes a Single-Object Encoded datum. This function
// is designed to be used to look up a Codec that can decode the contents of
// the buffer. Once a Codec is found that has the matching Rabin fingerprint,
// its NativeFromBinary method may be used to decode the remaining bytes
// returned as the second return value. On failure this function returns an
// ErrNotSingleObjectEncoded error.
//
// func decode(codex map[uint64]*goavro.Codec, buf []byte) error {
// // Perform a sanity check on the buffer, then return the Rabin fingerprint
// // of the schema used to encode the data.
// fingerprint, newBuf, err := goavro.FingerprintFromSOE(buf)
// if err != nil {
// return err
// }
//
// // Get a previously stored Codec from the codex map.
// codec, ok := codex[fingerprint]
// if !ok {
// return fmt.Errorf("unknown codec: %#x", fingerprint)
// }
//
// // Use the fetched Codec to decode the buffer as a SOE.
// //
// // Faster because the SOE magic prefix and schema fingerprint have
// // already been checked and used to fetch the Codec. Only the binary
// // bytes remaining after the prefix was removed need to be decoded.
// datum, _, err := codec.NativeFromBinary(newBuf)
// if err != nil {
// return err
// }
//
// _, err = fmt.Println(datum)
// return err
// }
func FingerprintFromSOE(buf []byte) (uint64, []byte, error) {
if len(buf) < soeHeaderLen {
// Not enough bytes to encode schema fingerprint.
return 0, nil, ErrNotSingleObjectEncoded(io.ErrShortBuffer.Error())
}
if buf[0] != 0xC3 || buf[1] != 0x01 {
// Currently only one SOE prefix is recognized.
return 0, nil, ErrNotSingleObjectEncoded(fmt.Sprintf("unknown SOE prefix: %#x", buf[:soeMagicPrefix]))
}
// Only single-object encoding format version 1 is recognized.
return binary.LittleEndian.Uint64(buf[soeMagicPrefix:]), buf[soeHeaderLen:], nil
}
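// Going the other direction, the Avro specification lays out a single-object
// encoding as the 2-byte marker, the little-endian schema fingerprint, and
// then the binary datum; a hedged assembly sketch where fp and body are
// assumed inputs:
//
// soe := []byte{0xC3, 0x01}
// soe = append(soe, make([]byte, 8)...)
// binary.LittleEndian.PutUint64(soe[soeMagicPrefix:], fp)
// soe = append(soe, body...)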
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
)
func makeRecordCodec(st map[string]*Codec, enclosingNamespace string, schemaMap map[string]interface{}) (*Codec, error) {
// NOTE: To support recursive data types, create the codec and register it
// using the specified name, and fill in the codec functions later.
c, err := registerNewCodec(st, schemaMap, enclosingNamespace)
if err != nil {
return nil, fmt.Errorf("Record ought to have valid name: %s", err)
}
fields, ok := schemaMap["fields"]
if !ok {
return nil, fmt.Errorf("Record %q ought to have fields key", c.typeName)
}
fieldSchemas, ok := fields.([]interface{})
if !ok || fieldSchemas == nil {
return nil, fmt.Errorf("Record %q fields ought to be non-nil array: %v", c.typeName, fields)
}
codecFromFieldName := make(map[string]*Codec)
codecFromIndex := make([]*Codec, len(fieldSchemas))
nameFromIndex := make([]string, len(fieldSchemas))
defaultValueFromName := make(map[string]interface{}, len(fieldSchemas))
for i, fieldSchema := range fieldSchemas {
fieldSchemaMap, ok := fieldSchema.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("Record %q field %d ought to be valid Avro named type; received: %v", c.typeName, i+1, fieldSchema)
}
// NOTE: field names are not registered in the symbol table, because
// field names are not individually addressable codecs.
fieldCodec, err := buildCodecForTypeDescribedByMap(st, c.typeName.namespace, fieldSchemaMap)
if err != nil {
return nil, fmt.Errorf("Record %q field %d ought to be valid Avro named type: %s", c.typeName, i+1, err)
}
// However, when creating a full name for the field name, be sure to use the
// record's namespace.
n, err := newNameFromSchemaMap(c.typeName.namespace, fieldSchemaMap)
if err != nil {
return nil, fmt.Errorf("Record %q field %d ought to have valid name: %v", c.typeName, i+1, fieldSchemaMap)
}
fieldName := n.short()
if _, ok := codecFromFieldName[fieldName]; ok {
return nil, fmt.Errorf("Record %q field %d ought to have unique name: %q", c.typeName, i+1, fieldName)
}
if defaultValue, ok := fieldSchemaMap["default"]; ok {
typeNameShort := fieldCodec.typeName.short()
// NOTE: JSON decoding yields float64 for every numeric default and string
// for both bytes and string defaults, so coerce the default value to the Go
// type the field codec expects, and report the received type when the
// coercion is impossible.
switch typeNameShort {
case "boolean":
v, ok := defaultValue.(bool)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be boolean; received: %T", c.typeName, fieldName, defaultValue)
}
defaultValue = v
case "bytes":
v, ok := defaultValue.(string)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be string; received: %T", c.typeName, fieldName, defaultValue)
}
defaultValue = []byte(v)
case "double":
v, ok := defaultValue.(float64)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be number; received: %T", c.typeName, fieldName, defaultValue)
}
defaultValue = v
case "float":
v, ok := defaultValue.(float64)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be number; received: %T", c.typeName, fieldName, defaultValue)
}
defaultValue = float32(v)
case "int":
v, ok := defaultValue.(float64)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be number; received: %T", c.typeName, fieldName, defaultValue)
}
defaultValue = int32(v)
case "long":
v, ok := defaultValue.(float64)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be number; received: %T", c.typeName, fieldName, defaultValue)
}
defaultValue = int64(v)
case "string":
v, ok := defaultValue.(string)
if !ok {
return nil, fmt.Errorf("Record %q field %q: default value ought to be string; received: %T", c.typeName, fieldName, defaultValue)
}
defaultValue = v
case "union":
// When the codec is a union, the default value ought to encode using the
// first schema in the union. NOTE: To support a null default value, the
// string literal "null" must be coerced to nil.
if defaultValue == "null" {
defaultValue = nil
}
// NOTE: To support record field default values, the union schema is set to
// the type name of its first member.
// TODO: change to schemaCanonical below
defaultValue = Union(fieldCodec.schemaOriginal, defaultValue)
default:
debug("fieldName: %q; type: %q; defaultValue: %T(%#v)\n", fieldName, c.typeName, defaultValue, defaultValue)
}
// attempt to encode default value using codec
_, err = fieldCodec.binaryFromNative(nil, defaultValue)
if err != nil {
return nil, fmt.Errorf("Record %q field %q: default value ought to encode using field schema: %s", c.typeName, fieldName, err)
}
defaultValueFromName[fieldName] = defaultValue
}
nameFromIndex[i] = fieldName
codecFromIndex[i] = fieldCodec
codecFromFieldName[fieldName] = fieldCodec
}
c.binaryFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
valueMap, ok := datum.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("cannot encode binary record %q: expected map[string]interface{}; received: %T", c.typeName, datum)
}
// records encoded in order fields were defined in schema
for i, fieldCodec := range codecFromIndex {
fieldName := nameFromIndex[i]
// NOTE: If the field value was not specified in the map, then set
// fieldValue to its default value (which may or may not have been
// specified in the schema).
fieldValue, ok := valueMap[fieldName]
if !ok {
if fieldValue, ok = defaultValueFromName[fieldName]; !ok {
return nil, fmt.Errorf("cannot encode binary record %q field %q: schema does not specify default value and no value provided", c.typeName, fieldName)
}
}
var err error
buf, err = fieldCodec.binaryFromNative(buf, fieldValue)
if err != nil {
return nil, fmt.Errorf("cannot encode binary record %q field %q: value does not match its schema: %s", c.typeName, fieldName, err)
}
}
return buf, nil
}
c.nativeFromBinary = func(buf []byte) (interface{}, []byte, error) {
recordMap := make(map[string]interface{}, len(codecFromIndex))
for i, fieldCodec := range codecFromIndex {
name := nameFromIndex[i]
var value interface{}
var err error
value, buf, err = fieldCodec.nativeFromBinary(buf)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode binary record %q field %q: %s", c.typeName, name, err)
}
recordMap[name] = value
}
return recordMap, buf, nil
}
c.nativeFromTextual = func(buf []byte) (interface{}, []byte, error) {
var mapValues map[string]interface{}
var err error
// NOTE: Setting `defaultCodec == nil` instructs genericMapTextDecoder
// to return an error when a field name is not found in the
// codecFromFieldName map.
mapValues, buf, err = genericMapTextDecoder(buf, nil, codecFromFieldName)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual record %q: %s", c.typeName, err)
}
if actual, expected := len(mapValues), len(codecFromFieldName); actual != expected {
// set missing field keys to their respective default values, then
// re-check number of keys
for fieldName, defaultValue := range defaultValueFromName {
if _, ok := mapValues[fieldName]; !ok {
mapValues[fieldName] = defaultValue
}
}
if actual, expected = len(mapValues), len(codecFromFieldName); actual != expected {
return nil, nil, fmt.Errorf("cannot decode textual record %q: only found %d of %d fields", c.typeName, actual, expected)
}
}
return mapValues, buf, nil
}
c.textualFromNative = func(buf []byte, datum interface{}) ([]byte, error) {
// NOTE: Ensure only schema-defined field names are encoded; and if a field
// is missing in the datum, either use the provided field default value or
// return an error.
sourceMap, ok := datum.(map[string]interface{})
if !ok {
return nil, fmt.Errorf("cannot encode textual record %q: expected map[string]interface{}; received: %T", c.typeName, datum)
}
destMap := make(map[string]interface{}, len(codecFromIndex))
for fieldName := range codecFromFieldName {
fieldValue, ok := sourceMap[fieldName]
if !ok {
defaultValue, ok := defaultValueFromName[fieldName]
if !ok {
return nil, fmt.Errorf("cannot encode textual record %q field %q: schema does not specify default value and no value provided", c.typeName, fieldName)
}
fieldValue = defaultValue
}
destMap[fieldName] = fieldValue
}
datum = destMap
// NOTE: Setting `defaultCodec == nil` instructs genericMapTextEncoder
// to return an error when a field name is not found in the
// codecFromFieldName map.
return genericMapTextEncoder(buf, datum, nil, codecFromFieldName)
}
return c, nil
}
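// A hedged sketch of the default-value fallback implemented above: encoding
// an empty map succeeds because the missing field carries a schema default
// (the schema and values here are illustrative only):
//
// codec, err := goavro.NewCodec(`{"type":"record","name":"r1","fields":[{"name":"f1","type":"long","default":13}]}`)
// if err != nil {
// return err
// }
// buf, err := codec.BinaryFromNative(nil, map[string]interface{}{})
// // buf should hold the long 13 as the zig-zag varint 26, i.e. []byte{0x1a}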
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"fmt"
"io"
"unicode"
)
// advanceAndConsume advances to the next non-whitespace byte and returns an
// error if that byte is not the expected one.
func advanceAndConsume(buf []byte, expected byte) ([]byte, error) {
var err error
if buf, err = advanceToNonWhitespace(buf); err != nil {
return nil, err
}
if actual := buf[0]; actual != expected {
return nil, fmt.Errorf("expected: %q; actual: %q", expected, actual)
}
return buf[1:], nil
}
// advanceToNonWhitespace consumes bytes from buf until a non-whitespace
// character is found. It returns an error when no more bytes remain, because
// its purpose is to scan ahead to the next non-whitespace character.
func advanceToNonWhitespace(buf []byte) ([]byte, error) {
for i, b := range buf {
if !unicode.IsSpace(rune(b)) {
return buf[i:], nil
}
}
return nil, io.ErrShortBuffer
}
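// A hedged sketch of the two helpers working together (the input bytes are
// illustrative only):
//
// rest, err := advanceAndConsume([]byte("  : 42"), ':')
// // rest == []byte(" 42"), err == nil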
// Copyright [2019] LinkedIn Corp. Licensed under the Apache License, Version
// 2.0 (the "License"); you may not use this file except in compliance with the
// License. You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
package goavro
import (
"bytes"
"errors"
"fmt"
)
// Union wraps a datum value in a map for encoding as a Union, as required by
// the union encoder.
//
// When providing a value for an Avro union, the encoder will accept `nil` for a
// `null` value. If the value is non-`nil`, it must be a
// `map[string]interface{}` with a single key-value pair, where the key is the
// Avro type name and the value is the datum's value. As a convenience, the
// `Union` function wraps any datum value in a map as specified above.
//
// func ExampleUnion() {
// codec, err := goavro.NewCodec(`["null","string","int"]`)
// if err != nil {
// fmt.Println(err)
// }
// buf, err := codec.TextualFromNative(nil, goavro.Union("string", "some string"))
// if err != nil {
// fmt.Println(err)
// }
// fmt.Println(string(buf))
// // Output: {"string":"some string"}
// }
func Union(name string, datum interface{}) interface{} {
if datum == nil && name == "null" {
return nil
}
return map[string]interface{}{name: datum}
}
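// A hedged binary counterpart to the textual example above, reusing the same
// codec value (`["null","string","int"]`):
//
// buf, err := codec.BinaryFromNative(nil, goavro.Union("int", 3))
// // expected buf: []byte{0x04, 0x06}, the union index 2 followed by the
// // int 3, both zig-zag encoded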
func buildCodecForTypeDescribedBySlice(st map[string]*Codec, enclosingNamespace string, schemaArray []interface{}) (*Codec, error) {
if len(schemaArray) == 0 {
return nil, errors.New("Union ought to have one or more members")
}
allowedTypes := make([]string, len(schemaArray)) // used for error reporting when encoder receives invalid datum type
codecFromIndex := make([]*Codec, len(schemaArray))
codecFromName := make(map[string]*Codec, len(schemaArray))
indexFromName := make(map[string]int, len(schemaArray))
for i, unionMemberSchema := range schemaArray {
unionMemberCodec, err := buildCodec(st, enclosingNamespace, unionMemberSchema)
if err != nil {
return nil, fmt.Errorf("Union item %d ought to be valid Avro type: %s", i+1, err)
}
fullName := unionMemberCodec.typeName.fullName
if _, ok := indexFromName[fullName]; ok {
return nil, fmt.Errorf("Union item %d ought to be unique type: %s", i+1, unionMemberCodec.typeName)
}
allowedTypes[i] = fullName
codecFromIndex[i] = unionMemberCodec
codecFromName[fullName] = unionMemberCodec
indexFromName[fullName] = i
}
return &Codec{
// NOTE: To support record field default values, the union schema is set to
// the type name of its first member.
// TODO: add/change to schemaCanonical below
schemaOriginal: codecFromIndex[0].typeName.fullName,
typeName: &name{"union", nullNamespace},
nativeFromBinary: func(buf []byte) (interface{}, []byte, error) {
var decoded interface{}
var err error
decoded, buf, err = longNativeFromBinary(buf)
if err != nil {
return nil, nil, err
}
index := decoded.(int64) // longDecoder always returns int64, so elide error checking
if index < 0 || index >= int64(len(codecFromIndex)) {
return nil, nil, fmt.Errorf("cannot decode binary union: index ought to be between 0 and %d; read index: %d", len(codecFromIndex)-1, index)
}
c := codecFromIndex[index]
decoded, buf, err = c.nativeFromBinary(buf)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode binary union item %d: %s", index+1, err)
}
if decoded == nil {
// do not wrap a nil value in a map
return nil, buf, nil
}
// Non-nil values are wrapped in a map with single key set to type name of value
return Union(allowedTypes[index], decoded), buf, nil
},
binaryFromNative: func(buf []byte, datum interface{}) ([]byte, error) {
switch v := datum.(type) {
case nil:
index, ok := indexFromName["null"]
if !ok {
return nil, fmt.Errorf("cannot encode binary union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum)
}
return longBinaryFromNative(buf, index)
case map[string]interface{}:
if len(v) != 1 {
return nil, fmt.Errorf("cannot encode binary union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum)
}
// will execute exactly once
for key, value := range v {
index, ok := indexFromName[key]
if !ok {
return nil, fmt.Errorf("cannot encode binary union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum)
}
c := codecFromIndex[index]
buf, _ = longBinaryFromNative(buf, index)
return c.binaryFromNative(buf, value)
}
}
return nil, fmt.Errorf("cannot encode binary union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum)
},
nativeFromTextual: func(buf []byte) (interface{}, []byte, error) {
if len(buf) >= 4 && bytes.Equal(buf[:4], []byte("null")) {
if _, ok := indexFromName["null"]; ok {
return nil, buf[4:], nil
}
}
var datum interface{}
var err error
datum, buf, err = genericMapTextDecoder(buf, nil, codecFromName)
if err != nil {
return nil, nil, fmt.Errorf("cannot decode textual union: %s", err)
}
return datum, buf, nil
},
textualFromNative: func(buf []byte, datum interface{}) ([]byte, error) {
switch v := datum.(type) {
case nil:
_, ok := indexFromName["null"]
if !ok {
return nil, fmt.Errorf("cannot encode textual union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum)
}
return append(buf, "null"...), nil
case map[string]interface{}:
if len(v) != 1 {
return nil, fmt.Errorf("cannot encode textual union: non-nil Union values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum)
}
// will execute exactly once
for key, value := range v {
index, ok := indexFromName[key]
if !ok {
return nil, fmt.Errorf("cannot encode textual union: no member schema types support datum: allowed types: %v; received: %T", allowedTypes, datum)
}
buf = append(buf, '{')
var err error
buf, err = stringTextualFromNative(buf, key)
if err != nil {
return nil, fmt.Errorf("cannot encode textual union: %s", err)
}
buf = append(buf, ':')
c := codecFromIndex[index]
buf, err = c.textualFromNative(buf, value)
if err != nil {
return nil, fmt.Errorf("cannot encode textual union: %s", err)
}
return append(buf, '}'), nil
}
}
return nil, fmt.Errorf("cannot encode textual union: non-nil values ought to be specified with Go map[string]interface{}, with single key equal to type name, and value equal to datum value: %v; received: %T", allowedTypes, datum)
},
}, nil
}
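// Decoding reverses the wrapping: a hedged sketch, with codec as in the Union
// example above, showing that non-nil values come back in the single-entry
// map produced by Union:
//
// datum, _, err := codec.NativeFromBinary([]byte{0x04, 0x06})
// // datum == map[string]interface{}{"int": int32(3)}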
......@@ -122,6 +122,8 @@ github.com/golang/protobuf/ptypes
github.com/golang/protobuf/ptypes/any
github.com/golang/protobuf/ptypes/duration
github.com/golang/protobuf/ptypes/timestamp
# github.com/golang/snappy v0.0.1
github.com/golang/snappy
# github.com/google/flatbuffers v1.11.0
github.com/google/flatbuffers/go
# github.com/gopherjs/gopherjs v0.0.0-20190430165422-3e4dfb77656c
......@@ -167,6 +169,8 @@ github.com/klauspost/cpuid
github.com/lib/pq
github.com/lib/pq/oid
github.com/lib/pq/scram
# github.com/linkedin/goavro/v2 v2.9.7
github.com/linkedin/goavro/v2
# github.com/mattetti/filebuffer v1.0.0
github.com/mattetti/filebuffer
# github.com/mattn/go-colorable v0.1.4
......