-
Notifications
You must be signed in to change notification settings - Fork 112
Description
Apache Iceberg version
iceberg-go version: v0.3.0
Please describe the bug 🐞
I don't understand the bug, so I'll describe what I was doing to run into it, and I'll also show my attempt at a minimal repro.
I had enabled "S3 Table Metadata", which stores metadata about an S3 bucket in an Iceberg table. Whenever I tried to read from the tables, when performing a scan of the table using iceberg-go, I would get "error encountered during schema visitor: arrow/array: index out of range" errors.
These errors go away if I apply the following patch to arrow-go
:
diff --git a/arrow/array/struct.go b/arrow/array/struct.go
index 6883712c..4f943ac6 100644
--- a/arrow/array/struct.go
+++ b/arrow/array/struct.go
@@ -192,7 +192,7 @@ func (a *Struct) setData(data *Data) {
a.fields = make([]arrow.Array, len(data.childData))
for i, child := range data.childData {
if data.offset != 0 || child.Len() != data.length {
- sub := NewSliceData(child, int64(data.offset), int64(data.offset+data.length))
+ sub := NewSliceData(child, int64(data.offset), int64(data.offset+child.Len()))
a.fields[i] = MakeFromData(sub)
sub.Release()
} else {
I'm just banging rocks together - I have no idea if this patch is a good idea or not.
In an attempt to produce a minimal repro, I grabbed the iceberg schema and raw parquet data from the table, and loaded them with the following go program:
Click to expand
package main
import (
"context"
"encoding/json"
"flag"
"fmt"
"os"
"github.com/apache/iceberg-go"
"github.com/apache/iceberg-go/table"
"github.com/apache/arrow-go/v18/arrow/memory"
"github.com/apache/arrow-go/v18/parquet"
pfile "github.com/apache/arrow-go/v18/parquet/file"
"github.com/apache/arrow-go/v18/parquet/pqarrow"
)
// main runs mainErr and exits non-zero on failure.
func main() {
	if err := mainErr(); err != nil {
		// Report errors on stderr so they don't interleave with the JSON
		// records this program writes to stdout.
		fmt.Fprintln(os.Stderr, "error:", err)
		os.Exit(1)
	}
}
// mainErr parses the command-line flags, loads the Iceberg schema and the
// parquet file they name, and dumps the parquet data interpreted through
// that schema. It returns (rather than exits on) errors so all exit
// handling stays in main.
func mainErr() error {
	var parquetfile string
	var icebergSchemaFile string
	flag.StringVar(&parquetfile, "parquet-file", "", "parquet file to load")
	flag.StringVar(&icebergSchemaFile, "iceberg-schema-file", "", "iceberg schema to use to interpret parquet file")
	flag.Parse()

	schemaBytes, err := os.ReadFile(icebergSchemaFile)
	if err != nil {
		return fmt.Errorf("reading iceberg schema file: %w", err)
	}
	var schema iceberg.Schema
	if err := json.Unmarshal(schemaBytes, &schema); err != nil {
		return fmt.Errorf("unmarshalling iceberg schema: %w", err)
	}

	parquetFile, err := os.Open(parquetfile)
	if err != nil {
		// BUG FIX: the original passed a %s verb with no argument,
		// so the real error was lost ("%!s(MISSING)").
		return fmt.Errorf("opening parquet file: %w", err)
	}
	// BUG FIX: the original never closed the file.
	defer parquetFile.Close()

	return dumpParquetWithSchema(context.Background(), parquetFile, &schema)
}
// dumpParquetWithSchema reads every record batch from the given parquet
// file, projects each batch onto the requested Iceberg schema, and writes
// the projected records to stdout as JSON (one object per batch).
func dumpParquetWithSchema(ctx context.Context, file parquet.ReaderAtSeeker, schema *iceberg.Schema) error {
	pr, err := pfile.NewParquetReader(file)
	if err != nil {
		return fmt.Errorf("creating parquet reader: %w", err)
	}
	defer pr.Close()

	arrowReader, err := pqarrow.NewFileReader(pr, pqarrow.ArrowReadProperties{
		BatchSize: 1024,
	}, memory.DefaultAllocator)
	if err != nil {
		return fmt.Errorf("creating pqarrow file reader: %w", err)
	}

	ar, err := arrowReader.GetRecordReader(ctx, nil, nil)
	if err != nil {
		return fmt.Errorf("creating arrow reader: %w", err)
	}
	defer ar.Release()

	// Derive the file's own Iceberg view of the schema; ToRequestedSchema
	// needs both the source (file) and target (requested) schemas.
	fileSchema, err := table.ArrowSchemaToIceberg(ar.Schema(), false, nil)
	if err != nil {
		return fmt.Errorf("convert parquet file schema to iceberg schema: %w", err)
	}

	// Hoisted out of the loop: one encoder for the whole dump.
	enc := json.NewEncoder(os.Stdout)
	for ar.Next() {
		rec := ar.Record()
		newRec, err := table.ToRequestedSchema(ctx, schema, fileSchema, rec, false, false, false)
		if err != nil {
			// BUG FIX: the original leaked rec on this error path.
			rec.Release()
			return fmt.Errorf("table to requested schema: %w", err)
		}
		err = enc.Encode(newRec)
		// BUG FIX: the original never released the projected record, and
		// leaked rec when encoding failed.
		newRec.Release()
		rec.Release()
		if err != nil {
			return fmt.Errorf("encoding arrow rec as json: %w", err)
		}
	}
	// BUG FIX: Next() returning false can mean a read error, not just EOF;
	// the original silently discarded it.
	if err := ar.Err(); err != nil {
		return fmt.Errorf("reading record batches: %w", err)
	}
	return nil
}
It fails with the same error as the table-scan described above, and the same patch to arrow-go makes the error go away.
However if I do a raw dump of the parquet file with arrow-go without trying to apply the iceberg-schema to it, I don't get any errors.