Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions java/src/main/java/org/lance/util/JsonFields.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.lance.util;

import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.FieldType;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
* Utility helpers for constructing JSON fields using Arrow extension metadata.
*
* <p>This class aligns with the Arrow JSON extension type (extension name {@code "arrow.json"}) for
* Utf8 and LargeUtf8 fields that logically carry JSON text.
*
* <p>When writing data, fields annotated with {@code arrow.json} are converted by Lance into its
* internal JSONB representation (physically stored as {@code LargeBinary} with extension name
* {@code "lance.json"}). When reading, Lance converts {@code lance.json} back into {@code
* arrow.json} (Utf8), so callers always work with JSON text rather than binary JSON.
*
* <p>The {@code lance.json} storage type is intentionally not exposed via helpers in this class to
* keep the internal JSONB format an implementation detail.
*
* <p>See also the Arrow extension type documentation:
* https://arrow.apache.org/docs/format/Extensions.html
*/
public final class JsonFields {

  /** Field metadata key under which Arrow stores the extension type name. */
  private static final String EXTENSION_NAME_KEY = "ARROW:extension:name";

  /**
   * Name of the Arrow JSON extension type. Attaching it to a Utf8/LargeUtf8 field marks the field
   * as carrying JSON text, which Lance then interprets and converts.
   */
  private static final String ARROW_JSON_EXTENSION_NAME = "arrow.json";

  private JsonFields() {
    // Static helpers only; never instantiated.
  }

  /**
   * Build a childless {@code Utf8} field tagged with the {@code arrow.json} extension name.
   *
   * <p>Lance is responsible for converting such a field between JSON text and its internal JSONB
   * representation on write and read.
   *
   * @param name the field name
   * @param nullable whether the field is nullable
   * @return a Field with Utf8 storage type and arrow.json extension metadata
   */
  public static Field jsonUtf8(String name, boolean nullable) {
    return jsonField(name, new ArrowType.Utf8(), nullable);
  }

  /**
   * Build a childless {@code LargeUtf8} field tagged with the {@code arrow.json} extension name.
   *
   * <p>Lance is responsible for converting such a field between JSON text and its internal JSONB
   * representation on write and read.
   *
   * @param name the field name
   * @param nullable whether the field is nullable
   * @return a Field with LargeUtf8 storage type and arrow.json extension metadata
   */
  public static Field jsonLargeUtf8(String name, boolean nullable) {
    return jsonField(name, new ArrowType.LargeUtf8(), nullable);
  }

  /**
   * Assemble a field of the given storage type whose only metadata entry is the {@code arrow.json}
   * extension name.
   *
   * @param name the field name
   * @param storageType the physical Arrow type backing the JSON text
   * @param nullable whether the field is nullable
   * @return a Field with no children, no dictionary encoding and the extension metadata attached
   */
  private static Field jsonField(String name, ArrowType storageType, boolean nullable) {
    // A fresh map is built per field and handed over behind a read-only view.
    Map<String, String> extensionMetadata = new HashMap<>();
    extensionMetadata.put(EXTENSION_NAME_KEY, ARROW_JSON_EXTENSION_NAME);

    FieldType fieldType =
        new FieldType(
            nullable,
            storageType,
            /* dictionary= */ null,
            Collections.unmodifiableMap(extensionMetadata));
    return new Field(name, fieldType, Collections.emptyList());
  }
}
111 changes: 111 additions & 0 deletions java/src/test/java/org/lance/JsonExtractionTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.lance;

import org.lance.ipc.LanceScanner;
import org.lance.ipc.ScanOptions;
import org.lance.util.JsonFields;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.IntVector;
import org.apache.arrow.vector.VarCharVector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.ipc.ArrowReader;
import org.apache.arrow.vector.ipc.ArrowStreamReader;
import org.apache.arrow.vector.ipc.ArrowStreamWriter;
import org.apache.arrow.vector.types.pojo.ArrowType;
import org.apache.arrow.vector.types.pojo.Field;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import java.io.ByteArrayOutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.util.Arrays;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

public class JsonExtractionTest {

  /**
   * Round-trips three rows with an {@code arrow.json} column through a dataset written under the
   * temporary directory, then scans it with a {@code json_extract} filter on {@code $.user.theme}
   * and expects exactly the two rows whose theme is "dark".
   *
   * @param tempDir JUnit-managed temporary directory that holds the dataset
   * @throws Exception if writing, opening or scanning the dataset fails
   */
  @Test
  void testJsonExtraction(@TempDir Path tempDir) throws Exception {
    final String datasetPath = tempDir.resolve("json_extraction_test").toString();

    // One JSON document per row; row i is stored with id i + 1.
    final String[] documents = {
      "{\"user\":{\"theme\":\"dark\"}}",
      "{\"user\":{\"theme\":\"light\"}}",
      "{\"user\":{\"theme\":\"dark\"}}"
    };

    try (BufferAllocator allocator = new RootAllocator()) {
      Field idField = Field.nullable("id", new ArrowType.Int(32, true));
      Field dataField = JsonFields.jsonUtf8("data", true);
      Schema schema = new Schema(Arrays.asList(idField, dataField));

      try (VectorSchemaRoot root = VectorSchemaRoot.create(schema, allocator)) {
        root.allocateNew();

        IntVector ids = (IntVector) root.getVector("id");
        VarCharVector payloads = (VarCharVector) root.getVector("data");
        for (int row = 0; row < documents.length; row++) {
          ids.setSafe(row, row + 1);
          payloads.setSafe(row, documents[row].getBytes(StandardCharsets.UTF_8));
        }
        root.setRowCount(documents.length);

        // Serialize the batch into an in-memory Arrow stream so that it can be replayed through
        // an ArrowStreamReader, which is what the dataset writer consumes below.
        ByteArrayOutputStream ipcBuffer = new ByteArrayOutputStream();
        try (ArrowStreamWriter writer = new ArrowStreamWriter(root, null, ipcBuffer)) {
          writer.start();
          writer.writeBatch();
          writer.end();
        }

        byte[] ipcBytes = ipcBuffer.toByteArray();
        try (ArrowStreamReader reader =
                new ArrowStreamReader(
                    new ByteArrayReadableSeekableByteChannel(ipcBytes), allocator);
            Dataset written =
                Dataset.write()
                    .allocator(allocator)
                    .reader(reader)
                    .uri(datasetPath)
                    .mode(WriteParams.WriteMode.OVERWRITE)
                    .execute()) {
          assertEquals(datasetPath, written.uri());
        }
      }

      try (Dataset dataset = Dataset.open().allocator(allocator).uri(datasetPath).build()) {
        // The comparison literal carries its own double quotes: '"dark"'.
        ScanOptions scanOptions =
            new ScanOptions.Builder()
                .filter("json_extract(data, '$.user.theme') = '\"dark\"'")
                .build();

        try (LanceScanner scanner = dataset.newScan(scanOptions);
            ArrowReader resultReader = scanner.scanBatches()) {
          int batchCount = 0;
          int matchedRows = 0;
          while (resultReader.loadNextBatch()) {
            batchCount++;
            matchedRows += resultReader.getVectorSchemaRoot().getRowCount();
          }
          assertTrue(batchCount > 0, "Expected at least one batch to be loaded");
          assertEquals(2, matchedRows, "Expected exactly two rows matching the filter");
        }
      }
    }
  }
}