Public
Edited
Apr 17, 2024
1 fork
40 stars
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Load Apache Arrow JS (major version 9) from the Skypack CDN via dynamic import
arrow = import("https://cdn.skypack.dev/apache-arrow@9")
Insert cell
Insert cell
// Load the parquet-wasm library
parquetModule = {
const parquetModule = await import(
"https://unpkg.com/parquet-wasm@0.4.0-beta.5/esm/arrow2.js"
);
// Need to await the default export first to initialize the WebAssembly code
const {memory} = await parquetModule.default();
return [parquetModule, memory];
}
Insert cell
parquet = parquetModule[0]
Insert cell
wasmMemory = parquetModule[1]
Insert cell
Insert cell
arrowTable = {
// An index column with ordered integers from 0 up to 10,000 t
const indexArray = new Uint16Array(10000);
for (let i = 0; i < indexArray.length; ++i) {
indexArray[i] = i;
}

// Random float values between 0 and 100
const valuesArray = Float32Array.from({ length: 10000 }, () =>
Number((Math.random() * 100).toFixed(1))
);

// Create the Arrow Table
return arrow.tableFromArrays({
index: indexArray,
values: valuesArray
});
}
Insert cell
Insert cell
parquetBuffer = parquet.writeParquet(arrow.tableToIPC(arrowTable, "file"));
Insert cell
// Read the Parquet bytes back through parquet-wasm's FFI reader.
// The returned object is queried in later cells via schemaLength(),
// schemaAddr(), chunksLength(), chunkLength() and arrayAddr().
arrowTableFFI = parquet._readParquetFFI(parquetBuffer);
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
// Display schemaLength() for the FFI table
// (presumably the number of fields in the schema — confirm against parquet-wasm docs)
arrowTableFFI.schemaLength()
Insert cell
Insert cell
// Addresses returned by schemaAddr() for field indices 0 and 1; they are
// passed to parseField as pointers into the WebAssembly memory buffer
schemaPointers = [0, 1].map((fieldIndex) => arrowTableFFI.schemaAddr(fieldIndex))
Insert cell
Insert cell
parseField = {
/** Minimal parsing of an arrow Field from an ArrowSchema FFI struct */
function parseField(buffer, ptr) {
const dataView = new DataView(buffer);

// Parse format to an Arrow JS DataType
const format = parseFormat(dataView, ptr);

// Parse name into a JS string
const namePtr = dataView.getUint32(ptr + 4, true);
const name = parseNullTerminatedString(dataView, namePtr);

// Extra 4 to be 8-byte aligned
const flags = dataView.getBigInt64(ptr + 16, true);
const nullable = Boolean(flags & 0x00000001n);

return arrow.Field.new(name, format, nullable);
}

/** Parse a pointer to the format struct attribute into an Arrow JS DataType */
function parseFormat(dataView, ptr) {
const formatPtr = dataView.getUint32(ptr, true);
const format = parseNullTerminatedString(dataView, formatPtr);

// Strings from https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings
// In this minimal implementation, only include these two types
const formatMapping = {
S: new arrow.Uint16(),
f: new arrow.Float32()
};
return formatMapping[format];
}

/** Parse a C-style null-terminated string to a JS string */
function parseNullTerminatedString(dataView, ptr) {
// First find the end of the null-terminated string
let end = ptr;
while (end < dataView.byteLength && dataView.getUint8(end) !== 0) {
end += 1;
}

// Take a Uint8Array view of that buffer range and decode to a UTF8 string
return new TextDecoder("utf-8").decode(
new Uint8Array(dataView.buffer, ptr, end - ptr)
);
}
return parseField;
}
Insert cell
Insert cell
// Parse one Arrow JS Field per schema pointer out of the WebAssembly memory buffer
arrowFields = schemaPointers.map((schemaPtr) =>
  parseField(wasmMemory.buffer, schemaPtr)
)
Insert cell
Insert cell
// Display chunksLength() for the FFI table
// (presumably the number of record-batch chunks — confirm against parquet-wasm docs)
arrowTableFFI.chunksLength()
Insert cell
Insert cell
// Display chunkLength(0) for the FFI table
// (presumably the number of column arrays in chunk 0 — confirm against parquet-wasm docs)
arrowTableFFI.chunkLength(0)
Insert cell
Insert cell
// Addresses returned by arrayAddr(0, i) for i = 0 and 1; they are passed to
// parseArray as pointers into the WebAssembly memory buffer
// (presumably the arguments are chunk index, then column index — confirm)
arrayPointers = [0, 1].map((columnIndex) =>
  arrowTableFFI.arrayAddr(0, columnIndex)
)
Insert cell
Insert cell
parseArray = {
/** Parse Array from ArrowArray FFI */
return function parseArray(buffer, ptr, dataType) {
const dataView = new DataView(buffer);

// These functions assume bigint support
// Older browsers could work with each half and cast to a number
const length = dataView.getBigInt64(ptr, true);
const nullCount = dataView.getBigInt64(ptr + 8, true);
const offset = dataView.getBigInt64(ptr + 16, true);
const nBuffers = dataView.getBigInt64(ptr + 24, true);
const nChildren = dataView.getBigInt64(ptr + 32, true);

// We have a pointer to an array of pointers, where each of those points to an actual array of data
const ptrToBuffers = dataView.getUint32(ptr + 40, true);
const bufferPtrs = [];
for (let i = 0; i < nBuffers; i++) {
bufferPtrs.push(dataView.getUint32(ptrToBuffers + i * 4, true));
}

// For simplicity in this example, we only support primitive types
// For primitive types, the first pointer is the validity and the second is the data:
// https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout
const validityPtr = bufferPtrs[0];
// In an actual implementation you'd want to parse this validity bitmap 😄
// https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
const nullBitmap = validityPtr === 0 ? null : null;

// Then take a view of the data
// Here ArrayType is Uint16Array and Float32Array, respectively
const dataPtr = bufferPtrs[1];
const data = new dataType.ArrayType(
dataView.buffer,
dataPtr,
Number(length)
);

// Then create Arrow JS Data objects from these low-level arrays
return arrow.makeData({
type: dataType,
offset: Number(offset),
length: Number(length),
nullCount: Number(nullCount),
nullBitmap,
data
});
};
}
Insert cell
Insert cell
// Parse one Arrow JS Data object per array pointer, pairing each pointer with
// the DataType of the field at the same position
arrowArrays = arrayPointers.map((arrayPtr, columnIndex) =>
  parseArray(wasmMemory.buffer, arrayPtr, arrowFields[columnIndex].type)
)
Insert cell
Insert cell
arrowTableFromFFI = {
const schema = new arrow.Schema(arrowFields);
const recordBatch = new arrow.RecordBatch(
schema,
arrow.makeData({ type: new arrow.Struct(), children: arrowArrays })
);
return new arrow.Table(schema, recordBatch);
}
Insert cell
Insert cell
Insert cell
Insert cell
arraysEqual = {
// https://stackoverflow.com/a/16436975
return function arraysEqual(a, b) {
if (a === b) return true;
if (a == null || b == null) return false;
if (a.length !== b.length) return false;

// If you don't care about the order of the elements inside
// the array, you should sort both arrays here.
// Please note that calling sort on an array will modify that array.
// you might want to clone your array first.

for (var i = 0; i < a.length; ++i) {
if (a[i] !== b[i]) return false;
}
return true;
}
}
Insert cell
arraysEqual(
arrowTable.getChild("index").toArray(),
arrowTableFromFFI.getChild("index").toArray()
)
Insert cell
arraysEqual(
arrowTable.getChild("values").toArray(),
arrowTableFromFFI.getChild("values").toArray()
)
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell
Insert cell

One platform to build and deploy the best data apps

Experiment and prototype by building visualizations in live JavaScript notebooks. Collaborate with your team and decide which concepts to build out.
Use Observable Framework to build data apps locally. Use data loaders to build in any language or library, including Python, SQL, and R.
Seamlessly deploy to Observable. Test before you ship, use automatic deploy-on-commit, and ensure your projects are always up-to-date.
Learn more