Hugging Face support

This Jupyter Notebook demonstrates various operations involving the Hugging Face bridge:

  1. Converting a plaid dataset to Hugging Face

  2. Generating a Hugging Face dataset with a generator

  3. Converting a Hugging Face dataset to plaid

  4. Saving and Loading Hugging Face datasets

  5. Handling plaid samples from Hugging Face datasets without converting the complete dataset to plaid

Each section is documented and explained.

# Import necessary libraries and functions
import pickle

import numpy as np
from Muscat.Bridges.CGNSBridge import MeshToCGNS
from Muscat.MeshTools import MeshCreationTools as MCT

from plaid.bridges import huggingface_bridge
from plaid import Dataset
from plaid import Sample
from plaid import ProblemDefinition
Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set
  In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads
  For best performance with OpenMP 3.1 set OMP_PROC_BIND=true
  For unit testing set OMP_PROC_BIND=false
/home/docs/checkouts/readthedocs.org/user_builds/plaid-lib/conda/0.1.8/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
# Print Sample util
def show_sample(sample: Sample):
    """Print a short report about a plaid sample.

    Prints the sample itself, calls its ``show_tree()`` method, then prints
    the values returned by ``get_scalar_names()`` and ``get_field_names()``.

    Args:
        sample: The plaid sample to describe.
    """
    print(f"sample = {sample}")
    sample.show_tree()
    # Each line reads "sample.<getter>() = <repr of the returned value>".
    for getter in ("get_scalar_names", "get_field_names"):
        print(f"sample.{getter}() = {getattr(sample, getter)()!r}")

Initialize plaid dataset and problem_definition

# Input data: vertex coordinates and triangle connectivity of a small 2D mesh
points = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0], [0.5, 1.5]])
triangles = np.array([[0, 1, 2], [0, 2, 3], [2, 4, 3]])

# Fill a plaid dataset with three samples built on the same mesh
dataset = Dataset()

print("Creating meshes dataset...")
for _ in range(3):
    mesh = MCT.CreateMeshOfTriangles(points, triangles)

    sample = Sample()
    sample.meshes.add_tree(MeshToCGNS(mesh))

    # One random scalar, one random value per vertex, one random value per triangle
    sample.add_scalar("scalar", np.random.randn())
    sample.add_field("node_field", np.random.rand(len(points)), location="Vertex")
    sample.add_field("cell_field", np.random.rand(len(triangles)), location="CellCenter")

    dataset.add_sample(sample)

# Metadata attached to the dataset
infos = {
    "legal": {"owner": "Bob", "license": "my_license"},
    "data_production": {"type": "simulation", "physics": "3D example"},
}
dataset.set_infos(infos)

print(f" {dataset = }")

# Problem definition: learning task, feature names and train/test split
problem = ProblemDefinition()
# NOTE(review): the recorded output of this cell shows DeprecationWarnings for the
# next three calls, pointing to `add_out_features_identifiers` /
# `add_in_features_identifiers` — migrate once the expected argument format is confirmed.
problem.add_output_scalars_names(["scalar"])
problem.add_output_fields_names(["node_field", "cell_field"])
problem.add_input_meshes_names(["/Base/Zone"])

problem.set_task("regression")
problem.set_split({"train": [0, 1], "test": [2]})

print(f" {problem = }")
Creating meshes dataset...
 dataset = Dataset(3 samples, 1 scalar, 0 time_series, 3 fields)
 problem = ProblemDefinition(output_scalars_names=['scalar'], output_fields_names=['cell_field', 'node_field'], input_meshes_names=['/Base/Zone'], task='regression', split_names=['train', 'test'])
/tmp/ipykernel_3577/2944975079.py:48: DeprecationWarning: use `add_out_features_identifiers` instead [since v0.1.8] (will be removed in v0.2.0)
  problem.add_output_scalars_names(["scalar"])
/tmp/ipykernel_3577/2944975079.py:49: DeprecationWarning: use `add_out_features_identifiers` instead [since v0.1.8] (will be removed in v0.2.0)
  problem.add_output_fields_names(["node_field", "cell_field"])
/tmp/ipykernel_3577/2944975079.py:50: DeprecationWarning: use `add_in_features_identifiers` instead [since v0.1.8] (will be removed in v0.2.0)
  problem.add_input_meshes_names(["/Base/Zone"])

Section 1: Convert plaid dataset to Hugging Face

The description field of the Hugging Face dataset is automatically populated with the plaid dataset infos and the problem_definition, so that no information is lost and the two formats remain equivalent.

# Convert the whole plaid dataset into a single Hugging Face dataset
hf_dataset = huggingface_bridge.plaid_dataset_to_huggingface(dataset, problem)

# The leading "\n" emits the blank separator line before the report
print(f"\n{hf_dataset = }")
print(f"{hf_dataset.description = }")
Generating all_samples split: 0 examples [00:00, ? examples/s]
Generating all_samples split: 3 examples [00:00, 283.62 examples/s]
hf_dataset = Dataset({
    features: ['sample'],
    num_rows: 3
})
hf_dataset.description = {'legal': {'owner': 'Bob', 'license': 'my_license'}, 'data_production': {'type': 'simulation', 'physics': '3D example'}, 'split': {'train': [0, 1], 'test': [2]}, 'task': 'regression', 'in_scalars_names': [], 'out_scalars_names': ['scalar'], 'in_timeseries_names': [], 'out_timeseries_names': [], 'in_fields_names': [], 'out_fields_names': ['cell_field', 'node_field'], 'in_meshes_names': ['/Base/Zone'], 'out_meshes_names': []}

The previous code generates a Hugging Face dataset containing all the samples from the plaid dataset, the splits being defined in the hf_dataset description. For splits, Hugging Face proposes DatasetDict, which is a dictionary of hf datasets whose keys are the names of the corresponding splits. It is possible to generate a hf DatasetDict directly from plaid:

# Convert the plaid dataset into one Hugging Face dataset per requested split
hf_datasetdict = huggingface_bridge.plaid_dataset_to_huggingface_datasetdict(
    dataset,
    problem,
    main_splits=["train", "test"],
)

# The leading "\n" emits the blank separator line before the report
print(f"\n{hf_datasetdict['train'] = }")
print(f"{hf_datasetdict['test'] = }")
Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 2 examples [00:00, 1631.71 examples/s]

Generating test split: 0 examples [00:00, ? examples/s]
Generating test split: 1 examples [00:00, 1301.77 examples/s]
hf_datasetdict['train'] = Dataset({
    features: ['sample'],
    num_rows: 2
})
hf_datasetdict['test'] = Dataset({
    features: ['sample'],
    num_rows: 1
})

Section 2: Generate a Hugging Face dataset with a generator

def generator():
    """Yield one row per sample of the global plaid ``dataset``.

    Each row is a dict whose single ``"sample"`` entry holds the pickled sample.
    """
    for index in range(len(dataset)):
        pickled_sample = pickle.dumps(dataset[index])
        yield {"sample": pickled_sample}


# Build a Hugging Face dataset from the sample generator, the infos and the problem definition
hf_dataset_gen = huggingface_bridge.plaid_generator_to_huggingface(generator, infos, problem)

# The leading "\n" emits the blank separator line before the report
print(f"\n{hf_dataset_gen = }")
print(f"{hf_dataset_gen.description = }")
Generating all_samples split: 0 examples [00:00, ? examples/s]
Generating all_samples split: 3 examples [00:00, 1983.12 examples/s]
hf_dataset_gen = Dataset({
    features: ['sample'],
    num_rows: 3
})
hf_dataset_gen.description = {'legal': {'owner': 'Bob', 'license': 'my_license'}, 'data_production': {'type': 'simulation', 'physics': '3D example'}, 'split': {'train': [0, 1], 'test': [2]}, 'task': 'regression', 'in_scalars_names': [], 'out_scalars_names': ['scalar'], 'in_timeseries_names': [], 'out_timeseries_names': [], 'in_fields_names': [], 'out_fields_names': ['cell_field', 'node_field'], 'in_meshes_names': ['/Base/Zone'], 'out_meshes_names': []}

The same is available with datasetdict:

# Build one Hugging Face dataset per requested split from the sample generator
hf_datasetdict_gen = huggingface_bridge.plaid_generator_to_huggingface_datasetdict(
    generator, infos, problem, main_splits=["train", "test"]
)
print()
# Inspect the DatasetDict generated just above. The previous version printed
# `hf_datasetdict` (the object built directly from the plaid dataset) instead,
# so the generator-based result was never displayed.
print(f"{hf_datasetdict_gen['train'] = }")
print(f"{hf_datasetdict_gen['test'] = }")
Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 3 examples [00:00, 2395.38 examples/s]

Generating test split: 0 examples [00:00, ? examples/s]
Generating test split: 3 examples [00:00, 1176.19 examples/s]
hf_datasetdict['train'] = Dataset({
    features: ['sample'],
    num_rows: 2
})
hf_datasetdict['test'] = Dataset({
    features: ['sample'],
    num_rows: 1
})

Section 3: Convert a Hugging Face dataset to plaid

Plaid dataset infos and problem_definition are recovered from the Hugging Face dataset.

# Convert the Hugging Face dataset back to plaid; the call returns both the
# dataset and the problem definition
dataset_2, problem_2 = huggingface_bridge.huggingface_dataset_to_plaid(hf_dataset)

# The leading "\n" emits the blank separator line before the report
print(f"\n{dataset_2 = }")
print(f"{dataset_2.get_infos() = }")
print(f"{problem_2 = }")
Converting Hugging Face dataset to plaid dataset...
  0%|          | 0/3 [00:00<?, ?it/s]
100%|██████████| 3/3 [00:00<00:00, 914.85it/s]
dataset_2 = Dataset(3 samples, 1 scalar, 0 time_series, 3 fields)
dataset_2.get_infos() = {'legal': {'owner': 'Bob', 'license': 'my_license'}, 'data_production': {'type': 'simulation', 'physics': '3D example'}}
problem_2 = ProblemDefinition(output_scalars_names=['scalar'], output_fields_names=['cell_field', 'node_field'], input_meshes_names=['/Base/Zone'], task='regression', split_names=['train', 'test'])

Section 4: Save and Load Hugging Face datasets

From and to disk

# Save the Hugging Face dataset to disk; the same directory is read back
# with load_from_disk in the next cell
hf_dataset.save_to_disk("/tmp/path/to/dir")
Saving the dataset (0/1 shards):   0%|          | 0/3 [00:00<?, ? examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3/3 [00:00<00:00, 1120.77 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3/3 [00:00<00:00, 993.28 examples/s] 

# Load from disk: read back the directory written by save_to_disk
from datasets import load_dataset, load_from_disk

loaded_hf_dataset = load_from_disk("/tmp/path/to/dir")

# The leading "\n" emits the blank separator line before the report
print(f"\n{loaded_hf_dataset = }")
print(f"{loaded_hf_dataset.description = }")
loaded_hf_dataset = Dataset({
    features: ['sample'],
    num_rows: 3
})
loaded_hf_dataset.description = {'legal': {'owner': 'Bob', 'license': 'my_license'}, 'data_production': {'type': 'simulation', 'physics': '3D example'}, 'split': {'train': [0, 1], 'test': [2]}, 'task': 'regression', 'in_scalars_names': [], 'out_scalars_names': ['scalar'], 'in_timeseries_names': [], 'out_timeseries_names': [], 'in_fields_names': [], 'out_fields_names': ['cell_field', 'node_field'], 'in_meshes_names': ['/Base/Zone'], 'out_meshes_names': []}

From and to the Hugging Face hub

You need a Hugging Face account with a configured access token, and you need to install huggingface_hub[cli]. Pushing and loading a Hugging Face dataset without loss of information requires configuring a DatasetCard.

Example instructions are given below (they are not executed by this notebook).

Push to the hub

First, log in with the Hugging Face CLI:

huggingface-cli login

and enter your access token.

Then, the following Python instructions push a dataset to the hub:

# First upload: the dataset itself ("chanel/dataset" stands for the target repository id)
hf_dataset.push_to_hub("chanel/dataset")

from datasets import load_dataset_builder

# Read back the info object of the pushed dataset; its download_size and
# dataset_size are passed to the card builder below
datasetInfo = load_dataset_builder("chanel/dataset").__getstate__()['info']

from huggingface_hub import DatasetCard

# NOTE(review): `create_string_for_huggingface_dataset_card` is not imported anywhere
# in this notebook — presumably it is provided by plaid's huggingface_bridge module;
# confirm and qualify the call accordingly.
# NOTE(review): `description` is not defined in this notebook and the trailing `...`
# is a placeholder for the remaining arguments; both must be filled in before running.
card_text = create_string_for_huggingface_dataset_card(
    description = description,
    download_size_bytes = datasetInfo.download_size,
    dataset_size_bytes = datasetInfo.dataset_size,
    ...)
# Second upload: the dataset card built from the generated text
dataset_card = DatasetCard(card_text)
dataset_card.push_to_hub("chanel/dataset")

The second upload, of the dataset_card, is required to ensure that load_dataset from the hub populates the hf_dataset.description field and that the result is compatible with conversion to plaid. Without a dataset_card, the description field is lost.

Load from hub

dataset = load_dataset("chanel/dataset", split="all_samples")

More efficient retrieval is made possible by partial loads and split loads (in the case of a datasetdict):

dataset_train = load_dataset("chanel/dataset", split="train")
dataset_train_extract = load_dataset("chanel/dataset", split="train[:10]")

Section 5: Handle plaid samples from Hugging Face datasets without converting the complete dataset to plaid

To fully exploit the optimized data handling of the Hugging Face datasets library, it is possible to extract information from the Hugging Face dataset without converting it to plaid. The description attribute includes the plaid dataset _infos attribute and the plaid problem_definition attributes.

# Display the description dict carried by the dataset reloaded from disk
print(f"{loaded_hf_dataset.description = }")
loaded_hf_dataset.description = {'legal': {'owner': 'Bob', 'license': 'my_license'}, 'data_production': {'type': 'simulation', 'physics': '3D example'}, 'split': {'train': [0, 1], 'test': [2]}, 'task': 'regression', 'in_scalars_names': [], 'out_scalars_names': ['scalar'], 'in_timeseries_names': [], 'out_timeseries_names': [], 'in_fields_names': [], 'out_fields_names': ['cell_field', 'node_field'], 'in_meshes_names': ['/Base/Zone'], 'out_meshes_names': []}

Get the first sample of the first split

# Names of the splits recorded in the dataset description
split_names = list(loaded_hf_dataset.description["split"])
# Sample indices of the first split. The name avoids rebinding the builtin `id`
# at notebook scope, which would break any later call to id().
first_split_ids = loaded_hf_dataset.description["split"][split_names[0]]
# Raw Hugging Face row of the first sample of that split
hf_sample = loaded_hf_dataset[first_split_ids[0]]

print(f"{hf_sample = }")
hf_sample = {'sample': b'\x80\x04\x95\xbc\t\x00\x00\x00\x00\x00\x00}\x94(\x8c\x04path\x94N\x8c\x06meshes\x94\x8c plaid.containers.features.meshes\x94\x8c\x0cSampleMeshes\x94\x93\x94)\x81\x94}\x94(\x8c\x04data\x94}\x94G\x00\x00\x00\x00\x00\x00\x00\x00]\x94(\x8c\x08CGNSTree\x94N]\x94(]\x94(\x8c\x12CGNSLibraryVersion\x94\x8c\x15numpy.core.multiarray\x94\x8c\x0c_reconstruct\x94\x93\x94\x8c\x05numpy\x94\x8c\x07ndarray\x94\x93\x94K\x00\x85\x94C\x01b\x94\x87\x94R\x94(K\x01K\x01\x85\x94h\x12\x8c\x05dtype\x94\x93\x94\x8c\x02f4\x94\x89\x88\x87\x94R\x94(K\x03\x8c\x01<\x94NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89C\x04\x00\x00\x80@\x94t\x94b]\x94\x8c\x14CGNSLibraryVersion_t\x94e]\x94(\x8c\x08Base_2_2\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x02\x85\x94h\x1b\x8c\x02i4\x94\x89\x88\x87\x94R\x94(K\x03h\x1fNNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89C\x08\x02\x00\x00\x00\x02\x00\x00\x00\x94t\x94b]\x94(]\x94(\x8c\x022D\x94N]\x94\x8c\x08Family_t\x94e]\x94(\x8c\x04Zone\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x01K\x03\x86\x94h\x1b\x8c\x02i8\x94\x89\x88\x87\x94R\x94(K\x03h\x1fNNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89C\x18\x05\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x94t\x94b]\x94(]\x94(\x8c\x08ZoneType\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x0c\x85\x94h\x1b\x8c\x02S1\x94\x89\x88\x87\x94R\x94(K\x03\x8c\x01|\x94NNNK\x01K\x01K\x00t\x94b\x89C\x0cUnstructured\x94t\x94b]\x94\x8c\nZoneType_t\x94e]\x94(\x8c\x0fGridCoordinates\x94N]\x94(]\x94(\x8c\x0bCoordinateX\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x05\x85\x94h\x1b\x8c\x02f8\x94\x89\x88\x87\x94R\x94(K\x03h\x1fNNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89C(\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xe0?\x94t\x94b]\x94\x8c\x0bDataArray_t\x94e]\x94(\x8c\x0bCoordinateY\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x9
4(K\x01K\x05\x85\x94h]\x89C(\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf0?\x00\x00\x00\x00\x00\x00\xf8?\x94t\x94b]\x94hbee\x8c\x11GridCoordinates_t\x94e]\x94(\x8c\x0eElements_TRI_3\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x02\x85\x94h-\x89C\x08\x05\x00\x00\x00\x00\x00\x00\x00\x94t\x94b]\x94(]\x94(\x8c\x0cElementRange\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x02\x85\x94h>\x89C\x10\x01\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x94t\x94b]\x94\x8c\x0cIndexRange_t\x94e]\x94(\x8c\x13ElementConnectivity\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\t\x85\x94h>\x89CH\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x01\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x94t\x94b]\x94hbee\x8c\nElements_t\x94e]\x94(\x8c\tPointData\x94N]\x94(]\x94(\x8c\x0cGridLocation\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x06\x85\x94h\x1b\x8c\x02S1\x94\x89\x88\x87\x94R\x94(K\x03hLNNNK\x01K\x01K\x00t\x94b\x89C\x06Vertex\x94t\x94b]\x94\x8c\x0eGridLocation_t\x94e]\x94(\x8c\x0bOriginalIds\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x05\x85\x94h>\x89C(\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x94t\x94b]\x94hbe]\x94(\x8c\nnode_field\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x05\x85\x94h]\x89C(\x88k\xd8\\\x98Z\xb4?\x8ep\x15\x10\x12\xe8\xe4?H\xc6\x87Z\xcbJ\xda?\xa4 \xc2\x8a\xb1\x0e\xdb?\x10\x9f\xa8 
\xaf%\xde?\x94t\x94b]\x94hbee\x8c\x0eFlowSolution_t\x94e]\x94(\x8c\x08CellData\x94N]\x94(]\x94(h\x8eh\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\n\x85\x94h\x1b\x8c\x02S1\x94\x89\x88\x87\x94R\x94(K\x03hLNNNK\x01K\x01K\x00t\x94b\x89C\nCellCenter\x94t\x94b]\x94h\x9ae]\x94(h\x9ch\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x03\x85\x94h>\x89C\x18\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x94t\x94b]\x94hbe]\x94(\x8c\ncell_field\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x03\x85\x94h]\x89C\x18\xc0\xd4\x1cg\xfe\xce\xcb?H\x87\x91\xe6\xfc.\xb2?\xe4\xef\xb8o\x1f\xd0\xe6?\x94t\x94b]\x94hbeeh\xade]\x94(\x8c\x0bSurfaceData\x94N]\x94(]\x94(h\x8eh\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\n\x85\x94h\x1b\x8c\x02S1\x94\x89\x88\x87\x94R\x94(K\x03hLNNNK\x01K\x01K\x00t\x94b\x89C\nFaceCenter\x94t\x94b]\x94h\x9ae]\x94(h\x9ch\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x00\x85\x94h>\x89C\x00\x94t\x94b]\x94hbeeh\xade]\x94(\x8c\nFamilyName\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x02\x85\x94h\x1b\x8c\x02S1\x94\x89\x88\x87\x94R\x94(K\x03hLNNNK\x01K\x01K\x00t\x94b\x89C\x022D\x94t\x94b]\x94\x8c\x0cFamilyName_t\x94ee\x8c\x06Zone_t\x94e]\x94(\x8c\x04Time\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x01\x85\x94h-\x89C\x04\x01\x00\x00\x00\x94t\x94b]\x94(]\x94(\x8c\x0fIterationValues\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x01\x85\x94h-\x89C\x04\x01\x00\x00\x00\x94t\x94b]\x94hbe]\x94(\x8c\nTimeValues\x94h\x11h\x14K\x00\x85\x94h\x16\x87\x94R\x94(K\x01K\x01\x85\x94h]\x89C\x08\x00\x00\x00\x00\x00\x00\x00\x00\x94t\x94b]\x94hbee\x8c\x13BaseIterativeData_t\x94ee\x8c\nCGNSBase_t\x94ee\x8c\nCGNSTree_t\x94es\x8c\x06_links\x94}\x94G\x00\x00\x00\x00\x00\x00\x00\x00Ns\x8c\x06_paths\x94}\x94G\x00\x00\x00\x00\x00\x00\x00\x00Ns\x8c\x14_default_active_base\x94N\x8c\x14_default_active_zone\x94N\x8c\x14_default_active_time\x94N\x8c\x0f_mesh_base_name\x94\x8c\x04Base\x94\x8c\x0f_mesh_zone_name\x94h7ub\x8c\x07sca
lars\x94\x8c!plaid.containers.features.scalars\x94\x8c\rSampleScalars\x94\x93\x94)\x81\x94}\x94h\x08}\x94\x8c\x06scalar\x94G\xbf\xd1\xc6;\xda\xaf\x04Ussb\x8c\x0btime_series\x94Nu.'}

We notice that hf_sample is a dictionary whose 'sample' entry is a binary object efficiently handled by Hugging Face datasets. It can be converted into a plaid sample using a specific constructor relying on a pydantic validator.

# Convert the raw Hugging Face row into a plaid sample
plaid_sample = huggingface_bridge.to_plaid_sample(hf_sample)

# Print the sample, its tree and its scalar/field names
show_sample(plaid_sample)
sample = Sample(1 scalar, 0 time series, 1 timestamp, 3 fields)
 CGNSLibraryVersion : (1,) [4.] float32 CGNSLibraryVersion_t
 Base_2_2 : (2,) [2 2] int32 CGNSBase_t
|_  2D : None Family_t
|_  Zone : (1, 3) [[5 3 0]] int64 Zone_t
   |_  ZoneType : (12,) Unstructured |S1 ZoneType_t
   |_  GridCoordinates : None GridCoordinates_t
      |_  CoordinateX : (5,) [0.  1.  1.  0.  0.5] float64 DataArray_t
      |_  CoordinateY : (5,) [0.  0.  1.  1.  1.5] float64 DataArray_t
   |_  Elements_TRI_3 : (2,) [5 0] int32 Elements_t
      |_  ElementRange : (2,) [1 3] int64 IndexRange_t
      |_  ElementConnectivity : (9,) [1 ... 4] int64 DataArray_t
   |_  PointData : None FlowSolution_t
      |_  GridLocation : (6,) Vertex |S1 GridLocation_t
      |_  OriginalIds : (5,) [1 2 3 4 5] int64 DataArray_t
      |_  node_field : (5,) [0.07950737 0.65332893 0.41081508 0.42277182 0.47105005] float64 DataArray_t
   |_  CellData : None FlowSolution_t
      |_  GridLocation : (10,) CellCenter |S1 GridLocation_t
      |_  OriginalIds : (3,) [1 2 3] int64 DataArray_t
      |_  cell_field : (3,) [0.21725445 0.07102948 0.71290562] float64 DataArray_t
   |_  SurfaceData : None FlowSolution_t
      |_  GridLocation : (10,) FaceCenter |S1 GridLocation_t
      |_  OriginalIds : (0,) [] int64 DataArray_t
   |_  FamilyName : (2,) 2D |S1 FamilyName_t
|_  Time : (1,) [1] int32 BaseIterativeData_t
   |_  IterationValues : (1,) [1] int32 DataArray_t
   |_  TimeValues : (1,) [0.] float64 DataArray_t
sample.get_scalar_names() = ['scalar']
sample.get_field_names() = ['OriginalIds', 'cell_field', 'node_field']

Very large datasets can be streamed directly from the Hugging Face hub:

# Open the hub dataset in streaming mode ("chanel/dataset" is the same
# repository id used in the push example of Section 4)
hf_dataset_stream = load_dataset("chanel/dataset", split="all_samples", streaming=True)

# Take only the first streamed row and convert it to a plaid sample
plaid_sample = huggingface_bridge.to_plaid_sample(next(iter(hf_dataset_stream)))

show_sample(plaid_sample)