Hugging Face support¶
IMPORTANT NOTICE: THIS CODE IS STILL FUNCTIONAL, BUT IS DEPRECATED. NEW DATA HANDLING DETAILED IN STORAGE DESCRIPTIONS.
This Jupyter Notebook demonstrates various operations involving the Hugging Face bridge:
Converting a plaid dataset to Hugging Face
Generating a Hugging Face dataset with a generator
Converting a Hugging Face dataset to plaid
Saving and Loading Hugging Face datasets
Handling plaid samples from Hugging Face datasets without converting the complete dataset to plaid
Advanced concepts (read speed, memory usage, streaming)
Each section is documented and explained.
# Import necessary libraries and functions
import os, psutil
import tempfile
import shutil
from time import time
from functools import partial
import numpy as np
from Muscat.Bridges.CGNSBridge import MeshToCGNS
from Muscat.MeshTools import MeshCreationTools as MCT
from plaid.bridges import huggingface_bridge
from plaid import Dataset, Sample, ProblemDefinition
from plaid.types import FeatureIdentifier
Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set
In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads
For best performance with OpenMP 3.1 set OMP_PROC_BIND=true
For unit testing set OMP_PROC_BIND=false
/home/docs/checkouts/readthedocs.org/user_builds/plaid-lib/conda/stable/lib/python3.13/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
from .autonotebook import tqdm as notebook_tqdm
# Print Sample util
def show_sample(sample: Sample):
    """Print an overview of a plaid sample.

    Prints, in order: the sample's string representation, its tree (via
    ``sample.show_tree()``), its scalar names and its field names.

    Args:
        sample: The plaid sample to display.
    """
    print(f"sample = {sample}")
    sample.show_tree()
    # The ``=`` specifier echoes the expression text in front of its value.
    print(f"{sample.get_scalar_names() = }")
    print(f"{sample.get_field_names() = }")
# Get_mem util
def get_mem():
    """Return the resident set size (RSS) of the current process, in MB."""
    bytes_per_mb = 1024**2
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / bytes_per_mb
Initialize plaid dataset, infos and problem_definition¶
# Input data
# Geometry shared by every sample: five 2D points and three triangles.
points = np.array([[0.0, 0.0], [1.0, 0.0], [1.0, 1.0], [0.0, 1.0], [0.5, 1.5]])
triangles = np.array([[0, 1, 2], [0, 2, 3], [2, 4, 3]])

dataset = Dataset()

# Identifiers of the three features attached to each sample.
scalar_feat_id = FeatureIdentifier({"type": "scalar", "name": "scalar"})
node_field_feat_id = FeatureIdentifier(
    {"type": "field", "name": "node_field", "location": "Vertex"}
)
cell_field_feat_id = FeatureIdentifier(
    {"type": "field", "name": "cell_field", "location": "CellCenter"}
)

print("Creating meshes dataset...")
for _ in range(3):
    sample = Sample()
    mesh = MCT.CreateMeshOfTriangles(points, triangles)
    sample.add_tree(MeshToCGNS(mesh, exportOriginalIDs=False))

    # Random values: one scalar, one value per point, one value per triangle.
    sample.update_features_from_identifier(
        scalar_feat_id, np.random.randn(), in_place=True
    )
    sample.update_features_from_identifier(
        node_field_feat_id, np.random.rand(len(points)), in_place=True
    )
    sample.update_features_from_identifier(
        cell_field_feat_id, np.random.rand(len(triangles)), in_place=True
    )

    dataset.add_sample(sample)

# Dataset-level metadata.
infos = {
    "legal": {"owner": "Bob", "license": "my_license"},
    "data_production": {"type": "simulation", "physics": "3D example"},
}
dataset.set_infos(infos)

print(f" {dataset = }")
print(f" {infos = }")

# Learning problem: inputs, outputs, task and train/test split indices.
pb_def = ProblemDefinition()
pb_def.add_in_features_identifiers([scalar_feat_id, node_field_feat_id])
pb_def.add_out_features_identifiers([cell_field_feat_id])
pb_def.set_task("regression")
pb_def.set_split({"train": [0, 1], "test": [2]})

print(f" {pb_def = }")
Creating meshes dataset...
dataset = Dataset(3 samples, 1 scalar, 2 fields)
infos = {'legal': {'owner': 'Bob', 'license': 'my_license'}, 'data_production': {'type': 'simulation', 'physics': '3D example'}}
pb_def = ProblemDefinition(in_features_identifiers=[{'type': 'field', 'name': 'node_field', 'location': 'Vertex'}, {'type': 'scalar', 'name': 'scalar'}], out_features_identifiers=[{'type': 'field', 'name': 'cell_field', 'location': 'CellCenter'}], task='regression', split_names=['train', 'test'])
Section 1: Convert plaid datasets to Hugging Face DatasetDict¶
# Sample indices of each split, as registered in the problem definition.
main_splits = {name: pb_def.get_split(name) for name in ("train", "test")}

# The conversion returns the Hugging Face DatasetDict together with the
# constant features (flat_cst) and the metadata dictionary (key_mappings).
hf_datasetdict, flat_cst, key_mappings = (
    huggingface_bridge.plaid_dataset_to_huggingface_datasetdict(dataset, main_splits)
)

print(f"{hf_datasetdict = }")
print(f"{flat_cst = }")
print(f"{key_mappings = }")
Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 2 examples [00:00, 53.59 examples/s]
Generating test split: 0 examples [00:00, ? examples/s]
Generating test split: 1 examples [00:00, 602.02 examples/s]
hf_datasetdict = DatasetDict({
train: Dataset({
features: ['Base_2_2/Zone/CellCenterFields/cell_field', 'Base_2_2/Zone/VertexFields/node_field', 'Global/scalar'],
num_rows: 2
})
test: Dataset({
features: ['Base_2_2/Zone/CellCenterFields/cell_field', 'Base_2_2/Zone/VertexFields/node_field', 'Global/scalar'],
num_rows: 1
})
})
flat_cst = {'train': {'Base_2_2': array([2, 2], dtype=int32), 'Base_2_2/2D': None, 'Base_2_2/2D_times': None, 'Base_2_2/Zone': array([[5, 3, 0]]), 'Base_2_2/Zone/CellCenterFields': None, 'Base_2_2/Zone/CellCenterFields/GridLocation': array(['CellCenter'], dtype='<U10'), 'Base_2_2/Zone/CellCenterFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields/cell_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields_times': None, 'Base_2_2/Zone/Elements_TRI_3': array([5, 0], dtype=int32), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity': array([1, 2, 3, 1, 3, 4, 3, 5, 4]), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange': array([1, 3]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/FamilyName': array(['2D'], dtype='<U2'), 'Base_2_2/Zone/FamilyName_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates': None, 'Base_2_2/Zone/GridCoordinates/CoordinateX': array([0. , 1. , 1. , 0. , 0.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateX_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates/CoordinateY': array([0. , 0. , 1. , 1. 
, 1.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateY_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates_times': None, 'Base_2_2/Zone/VertexFields': None, 'Base_2_2/Zone/VertexFields/GridLocation': array(['Vertex'], dtype='<U6'), 'Base_2_2/Zone/VertexFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields/node_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields_times': None, 'Base_2_2/Zone/ZoneType': array(['Unstructured'], dtype='<U12'), 'Base_2_2/Zone/ZoneType_times': array([ 0., 0., -1.]), 'Base_2_2/Zone_times': array([ 0., 0., -1.]), 'Base_2_2_times': array([ 0., 0., -1.]), 'Global': array([1, 1], dtype=int32), 'Global/scalar_times': array([ 0., 0., -1.]), 'Global_times': array([ 0., 0., -1.])}, 'test': {'Base_2_2': array([2, 2], dtype=int32), 'Base_2_2/2D': None, 'Base_2_2/2D_times': None, 'Base_2_2/Zone': array([[5, 3, 0]]), 'Base_2_2/Zone/CellCenterFields': None, 'Base_2_2/Zone/CellCenterFields/GridLocation': array(['CellCenter'], dtype='<U10'), 'Base_2_2/Zone/CellCenterFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields/cell_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields_times': None, 'Base_2_2/Zone/Elements_TRI_3': array([5, 0], dtype=int32), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity': array([1, 2, 3, 1, 3, 4, 3, 5, 4]), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange': array([1, 3]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/FamilyName': array(['2D'], dtype='<U2'), 'Base_2_2/Zone/FamilyName_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates': None, 'Base_2_2/Zone/GridCoordinates/CoordinateX': array([0. , 1. , 1. , 0. , 0.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateX_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates/CoordinateY': array([0. , 0. 
, 1. , 1. , 1.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateY_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates_times': None, 'Base_2_2/Zone/VertexFields': None, 'Base_2_2/Zone/VertexFields/GridLocation': array(['Vertex'], dtype='<U6'), 'Base_2_2/Zone/VertexFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields/node_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields_times': None, 'Base_2_2/Zone/ZoneType': array(['Unstructured'], dtype='<U12'), 'Base_2_2/Zone/ZoneType_times': array([ 0., 0., -1.]), 'Base_2_2/Zone_times': array([ 0., 0., -1.]), 'Base_2_2_times': array([ 0., 0., -1.]), 'Global': array([1, 1], dtype=int32), 'Global/scalar_times': array([ 0., 0., -1.]), 'Global_times': array([ 0., 0., -1.])}}
key_mappings = {'variable_features': ['Base_2_2/Zone/CellCenterFields/cell_field', 'Base_2_2/Zone/VertexFields/node_field', 'Global/scalar'], 'constant_features': ['Base_2_2', 'Base_2_2/2D', 'Base_2_2/Zone', 'Base_2_2/Zone/CellCenterFields', 'Base_2_2/Zone/CellCenterFields/GridLocation', 'Base_2_2/Zone/Elements_TRI_3', 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity', 'Base_2_2/Zone/Elements_TRI_3/ElementRange', 'Base_2_2/Zone/FamilyName', 'Base_2_2/Zone/GridCoordinates', 'Base_2_2/Zone/GridCoordinates/CoordinateX', 'Base_2_2/Zone/GridCoordinates/CoordinateY', 'Base_2_2/Zone/VertexFields', 'Base_2_2/Zone/VertexFields/GridLocation', 'Base_2_2/Zone/ZoneType', 'Global'], 'cgns_types': {'CGNSLibraryVersion': 'CGNSLibraryVersion_t', 'Base_2_2': 'CGNSBase_t', 'Base_2_2/2D': 'Family_t', 'Base_2_2/Zone': 'Zone_t', 'Base_2_2/Zone/ZoneType': 'ZoneType_t', 'Base_2_2/Zone/GridCoordinates': 'GridCoordinates_t', 'Base_2_2/Zone/GridCoordinates/CoordinateX': 'DataArray_t', 'Base_2_2/Zone/GridCoordinates/CoordinateY': 'DataArray_t', 'Base_2_2/Zone/Elements_TRI_3': 'Elements_t', 'Base_2_2/Zone/Elements_TRI_3/ElementRange': 'IndexRange_t', 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity': 'DataArray_t', 'Base_2_2/Zone/FamilyName': 'FamilyName_t', 'Base_2_2/Zone/VertexFields': 'FlowSolution_t', 'Base_2_2/Zone/VertexFields/GridLocation': 'GridLocation_t', 'Base_2_2/Zone/VertexFields/node_field': 'DataArray_t', 'Base_2_2/Zone/CellCenterFields': 'FlowSolution_t', 'Base_2_2/Zone/CellCenterFields/GridLocation': 'GridLocation_t', 'Base_2_2/Zone/CellCenterFields/cell_field': 'DataArray_t', 'Base_2_2/Time': 'BaseIterativeData_t', 'Base_2_2/Time/IterationValues': 'DataArray_t', 'Base_2_2/Time/TimeValues': 'DataArray_t', 'Global': 'CGNSBase_t', 'Global/Time': 'BaseIterativeData_t', 'Global/Time/IterationValues': 'DataArray_t', 'Global/Time/TimeValues': 'DataArray_t', 'Global/scalar': 'DataArray_t'}}
A partitioning of all the indices is provided in main_splits. The conversion outputs flat_cst and key_mappings, which are central to the Hugging Face support:
flat_cst: constant features dictionary (path → value): a flattened tree containing the leaves of the CGNS trees that are constant throughout the plaid dataset.
key_mappings: metadata dictionary containing keys such as:
variable_features: list of paths for non-constant features.
constant_features: list of paths for constant features.
cgns_types: mapping from paths to CGNS types.
flat_cst and cgns_types are required for reconstructing plaid datasets and samples from the Hugging Face datasets.
Section 2: Generate a Hugging Face dataset with a generator¶
Generators are used to handle large datasets that do not fit in memory:
# Sample indices of each split.
split_ids = {"train": [0, 1], "test": [2]}


def generator_(ids):
    """Yield the samples of ``dataset`` whose indices are listed in ``ids``."""
    for sample_id in ids:
        yield dataset[sample_id]


# One zero-argument generator factory per split. ``partial`` binds the ids
# when the factory is created, so a single function serves every split.
generators = {}
for split_name, ids in split_ids.items():
    generators[split_name] = partial(generator_, ids=ids)

hf_datasetdict, flat_cst, key_mappings = (
    huggingface_bridge.plaid_generator_to_huggingface_datasetdict(generators)
)

print(f"{hf_datasetdict = }")
print(f"{flat_cst = }")
print(f"{key_mappings = }")
Generating train split: 0 examples [00:00, ? examples/s]
Generating train split: 2 examples [00:00, 808.62 examples/s]
Generating test split: 0 examples [00:00, ? examples/s]
Generating test split: 1 examples [00:00, 637.72 examples/s]
hf_datasetdict = DatasetDict({
train: Dataset({
features: ['Base_2_2/Zone/CellCenterFields/cell_field', 'Base_2_2/Zone/VertexFields/node_field', 'Global/scalar'],
num_rows: 2
})
test: Dataset({
features: ['Base_2_2/Zone/CellCenterFields/cell_field', 'Base_2_2/Zone/VertexFields/node_field', 'Global/scalar'],
num_rows: 1
})
})
flat_cst = {'train': {'Base_2_2': array([2, 2], dtype=int32), 'Base_2_2/2D': None, 'Base_2_2/2D_times': None, 'Base_2_2/Zone': array([[5, 3, 0]]), 'Base_2_2/Zone/CellCenterFields': None, 'Base_2_2/Zone/CellCenterFields/GridLocation': array(['CellCenter'], dtype='<U10'), 'Base_2_2/Zone/CellCenterFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields/cell_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields_times': None, 'Base_2_2/Zone/Elements_TRI_3': array([5, 0], dtype=int32), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity': array([1, 2, 3, 1, 3, 4, 3, 5, 4]), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange': array([1, 3]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/FamilyName': array(['2D'], dtype='<U2'), 'Base_2_2/Zone/FamilyName_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates': None, 'Base_2_2/Zone/GridCoordinates/CoordinateX': array([0. , 1. , 1. , 0. , 0.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateX_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates/CoordinateY': array([0. , 0. , 1. , 1. 
, 1.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateY_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates_times': None, 'Base_2_2/Zone/VertexFields': None, 'Base_2_2/Zone/VertexFields/GridLocation': array(['Vertex'], dtype='<U6'), 'Base_2_2/Zone/VertexFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields/node_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields_times': None, 'Base_2_2/Zone/ZoneType': array(['Unstructured'], dtype='<U12'), 'Base_2_2/Zone/ZoneType_times': array([ 0., 0., -1.]), 'Base_2_2/Zone_times': array([ 0., 0., -1.]), 'Base_2_2_times': array([ 0., 0., -1.]), 'Global': array([1, 1], dtype=int32), 'Global/scalar_times': array([ 0., 0., -1.]), 'Global_times': array([ 0., 0., -1.])}, 'test': {'Base_2_2': array([2, 2], dtype=int32), 'Base_2_2/2D': None, 'Base_2_2/2D_times': None, 'Base_2_2/Zone': array([[5, 3, 0]]), 'Base_2_2/Zone/CellCenterFields': None, 'Base_2_2/Zone/CellCenterFields/GridLocation': array(['CellCenter'], dtype='<U10'), 'Base_2_2/Zone/CellCenterFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields/cell_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields_times': None, 'Base_2_2/Zone/Elements_TRI_3': array([5, 0], dtype=int32), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity': array([1, 2, 3, 1, 3, 4, 3, 5, 4]), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange': array([1, 3]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/FamilyName': array(['2D'], dtype='<U2'), 'Base_2_2/Zone/FamilyName_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates': None, 'Base_2_2/Zone/GridCoordinates/CoordinateX': array([0. , 1. , 1. , 0. , 0.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateX_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates/CoordinateY': array([0. , 0. 
, 1. , 1. , 1.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateY_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates_times': None, 'Base_2_2/Zone/VertexFields': None, 'Base_2_2/Zone/VertexFields/GridLocation': array(['Vertex'], dtype='<U6'), 'Base_2_2/Zone/VertexFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields/node_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields_times': None, 'Base_2_2/Zone/ZoneType': array(['Unstructured'], dtype='<U12'), 'Base_2_2/Zone/ZoneType_times': array([ 0., 0., -1.]), 'Base_2_2/Zone_times': array([ 0., 0., -1.]), 'Base_2_2_times': array([ 0., 0., -1.]), 'Global': array([1, 1], dtype=int32), 'Global/scalar_times': array([ 0., 0., -1.]), 'Global_times': array([ 0., 0., -1.])}}
key_mappings = {'variable_features': ['Base_2_2/Zone/CellCenterFields/cell_field', 'Base_2_2/Zone/VertexFields/node_field', 'Global/scalar'], 'constant_features': ['Base_2_2', 'Base_2_2/2D', 'Base_2_2/Zone', 'Base_2_2/Zone/CellCenterFields', 'Base_2_2/Zone/CellCenterFields/GridLocation', 'Base_2_2/Zone/Elements_TRI_3', 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity', 'Base_2_2/Zone/Elements_TRI_3/ElementRange', 'Base_2_2/Zone/FamilyName', 'Base_2_2/Zone/GridCoordinates', 'Base_2_2/Zone/GridCoordinates/CoordinateX', 'Base_2_2/Zone/GridCoordinates/CoordinateY', 'Base_2_2/Zone/VertexFields', 'Base_2_2/Zone/VertexFields/GridLocation', 'Base_2_2/Zone/ZoneType', 'Global'], 'cgns_types': {'CGNSLibraryVersion': 'CGNSLibraryVersion_t', 'Base_2_2': 'CGNSBase_t', 'Base_2_2/2D': 'Family_t', 'Base_2_2/Zone': 'Zone_t', 'Base_2_2/Zone/ZoneType': 'ZoneType_t', 'Base_2_2/Zone/GridCoordinates': 'GridCoordinates_t', 'Base_2_2/Zone/GridCoordinates/CoordinateX': 'DataArray_t', 'Base_2_2/Zone/GridCoordinates/CoordinateY': 'DataArray_t', 'Base_2_2/Zone/Elements_TRI_3': 'Elements_t', 'Base_2_2/Zone/Elements_TRI_3/ElementRange': 'IndexRange_t', 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity': 'DataArray_t', 'Base_2_2/Zone/FamilyName': 'FamilyName_t', 'Base_2_2/Zone/VertexFields': 'FlowSolution_t', 'Base_2_2/Zone/VertexFields/GridLocation': 'GridLocation_t', 'Base_2_2/Zone/VertexFields/node_field': 'DataArray_t', 'Base_2_2/Zone/CellCenterFields': 'FlowSolution_t', 'Base_2_2/Zone/CellCenterFields/GridLocation': 'GridLocation_t', 'Base_2_2/Zone/CellCenterFields/cell_field': 'DataArray_t', 'Base_2_2/Time': 'BaseIterativeData_t', 'Base_2_2/Time/IterationValues': 'DataArray_t', 'Base_2_2/Time/TimeValues': 'DataArray_t', 'Global': 'CGNSBase_t', 'Global/Time': 'BaseIterativeData_t', 'Global/Time/IterationValues': 'DataArray_t', 'Global/Time/TimeValues': 'DataArray_t', 'Global/scalar': 'DataArray_t'}}
In this example, the generators are not very useful since the plaid dataset is already loaded in memory. In real settings, one can create generators in the following way to prevent loading all the data beforehand:
generators = {}
for split_name, ids in main_splits.items():
    # Bind split_name and ids as default arguments: a plain closure would
    # see the values of the last loop iteration once the generator runs.
    def generator_(split_name=split_name, ids=ids):
        for sample_id in ids:
            # `load` and `convert_to_sample` are placeholders for user code;
            # the path is built per split and per sample so that each
            # iteration reads a different simulation.
            loaded_simulation_data = load(f"path/to/{split_name}/{sample_id}")
            sample = convert_to_sample(loaded_simulation_data)
            yield sample

    generators[split_name] = generator_
Section 3: Convert a Hugging Face dataset to plaid¶
# CGNS type of every tree path, needed to rebuild the samples.
cgns_types = key_mappings["cgns_types"]

# Rebuild a plaid dataset from the "train" split and its constant features.
train_hf_dataset = hf_datasetdict["train"]
train_flat_cst = flat_cst["train"]
dataset_2 = huggingface_bridge.to_plaid_dataset(
    train_hf_dataset, train_flat_cst, cgns_types
)

print()
print(f"{dataset_2 = }")
dataset_2 = Dataset(2 samples, 1 scalar, 2 fields)
Section 4: Save and Load Hugging Face datasets¶
From and to disk¶
Saving and loading datasetdict, infos, tree_struct and problem definition to disk:
with tempfile.TemporaryDirectory() as out_dir:
    # Save every component of the dataset into the temporary directory.
    huggingface_bridge.save_dataset_dict_to_disk(out_dir, hf_datasetdict)
    huggingface_bridge.save_infos_to_disk(out_dir, infos)
    huggingface_bridge.save_tree_struct_to_disk(out_dir, flat_cst, key_mappings)
    huggingface_bridge.save_problem_definition_to_disk(out_dir, "task_1", pb_def)

    # Load everything back. flat_cst and key_mappings are overwritten with
    # the versions read from disk.
    loaded_hf_datasetdict = huggingface_bridge.load_dataset_from_disk(out_dir)
    loaded_infos = huggingface_bridge.load_infos_from_disk(out_dir)
    flat_cst, key_mappings = huggingface_bridge.load_tree_struct_from_disk(out_dir)
    loaded_pb_def = huggingface_bridge.load_problem_definition_from_disk(
        out_dir, "task_1"
    )
    # No manual cleanup is needed: the TemporaryDirectory context manager
    # removes out_dir and its contents when the block exits.

print(f"{loaded_hf_datasetdict = }")
print(f"{loaded_infos = }")
print(f"{flat_cst = }")
print(f"{key_mappings = }")
print(f"{loaded_pb_def = }")
Saving the dataset (0/1 shards): 0%| | 0/2 [00:00<?, ? examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2/2 [00:00<00:00, 461.93 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2/2 [00:00<00:00, 417.41 examples/s]
Saving the dataset (0/1 shards): 0%| | 0/1 [00:00<?, ? examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 399.99 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 333.17 examples/s]
loaded_hf_datasetdict = DatasetDict({
train: Dataset({
features: ['Base_2_2/Zone/CellCenterFields/cell_field', 'Base_2_2/Zone/VertexFields/node_field', 'Global/scalar'],
num_rows: 2
})
test: Dataset({
features: ['Base_2_2/Zone/CellCenterFields/cell_field', 'Base_2_2/Zone/VertexFields/node_field', 'Global/scalar'],
num_rows: 1
})
})
loaded_infos = {'legal': {'owner': 'Bob', 'license': 'my_license'}, 'data_production': {'type': 'simulation', 'physics': '3D example'}}
flat_cst = {'train': {'Base_2_2': array([2, 2], dtype=int32), 'Base_2_2/2D': None, 'Base_2_2/2D_times': None, 'Base_2_2/Zone': array([[5, 3, 0]]), 'Base_2_2/Zone/CellCenterFields': None, 'Base_2_2/Zone/CellCenterFields/GridLocation': array(['CellCenter'], dtype='<U10'), 'Base_2_2/Zone/CellCenterFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields/cell_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields_times': None, 'Base_2_2/Zone/Elements_TRI_3': array([5, 0], dtype=int32), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity': array([1, 2, 3, 1, 3, 4, 3, 5, 4]), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange': array([1, 3]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/FamilyName': array(['2D'], dtype='<U2'), 'Base_2_2/Zone/FamilyName_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates': None, 'Base_2_2/Zone/GridCoordinates/CoordinateX': array([0. , 1. , 1. , 0. , 0.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateX_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates/CoordinateY': array([0. , 0. , 1. , 1. 
, 1.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateY_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates_times': None, 'Base_2_2/Zone/VertexFields': None, 'Base_2_2/Zone/VertexFields/GridLocation': array(['Vertex'], dtype='<U6'), 'Base_2_2/Zone/VertexFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields/node_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields_times': None, 'Base_2_2/Zone/ZoneType': array(['Unstructured'], dtype='<U12'), 'Base_2_2/Zone/ZoneType_times': array([ 0., 0., -1.]), 'Base_2_2/Zone_times': array([ 0., 0., -1.]), 'Base_2_2_times': array([ 0., 0., -1.]), 'Global': array([1, 1], dtype=int32), 'Global/scalar_times': array([ 0., 0., -1.]), 'Global_times': array([ 0., 0., -1.])}, 'test': {'Base_2_2': array([2, 2], dtype=int32), 'Base_2_2/2D': None, 'Base_2_2/2D_times': None, 'Base_2_2/Zone': array([[5, 3, 0]]), 'Base_2_2/Zone/CellCenterFields': None, 'Base_2_2/Zone/CellCenterFields/GridLocation': array(['CellCenter'], dtype='<U10'), 'Base_2_2/Zone/CellCenterFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields/cell_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/CellCenterFields_times': None, 'Base_2_2/Zone/Elements_TRI_3': array([5, 0], dtype=int32), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity': array([1, 2, 3, 1, 3, 4, 3, 5, 4]), 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange': array([1, 3]), 'Base_2_2/Zone/Elements_TRI_3/ElementRange_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/Elements_TRI_3_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/FamilyName': array(['2D'], dtype='<U2'), 'Base_2_2/Zone/FamilyName_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates': None, 'Base_2_2/Zone/GridCoordinates/CoordinateX': array([0. , 1. , 1. , 0. , 0.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateX_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates/CoordinateY': array([0. , 0. 
, 1. , 1. , 1.5]), 'Base_2_2/Zone/GridCoordinates/CoordinateY_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/GridCoordinates_times': None, 'Base_2_2/Zone/VertexFields': None, 'Base_2_2/Zone/VertexFields/GridLocation': array(['Vertex'], dtype='<U6'), 'Base_2_2/Zone/VertexFields/GridLocation_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields/node_field_times': array([ 0., 0., -1.]), 'Base_2_2/Zone/VertexFields_times': None, 'Base_2_2/Zone/ZoneType': array(['Unstructured'], dtype='<U12'), 'Base_2_2/Zone/ZoneType_times': array([ 0., 0., -1.]), 'Base_2_2/Zone_times': array([ 0., 0., -1.]), 'Base_2_2_times': array([ 0., 0., -1.]), 'Global': array([1, 1], dtype=int32), 'Global/scalar_times': array([ 0., 0., -1.]), 'Global_times': array([ 0., 0., -1.])}}
key_mappings = {'variable_features': ['Base_2_2/Zone/CellCenterFields/cell_field', 'Base_2_2/Zone/VertexFields/node_field', 'Global/scalar'], 'constant_features': ['Base_2_2', 'Base_2_2/2D', 'Base_2_2/Zone', 'Base_2_2/Zone/CellCenterFields', 'Base_2_2/Zone/CellCenterFields/GridLocation', 'Base_2_2/Zone/Elements_TRI_3', 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity', 'Base_2_2/Zone/Elements_TRI_3/ElementRange', 'Base_2_2/Zone/FamilyName', 'Base_2_2/Zone/GridCoordinates', 'Base_2_2/Zone/GridCoordinates/CoordinateX', 'Base_2_2/Zone/GridCoordinates/CoordinateY', 'Base_2_2/Zone/VertexFields', 'Base_2_2/Zone/VertexFields/GridLocation', 'Base_2_2/Zone/ZoneType', 'Global'], 'cgns_types': {'CGNSLibraryVersion': 'CGNSLibraryVersion_t', 'Base_2_2': 'CGNSBase_t', 'Base_2_2/2D': 'Family_t', 'Base_2_2/Zone': 'Zone_t', 'Base_2_2/Zone/ZoneType': 'ZoneType_t', 'Base_2_2/Zone/GridCoordinates': 'GridCoordinates_t', 'Base_2_2/Zone/GridCoordinates/CoordinateX': 'DataArray_t', 'Base_2_2/Zone/GridCoordinates/CoordinateY': 'DataArray_t', 'Base_2_2/Zone/Elements_TRI_3': 'Elements_t', 'Base_2_2/Zone/Elements_TRI_3/ElementRange': 'IndexRange_t', 'Base_2_2/Zone/Elements_TRI_3/ElementConnectivity': 'DataArray_t', 'Base_2_2/Zone/FamilyName': 'FamilyName_t', 'Base_2_2/Zone/VertexFields': 'FlowSolution_t', 'Base_2_2/Zone/VertexFields/GridLocation': 'GridLocation_t', 'Base_2_2/Zone/VertexFields/node_field': 'DataArray_t', 'Base_2_2/Zone/CellCenterFields': 'FlowSolution_t', 'Base_2_2/Zone/CellCenterFields/GridLocation': 'GridLocation_t', 'Base_2_2/Zone/CellCenterFields/cell_field': 'DataArray_t', 'Base_2_2/Time': 'BaseIterativeData_t', 'Base_2_2/Time/IterationValues': 'DataArray_t', 'Base_2_2/Time/TimeValues': 'DataArray_t', 'Global': 'CGNSBase_t', 'Global/Time': 'BaseIterativeData_t', 'Global/Time/IterationValues': 'DataArray_t', 'Global/Time/TimeValues': 'DataArray_t', 'Global/scalar': 'DataArray_t'}}
loaded_pb_def = ProblemDefinition(in_features_identifiers=[{'location': 'Vertex', 'name': 'node_field', 'type': 'field'}, {'name': 'scalar', 'type': 'scalar'}], out_features_identifiers=[{'location': 'CellCenter', 'name': 'cell_field', 'type': 'field'}], task='regression')
From and to the Hugging Face hub¶
Find below examples of instructions (not executed by this notebook).
Load from hub¶
To load datasetdict, infos and problem_definitions from the hub:
huggingface_bridge.load_dataset_from_hub("chanel/dataset", *args, **kwargs)
huggingface_bridge.load_hf_infos_from_hub("chanel/dataset")
huggingface_bridge.load_hf_problem_definition_from_hub("chanel/dataset", "name")
Partial retrieval is possible along samples:
# Retrieve only the first 10 samples of the "train" split.
huggingface_bridge.load_dataset_from_hub("chanel/dataset", split="train[:10]", *args, **kwargs)
Streaming allows handling very large datasets
# Stream the split instead of downloading it entirely, converting the
# samples to plaid one at a time.
hf_dataset_streamed = huggingface_bridge.load_dataset_from_hub("chanel/dataset", split="split", streaming=True, *args, **kwargs)
for hf_sample in hf_dataset_streamed:
    # NOTE(review): the executed example in Section 5 calls
    # to_plaid_sample(dataset, index, flat_cst["train"], cgns_types), i.e. with
    # a dataset plus an index and a per-split flat_cst. This call passes a
    # single sample and the whole flat_cst; confirm it matches the current
    # signature of to_plaid_sample.
    sample = huggingface_bridge.to_plaid_sample(hf_sample, flat_cst, cgns_types)
Native HF datasets commands are also possible:
dataset_train = load_dataset("chanel/dataset", split="train")
dataset_train = load_dataset("chanel/dataset", split="train", streaming=True)
dataset_train_extract = load_dataset("chanel/dataset", split="train[:10]")
If you are behind a proxy and relying on a private mirror, the function load_dataset_from_hub works provided the following environment variables are set:
HF_ENDPOINT to your private mirror address
CURL_CA_BUNDLE to your trusted CA certificates
HF_HOME to a shared cache directory if needed
Push to the hub¶
To push a dataset to the Hub, you need a Hugging Face account with a configured access token.
First, log in with the Hugging Face CLI:
huggingface-cli login
and enter your access token.
Then, the following Python instructions push the datasetdict, infos, tree structure and problem definitions to the hub:
# Push each component to the same hub repository. hf_datasetdict, infos,
# flat_cst, key_mappings and pb_def are the objects built earlier in this
# notebook.
huggingface_bridge.push_dataset_dict_to_hub("chanel/dataset", hf_datasetdict)
huggingface_bridge.push_infos_to_hub("chanel/dataset", infos)
huggingface_bridge.push_tree_struct_to_hub("chanel/dataset", flat_cst, key_mappings)
huggingface_bridge.push_problem_definition_to_hub("chanel/dataset", "location", pb_def)
The dataset card can then be customized online, on the dataset repo page directly.
Section 5: Handle plaid samples from Hugging Face datasets without converting the complete dataset to plaid¶
To fully exploit the optimized data handling of the Hugging Face datasets library, it is possible to extract information from the Hugging Face dataset without converting it to plaid.
Get the first sample of the first split
hf_sample = hf_datasetdict["train"][0]
print(f"{hf_sample = }")
hf_sample = {'Base_2_2/Zone/CellCenterFields/cell_field': [0.1876419633626938, 0.454351007938385, 0.8176311254501343], 'Base_2_2/Zone/VertexFields/node_field': [0.11778436601161957, 0.18116936087608337, 0.443500280380249, 0.18745622038841248, 0.9817090630531311], 'Global/scalar': [1.9853718280792236]}
We notice that hf_sample is not a plaid sample, but a dict containing the variable features of the dataset, with keys being the flattened paths of the CGNS tree. The underlying data is stored as binary objects efficiently handled by Hugging Face datasets. It can be converted into a plaid sample using a specific constructor relying on a pydantic validator, and the required flat_cst and cgns_types.
plaid_sample = huggingface_bridge.to_plaid_sample(
hf_datasetdict["train"], 0, flat_cst["train"], cgns_types
)
print("Variable features:")
for t in plaid_sample.get_all_time_values():
for path in key_mappings["variable_features"]:
print(path, plaid_sample.get_feature_by_path(path=path, time=t))
print("-------")
print("Sample and CGNS tree:")
show_sample(plaid_sample)
Variable features:
Base_2_2/Zone/CellCenterFields/cell_field [0.18764196 0.454351 0.8176311 ]
Base_2_2/Zone/VertexFields/node_field [0.11778437 0.18116936 0.44350028 0.18745622 0.98170906]
Global/scalar [1.9853718]
-------
Sample and CGNS tree:
sample = Sample(1 global, 1 timestamp, 2 fields)
Base_2_2 : (2,) [2 2] int32 CGNSBase_t
|_ Zone : (1, 3) [[5 3 0]] int64 Zone_t
|_ Elements_TRI_3 : (2,) [5 0] int32 Elements_t
|_ ElementConnectivity : (9,) [1 ... 4] int64 DataArray_t
|_ ElementRange : (2,) [1 3] int64 IndexRange_t
|_ FamilyName : (2,) 2D |S1 FamilyName_t
|_ ZoneType : (12,) Unstructured |S1 ZoneType_t
|_ CellCenterFields : None FlowSolution_t
|_ GridLocation : (10,) CellCenter |S1 GridLocation_t
|_ cell_field : (3,) [0.18764196 0.454351 0.8176311 ] float32 DataArray_t
|_ GridCoordinates : None GridCoordinates_t
|_ CoordinateX : (5,) [0. 1. 1. 0. 0.5] float64 DataArray_t
|_ CoordinateY : (5,) [0. 0. 1. 1. 1.5] float64 DataArray_t
|_ VertexFields : None FlowSolution_t
|_ GridLocation : (6,) Vertex |S1 GridLocation_t
|_ node_field : (5,) [0.11778437 0.18116936 0.44350028 0.18745622 0.98170906] float32 DataArray_t
|_ Time : (1,) [1] int32 BaseIterativeData_t
|_ IterationValues : (1,) [1] int32 DataArray_t
|_ TimeValues : (1,) [0.] float64 DataArray_t
|_ 2D : None Family_t
Global : (2,) [1 1] int32 CGNSBase_t
|_ scalar : (1,) [1.9853718] float32 DataArray_t
|_ Time : (1,) [1] int32 BaseIterativeData_t
|_ IterationValues : (1,) [1] int32 DataArray_t
|_ TimeValues : (1,) [0.] float64 DataArray_t
CGNSLibraryVersion : (1,) [4.] float32 CGNSLibraryVersion_t
sample.get_scalar_names() = ['scalar']
sample.get_field_names() = ['cell_field', 'node_field']
Very large datasets that do not fit on disk can be streamed directly from the Hugging Face hub:
# Stream the "train" split directly from the Hugging Face hub.
hf_dataset_stream = load_dataset("chanel/dataset", split="train", streaming=True)
# Convert the first streamed row into a plaid sample.
# NOTE(review): other snippets on this page call
# to_plaid_sample(dataset, index, flat_cst[split], cgns_types) — confirm that
# this (row, flat_cst, cgns_types) form is supported for streamed rows.
plaid_sample = huggingface_bridge.to_plaid_sample(next(iter(hf_dataset_stream)), flat_cst, cgns_types)
If you are behind a proxy:
# Same streaming access, going through the plaid bridge loader instead of
# calling load_dataset directly.
hf_dataset_stream = huggingface_bridge.load_dataset_from_hub("chanel/dataset", split="train", streaming=True)
# Convert the first streamed row into a plaid sample.
# NOTE(review): other snippets on this page call
# to_plaid_sample(dataset, index, flat_cst[split], cgns_types) — confirm that
# this (row, flat_cst, cgns_types) form is supported for streamed rows.
plaid_sample = huggingface_bridge.to_plaid_sample(next(iter(hf_dataset_stream)), flat_cst, cgns_types)
Section 6: Advanced concepts¶
In this section, we investigate concepts to better exploit the datasets made available on Hugging Face, by looking into read speed and memory usage. The commands are not executed by this notebook. You can copy/paste the following code to execute it, but be mindful that it will download a 235MB dataset.
# Hub repository used for the measurements below, and the split names
# referenced by the later snippets.
repo_id = "fabiencasenave/Tensile2d_DO_NOT_DELETE"
split_names = ["train_500", "test", "OOD"]
# Load the Hugging Face datasets for this repository through the plaid bridge.
hf_dataset_dict = huggingface_bridge.load_dataset_from_hub(repo_id)
We investigate the time and memory needed to instantiate the plaid dataset dict from the repo_id, now that the hf datasets have been loaded in cache:
# Record resident memory (MB) and wall-clock time before the call.
init_ram = get_mem()
start = time()
dataset_dict = huggingface_bridge.instantiate_plaid_datasetdict_from_hub(repo_id)
elapsed = time() - start
# Report elapsed seconds and the memory growth since init_ram.
print(f"Time to instantiate plaid dataset dict from cache: {elapsed:.6g} s, RAM usage increase: {get_mem()-init_ram} MB")
>> Time to instantiate plaid dataset dict from cache: 1.37948 s, RAM usage increase: 22.5 MB
We notice the RAM usage is lower than the size of the dataset: all the variable-shape 1DArrays and constant-shape 2DArrays in the samples are instantiated in no-copy mode.
We now investigate the possible gains when handling the datasets directly. First, bypassing cache checks and constructing plaid dataset from an instantiated HF dataset is much faster:
# Fetch the tree structure, problem definition and infos from the hub once,
# outside the timed region.
flat_cst, key_mappings = huggingface_bridge.load_tree_struct_from_hub(repo_id)
pb_def = huggingface_bridge.load_problem_definition_from_hub(repo_id, "task_1")
infos = huggingface_bridge.load_infos_from_hub(repo_id)
cgns_types = key_mappings["cgns_types"]
# Work on the first split only.
hf_dataset = hf_dataset_dict[split_names[0]]
# Time only the conversion of the already-loaded HF dataset to a plaid dataset.
init_ram = get_mem()
start = time()
# NOTE(review): other snippets on this page index flat_cst by split name
# (flat_cst["train"], flat_cst[split_names[0]]) — confirm whether the whole
# flat_cst is the expected argument here.
dataset = huggingface_bridge.to_plaid_dataset(hf_dataset, flat_cst, cgns_types)
elapsed = time() - start
print(f"Time to build dataset on split {split_names[0]}: {elapsed:.6g} s, RAM usage increase: {get_mem()-init_ram} MB")
>> Time to build dataset on split train_500: 0.173115 s, RAM usage increase: 16.3125 MB
It is possible to further remove overheads by accessing directly 1DArrays in the arrow table of the HF datasets in no-copy mode:
# Read one variable-size 1D field for every row straight from the Arrow table
# backing the HF dataset, measuring time and memory growth.
init_ram = get_mem()
start = time()
data = {}
for i in range(len(hf_dataset)):
    # zero_copy_only=True makes pyarrow raise instead of silently copying,
    # so a successful read is guaranteed to be a view on the Arrow buffer.
    data[i] = hf_dataset.data["Base_2_2/Zone/PointData/sig12"][i].values.to_numpy(zero_copy_only=True)
elapsed = time() - start
print(f"Time to read 1D fields of variable size on the complete split {split_names[0]}: {elapsed:.6g} s, RAM usage increase: {get_mem()-init_ram} MB")
>> Time to read 1D fields of variable size on the complete split train_500: 0.0021801 s, RAM usage increase: 0.375 MB
An efficient way to retrieve the output feature directly from the pyarrow table is:
# Retrieve every output feature of the problem definition for each row of the
# "train" split, reading directly from the underlying Arrow table.
init_ram = get_mem()
start = time()
for i in tqdm(
    range(len(hf_dataset_new["train"])), desc="Retrieving features"
):
    for path in pb_def.get_out_features_identifiers():
        # zero_copy_only=False lets pyarrow copy when a view is not possible.
        hf_dataset_new["train"].data[path][i].values.to_numpy(
            zero_copy_only=False
        )
elapsed = time() - start
print(
    f"Time to retrieve out features on train: {elapsed:.6g} s, RAM usage increase: {get_mem() - init_ram} MB"
)
>> Time to retrieve out features on train: 0.0400107 s, RAM usage increase: 0.27734375 MB
Notice that doing this for time-dependent datasets would require manual handling of the time dimension.
A robust way to retrieve input and output features from a HF dataset relying on the to_plaid_sample constructor is:
# Convert each row of the first split to a plaid sample, then read every input
# and output feature of the problem definition at every mesh time.
init_ram = get_mem()
start = time()
for i in tqdm(
    range(len(hf_dataset_new[split_names[0]])), desc="Retrieving all variable features"
):
    sample = huggingface_bridge.to_plaid_sample(
        hf_dataset_new[split_names[0]],
        i,
        flat_cst[split_names[0]],
        cgns_types,
        enforce_shapes=False,
    )
    # Features are fetched per time value so time-dependent samples are
    # handled as well.
    for t in sample.get_all_mesh_times():
        for path in pb_def.get_in_features_identifiers():
            sample.get_feature_by_path(path=path, time=t)
        for path in pb_def.get_out_features_identifiers():
            sample.get_feature_by_path(path=path, time=t)
elapsed = time() - start
print(
    f"Time to retrieve in and out features on train: {elapsed:.6g} s, RAM usage increase: {get_mem() - init_ram} MB"
)
>> Time to retrieve in and out features on train: 0.401273 s, RAM usage increase: 17.72265625 MB
Notice that converting first to plaid samples incurs some overhead, but this method is robust and works for time-dependent datasets as well.