{ "cells": [
{ "cell_type": "markdown", "metadata": {}, "source": [
  "# Pipeline Examples\n",
  "\n",
  "This notebook demonstrates the end-to-end process of building a machine learning pipeline using PLAID datasets and PLAID’s scikit-learn-compatible blocks."
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
  "## 📦 Imports"
] },
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [
  "import warnings\n",
  "warnings.filterwarnings('ignore', module='sklearn')\n",
  "warnings.filterwarnings(\"ignore\", message=\".*IProgress not found.*\")\n",
  "\n",
  "import os\n",
  "# OpenMP thread-placement settings, set before the numerical libraries below are imported.\n",
  "os.environ[\"OMP_PROC_BIND\"] = \"spread\"\n",
  "os.environ[\"OMP_PLACES\"] = \"threads\"\n",
  "\n",
  "from pathlib import Path\n",
  "\n",
  "import yaml\n",
  "import numpy as np\n",
  "import optuna\n",
  "\n",
  "from datasets.utils.logging import disable_progress_bar\n",
  "from datasets import load_dataset\n",
  "\n",
  "from sklearn.base import clone\n",
  "from sklearn.pipeline import Pipeline\n",
  "\n",
  "from sklearn.decomposition import PCA\n",
  "from sklearn.preprocessing import MinMaxScaler\n",
  "from sklearn.gaussian_process import GaussianProcessRegressor\n",
  "from sklearn.gaussian_process.kernels import Matern\n",
  "from sklearn.multioutput import MultiOutputRegressor\n",
  "\n",
  "from sklearn.model_selection import KFold, GridSearchCV\n",
  "\n",
  "from plaid.bridges.huggingface_bridge import huggingface_dataset_to_plaid, huggingface_description_to_problem_definition\n",
  "from plaid.pipelines.sklearn_block_wrappers import WrappedPlaidSklearnTransformer, WrappedPlaidSklearnRegressor\n",
  "from plaid.pipelines.plaid_blocks import PlaidTransformedTargetRegressor, PlaidColumnTransformer\n",
  "from mmgp.pipelines.mmgp_blocks import MMGPPreparer, MMGPTransformer, renumber_mesh_for_parametrization, floater_mesh_parametrization\n",
  "from Muscat.Containers.MeshGraphTools import FloaterMeshParametrization, RenumberMeshForParametrization\n",
  "\n",
  "\n",
  "disable_progress_bar()\n",
  "# os.cpu_count() returns None when the CPU count cannot be determined; fall back to 1.\n",
  "n_processes = min(max(1, os.cpu_count() or 1), 4)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
  "## 🚀 MMGP for `U1` field prediction of `Tensile2d` dataset\n",
  "\n",
  "Key steps covered:\n",
  "\n",
  "- **Loading and preparing the PLAID dataset** using Hugging Face integration and PLAID’s dataset classes \n",
  "- **Standardizing features** with PLAID-wrapped scikit-learn transformers for scalars and fields \n",
  "- **Dimensionality reduction** of the `U1` field via Principal Component Analysis (PCA) to reduce output complexity \n",
  "- **Regression modeling** of PCA coefficients from scalar inputs using Gaussian Process regression \n",
  "- **Pipeline assembly** combining transformations and regressors into a single scikit-learn-compatible workflow \n",
  "- **Hyperparameter tuning** using scikit-learn’s `GridSearchCV`\n",
  "- **Model evaluation** using cross-validation and appropriate metrics \n",
  "- **Best practices** for working with PLAID datasets and pipelines in a reproducible and modular manner"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
  "## 📥 Load Dataset\n",
  "\n",
  "We load the `Tensile2d` dataset from Hugging Face and restrict ourselves to its first 6 samples (`all_samples[:6]`)."
] },
{ "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [
  "hf_dataset = load_dataset(\"PLAID-datasets/Tensile2d\", split=\"all_samples[:6]\")\n",
  "# Use the CPU-aware process count from the imports cell instead of a hard-coded value.\n",
  "dataset_train, _ = huggingface_dataset_to_plaid(hf_dataset, processes_number = n_processes, verbose = False)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
  "## 🔧 Pipeline configuration\n",
  "\n",
  "The keyword arguments of the pipeline blocks, including their feature identifiers, are read from `config_pipeline.yml`."
] },
{ "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [
  "# `__file__` is not defined in an interactive kernel, hence the NameError fallback\n",
  "# to a path relative to the current working directory.\n",
  "try:\n",
  "    filename = Path(__file__).parent.parent.parent / \"docs\" / \"source\" / \"notebooks\" / \"config_pipeline.yml\"\n",
  "except NameError:\n",
  "    filename = \"config_pipeline.yml\"\n",
  "\n",
  "with open(filename, 'r', encoding='utf-8') as f:\n",
  "    config = yaml.safe_load(f)\n",
  "\n",
  "all_feature_id = (\n",
  "    config['input_scalar_scaler']['in_features_identifiers']\n",
  "    + config['pca_nodes']['in_features_identifiers']\n",
  "    + config['pca_u1']['in_features_identifiers']\n",
  ")"
] },
{ "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [
  "# Keep only the features referenced by the pipeline configuration.\n",
  "dataset_train = dataset_train.from_features_identifier(all_feature_id)\n",
  "print(\"dataset_train:\", dataset_train)\n",
  "print(\"scalar names =\", dataset_train.get_scalar_names())\n",
  "print(\"field names =\", dataset_train.get_field_names())"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
  "## 🧱 Build the pipeline\n",
  "\n",
  "The pipeline chains a `preprocessor` (the `MMGPPreparer`, a `MinMaxScaler` on the input scalars and the `pca_nodes` block) with a `PlaidTransformedTargetRegressor` that fits Gaussian process regressors on the output of the `pca_u1` block."
] },
{ "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [
  "def morphing(mesh):\n",
  "    \"\"\"Return the Floater parametrization of a renumbered copy of `mesh`.\n",
  "\n",
  "    The mesh is renumbered with `RenumberMeshForParametrization` (not in place),\n",
  "    its node and element fields are dropped, and the result is passed to\n",
  "    `FloaterMeshParametrization` together with the boundary count returned by\n",
  "    the renumbering step.\n",
  "    \"\"\"\n",
  "    mesh_renumb, _, n_boundary = RenumberMeshForParametrization(\n",
  "        mesh, inPlace=False)\n",
  "    # Two distinct dicts: a chained assignment would make both attributes\n",
  "    # reference the same object.\n",
  "    mesh_renumb.elemFields = {}\n",
  "    mesh_renumb.nodeFields = {}\n",
  "    morphed_mesh, _ = FloaterMeshParametrization(\n",
  "        mesh_renumb, n_boundary)\n",
  "    return morphed_mesh"
] },
{ "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [
  "preparator = MMGPPreparer(common_mesh_id = 1, morphing = morphing)\n",
  "\n",
  "input_scalar_scaler = WrappedPlaidSklearnTransformer(MinMaxScaler(), **config['input_scalar_scaler'])\n",
  "\n",
  "nodes_preprocessor = Pipeline(\n",
  "    steps=[\n",
  "        (\"mmgp_nodes_transf\", MMGPTransformer(**config['mmgp_nodes_transf'])),\n",
  "        ('pca_nodes', WrappedPlaidSklearnTransformer(PCA(n_components=4), **config['pca_nodes'])),\n",
  "    ]\n",
  ")\n",
  "\n",
  "column_preprocessor = PlaidColumnTransformer(\n",
  "    [\n",
  "        ('input_scalar_scaler', input_scalar_scaler),\n",
  "        ('nodes_preprocessor', nodes_preprocessor),\n",
  "    ]\n",
  ")\n",
  "\n",
  "preprocessor = Pipeline(\n",
  "    steps=[\n",
  "        (\"preparator\", preparator),\n",
  "        ('column_preprocessor', column_preprocessor),\n",
  "    ]\n",
  ")\n",
  "\n",
  "preprocessor"
] },
{ "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [
  "\n",
  "kernel = Matern(length_scale_bounds=(1e-8, 1e8), nu = 2.5)\n",
  "\n",
  "gpr = GaussianProcessRegressor(\n",
  "    kernel=kernel,\n",
  "    optimizer='fmin_l_bfgs_b',\n",
  "    n_restarts_optimizer=1,\n",
  "    random_state=42)\n",
  "\n",
  "reg = MultiOutputRegressor(gpr)\n",
  "\n",
  "def length_scale_init(X):\n",
  "    \"\"\"Return an array of ones with one entry per column of `X`.\"\"\"\n",
  "    return np.ones(X.shape[1])\n",
  "\n",
  "dynamics_params_factory = {'estimator__kernel__length_scale':length_scale_init}\n",
  "\n",
  "regressor = WrappedPlaidSklearnRegressor(reg, **config['regressor_mach'], dynamics_params_factory = dynamics_params_factory)\n",
  "\n",
  "postprocessor = Pipeline(\n",
  "    steps=[\n",
  "        (\"mmgp_u1_transf\", MMGPTransformer(**config['mmgp_u1_transf'])),\n",
  "        ('pca_u1', WrappedPlaidSklearnTransformer(PCA(n_components=4), **config['pca_u1'])),\n",
  "    ]\n",
  ")\n",
  "\n",
  "\n",
  "target_regressor = PlaidTransformedTargetRegressor(\n",
  "    regressor=regressor,\n",
  "    transformer=postprocessor,\n",
  "    # out_features_identifiers = config['pca_u1']['in_features_identifiers']\n",
  ")\n",
  "target_regressor"
] },
{ "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [
  "pipeline = Pipeline(\n",
  "    steps=[\n",
  "        (\"preprocessor\", preprocessor),\n",
  "        (\"regressor\", target_regressor),\n",
  "    ]\n",
  ")\n",
  "pipeline"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
  "## 🎯 Fit and predict"
] },
{ "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [
  "pipeline.fit(dataset_train)"
] },
{ "cell_type": "code", "execution_count": 28, "metadata":
{}, "outputs": [], "source": [
  "dataset_pred = pipeline.predict(dataset_train)"
] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [
  "def print_u1_relative_errors(dataset_pred, dataset_ref):\n",
  "    \"\"\"Print, for every sample, the relative error on the `U1` field.\n",
  "\n",
  "    The error is `norm(pred - ref) / norm(ref)`, computed with `np.linalg.norm`\n",
  "    on the arrays returned by `get_field(\"U1\")`. Samples are addressed by the\n",
  "    indices `0 .. len(dataset_pred) - 1` in both datasets.\n",
  "    \"\"\"\n",
  "    for index in range(len(dataset_pred)):\n",
  "        u1_pred = dataset_pred[index].get_field(\"U1\")\n",
  "        u1_ref = dataset_ref[index].get_field(\"U1\")\n",
  "        rel_dif = np.linalg.norm(u1_pred - u1_ref) / np.linalg.norm(u1_ref)\n",
  "        print(f\"rel_dif(id={index}) =\", rel_dif)"
] },
{ "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [
  "print_u1_relative_errors(dataset_pred, dataset_train)"
] },
{ "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [
  "pipeline.score(dataset_train)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
  "## 🔍 Hyperparameter search with `GridSearchCV`"
] },
{ "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [], "source": [
  "param_grid = {\n",
  "    'preprocessor__preparator__common_mesh_id': [0, 2],\n",
  "    'regressor__transformer__pca_u1__sklearn_block__n_components': [2],\n",
  "    'preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes__sklearn_block__n_components': [2]\n",
  "}\n",
  "\n",
  "cv = KFold(n_splits=2, shuffle=True, random_state=42)\n",
  "search = GridSearchCV(pipeline, param_grid=param_grid, cv = cv, verbose=3, error_score='raise')\n",
  "search.fit(dataset_train)"
] },
{ "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [
  "print(\"best_params =\", search.best_params_)\n",
  "optimized_pipeline = clone(pipeline).set_params(**search.best_params_)\n",
  "optimized_pipeline.fit(dataset_train)"
] },
{ "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [
  "score = optimized_pipeline.score(dataset_train)\n",
  "print(\"score =\", score, \", error =\", 1. - score)"
] },
{ "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [
  "# Predict again with the tuned pipeline; otherwise `dataset_pred` would still hold\n",
  "# the predictions of the initial `pipeline`.\n",
  "dataset_pred = optimized_pipeline.predict(dataset_train)\n",
  "print_u1_relative_errors(dataset_pred, dataset_train)"
] },
{ "cell_type": "markdown", "metadata": {}, "source": [
  "## ✅ Sanity check\n",
  "\n",
  "The pipeline is refitted with the selected `common_mesh_id` and with as many PCA components as there are samples in `dataset_train`. The error on the sample used as common mesh is then expected to be numerically zero."
] },
{ "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [
  "optimal_common_mesh_id = search.best_params_['preprocessor__preparator__common_mesh_id']\n",
  "print(\"optimal_common_mesh_id =\", optimal_common_mesh_id)\n",
  "optimized_pipeline = clone(pipeline).set_params(\n",
  "    preprocessor__preparator__common_mesh_id = optimal_common_mesh_id,\n",
  "    regressor__transformer__pca_u1__sklearn_block__n_components = len(dataset_train),\n",
  "    preprocessor__column_preprocessor__nodes_preprocessor__pca_nodes__sklearn_block__n_components = len(dataset_train)\n",
  ")\n",
  "optimized_pipeline.fit(dataset_train)"
] },
{ "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [
  "dataset_pred = optimized_pipeline.predict(dataset_train)\n",
  "print_u1_relative_errors(dataset_pred, dataset_train)\n",
  "print(f\"error at id {optimal_common_mesh_id} should be numerical zero\")"
] }
],
"metadata": {
  "kernelspec": { "display_name": "plaid-dev", "language": "python", "name": "python3" },
  "language_info": {
    "codemirror_mode": { "name": "ipython", "version": 3 },
    "file_extension": ".py",
    "mimetype": "text/x-python",
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
    "version": "3.11.13"
  }
},
"nbformat": 4,
"nbformat_minor": 2 }