Initial Commit

Cian Hughes
2023-08-03 21:33:03 +01:00
committed by GitHub
parent 379ecfd4f6
commit 9781c9f050
15 changed files with 15034 additions and 0 deletions

360
act_plot.ipynb Normal file

File diff suppressed because one or more lines are too long

274
custom_activations.py Normal file

@@ -0,0 +1,274 @@
import torch
from torch import nn
class SoftExp(nn.Module):
"""
Implementation of soft exponential activation.
Shape:
- Input: (N, *) where * means, any number of additional
dimensions
- Output: (N, *), same shape as the input
Parameters:
- alpha - trainable parameter
References:
- See related paper:
https://arxiv.org/pdf/1602.01321.pdf
Examples:
    >>> a1 = SoftExp()
>>> x = torch.randn(256)
>>> x = a1(x)
"""
def __init__(self, alpha=None, beta=None, *args, **kwargs):
super().__init__(*args, **kwargs)
if "device" in kwargs:
self.device = kwargs["device"]
else:
self.device = torch.device("cpu")
alpha = torch.tensor(alpha) if alpha is not None else nn.Parameter(torch.randn(1)) # noqa
self.alpha = alpha.to(self.device)
        self.alpha.requires_grad = True  # ensure alpha is trainable
        # self.__name__ = "SoftExp"
def forward(self, x):
"""
Forward pass of the function.
Applies the function to the input elementwise.
"""
if self.alpha == 0.0:
return x
if self.alpha < 0.0:
return -torch.log(1 - self.alpha * (x + self.alpha)) / self.alpha
if self.alpha > 0.0:
return (torch.exp(self.alpha * x) - 1) / self.alpha + self.alpha
@torch.jit.script
def sech(x):
return 1 / torch.cosh(x)
@torch.jit.script
def dip(x):
    # Constants chosen so that dip(+/-1) = -1, keeping bmu continuous at x = +/-1
    return (-2.0261193218831233 * sech(x)) + 0.31303528549933146
@torch.jit.script
def bmu(x):
    # Piecewise "bio-mimicking" curve: -1/|x| for x <= -1, a sech-based dip on
    # (-1, 1), and x - 2 for x >= 1
return torch.where(
x <= -1,
-1 / torch.abs(x),
torch.where(x >= 1, x - 2, dip(x)),
)
class BMU(nn.Module):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def forward(self, input):
return bmu(input)
class TrainableHybrid(nn.Module):
def __init__(
self, functions, function_args=None, function_kwargs=None, *args, **kwargs
):
super().__init__(*args, **kwargs)
if function_args is None:
function_args = [tuple() for _ in functions]
if function_kwargs is None:
function_kwargs = [dict() for _ in functions]
if None in function_args:
function_args = [
tuple() if fa is None else fa for fa in function_args
]
if None in function_kwargs:
function_kwargs = [
dict() if fk is None else fk for fk in function_kwargs
]
self.functions = [
            f(*fa, **fk) for f, fa, fk in zip(functions, function_args, function_kwargs)
]
self.alpha = nn.Parameter(torch.randn(len(functions)))
self.normalize_alpha()
self.__name__ = (
f"TrainableHybrid{str([f.__name__ for f in functions]).replace(' ', '')}"
)
def __repr__(self):
return self.__name__
def normalize_alpha(self) -> None:
self.alpha.data = self.alpha / torch.sum(self.alpha)
def apply_activations(self, input: torch.Tensor):
return torch.sum(
torch.stack(
[a * f(input) for f, a in zip(self.functions, self.alpha)]
),
dim=0,
)
def forward(self, input: torch.Tensor) -> torch.Tensor:
self.normalize_alpha()
return self.apply_activations(input)
def to(self, device):
super().to(device)
self.functions = [f.to(device) for f in self.functions]
return self
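# Illustrative usage of TrainableHybrid (a sketch, not used elsewhere in this
# module): blend two activations with trainable mixing weights.
#   hybrid = TrainableHybrid([nn.Tanh, nn.SiLU])
#   y = hybrid(torch.randn(8))  # weighted sum of Tanh(x) and SiLU(x)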
class ISRU(nn.Module):
def __init__(self, alpha=None, beta=None, *args, **kwargs):
super().__init__(*args, **kwargs)
if "device" in kwargs:
self.device = kwargs["device"]
else:
self.device = torch.device("cpu")
alpha = torch.tensor(alpha) if alpha is not None else nn.Parameter(torch.randn(1)) # noqa
self.alpha = alpha.to(self.device)
        self.alpha.requires_grad = True
self.__name__ = "ISRU"
def forward(self, x):
return x / torch.sqrt(1 + self.alpha * x**2)
class ISRLU(nn.Module):
def __init__(self, alpha=1.0, *args, **kwargs):
super().__init__(*args, **kwargs)
if "device" in kwargs:
self.device = kwargs["device"]
else:
self.device = torch.device("cpu")
alpha = torch.tensor(alpha) if alpha is not None else nn.Parameter(torch.randn(1)) # noqa
self.alpha = alpha.to(self.device)
        self.alpha.requires_grad = True
self.isru = ISRU(alpha)
self.__name__ = "ISRLU"
def forward(self, x):
return torch.where(x >= 0, x, self.isru(x))
class PBessel(nn.Module):
def __init__(self, alpha=None, beta=None, *args, **kwargs):
super().__init__(*args, **kwargs)
if "device" in kwargs:
self.device = kwargs["device"]
else:
self.device = torch.device("cpu")
alpha = torch.tensor(alpha) if alpha is not None else nn.Parameter(torch.randn(1)) # noqa
beta = torch.tensor(beta) if beta is not None else nn.Parameter(torch.randn(1))
self.alpha = alpha.to(self.device)
self.beta = beta.to(self.device)
        self.alpha.requires_grad = True
        self.beta.requires_grad = True
self.__name__ = "PBessel"
def forward(self, input):
gamma = 1 - self.alpha
return (self.alpha * torch.special.bessel_j0(self.beta * input)) + (
gamma * torch.special.bessel_j1(self.beta * input)
)
class LeakyPReQU(nn.Module):
def __init__(self, alpha=None, beta=None, *args, **kwargs):
super().__init__(*args, **kwargs)
if "device" in kwargs:
self.device = kwargs["device"]
else:
self.device = torch.device("cpu")
alpha = torch.tensor(alpha) if alpha is not None else nn.Parameter(torch.randn(1)) # noqa
beta = torch.tensor(beta) if beta is not None else nn.Parameter(torch.randn(1))
self.alpha = alpha.to(self.device)
self.beta = beta.to(self.device)
        self.alpha.requires_grad = True
        self.beta.requires_grad = True
self.__name__ = "LeakyPReQU"
def forward(self, input):
return torch.where(
input > 0,
(self.alpha * input * input) + (self.beta * input),
self.beta * input,
)
class Sinusoid(nn.Module):
def __init__(self, alpha=None, beta=None, *args, **kwargs):
super().__init__(*args, **kwargs)
if "device" in kwargs:
self.device = kwargs["device"]
else:
self.device = torch.device("cpu")
alpha = torch.tensor(alpha) if alpha is not None else nn.Parameter(torch.randn(1)) # noqa
beta = torch.tensor(beta) if beta is not None else nn.Parameter(torch.randn(1))
self.alpha = alpha.to(self.device)
self.beta = beta.to(self.device)
        self.alpha.requires_grad = True
        self.beta.requires_grad = True
self.__name__ = "Sinusoid"
def forward(self, input):
return torch.sin(self.alpha * (input + self.beta))
class Modulo(nn.Module):
def __init__(self, alpha=None, beta=None, *args, **kwargs):
super().__init__(*args, **kwargs)
if "device" in kwargs:
self.device = kwargs["device"]
else:
self.device = torch.device("cpu")
alpha = torch.tensor(alpha) if alpha is not None else nn.Parameter(torch.randn(1)) # noqa
beta = torch.tensor(beta) if beta is not None else nn.Parameter(torch.randn(1))
self.alpha = alpha.to(self.device)
self.beta = beta.to(self.device)
        self.alpha.requires_grad = True
        self.beta.requires_grad = True
self.__name__ = "Modulo"
def forward(self, input):
return torch.fmod(self.alpha * input, self.beta)
class TriWave(nn.Module):
def __init__(self, alpha=None, *args, **kwargs):
super().__init__(*args, **kwargs)
if "device" in kwargs:
self.device = kwargs["device"]
else:
self.device = torch.device("cpu")
alpha = torch.tensor(alpha) if alpha is not None else nn.Parameter(torch.randn(1)) # noqa
self.alpha = alpha.to(self.device)
        self.alpha.requires_grad = True
self.__name__ = "TriWave"
def forward(self, input):
return torch.abs(2 * (input / self.alpha - torch.floor(input / self.alpha + 0.5))) # noqa
class Gaussian(nn.Module):
def __init__(self, alpha=None, beta=None, *args, **kwargs):
super().__init__(*args, **kwargs)
if "device" in kwargs:
self.device = kwargs["device"]
else:
self.device = torch.device("cpu")
alpha = torch.tensor(alpha) if alpha is not None else nn.Parameter(torch.randn(1)) # noqa
beta = torch.tensor(beta) if beta is not None else nn.Parameter(torch.randn(1))
self.alpha = alpha.to(self.device)
self.beta = beta.to(self.device)
        self.alpha.requires_grad = True
        self.beta.requires_grad = True
self.__name__ = "Gaussian"
def forward(self, x):
return torch.exp(-(((x-self.alpha)**2)/(2*self.beta**2)))
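# Minimal smoke test (an illustrative sketch, not part of the training pipeline):
# run a random batch through each parametric activation defined above and check
# that the input shape is preserved. Parameter values are arbitrary assumptions.
if __name__ == "__main__":
    x = torch.randn(16)
    activations = [
        SoftExp(0.5),
        BMU(),
        TrainableHybrid([nn.Tanh, nn.SiLU]),
        ISRU(1.0),
        ISRLU(1.0),
        PBessel(0.5, 1.0),
        LeakyPReQU(0.1, 0.2),
        Sinusoid(1.0, 0.0),
        Modulo(1.0, 2.0),
        TriWave(1.0),
        Gaussian(0.0, 1.0),
    ]
    for act in activations:
        y = act(x)
        assert y.shape == x.shape, f"{act} changed the output shape"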

1012
data_postprocessing.ipynb Normal file

File diff suppressed because it is too large

1421
expt1.ipynb Normal file

File diff suppressed because it is too large

547
expt1.py Normal file

@@ -0,0 +1,547 @@
# %% [markdown]
# <h1>Experiment 1</h1>
# <h3>Initial hyperparameter tuning</h3>
# <p>Summary</p>
# <ul>
# <li>A model was created with a dynamic constructor, allowing for a hyperparameter-driven model</li>
# <li>Hyperparameters were tuned using <code>Optuna</code></li>
# <li>Training loop was constructed using <code>PyTorchLightning</code></li>
# <li>Model was trained on a cluster of machines using a shared SQL trial database</li>
# <li>An extremely aggressive pruning algorithm was used to quickly narrow in on an optimal hyperparameter space</li>
# <li>Experiment 1 was left to train on the cluster for 2 days</li>
# </ul>
# %%
# Data handling imports
from dask.distributed import Client, LocalCluster
import dask
import dask.dataframe as dd
import dask.array as da
import numpy as np
import pickle
import random
from itertools import chain
from tqdm.auto import tqdm
# Deep learning imports
import torch
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import functional as F
from torch import optim
import pytorch_lightning as pl
from torchmetrics import MeanSquaredError
from pytorch_lightning import Trainer
import optuna
from optuna.pruners import HyperbandPruner
from optuna.integration import PyTorchLightningPruningCallback
# Suppress some warning messages from pytorch_lightning,
# It really doesn't like that I've forced it to handle a dask array!
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module=pl.__name__)
# Also, set up a log to record debug messages for failed trials
import logging
logging.basicConfig(filename="debug.log", encoding="utf-8", level=logging.ERROR)
# %% [markdown]
# <h3>Patching PyTorchLightning</h3>
# <p>
# A key part of this project was to develop a patch for PyTorchLightning to allow the use of <code>dask</code> arrays as inputs. It was important that PyTorchLightning accept <code>dask</code> arrays and only load the data into memory when needed; otherwise, our extremely large datasets would simply crash our system, as they are significantly larger than the available RAM and VRAM.
# </p><p>
# After several versions of the patch, this final version was developed. It is a simple monkey patch that wraps the <code>pytorch_lightning.utilities.data._extract_batch_size</code> generator with a check that mimics the expected behaviour for torch tensors when given a dask array, and extends its type signature to ensure static analysis is still possible.
# </p><p>
# With this patch applied, the forward method in our model can accept a dask array and only compute each chunk of the array when needed. This allows us to train our model on datasets that are significantly larger than the available memory.
# </p>
# %%
# Monkey patch to allow pytorch lightning to accept a dask array as a model input
from typing import Any, Generator, Iterable, Mapping, Optional, Union
BType = Union[da.Array, torch.Tensor, str, Mapping[Any, "BType"], Iterable["BType"]]
unpatched = pl.utilities.data._extract_batch_size
def patch(batch: BType) -> Generator[Optional[int], None, None]:
if isinstance(batch, da.core.Array):
if len(batch.shape) == 0:
yield 1
else:
yield batch.shape[0]
else:
yield from unpatched(batch)
pl.utilities.data._extract_batch_size = patch
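# Quick sanity check for the patch (an illustrative sketch; not executed here):
#   next(pl.utilities.data._extract_batch_size(da.zeros((4, 3))))  # -> 4
# i.e. the leading dimension of a dask array is reported as the batch size,
# mirroring what Lightning already does for torch tensors.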
# %%
# Set the device to use with torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Prepare a dask cluster and client
def create_client():
cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)
return client
if __name__ == "__main__":
client = create_client()
# %%
# Load X and y for training
samples = list(range(1, 82))
with open("sample_X.pkl", "rb") as f:
X = pickle.load(f)
with open("sample_y.pkl", "rb") as f:
y = pickle.load(f)
# %% [markdown]
# <h3>Dataset Splitting</h3>
# <p>The dataset is split into a training and validation dataset (80:20 split). Because the number of available samples is extremely small, we haven't produced a test dataset. In the future, as more data is obtained, a test set should be included whenever possible.</p>
# %%
# Separate samples into training and validation sets
val_samples = random.sample(samples, k=len(samples) // 5)
train_samples = [s for s in samples if s not in val_samples]
X_train = {i: X[i] for i in train_samples}
X_val = {i: X[i] for i in val_samples}
y_train = {i: y[i] for i in train_samples}
y_val = {i: y[i] for i in val_samples}
# %% [markdown]
# <h3>Dataset Collation</h3>
# <p>This function returns a closure for collating our data in a torch DataLoader. The use of a DataLoader will allow us to shuffle and prefetch data, reducing overfitting and maximising performance as IO will be a bottleneck. The closure is dynamically constructed, allowing us to select the outputs we train against. However, for this experiment we will match against all outputs for simplicity.</p>
# %%
# Create a function to dynamically modify data collation
def collate_fn(batch):
X0 = batch[0][0][0].to_numpy(dtype=np.float32)[0]
X1 = batch[0][0][1].to_dask_array(lengths=True)
y = batch[0][1].to_numpy(dtype=np.float32)
return (
torch.from_numpy(X0).to(device),
X1,
torch.from_numpy(y).to(device),
)
# %% [markdown]
# <h3>Convolutional Data Compression</h3>
# <p>
# The <code>DaskCompression</code> module accepts a dask array and applies a convolutional kernel to it to significantly compress the input data. This allows us to transform a larger-than-VRAM dataset into one that can fit on our GPU, and (hopefully) retain the information relevant to training the rest of our model.
# </p><p>
# Note how each kernel is only computed inside <code>compress_kernel</code> and is immediately compressed via convolution. This ensures that only one kernel needs to be stored in memory at a time, avoiding the need to hold the entire dataset in memory at once.
# </p>
# %%
class DaskCompression(nn.Module):
def __init__(
self, in_channels, out_channels, kernel_size, chunk_size=1, device=device
):
super(DaskCompression, self).__init__()
self.kernel_size = kernel_size
self.in_channels = in_channels
self.out_channels = out_channels
self.chunk_size = chunk_size
self.device = device
self.conv = nn.Conv1d(in_channels, out_channels, kernel_size).to(device)
def compress_kernel(self, kernel):
return (
self.conv(torch.from_numpy(kernel.compute()).to(self.device))
.squeeze()
.to("cpu") # return to cpu to save VRAM
)
def forward(self, x):
# Precompute the dimensions of the output array
dim0, dim2 = x.shape
assert dim2 == self.in_channels
dim0 = (dim0 // self.kernel_size) // self.chunk_size
x = x.reshape(dim0, self.chunk_size, self.kernel_size, dim2)
x = da.transpose(x, axes=(0, 1, 3, 2))
x = [self.compress_kernel(kernel) for kernel in x]
return torch.stack(x).to(self.device)
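# Illustrative usage (a sketch; the shapes below are assumptions, chosen to be a
# multiple of kernel_size * chunk_size, as the model's padding step guarantees):
#   compressor = DaskCompression(5, 5, kernel_size=128, chunk_size=128)
#   x = da.random.random((128 * 128 * 4, 5), chunks=(128 * 128, 5)).astype(np.float32)
#   compressor(x).shape  # -> torch.Size([4, 128, 5])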
# %% [markdown]
# <h3>Model Design</h3>
# <p>
# The model was designed to be a dynamically constructed, hyperparameter-driven model for ease of hyperparameter optimisation. The constructed model will process data in the following way:
# </p>
# <ol>
# <li>The input is left/right padded to a multiple of the compressor kernel size multiplied by its chunk size</li>
# <li>The dask array is compressed by a <code>DaskCompression</code> layer, treating each input as a channel</li>
# <li>The compressed array is then repeatedly convolved down to a size less than or equal to the width of our feedforward network</li>
# <li>The channels of the now convolved data are combined</li>
# <li>The combined, flattened data is then left/right padded to the width of the feedforward network</li>
# <li>Finally, the data is fed into a feedforward network</li>
# </ol>
# <p>
# This relatively simple design allows the network to accept both larger-than-RAM datasets and datasets of variable size as inputs. This makes it suitable for training on whole Aconity datasets, without the need for culling or binning.
# </p>
# %%
class Model(pl.LightningModule):
def __init__(
self,
# pl attributes
optimizer=torch.optim.Adam,
optimizer_args=(),
optimizer_kwargs={},
scheduler=None,
scheduler_kwargs={},
loss=torch.nn.MSELoss(),
train_ds=None,
val_ds=None,
# model args & kwargs
compressor_kernel_size=128,
compressor_chunk_size=128,
compressor_act=(nn.ReLU, (), {}),
conv_kernel_size=128,
conv_norm=False,
conv_act=(nn.ReLU, (), {}),
channel_combine_act=(nn.ReLU, (), {}),
param_ff_depth=4,
param_ff_width=16,
param_ff_act=(nn.ReLU, (), {}),
ff_width=512,
ff_depth=4,
ff_act=(nn.ReLU, (), {}),
out_size=6,
out_act=(nn.ReLU, (), {}),
):
super().__init__()
# Assign necessary attributes for pl model
self.optimizer = optimizer
self.optimizer_args = optimizer_args
self.optimizer_kwargs = optimizer_kwargs
self.scheduler = scheduler
self.scheduler_kwargs = scheduler_kwargs
self.loss = loss
self.mse = MeanSquaredError()
self.train_ds = train_ds
self.val_ds = val_ds
# Attrs for dynamically created model to be tested
self.compressor_kernel_size = compressor_kernel_size
self.compressor_chunk_size = compressor_chunk_size
self.conv_kernel_size = conv_kernel_size
self.ff_width = ff_width
self.ff_depth = ff_depth
self.out_size = out_size
# layers
# compressor compresses and converts dask array to torch tensor
self.convolutional_compressor = DaskCompression(
5,
5,
kernel_size=compressor_kernel_size,
chunk_size=compressor_chunk_size,
)
self.compressor_act = compressor_act[0](*compressor_act[1], **compressor_act[2])
# convolutional layer recursively applies convolutions to the compressed input
self.conv = nn.Conv1d(5, 5, kernel_size=conv_kernel_size)
self.conv_norm = nn.LocalResponseNorm(5) if conv_norm else nn.Identity()
self.conv_act = conv_act[0](*conv_act[1], **conv_act[2])
self.combine_channels = nn.Conv1d(5, 1, kernel_size=1)
self.channel_combine_act = channel_combine_act[0](
*channel_combine_act[1], **channel_combine_act[2]
)
self.param_ff = nn.Sequential(
nn.Linear(4, param_ff_width),
param_ff_act[0](*param_ff_act[1], **param_ff_act[2]),
*chain(
*(
(
nn.Linear(param_ff_width, param_ff_width),
param_ff_act[0](*param_ff_act[1], **param_ff_act[2]),
)
for _ in range(param_ff_depth)
)
),
)
self.ff = nn.Sequential(
nn.Linear(ff_width + param_ff_width, ff_width),
ff_act[0](*ff_act[1], **ff_act[2]),
*chain(
*(
(
nn.Linear(ff_width, ff_width),
ff_act[0](*ff_act[1], **ff_act[2]),
)
for _ in range(ff_depth)
)
),
)
self.out_dense = nn.Linear(ff_width, out_size)
self.out_act = out_act[0](*out_act[1], **out_act[2])
@staticmethod
def pad_ax0_to_multiple_of(x, multiple_of):
padding = (((x.shape[0] // multiple_of) + 1) * multiple_of) - x.shape[0]
left_pad = padding // 2
right_pad = padding - left_pad
return da.pad(
x, ((left_pad, right_pad), (0, 0)), mode="constant", constant_values=0
)
def pad_to_ff_width(self, x):
padding = self.ff_width - x.shape[1]
left_pad = padding // 2
right_pad = padding - left_pad
return F.pad(
x,
(right_pad, left_pad, 0, 0),
mode="constant",
value=0.0,
)
def forward(self, x0, x1):
# pad to a multiple of kernel_size * chunk_size
x1 = self.pad_ax0_to_multiple_of(
x1, self.compressor_kernel_size * self.compressor_chunk_size
)
x1 = self.convolutional_compressor(x1)
x1 = x1.reshape(x1.shape[0] * x1.shape[1], x1.shape[2]).T.unsqueeze(0)
while x1.shape[2] > self.ff_width:
x1 = self.conv(x1)
x1 = self.conv_norm(x1)
x1 = self.conv_act(x1)
x1 = self.combine_channels(x1)
x1 = self.channel_combine_act(x1)
x1 = x1.squeeze(1)
x1 = self.pad_to_ff_width(x1)
x0 = x0.unsqueeze(0)
x0 = self.param_ff(x0)
x = torch.cat((x1, x0), dim=1)
x = self.ff(x)
x = self.out_dense(x)
x = self.out_act(x)
return x
def configure_optimizers(self):
optimizer = self.optimizer(
self.parameters(), *self.optimizer_args, **self.optimizer_kwargs
)
if self.scheduler is not None:
scheduler = self.scheduler(optimizer, **self.scheduler_kwargs)
            return [optimizer], [scheduler]
else:
return optimizer
def train_dataloader(self):
return self.train_ds
def val_dataloader(self):
return self.val_ds
def training_step(self, batch, batch_idx):
x0, x1, y = batch
y_hat = self(x0, x1)
loss = self.loss(y_hat, y)
self.log("train_loss", loss)
mse = self.mse(y_hat, y)
self.log('train_MSE', mse, on_step=True, on_epoch=True, prog_bar=True)
return loss
def validation_step(self, batch, batch_idx):
x0, x1, y = batch
y_hat = self(x0, x1)
loss = self.loss(y_hat, y)
self.log("val_loss", loss)
mse = self.mse(y_hat, y)
        self.log('val_MSE', mse, on_step=True, on_epoch=True, prog_bar=True)
# %% [markdown]
# <h3>Activation Functions</h3>
# <p>
# For our hyperparameter optimisation, we intend to test all the activation functions in PyTorch. In addition to the built-in activations, we will also train using the following custom activation functions, implemented from the literature or of our own design:
# </p>
# <ol>
# <li><b><code>BMU</code>:</b> Bio-Mimicking Unit; an activation function designed to mimic the activation potential of a biological neuron.</li>
# <li><b><code>SoftExp</code>:</b> Soft Exponential function; a parametric activation function that fits a wide variety of exponential curves (DOI: <a href="https://arxiv.org/abs/1602.01321v1">10.48550/arXiv.1602.01321</a>)</li>
# <li><b><code>LeakyPReQU</code>:</b> Leaky Parametric Rectified Quadratic Unit; a smoothly and continuously differentiable function that is a parametrically sloped line for <code>x&le;0</code> and a quadratic curve for <code>x&gt;0</code></li>
# <li><b><code>ISRU</code>:</b> Inverse Square Root Unit; a somewhat uncommon function that can be useful in models such as this, as it yields a continuously differentiable curve while being extremely fast to compute using bit manipulation</li>
# <li><b><code>ISRLU</code>:</b> Inverse Square Root Linear Unit; a modified ISRU that is an ISRU for <code>x&lt;0</code> and <code>f(x)=x</code> for <code>x&ge;0</code> (DOI: <a href="https://arxiv.org/abs/1710.09967">10.48550/arXiv.1710.09967</a>)</li>
# <li><b><code>PBessel</code>:</b> Parametric Bessel; a parametric Bessel curve yielding various wave formations depending on its trainable parameters</li>
# <li><b><code>Sinusoid</code>:</b> A parametric sine wave, with frequency and phase offset as trainable parameters</li>
# <li><b><code>Modulo</code>:</b> A parametric sawtooth wave, <code>f(x) = (&#945;x) mod &#946;</code>, where &#945; and &#946; are trainable parameters</li>
# <li><b><code>TriWave</code>:</b> A parametric triangle wave, with wavelength as a trainable parameter</li>
# <li><b><code>Gaussian</code>:</b> A parametric Gaussian curve, with trainable centre and width</li>
# </ol>
# %%
# Create a dispatcher including all builtin activations and
# Several custom activations from experimentation or literature
from custom_activations import SoftExp, PBessel
activation_dispatcher = {
"Tanh": nn.Tanh,
"SiLU": nn.SiLU,
"Softplus": nn.Softplus,
"SoftExp": SoftExp,
"PBessel": PBessel,
}
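# Activations are passed around in this script as (class, args, kwargs) tuples and
# instantiated on demand, e.g. (an illustrative sketch):
#   act = (activation_dispatcher["SoftExp"], (), {})
#   layer = act[0](*act[1], **act[2])  # equivalent to SoftExp()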
# %% [markdown]
# <h3>Hyperparameter training</h3>
# <p>Here, we define an objective function describing what we want Optuna to do during each trial and how to react to any errors or other situations that may arise. To summarise the objective:</p>
# <ul>
# <li>Optuna selects hyperparameters for all input parameters within the given constraints</li>
# <li>A model is generated using the selected hyperparameters</li>
# <li>PyTorchLightning trains the model through 2 epochs</li>
# <li>The model is evaluated on the validation set</li>
# <li>The validation loss is returned to Optuna</li>
# </ul>
# <p>
# Optuna monitors the reported validation loss and attempts to minimise it. An extremely aggressive pruning strategy known as "hyperband pruning" is used to efficiently narrow the parameter space down to something more reasonable. Any parameter set that Optuna deems suboptimal is pruned, stopping the trial early to save time.
# </p>
# %%
# Test parameters
n_epochs = 2
output_keys = list(next(iter(y_train.values())).keys())
activation_vals = list(activation_dispatcher.keys())
# Next we define the objective function for the hyperparameter optimization
def objective(trial):
torch.cuda.empty_cache()
objective_value = torch.inf
model = None
logger = None
try:
# Select hyperparameters for testing
compressor_kernel_size = 128
compressor_chunk_size = 128
compressor_act = (
activation_dispatcher[
trial.suggest_categorical("compressor_act", activation_vals)
],
(),
{},
)
conv_kernel_size = 128
conv_norm = trial.suggest_categorical("conv_norm", [True, False])
conv_act = (
activation_dispatcher[
trial.suggest_categorical("conv_act", activation_vals)
],
(),
{},
)
channel_combine_act = (
activation_dispatcher[
trial.suggest_categorical("channel_combine_act", activation_vals)
],
(),
{},
)
param_ff_depth = trial.suggest_int("param_ff_depth", 2, 8, 2)
param_ff_width = trial.suggest_int("param_ff_width", 16, 64, 16)
param_ff_act = (
activation_dispatcher[
trial.suggest_categorical("param_ff_act", activation_vals)
],
(),
{},
)
ff_width = trial.suggest_int("ff_width", 256, 1025, 256)
ff_depth = trial.suggest_int("ff_depth", 2, 8, 2)
ff_act = (
activation_dispatcher[trial.suggest_categorical("ff_act", activation_vals)],
(),
{},
)
out_size = 2
out_act = (nn.Sigmoid, tuple(), dict())
# Set up the model architecture and other necessary components
model = Model(
compressor_kernel_size=compressor_kernel_size,
compressor_chunk_size=compressor_chunk_size,
compressor_act=compressor_act,
conv_kernel_size=conv_kernel_size,
conv_act=conv_act,
conv_norm=conv_norm,
channel_combine_act=channel_combine_act,
param_ff_depth=param_ff_depth,
param_ff_width=param_ff_width,
param_ff_act=param_ff_act,
ff_width=ff_width,
ff_depth=ff_depth,
ff_act=ff_act,
out_size=out_size,
out_act=out_act,
).to(device)
trainer = Trainer(
accelerator="gpu",
max_epochs=n_epochs,
devices=1,
logger=logger,
num_sanity_val_steps=0, # Needs to be disabled or else we get an error because X is dask array
# precision="16-mixed",
callbacks=[
PyTorchLightningPruningCallback(trial, monitor="val_loss"),
],
)
# Prepare datasets
train = DataLoader(
list(zip(X_train.values(), y_train.values())),
collate_fn=collate_fn,
shuffle=True,
)
valid = DataLoader(
list(zip(X_val.values(), y_val.values())),
shuffle=True,
collate_fn=collate_fn,
)
        # Finally, train the model and record the resulting validation loss
        trainer.fit(model, train, valid)
        objective_value = trainer.callback_metrics["val_loss"].item()
except Exception as e:
logging.exception(f"An exception occurred in trial {trial.number}: {e}")
raise optuna.exceptions.TrialPruned()
finally:
if logger is not None:
logger.experiment.unwatch(model)
logger.experiment.finish()
del model
torch.cuda.empty_cache()
if objective_value == torch.inf:
raise optuna.exceptions.TrialPruned()
return objective_value
# %% [markdown]
# <h3>Hyperparameter Optimisation on a Computing Cluster</h3>
# <p>
# The final important step is to run the optimisation on a cluster of computers, maximising the number of trials that can be run in parallel. Although this could be achieved with a more complex, scheduler-controlled system and dask, we use the far simpler approach of a shared SQL ledger to keep track of the trials and their results; this is sufficient for our purposes and easy to implement. Using this approach, the model was trained on a cluster of 5 computers at once.
# </p>
# %%
if __name__ == "__main__":
# storage_name = "sqlite:///optuna.sql"
storage_name = "mysql+pymysql://root:Ch31121992@192.168.1.10:3306/optuna_db"
study_name = "Composition Experiment 1"
study = optuna.create_study(
study_name=study_name,
storage=storage_name,
direction="minimize",
pruner=HyperbandPruner(),
load_if_exists=True,
)
study.optimize(
objective,
n_trials=None,
timeout=None,
)
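# To add more workers to the search (a sketch of the shared-ledger workflow described
# above): run this same script on each machine in the cluster. Because the storage URL
# is shared and `load_if_exists=True`, every process pulls from and reports to the
# same Optuna study, so trials run in parallel across machines.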

133
expt1_analysis.ipynb Normal file

@@ -0,0 +1,133 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Data handling imports\n",
"import numpy as np\n",
"import pickle\n",
"import random\n",
"from tqdm.auto import tqdm\n",
"import optuna"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"storage_name = \"mysql+pymysql://root:Ch31121992@192.168.1.10:3306/optuna_db\"\n",
"study_name = \"Experiment 1\"\n",
"study = optuna.load_study(\n",
" study_name=study_name,\n",
" storage=storage_name,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df = study.trials_dataframe()\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.dropna(inplace=True)\n",
"df.sort_values(by=\"value\", inplace=True)\n",
"df.drop(df[\"value\"].idxmax(), inplace=True)\n",
"df.drop(df[\"value\"].idxmax(), inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"pd.options.plotting.backend = \"plotly\"\n",
"params = list(df.keys()[5:-1])\n",
"for p in params:\n",
" df.plot(x=p, y=\"value\", kind=\"scatter\", title=p)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"params = list(df.keys()[5:-1])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!poetry add tabulate\n",
"from tabulate import tabulate\n",
"print(\n",
" tabulate(\n",
" (x[0] for x in sorted(list(df.groupby(params)), key=lambda x: x[1][\"value\"].mean())),\n",
" headers = params,\n",
" tablefmt = \"grid\",\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"for p in params:\n",
" df.plot(x=p, y=\"value\", kind=\"scatter\", title=p).show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

366
expt2.ipynb Normal file

@@ -0,0 +1,366 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1>Experiment 2</h1>\n",
"<h3>Targetted hyperparameter tuning</h3>\n",
"<p>\n",
"By examining the results of expt1, a smaller range of hyperparameters for expt2 was chosen. This allowed for a more targetted search of the hyperparameter space to find an optimal configuration. The selected parameters for expt2 were as follows:\n",
"</p>\n",
"<ul>\n",
"<li>in_act = Linear, Mish, PBessel, or Tanhshrink</li>\n",
"<li>compressor_kernel_size = 128</li>\n",
"<li>compressor_act = Softshrink, SoftExp, or PReLU</li>\n",
"<li>conv_kernel_size = 128</li>\n",
"<li>conv_act = Sigmoid or PBessel</li>\n",
"<li>channel_combine_act = HardSigmoid or GELU</li>\n",
"<li>ff_width = 512</li>\n",
"<li>ff_depth = 2, 4, or 6</li>\n",
"<li>ff_act = CELU</li>\n",
"<li>out_act = Tanhshrink or Mish</li>\n",
"</ul>\n",
"<p>\n",
"Several of the parameters were able to be fixed to a specific value, and the remaining parameters (with the exception of <code>`in_act`</code>) were reduced to only 2 or 3 possible values, dramatically shrinking the parameter space. For this reason, a significantly less aggressive pruning algorithm was used, allowing for a more thorough search of the parameter space.\n",
"</p>"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Data handling imports\n",
"from dask.distributed import Client, LocalCluster\n",
"import dask\n",
"import dask.dataframe as dd\n",
"import dask.array as da\n",
"import numpy as np\n",
"import pickle\n",
"import random\n",
"from itertools import chain\n",
"from tqdm.auto import tqdm\n",
"\n",
"# Deep learning imports\n",
"import torch\n",
"from torch.utils.data import DataLoader\n",
"from torch import nn\n",
"from torch.nn import functional as F\n",
"from torch import optim\n",
"import pytorch_lightning as pl\n",
"from pytorch_lightning import Trainer\n",
"import optuna\n",
"from optuna.pruners import HyperbandPruner\n",
"from optuna.integration import PyTorchLightningPruningCallback\n",
"\n",
"# Suppress some warning messages from pytorch_lightning,\n",
"# It really doesn't like that i've forced it to handle a dask array!\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=UserWarning, module=pl.__name__)\n",
"\n",
"# Also, set up a log to record debug messages for failed trials\n",
"import logging\n",
"\n",
"logging.basicConfig(filename=\"debug_test.log\", encoding=\"utf-8\", level=logging.DEBUG)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"from expt1 import (\n",
" Model,\n",
" Linear,\n",
" device,\n",
" activation_dispatcher,\n",
" X_train,\n",
" y_train,\n",
" X_val,\n",
" y_val,\n",
" create_collate_fn,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cluster = LocalCluster(n_workers=8, threads_per_worker=1)\n",
"client = Client(cluster)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Monkey patch to allow pytorch lightning to accept a dask array as a model input\n",
"from typing import Any, Generator, Iterable, Mapping, Optional, Union\n",
"\n",
"BType = Union[da.Array, torch.Tensor, str, Mapping[Any, \"BType\"], Iterable[\"BType\"]]\n",
"\n",
"unpatched = pl.utilities.data._extract_batch_size\n",
"\n",
"\n",
"def patch(batch: BType) -> Generator[Optional[int], None, None]:\n",
" if isinstance(batch, da.core.Array):\n",
" if len(batch.shape) == 0:\n",
" yield 1\n",
" else:\n",
" yield batch.shape[0]\n",
" else:\n",
" yield from unpatched(batch)\n",
"\n",
"\n",
"pl.utilities.data._extract_batch_size = patch"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Test parameters\n",
"n_epochs = 10\n",
"output_keys = list(next(iter(y_train.values())).keys())\n",
"activation_vals = list(activation_dispatcher.keys())\n",
"\n",
"\n",
"# Next we define the objective function for the hyperparameter optimization\n",
"def objective(trial):\n",
" torch.cuda.empty_cache()\n",
" objective_value = torch.inf\n",
" model = None\n",
" logger = None\n",
" try:\n",
" # Select hyperparameters for testing\n",
" in_act = (\n",
" activation_dispatcher[trial.suggest_categorical(\"in_act\", activation_vals)],\n",
" (),\n",
" {},\n",
" )\n",
" compressor_kernel_size = trial.suggest_int(\n",
" \"compressor_kernel_size\", 64, 257, 64\n",
" )\n",
" compressor_chunk_size = 128\n",
" compressor_act = (\n",
" activation_dispatcher[\n",
" trial.suggest_categorical(\"compressor_act\", activation_vals)\n",
" ],\n",
" (),\n",
" {},\n",
" )\n",
" conv_kernel_size = trial.suggest_int(\"conv_kernel_size\", 64, 257, 64)\n",
" conv_act = (\n",
" activation_dispatcher[\n",
" trial.suggest_categorical(\"conv_act\", activation_vals)\n",
" ],\n",
" (),\n",
" {},\n",
" )\n",
" channel_combine_act = (\n",
" activation_dispatcher[\n",
" trial.suggest_categorical(\"channel_combine_act\", activation_vals)\n",
" ],\n",
" (),\n",
" {},\n",
" )\n",
" ff_width = trial.suggest_int(\"ff_width\", 256, 1025, 256)\n",
" ff_depth = trial.suggest_int(\"ff_depth\", 2, 8, 2)\n",
" ff_act = (\n",
" activation_dispatcher[trial.suggest_categorical(\"ff_act\", activation_vals)],\n",
" (),\n",
" {},\n",
" )\n",
" out_size = len(output_keys)\n",
" out_act = (\n",
" activation_dispatcher[\n",
" trial.suggest_categorical(\"out_act\", activation_vals)\n",
" ],\n",
" (),\n",
" {},\n",
" )\n",
"\n",
" # Set up the model architecture and other necessary components\n",
" model = Model(\n",
" in_act=in_act,\n",
" compressor_kernel_size=compressor_kernel_size,\n",
" compressor_chunk_size=compressor_chunk_size,\n",
" compressor_act=compressor_act,\n",
" conv_kernel_size=conv_kernel_size,\n",
" conv_act=conv_act,\n",
" channel_combine_act=channel_combine_act,\n",
" ff_width=ff_width,\n",
" ff_depth=ff_depth,\n",
" ff_act=ff_act,\n",
" out_size=out_size,\n",
" out_act=out_act,\n",
" ).to(device)\n",
"\n",
" trainer = Trainer(\n",
" accelerator=\"gpu\",\n",
" max_epochs=n_epochs,\n",
" devices=1,\n",
" logger=logger,\n",
" num_sanity_val_steps=0, # Needs to be disabled or else we get an error because X is dask array\n",
" # precision=\"16-mixed\",\n",
" callbacks=[\n",
" PyTorchLightningPruningCallback(trial, monitor=\"val_loss\"),\n",
" ],\n",
" )\n",
" # Prepare datasets\n",
" train = DataLoader(\n",
" list(zip(X_train.values(), y_train.values())),\n",
" collate_fn=create_collate_fn(),\n",
" shuffle=True,\n",
" )\n",
" valid = DataLoader(\n",
" list(zip(X_val.values(), y_val.values())),\n",
" shuffle=True,\n",
" collate_fn=create_collate_fn(),\n",
" )\n",
" # Finally, train the model\n",
" trainer.fit(model, train, valid)\n",
" except torch.cuda.OutOfMemoryError as e:\n",
" logging.warning(f\"Ran out of memory in trial {trial.number}!\")\n",
" raise optuna.exceptions.TrialPruned()\n",
" except Exception as e:\n",
" logging.exception(f\"An exception occurred in trial {trial.number}: {e}\")\n",
" raise optuna.exceptions.TrialPruned()\n",
" finally:\n",
" if logger is not None:\n",
" logger.experiment.unwatch(model)\n",
" logger.experiment.finish()\n",
" del model\n",
" torch.cuda.empty_cache()\n",
" if objective_value == torch.inf:\n",
" raise optuna.exceptions.TrialPruned()\n",
" return objective_value"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[I 2023-07-31 23:49:15,744] Using an existing study with name 'Experiment 2' instead of creating a new one.\n",
"[I 2023-07-31 23:49:16,553] Trial 221 pruned. \n",
"[I 2023-07-31 23:49:16,928] Trial 222 pruned. \n",
"[I 2023-07-31 23:49:17,318] Trial 223 pruned. \n",
"[I 2023-07-31 23:49:17,682] Trial 224 pruned. \n",
"[W 2023-07-31 23:49:18,028] Trial 225 failed with parameters: {} because of the following error: KeyboardInterrupt().\n",
"Traceback (most recent call last):\n",
" File \"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py\", line 200, in _run_trial\n",
" value_or_values = func(trial)\n",
" ^^^^^^^^^^^\n",
" File \"/tmp/ipykernel_562333/3392796582.py\", line 16, in objective\n",
" activation_dispatcher[trial.suggest_categorical(\"in_act\", activation_vals)],\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/trial/_trial.py\", line 405, in suggest_categorical\n",
" return self._suggest(name, CategoricalDistribution(choices=choices))\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/trial/_trial.py\", line 630, in _suggest\n",
" param_value = self.study.sampler.sample_independent(\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/samplers/_tpe/sampler.py\", line 471, in sample_independent\n",
" mpe_above = _ParzenEstimator(\n",
" ^^^^^^^^^^^^^^^^^\n",
" File \"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/samplers/_tpe/parzen_estimator.py\", line 75, in __init__\n",
" distributions=[\n",
" ^\n",
" File \"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/samplers/_tpe/parzen_estimator.py\", line 76, in <listcomp>\n",
" self._calculate_distributions(\n",
" File \"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/samplers/_tpe/parzen_estimator.py\", line 154, in _calculate_distributions\n",
" return self._calculate_categorical_distributions(\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/samplers/_tpe/parzen_estimator.py\", line 192, in _calculate_categorical_distributions\n",
" weights /= weights.sum(axis=1, keepdims=True)\n",
" ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
" File \"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/numpy/core/_methods.py\", line 47, in _sum\n",
" def _sum(a, axis=None, dtype=None, out=None, keepdims=False,\n",
" \n",
"KeyboardInterrupt\n",
"[W 2023-07-31 23:49:18,040] Trial 225 failed with value None.\n"
]
},
{
"ename": "KeyboardInterrupt",
"evalue": "",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 11\u001b[0m\n\u001b[1;32m 3\u001b[0m study_name \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mExperiment 2\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m study \u001b[39m=\u001b[39m optuna\u001b[39m.\u001b[39mcreate_study(\n\u001b[1;32m 5\u001b[0m study_name\u001b[39m=\u001b[39mstudy_name,\n\u001b[1;32m 6\u001b[0m storage\u001b[39m=\u001b[39mstorage_name,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 9\u001b[0m load_if_exists\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 10\u001b[0m )\n\u001b[0;32m---> 11\u001b[0m study\u001b[39m.\u001b[39;49moptimize(\n\u001b[1;32m 12\u001b[0m objective,\n\u001b[1;32m 13\u001b[0m n_trials\u001b[39m=\u001b[39;49m\u001b[39mNone\u001b[39;49;00m,\n\u001b[1;32m 14\u001b[0m timeout\u001b[39m=\u001b[39;49m\u001b[39mNone\u001b[39;49;00m,\n\u001b[1;32m 15\u001b[0m )\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/study/study.py:443\u001b[0m, in \u001b[0;36mStudy.optimize\u001b[0;34m(self, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39moptimize\u001b[39m(\n\u001b[1;32m 340\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 341\u001b[0m func: ObjectiveFuncType,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 348\u001b[0m show_progress_bar: \u001b[39mbool\u001b[39m \u001b[39m=\u001b[39m \u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 349\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 350\u001b[0m \u001b[39m \u001b[39m\u001b[39m\"\"\"Optimize an objective function.\u001b[39;00m\n\u001b[1;32m 351\u001b[0m \n\u001b[1;32m 352\u001b[0m \u001b[39m Optimization is done by choosing a suitable set of hyperparameter values from a given\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 440\u001b[0m \u001b[39m If nested invocation of this method occurs.\u001b[39;00m\n\u001b[1;32m 441\u001b[0m \u001b[39m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 443\u001b[0m _optimize(\n\u001b[1;32m 444\u001b[0m study\u001b[39m=\u001b[39;49m\u001b[39mself\u001b[39;49m,\n\u001b[1;32m 445\u001b[0m func\u001b[39m=\u001b[39;49mfunc,\n\u001b[1;32m 446\u001b[0m n_trials\u001b[39m=\u001b[39;49mn_trials,\n\u001b[1;32m 447\u001b[0m timeout\u001b[39m=\u001b[39;49mtimeout,\n\u001b[1;32m 448\u001b[0m n_jobs\u001b[39m=\u001b[39;49mn_jobs,\n\u001b[1;32m 449\u001b[0m catch\u001b[39m=\u001b[39;49m\u001b[39mtuple\u001b[39;49m(catch) \u001b[39mif\u001b[39;49;00m \u001b[39misinstance\u001b[39;49m(catch, Iterable) \u001b[39melse\u001b[39;49;00m (catch,),\n\u001b[1;32m 450\u001b[0m callbacks\u001b[39m=\u001b[39;49mcallbacks,\n\u001b[1;32m 451\u001b[0m gc_after_trial\u001b[39m=\u001b[39;49mgc_after_trial,\n\u001b[1;32m 452\u001b[0m show_progress_bar\u001b[39m=\u001b[39;49mshow_progress_bar,\n\u001b[1;32m 453\u001b[0m )\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py:66\u001b[0m, in \u001b[0;36m_optimize\u001b[0;34m(study, func, n_trials, timeout, n_jobs, catch, callbacks, gc_after_trial, show_progress_bar)\u001b[0m\n\u001b[1;32m 64\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 65\u001b[0m \u001b[39mif\u001b[39;00m n_jobs \u001b[39m==\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[0;32m---> 66\u001b[0m _optimize_sequential(\n\u001b[1;32m 67\u001b[0m study,\n\u001b[1;32m 68\u001b[0m func,\n\u001b[1;32m 69\u001b[0m n_trials,\n\u001b[1;32m 70\u001b[0m timeout,\n\u001b[1;32m 71\u001b[0m catch,\n\u001b[1;32m 72\u001b[0m callbacks,\n\u001b[1;32m 73\u001b[0m gc_after_trial,\n\u001b[1;32m 74\u001b[0m reseed_sampler_rng\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m,\n\u001b[1;32m 75\u001b[0m time_start\u001b[39m=\u001b[39;49m\u001b[39mNone\u001b[39;49;00m,\n\u001b[1;32m 76\u001b[0m progress_bar\u001b[39m=\u001b[39;49mprogress_bar,\n\u001b[1;32m 77\u001b[0m )\n\u001b[1;32m 78\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 79\u001b[0m \u001b[39mif\u001b[39;00m n_jobs \u001b[39m==\u001b[39m \u001b[39m-\u001b[39m\u001b[39m1\u001b[39m:\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py:163\u001b[0m, in \u001b[0;36m_optimize_sequential\u001b[0;34m(study, func, n_trials, timeout, catch, callbacks, gc_after_trial, reseed_sampler_rng, time_start, progress_bar)\u001b[0m\n\u001b[1;32m 160\u001b[0m \u001b[39mbreak\u001b[39;00m\n\u001b[1;32m 162\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 163\u001b[0m frozen_trial \u001b[39m=\u001b[39m _run_trial(study, func, catch)\n\u001b[1;32m 164\u001b[0m \u001b[39mfinally\u001b[39;00m:\n\u001b[1;32m 165\u001b[0m \u001b[39m# The following line mitigates memory problems that can be occurred in some\u001b[39;00m\n\u001b[1;32m 166\u001b[0m \u001b[39m# environments (e.g., services that use computing containers such as GitHub Actions).\u001b[39;00m\n\u001b[1;32m 167\u001b[0m \u001b[39m# Please refer to the following PR for further details:\u001b[39;00m\n\u001b[1;32m 168\u001b[0m \u001b[39m# https://github.com/optuna/optuna/pull/325.\u001b[39;00m\n\u001b[1;32m 169\u001b[0m \u001b[39mif\u001b[39;00m gc_after_trial:\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py:251\u001b[0m, in \u001b[0;36m_run_trial\u001b[0;34m(study, func, catch)\u001b[0m\n\u001b[1;32m 244\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39mFalse\u001b[39;00m, \u001b[39m\"\u001b[39m\u001b[39mShould not reach.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 246\u001b[0m \u001b[39mif\u001b[39;00m (\n\u001b[1;32m 247\u001b[0m frozen_trial\u001b[39m.\u001b[39mstate \u001b[39m==\u001b[39m TrialState\u001b[39m.\u001b[39mFAIL\n\u001b[1;32m 248\u001b[0m \u001b[39mand\u001b[39;00m func_err \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 249\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39misinstance\u001b[39m(func_err, catch)\n\u001b[1;32m 250\u001b[0m ):\n\u001b[0;32m--> 251\u001b[0m \u001b[39mraise\u001b[39;00m func_err\n\u001b[1;32m 252\u001b[0m \u001b[39mreturn\u001b[39;00m frozen_trial\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/study/_optimize.py:200\u001b[0m, in \u001b[0;36m_run_trial\u001b[0;34m(study, func, catch)\u001b[0m\n\u001b[1;32m 198\u001b[0m \u001b[39mwith\u001b[39;00m get_heartbeat_thread(trial\u001b[39m.\u001b[39m_trial_id, study\u001b[39m.\u001b[39m_storage):\n\u001b[1;32m 199\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 200\u001b[0m value_or_values \u001b[39m=\u001b[39m func(trial)\n\u001b[1;32m 201\u001b[0m \u001b[39mexcept\u001b[39;00m exceptions\u001b[39m.\u001b[39mTrialPruned \u001b[39mas\u001b[39;00m e:\n\u001b[1;32m 202\u001b[0m \u001b[39m# TODO(mamu): Handle multi-objective cases.\u001b[39;00m\n\u001b[1;32m 203\u001b[0m state \u001b[39m=\u001b[39m TrialState\u001b[39m.\u001b[39mPRUNED\n",
"Cell \u001b[0;32mIn[4], line 16\u001b[0m, in \u001b[0;36mobjective\u001b[0;34m(trial)\u001b[0m\n\u001b[1;32m 12\u001b[0m logger \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n\u001b[1;32m 13\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[1;32m 14\u001b[0m \u001b[39m# Select hyperparameters for testing\u001b[39;00m\n\u001b[1;32m 15\u001b[0m in_act \u001b[39m=\u001b[39m (\n\u001b[0;32m---> 16\u001b[0m activation_dispatcher[trial\u001b[39m.\u001b[39;49msuggest_categorical(\u001b[39m\"\u001b[39;49m\u001b[39min_act\u001b[39;49m\u001b[39m\"\u001b[39;49m, activation_vals)],\n\u001b[1;32m 17\u001b[0m (),\n\u001b[1;32m 18\u001b[0m {},\n\u001b[1;32m 19\u001b[0m )\n\u001b[1;32m 20\u001b[0m compressor_kernel_size \u001b[39m=\u001b[39m trial\u001b[39m.\u001b[39msuggest_int(\n\u001b[1;32m 21\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mcompressor_kernel_size\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m64\u001b[39m, \u001b[39m257\u001b[39m, \u001b[39m64\u001b[39m\n\u001b[1;32m 22\u001b[0m )\n\u001b[1;32m 23\u001b[0m compressor_chunk_size \u001b[39m=\u001b[39m \u001b[39m128\u001b[39m\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/trial/_trial.py:405\u001b[0m, in \u001b[0;36mTrial.suggest_categorical\u001b[0;34m(self, name, choices)\u001b[0m\n\u001b[1;32m 354\u001b[0m \u001b[39m\u001b[39m\u001b[39m\"\"\"Suggest a value for the categorical parameter.\u001b[39;00m\n\u001b[1;32m 355\u001b[0m \n\u001b[1;32m 356\u001b[0m \u001b[39mThe value is sampled from ``choices``.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 400\u001b[0m \u001b[39m :ref:`configurations` tutorial describes more details and flexible usages.\u001b[39;00m\n\u001b[1;32m 401\u001b[0m \u001b[39m\"\"\"\u001b[39;00m\n\u001b[1;32m 402\u001b[0m \u001b[39m# There is no need to call self._check_distribution because\u001b[39;00m\n\u001b[1;32m 403\u001b[0m \u001b[39m# CategoricalDistribution does not support dynamic value space.\u001b[39;00m\n\u001b[0;32m--> 405\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_suggest(name, CategoricalDistribution(choices\u001b[39m=\u001b[39;49mchoices))\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/trial/_trial.py:630\u001b[0m, in \u001b[0;36mTrial._suggest\u001b[0;34m(self, name, distribution)\u001b[0m\n\u001b[1;32m 628\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 629\u001b[0m study \u001b[39m=\u001b[39m pruners\u001b[39m.\u001b[39m_filter_study(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mstudy, trial)\n\u001b[0;32m--> 630\u001b[0m param_value \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mstudy\u001b[39m.\u001b[39;49msampler\u001b[39m.\u001b[39;49msample_independent(\n\u001b[1;32m 631\u001b[0m study, trial, name, distribution\n\u001b[1;32m 632\u001b[0m )\n\u001b[1;32m 634\u001b[0m \u001b[39m# `param_value` is validated here (invalid value like `np.nan` raises ValueError).\u001b[39;00m\n\u001b[1;32m 635\u001b[0m param_value_in_internal_repr \u001b[39m=\u001b[39m distribution\u001b[39m.\u001b[39mto_internal_repr(param_value)\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/samplers/_tpe/sampler.py:471\u001b[0m, in \u001b[0;36mTPESampler.sample_independent\u001b[0;34m(self, study, trial, param_name, param_distribution)\u001b[0m\n\u001b[1;32m 467\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 468\u001b[0m mpe_below \u001b[39m=\u001b[39m _ParzenEstimator(\n\u001b[1;32m 469\u001b[0m below, {param_name: param_distribution}, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_parzen_estimator_parameters\n\u001b[1;32m 470\u001b[0m )\n\u001b[0;32m--> 471\u001b[0m mpe_above \u001b[39m=\u001b[39m _ParzenEstimator(\n\u001b[1;32m 472\u001b[0m above, {param_name: param_distribution}, \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_parzen_estimator_parameters\n\u001b[1;32m 473\u001b[0m )\n\u001b[1;32m 474\u001b[0m samples_below \u001b[39m=\u001b[39m mpe_below\u001b[39m.\u001b[39msample(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_rng, \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_n_ei_candidates)\n\u001b[1;32m 475\u001b[0m log_likelihoods_below \u001b[39m=\u001b[39m mpe_below\u001b[39m.\u001b[39mlog_pdf(samples_below)\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/samplers/_tpe/parzen_estimator.py:75\u001b[0m, in \u001b[0;36m_ParzenEstimator.__init__\u001b[0;34m(self, observations, search_space, parameters, predetermined_weights)\u001b[0m\n\u001b[1;32m 71\u001b[0m weights \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mappend(weights, [parameters\u001b[39m.\u001b[39mprior_weight])\n\u001b[1;32m 72\u001b[0m weights \u001b[39m/\u001b[39m\u001b[39m=\u001b[39m weights\u001b[39m.\u001b[39msum()\n\u001b[1;32m 73\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_mixture_distribution \u001b[39m=\u001b[39m _MixtureOfProductDistribution(\n\u001b[1;32m 74\u001b[0m weights\u001b[39m=\u001b[39mweights,\n\u001b[0;32m---> 75\u001b[0m distributions\u001b[39m=\u001b[39m[\n\u001b[1;32m 76\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_calculate_distributions(\n\u001b[1;32m 77\u001b[0m transformed_observations[:, i], search_space[param], parameters\n\u001b[1;32m 78\u001b[0m )\n\u001b[1;32m 79\u001b[0m \u001b[39mfor\u001b[39;49;00m i, param \u001b[39min\u001b[39;49;00m \u001b[39menumerate\u001b[39;49m(search_space)\n\u001b[1;32m 80\u001b[0m ],\n\u001b[1;32m 81\u001b[0m )\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/samplers/_tpe/parzen_estimator.py:76\u001b[0m, in \u001b[0;36m<listcomp>\u001b[0;34m(.0)\u001b[0m\n\u001b[1;32m 71\u001b[0m weights \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mappend(weights, [parameters\u001b[39m.\u001b[39mprior_weight])\n\u001b[1;32m 72\u001b[0m weights \u001b[39m/\u001b[39m\u001b[39m=\u001b[39m weights\u001b[39m.\u001b[39msum()\n\u001b[1;32m 73\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_mixture_distribution \u001b[39m=\u001b[39m _MixtureOfProductDistribution(\n\u001b[1;32m 74\u001b[0m weights\u001b[39m=\u001b[39mweights,\n\u001b[1;32m 75\u001b[0m distributions\u001b[39m=\u001b[39m[\n\u001b[0;32m---> 76\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_calculate_distributions(\n\u001b[1;32m 77\u001b[0m transformed_observations[:, i], search_space[param], parameters\n\u001b[1;32m 78\u001b[0m )\n\u001b[1;32m 79\u001b[0m \u001b[39mfor\u001b[39;00m i, param \u001b[39min\u001b[39;00m \u001b[39menumerate\u001b[39m(search_space)\n\u001b[1;32m 80\u001b[0m ],\n\u001b[1;32m 81\u001b[0m )\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/samplers/_tpe/parzen_estimator.py:154\u001b[0m, in \u001b[0;36m_ParzenEstimator._calculate_distributions\u001b[0;34m(self, transformed_observations, search_space, parameters)\u001b[0m\n\u001b[1;32m 147\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_calculate_distributions\u001b[39m(\n\u001b[1;32m 148\u001b[0m \u001b[39mself\u001b[39m,\n\u001b[1;32m 149\u001b[0m transformed_observations: np\u001b[39m.\u001b[39mndarray,\n\u001b[1;32m 150\u001b[0m search_space: BaseDistribution,\n\u001b[1;32m 151\u001b[0m parameters: _ParzenEstimatorParameters,\n\u001b[1;32m 152\u001b[0m ) \u001b[39m-\u001b[39m\u001b[39m>\u001b[39m _BatchedDistributions:\n\u001b[1;32m 153\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39misinstance\u001b[39m(search_space, CategoricalDistribution):\n\u001b[0;32m--> 154\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_calculate_categorical_distributions(\n\u001b[1;32m 155\u001b[0m transformed_observations, search_space\u001b[39m.\u001b[39;49mchoices, parameters\n\u001b[1;32m 156\u001b[0m )\n\u001b[1;32m 157\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m 158\u001b[0m \u001b[39massert\u001b[39;00m \u001b[39misinstance\u001b[39m(search_space, (FloatDistribution, IntDistribution))\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/optuna/samplers/_tpe/parzen_estimator.py:192\u001b[0m, in \u001b[0;36m_ParzenEstimator._calculate_categorical_distributions\u001b[0;34m(self, observations, choices, parameters)\u001b[0m\n\u001b[1;32m 186\u001b[0m weights \u001b[39m=\u001b[39m np\u001b[39m.\u001b[39mfull(\n\u001b[1;32m 187\u001b[0m shape\u001b[39m=\u001b[39m(\u001b[39mlen\u001b[39m(observations) \u001b[39m+\u001b[39m consider_prior, \u001b[39mlen\u001b[39m(choices)),\n\u001b[1;32m 188\u001b[0m fill_value\u001b[39m=\u001b[39mparameters\u001b[39m.\u001b[39mprior_weight \u001b[39m/\u001b[39m (\u001b[39mlen\u001b[39m(observations) \u001b[39m+\u001b[39m consider_prior),\n\u001b[1;32m 189\u001b[0m )\n\u001b[1;32m 191\u001b[0m weights[np\u001b[39m.\u001b[39marange(\u001b[39mlen\u001b[39m(observations)), observations\u001b[39m.\u001b[39mastype(\u001b[39mint\u001b[39m)] \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39m1\u001b[39m\n\u001b[0;32m--> 192\u001b[0m weights \u001b[39m/\u001b[39m\u001b[39m=\u001b[39m weights\u001b[39m.\u001b[39;49msum(axis\u001b[39m=\u001b[39;49m\u001b[39m1\u001b[39;49m, keepdims\u001b[39m=\u001b[39;49m\u001b[39mTrue\u001b[39;49;00m)\n\u001b[1;32m 193\u001b[0m \u001b[39mreturn\u001b[39;00m _BatchedCategoricalDistributions(weights)\n",
"File \u001b[0;32m~/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/numpy/core/_methods.py:47\u001b[0m, in \u001b[0;36m_sum\u001b[0;34m(a, axis, dtype, out, keepdims, initial, where)\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_amin\u001b[39m(a, axis\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, out\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, keepdims\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 44\u001b[0m initial\u001b[39m=\u001b[39m_NoValue, where\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m):\n\u001b[1;32m 45\u001b[0m \u001b[39mreturn\u001b[39;00m umr_minimum(a, axis, \u001b[39mNone\u001b[39;00m, out, keepdims, initial, where)\n\u001b[0;32m---> 47\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_sum\u001b[39m(a, axis\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, dtype\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, out\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, keepdims\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 48\u001b[0m initial\u001b[39m=\u001b[39m_NoValue, where\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m):\n\u001b[1;32m 49\u001b[0m \u001b[39mreturn\u001b[39;00m umr_sum(a, axis, dtype, out, keepdims, initial, where)\n\u001b[1;32m 51\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39m_prod\u001b[39m(a, axis\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, dtype\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, out\u001b[39m=\u001b[39m\u001b[39mNone\u001b[39;00m, keepdims\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 52\u001b[0m initial\u001b[39m=\u001b[39m_NoValue, where\u001b[39m=\u001b[39m\u001b[39mTrue\u001b[39;00m):\n",
"\u001b[0;31mKeyboardInterrupt\u001b[0m: "
]
}
],
"source": [
"storage_name = \"sqlite:///optuna.sql\"\n",
"storage_name = \"mysql+pymysql://root:Ch31121992@192.168.1.10:3306/optuna_db\"\n",
"study_name = \"Experiment 2\"\n",
"study = optuna.create_study(\n",
" study_name=study_name,\n",
" storage=storage_name,\n",
" direction=\"minimize\",\n",
" pruner=HyperbandPruner(),\n",
" load_if_exists=True,\n",
")\n",
"study.optimize(\n",
" objective,\n",
" n_trials=None,\n",
" timeout=None,\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

521
expt2.py Normal file
View File

@@ -0,0 +1,521 @@
# %% [markdown]
# <h1>Experiment 1</h1>
# <h3>Initial hyperparameter tuning</h3>
# <p>Summary</p>
# <ul>
# <li>A model was created with a dynamic constructor, allowing for a hyperparameter-driven model</li>
# <li>Hyperparameters were tuned using <code>`Optuna`</code></li>
# <li>Training loop was constructed using <code>`PyTorchLightning`</code></li>
# <li>Model was trained on a cluster of machines using a shared SQL trial database</li>
# <li>An extremely aggressive pruning algorithm was used to quickly narrow in on an optimal hyperparameter space</li>
# <li>Experiment 1 was left to train on the cluster for 2 days</li>
# </ul>
# %%
# Data handling imports
from dask.distributed import Client, LocalCluster
import dask
import dask.dataframe as dd
import dask.array as da
import numpy as np
import pickle
import random
from itertools import chain
from tqdm.auto import tqdm
# Deep learning imports
import torch
from torch.utils.data import DataLoader
from torch import nn
from torch.nn import functional as F
from torch import optim
import pytorch_lightning as pl
from pytorch_lightning import Trainer
import optuna
from optuna.pruners import HyperbandPruner
from optuna.integration import PyTorchLightningPruningCallback
# Suppress some warning messages from pytorch_lightning,
# It really doesn't like that I've forced it to handle a dask array!
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module=pl.__name__)
# Also, set up a log to record debug messages for failed trials
import logging
logging.basicConfig(filename="debug.log", encoding="utf-8", level=logging.ERROR)
# %% [markdown]
# <h3>Patching PyTorchLightning</h3>
# <p>
# A key part of this project was to develop a patch for PyTorchLightning to allow the use of <code>`dask`</code> arrays as inputs. It was important that PyTorchLightning could accept <code>`dask`</code> arrays and only load the data into memory when needed; otherwise, our extremely large datasets would simply crash the system, as they are significantly larger than the available RAM and VRAM.
# </p><p>
# After several versions of the patch, this final version was developed. It is a simple monkey patch that wraps the <code>pytorch_lightning.utilities.data._extract_batch_size</code> generator with a check that mimics the expected behaviour for torch tensors when given a dask array, and extends its type signature so that static analysis is still possible.
# </p><p>
# With this patch applied, the forward method in our model can accept a dask array and only compute each chunk of the array when needed. This allows us to train our model on datasets that are significantly larger than the available memory.
# </p>
# %%
# Monkey patch to allow pytorch lightning to accept a dask array as a model input
from typing import Any, Generator, Iterable, Mapping, Optional, Union
BType = Union[da.Array, torch.Tensor, str, Mapping[Any, "BType"], Iterable["BType"]]
unpatched = pl.utilities.data._extract_batch_size
def patch(batch: BType) -> Generator[Optional[int], None, None]:
if isinstance(batch, da.core.Array):
if len(batch.shape) == 0:
yield 1
else:
yield batch.shape[0]
else:
yield from unpatched(batch)
pl.utilities.data._extract_batch_size = patch
# %%
# Set the device to use with torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Prepare a dask cluster and client
def create_client():
cluster = LocalCluster(n_workers=2, threads_per_worker=1)
client = Client(cluster)
return client
if __name__ == "__main__":
client = create_client()
# %%
# Load X and y for training
samples = list(range(1, 82))
with open("sample_X.pkl", "rb") as f:
X = pickle.load(f)
with open("sample_y.pkl", "rb") as f:
y = pickle.load(f)
# %% [markdown]
# <h3>Dataset Splitting</h3>
# <p>The dataset is split into a training and validation dataset (80:20 split). Because the number of available samples is extremely small, we haven't produced a test dataset. In the future, as more data is obtained, a test set should be included whenever possible.</p>
# %%
# Separate samples into training and validation sets
val_samples = random.sample(samples, k=len(samples) // 5)
train_samples = [s for s in samples if s not in val_samples]
X_train = {i: X[i] for i in train_samples}
X_val = {i: X[i] for i in val_samples}
y_train = {i: y[i] for i in train_samples}
y_val = {i: y[i] for i in val_samples}
# %% [markdown]
# <h3>Dataset Collation</h3>
# <p>This cell defines the collation function used by our torch DataLoader. Using a DataLoader allows us to shuffle and prefetch data, reducing overfitting and maximising performance, as IO will be a bottleneck. In principle the collation function can be built dynamically as a closure, allowing us to select which outputs we train against; for this experiment, however, we match against all outputs for simplicity (a closure-factory sketch is included after the code cell below).</p>
# %%
# Define the data collation function for the DataLoader
def collate_fn(batch):
X0 = batch[0][0][0].to_numpy(dtype=np.float32)[0]
X1 = batch[0][0][1].to_dask_array(lengths=True)
y = batch[0][1].to_numpy(dtype=np.float32)
return (
torch.from_numpy(X0).to(device),
X1,
torch.from_numpy(y).to(device),
)
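# %% [markdown]
# <p>
# For reference, a closure-factory variant of the collation function (in the spirit of the <code>`create_collate_fn`</code> that expt3.ipynb imports from <code>`expt1`</code>) might look like the sketch below. This is illustrative only: the <code>output_keys</code> column selection is an assumption, not code taken from <code>`expt1`</code>.
# </p>
# %%
def create_collate_fn_sketch(output_keys=None):
    """Sketch of a collation closure factory; output_keys optionally selects target columns."""
    def _collate(batch):
        X0 = batch[0][0][0].to_numpy(dtype=np.float32)[0]
        X1 = batch[0][0][1].to_dask_array(lengths=True)
        # Optionally restrict the targets we train against (assumes DataFrame-like y values)
        targets = batch[0][1] if output_keys is None else batch[0][1][output_keys]
        y = targets.to_numpy(dtype=np.float32)
        return (
            torch.from_numpy(X0).to(device),
            X1,
            torch.from_numpy(y).to(device),
        )
    return _collate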
# %% [markdown]
# <h3>Convolutional Data Compression</h3>
# <p>
# The <code>`DaskCompression`</code> module accepts a dask array and applies a convolutional kernel to it to significantly compress the input data. This allows us to transform a larger-than-VRAM dataset into one that can fit on our GPU while (hopefully) retaining the information relevant to training the rest of our model.
# </p><p>
# Note how each kernel is only computed inside <code>compress_kernel</code> and is immediately compressed via convolution. This ensures that only one kernel needs to be held in memory at a time, avoiding the need to hold the entire dataset in memory at once.
# </p>
# %%
class DaskCompression(nn.Module):
def __init__(
self, in_channels, out_channels, kernel_size, chunk_size=1, device=device
):
super(DaskCompression, self).__init__()
self.kernel_size = kernel_size
self.in_channels = in_channels
self.out_channels = out_channels
self.chunk_size = chunk_size
self.device = device
self.conv = nn.Conv1d(in_channels, out_channels, kernel_size).to(device)
def compress_kernel(self, kernel):
return (
self.conv(torch.from_numpy(kernel.compute()).to(self.device))
.squeeze()
.to("cpu") # return to cpu to save VRAM
)
def forward(self, x):
# Precompute the dimensions of the output array
dim0, dim2 = x.shape
assert dim2 == self.in_channels
dim0 = (dim0 // self.kernel_size) // self.chunk_size
x = x.reshape(dim0, self.chunk_size, self.kernel_size, dim2)
x = da.transpose(x, axes=(0, 1, 3, 2))
x = [self.compress_kernel(kernel) for kernel in x]
return torch.stack(x).to(self.device)
# %% [markdown]
# <h3>Model Design</h3>
# <p>
# The model was designed as a dynamically constructed, hyperparameter-driven model for ease of hyperparameter optimisation. The constructed model processes data in the following way:
# </p>
# <ol>
# <li>The input is left/right padded to a multiple of the compressor kernel size</li>
# <li>The dask array is compressed by a <code>`DaskCompression`</code> layer, treating each input as a channel</li>
# <li>The compressed array is then recursively convoluted down to a size less than or equal to the width of our feedforward network</li>
# <li>The channels of the now convolved data are combined</li>
# <li>The combined, flattened data is then left/right padded to the width of the feedforward network</li>
# <li>Finally, the data is fed into a feedforward network</li>
# </ol>
# <p>
# This relatively simple design allows the network to accept both larger-than-RAM datasets and datasets of variable sizes as inputs, making it suitable for training on whole Aconity datasets without the need for culling or binning.
# </p>
# %%
class Model(pl.LightningModule):
def __init__(
self,
# pl attributes
optimizer=torch.optim.Adam,
optimizer_args=(),
optimizer_kwargs={},
scheduler=None,
scheduler_kwargs={},
loss=torch.nn.MSELoss(),
train_ds=None,
val_ds=None,
# model args & kwargs
compressor_kernel_size=128,
compressor_chunk_size=128,
compressor_act=(nn.ReLU, (), {}),
conv_kernel_size=128,
conv_norm=False,
conv_act=(nn.ReLU, (), {}),
channel_combine_act=(nn.ReLU, (), {}),
param_ff_depth=4,
param_ff_width=16,
param_ff_act=(nn.ReLU, (), {}),
ff_width=512,
ff_depth=4,
ff_act=(nn.ReLU, (), {}),
out_size=6,
out_act=(nn.ReLU, (), {}),
):
super().__init__()
# Assign necessary attributes for pl model
self.optimizer = optimizer
self.optimizer_args = optimizer_args
self.optimizer_kwargs = optimizer_kwargs
self.scheduler = scheduler
self.scheduler_kwargs = scheduler_kwargs
self.loss = loss
self.train_ds = train_ds
self.val_ds = val_ds
# Attrs for dynamically created model to be tested
self.compressor_kernel_size = compressor_kernel_size
self.compressor_chunk_size = compressor_chunk_size
self.conv_kernel_size = conv_kernel_size
self.ff_width = ff_width
self.ff_depth = ff_depth
self.out_size = out_size
# layers
# compressor compresses and converts dask array to torch tensor
self.convolutional_compressor = DaskCompression(
5,
5,
kernel_size=compressor_kernel_size,
chunk_size=compressor_chunk_size,
)
self.compressor_act = compressor_act[0](*compressor_act[1], **compressor_act[2])
# convolutional layer recursively applies convolutions to the compressed input
self.conv = nn.Conv1d(5, 5, kernel_size=conv_kernel_size)
self.conv_norm = nn.LocalResponseNorm(5) if conv_norm else nn.Identity()
self.conv_act = conv_act[0](*conv_act[1], **conv_act[2])
self.combine_channels = nn.Conv1d(5, 1, kernel_size=1)
self.channel_combine_act = channel_combine_act[0](
*channel_combine_act[1], **channel_combine_act[2]
)
self.param_ff = nn.Sequential(
nn.Linear(4, param_ff_width),
param_ff_act[0](*param_ff_act[1], **param_ff_act[2]),
*chain(
*(
(
nn.Linear(param_ff_width, param_ff_width),
param_ff_act[0](*param_ff_act[1], **param_ff_act[2]),
)
for _ in range(param_ff_depth)
)
),
)
self.ff = nn.Sequential(
nn.Linear(ff_width + param_ff_width, ff_width),
ff_act[0](*ff_act[1], **ff_act[2]),
*chain(
*(
(
nn.Linear(ff_width, ff_width),
ff_act[0](*ff_act[1], **ff_act[2]),
)
for _ in range(ff_depth)
)
),
)
self.out_dense = nn.Linear(ff_width, out_size)
self.out_act = out_act[0](*out_act[1], **out_act[2])
@staticmethod
def pad_ax0_to_multiple_of(x, multiple_of):
padding = (((x.shape[0] // multiple_of) + 1) * multiple_of) - x.shape[0]
left_pad = padding // 2
right_pad = padding - left_pad
return da.pad(
x, ((left_pad, right_pad), (0, 0)), mode="constant", constant_values=0
)
def pad_to_ff_width(self, x):
padding = self.ff_width - x.shape[1]
left_pad = padding // 2
right_pad = padding - left_pad
return F.pad(
x,
(right_pad, left_pad, 0, 0),
mode="constant",
value=0.0,
)
def forward(self, x0, x1):
# pad to a multiple of kernel_size * chunk_size
x1 = self.pad_ax0_to_multiple_of(
x1, self.compressor_kernel_size * self.compressor_chunk_size
)
x1 = self.convolutional_compressor(x1)
x1 = x1.reshape(x1.shape[0] * x1.shape[1], x1.shape[2]).T.unsqueeze(0)
while x1.shape[2] > self.ff_width:
x1 = self.conv(x1)
x1 = self.conv_norm(x1)
x1 = self.conv_act(x1)
x1 = self.combine_channels(x1)
x1 = self.channel_combine_act(x1)
x1 = x1.squeeze(1)
x1 = self.pad_to_ff_width(x1)
x0 = x0.unsqueeze(0)
x0 = self.param_ff(x0)
x = torch.cat((x1, x0), dim=1)
x = self.ff(x)
x = self.out_dense(x)
x = self.out_act(x)
return x
def configure_optimizers(self):
optimizer = self.optimizer(
self.parameters(), *self.optimizer_args, **self.optimizer_kwargs
)
        if self.scheduler is not None:
            scheduler = self.scheduler(optimizer, **self.scheduler_kwargs)
            # Lightning expects a dict (or lists) here; plateau-style schedulers
            # also need a metric to monitor
            return {
                "optimizer": optimizer,
                "lr_scheduler": {"scheduler": scheduler, "monitor": "val_loss"},
            }
        return optimizer
def train_dataloader(self):
return self.train_ds
def val_dataloader(self):
return self.val_ds
def training_step(self, batch, batch_idx):
x0, x1, y = batch
y_hat = self(x0, x1)
loss = self.loss(y_hat, y)
self.log("train_loss", loss)
return loss
def validation_step(self, batch, batch_idx):
x0, x1, y = batch
y_hat = self(x0, x1)
loss = self.loss(y_hat, y)
self.log("val_loss", loss)
return loss
# %% [markdown]
# <h3>Activation Functions</h3>
# <p>
# For our hyperparameter optimisation, we intend to test all of the activation functions built into PyTorch. In addition to these built-in activations, we will also train using the following custom activation functions, taken from the literature or of our own design:
# </p>
# <ol>
# <li><b><code>BMU</code>:</b> Bio-Mimicking Unit; an activation function designed to mimic the activation potential of a biological neuron.</li>
# <li><b><code>SoftExp</code>:</b> Soft Exponential function; a parametric activation function that fits to a wide variety of exponential curves (DOI: <a href=https://arxiv.org/abs/1602.01321v1>10.48550/arXiv.1602.01321</a>)</li>
# <li><b><code>LeakyPReQU</code>:</b> Leaky Parametric Rectified Quadratic Unit; A smoothly and continuously differentiable function that is a parametrically sloped line for <code>x&#8924;0</code> and a quadratic curve for <code>x&gt;0</code></li>
# <li><b><code>ISRU</code>:</b> Inverse Square Root Unit; a somewhat uncommon function that can be useful in models such as this one, as it yields a continuously differentiable curve while being extremely fast to compute using bit manipulation</li>
# <li><b><code>ISRLU</code>:</b> Inverse Square Root Linear Unit; a modified ISRU that is an ISRU for <code>x&lt;0</code> and <code>`f(x)=x`</code> for <code>x&#8925;0</code> (DOI: <a href=https://arxiv.org/abs/1710.09967>10.48550/arXiv.1710.09967</a>); a minimal sketch is given after this list</li>
# <li><b><code>PBessel</code>:</b> Parametric Bessel; a parametric Bessel curve yielding various wave formations depending on a trainable parameter</li>
# <li><b><code>Sinusoid</code>:</b> A parametric sine wave, with amplitude and wavelength as trainable parameters</li>
# <li><b><code>Modulo</code>:</b> A parametric sawtooth wave, <code>f(x)=x%&#593;</code>, where &#593; is a trainable parameter</li>
# <li><b><code>TriWave</code>:</b> A parametric triangle wave, with amplitude and wavelength as trainable parameters</li>
# <li><b><code>Gaussian</code>:</b> A parametric Gaussian curve, with trainable amplitude</li>
# </ol>
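# %% [markdown]
# <p>
# As an illustration, a minimal sketch of one of these activations (ISRLU, as described above) is given below. The class name <code>ISRLUSketch</code>, the parameter name <code>alpha</code>, and its initialisation are assumptions for illustration; the actual implementations live in <code>custom_activations.py</code>.
# </p>
# %%
class ISRLUSketch(nn.Module):
    """Inverse Square Root Linear Unit: ISRU for x < 0, identity for x >= 0."""
    def __init__(self, alpha=1.0):
        super().__init__()
        # alpha controls how strongly negative inputs saturate (assumed trainable here)
        self.alpha = nn.Parameter(torch.tensor(float(alpha)))
    def forward(self, x):
        # x / sqrt(1 + alpha * x^2) for negative inputs, identity otherwise
        return torch.where(x < 0, x / torch.sqrt(1 + self.alpha * x * x), x)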
# %%
# Create a dispatcher including all builtin activations and
# Several custom activations from experimentation or literature
from custom_activations import SoftExp, PBessel
activation_dispatcher = {
"Tanh": nn.Tanh,
"SiLU": nn.SiLU,
"Softplus": nn.Softplus,
"SoftExp": SoftExp,
"PBessel": PBessel,
}
from boltons.dictutils import FrozenDict
out_act_dispatcher = {
"Sigmoid": (nn.Sigmoid, (), {}),
"Softmax": (nn.Softmax, (), {"dim": 1}),
}
# %% [markdown]
# <h3>Hyperparameter training</h3>
# <p>Here, we define an objective function describing what we want Optuna to do during each trial and how to react to any errors or other situations that may arise. To summarise the objective:</p>
# <ul>
# <li>Optuna selects hyperparameters for all input parameters within the given constraints</li>
# <li>A model is generated using the selected hyperparameters</li>
# <li>PyTorchLightning trains the model through 2 epochs</li>
# <li>The model is evaluated on the validation set</li>
# <li>The validation loss is returned to Optuna</li>
# </ul>
# <p>
# Optuna monitors the reported validation loss and attempts to minimise it. An extremely aggressive pruning strategy known as "hyperband pruning" is used to efficiently reduce the parameter space to something more reasonable: any parameter set that Optuna deems suboptimal is pruned early to save time.
# </p>
# %%
# Test parameters
n_epochs = 2
output_keys = list(next(iter(y_train.values())).keys())
activation_vals = list(activation_dispatcher.keys())
out_act_vals = list(out_act_dispatcher.keys())
# Next we define the objective function for the hyperparameter optimization
def objective(trial):
torch.cuda.empty_cache()
objective_value = torch.inf
model = None
logger = None
try:
# Select hyperparameters for testing
compressor_kernel_size = 128
compressor_chunk_size = 128
compressor_act = (SoftExp, (), {})
conv_kernel_size = 128
conv_norm = False
conv_act = (nn.Tanh, (), {})
channel_combine_act = (nn.Softplus, (), {})
param_ff_depth = 2
param_ff_width = 16
param_ff_act = (PBessel, (), {})
ff_width = 1024
ff_depth = 4
ff_act = (nn.Softplus, (), {})
out_size = 2
out_act = out_act_dispatcher[trial.suggest_categorical("out_act", out_act_vals)]
# Set up the model architecture and other necessary components
model = Model(
compressor_kernel_size=compressor_kernel_size,
compressor_chunk_size=compressor_chunk_size,
compressor_act=compressor_act,
conv_kernel_size=conv_kernel_size,
conv_act=conv_act,
conv_norm=conv_norm,
channel_combine_act=channel_combine_act,
param_ff_depth=param_ff_depth,
param_ff_width=param_ff_width,
param_ff_act=param_ff_act,
ff_width=ff_width,
ff_depth=ff_depth,
ff_act=ff_act,
out_size=out_size,
out_act=out_act,
).to(device)
trainer = Trainer(
accelerator="gpu",
max_epochs=n_epochs,
devices=1,
logger=logger,
num_sanity_val_steps=0, # Needs to be disabled or else we get an error because X is dask array
# precision="16-mixed",
callbacks=[
PyTorchLightningPruningCallback(trial, monitor="val_loss"),
],
)
# Prepare datasets
train = DataLoader(
list(zip(X_train.values(), y_train.values())),
collate_fn=collate_fn,
shuffle=True,
)
valid = DataLoader(
list(zip(X_val.values(), y_val.values())),
shuffle=True,
collate_fn=collate_fn,
)
        # Finally, train the model and record the final validation loss for Optuna
        trainer.fit(model, train, valid)
        objective_value = trainer.callback_metrics["val_loss"].item()
except Exception as e:
logging.exception(f"An exception occurred in trial {trial.number}: {e}")
raise optuna.exceptions.TrialPruned()
finally:
if logger is not None:
logger.experiment.unwatch(model)
logger.experiment.finish()
del model
torch.cuda.empty_cache()
if objective_value == torch.inf:
raise optuna.exceptions.TrialPruned()
return objective_value
# %% [markdown]
# <h3>Hyperparameter Optimisation on a Computing Cluster</h3>
# <p>
# The final important step is to run the optimisation on a cluster of computers to maximise the number of trials that can run in parallel. Although this could be achieved with a more complex, scheduler-controlled system built on dask, we use the far simpler approach of a shared SQL ledger to keep track of the trials and their results. This is sufficient for our purposes and easy to implement; using it, the model was trained on a cluster of 5 computers at once.
# </p>
# %%
if __name__ == "__main__":
# storage_name = "sqlite:///optuna.sql"
storage_name = "mysql+pymysql://root:Ch31121992@192.168.1.10:3306/optuna_db"
study_name = "Composition Experiment 2"
    study = optuna.create_study(
        study_name=study_name,
        storage=storage_name,
        direction="minimize",
        pruner=HyperbandPruner(),
        load_if_exists=True,
    )
study.optimize(
objective,
n_trials=10,
timeout=None,
)

7024
expt2_analysis.ipynb Normal file

File diff suppressed because it is too large

398
expt3.ipynb Normal file
View File

@@ -0,0 +1,398 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1>Experiment 3</h1>\n",
"<h3>Optimised model training</h3>\n",
"<p>In experiment 3 the model was trained using the optimised hyperparameters. By examining the results of expt2, it was noticed that trials #1, #10, and #16 all resulted in quite low losses while also showing clear downward trends resembling a clearly discernible training curve. Of these, trial #16 was ultimately selected as the model to be tested, as the data suggests that <code>`in_act=Mish`</code> tends to give the lowest losses in most models tested. The parameters for trial #16 were as follows:</p>\n",
"<ul>\n",
"<li><b>in_act</b> = Mish</li>\n",
"<li><b>compressor_kernel_size</b> = 128</li>\n",
"<li><b>compressor_chunk_size</b> = 128</li>\n",
"<li><b>compressor_act</b> = SoftExp</li>\n",
"<li><b>conv_kernel_size</b> = 128</li>\n",
"<li><b>conv_act</b> = Sigmoid</li>\n",
"<li><b>channel_combine_act</b> = GELU</li>\n",
"<li><b>ff_width</b> = 512</li>\n",
"<li><b>ff_depth</b> = 2</li>\n",
"<li><b>ff_act</b> = CELU</li>\n",
"<li><b>out_act</b> = Tanhshrink</li>\n",
"</ul>\n",
"<p>\n",
"Because most of the training curves in expt2 appeared to be unstable, a learning rate scheduler was used to reduce the learning rate by 20% if the validation loss did not improve for 5 epochs. The model was checkpointed, with the best 10 iterations of the model being retained for testing after training.\n",
"</p>\n",
"<h3>Modified optimal model training</h3>\n",
"<p>\n",
"Following the first attempt at training the optimised model (Model 1, Test 1), it was noted that training curves were clearly discernible, but still quite unstable and noisy. To try and further improve the stability of the training, a modified version of the model was prepared and trained (Model 2, Test 2). The modified model was the same as Model 1, but with the addition of a LayerNormalization layer to the convolutional layer of the <code>`DaskCompressor`</code> submodule. This change was made because highly recurrent submodules such as the compressor are known to be especially prone to instability caused by vanishing or exploding gradients. It was reasoned that by normalizing at each iteration the gradients would be less likely to vanish or explode, making the training more stable.\n",
"</p>"
]
},
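{
"cell_type": "markdown",
"metadata": {},
"source": [
"<p>For reference (not part of the original training runs), the cell below sketches the Model 2 modification described above: a <code>`LayerNorm`</code> applied to the output of the compressor's convolution. The class name <code>DaskCompressionLN</code> and the choice of normalising over the channel dimension are assumptions for illustration only.</p>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch only: the compressor from expt1/expt2 with a LayerNorm after its convolution,\n",
"# intended to keep the recursively reused activations well scaled (the Model 2 idea).\n",
"class DaskCompressionLN(nn.Module):\n",
"    def __init__(self, in_channels, out_channels, kernel_size, chunk_size=1):\n",
"        super().__init__()\n",
"        self.kernel_size = kernel_size\n",
"        self.chunk_size = chunk_size\n",
"        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size)\n",
"        # Normalise each compressed kernel over its channel dimension\n",
"        self.norm = nn.LayerNorm(out_channels)\n",
"\n",
"    def compress_kernel(self, kernel):\n",
"        # Compute one dask kernel at a time, convolve it, then normalise\n",
"        out = self.conv(torch.from_numpy(kernel.compute()).float()).squeeze()\n",
"        return self.norm(out).to(\"cpu\")\n",
"\n",
"    def forward(self, x):\n",
"        dim0, dim2 = x.shape\n",
"        dim0 = (dim0 // self.kernel_size) // self.chunk_size\n",
"        x = x.reshape(dim0, self.chunk_size, self.kernel_size, dim2)\n",
"        x = da.transpose(x, axes=(0, 1, 3, 2))\n",
"        return torch.stack([self.compress_kernel(k) for k in x])"
]
},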
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Data handling imports\n",
"from dask.distributed import Client, LocalCluster\n",
"import dask\n",
"import dask.dataframe as dd\n",
"import dask.array as da\n",
"import numpy as np\n",
"import pickle\n",
"import random\n",
"from itertools import chain\n",
"from tqdm.auto import tqdm\n",
"\n",
"# Deep learning imports\n",
"import torch\n",
"from torch.utils.data import DataLoader\n",
"from torch import nn\n",
"from torch.nn import functional as F\n",
"from torch import optim\n",
"import pytorch_lightning as pl\n",
"from pytorch_lightning import Trainer\n",
"from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint\n",
"from pytorch_lightning.loggers import WandbLogger\n",
"\n",
"# Suppress some warning messages from pytorch_lightning,\n",
"# It really doesn't like that i've forced it to handle a dask array!\n",
"import warnings\n",
"\n",
"warnings.filterwarnings(\"ignore\", category=UserWarning, module=pl.__name__)\n",
"\n",
"# Also, set up a log to record debug messages for failed trials\n",
"import logging\n",
"\n",
"logging.basicConfig(filename=\"debug.log\", encoding=\"utf-8\", level=logging.DEBUG)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from expt1 import (\n",
" Model,\n",
" device,\n",
" X_train,\n",
" y_train,\n",
" X_val,\n",
" y_val,\n",
" create_collate_fn,\n",
")\n",
"from custom_activations import SoftExp"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/distributed/node.py:182: UserWarning: Port 8787 is already in use.\n",
"Perhaps you already have a cluster running?\n",
"Hosting the HTTP server on port 34477 instead\n",
" warnings.warn(\n"
]
}
],
"source": [
"cluster = LocalCluster(n_workers=8, threads_per_worker=1)\n",
"client = Client(cluster)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Monkey patch to allow pytorch lightning to accept a dask array as a model input\n",
"from typing import Any, Generator, Iterable, Mapping, Optional, Union\n",
"\n",
"BType = Union[da.Array, torch.Tensor, str, Mapping[Any, \"BType\"], Iterable[\"BType\"]]\n",
"\n",
"unpatched = pl.utilities.data._extract_batch_size\n",
"\n",
"\n",
"def patch(batch: BType) -> Generator[Optional[int], None, None]:\n",
" if isinstance(batch, da.core.Array):\n",
" if len(batch.shape) == 0:\n",
" yield 1\n",
" else:\n",
" yield batch.shape[0]\n",
" else:\n",
" yield from unpatched(batch)\n",
"\n",
"\n",
"pl.utilities.data._extract_batch_size = patch"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Prepare datasets\n",
"train = DataLoader(\n",
" list(zip(X_train.values(), y_train.values())),\n",
" collate_fn=create_collate_fn(),\n",
" shuffle=True,\n",
")\n",
"valid = DataLoader(\n",
" list(zip(X_val.values(), y_val.values())),\n",
" shuffle=True,\n",
" collate_fn=create_collate_fn(),\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Set up the model architecture and other necessary components\n",
"model = Model(\n",
" # Training parameters\n",
" optimizer=optim.Adam,\n",
" scheduler=optim.lr_scheduler.ReduceLROnPlateau,\n",
" scheduler_kwargs={\"factor\": 0.8, \"patience\": 5},\n",
" # Model parameters\n",
" in_act=(nn.Mish, list(), dict()),\n",
" compressor_kernel_size=128,\n",
" compressor_chunk_size=128,\n",
" compressor_act=(SoftExp, list(), dict()),\n",
" conv_kernel_size=128,\n",
" conv_act=(nn.Sigmoid, list(), dict()),\n",
" channel_combine_act=(nn.GELU, list(), dict()),\n",
" ff_width=512,\n",
" ff_depth=2,\n",
" ff_act=(nn.CELU, list(), dict()),\n",
" out_size=len(list(next(iter(y_train.values())).keys())),\n",
" out_act=(nn.Tanhshrink, list(), dict()),\n",
").to(device)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[34m\u001b[1mwandb\u001b[0m: Currently logged in as: \u001b[33mchughes000\u001b[0m. Use \u001b[1m`wandb login --relogin`\u001b[0m to force relogin\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "6d1624339b4c4aaeb195b5ebc3b3e69e",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"VBox(children=(Label(value='Waiting for wandb.init()...\\r'), FloatProgress(value=0.016669258750092317, max=1.0…"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"wandb version 0.15.8 is available! To upgrade, please run:\n",
" $ pip install wandb --upgrade"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Tracking run with wandb version 0.15.7"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Run data is saved locally in <code>./wandb/run-20230801_233841-q70oibx2</code>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"Syncing run <strong><a href='https://wandb.ai/chughes000/Aconity_ML_Test_DryRun/runs/q70oibx2' target=\"_blank\">Test 2</a></strong> to <a href='https://wandb.ai/chughes000/Aconity_ML_Test_DryRun' target=\"_blank\">Weights & Biases</a> (<a href='https://wandb.me/run' target=\"_blank\">docs</a>)<br/>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" View project at <a href='https://wandb.ai/chughes000/Aconity_ML_Test_DryRun' target=\"_blank\">https://wandb.ai/chughes000/Aconity_ML_Test_DryRun</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
" View run at <a href='https://wandb.ai/chughes000/Aconity_ML_Test_DryRun/runs/q70oibx2' target=\"_blank\">https://wandb.ai/chughes000/Aconity_ML_Test_DryRun/runs/q70oibx2</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"GPU available: True (cuda), used: True\n",
"TPU available: False, using: 0 TPU cores\n",
"IPU available: False, using: 0 IPUs\n",
"HPU available: False, using: 0 HPUs\n"
]
}
],
"source": [
"early_stop_callback = EarlyStopping(\n",
" monitor=\"val_loss\", patience=15, verbose=False, mode=\"min\"\n",
")\n",
"\n",
"checkpoint_callback = ModelCheckpoint(\n",
" monitor=\"val_loss\",\n",
" dirpath=\"./checkpoints\",\n",
" filename=\"checkpoint-{epoch:02d}-{val_loss:.2f}\",\n",
" save_top_k=10,\n",
" mode=\"min\",\n",
")\n",
"\n",
"logger = WandbLogger(project=\"Aconity_ML_Test_DryRun\", name=f\"Test 1\")\n",
"logger.experiment.watch(model, log=\"all\", log_freq=1)\n",
"\n",
"trainer = Trainer(\n",
" accelerator=\"gpu\",\n",
" max_epochs=-1,\n",
" devices=\"auto\",\n",
" strategy=\"auto\",\n",
" logger=logger,\n",
" callbacks=[checkpoint_callback, early_stop_callback],\n",
" num_sanity_val_steps=0, # Needs to be disabled or else we get an error because X is dask array\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]\n",
"/home/cianh/Programming/Git_Projects/Aconity_ML_Test/.venv/lib/python3.11/site-packages/pytorch_lightning/core/optimizer.py:361: RuntimeWarning: Found unsupported keys in the optimizer configuration: {'scheduler'}\n",
" rank_zero_warn(\n",
"\n",
" | Name | Type | Params\n",
"--------------------------------------------------------------\n",
"0 | loss | MSELoss | 0 \n",
"1 | in_act | Mish | 0 \n",
"2 | convolutional_compressor | DaskCompression | 3.2 K \n",
"3 | compressor_act | SoftExp | 1 \n",
"4 | conv | Conv1d | 3.2 K \n",
"5 | conv_act | Sigmoid | 0 \n",
"6 | combine_channels | Conv1d | 6 \n",
"7 | channel_combine_act | GELU | 0 \n",
"8 | ff | Sequential | 525 K \n",
"9 | out_dense | Linear | 11.8 K\n",
"10 | out_act | Tanhshrink | 0 \n",
"--------------------------------------------------------------\n",
"543 K Trainable params\n",
"0 Non-trainable params\n",
"543 K Total params\n",
"2.174 Total estimated model params size (MB)\n"
]
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "25d7ba2f5e3c4f68a55fdafed5a5b092",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Training: 0it [00:00, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Finally, train the model\n",
"trainer.fit(model, train, valid)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

101
expt3.py Normal file
View File

@@ -0,0 +1,101 @@
# From expt2 selected trials ???
# Data handling imports
from dask.distributed import Client, LocalCluster
import dask.array as da
# Deep learning imports
import torch
from torch.utils.data import DataLoader
from torch import nn
from torch import optim
import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from expt1 import (
Model,
device,
X_train,
y_train,
X_val,
y_val,
collate_fn,
)
from custom_activations import SoftExp, PBessel
# Suppress some warning messages from pytorch_lightning,
# It really doesn't like that I've forced it to handle a dask array!
import warnings
import logging
warnings.filterwarnings("ignore", category=UserWarning, module=pl.__name__)
# Also, set up a log to record debug messages for failed trials
logging.basicConfig(filename="debug.log", encoding="utf-8", level=logging.ERROR)
if __name__ == "__main__":
cluster = LocalCluster(n_workers=8, threads_per_worker=1)
client = Client(cluster)
# Prepare datasets
train = DataLoader(
list(zip(X_train.values(), y_train.values())),
collate_fn=collate_fn,
shuffle=True,
)
valid = DataLoader(
list(zip(X_val.values(), y_val.values())),
shuffle=True,
collate_fn=collate_fn,
)
# Set up the model architecture and other necessary components
model = Model(
# Training parameters
optimizer=optim.Adam,
# Model parameters
compressor_kernel_size=128,
compressor_chunk_size=128,
compressor_act=(SoftExp, (), {}),
conv_kernel_size=128,
conv_act=(nn.Tanh, (), {}),
conv_norm=False,
channel_combine_act=(nn.Softplus, (), {}),
param_ff_depth=2,
param_ff_width=16,
param_ff_act=(PBessel, (), {}),
ff_width=1024,
ff_depth=6,
ff_act=(nn.Softplus, (), {}),
out_size=2,
out_act=(nn.Sigmoid, tuple(), dict()),
).to(device)
if __name__ == "__main__":
early_stop_callback = EarlyStopping(
monitor="val_loss", patience=15, verbose=False, mode="min"
)
checkpoint_callback = ModelCheckpoint(
monitor="val_loss",
dirpath="./checkpoints",
filename="checkpoint-{epoch:02d}-{val_loss:.2f}",
save_top_k=10,
mode="min",
)
logger = WandbLogger(project="Aconity_ML_Expt1", name="Test 3")
logger.experiment.watch(model, log="all", log_freq=1)
trainer = Trainer(
accelerator="gpu",
max_epochs=-1,
devices="auto",
strategy="auto",
logger=logger,
callbacks=[checkpoint_callback, early_stop_callback],
num_sanity_val_steps=0, # Disabled or we get error because X is dask array
)
# Finally, train the model
trainer.fit(model, train, valid)

1229
expt3_analysis.ipynb Normal file

File diff suppressed because it is too large

1382
process_X_data.ipynb Normal file

File diff suppressed because one or more lines are too long

213
process_y_data.ipynb Normal file
View File

@@ -0,0 +1,213 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
"# Read the excel file\n",
"doe_df = pd.read_excel(\n",
" \"data/NiTi_Cubes_Analysis.xlsx\",\n",
" sheet_name=\"DOE & RSPNS\",\n",
" header=1,\n",
" usecols=\"A:M, T:AC\",\n",
" nrows=81,\n",
")\n",
"# Remove newlines from column names\n",
"doe_df.rename(\n",
" mapper=dict(zip(doe_df.keys(), (k.replace(\"\\n\", \" \") for k in doe_df.keys()))),\n",
" axis=1,\n",
" inplace=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doe_df"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Split the dataframe into a dictionary of dataframes, one for each sample\n",
"sample_y = dict(iter(doe_df.groupby(\"Sample\")))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sample_y[1]"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Finally, pickle this data for use in experiments\n",
"with open(\"sample_y.pkl\", \"wb\") as f:\n",
" pickle.dump(sample_y, f)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Sample</th>\n",
" <th>Laser power, P\\n(W)</th>\n",
" <th>Scan speed, V\\n(mm/sec)</th>\n",
" <th>Spot size, F\\n(µm)</th>\n",
" <th>Hatch spacing, H\\n(µm)</th>\n",
" <th>Surface Energy Density @ 90µm Layer thickness, El (J/mm2)</th>\n",
" <th>Surface Energy Density @ Spot size, EF (J/mm2)</th>\n",
" <th>Vol. Energy Density @ Hatch Spacing, VEDH (J/mm3)</th>\n",
" <th>Vol. Energy Density @ Spot Size, VEDF (J/mm3)</th>\n",
" <th>Density\\n(Archimedes by Acetone)</th>\n",
" <th>...</th>\n",
" <th>Ni</th>\n",
" <th>Ti</th>\n",
" <th>Oxygen</th>\n",
" <th>Carbon</th>\n",
" <th>Ni (Norm)</th>\n",
" <th>Ti (Norm)</th>\n",
" <th>Sa (um)</th>\n",
" <th>Sku</th>\n",
" <th>Ssk</th>\n",
" <th>Sz (um)</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>180</td>\n",
" <td>1000</td>\n",
" <td>40</td>\n",
" <td>40</td>\n",
" <td>2.0</td>\n",
" <td>4.5</td>\n",
" <td>50.0</td>\n",
" <td>50.0</td>\n",
" <td>6.343695</td>\n",
" <td>...</td>\n",
" <td>41.33</td>\n",
" <td>43.76</td>\n",
" <td>1.1</td>\n",
" <td>13.81</td>\n",
" <td>48.5721</td>\n",
" <td>51.4279</td>\n",
" <td>18.686</td>\n",
" <td>3.243</td>\n",
" <td>0.28</td>\n",
" <td>187.116</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" Sample Laser power, P\\n(W) Scan speed, V\\n(mm/sec) Spot size, F\\n(µm) \\\n",
"0 1 180 1000 40 \n",
"\n",
" Hatch spacing, H\\n(µm) \\\n",
"0 40 \n",
"\n",
" Surface Energy Density @ 90µm Layer thickness, El (J/mm2) \\\n",
"0 2.0 \n",
"\n",
" Surface Energy Density @ Spot size, EF (J/mm2) \\\n",
"0 4.5 \n",
"\n",
" Vol. Energy Density @ Hatch Spacing, VEDH (J/mm3) \\\n",
"0 50.0 \n",
"\n",
" Vol. Energy Density @ Spot Size, VEDF (J/mm3) \\\n",
"0 50.0 \n",
"\n",
" Density\\n(Archimedes by Acetone) ... Ni Ti Oxygen Carbon \\\n",
"0 6.343695 ... 41.33 43.76 1.1 13.81 \n",
"\n",
" Ni (Norm) Ti (Norm) Sa (um) Sku Ssk Sz (um) \n",
"0 48.5721 51.4279 18.686 3.243 0.28 187.116 \n",
"\n",
"[1 rows x 23 columns]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sample_y[1]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

53
pyproject.toml Normal file
View File

@@ -0,0 +1,53 @@
[tool.poetry]
name = "aconity-ml-test"
version = "0.1.0"
description = ""
authors = ["Cian Hughes <cian.hughes@dcu.ie>"]
readme = "README.md"
# packages = [{include = "aconity_ml_test"}]
[tool.poetry.dependencies]
python = ">=3.9,<3.12"
read_layers = { file = "../MTPy/wheels/read_layers-0.1.0-cp311-cp311-manylinux_2_34_x86_64.whl" }
dask = { extras = ["distributed", "graphviz"], version = "*" }
pytorch-lightning = "^2.0.6"
xgboost = "^1.7.6"
optuna = "^3.2.0"
wandb = "^0.15.7"
numba = "^0.57.1"
tqdm = "^4.65.0"
matplotlib = "^3.7.2"
plotly = "^5.15.0"
bokeh = "^3.2.1"
holoviews = "^1.17.0"
datashader = "^0.15.1"
psutil = "^5.9.5"
pandas = "^2.0.3"
tables = "^3.8.0"
lz4 = "^4.3.2"
openpyxl = "^3.1.2"
odfpy = "^1.4.1"
fsspec = "^2023.6.0"
jupyterlab = "^4.0.3"
jupyter = "^1.0.0"
ipywidgets = "^8.0.7"
pyarrow = "^12.0.1"
jupyter-bokeh = "^3.0.7"
torch = { file = "./wheel/torch-2.0.1+cu118-cp311-cp311-linux_x86_64.whl" }
optuna-dashboard = "^0.10.3"
pymysql = "^1.1.0"
mysqlclient = "^2.2.0"
tabulate = "^0.9.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
[[tool.poetry.source]]
name = "PyPI"
priority = "primary"
# [[tool.poetry.source]]
# name = "nvidia"
# url = "https://pypi.ngc.nvidia.com"
# priority = "primary"