mirror of https://github.com/Cian-H/symbolic_nn_tests.git
Refactor of experiment2 for convenience
poetry.lock (generated): 34 changed lines
@@ -2321,6 +2321,24 @@ dev = ["black", "flake8", "isort", "pre-commit"]
doc = ["myst-parser", "sphinx", "sphinx-book-theme"]
test = ["coverage", "pytest", "pytest-cov"]

[[package]]
name = "loguru"
version = "0.7.2"
description = "Python logging made (stupidly) simple"
optional = false
python-versions = ">=3.5"
files = [
{file = "loguru-0.7.2-py3-none-any.whl", hash = "sha256:003d71e3d3ed35f0f8984898359d65b79e5b21943f78af86aa5491210429b8eb"},
{file = "loguru-0.7.2.tar.gz", hash = "sha256:e671a53522515f34fd406340ee968cb9ecafbc4b36c679da03c18fd8d0bd51ac"},
]

[package.dependencies]
colorama = {version = ">=0.3.4", markers = "sys_platform == \"win32\""}
win32-setctime = {version = ">=1.0.0", markers = "sys_platform == \"win32\""}

[package.extras]
dev = ["Sphinx (==7.2.5)", "colorama (==0.4.5)", "colorama (==0.4.6)", "exceptiongroup (==1.1.3)", "freezegun (==1.1.0)", "freezegun (==1.2.2)", "mypy (==v0.910)", "mypy (==v0.971)", "mypy (==v1.4.1)", "mypy (==v1.5.1)", "pre-commit (==3.4.0)", "pytest (==6.1.2)", "pytest (==7.4.0)", "pytest-cov (==2.12.1)", "pytest-cov (==4.1.0)", "pytest-mypy-plugins (==1.9.3)", "pytest-mypy-plugins (==3.0.0)", "sphinx-autobuild (==2021.3.14)", "sphinx-rtd-theme (==1.3.0)", "tox (==3.27.1)", "tox (==4.11.0)"]

[[package]]
name = "mako"
version = "1.3.5"
@@ -5341,6 +5359,20 @@ files = [
{file = "widgetsnbextension-4.0.10.tar.gz", hash = "sha256:64196c5ff3b9a9183a8e699a4227fb0b7002f252c814098e66c4d1cd0644688f"},
]

[[package]]
name = "win32-setctime"
version = "1.1.0"
description = "A small Python utility to set file creation time on Windows"
optional = false
python-versions = ">=3.5"
files = [
{file = "win32_setctime-1.1.0-py3-none-any.whl", hash = "sha256:231db239e959c2fe7eb1d7dc129f11172354f98361c4fa2d6d2d7e278baa8aad"},
{file = "win32_setctime-1.1.0.tar.gz", hash = "sha256:15cf5750465118d6929ae4de4eb46e8edae9a5634350c01ba582df868e932cb2"},
]

[package.extras]
dev = ["black (>=19.3b0)", "pytest (>=4.6.2)"]

[[package]]
name = "xyzservices"
version = "2024.4.0"
@@ -5458,4 +5490,4 @@ multidict = ">=4.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "f607472660b04b7f6f5d49a4561730f788a46f0d1e0176322e872111b00481cd"
content-hash = "d8cc3168211c9f7eaddf78e15b3077aadb6cda3358dfacfa83da732af83aa899"
@@ -30,6 +30,7 @@ safetensors = "^0.4.3"
alive-progress = "^3.1.5"
hvplot = "^0.10.0"
pyarrow = "^16.1.0"
loguru = "^0.7.2"


[build-system]
@@ -1,14 +1,15 @@
import typer
from typing import Optional, Iterable
from typing_extensions import Annotated
from . import experiment1
from loguru import logger
from . import experiment1, experiment2


EXPERIMENTS = (experiment1,)
EXPERIMENTS = (experiment1, experiment2)


def parse_int_or_intiterable(i: Optional[str]) -> Iterable[int]:
return range(1, len(EXPERIMENTS) + 1) if i is None else map(int, i.split(","))
return range(1, len(EXPERIMENTS) + 1) if i is None else list(map(int, i.split(",")))


def main(

@@ -26,10 +27,10 @@ def main(
bool, typer.Option(help="Whether or not to log via Weights & Biases")
] = True,
):
experiment_indeces = (i - 1 for i in experiments)
experiment_funcs = [EXPERIMENTS[i].run for i in experiment_indeces]

for experiment in experiment_funcs:
for i, n in enumerate(experiments, start=1):
j = n - 1
experiment = EXPERIMENTS[j].run
logger.info(f"Running Experiment {n} ({i}/{len(experiments)})...")
experiment(tensorboard=tensorboard, wandb=wandb)
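The list(map(...)) change in parse_int_or_intiterable matters because the parsed selection is both iterated and measured with len() when logging progress; a lazy map object supports neither reuse nor len(). A quick illustrative sketch (editor's example, not part of the commit):

# Why the parsed experiment selection should be a list rather than a lazy map object.
lazy = map(int, "1,2".split(","))
# len(lazy) raises TypeError, and a second pass over it yields nothing.
eager = list(map(int, "1,2".split(",")))
assert len(eager) == 2  # usable in messages like f"({i}/{len(experiments)})"
assert [n - 1 for n in eager] == [0, 1]  # 1-based CLI indices map to 0-based EXPERIMENTS indices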
@@ -27,49 +27,25 @@ def test(loss_func, version, tensorboard=True, wandb=True):
)
logger.append(wandb_logger)

test_model(logger=logger, loss_func=loss_func, lr=LEARNING_RATE)
test_model(logger=logger, loss_func=loss_func)

if wandb:
_wandb.finish()


def run(tensorboard: bool = True, wandb: bool = True):
from .model import unpacking_mse_loss
from . import semantic_loss
from torch import nn

test(
nn.functional.cross_entropy,
"cross_entropy",
unpacking_mse_loss,
"mse_loss",
tensorboard=tensorboard,
wandb=wandb,
)
test(
semantic_loss.similarity_cross_entropy,
"similarity_cross_entropy",
tensorboard=tensorboard,
wandb=wandb,
)
test(
semantic_loss.hasline_cross_entropy,
"hasline_cross_entropy",
tensorboard=tensorboard,
wandb=wandb,
)
test(
semantic_loss.hasloop_cross_entropy,
"hasloop_cross_entropy",
tensorboard=tensorboard,
wandb=wandb,
)
test(
semantic_loss.multisemantic_cross_entropy,
"multisemantic_cross_entropy",
tensorboard=tensorboard,
wandb=wandb,
)
test(
semantic_loss.garbage_cross_entropy,
"garbage_cross_entropy",
semantic_loss.positive_slope_linear_loss,
"positive_slope_linear_loss",
tensorboard=tensorboard,
wandb=wandb,
)
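The refactored run() pairs each loss callable with a version label for the loggers. The same dispatch can be read as data plus a loop; the following is only an illustrative rewrite by the editor, not the code in the commit:

# Illustration: run() as a loop over (loss function, version label) pairs.
# `test` is the function shown above; the pairs are the two calls made in the commit.
def run(tensorboard: bool = True, wandb: bool = True):
    from . import semantic_loss
    from .model import unpacking_mse_loss

    cases = (
        (unpacking_mse_loss, "mse_loss"),
        (semantic_loss.positive_slope_linear_loss, "positive_slope_linear_loss"),
    )
    for loss_func, version in cases:
        test(loss_func, version, tensorboard=tensorboard, wandb=wandb)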
@@ -11,6 +11,7 @@ from multiprocessing import Pool
from symbolic_nn_tests.dataloader import DATASET_DIR
import warnings
from tqdm.auto import tqdm
from loguru import logger


warnings.filterwarnings(action="ignore", category=UserWarning)

@@ -47,58 +48,58 @@ def get_dataset():
):
construct_dataset("pubchem")
else:
print("Pre-existing dataset detected!")
print("Dataset loaded!")
logger.info("Pre-existing dataset detected!")
logger.info("Dataset loaded!")
return TensorDataset(*load_dataset("pubchem"))


def construct_dataset(filename):
print("Constructing dataset...")
logger.info("Constructing dataset...")
df = construct_ds_dataframe(filename)
save_dataframe_to_dataset(df, PUBCHEM_DIR / f"{filename}.pickle")
print("Dataset constructed!")
logger.info("Dataset constructed!")


def construct_ds_dataframe(filename):
print("Constructing dataset dataframe...")
logger.info("Constructing dataset dataframe...")
df = add_molecule_encodings(construct_raw_dataset(filename))
# NOTE: This kind of checkpointing will be used throughout the construction process. It doesn't
# take much disk space, it lets the GC collect out-of-scope data from the construction process,
# and it makes it easier to debug if construction fails
parquet_file = PUBCHEM_DIR / f"{filename}.parquet"
df.write_parquet(parquet_file)
print("Dataset dataframe constructed!")
logger.info("Dataset dataframe constructed!")
return pl.read_parquet(parquet_file)


def construct_raw_dataset(filename):
print("Constructing raw dataset...")
logger.info("Constructing raw dataset...")
df = collate_dataset()
parquet_file = PUBCHEM_DIR / f"{filename}_raw.parquet"
df.write_parquet(parquet_file)
print("Raw dataset constructed!")
logger.info("Raw dataset constructed!")
return pl.read_parquet(parquet_file)


def collate_dataset():
print("Collating dataset...")
logger.info("Collating dataset...")
if not (PUBCHEM_DIR.exists() and len(tuple(PUBCHEM_DIR.glob("*.json")))):
fetch_dataset()

df = pl.concat(
map(pl.read_json, PUBCHEM_DIR.glob("*.json")),
).drop("id")
print("dataset collated!")
logger.info("dataset collated!")
return df


def fetch_dataset():
print("Fetching dataset...")
logger.info("Fetching dataset...")
kaggle.api.dataset_download_files(
"burakhmmtgl/predict-molecular-properties", quiet=False, path=DATASET_DIR
)
shutil.unpack_archive(DATASET_DIR / "predict-molecular-properties.zip", PUBCHEM_DIR)
print("Dataset fetched!")
logger.info("Dataset fetched!")


@lru_cache(maxsize=1)

@@ -172,7 +173,7 @@ def encode_orbital(orbital):


def save_dataframe_to_dataset(df, filename):
print("Saving dataset to tensors...")
logger.info("Saving dataset to tensors...")
with (filename.parent / f"{filename.stem}_x0{filename.suffix}").open("wb") as f:
pickle.dump(properties_to_tensor(df).float(), f)
with (filename.parent / f"{filename.stem}_x1{filename.suffix}").open("wb") as f:

@@ -180,7 +181,7 @@ def save_dataframe_to_dataset(df, filename):
with (filename.parent / f"{filename.stem}_y{filename.suffix}").open("wb") as f:
pickle.dump(df["En"].to_torch().float(), f)
del df
print("Tensors saved!")
logger.info("Tensors saved!")


def chunked_df(df, n):
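The print-to-logger.info swap above relies on loguru, which this commit adds as a dependency. One practical benefit is that the messages can be rerouted or duplicated without touching the call sites; a small sketch (the sink path, rotation, and format are the editor's illustrative choices, not from the repo):

# Sketch: routing the dataset-construction messages to a rotating file as well as stderr.
from loguru import logger

logger.add(
    "dataset_build.log",  # hypothetical file sink
    rotation="10 MB",  # start a new file once the log exceeds 10 MB
    level="INFO",
    format="{time:HH:mm:ss} | {level} | {message}",
)
logger.info("Constructing dataset...")  # now reaches both stderr and the file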
symbolic_nn_tests/experiment2/math.py (new file): 23 lines
@@ -0,0 +1,23 @@
import torch


def sech(x):
    return torch.reciprocal(torch.cosh(x))


def linear_fit(x, y):
    mean_x = torch.mean(x)
    mean_y = torch.mean(y)
    cov_xy = torch.mean(x * y) - (mean_x * mean_y)
    var_x = torch.mean(x * x) - (mean_x * mean_x)
    m = cov_xy / var_x
    c = mean_y - (m * mean_x)
    return m, c


def line(x, m, c):
    return (m * x) + c


def linear_residuals(x, y, m, c):
    return y - line(x, m, c)
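math.py factors the line-fitting helpers out of model.py. linear_fit is the closed-form least-squares fit, m = cov(x, y) / var(x) and c = mean(y) - m * mean(x); a quick sanity check on synthetic data (the data and tolerances are the editor's, not part of the commit):

# Sanity check: noise-free points from y = 2x + 1 should recover m ~ 2, c ~ 1
# and near-zero residuals.
import torch

from symbolic_nn_tests.experiment2.math import line, linear_fit, linear_residuals

x = torch.linspace(0.0, 10.0, steps=50)
y = 2.0 * x + 1.0
m, c = linear_fit(x, y)
assert torch.isclose(m, torch.tensor(2.0), atol=1e-3)
assert torch.isclose(c, torch.tensor(1.0), atol=1e-3)
assert torch.allclose(linear_residuals(x, y, m, c), torch.zeros_like(x), atol=1e-3)
assert torch.allclose(line(x, m, c), y, atol=1e-3)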
@@ -14,9 +14,7 @@ class Model(nn.Module):
self.encode_x0 = self.create_xval_encoding_fn(self.x0_encoder)
self.encode_x1 = self.create_xval_encoding_fn(self.x1_encoder)
self.ff = nn.Sequential(
nn.Linear(17, 256),
nn.ReLU(),
nn.Linear(256, 128),
nn.Linear(17, 128),
nn.ReLU(),
nn.Linear(128, 64),
nn.ReLU(),

@@ -59,95 +57,12 @@ def get_singleton_dataset():
)


def smooth_l1_loss(out, y):
def unpacking_mse_loss(out, y):
_, y_pred = out
return nn.functional.smooth_l1_loss(y_pred, y)
return nn.functional.mse_loss(y_pred, y)


def sech(x):
return torch.reciprocal(torch.cosh(x))


def linear_fit(x, y):
mean_x = torch.mean(x)
mean_y = torch.mean(y)
cov_xy = torch.mean(x * y) - (mean_x * mean_y)
var_x = torch.mean(x * x) - (mean_x * mean_x)
m = cov_xy / var_x
c = mean_y - (m * mean_x)
return m, c


def line(x, m, c):
return (m * x) + c


def linear_residuals(x, y, m, c):
return y - line(x, m, c)


def semantic_loss(x, y_pred, w, a):
m, c = linear_fit(x, y_pred)
residuals = linear_residuals(x, y_pred, m, c)
scaled_residuals = residuals * sech(w * x)
slope_penalty = torch.nn.functional.softmax(a * m, dim=0)
loss = torch.mean(scaled_residuals**2) + torch.mean(slope_penalty)
return loss


def loss(out, y):
x, y_pred = out
x0, x1 = x

# Here, we want to make semantic use of the differential electronegativity of the molecule
# so start by calculating that
mean_electronegativities = torch.tensor(
[i[:, 3].mean() for i in x0], dtype=torch.float32
).to(y_pred.device)
diff_electronegativity = (
torch.tensor(
[
(i[:, 3] - mean).abs().sum()
for i, mean in zip(x0, mean_electronegativities)
],
dtype=torch.float32,
)
* 4.0
).to(y_pred.device)

# Then, we need to get a linear best fit on that. Our semantic info is based on a graph of
# En (y) vs differential electronegativity on the x vs y axes, so y_pred is y here
m, c = linear_fit(diff_electronegativity, y_pred)

# To start with, we want to calculate a penalty based on deviation from a linear relationship
# Scaling is being based on 1/sech(w*r) as this increases multiplier as deviation grows.
# `w` was selected based on noting that the residual spread before eneg scaling was about 25;
# enegs were normalised as x/4, so we want to incentivize a spread of about 25/4~=6, and w=0.2
# causes the penalty function to cross 2 at just over 6. Yes, that's a bit arbitrary but we're
# just steering the model, not applying hard constraints to it, so it should be fine.
residual_penalty = (
(
linear_residuals(diff_electronegativity, y_pred, m, c)
/ sech(0.2 * diff_electronegativity)
)
.abs()
.float()
.mean()
)

# We also need to calculate a penalty that incentivizes a positive slope. For this, I'm using softmax
# to scale the slope as it will penalise negative slopes while not just creating a reward hack for
# maximizing slope. The softmax function approximates 1 from about 5 onwards, so if we multiply m by
# 500, then our penalty should be almost minimised for any slope above 0.01 and maximised below 0.01.
# This should suffice for incentivizing the model to favour positive slopes.
slope_penalty = (torch.nn.functional.softmax(-m * 500.0) + 1).mean()

# Finally, let's get a smooth L1 loss and scale it based on these penalty functions
return nn.functional.smooth_l1_loss(y_pred, y) * residual_penalty * slope_penalty


# def main(loss_func=smooth_l1_loss, logger=None, **kwargs):
def main(loss_func=loss, logger=None, **kwargs):
def main(loss_func=unpacking_mse_loss, logger=None, **kwargs):
import lightning as L

from symbolic_nn_tests.train import TrainingWrapper

@@ -160,7 +75,7 @@ def main(loss_func=loss, logger=None, **kwargs):
train, val, test = get_singleton_dataset()
lmodel = TrainingWrapper(Model(), loss_func=loss_func)
lmodel.configure_optimizers(optimizer=torch.optim.NAdam, **kwargs)
trainer = L.Trainer(max_epochs=10, logger=logger, num_sanity_val_steps=0)
trainer = L.Trainer(max_epochs=10, logger=logger)
trainer.fit(model=lmodel, train_dataloaders=train, val_dataloaders=val)
trainer.test(dataloaders=test)
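Note that unpacking_mse_loss (and the semantic losses) take the model output as an (x, y_pred) tuple rather than a bare prediction tensor. A minimal illustration with placeholder tensors (the shapes are the editor's arbitrary choices, not the repo's batch format):

# The (x, y_pred) output convention these loss functions unpack, on dummy tensors.
import torch
from torch import nn


def unpacking_mse_loss(out, y):  # as defined in the diff above
    _, y_pred = out
    return nn.functional.mse_loss(y_pred, y)


x = (torch.randn(8, 17), torch.randn(8, 4))  # stand-in for the (x0, x1) model inputs
y_pred = torch.randn(8)
y = torch.randn(8)
assert torch.isclose(unpacking_mse_loss((x, y_pred), y), nn.functional.mse_loss(y_pred, y))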
@@ -1,3 +1,5 @@
from symbolic_nn_tests.experiment2.math import linear_fit, linear_residuals, sech
from torch import nn
import torch


@@ -14,3 +16,54 @@ import torch
# without creating a reward hack for maximizing/minimizing m and preventing exploding gradients.
# It also allows us to avoid the assumption of linearity: we only care about the direction of
# proportionality.


def positive_slope_linear_loss(out, y):
x, y_pred = out
x0, x1 = x

# Here, we want to make semantic use of the differential electronegativity of the molecule
# so start by calculating that
mean_electronegativities = torch.tensor(
[i[:, 3].mean() for i in x0], dtype=torch.float32
).to(y_pred.device)
diff_electronegativity = (
torch.tensor(
[
(i[:, 3] - mean).abs().sum()
for i, mean in zip(x0, mean_electronegativities)
],
dtype=torch.float32,
)
* 4.0
).to(y_pred.device)

# Then, we need to get a linear best fit on that. Our semantic info is based on a graph of
# En (y) vs differential electronegativity on the x vs y axes, so y_pred is y here
m, c = linear_fit(diff_electronegativity, y_pred)

# To start with, we want to calculate a penalty based on deviation from a linear relationship
# Scaling is being based on 1/sech(w*r) as this increases multiplier as deviation grows.
# `w` was selected based on noting that the residual spread before eneg scaling was about 25;
# enegs were normalised as x/4, so we want to incentivize a spread of about 25/4~=6, and w=0.2
# causes the penalty function to cross 2 at just over 6. Yes, that's a bit arbitrary but we're
# just steering the model, not applying hard constraints to it, so it should be fine.
residual_penalty = (
(
linear_residuals(diff_electronegativity, y_pred, m, c)
/ sech(0.2 * diff_electronegativity)
)
.abs()
.float()
.mean()
)

# We also need to calculate a penalty that incentivizes a positive slope. For this, I'm using softmax
# to scale the slope as it will penalise negative slopes while not just creating a reward hack for
# maximizing slope. The softmax function approximates 1 from about 5 onwards, so if we multiply m by
# 500, then our penalty should be almost minimised for any slope above 0.01 and maximised below 0.01.
# This should suffice for incentivizing the model to favour positive slopes.
slope_penalty = (torch.nn.functional.softmax(-m * 500.0) + 1).mean()

# Finally, let's get an MSE loss and scale it based on these penalty functions
return nn.functional.mse_loss(y_pred, y) * residual_penalty * slope_penalty
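The residual penalty scales each residual by 1/sech(0.2 * x), so deviations at high differential electronegativity cost more, and the slope penalty targets the sign of the fitted slope m. A small numeric sketch of those two ingredients on toy values (the editor's numbers, not the repo's data):

# Toy illustration of the two ingredients of positive_slope_linear_loss.
import torch

from symbolic_nn_tests.experiment2.math import linear_fit, linear_residuals, sech

# 1/sech(0.2*x) = cosh(0.2*x) grows with x, so the same residual costs more at large x;
# it passes 2 just above x = 6.6, matching the comment about the choice of w = 0.2.
x = torch.tensor([0.0, 3.0, 6.0, 10.0])
print(1 / sech(0.2 * x))  # approximately tensor([1.0000, 1.1855, 1.8107, 3.7622])

# Predictions that trend downward against x give a negative fitted slope,
# which is exactly what the slope penalty term is meant to discourage.
y_pred = torch.tensor([5.0, 4.0, 2.5, 1.0])
m, c = linear_fit(x, y_pred)
print(m < 0, linear_residuals(x, y_pred, m, c).abs().mean())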