From b595fa0fb5a9ef399aeb0e62be3b905707589c56 Mon Sep 17 00:00:00 2001
From: Cian Hughes <chughes000@gmail.com>
Date: Fri, 23 Feb 2024 14:07:45 +0000
Subject: [PATCH] Final performance optimisations for bhmie

---
 Project.toml         |   9 ++-
 src/miemfp.jl        | 146 ++++++++++++++++++++++++++-----------------
 test/Project.toml    |   4 --
 test/benchmarks.jl   |  13 ++--
 test/miemfp_tests.jl |   3 +-
 test/miemfp_tests.py |   4 +-
 6 files changed, 105 insertions(+), 74 deletions(-)
 delete mode 100644 test/Project.toml

diff --git a/Project.toml b/Project.toml
index 3e33293..2577bd6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,17 +1,20 @@
 name = "nanoconc"
 uuid = "9a947172-b1ea-4b16-84a6-f3d50752424d"
 authors = ["Cian Hughes <chughes000@gmail.com>"]
-version = "0.1.1"
+version = "0.1.2"
 
 [deps]
+AirspeedVelocity = "1c8270ee-6884-45cc-9545-60fa71ec23e4"
+BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 Debugger = "31a5f54b-26ea-5ae9-a837-f05ce5417438"
 Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
 HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
+InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
 Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826"
-Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
 QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc"
-XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0"
+StaticVectors = "20fadf95-9e3d-483c-97cd-cab2760e7998"
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/src/miemfp.jl b/src/miemfp.jl
index 873d6a3..7fb2039 100755
--- a/src/miemfp.jl
+++ b/src/miemfp.jl
@@ -4,21 +4,47 @@ parameters for coated/uncoated particles of a given size/material
 """
 @fastmath module miemfp
 
-#############################################################################
-# STATUS NOTES:                                                             #
-# Synchronous parallelization implemented for bare calculations, notable    #
-# performance increase. Need to do same for coated                          #
-#############################################################################
-
 using Base.Threads
-using Distributed
 using Memoize
 using Interpolations
+using StaticArrays
 
 const PI_5_996E3::Float64 = pi * 5.996e3
-const TWO_PI::Float64 = 2 * pi
+const TWO_PI::Float64 = 2 * π
+const HALF_PI::Float64 = π / 2
 const BHCOAT_DEL::Float64 = 1e-8
 
+# Avoiding recomputing amu on every loop saves a lot of unnecessary allocations
+const PRECOMPUTED_AMU = @SVector [
+    1.0, 0.5403023058681398, -0.4161468365471424, -0.9899924966004454, -0.6536436208636119, 0.28366218546322625,
+    0.960170286650366, 0.7539022543433046, -0.14550003380861354, -0.9111302618846769, -0.8390715290764524,
+    0.004425697988050785, 0.8438539587324921, 0.9074467814501962, 0.1367372182078336, -0.7596879128588213,
+    -0.9576594803233847, -0.27516333805159693, 0.6603167082440802, 0.9887046181866692, 0.40808206181339196,
+    -0.5477292602242684, -0.9999608263946371, -0.5328330203333975, 0.424179007336997, 0.9912028118634736,
+    0.6469193223286404, -0.2921388087338362, -0.9626058663135666, -0.7480575296890004, 0.15425144988758405,
+    0.9147423578045313, 0.8342233605065102, -0.013276747223059479, -0.8485702747846052, -0.9036922050915067,
+    -0.12796368962740468, 0.7654140519453434, 0.9550736440472949, 0.26664293235993725, -0.6669380616522619,
+    -0.9873392775238264, -0.39998531498835127, 0.5551133015206257, 0.9998433086476912, 0.5253219888177297,
+    -0.4321779448847783, -0.9923354691509287, -0.6401443394691997, 0.3005925437436371, 0.9649660284921133,
+    0.7421541968137826, -0.16299078079570548, -0.9182827862121189, -0.8293098328631502, 0.022126756261955736,
+    0.853220107722584, 0.8998668269691938, 0.11918013544881928, -0.7710802229758452, -0.9524129804151563,
+    -0.25810163593826746, 0.6735071623235862, 0.9858965815825497
+]
+
+"Computes values of amu where nang > 64"
+@memoize function compute_amu(nang::Int64)::Vector{Float64}
+    cos.((HALF_PI / (nang - 1)) .* (63:nang-2))
+end
+
+"Returns amu values generated as efficiently as possible for a given nang value"
+function get_amu(nang::Int64)::Vector{Float64}
+    if nang <= 64
+        return @views PRECOMPUTED_AMU[1:nang]
+    else
+        return @views vcat(PRECOMPUTED_AMU, compute_amu(nang))
+    end
+end
+
 "Helper function for calculating om"
 @memoize function _om_calc(wavel::Float64)::Float64
     PI_5_996E3 / wavel
@@ -119,8 +145,8 @@ function bhcoat(x::Float64, y::Float64, rfrel1::ComplexF64, rfrel2::ComplexF64
         d1y2::ComplexF64 = 1.0 / (n / y2 - d0y2) - n / y2
 
         if iflag == false
-            d1x1 = 1.0 / (n / x1 - d0x1) - n / x1
-            d1x2 = 1.0 / (n / x2 - d0x2) - n / x2
+            d1x1 = x1 / (n - (d0x1 * x1)) - n / x1
+            d1x2 = x1 / (n - (d0x2 * x2)) - n / x2
             chix2 = two_n_minus_1 * chi1x2 / x2 - chi0x2
             chiy2 = two_n_minus_1 * chi1y2 / y2 - chi0y2
             chipx2::ComplexF64 = chi1x2 - n * chix2 / x2
@@ -175,29 +201,30 @@ function bhcoat(x::Float64, y::Float64, rfrel1::ComplexF64, rfrel2::ComplexF64
     return (qext, qsca, qback)
 end
 
+
 """
 Finds scattering parameters for uncoated particles. Heavily modified, but originally
 based on a combination of the original FORTRAN77 BHMIE algorithm and a Python
 implementation attributed to Herbert Kaiser hosted on ScatterLib (http://scatterlib.wikidot.com/mie)
 """
-function bhmie(x::Float64, refrel::ComplexF64, nang::UInt32
+function bhmie(x::Float64, refrel::ComplexF64, nang::Int64
 )::Tuple{Float64,Float64,Float64,Array{ComplexF64,1},Array{ComplexF64,1}}
     y::ComplexF64 = x * refrel
-    nstop::UInt32 = UInt32(round(x + 4.0 * cbrt(x) + 2.0))
-    nn::UInt32 = UInt32(round(max(nstop, abs(y)) + 14))
-    amu::Vector{Float64} = cos.((1.570796327 / Float64(nang - 1)) .* Float64.(0:nang-1))
+    nstop::Int64 = round(x + 4.0 * cbrt(x) + 2.0)
+    nn::Int64 = round(max(nstop, abs(y)) + 14)
+    amu::Vector = get_amu(nang)
 
+    y_inv = inv(y)
     d = Vector{ComplexF64}(undef, nn)
     d[nn] = ComplexF64(0.0, 0.0)
-    @simd for n in nn:-1:2
-        let n_over_y = n / y
-            @views d[n-1] = n_over_y - (1.0 / (d[n] + n_over_y))
-        end
+    for n in nn:-1:2
+        n_over_y = y_inv * n
+        d[n-1] = n_over_y - inv(d[n] + n_over_y)
     end
 
     psi0 = cos(x)
     psi1 = sin(x)
-    qsca::Float64 = 0.0
+    qsca = 0.0
     # xi0 = ComplexF64(psi0, psi1)
     xi1 = ComplexF64(psi1, -psi0)
     chi1 = psi0
@@ -206,54 +233,59 @@ function bhmie(x::Float64, refrel::ComplexF64, nang::UInt32
     s1_2 = zeros(ComplexF64, nang)
     s2_1 = zeros(ComplexF64, nang)
     s2_2 = zeros(ComplexF64, nang)
-    pi0::Vector{Float64} = zeros(nang)
-    pi1::Vector{Float64} = ones(nang)
-    tau::Vector{Float64} = similar(Vector{Float64}, nang)
-    @inbounds @simd for n in (1:nstop)::UnitRange{Int64}
-        psi, chi = let nm1x2 = 2 * n - 1
-            nm1x2 * psi1 / x - psi0,
-            nm1x2 * chi1 / x - chi0
-        end
+    pi0 = zeros(Float64, nang)
+    pi1 = ones(Float64, nang)
+    tau = Vector{Float64}(undef, nang)
+
+    @inbounds for n in 1:nstop
+        nm1x2 = Float64(2 * n - 1)
+        psi = nm1x2 * psi1 / x - psi0
+        chi = nm1x2 * chi1 / x - chi0
+
         xi = psi - (chi * im)
-        an::ComplexF64, bn::ComplexF64 = let @views dn = d[n]
-            let an_mult::ComplexF64 = dn / refrel + n / x
-                (an_mult * psi - psi1) / (an_mult * xi - xi1)
-            end,
-            let bn_mult::ComplexF64 = refrel * dn + n / x
-                (bn_mult * psi - psi1) / (bn_mult * xi - xi1)
-            end
-        end
+        dn = d[n]
+        an_mult::ComplexF64 = dn / refrel + n / x
+        an::ComplexF64 = (an_mult * psi - psi1) / (an_mult * xi - xi1)
+        bn_mult::ComplexF64 = refrel * dn + n / x
+        bn::ComplexF64 = (bn_mult * psi - psi1) / (bn_mult * xi - xi1)
 
-        qsca += (2 * n + 1) * ((abs(an)^2) + (abs(bn)^2))
         pi_ = pi1
+        np1 = n + 1
+        _2np1 = 2 * n + 1
+
+        qsca += _2np1 * ((abs(an)^2) + (abs(bn)^2))
+
+        fn = _2np1 / (n * (np1))
+        pi1 = (_2np1 * amu .* pi_ - (n + 1) .* pi0) / n
+        
+        tau = n .* amu .* pi_ .- np1 .* pi0
+        anpi = an * pi_
+        antau = an .* tau
+        bnpi = bn * pi_
+        bntau = bn .* tau
+        s1_1 .+= fn .* (anpi .+ bntau)
+        s2_1 .+= fn .* (bnpi .+ antau)
+        
+        #* Optimisation: branchless if statement toggling between .+= and .-= for s1_2 and s2_2
+        isodd_factor = Float64(isodd(n)) - Float64(!isodd(n))
+        s1_2 .+= isodd_factor .* fn .* (anpi .- bntau)
+        s2_2 .+= isodd_factor .* fn .* (bnpi .- antau)
 
-        let np1x2 = 2 * n + 1, tau = n * amu .* pi_ - (n + 1) .* pi0
-            let fn = np1x2 / (n * (n + 1)), anpi = an * pi_, antau = an * tau, bnpi = bn * pi_, bntau = bn * tau
-                s1_1::Vector{ComplexF64} .+= fn * (anpi + bntau)
-                s2_1::Vector{ComplexF64} .+= fn * (antau + bnpi)
-                if isodd(n)
-                    s1_2::Vector{ComplexF64} .+= fn * (anpi - bntau)
-                    s2_2::Vector{ComplexF64} .+= fn * (bnpi - antau)
-                else
-                    s1_2::Vector{ComplexF64} .-= fn * (anpi - bntau)
-                    s2_2::Vector{ComplexF64} .-= fn * (bnpi - antau)
-                end
-            end
-            pi1 = (np1x2 * amu .* pi_ - (n + 1) .* pi0) / n
-        end
         psi0, chi0 = psi1, chi1
         psi1, chi1 = psi, chi
         xi1 = ComplexF64(psi1, -chi1)
         pi0 = pi_
     end
-    @inbounds s1::Vector{ComplexF64} = vcat(s1_1, s1_2[1:-1:end-1])
-    @inbounds s2::Vector{ComplexF64} = vcat(s2_1, s2_2[1:-1:end-1])
+
+    @inbounds s1::Vector{ComplexF64} = @views vcat(s1_1, reverse(s1_2))
+    @inbounds s2::Vector{ComplexF64} = @views vcat(s2_1, reverse(s2_2))
+
     x_sq::Float64 = x^2
-    qsca *= (2.0 / (x_sq))
-    qext::Float64, qback::Float64 = let four_over_x_sq::Float64 = 4.0 / x_sq
-        (four_over_x_sq) * real(s1[1]),
-        (four_over_x_sq) * (abs(s1[2*nang-1])^2)
-    end
+    four_over_x_sq::Float64 = 4.0 / x_sq
+    qsca *= four_over_x_sq / 2
+    qext = four_over_x_sq * real(s1[1])
+    qback = four_over_x_sq * (abs(s1[2*nang-1])^2)
+
     return (qext, qsca, qback, s1, s2)
 end
 
diff --git a/test/Project.toml b/test/Project.toml
deleted file mode 100644
index 755c045..0000000
--- a/test/Project.toml
+++ /dev/null
@@ -1,4 +0,0 @@
-[deps]
-AirspeedVelocity = "1c8270ee-6884-45cc-9545-60fa71ec23e4"
-BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
diff --git a/test/benchmarks.jl b/test/benchmarks.jl
index c816ef7..dd0f3d6 100644
--- a/test/benchmarks.jl
+++ b/test/benchmarks.jl
@@ -3,15 +3,16 @@ module Benchmarks
 include("../anchors.jl")
 include("ffi_wraps.jl")
 
-import .Anchors.ROOT_DIR
+import .Anchors.SRC_DIR
 import .FFIWraps: bhmie_c, bhmie_fortran, bhmie_fortran77
 using BenchmarkTools
+using InteractiveUtils #! DEBUG
 
-include("$ROOT_DIR/src/miemfp.jl")
+include("$SRC_DIR/miemfp.jl")
 
 function bench_vs_ffi()
     # Fixed testing values
-    nang = UInt32(2)  # Example number of angles
+    nang = 2  # Example number of angles
 
     c_result = @benchmark bhmie_c(x, cxref, nang, cxs1, cxs2) setup=(
         x = rand(Float32);
@@ -36,11 +37,11 @@ function bench_vs_ffi()
         cxs1 = rand(ComplexF32, $nang);
         cxs2 = rand(ComplexF32, $nang);
     )
-
-    j_result = @benchmark miemfp.bhmie(Float64(x), ComplexF64(cxref), nang) setup=(
+    
+    j_result = @benchmark miemfp.bhmie(Float64(x), ComplexF64(cxref), Int64(nang)) setup=(
         x = rand(Float32);
         cxref = rand(ComplexF32);
-        nang = UInt32($nang);
+        nang = $nang;
     )
 
     println("\nC Implementation")
diff --git a/test/miemfp_tests.jl b/test/miemfp_tests.jl
index 8cab4f7..106973d 100644
--- a/test/miemfp_tests.jl
+++ b/test/miemfp_tests.jl
@@ -1,7 +1,6 @@
 using Test
 using Random
 using PropCheck
-using Debugger
 using PyCall
 
 if !@isdefined TestUtils
@@ -25,7 +24,7 @@ miemfp.bhmie(
     nang::Int64,
     s1::Vector{ComplexF64},
     s2::Vector{ComplexF64},
-) = miemfp.bhmie(x, cxref, UInt32(nang))
+) = miemfp.bhmie(x, cxref, nang)
 
 function miemfp.bhmie(
     x::Float64,
diff --git a/test/miemfp_tests.py b/test/miemfp_tests.py
index c401eb4..6e6b426 100644
--- a/test/miemfp_tests.py
+++ b/test/miemfp_tests.py
@@ -22,9 +22,9 @@ def compare_bhmie_functions(
         # This is to ensure that only one instance of each function is running at a time
         # to avoid memory issues in the FFI code
         await event1.wait()
-        f1_result = f1(x, cxref, 2, cxs1, cxs2)[:2]
+        f1_result = f1(x, cxref, 2, cxs1, cxs2)[:2]  # Only testing at nang = 2 to avoid memory issues
         await event2.wait()
-        f2_result = f2(x, cxref, 2, cxs1, cxs2)[:2]
+        f2_result = f2(x, cxref, 2, cxs1, cxs2)[:2]  # Only testing at nang = 2 to avoid memory issues
 
         return np.all(np.isclose(f1_result, f2_result))