From b595fa0fb5a9ef399aeb0e62be3b905707589c56 Mon Sep 17 00:00:00 2001 From: Cian Hughes Date: Fri, 23 Feb 2024 14:07:45 +0000 Subject: [PATCH] Final performance optimisations for bhmie --- Project.toml | 9 ++- src/miemfp.jl | 146 ++++++++++++++++++++++++++----------------- test/Project.toml | 4 -- test/benchmarks.jl | 13 ++-- test/miemfp_tests.jl | 3 +- test/miemfp_tests.py | 4 +- 6 files changed, 105 insertions(+), 74 deletions(-) delete mode 100644 test/Project.toml diff --git a/Project.toml b/Project.toml index 3e33293..2577bd6 100644 --- a/Project.toml +++ b/Project.toml @@ -1,17 +1,20 @@ name = "nanoconc" uuid = "9a947172-b1ea-4b16-84a6-f3d50752424d" authors = ["Cian Hughes "] -version = "0.1.1" +version = "0.1.2" [deps] +AirspeedVelocity = "1c8270ee-6884-45cc-9545-60fa71ec23e4" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Debugger = "31a5f54b-26ea-5ae9-a837-f05ce5417438" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" +InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826" -Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" -XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0" +StaticVectors = "20fadf95-9e3d-483c-97cd-cab2760e7998" +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/src/miemfp.jl b/src/miemfp.jl index 873d6a3..7fb2039 100755 --- a/src/miemfp.jl +++ b/src/miemfp.jl @@ -4,21 +4,47 @@ parameters for coated/uncoated particles of a given size/material """ @fastmath module miemfp -############################################################################# -# STATUS NOTES: # -# Synchronous parallelization implemented for bare calculations, notable # -# performance increase. Need to do same for coated # -############################################################################# - using Base.Threads -using Distributed using Memoize using Interpolations +using StaticArrays const PI_5_996E3::Float64 = pi * 5.996e3 -const TWO_PI::Float64 = 2 * pi +const TWO_PI::Float64 = 2 * π +const HALF_PI::Float64 = π / 2 const BHCOAT_DEL::Float64 = 1e-8 +# Avoiding recomputing amu on every loop saves a lot of unnecessary allocations +const PRECOMPUTED_AMU = @SVector [ + 1.0, 0.5403023058681398, -0.4161468365471424, -0.9899924966004454, -0.6536436208636119, 0.28366218546322625, + 0.960170286650366, 0.7539022543433046, -0.14550003380861354, -0.9111302618846769, -0.8390715290764524, + 0.004425697988050785, 0.8438539587324921, 0.9074467814501962, 0.1367372182078336, -0.7596879128588213, + -0.9576594803233847, -0.27516333805159693, 0.6603167082440802, 0.9887046181866692, 0.40808206181339196, + -0.5477292602242684, -0.9999608263946371, -0.5328330203333975, 0.424179007336997, 0.9912028118634736, + 0.6469193223286404, -0.2921388087338362, -0.9626058663135666, -0.7480575296890004, 0.15425144988758405, + 0.9147423578045313, 0.8342233605065102, -0.013276747223059479, -0.8485702747846052, -0.9036922050915067, + -0.12796368962740468, 0.7654140519453434, 0.9550736440472949, 0.26664293235993725, -0.6669380616522619, + -0.9873392775238264, -0.39998531498835127, 0.5551133015206257, 0.9998433086476912, 0.5253219888177297, + -0.4321779448847783, -0.9923354691509287, -0.6401443394691997, 0.3005925437436371, 0.9649660284921133, + 0.7421541968137826, -0.16299078079570548, -0.9182827862121189, -0.8293098328631502, 0.022126756261955736, + 0.853220107722584, 0.8998668269691938, 0.11918013544881928, -0.7710802229758452, -0.9524129804151563, + -0.25810163593826746, 0.6735071623235862, 0.9858965815825497 +] + +"Computes values of amu where nang > 64" +@memoize function compute_amu(nang::Int64)::Vector{Float64} + cos.((HALF_PI / (nang - 1)) .* (63:nang-2)) +end + +"Returns amu values generated as efficiently as possible for a given nang value" +function get_amu(nang::Int64)::Vector{Float64} + if nang <= 64 + return @views PRECOMPUTED_AMU[1:nang] + else + return @views vcat(PRECOMPUTED_AMU, compute_amu(nang)) + end +end + "Helper function for calculating om" @memoize function _om_calc(wavel::Float64)::Float64 PI_5_996E3 / wavel @@ -119,8 +145,8 @@ function bhcoat(x::Float64, y::Float64, rfrel1::ComplexF64, rfrel2::ComplexF64 d1y2::ComplexF64 = 1.0 / (n / y2 - d0y2) - n / y2 if iflag == false - d1x1 = 1.0 / (n / x1 - d0x1) - n / x1 - d1x2 = 1.0 / (n / x2 - d0x2) - n / x2 + d1x1 = x1 / (n - (d0x1 * x1)) - n / x1 + d1x2 = x1 / (n - (d0x2 * x2)) - n / x2 chix2 = two_n_minus_1 * chi1x2 / x2 - chi0x2 chiy2 = two_n_minus_1 * chi1y2 / y2 - chi0y2 chipx2::ComplexF64 = chi1x2 - n * chix2 / x2 @@ -175,29 +201,30 @@ function bhcoat(x::Float64, y::Float64, rfrel1::ComplexF64, rfrel2::ComplexF64 return (qext, qsca, qback) end + """ Finds scattering parameters for uncoated particles. Heavily modified, but originally based on a combination of the original FORTRAN77 BHMIE algorithm and a Python implementation attributed to Herbert Kaiser hosted on ScatterLib (http://scatterlib.wikidot.com/mie) """ -function bhmie(x::Float64, refrel::ComplexF64, nang::UInt32 +function bhmie(x::Float64, refrel::ComplexF64, nang::Int64 )::Tuple{Float64,Float64,Float64,Array{ComplexF64,1},Array{ComplexF64,1}} y::ComplexF64 = x * refrel - nstop::UInt32 = UInt32(round(x + 4.0 * cbrt(x) + 2.0)) - nn::UInt32 = UInt32(round(max(nstop, abs(y)) + 14)) - amu::Vector{Float64} = cos.((1.570796327 / Float64(nang - 1)) .* Float64.(0:nang-1)) + nstop::Int64 = round(x + 4.0 * cbrt(x) + 2.0) + nn::Int64 = round(max(nstop, abs(y)) + 14) + amu::Vector = get_amu(nang) + y_inv = inv(y) d = Vector{ComplexF64}(undef, nn) d[nn] = ComplexF64(0.0, 0.0) - @simd for n in nn:-1:2 - let n_over_y = n / y - @views d[n-1] = n_over_y - (1.0 / (d[n] + n_over_y)) - end + for n in nn:-1:2 + n_over_y = y_inv * n + d[n-1] = n_over_y - inv(d[n] + n_over_y) end psi0 = cos(x) psi1 = sin(x) - qsca::Float64 = 0.0 + qsca = 0.0 # xi0 = ComplexF64(psi0, psi1) xi1 = ComplexF64(psi1, -psi0) chi1 = psi0 @@ -206,54 +233,59 @@ function bhmie(x::Float64, refrel::ComplexF64, nang::UInt32 s1_2 = zeros(ComplexF64, nang) s2_1 = zeros(ComplexF64, nang) s2_2 = zeros(ComplexF64, nang) - pi0::Vector{Float64} = zeros(nang) - pi1::Vector{Float64} = ones(nang) - tau::Vector{Float64} = similar(Vector{Float64}, nang) - @inbounds @simd for n in (1:nstop)::UnitRange{Int64} - psi, chi = let nm1x2 = 2 * n - 1 - nm1x2 * psi1 / x - psi0, - nm1x2 * chi1 / x - chi0 - end + pi0 = zeros(Float64, nang) + pi1 = ones(Float64, nang) + tau = Vector{Float64}(undef, nang) + + @inbounds for n in 1:nstop + nm1x2 = Float64(2 * n - 1) + psi = nm1x2 * psi1 / x - psi0 + chi = nm1x2 * chi1 / x - chi0 + xi = psi - (chi * im) - an::ComplexF64, bn::ComplexF64 = let @views dn = d[n] - let an_mult::ComplexF64 = dn / refrel + n / x - (an_mult * psi - psi1) / (an_mult * xi - xi1) - end, - let bn_mult::ComplexF64 = refrel * dn + n / x - (bn_mult * psi - psi1) / (bn_mult * xi - xi1) - end - end + dn = d[n] + an_mult::ComplexF64 = dn / refrel + n / x + an::ComplexF64 = (an_mult * psi - psi1) / (an_mult * xi - xi1) + bn_mult::ComplexF64 = refrel * dn + n / x + bn::ComplexF64 = (bn_mult * psi - psi1) / (bn_mult * xi - xi1) - qsca += (2 * n + 1) * ((abs(an)^2) + (abs(bn)^2)) pi_ = pi1 + np1 = n + 1 + _2np1 = 2 * n + 1 + + qsca += _2np1 * ((abs(an)^2) + (abs(bn)^2)) + + fn = _2np1 / (n * (np1)) + pi1 = (_2np1 * amu .* pi_ - (n + 1) .* pi0) / n + + tau = n .* amu .* pi_ .- np1 .* pi0 + anpi = an * pi_ + antau = an .* tau + bnpi = bn * pi_ + bntau = bn .* tau + s1_1 .+= fn .* (anpi .+ bntau) + s2_1 .+= fn .* (bnpi .+ antau) + + #* Optimisation: branchless if statement toggling between .+= and .-= for s1_2 and s2_2 + isodd_factor = Float64(isodd(n)) - Float64(!isodd(n)) + s1_2 .+= isodd_factor .* fn .* (anpi .- bntau) + s2_2 .+= isodd_factor .* fn .* (bnpi .- antau) - let np1x2 = 2 * n + 1, tau = n * amu .* pi_ - (n + 1) .* pi0 - let fn = np1x2 / (n * (n + 1)), anpi = an * pi_, antau = an * tau, bnpi = bn * pi_, bntau = bn * tau - s1_1::Vector{ComplexF64} .+= fn * (anpi + bntau) - s2_1::Vector{ComplexF64} .+= fn * (antau + bnpi) - if isodd(n) - s1_2::Vector{ComplexF64} .+= fn * (anpi - bntau) - s2_2::Vector{ComplexF64} .+= fn * (bnpi - antau) - else - s1_2::Vector{ComplexF64} .-= fn * (anpi - bntau) - s2_2::Vector{ComplexF64} .-= fn * (bnpi - antau) - end - end - pi1 = (np1x2 * amu .* pi_ - (n + 1) .* pi0) / n - end psi0, chi0 = psi1, chi1 psi1, chi1 = psi, chi xi1 = ComplexF64(psi1, -chi1) pi0 = pi_ end - @inbounds s1::Vector{ComplexF64} = vcat(s1_1, s1_2[1:-1:end-1]) - @inbounds s2::Vector{ComplexF64} = vcat(s2_1, s2_2[1:-1:end-1]) + + @inbounds s1::Vector{ComplexF64} = @views vcat(s1_1, reverse(s1_2)) + @inbounds s2::Vector{ComplexF64} = @views vcat(s2_1, reverse(s2_2)) + x_sq::Float64 = x^2 - qsca *= (2.0 / (x_sq)) - qext::Float64, qback::Float64 = let four_over_x_sq::Float64 = 4.0 / x_sq - (four_over_x_sq) * real(s1[1]), - (four_over_x_sq) * (abs(s1[2*nang-1])^2) - end + four_over_x_sq::Float64 = 4.0 / x_sq + qsca *= four_over_x_sq / 2 + qext = four_over_x_sq * real(s1[1]) + qback = four_over_x_sq * (abs(s1[2*nang-1])^2) + return (qext, qsca, qback, s1, s2) end diff --git a/test/Project.toml b/test/Project.toml deleted file mode 100644 index 755c045..0000000 --- a/test/Project.toml +++ /dev/null @@ -1,4 +0,0 @@ -[deps] -AirspeedVelocity = "1c8270ee-6884-45cc-9545-60fa71ec23e4" -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/test/benchmarks.jl b/test/benchmarks.jl index c816ef7..dd0f3d6 100644 --- a/test/benchmarks.jl +++ b/test/benchmarks.jl @@ -3,15 +3,16 @@ module Benchmarks include("../anchors.jl") include("ffi_wraps.jl") -import .Anchors.ROOT_DIR +import .Anchors.SRC_DIR import .FFIWraps: bhmie_c, bhmie_fortran, bhmie_fortran77 using BenchmarkTools +using InteractiveUtils #! DEBUG -include("$ROOT_DIR/src/miemfp.jl") +include("$SRC_DIR/miemfp.jl") function bench_vs_ffi() # Fixed testing values - nang = UInt32(2) # Example number of angles + nang = 2 # Example number of angles c_result = @benchmark bhmie_c(x, cxref, nang, cxs1, cxs2) setup=( x = rand(Float32); @@ -36,11 +37,11 @@ function bench_vs_ffi() cxs1 = rand(ComplexF32, $nang); cxs2 = rand(ComplexF32, $nang); ) - - j_result = @benchmark miemfp.bhmie(Float64(x), ComplexF64(cxref), nang) setup=( + + j_result = @benchmark miemfp.bhmie(Float64(x), ComplexF64(cxref), Int64(nang)) setup=( x = rand(Float32); cxref = rand(ComplexF32); - nang = UInt32($nang); + nang = $nang; ) println("\nC Implementation") diff --git a/test/miemfp_tests.jl b/test/miemfp_tests.jl index 8cab4f7..106973d 100644 --- a/test/miemfp_tests.jl +++ b/test/miemfp_tests.jl @@ -1,7 +1,6 @@ using Test using Random using PropCheck -using Debugger using PyCall if !@isdefined TestUtils @@ -25,7 +24,7 @@ miemfp.bhmie( nang::Int64, s1::Vector{ComplexF64}, s2::Vector{ComplexF64}, -) = miemfp.bhmie(x, cxref, UInt32(nang)) +) = miemfp.bhmie(x, cxref, nang) function miemfp.bhmie( x::Float64, diff --git a/test/miemfp_tests.py b/test/miemfp_tests.py index c401eb4..6e6b426 100644 --- a/test/miemfp_tests.py +++ b/test/miemfp_tests.py @@ -22,9 +22,9 @@ def compare_bhmie_functions( # This is to ensure that only one instance of each function is running at a time # to avoid memory issues in the FFI code await event1.wait() - f1_result = f1(x, cxref, 2, cxs1, cxs2)[:2] + f1_result = f1(x, cxref, 2, cxs1, cxs2)[:2] # Only testing at nang = 2 to avoid memory issues await event2.wait() - f2_result = f2(x, cxref, 2, cxs1, cxs2)[:2] + f2_result = f2(x, cxref, 2, cxs1, cxs2)[:2] # Only testing at nang = 2 to avoid memory issues return np.all(np.isclose(f1_result, f2_result))