Final performance optimisations for bhmie

This commit is contained in:
2024-02-23 14:07:45 +00:00
parent 9f5bc32711
commit b595fa0fb5
6 changed files with 105 additions and 74 deletions

View File

@@ -1,17 +1,20 @@
name = "nanoconc" name = "nanoconc"
uuid = "9a947172-b1ea-4b16-84a6-f3d50752424d" uuid = "9a947172-b1ea-4b16-84a6-f3d50752424d"
authors = ["Cian Hughes <chughes000@gmail.com>"] authors = ["Cian Hughes <chughes000@gmail.com>"]
version = "0.1.1" version = "0.1.2"
[deps] [deps]
AirspeedVelocity = "1c8270ee-6884-45cc-9545-60fa71ec23e4"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Debugger = "31a5f54b-26ea-5ae9-a837-f05ce5417438" Debugger = "31a5f54b-26ea-5ae9-a837-f05ce5417438"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f" HDF5 = "f67ccb44-e63f-5c2f-98bd-6dc0ccc4ba2f"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59" Interpolations = "a98d9a8b-a2ab-59e6-89dd-64a1c18fca59"
Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826" Memoize = "c03570c3-d221-55d1-a50c-7939bbd78826"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0" PyCall = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc" QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc"
XLSX = "fdbf4ff8-1666-58a4-91e7-1b58723a45e0" StaticVectors = "20fadf95-9e3d-483c-97cd-cab2760e7998"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

View File

@@ -4,21 +4,47 @@ parameters for coated/uncoated particles of a given size/material
""" """
@fastmath module miemfp @fastmath module miemfp
#############################################################################
# STATUS NOTES: #
# Synchronous parallelization implemented for bare calculations, notable #
# performance increase. Need to do same for coated #
#############################################################################
using Base.Threads using Base.Threads
using Distributed
using Memoize using Memoize
using Interpolations using Interpolations
using StaticArrays
const PI_5_996E3::Float64 = pi * 5.996e3 const PI_5_996E3::Float64 = pi * 5.996e3
const TWO_PI::Float64 = 2 * pi const TWO_PI::Float64 = 2 * π
const HALF_PI::Float64 = π / 2
const BHCOAT_DEL::Float64 = 1e-8 const BHCOAT_DEL::Float64 = 1e-8
# Avoiding recomputing amu on every loop saves a lot of unnecessary allocations
const PRECOMPUTED_AMU = @SVector [
1.0, 0.5403023058681398, -0.4161468365471424, -0.9899924966004454, -0.6536436208636119, 0.28366218546322625,
0.960170286650366, 0.7539022543433046, -0.14550003380861354, -0.9111302618846769, -0.8390715290764524,
0.004425697988050785, 0.8438539587324921, 0.9074467814501962, 0.1367372182078336, -0.7596879128588213,
-0.9576594803233847, -0.27516333805159693, 0.6603167082440802, 0.9887046181866692, 0.40808206181339196,
-0.5477292602242684, -0.9999608263946371, -0.5328330203333975, 0.424179007336997, 0.9912028118634736,
0.6469193223286404, -0.2921388087338362, -0.9626058663135666, -0.7480575296890004, 0.15425144988758405,
0.9147423578045313, 0.8342233605065102, -0.013276747223059479, -0.8485702747846052, -0.9036922050915067,
-0.12796368962740468, 0.7654140519453434, 0.9550736440472949, 0.26664293235993725, -0.6669380616522619,
-0.9873392775238264, -0.39998531498835127, 0.5551133015206257, 0.9998433086476912, 0.5253219888177297,
-0.4321779448847783, -0.9923354691509287, -0.6401443394691997, 0.3005925437436371, 0.9649660284921133,
0.7421541968137826, -0.16299078079570548, -0.9182827862121189, -0.8293098328631502, 0.022126756261955736,
0.853220107722584, 0.8998668269691938, 0.11918013544881928, -0.7710802229758452, -0.9524129804151563,
-0.25810163593826746, 0.6735071623235862, 0.9858965815825497
]
"Computes values of amu where nang > 64"
@memoize function compute_amu(nang::Int64)::Vector{Float64}
cos.((HALF_PI / (nang - 1)) .* (63:nang-2))
end
"Returns amu values generated as efficiently as possible for a given nang value"
function get_amu(nang::Int64)::Vector{Float64}
if nang <= 64
return @views PRECOMPUTED_AMU[1:nang]
else
return @views vcat(PRECOMPUTED_AMU, compute_amu(nang))
end
end
"Helper function for calculating om" "Helper function for calculating om"
@memoize function _om_calc(wavel::Float64)::Float64 @memoize function _om_calc(wavel::Float64)::Float64
PI_5_996E3 / wavel PI_5_996E3 / wavel
@@ -119,8 +145,8 @@ function bhcoat(x::Float64, y::Float64, rfrel1::ComplexF64, rfrel2::ComplexF64
d1y2::ComplexF64 = 1.0 / (n / y2 - d0y2) - n / y2 d1y2::ComplexF64 = 1.0 / (n / y2 - d0y2) - n / y2
if iflag == false if iflag == false
d1x1 = 1.0 / (n / x1 - d0x1) - n / x1 d1x1 = x1 / (n - (d0x1 * x1)) - n / x1
d1x2 = 1.0 / (n / x2 - d0x2) - n / x2 d1x2 = x1 / (n - (d0x2 * x2)) - n / x2
chix2 = two_n_minus_1 * chi1x2 / x2 - chi0x2 chix2 = two_n_minus_1 * chi1x2 / x2 - chi0x2
chiy2 = two_n_minus_1 * chi1y2 / y2 - chi0y2 chiy2 = two_n_minus_1 * chi1y2 / y2 - chi0y2
chipx2::ComplexF64 = chi1x2 - n * chix2 / x2 chipx2::ComplexF64 = chi1x2 - n * chix2 / x2
@@ -175,29 +201,30 @@ function bhcoat(x::Float64, y::Float64, rfrel1::ComplexF64, rfrel2::ComplexF64
return (qext, qsca, qback) return (qext, qsca, qback)
end end
""" """
Finds scattering parameters for uncoated particles. Heavily modified, but originally Finds scattering parameters for uncoated particles. Heavily modified, but originally
based on a combination of the original FORTRAN77 BHMIE algorithm and a Python based on a combination of the original FORTRAN77 BHMIE algorithm and a Python
implementation attributed to Herbert Kaiser hosted on ScatterLib (http://scatterlib.wikidot.com/mie) implementation attributed to Herbert Kaiser hosted on ScatterLib (http://scatterlib.wikidot.com/mie)
""" """
function bhmie(x::Float64, refrel::ComplexF64, nang::UInt32 function bhmie(x::Float64, refrel::ComplexF64, nang::Int64
)::Tuple{Float64,Float64,Float64,Array{ComplexF64,1},Array{ComplexF64,1}} )::Tuple{Float64,Float64,Float64,Array{ComplexF64,1},Array{ComplexF64,1}}
y::ComplexF64 = x * refrel y::ComplexF64 = x * refrel
nstop::UInt32 = UInt32(round(x + 4.0 * cbrt(x) + 2.0)) nstop::Int64 = round(x + 4.0 * cbrt(x) + 2.0)
nn::UInt32 = UInt32(round(max(nstop, abs(y)) + 14)) nn::Int64 = round(max(nstop, abs(y)) + 14)
amu::Vector{Float64} = cos.((1.570796327 / Float64(nang - 1)) .* Float64.(0:nang-1)) amu::Vector = get_amu(nang)
y_inv = inv(y)
d = Vector{ComplexF64}(undef, nn) d = Vector{ComplexF64}(undef, nn)
d[nn] = ComplexF64(0.0, 0.0) d[nn] = ComplexF64(0.0, 0.0)
@simd for n in nn:-1:2 for n in nn:-1:2
let n_over_y = n / y n_over_y = y_inv * n
@views d[n-1] = n_over_y - (1.0 / (d[n] + n_over_y)) d[n-1] = n_over_y - inv(d[n] + n_over_y)
end
end end
psi0 = cos(x) psi0 = cos(x)
psi1 = sin(x) psi1 = sin(x)
qsca::Float64 = 0.0 qsca = 0.0
# xi0 = ComplexF64(psi0, psi1) # xi0 = ComplexF64(psi0, psi1)
xi1 = ComplexF64(psi1, -psi0) xi1 = ComplexF64(psi1, -psi0)
chi1 = psi0 chi1 = psi0
@@ -206,54 +233,59 @@ function bhmie(x::Float64, refrel::ComplexF64, nang::UInt32
s1_2 = zeros(ComplexF64, nang) s1_2 = zeros(ComplexF64, nang)
s2_1 = zeros(ComplexF64, nang) s2_1 = zeros(ComplexF64, nang)
s2_2 = zeros(ComplexF64, nang) s2_2 = zeros(ComplexF64, nang)
pi0::Vector{Float64} = zeros(nang) pi0 = zeros(Float64, nang)
pi1::Vector{Float64} = ones(nang) pi1 = ones(Float64, nang)
tau::Vector{Float64} = similar(Vector{Float64}, nang) tau = Vector{Float64}(undef, nang)
@inbounds @simd for n in (1:nstop)::UnitRange{Int64}
psi, chi = let nm1x2 = 2 * n - 1 @inbounds for n in 1:nstop
nm1x2 * psi1 / x - psi0, nm1x2 = Float64(2 * n - 1)
nm1x2 * chi1 / x - chi0 psi = nm1x2 * psi1 / x - psi0
end chi = nm1x2 * chi1 / x - chi0
xi = psi - (chi * im) xi = psi - (chi * im)
an::ComplexF64, bn::ComplexF64 = let @views dn = d[n] dn = d[n]
let an_mult::ComplexF64 = dn / refrel + n / x an_mult::ComplexF64 = dn / refrel + n / x
(an_mult * psi - psi1) / (an_mult * xi - xi1) an::ComplexF64 = (an_mult * psi - psi1) / (an_mult * xi - xi1)
end, bn_mult::ComplexF64 = refrel * dn + n / x
let bn_mult::ComplexF64 = refrel * dn + n / x bn::ComplexF64 = (bn_mult * psi - psi1) / (bn_mult * xi - xi1)
(bn_mult * psi - psi1) / (bn_mult * xi - xi1)
end
end
qsca += (2 * n + 1) * ((abs(an)^2) + (abs(bn)^2))
pi_ = pi1 pi_ = pi1
np1 = n + 1
_2np1 = 2 * n + 1
qsca += _2np1 * ((abs(an)^2) + (abs(bn)^2))
fn = _2np1 / (n * (np1))
pi1 = (_2np1 * amu .* pi_ - (n + 1) .* pi0) / n
tau = n .* amu .* pi_ .- np1 .* pi0
anpi = an * pi_
antau = an .* tau
bnpi = bn * pi_
bntau = bn .* tau
s1_1 .+= fn .* (anpi .+ bntau)
s2_1 .+= fn .* (bnpi .+ antau)
#* Optimisation: branchless if statement toggling between .+= and .-= for s1_2 and s2_2
isodd_factor = Float64(isodd(n)) - Float64(!isodd(n))
s1_2 .+= isodd_factor .* fn .* (anpi .- bntau)
s2_2 .+= isodd_factor .* fn .* (bnpi .- antau)
let np1x2 = 2 * n + 1, tau = n * amu .* pi_ - (n + 1) .* pi0
let fn = np1x2 / (n * (n + 1)), anpi = an * pi_, antau = an * tau, bnpi = bn * pi_, bntau = bn * tau
s1_1::Vector{ComplexF64} .+= fn * (anpi + bntau)
s2_1::Vector{ComplexF64} .+= fn * (antau + bnpi)
if isodd(n)
s1_2::Vector{ComplexF64} .+= fn * (anpi - bntau)
s2_2::Vector{ComplexF64} .+= fn * (bnpi - antau)
else
s1_2::Vector{ComplexF64} .-= fn * (anpi - bntau)
s2_2::Vector{ComplexF64} .-= fn * (bnpi - antau)
end
end
pi1 = (np1x2 * amu .* pi_ - (n + 1) .* pi0) / n
end
psi0, chi0 = psi1, chi1 psi0, chi0 = psi1, chi1
psi1, chi1 = psi, chi psi1, chi1 = psi, chi
xi1 = ComplexF64(psi1, -chi1) xi1 = ComplexF64(psi1, -chi1)
pi0 = pi_ pi0 = pi_
end end
@inbounds s1::Vector{ComplexF64} = vcat(s1_1, s1_2[1:-1:end-1])
@inbounds s2::Vector{ComplexF64} = vcat(s2_1, s2_2[1:-1:end-1]) @inbounds s1::Vector{ComplexF64} = @views vcat(s1_1, reverse(s1_2))
@inbounds s2::Vector{ComplexF64} = @views vcat(s2_1, reverse(s2_2))
x_sq::Float64 = x^2 x_sq::Float64 = x^2
qsca *= (2.0 / (x_sq)) four_over_x_sq::Float64 = 4.0 / x_sq
qext::Float64, qback::Float64 = let four_over_x_sq::Float64 = 4.0 / x_sq qsca *= four_over_x_sq / 2
(four_over_x_sq) * real(s1[1]), qext = four_over_x_sq * real(s1[1])
(four_over_x_sq) * (abs(s1[2*nang-1])^2) qback = four_over_x_sq * (abs(s1[2*nang-1])^2)
end
return (qext, qsca, qback, s1, s2) return (qext, qsca, qback, s1, s2)
end end

View File

@@ -1,4 +0,0 @@
[deps]
AirspeedVelocity = "1c8270ee-6884-45cc-9545-60fa71ec23e4"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

View File

@@ -3,15 +3,16 @@ module Benchmarks
include("../anchors.jl") include("../anchors.jl")
include("ffi_wraps.jl") include("ffi_wraps.jl")
import .Anchors.ROOT_DIR import .Anchors.SRC_DIR
import .FFIWraps: bhmie_c, bhmie_fortran, bhmie_fortran77 import .FFIWraps: bhmie_c, bhmie_fortran, bhmie_fortran77
using BenchmarkTools using BenchmarkTools
using InteractiveUtils #! DEBUG
include("$ROOT_DIR/src/miemfp.jl") include("$SRC_DIR/miemfp.jl")
function bench_vs_ffi() function bench_vs_ffi()
# Fixed testing values # Fixed testing values
nang = UInt32(2) # Example number of angles nang = 2 # Example number of angles
c_result = @benchmark bhmie_c(x, cxref, nang, cxs1, cxs2) setup=( c_result = @benchmark bhmie_c(x, cxref, nang, cxs1, cxs2) setup=(
x = rand(Float32); x = rand(Float32);
@@ -36,11 +37,11 @@ function bench_vs_ffi()
cxs1 = rand(ComplexF32, $nang); cxs1 = rand(ComplexF32, $nang);
cxs2 = rand(ComplexF32, $nang); cxs2 = rand(ComplexF32, $nang);
) )
j_result = @benchmark miemfp.bhmie(Float64(x), ComplexF64(cxref), nang) setup=( j_result = @benchmark miemfp.bhmie(Float64(x), ComplexF64(cxref), Int64(nang)) setup=(
x = rand(Float32); x = rand(Float32);
cxref = rand(ComplexF32); cxref = rand(ComplexF32);
nang = UInt32($nang); nang = $nang;
) )
println("\nC Implementation") println("\nC Implementation")

View File

@@ -1,7 +1,6 @@
using Test using Test
using Random using Random
using PropCheck using PropCheck
using Debugger
using PyCall using PyCall
if !@isdefined TestUtils if !@isdefined TestUtils
@@ -25,7 +24,7 @@ miemfp.bhmie(
nang::Int64, nang::Int64,
s1::Vector{ComplexF64}, s1::Vector{ComplexF64},
s2::Vector{ComplexF64}, s2::Vector{ComplexF64},
) = miemfp.bhmie(x, cxref, UInt32(nang)) ) = miemfp.bhmie(x, cxref, nang)
function miemfp.bhmie( function miemfp.bhmie(
x::Float64, x::Float64,

View File

@@ -22,9 +22,9 @@ def compare_bhmie_functions(
# This is to ensure that only one instance of each function is running at a time # This is to ensure that only one instance of each function is running at a time
# to avoid memory issues in the FFI code # to avoid memory issues in the FFI code
await event1.wait() await event1.wait()
f1_result = f1(x, cxref, 2, cxs1, cxs2)[:2] f1_result = f1(x, cxref, 2, cxs1, cxs2)[:2] # Only testing at nang = 2 to avoid memory issues
await event2.wait() await event2.wait()
f2_result = f2(x, cxref, 2, cxs1, cxs2)[:2] f2_result = f2(x, cxref, 2, cxs1, cxs2)[:2] # Only testing at nang = 2 to avoid memory issues
return np.all(np.isclose(f1_result, f2_result)) return np.all(np.isclose(f1_result, f2_result))