Skip to content

Remove dynamic dispatch in Hessian evaluation #2740

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Apr 30, 2025
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 52 additions & 32 deletions src/Nonlinear/ReverseAD/forward_over_reverse.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,15 @@

const TAG = :ReverseAD

"""
const MAX_CHUNK::Int = 10

An upper bound on the chunk sie for forward-over-reverse. Increasing this could
improve performance at the cost of extra memory allocation. It has been 10 for a
long time, and nobody seems to have complained.
"""
const MAX_CHUNK = 10

"""
_eval_hessian(
d::NLPEvaluator,
Expand All @@ -23,46 +32,30 @@ Returns the number of non-zeros in the computed Hessian, which will be used to
update the offset for the next call.
"""
function _eval_hessian(
d::NLPEvaluator,
f::_FunctionStorage,
H::AbstractVector{Float64},
λ::Float64,
offset::Int,
)::Int
chunk = min(size(f.seed_matrix, 2), d.max_chunk)
# As a performance optimization, skip dynamic dispatch if the chunk is 1.
if chunk == 1
return _eval_hessian_inner(d, f, H, λ, offset, Val(1))
else
return _eval_hessian_inner(d, f, H, λ, offset, Val(chunk))
end
end

function _eval_hessian_inner(
d::NLPEvaluator,
ex::_FunctionStorage,
H::AbstractVector{Float64},
scale::Float64,
nzcount::Int,
::Val{CHUNK},
) where {CHUNK}
)::Int
if ex.linearity == LINEAR
@assert length(ex.hess_I) == 0
return 0
end
chunk = min(size(ex.seed_matrix, 2), d.max_chunk)
Coloring.prepare_seed_matrix!(ex.seed_matrix, ex.rinfo)
# Compute hessian-vector products
num_products = size(ex.seed_matrix, 2) # number of hessian-vector products
num_chunks = div(num_products, CHUNK)
num_chunks = div(num_products, chunk)
@assert size(ex.seed_matrix, 1) == length(ex.rinfo.local_indices)
for offset in 1:CHUNK:(CHUNK*num_chunks)
_eval_hessian_chunk(d, ex, offset, CHUNK, Val(CHUNK))
for offset in 1:chunk:(chunk*num_chunks)
_eval_hessian_chunk(d, ex, offset, chunk, chunk)
end
# leftover chunk
remaining = num_products - CHUNK * num_chunks
remaining = num_products - chunk * num_chunks
if remaining > 0
offset = CHUNK * num_chunks + 1
_eval_hessian_chunk(d, ex, offset, remaining, Val(CHUNK))
offset = chunk * num_chunks + 1
_eval_hessian_chunk(d, ex, offset, remaining, chunk)
end
want, got = nzcount + length(ex.hess_I), length(H)
if want > got
Expand Down Expand Up @@ -90,32 +83,59 @@ function _eval_hessian_chunk(
ex::_FunctionStorage,
offset::Int,
chunk::Int,
::Val{CHUNK},
) where {CHUNK}
chunk_size::Int,
)
for r in eachindex(ex.rinfo.local_indices)
# set up directional derivatives
@inbounds idx = ex.rinfo.local_indices[r]
# load up ex.seed_matrix[r,k,k+1,...,k+remaining-1] into input_ϵ
for s in 1:chunk
# If `chunk < CHUNK`, leaves junk in the unused components
d.input_ϵ[(idx-1)*CHUNK+s] = ex.seed_matrix[r, offset+s-1]
# If `chunk < chunk_size`, leaves junk in the unused components
d.input_ϵ[(idx-1)*chunk_size+s] = ex.seed_matrix[r, offset+s-1]
end
end
_hessian_slice_inner(d, ex, Val(CHUNK))
_hessian_slice_inner(d, ex, chunk_size)
fill!(d.input_ϵ, 0.0)
# collect directional derivatives
for r in eachindex(ex.rinfo.local_indices)
@inbounds idx = ex.rinfo.local_indices[r]
# load output_ϵ into ex.seed_matrix[r,k,k+1,...,k+remaining-1]
for s in 1:chunk
ex.seed_matrix[r, offset+s-1] = d.output_ϵ[(idx-1)*CHUNK+s]
ex.seed_matrix[r, offset+s-1] = d.output_ϵ[(idx-1)*chunk_size+s]
end
end
return
end

function _hessian_slice_inner(d, ex, ::Val{CHUNK}) where {CHUNK}
T = ForwardDiff.Partials{CHUNK,Float64} # This is our element type.
# A trampoline from a runtime `chunk::Int` to a compile-time
# `ForwardDiff.Partials{N,Float64}` type parameter. The ten branches are
# written out explicitly so that every call site passes a constant type: a
# single `Partials{chunk,Float64}` (or `Val(chunk)`) would construct the type
# at runtime and force a dynamically dispatched call.
function _hessian_slice_inner(d, ex, chunk::Int)
    # Internal invariant: callers clamp the chunk with
    # `min(size(ex.seed_matrix, 2), d.max_chunk)`, and `d.max_chunk` is capped
    # at `MAX_CHUNK` in `MOI.initialize`, so `chunk` is always in
    # `1:MAX_CHUNK`.
    @assert 1 <= chunk <= MAX_CHUNK
    # The branch ladder below hard-codes chunks 1 through 10. If `MAX_CHUNK`
    # is ever changed, the ladder must be updated to match; this assertion
    # makes any drift fail loudly instead of silently mis-sizing the `else`
    # branch.
    @assert MAX_CHUNK == 10
    if chunk == 1
        _hessian_slice_inner(d, ex, ForwardDiff.Partials{1,Float64})
    elseif chunk == 2
        _hessian_slice_inner(d, ex, ForwardDiff.Partials{2,Float64})
    elseif chunk == 3
        _hessian_slice_inner(d, ex, ForwardDiff.Partials{3,Float64})
    elseif chunk == 4
        _hessian_slice_inner(d, ex, ForwardDiff.Partials{4,Float64})
    elseif chunk == 5
        _hessian_slice_inner(d, ex, ForwardDiff.Partials{5,Float64})
    elseif chunk == 6
        _hessian_slice_inner(d, ex, ForwardDiff.Partials{6,Float64})
    elseif chunk == 7
        _hessian_slice_inner(d, ex, ForwardDiff.Partials{7,Float64})
    elseif chunk == 8
        _hessian_slice_inner(d, ex, ForwardDiff.Partials{8,Float64})
    elseif chunk == 9
        _hessian_slice_inner(d, ex, ForwardDiff.Partials{9,Float64})
    else
        # `chunk == 10` is guaranteed by the assertions above.
        _hessian_slice_inner(d, ex, ForwardDiff.Partials{10,Float64})
    end
    return
end

function _hessian_slice_inner(d, ex, ::Type{T}) where {T}
fill!(d.output_ϵ, 0.0)
output_ϵ = _reinterpret_unsafe(T, d.output_ϵ)
subexpr_forward_values_ϵ =
Expand Down
3 changes: 1 addition & 2 deletions src/Nonlinear/ReverseAD/mathoptinterface_api.jl
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,7 @@ function MOI.initialize(d::NLPEvaluator, requested_features::Vector{Symbol})
max_expr_length = max(max_expr_length, length(d.constraints[end].nodes))
max_chunk = max(max_chunk, size(d.constraints[end].seed_matrix, 2))
end
# 10 is hardcoded upper bound to avoid excess memory allocation
max_chunk = min(max_chunk, 10)
max_chunk = min(max_chunk, MAX_CHUNK)
max_expr_with_sub_length = max(max_expr_with_sub_length, max_expr_length)
if d.want_hess || want_hess_storage
d.input_ϵ = zeros(max_chunk * N)
Expand Down
12 changes: 2 additions & 10 deletions test/Nonlinear/ReverseAD.jl
Original file line number Diff line number Diff line change
Expand Up @@ -150,21 +150,13 @@ function test_objective_quadratic_multivariate_subexpressions()
MOI.eval_hessian_objective(evaluator, H, val)
@test H == [2.0, 2.0, 1.0]
@test evaluator.backend.max_chunk == 2
# The call of `_eval_hessian_inner` from `_eval_hessian` needs dynamic dispatch for `Val(chunk)` so it allocates.
# We call directly `_eval_hessian_inner` to check that the rest does not allocates.
@test 0 == @allocated MOI.Nonlinear.ReverseAD._eval_hessian_inner(
evaluator.backend,
evaluator.backend.objective,
H,
1.0,
0,
Val(2),
)
@test 0 == @allocated MOI.eval_hessian_objective(evaluator, H, val)
@test MOI.hessian_lagrangian_structure(evaluator) ==
[(1, 1), (2, 2), (2, 1)]
H = [NaN, NaN, NaN]
μ = Float64[]
MOI.eval_hessian_lagrangian(evaluator, H, val, 1.5, μ)
@test 0 == @allocated MOI.eval_hessian_lagrangian(evaluator, H, val, 1.5, μ)
@test H == 1.5 .* [2.0, 2.0, 1.0]
v = [0.3, 0.4]
hv = [NaN, NaN]
Expand Down
Loading