Merge pull request #118 from SciML/ap/static_kernels

avik-pal · web-flow · commit ade88a24beee · 2024-01-13T11:08:44.000-05:00
Using SimpleNonlinearSolve in GPU Kernels
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -0,0 +1,16 @@
+steps:
+  - label: "Julia 1"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1"
+      - JuliaCI/julia-test#v1:
+    agents:
+      queue: "juliagpu"
+      cuda: "*"
+    timeout_in_minutes: 30
+    # Don't run Buildkite if the commit message includes the text [skip tests]
+    if: build.message !~ /\[skip tests\]/
+
+env:
+  GROUP: CUDA
+  JULIA_PKG_SERVER: ""
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "SimpleNonlinearSolve"
 uuid = "727e6d20-b764-4bd8-a329-72de5adea6c7"
 authors = ["SciML"]
-version = "1.2.0"
+version = "1.2.1"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
diff --git a/README.md b/README.md
@@ -5,6 +5,7 @@
 
 [![codecov](https://codecov.io/gh/SciML/SimpleNonlinearSolve.jl/branch/main/graph/badge.svg)](https://codecov.io/gh/SciML/SimpleNonlinearSolve.jl)
 [![Build Status](https://github.com/SciML/SimpleNonlinearSolve.jl/workflows/CI/badge.svg)](https://github.com/SciML/SimpleNonlinearSolve.jl/actions?query=workflow%3ACI)
+[![Build status](https://badge.buildkite.com/c5f7db4f1b5e8a592514378b6fc807d934546cc7d5aa79d645.svg?branch=main)](https://buildkite.com/julialang/simplenonlinearsolve-dot-jl)
 
 [![ColPrac: Contributor's Guide on Collaborative Practices for Community Packages](https://img.shields.io/badge/ColPrac-Contributor%27s%20Guide-blueviolet)](https://github.com/SciML/ColPrac)
 [![SciML Code Style](https://img.shields.io/static/v1?label=code%20style&message=SciML&color=9558b2&labelColor=389826)](https://github.com/SciML/SciMLStyle)
diff --git a/src/SimpleNonlinearSolve.jl b/src/SimpleNonlinearSolve.jl
@@ -59,6 +59,12 @@ function SciMLBase.solve(prob::IntervalNonlinearProblem, alg::Nothing,
     return solve(prob, ITP(), args...; kwargs...)
 end
 
+# By Pass the highlevel checks for NonlinearProblem for Simple Algorithms
+function SciMLBase.solve(prob::NonlinearProblem, alg::AbstractSimpleNonlinearSolveAlgorithm,
+        args...; kwargs...)
+    return SciMLBase.__solve(prob, alg, args...; kwargs...)
+end
+
 @setup_workload begin
     for T in (Float32, Float64)
         prob_no_brack_scalar = NonlinearProblem{false}((u, p) -> u .* u .- p, T(0.1), T(2))
diff --git a/src/nlsolve/dfsane.jl b/src/nlsolve/dfsane.jl
@@ -1,6 +1,6 @@
 """
     SimpleDFSane(; σ_min::Real = 1e-10, σ_max::Real = 1e10, σ_1::Real = 1.0,
-        M::Int = 10, γ::Real = 1e-4, τ_min::Real = 0.1, τ_max::Real = 0.5,
+        M::Union{Int, Val} = Val(10), γ::Real = 1e-4, τ_min::Real = 0.1, τ_max::Real = 0.5,
         nexp::Int = 2, η_strategy::Function = (f_1, k, x, F) -> f_1 ./ k^2)
 
 A low-overhead implementation of the df-sane method for solving large-scale nonlinear
@@ -42,21 +42,27 @@ see the paper [1].
 information for solving large-scale nonlinear systems of equations, Mathematics of
 Computation, 75, 1429-1448.
 """
-@kwdef @concrete struct SimpleDFSane <: AbstractSimpleNonlinearSolveAlgorithm
-    σ_min = 1e-10
-    σ_max = 1e10
-    σ_1 = 1.0
-    M::Int = 10
-    γ = 1e-4
-    τ_min = 0.1
-    τ_max = 0.5
-    nexp::Int = 2
-    η_strategy = (f_1, k, x, F) -> f_1 ./ k^2
+@concrete struct SimpleDFSane{M} <: AbstractSimpleNonlinearSolveAlgorithm
+    σ_min
+    σ_max
+    σ_1
+    γ
+    τ_min
+    τ_max
+    nexp::Int
+    η_strategy
 end
 
-function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleDFSane, args...;
+function SimpleDFSane(; σ_min::Real = 1e-10, σ_max::Real = 1e10, σ_1::Real = 1.0,
+        M::Union{Int, Val} = Val(10), γ::Real = 1e-4, τ_min::Real = 0.1, τ_max::Real = 0.5,
+        nexp::Int = 2, η_strategy::F = (f_1, k, x, F) -> f_1 ./ k^2) where {F}
+    return SimpleDFSane{SciMLBase._unwrap_val(M)}(σ_min, σ_max, σ_1, γ, τ_min, τ_max, nexp,
+        η_strategy)
+end
+
+function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleDFSane{M}, args...;
         abstol = nothing, reltol = nothing, maxiters = 1000, alias_u0 = false,
-        termination_condition = nothing, kwargs...)
+        termination_condition = nothing, kwargs...) where {M}
     x = __maybe_unaliased(prob.u0, alias_u0)
     fx = _get_fx(prob, x)
     T = eltype(x)
@@ -65,7 +71,7 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleDFSane, args...;
     σ_max = T(alg.σ_max)
     σ_k = T(alg.σ_1)
 
-    (; M, nexp, η_strategy) = alg
+    (; nexp, η_strategy) = alg
     γ = T(alg.γ)
     τ_min = T(alg.τ_min)
     τ_max = T(alg.τ_max)
@@ -77,7 +83,11 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleDFSane, args...;
     α_1 = one(T)
     f_1 = fx_norm
 
-    history_f_k = fill(fx_norm, M)
+    history_f_k = if x isa SArray
+        ones(SVector{M, T}) * fx_norm
+    else
+        fill(fx_norm, M)
+    end
 
     # Generate the cache
     @bb x_cache = similar(x)
@@ -143,7 +153,11 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleDFSane, args...;
         fx_norm = fx_norm_new
 
         # Store function value
-        history_f_k[mod1(k, M)] = fx_norm_new
+        if history_f_k isa SVector
+            history_f_k = Base.setindex(history_f_k, fx_norm_new, mod1(k, M))
+        else
+            history_f_k[mod1(k, M)] = fx_norm_new
+        end
         k += 1
     end
 
diff --git a/src/nlsolve/lbroyden.jl b/src/nlsolve/lbroyden.jl
@@ -21,7 +21,22 @@ function SimpleLimitedMemoryBroyden(; threshold::Union{Val, Int} = Val(27))
     return SimpleLimitedMemoryBroyden{SciMLBase._unwrap_val(threshold)}()
 end
 
-@views function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleLimitedMemoryBroyden,
+function SciMLBase.__solve(prob::NonlinearProblem, alg::SimpleLimitedMemoryBroyden,
+        args...; termination_condition = nothing, kwargs...)
+    if prob.u0 isa SArray
+        if termination_condition === nothing ||
+           termination_condition isa AbsNormTerminationMode
+            return __static_solve(prob, alg, args...; termination_condition, kwargs...)
+        end
+        @warn "Specifying `termination_condition = $(termination_condition)` for \
+               `SimpleLimitedMemoryBroyden` with `SArray` is not non-allocating. Use \
+               either `termination_condition = AbsNormTerminationMode()` or \
+               `termination_condition = nothing`." maxlog=1
+    end
+    return __generic_solve(prob, alg, args...; termination_condition, kwargs...)
+end
+
+@views function __generic_solve(prob::NonlinearProblem, alg::SimpleLimitedMemoryBroyden,
         args...; abstol = nothing, reltol = nothing, maxiters = 1000, alias_u0 = false,
         termination_condition = nothing, kwargs...)
     x = __maybe_unaliased(prob.u0, alias_u0)
@@ -36,7 +51,7 @@ end
 
     fx = _get_fx(prob, x)
 
-    U, Vᵀ = __init_low_rank_jacobian(x, fx, threshold)
+    U, Vᵀ = __init_low_rank_jacobian(x, fx, x isa StaticArray ? threshold : Val(η))
 
     abstol, reltol, tc_cache = init_termination_cache(abstol, reltol, fx, x,
         termination_condition)
@@ -48,7 +63,7 @@ end
     @bb δf = copy(fx)
 
     @bb vᵀ_cache = copy(x)
-    Tcache = __lbroyden_threshold_cache(x, threshold)
+    Tcache = __lbroyden_threshold_cache(x, x isa StaticArray ? threshold : Val(η))
     @bb mat_cache = copy(x)
 
     for i in 1:maxiters
@@ -83,6 +98,97 @@ end
     return build_solution(prob, alg, x, fx; retcode = ReturnCode.MaxIters)
 end
 
+# Non-allocating StaticArrays version of SimpleLimitedMemoryBroyden is actually quite
+# finicky, so we'll implement it separately from the generic version
+# Ignore termination_condition. Don't pass things into internal functions
+function __static_solve(prob::NonlinearProblem{<:SArray}, alg::SimpleLimitedMemoryBroyden,
+        args...; abstol = nothing, maxiters = 1000, kwargs...)
+    x = prob.u0
+    fx = _get_fx(prob, x)
+    threshold = __get_threshold(alg)
+
+    U, Vᵀ = __init_low_rank_jacobian(vec(x), vec(fx), threshold)
+
+    abstol = DiffEqBase._get_tolerance(abstol, eltype(x))
+
+    xo, δx, fo, δf = x, -fx, fx, fx
+
+    converged, res = __unrolled_lbroyden_initial_iterations(prob, xo, fo, δx, abstol, U, Vᵀ,
+        threshold)
+
+    converged &&
+        return build_solution(prob, alg, res.x, res.fx; retcode = ReturnCode.Success)
+
+    xo, fo, δx = res.x, res.fx, res.δx
+
+    for i in 1:(maxiters - SciMLBase._unwrap_val(threshold))
+        x = xo .+ δx
+        fx = prob.f(x, prob.p)
+        δf = fx - fo
+
+        maximum(abs, fx) ≤ abstol &&
+            return build_solution(prob, alg, x, fx; retcode = ReturnCode.Success)
+
+        vᵀ = _restructure(x, _rmatvec!!(U, Vᵀ, vec(δx)))
+        mvec = _restructure(x, _matvec!!(U, Vᵀ, vec(δf)))
+
+        d = dot(vᵀ, δf)
+        δx = @. (δx - mvec) / d
+
+        U = Base.setindex(U, vec(δx), mod1(i, SciMLBase._unwrap_val(threshold)))
+        Vᵀ = Base.setindex(Vᵀ, vec(vᵀ), mod1(i, SciMLBase._unwrap_val(threshold)))
+
+        δx = -_restructure(fx, _matvec!!(U, Vᵀ, vec(fx)))
+
+        xo = x
+        fo = fx
+    end
+
+    return build_solution(prob, alg, xo, fo; retcode = ReturnCode.MaxIters)
+end
+
+@generated function __unrolled_lbroyden_initial_iterations(prob, xo, fo, δx, abstol, U,
+        Vᵀ, ::Val{threshold}) where {threshold}
+    calls = []
+    for i in 1:threshold
+        static_idx, static_idx_p1 = Val(i - 1), Val(i)
+        push!(calls,
+            quote
+                x = xo .+ δx
+                fx = prob.f(x, prob.p)
+                δf = fx - fo
+
+                maximum(abs, fx) ≤ abstol && return true, (; x, fx, δx)
+
+                _U = __first_n_getindex(U, $(static_idx))
+                _Vᵀ = __first_n_getindex(Vᵀ, $(static_idx))
+
+                vᵀ = _restructure(x, _rmatvec!!(_U, _Vᵀ, vec(δx)))
+                mvec = _restructure(x, _matvec!!(_U, _Vᵀ, vec(δf)))
+
+                d = dot(vᵀ, δf)
+                δx = @. (δx - mvec) / d
+
+                U = Base.setindex(U, vec(δx), $(i))
+                Vᵀ = Base.setindex(Vᵀ, vec(vᵀ), $(i))
+
+                _U = __first_n_getindex(U, $(static_idx_p1))
+                _Vᵀ = __first_n_getindex(Vᵀ, $(static_idx_p1))
+                δx = -_restructure(fx, _matvec!!(_U, _Vᵀ, vec(fx)))
+
+                xo = x
+                fo = fx
+            end)
+    end
+    push!(calls, quote
+        # Termination Check
+        maximum(abs, fx) ≤ abstol && return true, (; x, fx, δx)
+
+        return false, (; x, fx, δx)
+    end)
+    return Expr(:block, calls...)
+end
+
 function _rmatvec!!(y, xᵀU, U, Vᵀ, x)
     # xᵀ × (-I + UVᵀ)
     η = size(U, 2)
@@ -98,6 +204,9 @@ function _rmatvec!!(y, xᵀU, U, Vᵀ, x)
     return y
 end
 
+@inline _rmatvec!!(::Nothing, Vᵀ, x) = -x
+@inline _rmatvec!!(U, Vᵀ, x) = __mapTdot(__mapdot(x, U), Vᵀ) .- x
+
 function _matvec!!(y, Vᵀx, U, Vᵀ, x)
     # (-I + UVᵀ) × x
     η = size(U, 2)
@@ -113,7 +222,56 @@ function _matvec!!(y, Vᵀx, U, Vᵀ, x)
     return y
 end
 
+@inline _matvec!!(::Nothing, Vᵀ, x) = -x
+@inline _matvec!!(U, Vᵀ, x) = __mapTdot(__mapdot(x, Vᵀ), U) .- x
+
+function __mapdot(x::SVector{S1}, Y::SVector{S2, <:SVector{S1}}) where {S1, S2}
+    return map(Base.Fix1(dot, x), Y)
+end
+@generated function __mapTdot(x::SVector{S1}, Y::SVector{S1, <:SVector{S2}}) where {S1, S2}
+    calls = []
+    syms = [gensym("m$(i)") for i in 1:S1]
+    for i in 1:S1
+        push!(calls, :($(syms[i]) = x[$(i)] .* Y[$i]))
+    end
+    push!(calls, :(return .+($(syms...))))
+    return Expr(:block, calls...)
+end
+
+@generated function __first_n_getindex(x::SVector{L, T}, ::Val{N}) where {L, T, N}
+    @assert N ≤ L
+    getcalls = ntuple(i -> :(x[$i]), N)
+    N == 0 && return :(return nothing)
+    return :(return SVector{$N, $T}(($(getcalls...))))
+end
+
 __lbroyden_threshold_cache(x, ::Val{threshold}) where {threshold} = similar(x, threshold)
-function __lbroyden_threshold_cache(x::SArray, ::Val{threshold}) where {threshold}
-    return SArray{Tuple{threshold}, eltype(x)}(ntuple(_ -> zero(eltype(x)), threshold))
+function __lbroyden_threshold_cache(x::StaticArray, ::Val{threshold}) where {threshold}
+    return zeros(MArray{Tuple{threshold}, eltype(x)})
+end
+__lbroyden_threshold_cache(x::SArray, ::Val{threshold}) where {threshold} = nothing
+
+function __init_low_rank_jacobian(u::StaticArray{S1, T1}, fu::StaticArray{S2, T2},
+        ::Val{threshold}) where {S1, S2, T1, T2, threshold}
+    T = promote_type(T1, T2)
+    fuSize, uSize = Size(fu), Size(u)
+    Vᵀ = MArray{Tuple{threshold, prod(uSize)}, T}(undef)
+    U = MArray{Tuple{prod(fuSize), threshold}, T}(undef)
+    return U, Vᵀ
+end
+@generated function __init_low_rank_jacobian(u::SVector{Lu, T1}, fu::SVector{Lfu, T2},
+        ::Val{threshold}) where {Lu, Lfu, T1, T2, threshold}
+    T = promote_type(T1, T2)
+    inner_inits_Vᵀ = [:(zeros(SVector{$Lu, $T})) for i in 1:threshold]
+    inner_inits_U = [:(zeros(SVector{$Lfu, $T})) for i in 1:threshold]
+    return quote
+        Vᵀ = SVector($(inner_inits_Vᵀ...))
+        U = SVector($(inner_inits_U...))
+        return U, Vᵀ
+    end
+end
+function __init_low_rank_jacobian(u, fu, ::Val{threshold}) where {threshold}
+    Vᵀ = similar(u, threshold, length(u))
+    U = similar(u, length(fu), threshold)
+    return U, Vᵀ
 end
diff --git a/src/utils.jl b/src/utils.jl
@@ -243,20 +243,6 @@ function __init_identity_jacobian!!(J::SVector{S1}) where {S1}
     return ones(SVector{S1, eltype(J)})
 end
 
-function __init_low_rank_jacobian(u::StaticArray{S1, T1}, fu::StaticArray{S2, T2},
-        ::Val{threshold}) where {S1, S2, T1, T2, threshold}
-    T = promote_type(T1, T2)
-    fuSize, uSize = Size(fu), Size(u)
-    Vᵀ = MArray{Tuple{threshold, prod(uSize)}, T}(undef)
-    U = MArray{Tuple{prod(fuSize), threshold}, T}(undef)
-    return U, Vᵀ
-end
-function __init_low_rank_jacobian(u, fu, ::Val{threshold}) where {threshold}
-    Vᵀ = similar(u, threshold, length(u))
-    U = similar(u, length(fu), threshold)
-    return U, Vᵀ
-end
-
 @inline _vec(v) = vec(v)
 @inline _vec(v::Number) = v
 @inline _vec(v::AbstractVector) = v
diff --git a/test/basictests.jl b/test/basictests.jl
@@ -164,8 +164,8 @@ end
 ## SimpleDFSane needs to allocate a history vector
 @testset "Allocation Checks: $(_nameof(alg))" for alg in (SimpleNewtonRaphson(),
     SimpleHalley(), SimpleBroyden(), SimpleKlement(), SimpleLimitedMemoryBroyden(),
-    SimpleTrustRegion())
-    @check_allocs nlsolve(prob, alg) = DiffEqBase.__solve(prob, alg; abstol = 1e-9)
+    SimpleTrustRegion(), SimpleDFSane())
+    @check_allocs nlsolve(prob, alg) = SciMLBase.solve(prob, alg; abstol = 1e-9)
 
     nlprob_scalar = NonlinearProblem{false}(quadratic_f, 1.0, 2.0)
     nlprob_sa = NonlinearProblem{false}(quadratic_f, @SVector[1.0, 1.0], 2.0)
@@ -175,18 +175,17 @@ end
         @test true
     catch e
         @error e
-        @test false
+        # History Vector Allocates
+        @test false broken=(alg isa SimpleDFSane)
     end
 
     # ForwardDiff allocates for hessian since we don't propagate the chunksize
-    # SimpleLimitedMemoryBroyden needs to do views on the low rank matrices so the sizes
-    # are dynamic. This can be fixed but no without maintaining the simplicity of the code
     try
         nlsolve(nlprob_sa, alg)
         @test true
     catch e
         @error e
-        @test false broken=(alg isa SimpleHalley || alg isa SimpleLimitedMemoryBroyden)
+        @test false broken=(alg isa SimpleHalley)
     end
 end
 
diff --git a/test/cuda.jl b/test/cuda.jl
diff --git a/test/cuda/Project.toml b/test/cuda/Project.toml
diff --git a/test/runtests.jl b/test/runtests.jl