Skip to content

Implement a CPU backend using POCL #556

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 17, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -22,12 +22,8 @@ jobs:
fail-fast: false
matrix:
version:
- '1.6'
- '1.7'
- '1.8'
- '1.9'
- '1.10'
- '~1.11.0-0'
- '1.11'
os:
- ubuntu-latest
- macOS-latest
35 changes: 18 additions & 17 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,20 +1,32 @@
name = "KernelAbstractions"
uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
authors = ["Valentin Churavy <v.churavy@gmail.com> and contributors"]
version = "0.9.34"
version = "0.10.0-dev"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4"
PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
SPIRVIntrinsics = "71d1d633-e7e8-4a92-83a1-de8814b09ba8"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd"

[weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

[extensions]
EnzymeExt = "EnzymeCore"
LinearAlgebraExt = "LinearAlgebra"
SparseArraysExt = "SparseArrays"

[compat]
Adapt = "0.4, 1.0, 2.0, 3.0, 4"
@@ -24,23 +36,12 @@ InteractiveUtils = "1.6"
LinearAlgebra = "1.6"
MacroTools = "0.5"
PrecompileTools = "1"
Requires = "1.3"
SparseArrays = "<0.0.1, 1.6"
StaticArrays = "0.12, 1.0"
UUIDs = "<0.0.1, 1.6"
julia = "1.6"

[extensions]
EnzymeExt = "EnzymeCore"
LinearAlgebraExt = "LinearAlgebra"
SparseArraysExt = "SparseArrays"
julia = "1.10"

[extras]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

[weakdeps]
EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
3 changes: 3 additions & 0 deletions cuda/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[deps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
52 changes: 24 additions & 28 deletions examples/histogram.jl
Original file line number Diff line number Diff line change
@@ -74,32 +74,28 @@ function move(backend, input)
end

@testset "histogram tests" begin
if Base.VERSION < v"1.7.0" && !KernelAbstractions.isgpu(backend)
@test_skip false
else
rand_input = [rand(1:128) for i in 1:1000]
linear_input = [i for i in 1:1024]
all_two = [2 for i in 1:512]

histogram_rand_baseline = create_histogram(rand_input)
histogram_linear_baseline = create_histogram(linear_input)
histogram_two_baseline = create_histogram(all_two)

rand_input = move(backend, rand_input)
linear_input = move(backend, linear_input)
all_two = move(backend, all_two)

rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
linear_histogram = KernelAbstractions.zeros(backend, Int, 1024)
two_histogram = KernelAbstractions.zeros(backend, Int, 2)

histogram!(rand_histogram, rand_input)
histogram!(linear_histogram, linear_input)
histogram!(two_histogram, all_two)
KernelAbstractions.synchronize(CPU())

@test isapprox(Array(rand_histogram), histogram_rand_baseline)
@test isapprox(Array(linear_histogram), histogram_linear_baseline)
@test isapprox(Array(two_histogram), histogram_two_baseline)
end
rand_input = [rand(1:128) for i in 1:1000]
linear_input = [i for i in 1:1024]
all_two = [2 for i in 1:512]

histogram_rand_baseline = create_histogram(rand_input)
histogram_linear_baseline = create_histogram(linear_input)
histogram_two_baseline = create_histogram(all_two)

rand_input = move(backend, rand_input)
linear_input = move(backend, linear_input)
all_two = move(backend, all_two)

rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
linear_histogram = KernelAbstractions.zeros(backend, Int, 1024)
two_histogram = KernelAbstractions.zeros(backend, Int, 2)

histogram!(rand_histogram, rand_input)
histogram!(linear_histogram, linear_input)
histogram!(two_histogram, all_two)
KernelAbstractions.synchronize(CPU())

@test isapprox(Array(rand_histogram), histogram_rand_baseline)
@test isapprox(Array(linear_histogram), histogram_linear_baseline)
@test isapprox(Array(two_histogram), histogram_two_baseline)
end
3 changes: 1 addition & 2 deletions examples/naive_transpose.jl
Original file line number Diff line number Diff line change
@@ -15,8 +15,7 @@ function naive_transpose!(a, b)
end
backend = get_backend(a)
@assert get_backend(b) == backend
groupsize = KernelAbstractions.isgpu(backend) ? 256 : 1024
kernel! = naive_transpose_kernel!(backend, groupsize)
kernel! = naive_transpose_kernel!(backend, 256)
kernel!(a, b, ndrange = size(a))
return
end
80 changes: 32 additions & 48 deletions src/KernelAbstractions.jl
Original file line number Diff line number Diff line change
@@ -50,7 +50,7 @@ synchronize(backend)
```
"""
macro kernel(expr)
return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=unsafe_indices=# false)
return __kernel(expr, #=force_inbounds=# false, #=unsafe_indices=# false)
end

"""
@@ -66,18 +66,20 @@ This allows for two different configurations:

!!! warning
This is an experimental feature.

!!! note
`cpu={true, false}` is deprecated for KernelAbstractions 1.0
"""
macro kernel(ex...)
if length(ex) == 1
return __kernel(ex[1], true, false, false)
return __kernel(ex[1], false, false)
else
generate_cpu = true
unsafe_indices = false
force_inbounds = false
for i in 1:(length(ex) - 1)
if ex[i] isa Expr && ex[i].head == :(=) &&
ex[i].args[1] == :cpu && ex[i].args[2] isa Bool
generate_cpu = ex[i].args[2]
#deprecated
elseif ex[i] isa Expr && ex[i].head == :(=) &&
ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
force_inbounds = ex[i].args[2]
@@ -94,7 +96,7 @@ macro kernel(ex...)
)
end
end
return __kernel(ex[end], generate_cpu, force_inbounds, unsafe_indices)
return __kernel(ex[end], force_inbounds, unsafe_indices)
end
end

@@ -190,6 +192,8 @@ After releasing the memory of an array, it should no longer be accessed.
"""
function unsafe_free! end

unsafe_free!(::AbstractArray) = return

###
# Kernel language
# - @localmem
@@ -254,6 +258,9 @@ For storage that only persists between `@synchronize` statements, an `MArray` ca
instead.

See also [`@uniform`](@ref).

!!! note
`@private` is deprecated for KernelAbstractions 1.0
"""
macro private(T, dims)
if dims isa Integer
@@ -269,6 +276,9 @@ end

Creates a private local variable `mem` for each workitem in the workgroup. It can be safely
used across [`@synchronize`](@ref) statements.

!!! note
`@private` is deprecated for KernelAbstractions 1.0
"""
macro private(expr)
return esc(expr)
@@ -279,6 +289,9 @@ end

`expr` is evaluated outside the workitem scope. This is useful for variable declarations
that span workitems, or are reused across `@synchronize` statements.

!!! note
`@uniform` is deprecated for KernelAbstractions 1.0
"""
macro uniform(value)
return esc(value)
@@ -330,6 +343,8 @@ Access the hidden context object used by KernelAbstractions.
!!! warning
Only valid to be used from a kernel with `cpu=false`.

!!! note
`@context` will be supported on all backends in KernelAbstractions 1.0
```
function f(@context, a)
I = @index(Global, Linear)
@@ -478,31 +493,11 @@ Abstract type for all GPU based KernelAbstractions backends.

!!! note
New backend implementations **must** sub-type this abstract type.
"""
abstract type GPU <: Backend end

"""
CPU(; static=false)

Instantiate a CPU (multi-threaded) backend.

## Options:
- `static`: Uses a static thread assignment, this can be beneficial for NUMA aware code.
Defaults to false.
"""
struct CPU <: Backend
static::Bool
CPU(; static::Bool = false) = new(static)
end

"""
isgpu(::Backend)::Bool

Returns true for all [`GPU`](@ref) backends.
!!! note
`GPU` will be removed in KernelAbstractions v1.0
"""
isgpu(::GPU) = true
isgpu(::CPU) = false

abstract type GPU <: Backend end

"""
get_backend(A::AbstractArray)::Backend
@@ -518,12 +513,9 @@ function get_backend end
# Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.:
get_backend(A::AbstractArray) = get_backend(parent(A))

get_backend(::Array) = CPU()

# Define:
# adapt_storage(::Backend, a::Array) = adapt(BackendArray, a)
# adapt_storage(::Backend, a::BackendArray) = a
Adapt.adapt_storage(::CPU, a::Array) = a

"""
allocate(::Backend, Type, dims...)::AbstractArray
@@ -743,7 +735,7 @@ Partition a kernel for the given ndrange and workgroupsize.
return iterspace, dynamic
end

function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: Union{CPU, GPU}, S <: _Size, NDRange <: _Size, XPUName}
function construct(backend::Backend, ::S, ::NDRange, xpu_name::XPUName) where {Backend <: GPU, S <: _Size, NDRange <: _Size, XPUName}
return Kernel{Backend, S, NDRange, XPUName}(backend, xpu_name)
end

@@ -760,6 +752,10 @@ include("compiler.jl")
function __workitems_iterspace end
function __validindex end

# for reflection
function mkcontext end
function launch_config end

include("macros.jl")

###
@@ -829,8 +825,11 @@ end
end

# CPU backend
include("pocl/pocl.jl")
using .POCL
export POCLBackend

include("cpu.jl")
const CPU = POCLBackend

# precompile
PrecompileTools.@compile_workload begin
@@ -844,19 +843,4 @@ PrecompileTools.@compile_workload begin
end
end

if !isdefined(Base, :get_extension)
using Requires
end

@static if !isdefined(Base, :get_extension)
function __init__()
@require EnzymeCore = "f151be2c-9106-41f4-ab19-57ee4f262869" include("../ext/EnzymeExt.jl")
end
end

if !isdefined(Base, :get_extension)
include("../ext/LinearAlgebraExt.jl")
include("../ext/SparseArraysExt.jl")
end

end #module
1 change: 0 additions & 1 deletion src/cpu.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
unsafe_free!(::AbstractArray) = return
synchronize(::CPU) = nothing

allocate(::CPU, ::Type{T}, dims::Tuple) where {T} = Array{T}(undef, dims)
Loading
Loading