JuliaGPU
diff --git a/‎.github/workflows/runic.yml
Lines changed: 25 additions & 0 deletions b/‎.github/workflows/runic.yml
Lines changed: 25 additions & 0 deletions
diff --git a/‎benchmark/benchmarks.jl
Lines changed: 8 additions & 6 deletions b/‎benchmark/benchmarks.jl
Lines changed: 8 additions & 6 deletions
diff --git a/‎docs/make.jl
Lines changed: 12 additions & 12 deletions b/‎docs/make.jl
Lines changed: 12 additions & 12 deletions
diff --git a/‎examples/histogram.jl
Lines changed: 9 additions & 9 deletions b/‎examples/histogram.jl
Lines changed: 9 additions & 9 deletions
diff --git a/‎examples/matmul.jl
Lines changed: 6 additions & 6 deletions b/‎examples/matmul.jl
Lines changed: 6 additions & 6 deletions
diff --git a/‎examples/memcopy.jl
Lines changed: 1 addition & 1 deletion b/‎examples/memcopy.jl
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/memcopy_static.jl
Lines changed: 1 addition & 1 deletion b/‎examples/memcopy_static.jl
Lines changed: 1 addition & 1 deletion
diff --git a/‎examples/mpi.jl
Lines changed: 6 additions & 4 deletions b/‎examples/mpi.jl
Lines changed: 6 additions & 4 deletions
diff --git a/‎examples/naive_transpose.jl
Lines changed: 2 additions & 3 deletions b/‎examples/naive_transpose.jl
Lines changed: 2 additions & 3 deletions
diff --git a/‎examples/numa_aware.jl
Lines changed: 8 additions & 6 deletions b/‎examples/numa_aware.jl
Lines changed: 8 additions & 6 deletions
@@ -0,0 +1,25 @@
+name: Runic formatting
+on:
+  push:
+    branches:
+      - 'master'
+      - 'release-*' # glob pattern: a bare 'release-' would only match a branch named exactly "release-"
+    tags:
+      - '*'
+  pull_request:
+jobs:
+  runic:
+    name: Runic
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: julia-actions/setup-julia@v2
+        with:
+          version: "nightly" # Only nightly builds have the -m flag currently
+      - uses: julia-actions/cache@v2
+      - name: Install Runic
+        run: |
+          julia --color=yes --project=@runic -e 'using Pkg; Pkg.add(url = "https://github.com/fredrikekre/Runic.jl")'
+      - name: Run Runic # runs over every git-tracked *.jl file
+        run: |
+          git ls-files -z -- '*.jl' | xargs -0 julia --project=@runic -m Runic --check --diff
@@ -36,11 +36,12 @@ let static = BenchmarkGroup()
         for N in (64, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536, 262144, 1048576)
             dtype[N] = @benchmarkable begin
                 kernel = saxpy_kernel!($BACKEND, 1024)
-                kernel(Z, convert($T, 2.0), X, Y, ndrange=size(Z))
-            end setup=(
+                kernel(Z, convert($T, 2.0), X, Y, ndrange = size(Z))
+            end setup = (
                 X = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
                 Y = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
-                Z = KernelAbstractions.zeros($BACKEND, $T, $N))
+                Z = KernelAbstractions.zeros($BACKEND, $T, $N)
+            )
         end
         static["$T"] = dtype
     end
@@ -53,11 +54,12 @@ let default = BenchmarkGroup()
         for N in (64, 256, 512, 1024, 2048, 4096, 16384, 32768, 65536, 262144, 1048576)
             dtype[N] = @benchmarkable begin
                 kernel = saxpy_kernel!($BACKEND)
-                kernel(Z, convert($T, 2.0), X, Y, ndrange=size(Z))
-            end setup=(
+                kernel(Z, convert($T, 2.0), X, Y, ndrange = size(Z))
+            end setup = (
                 X = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
                 Y = rand!(KernelAbstractions.zeros($BACKEND, $T, $N));
-                Z = KernelAbstractions.zeros($BACKEND, $T, $N))
+                Z = KernelAbstractions.zeros($BACKEND, $T, $N)
+            )
         end
         default["$T"] = dtype
     end
 
@@ -7,17 +7,17 @@ function main()
     ci = get(ENV, "CI", "") == "true"
 
     makedocs(;
-        modules=[KernelAbstractions],
-        authors="JuliaGPU and contributors",
-        repo="https://github.com/JuliaGPU/KernelAbstractions.jl/blob/{commit}{path}#L{line}",
-        sitename="KernelAbstractions.jl",
-        format=Documenter.HTML(;
-            prettyurls=ci,
-            canonical="https://juliagpu.github.io/KernelAbstractions.jl",
-            assets=String[],
+        modules = [KernelAbstractions],
+        authors = "JuliaGPU and contributors",
+        repo = "https://github.com/JuliaGPU/KernelAbstractions.jl/blob/{commit}{path}#L{line}",
+        sitename = "KernelAbstractions.jl",
+        format = Documenter.HTML(;
+            prettyurls = ci,
+            canonical = "https://juliagpu.github.io/KernelAbstractions.jl",
+            assets = String[],
         ),
-        warnonly=[:missing_docs],
-        pages=[
+        warnonly = [:missing_docs],
+        pages = [
             "Home" => "index.md",
             "Quickstart" => "quickstart.md",
             "Writing kernels" => "kernels.md",
@@ -34,13 +34,13 @@ function main()
             "Extras" => [
                 "extras/unrolling.md",
             ], # Extras
-            "Notes for implementations" => "implementations.md"
+            "Notes for implementations" => "implementations.md",
         ], # pages
     )
 
     if ci
         deploydocs(;
-            repo="github.com/JuliaGPU/KernelAbstractions.jl",
+            repo = "github.com/JuliaGPU/KernelAbstractions.jl",
             push_preview = true,
         )
     end
 
@@ -30,28 +30,28 @@ end
     # possible to get a value of 312, then we will have 2 separate shmem blocks,
     # one from 1->256, and another from 256->512
     @uniform max_element = 1
-    for min_element = 1:gs:N
+    for min_element in 1:gs:N
 
         # Setting shared_histogram to 0
         @inbounds shared_histogram[lid] = 0
         @synchronize()
 
         max_element = min_element + gs
         if max_element > N
-            max_element = N+1
+            max_element = N + 1
         end
 
         # Defining bin on shared memory and writing to it if possible
         bin = input[tid]
         if bin >= min_element && bin < max_element
-            bin -= min_element-1
+            bin -= min_element - 1
             @atomic shared_histogram[bin] += 1
         end
 
         @synchronize()
 
-        if ((lid+min_element-1) <= N)
-            @atomic histogram_output[lid+min_element-1] += shared_histogram[lid]
+        if ((lid + min_element - 1) <= N)
+            @atomic histogram_output[lid + min_element - 1] += shared_histogram[lid]
         end
 
     end
@@ -62,7 +62,7 @@ function histogram!(histogram_output, input)
     backend = get_backend(histogram_output)
     # Need static block size
     kernel! = histogram_kernel!(backend, (256,))
-    kernel!(histogram_output, input, ndrange=size(input))
+    kernel!(histogram_output, input, ndrange = size(input))
 end
 
 function move(backend, input)
@@ -75,9 +75,9 @@ end
     if Base.VERSION < v"1.7.0" && !KernelAbstractions.isgpu(backend)
         @test_skip false
     else
-        rand_input = [rand(1:128) for i = 1:1000]
-        linear_input = [i for i = 1:1024]
-        all_two = [2 for i = 1:512]
+        rand_input = [rand(1:128) for i in 1:1000]
+        linear_input = [i for i in 1:1024]
+        all_two = [2 for i in 1:512]
 
         histogram_rand_baseline = create_histogram(rand_input)
         histogram_linear_baseline = create_histogram(linear_input)
 
@@ -7,11 +7,11 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #
 
     # creating a temporary sum variable for matrix multiplication
     tmp_sum = zero(eltype(output))
-    for k = 1:size(a)[2]
-        tmp_sum += a[i,k] * b[k, j]
+    for k in 1:size(a)[2]
+        tmp_sum += a[i, k] * b[k, j]
     end
 
-    output[i,j] = tmp_sum
+    output[i, j] = tmp_sum
 end
 
 # Creating a wrapper kernel for launching with error checks
@@ -22,14 +22,14 @@ function matmul!(output, a, b)
     end
     backend = KernelAbstractions.get_backend(a)
     kernel! = matmul_kernel!(backend)
-    kernel!(output, a, b, ndrange=size(output)) 
+    kernel!(output, a, b, ndrange = size(output))
 end
 
 a = rand!(allocate(backend, Float32, 256, 123))
 b = rand!(allocate(backend, Float32, 123, 45))
 output = KernelAbstractions.zeros(backend, Float32, 256, 45)
 
-matmul!(output, a,b)
+matmul!(output, a, b)
 KernelAbstractions.synchronize(backend)
 
-@test isapprox(output, a*b)
+@test isapprox(output, a * b)
@@ -12,7 +12,7 @@ function mycopy!(A, B)
     @assert get_backend(B) == backend
 
     kernel = copy_kernel!(backend)
-    kernel(A, B, ndrange=length(A))
+    kernel(A, B, ndrange = length(A))
 end
 
 A = KernelAbstractions.zeros(backend, Float64, 128, 128)
 
@@ -12,7 +12,7 @@ function mycopy_static!(A, B)
     @assert get_backend(B) == backend
 
     kernel = copy_kernel!(backend, 32, size(A)) # if size(A) varies this will cause recompilation
-    kernel(A, B, ndrange=size(A))
+    kernel(A, B, ndrange = size(A))
 end
 
 A = KernelAbstractions.zeros(backend, Float64, 128, 128)
 
@@ -43,8 +43,8 @@ function main(backend)
     comm = MPI.COMM_WORLD
     MPI.Barrier(comm)
 
-    dst_rank = mod(MPI.Comm_rank(comm)+1, MPI.Comm_size(comm))
-    src_rank = mod(MPI.Comm_rank(comm)-1, MPI.Comm_size(comm))
+    dst_rank = mod(MPI.Comm_rank(comm) + 1, MPI.Comm_size(comm))
+    src_rank = mod(MPI.Comm_rank(comm) - 1, MPI.Comm_size(comm))
 
     T = Int64
     M = 10
@@ -59,8 +59,10 @@ function main(backend)
 
     KernelAbstractions.synchronize(backend)
 
-    recv_task, send_task = exchange!(h_send_buf, d_recv_buf, h_recv_buf,
-                                       src_rank, dst_rank, comm)
+    recv_task, send_task = exchange!(
+        h_send_buf, d_recv_buf, h_recv_buf,
+        src_rank, dst_rank, comm,
+    )
 
     cooperative_wait(recv_task)
     cooperative_wait(send_task)
 
@@ -17,7 +17,7 @@ function naive_transpose!(a, b)
     @assert get_backend(b) == backend
     groupsize = KernelAbstractions.isgpu(backend) ? 256 : 1024
     kernel! = naive_transpose_kernel!(backend, groupsize)
-    kernel!(a, b, ndrange=size(a))
+    kernel!(a, b, ndrange = size(a))
 end
 
 # resolution of grid will be res*res
@@ -27,7 +27,6 @@ res = 1024
 b = rand!(allocate(backend, Float32, res, res))
 a = KernelAbstractions.zeros(backend, Float32, res, res)
 
-naive_transpose!(a,b)
+naive_transpose!(a, b)
 KernelAbstractions.synchronize(backend)
 @test a == transpose(b)
-
@@ -18,8 +18,10 @@ end
 Estimate the memory bandwidth (GB/s) by performing a time measurement of a
 SAXPY kernel. Returns the memory bandwidth (GB/s) and the compute (GFLOP/s).
 """
-function measure_membw(backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32,
-                       init = :parallel)
+function measure_membw(
+        backend = CPU(); verbose = true, N = 1024 * 500_000, dtype = Float32,
+        init = :parallel,
+    )
     bytes = 3 * sizeof(dtype) * N # num bytes transferred in SAXPY
     flops = 2 * N # num flops in SAXPY
 
@@ -37,10 +39,10 @@ function measure_membw(backend = CPU(); verbose = true, N = 1024 * 500_000, dtyp
         kernel = saxpy_kernel($backend, $workgroup_size, $(size(Y)))
         kernel($a, $X, $Y, ndrange = $(size(Y)))
         KernelAbstractions.synchronize($backend)
-    end evals=2 samples=10
+    end evals = 2 samples = 10
 
-    mem_rate = bytes * 1e-9 / t # GB/s
-    flop_rate = flops * 1e-9 / t # GFLOP/s
+    mem_rate = bytes * 1.0e-9 / t # GB/s
+    flop_rate = flops * 1.0e-9 / t # GFLOP/s
 
     if verbose
         println("\tMemory Bandwidth (GB/s): ", round(mem_rate; digits = 2))
@@ -51,7 +53,7 @@ end
 
 # Static should be much better (on a system with multiple NUMA domains)
 measure_membw(CPU());
-measure_membw(CPU(; static=true));
+measure_membw(CPU(; static = true));
 
 # The following has significantly worse performance (even on systems with a single memory domain)!
 # measure_membw(CPU(); init=:serial);