Merge #1077

1077: Limit thread block size in Courant number kernels r=mwarusz a=mwarusz

# Description

This limits the number of threads in a thread block for the two kernels used in the Courant number calculation. This allows the use of high-order polynomials on the GPU, at least without hyperdiffusion. This was missed in #1005.

CC @vchuravy

I have

- [ ] Written and run all necessary tests with CLIMA by including `tests/runtests.jl`
- [ ] Followed all necessary [style guidelines](https://CliMA.github.io/CLIMA/latest/CodingConventions.html) and run `julia .dev/format.jl`
- [ ] Updated the documentation to reflect changes from this PR.

Co-authored-by: Maciej Waruszewski <mwarusz@igf.fuw.edu.pl>
CliMA · May 11, 2020 · 47a0a67 · 47a0a67
2 parents 0196742 + b07b979
commit 47a0a67
Show file tree

Hide file tree

Showing 3 changed files with 20 additions and 10 deletions.
diff --git a/src/Numerics/DGmethods/DGmodel.jl b/src/Numerics/DGmethods/DGmodel.jl
@@ -796,17 +796,20 @@ function courant(
         device = grid.vgeo isa Array ? CPU() : CUDA()
         pointwise_courant = similar(grid.vgeo, Nq^dim, nrealelem)
         event = Event(device)
-        event = Grids.kernel_min_neighbor_distance!(device, (Nq, Nq, Nqk))(
+        event = Grids.kernel_min_neighbor_distance!(
+            device,
+            min(Nq * Nq * Nqk, 1024),
+        )(
             Val(N),
             Val(dim),
             direction,
             pointwise_courant,
             grid.vgeo,
             topology.realelems;
-            ndrange = (nrealelem * Nq, Nq, Nqk),
+            ndrange = (nrealelem * Nq * Nq * Nqk),
             dependencies = (event,),
         )
-        event = kernel_local_courant!(device, Nq * Nq * Nqk)(
+        event = kernel_local_courant!(device, min(Nq * Nq * Nqk, 1024))(
             m,
             Val(dim),
             Val(N),

diff --git a/src/Numerics/DGmethods/DGmodel_kernels.jl b/src/Numerics/DGmethods/DGmodel_kernels.jl
@@ -2594,8 +2594,10 @@ end
             MArray{Tuple{num_state_gradient_flux}, FT}(undef)
     end
 
-    e = @index(Group, Linear)
-    n = @index(Local, Linear)
+    I = @index(Global, Linear)
+    e = (I - 1) ÷ Np + 1
+    n = (I - 1) % Np + 1
+
     @inbounds begin
         @unroll for s in 1:num_state_conservative
             local_state_conservative[s] = state_conservative[n, s, e]

diff --git a/src/Numerics/Mesh/Grids.jl b/src/Numerics/Mesh/Grids.jl
@@ -307,14 +307,14 @@ function min_node_distance(
         device = grid.vgeo isa Array ? CPU() : CUDA()
         min_neighbor_distance = similar(grid.vgeo, Nq^dim, nrealelem)
         event = Event(device)
-        event = kernel_min_neighbor_distance!(device, (Nq, Nq, Nqk))(
+        event = kernel_min_neighbor_distance!(device, min(Nq * Nq * Nqk, 1024))(
             Val(N),
             Val(dim),
             direction,
             min_neighbor_distance,
             grid.vgeo,
             topology.realelems;
-            ndrange = (Nq, Nq, Nqk, nrealelem),
+            ndrange = (Nq * Nq * Nqk * nrealelem),
             dependencies = (event,),
         )
         wait(device, event)
@@ -613,6 +613,7 @@ neighbors.
         FT = eltype(min_neighbor_distance)
         Nq = N + 1
         Nqk = dim == 2 ? 1 : Nq
+        Np = Nq * Nq * Nqk
 
         if direction isa EveryDirection
             mininξ = (true, true, true)
@@ -623,12 +624,16 @@ neighbors.
         end
     end
 
-    e = @index(Group, Linear)
-    i, j, k = @index(Local, NTuple)
+    I = @index(Global, Linear)
+    e = (I - 1) ÷ Np + 1
+    ijk = (I - 1) % Np + 1
+
+    i = (ijk - 1) % Nq + 1
+    j = (ijk - 1) ÷ Nq % Nq + 1
+    k = (ijk - 1) ÷ Nq^2 % Nqk + 1
 
     md = typemax(FT)
 
-    ijk = i + Nq * (j - 1) + Nq * Nq * (k - 1)
     x = SVector(vgeo[ijk, _x1, e], vgeo[ijk, _x2, e], vgeo[ijk, _x3, e])
 
     if mininξ[1]