Skip to content

Commit

Permalink
Fix localmem kernel
Browse files (browse the repository at this point in the history)
  • Loading branch information
efaulhaber committed Dec 23, 2024
1 parent 7d3053a commit 5b025d4
Showing 1 changed file with 6 additions and 4 deletions.
10 changes: 6 additions & 4 deletions src/nhs_grid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -414,14 +414,16 @@ end
 ndrange = max_particles_per_cell * length(nonempty_cells)

 n_gpus = length(CUDA.devices())
-ndrange_local = [div(ndrange, n_gpus) for _ in 1:n_gpus]
-ndrange_local[end] += ndrange % n_gpus
+cells_split = Iterators.partition(nonempty_cells, ceil(Int, length(nonempty_cells) / n_gpus))
+@assert length(cells_split) == n_gpus

 kernel = foreach_neighbor_localmem(backend, (max_particles_per_cell,))
-@sync for i in 1:n_gpus
+@sync for (i, nonempty_cells_) in enumerate(cells_split)
 Threads.@spawn begin
 CUDA.device!(i - 1)
-kernel(f, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, Val(max_particles_per_cell), search_radius; ndrange = ndrange_local[i])
+kernel(f, system_coords, neighbor_coords, neighborhood_search, nonempty_cells_,
+Val(max_particles_per_cell), search_radius;
+ndrange = length(nonempty_cells_) * max_particles_per_cell)
 KernelAbstractions.synchronize(backend)
 end
 end
Expand Down

0 comments on commit 5b025d4

Please sign in to comment.