Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Proof of Concept] GPU kernel using block-local shared memory #73

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/PointNeighbors.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ using Adapt: Adapt
using Atomix: Atomix
using Base: @propagate_inbounds
using GPUArraysCore: AbstractGPUArray
using KernelAbstractions: KernelAbstractions, @kernel, @index
using KernelAbstractions: KernelAbstractions, @kernel, @index, @localmem, @synchronize
using LinearAlgebra: dot
using Polyester: Polyester
@reexport using StaticArrays: SVector
Expand Down
281 changes: 281 additions & 0 deletions src/nhs_grid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,287 @@ end
end
end

# for cell in cells
# for point in cell
# for neighbor_cell in neighbor_cells

# for neighbor in neighbor_cell
@inline function foreach_point_neighbor_localmem(f, system_coords, neighbor_coords,
neighborhood_search; search_radius = search_radius(neighborhood_search))
backend = KernelAbstractions.get_backend(system_coords)
max_particles_per_cell = 64
nhs_size = size(neighborhood_search.cell_list.linear_indices)
# cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
linear_indices = neighborhood_search.cell_list.linear_indices
cartesian_indices = CartesianIndices(size(linear_indices))
lengths = Array(neighborhood_search.cell_list.cells.lengths)
# max_particles_per_cell = maximum(lengths)
nonempty_cells = Adapt.adapt(backend, filter(index -> lengths[linear_indices[index]] > 0, cartesian_indices))
ndrange = max_particles_per_cell * length(nonempty_cells)
kernel = foreach_neighbor_localmem(backend, (max_particles_per_cell,))
kernel(f, system_coords, neighbor_coords, neighborhood_search, nonempty_cells, Val(max_particles_per_cell), search_radius; ndrange)

KernelAbstractions.synchronize(backend)

return nothing
end

@inline function copy_to_localmem!(local_points, local_neighbor_coords,
neighbor_cell, neighbor_system_coords,
neighborhood_search, particleidx)
points_view = points_in_cell(neighbor_cell, neighborhood_search)
n_particles_in_neighbor_cell = length(points_view)

# First use all threads to load the neighbors into local memory in parallel
if particleidx <= n_particles_in_neighbor_cell
@inbounds p = local_points[particleidx] = points_view[particleidx]
for d in 1:ndims(neighborhood_search)
@inbounds local_neighbor_coords[d, particleidx] = neighbor_system_coords[d, p]
efaulhaber marked this conversation as resolved.
Show resolved Hide resolved
end
end
return n_particles_in_neighbor_cell
end

# @parallel(block) for cell in cells
# for neighbor_cell in neighboring_cells
# @parallel(thread) for neighbor in neighbor_cell
# copy_coordinates_to_localmem(neighbor)
#
# # Make sure all threads finished the copying
# @synchronize
#
# @parallel(thread) for particle in cell
# for neighbor in neighbor_cell
# # This uses the neighbor coordinates from the local memory
# compute(point, neighbor)
#
# # Make sure all threads finished computing before we continue with copying
# @synchronize
@kernel cpu=false function foreach_neighbor_localmem(f::F, system_coords, neighbor_system_coords,
neighborhood_search, cells, ::Val{MAX}, search_radius) where {F, MAX}
cell_ = @index(Group)
cell = @inbounds Tuple(cells[cell_])
particleidx = @index(Local)
@assert 1 <= particleidx <= MAX

# Coordinate buffer in local memory
local_points = @localmem Int32 MAX
local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)

points = points_in_cell(cell, neighborhood_search)
n_particles_in_current_cell = length(points)

# Extract point coordinates if a point lies on this thread
if particleidx <= n_particles_in_current_cell
point = @inbounds points[particleidx]
point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
point)
else
point = zero(Int32)
point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
end

for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
neighbor_cell = Tuple(neighbor_cell_)

n_particles_in_neighbor_cell = copy_to_localmem!(local_points, local_neighbor_coords,
neighbor_cell, neighbor_system_coords,
neighborhood_search, particleidx)

# Make sure all threads finished the copying
@synchronize

# Now each thread works on one point again
if particleidx <= n_particles_in_current_cell
for local_neighbor in 1:n_particles_in_neighbor_cell
@inbounds neighbor = local_points[local_neighbor]
@inbounds neighbor_coords = extract_svector(local_neighbor_coords,
Val(ndims(neighborhood_search)),
local_neighbor)

pos_diff = point_coords - neighbor_coords
distance2 = dot(pos_diff, pos_diff)

# TODO periodic

if distance2 <= search_radius^2
distance = sqrt(distance2) # TODO: eventuell fastmath

# Inline to avoid loss of performance
# compared to not using `foreach_point_neighbor`.
@inline f(point, neighbor, pos_diff, distance)
end
end
end

# Make sure all threads finished computing before we continue with copying
@synchronize()
end
end

# @parallel(block) for cell in cells
# @parallel(thread) for neighbor in first_neighbor_cell
# copy_coordinates_to_localmem!(local_coords, neighbor)
#
# for neighbor_cell in neighboring_cells
# @parallel(thread) for neighbor in neighbor_cell + 1
# copy_coordinates_to_localmem!(next_local_coords, neighbor)
#
# # No synchronize needed. The following loop works on `local_coords`.
#
# @parallel(thread) for particle in cell
# for neighbor in neighbor_cell
# # This uses the neighbor coordinates from the local memory
# compute(point, neighbor)
#
# # Make sure all threads finished computing before we switch variables
# @synchronize
# local_coords, next_local_coords = next_local_coords, local_coords
@kernel cpu=false function foreach_neighbor_double_buffer(f::F, system_coords, neighbor_system_coords,
neighborhood_search, cells, ::Val{MAX}, search_radius) where {F, MAX}
cell_ = @index(Group)
cell = @inbounds Tuple(cells[cell_])
particleidx = @index(Local)
@assert 1 <= particleidx <= MAX

# Coordinate buffer in local memory
local_points = @localmem Int32 MAX
local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)

# Next coordinate buffer in local memory
next_local_points = @localmem Int32 MAX
next_local_neighbor_coords = @localmem eltype(system_coords) (ndims(neighborhood_search), MAX)

points = points_in_cell(cell, neighborhood_search)
n_particles_in_current_cell = length(points)

# Extract point coordinates if a point lies on this thread
if particleidx <= n_particles_in_current_cell
point = @inbounds points[particleidx]
point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
point)
else
point = zero(Int32)
point_coords = zero(SVector{ndims(neighborhood_search), eltype(system_coords)})
end

neighborhood = neighboring_cells(cell, neighborhood_search)
# (neighbor_cell, state) = iterate(neighborhood)
neighbor_cell = Tuple(first(neighborhood))

n_particles_in_neighbor_cell = copy_to_localmem!(local_points, local_neighbor_coords,
neighbor_cell, neighbor_system_coords,
neighborhood_search, particleidx)

@synchronize()

for neighbor_ in 1:length(neighborhood)
neighbor_cell = @inbounds Tuple(neighborhood[neighbor_])

# while true
# next = iterate(neighborhood, state)
# if next !== nothing

if neighbor_ < length(neighborhood)
next_neighbor_cell = @inbounds Tuple(neighborhood[neighbor_ + 1])
# (next_neighbor_cell, state) = next
next_n_particles_in_neighbor_cell = copy_to_localmem!(next_local_points, next_local_neighbor_coords,
next_neighbor_cell, neighbor_system_coords,
neighborhood_search, particleidx)
end

# Now each thread works on one point again
if particleidx <= n_particles_in_current_cell
for local_neighbor in 1:n_particles_in_neighbor_cell
@inbounds neighbor = local_points[local_neighbor]
@inbounds neighbor_coords = extract_svector(local_neighbor_coords,
Val(ndims(neighborhood_search)),
local_neighbor)

pos_diff = point_coords - neighbor_coords
distance2 = dot(pos_diff, pos_diff)

# TODO periodic

if distance2 <= search_radius^2
distance = sqrt(distance2) # TODO: eventuell fastmath

# Inline to avoid loss of performance
# compared to not using `foreach_point_neighbor`.
@inline f(point, neighbor, pos_diff, distance)
end
end
end

# next === nothing && break
neighbor_ >= length(neighborhood) && break
@synchronize()

# swap variables
n_particles_in_neighbor_cell = next_n_particles_in_neighbor_cell
local_points, next_local_points = next_local_points, local_points
local_neighbor_coords, next_local_neighbor_coords = next_local_neighbor_coords, local_neighbor_coords
end
end

@inline function foreach_point_neighbor_cell_blocks(f, system_coords, neighbor_coords,
neighborhood_search)
backend = KernelAbstractions.get_backend(system_coords)
max_particles_per_cell = 64
nhs_size = size(neighborhood_search.cell_list.linear_indices)
cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
ndrange = max_particles_per_cell * length(cells)
kernel = foreach_neighbor_cell_blocks(backend, (max_particles_per_cell,))
kernel(f, system_coords, neighbor_coords, neighborhood_search, Val(max_particles_per_cell); ndrange)

KernelAbstractions.synchronize(backend)

return nothing
end

@kernel cpu=false function foreach_neighbor_cell_blocks(f::F, system_coords, neighbor_coords,
neighborhood_search, ::Val{MAX}) where {F, MAX}
cell_ = @index(Group)
nhs_size = size(neighborhood_search.cell_list.linear_indices)
@inbounds cells = CartesianIndices(ntuple(i -> 2:(nhs_size[i] - 1), ndims(neighborhood_search)))
cell = @inbounds Tuple(cells[cell_])
particleidx = @index(Local)
@assert 1 <= particleidx <= MAX

pv = points_in_cell(cell, neighborhood_search)
n_particles_in_current_cell = length(pv)
if particleidx <= n_particles_in_current_cell
point = @inbounds pv[particleidx]
point_coords = @inbounds extract_svector(system_coords, Val(ndims(neighborhood_search)),
point)
# KernelAbstractions.@print("Point $point with coords ($(point_coords[1]), $(point_coords[2]))\n")

for neighbor_cell_ in neighboring_cells(cell, neighborhood_search)
neighbor_cell = Tuple(neighbor_cell_)
points_view = points_in_cell(neighbor_cell, neighborhood_search)

for neighbor in points_view
@inbounds neighbor_coords = extract_svector(neighbor_system_coords,
Val(ndims(neighborhood_search)), neighbor)

pos_diff = point_coords - neighbor_coords
distance2 = dot(pos_diff, pos_diff)

# TODO periodic

if distance2 <= search_radius^2
# KernelAbstractions.@print("Point $point, neighbor $neighbor with distance2 $distance2\n")
distance = sqrt(distance2) # TODO: eventuell fastmath

# Inline to avoid loss of performance
# compared to not using `foreach_point_neighbor`.
@inline f(point, neighbor, pos_diff, distance)
end
end
end
end
end

@inline function neighboring_cells(cell, neighborhood_search)
NDIMS = ndims(neighborhood_search)

Expand Down
Loading