Skip to content

Commit

Permalink
[WIP] BVH buffer in explicitly managed host/device memory on systems
Browse files Browse the repository at this point in the history
without unified memory (e.g. dGPUs)
  • Loading branch information
freibold committed Sep 13, 2024
1 parent af475bd commit ea1927b
Show file tree
Hide file tree
Showing 10 changed files with 293 additions and 84 deletions.
23 changes: 17 additions & 6 deletions kernels/common/scene.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ namespace embree

/* use proper device and context for SYCL allocations */
#if defined(EMBREE_SYCL_SUPPORT)
if (DeviceGPU* gpu_device = dynamic_cast<DeviceGPU*>(device))
hwaccel = AccelBuffer(AccelAllocator<char>(device,gpu_device->getGPUDevice(),gpu_device->getGPUContext()),0);
if (dynamic_cast<DeviceGPU*>(device))
accelBuffer = AccelBuffer(device);
#endif

/* one can overwrite flags through device for debugging */
Expand Down Expand Up @@ -789,10 +789,8 @@ namespace embree
void Scene::build_gpu_accels()
{
#if defined(EMBREE_SYCL_SUPPORT)
auto [aabb, stride] = rthwifBuild(this,hwaccel);
hwaccel_stride = stride;
bounds = LBBox<embree::Vec3fa>(aabb);
hwaccel_bounds = aabb;
accelBuffer.build(this);
bounds = LBBox<embree::Vec3fa>(accelBuffer.getBounds());
#endif
}

Expand Down Expand Up @@ -917,6 +915,10 @@ namespace embree
taskGroup->scheduler = nullptr;
throw;
}

#if defined(EMBREE_SYCL_SUPPORT)
accelBuffer.commit();
#endif
}

#endif
Expand Down Expand Up @@ -982,6 +984,10 @@ namespace embree
accels_clear();
throw;
}

#if defined(EMBREE_SYCL_SUPPORT)
accelBuffer.commit();
#endif
}
#endif

Expand Down Expand Up @@ -1024,6 +1030,11 @@ namespace embree
accels_clear();
throw;
}

#if defined(EMBREE_SYCL_SUPPORT)
accelBuffer.commit();
#endif

}
#endif

Expand Down
9 changes: 1 addition & 8 deletions kernels/common/scene.h
Original file line number Diff line number Diff line change
Expand Up @@ -296,14 +296,7 @@ namespace embree

#if defined(EMBREE_SYCL_SUPPORT)
public:
BBox3f hwaccel_bounds = empty;
AccelBuffer hwaccel; // the buffer containing the HW acceleration structures corresponding to the scene. One for each time segment, stored in a contiguous chunk of memory.
size_t hwaccel_stride; // the stride between two HW acceleration structures for different time segments stored in hwaccel.

__forceinline char* getHWAccel(uint32_t time_segment) const {
char* ptr = (char*)hwaccel.data() + time_segment * hwaccel_stride;
return ptr;
}
AccelBuffer accelBuffer;
#endif

private:
Expand Down
20 changes: 20 additions & 0 deletions kernels/level_zero/ze_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ static std::mutex zeWrapperMutex;
static void* handle = nullptr;

static decltype(zeMemFree)* zeMemFreeInternal = nullptr;
static decltype(zeMemAllocHost)* zeMemAllocHostInternal = nullptr;
static decltype(zeMemAllocDevice)* zeMemAllocDeviceInternal = nullptr;
static decltype(zeMemAllocShared)* zeMemAllocSharedInternal = nullptr;
static decltype(zeDriverGetExtensionProperties)* zeDriverGetExtensionPropertiesInternal = nullptr;
static decltype(zeDeviceGetProperties)* zeDeviceGetPropertiesInternal = nullptr;
Expand Down Expand Up @@ -156,6 +158,8 @@ ze_result_t ZeWrapper::init()
handle = load_module();

zeMemFreeInternal = find_symbol<decltype(zeMemFree)*>(handle, "zeMemFree");
zeMemAllocHostInternal = find_symbol<decltype(zeMemAllocHost)*>(handle, "zeMemAllocHost");
zeMemAllocDeviceInternal = find_symbol<decltype(zeMemAllocDevice)*>(handle, "zeMemAllocDevice");
zeMemAllocSharedInternal = find_symbol<decltype(zeMemAllocShared)*>(handle, "zeMemAllocShared");
zeDriverGetExtensionPropertiesInternal = find_symbol<decltype(zeDriverGetExtensionProperties)*>(handle, "zeDriverGetExtensionProperties");
zeDeviceGetPropertiesInternal = find_symbol<decltype(zeDeviceGetProperties)*>(handle, "zeDeviceGetProperties");
Expand Down Expand Up @@ -218,6 +222,22 @@ ze_result_t ZeWrapper::zeMemFree(ze_context_handle_t context, void* ptr)
return zeMemFreeInternal(context, ptr);
}

ze_result_t ZeWrapper::zeMemAllocHost(ze_context_handle_t context, const ze_host_mem_alloc_desc_t* desch, size_t s0, size_t s1, void** ptr)
{
if (!handle || !zeMemAllocHostInternal)
throw std::runtime_error("ZeWrapper not initialized, call ZeWrapper::init() first.");

return zeMemAllocHostInternal(context, desch, s0, s1, ptr);
}

ze_result_t ZeWrapper::zeMemAllocDevice(ze_context_handle_t context, const ze_device_mem_alloc_desc_t* descd, size_t s0, size_t s1, ze_device_handle_t ze_handle, void** ptr)
{
if (!handle || !zeMemAllocDeviceInternal)
throw std::runtime_error("ZeWrapper not initialized, call ZeWrapper::init() first.");

return zeMemAllocDeviceInternal(context, descd, s0, s1, ze_handle, ptr);
}

ze_result_t ZeWrapper::zeMemAllocShared(ze_context_handle_t context, const ze_device_mem_alloc_desc_t* descd, const ze_host_mem_alloc_desc_t* desch, size_t s0, size_t s1, ze_device_handle_t ze_handle, void** ptr)
{
if (!handle || !zeMemAllocSharedInternal)
Expand Down
2 changes: 2 additions & 0 deletions kernels/level_zero/ze_wrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ struct ZeWrapper
static ze_result_t initRTASBuilder(ze_driver_handle_t hDriver, RTAS_BUILD_MODE rtas_build_mode = RTAS_BUILD_MODE::AUTO);

static ze_result_t zeMemFree(ze_context_handle_t, void*);
static ze_result_t zeMemAllocHost(ze_context_handle_t, const ze_host_mem_alloc_desc_t*, size_t, size_t, void**);
static ze_result_t zeMemAllocDevice(ze_context_handle_t, const ze_device_mem_alloc_desc_t*, size_t, size_t, ze_device_handle_t, void**);
static ze_result_t zeMemAllocShared(ze_context_handle_t, const ze_device_mem_alloc_desc_t*, const ze_host_mem_alloc_desc_t*, size_t, size_t, ze_device_handle_t, void**);
static ze_result_t zeDriverGetExtensionProperties(ze_driver_handle_t, uint32_t*, ze_driver_extension_properties_t*);
static ze_result_t zeDeviceGetProperties(ze_device_handle_t, ze_device_properties_t*);
Expand Down
16 changes: 8 additions & 8 deletions kernels/sycl/rthwif_embree.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ __forceinline bool intersect_user_geometry(intel_ray_query_t& query, RayHit& ray
raydesc.flags |= intel_ray_flags_cull_back_facing_triangles;
#endif

intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(0);
intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(0);

intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
return false;
Expand Down Expand Up @@ -152,7 +152,7 @@ __forceinline bool intersect_user_geometry(intel_ray_query_t& query, Ray& ray, U
raydesc.flags |= intel_ray_flags_cull_back_facing_triangles;
#endif

intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(0);
intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(0);

intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);
return false;
Expand Down Expand Up @@ -208,7 +208,7 @@ __forceinline bool intersect_instance(intel_ray_query_t& query, RayHit& ray, Ins
bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
}

intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);

intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);

Expand Down Expand Up @@ -261,7 +261,7 @@ __forceinline bool intersect_instance(intel_ray_query_t& query, Ray& ray, Instan
bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
}

intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);

intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);

Expand Down Expand Up @@ -319,7 +319,7 @@ __forceinline bool intersect_instance_array(intel_ray_query_t& query, RayHit& ra
bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
}

intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);

intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);

Expand Down Expand Up @@ -374,7 +374,7 @@ __forceinline bool intersect_instance_array(intel_ray_query_t& query, Ray& ray,
bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
}

intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->getHWAccel(bvh_id);
intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) object->accelBuffer.getHWAccel(bvh_id);

intel_ray_query_forward_ray(query, raydesc, hwaccel_ptr);

Expand Down Expand Up @@ -752,7 +752,7 @@ SYCL_EXTERNAL __attribute__((always_inline)) void rtcIntersectRTHW(sycl::global_
bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
}

intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(bvh_id);
intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(bvh_id);

intel_ray_query_t query = intel_ray_query_init(raydesc, hwaccel_ptr);
intel_ray_query_start_traversal(query);
Expand Down Expand Up @@ -854,7 +854,7 @@ SYCL_EXTERNAL __attribute__((always_inline)) void rtcOccludedRTHW(sycl::global_p
bvh_id = (uint32_t) clamp(uint32_t(numTimeSegments*time), 0u, numTimeSegments-1);
}

intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->getHWAccel(bvh_id);
intel_raytracing_acceleration_structure_t hwaccel_ptr = (intel_raytracing_acceleration_structure_t) scene->accelBuffer.getHWAccel(bvh_id);

intel_ray_query_t query = intel_ray_query_init(raydesc, hwaccel_ptr);
intel_ray_query_start_traversal(query);
Expand Down
106 changes: 95 additions & 11 deletions kernels/sycl/rthwif_embree_builder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ namespace embree
return sycl_device.get_info<sycl::info::device::max_compute_units>();
}

void* rthwifAllocAccelBuffer(Device* embree_device, size_t bytes, sycl::device device, sycl::context context)
void* rthwifAllocAccelBuffer(Device* embree_device, size_t bytes, sycl::device device, sycl::context context, sycl::usm::alloc alloc_type)
{
ze_context_handle_t hContext = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(context);
ze_device_handle_t hDevice = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(device);
Expand All @@ -176,28 +176,44 @@ namespace embree
relaxed.stype = ZE_STRUCTURE_TYPE_RELAXED_ALLOCATION_LIMITS_EXP_DESC;
relaxed.pNext = &rt_desc;
relaxed.flags = ZE_RELAXED_ALLOCATION_LIMITS_EXP_FLAG_MAX_SIZE;

ze_device_mem_alloc_desc_t device_desc;
device_desc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
device_desc.pNext = &relaxed;
device_desc.flags = ZE_DEVICE_MEM_ALLOC_FLAG_BIAS_CACHED;
device_desc.ordinal = 0;

ze_host_mem_alloc_desc_t host_desc;
host_desc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
host_desc.pNext = nullptr;
host_desc.flags = ZE_HOST_MEM_ALLOC_FLAG_BIAS_CACHED;

void* ptr = nullptr;

// TODO: modify memory monitor to account for host and device code separately
if (embree_device) embree_device->memoryMonitor(bytes,false);
ze_result_t result = ZeWrapper::zeMemAllocShared(hContext,&device_desc,&host_desc,bytes,rtasProp.rtasBufferAlignment,hDevice,&ptr);

ze_result_t result;
switch (alloc_type) {
case sycl::usm::alloc::host:
result = ZeWrapper::zeMemAllocHost(hContext,&host_desc,bytes,rtasProp.rtasBufferAlignment,&ptr);
break;
case sycl::usm::alloc::device:
result = ZeWrapper::zeMemAllocDevice(hContext,&device_desc,bytes,rtasProp.rtasBufferAlignment,hDevice,&ptr);
break;
case sycl::usm::alloc::shared:
result = ZeWrapper::zeMemAllocShared(hContext,&device_desc,&host_desc,bytes,rtasProp.rtasBufferAlignment,hDevice,&ptr);
break;
default:
throw_RTCError(RTC_ERROR_UNKNOWN, "sycl::usm::alloc type unknown in rthwifAllocAccelBuffer");
}

if (result != ZE_RESULT_SUCCESS)
throw_RTCError(RTC_ERROR_OUT_OF_MEMORY,"rtas memory allocation failed");

return ptr;
}

void rthwifFreeAccelBuffer(Device* embree_device, void* ptr, size_t bytes, sycl::context context)
{
if (ptr == nullptr) return;
Expand Down Expand Up @@ -362,9 +378,9 @@ namespace embree
const AffineSpace3fa local2world = geom->getLocal2World();
out->transformFormat = ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_ALIGNED_COLUMN_MAJOR;
out->pTransform = (float*) &out->xfmdata;
out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->hwaccel_bounds;
out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->accelBuffer.getBounds();
out->xfmdata = *(ze_rtas_transform_float3x4_aligned_column_major_exp_t*) &local2world;
out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->getHWAccel(0);
out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->accelBuffer.getHWAccel(0);
}

void createGeometryDesc(ze_rtas_builder_instance_geometry_info_exp_t* out, Scene* scene, Instance* geom)
Expand All @@ -377,8 +393,8 @@ namespace embree
out->instanceUserID = 0;
out->transformFormat = ZE_RTAS_BUILDER_INPUT_DATA_FORMAT_EXP_FLOAT3X4_ALIGNED_COLUMN_MAJOR;
out->pTransform = (float*) &geom->local2world[0];
out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->hwaccel_bounds;
out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->getHWAccel(0);
out->pBounds = (ze_rtas_aabb_exp_t*) &dynamic_cast<Scene*>(geom->object)->accelBuffer.getBounds();
out->pAccelerationStructure = dynamic_cast<Scene*>(geom->object)->accelBuffer.getHWAccel(0);
}

void createGeometryDesc(char* out, Scene* scene, Geometry* geom, GEOMETRY_TYPE type)
Expand Down Expand Up @@ -417,7 +433,7 @@ namespace embree
return result;
}

std::tuple<BBox3f, size_t> rthwifBuild(Scene* scene, AccelBuffer& accel)
std::tuple<BBox3f, size_t> rthwifBuild(Scene* scene, AccelBufferData& accel)
{
DeviceGPU* gpuDevice = dynamic_cast<DeviceGPU*>(scene->device);
if (gpuDevice == nullptr) throw std::runtime_error("internal error");
Expand Down Expand Up @@ -647,4 +663,72 @@ namespace embree

return std::tie(fullBounds, sizeTotal.rtasBufferSizeBytesExpected);
}

AccelBuffer::AccelBuffer(Device *device) : device(device)
{
DeviceGPU *gpu_device = dynamic_cast<DeviceGPU *>(device);

if (!gpu_device)
{
throw_RTCError(RTC_ERROR_INVALID_ARGUMENT, "AccelBuffer constructor called with non-GPU device");
}

unifiedMemory = false; //gpu_device->has_unified_memory();

if (unifiedMemory)
{
accelBufferShared = AccelBufferData(AccelAllocator<char>(device, gpu_device->getGPUDevice(), gpu_device->getGPUContext(), sycl::usm::alloc::shared), 0);
}
else
{
accelBufferHost = AccelBufferData(AccelAllocator<char>(device, gpu_device->getGPUDevice(), gpu_device->getGPUContext(), sycl::usm::alloc::host), 0);
accelBufferDevice = AccelBufferData(AccelAllocator<char>(device, gpu_device->getGPUDevice(), gpu_device->getGPUContext(), sycl::usm::alloc::shared), 0);
}
}

void AccelBuffer::build(Scene *scene)
{
auto [aabb, stride] = rthwifBuild(scene, getAccelBufferData());
hwaccel_stride = stride;
hwaccel_bounds = aabb;
}

void AccelBuffer::commit()
{
if (unifiedMemory) {
hwaccel = (char*)accelBufferShared.data();
return;
}

auto deviceGPU = reinterpret_cast<DeviceGPU*>(device);
if (!deviceGPU) {
return;
}

std::cout << "accelBufferHost.size(): " << accelBufferHost.size() << std::endl;
std::cout << "accelBufferDevice.size(): " << accelBufferDevice.size() << std::endl;

accelBufferDevice.resize(accelBufferHost.size());

sycl::queue queue(deviceGPU->getGPUDevice());
queue.memcpy(accelBufferDevice.data(), accelBufferHost.data(), accelBufferHost.size());
queue.wait_and_throw();

std::vector<char> host_data(accelBufferHost.size());
queue.memcpy(host_data.data(), accelBufferDevice.data(), accelBufferDevice.size());
queue.wait_and_throw();

for (size_t i = 0; i < accelBufferHost.size(); ++i) {
if (accelBufferHost[i] != host_data[i]) {
std::cout << (int)accelBufferHost[i] << " - " << (int)host_data[i] << std::endl;
}
}

std::cout << "accelBufferHost.size(): " << accelBufferHost.size() << std::endl;
std::cout << "accelBufferDevice.size(): " << accelBufferDevice.size() << std::endl;

hwaccel = (char*)accelBufferDevice.data();
printf("hwaccel %p\n", hwaccel);
}

}
Loading

0 comments on commit ea1927b

Please sign in to comment.