postprocess: hailo: Handle multiple scaler crops on the ISP outputs
Add a HailoPostProcessingStage::ConvertInferenceCoordinates() helper
that converts from the inference (low res) image co-ordinate space
to the main image space, accounting for different scaler crops.

Fix up the Yolo inference and pose estimation stages to use this new
helper.

Signed-off-by: Naushir Patuck <naush@raspberrypi.com>
naushir committed Aug 29, 2024
1 parent e3357be commit f58bfe1
Showing 4 changed files with 96 additions and 29 deletions.
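
The following is a minimal, self-contained sketch (not part of the commit) of the mapping the new ConvertInferenceCoordinates() helper performs, using plain structs in place of the libcamera types; the function name mapToMain and the crop numbers in main() are made-up for illustration only.

#include <algorithm>
#include <cmath>
#include <iostream>

struct Rect { int x, y, width, height; };

// Map a normalised [0,1] box from the low-res (inference) stream into the
// main ISP output, given the scaler crops of the main stream (main_crop) and
// the low-res stream (lores_crop), both expressed in sensor coordinates.
// For simplicity this assumes the box overlaps the main crop.
Rect mapToMain(float nx, float ny, float nw, float nh, const Rect &main_crop, const Rect &lores_crop,
	       int isp_width, int isp_height)
{
	// Scale the normalised box onto the low-res scaler crop (sensor pixels).
	Rect obj;
	obj.x = std::lround(nx * (lores_crop.width - 1));
	obj.y = std::lround(ny * (lores_crop.height - 1));
	obj.width = std::lround(nw * (lores_crop.width - 1));
	obj.height = std::lround(nh * (lores_crop.height - 1));

	// Translate by the low-res crop offset -> absolute sensor coordinates.
	obj.x += lores_crop.x;
	obj.y += lores_crop.y;

	// Bound to the main stream's scaler crop.
	const int x0 = std::max(obj.x, main_crop.x);
	const int y0 = std::max(obj.y, main_crop.y);
	const int x1 = std::min(obj.x + obj.width, main_crop.x + main_crop.width);
	const int y1 = std::min(obj.y + obj.height, main_crop.y + main_crop.height);

	// Translate back to the main crop's origin and scale to the ISP output size.
	return { (x0 - main_crop.x) * isp_width / main_crop.width,
		 (y0 - main_crop.y) * isp_height / main_crop.height,
		 (x1 - x0) * isp_width / main_crop.width,
		 (y1 - y0) * isp_height / main_crop.height };
}

int main()
{
	// Example: both streams share the same 1920x1080 crop, ISP output is 1920x1080.
	const Rect crop{ 0, 0, 1920, 1080 };
	const Rect r = mapToMain(0.25f, 0.25f, 0.5f, 0.5f, crop, crop, 1920, 1080);
	std::cout << r.x << "," << r.y << " " << r.width << "x" << r.height << "\n"; // ~480,270 960x540
}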
31 changes: 31 additions & 0 deletions post_processing_stages/hailo/hailo_postprocessing_stage.cpp
@@ -15,6 +15,9 @@

using namespace hailort;

using Rectangle = libcamera::Rectangle;
using Size = libcamera::Size;

Allocator::Allocator()
{
}
@@ -255,3 +258,31 @@ HailoROIPtr HailoPostProcessingStage::MakeROI(const std::vector<OutTensor> &outp

return roi;
}

Rectangle HailoPostProcessingStage::ConvertInferenceCoordinates(const std::vector<float> &coords,
const std::vector<Rectangle> &scaler_crops) const
{
if (coords.size() != 4 || scaler_crops.size() != 2)
return {};

// Convert the inference image co-ordinates into the final ISP output co-ordinates.
const Size &isp_output_size = output_stream_->configuration().size;

// Object scaled to the full sensor resolution
Rectangle obj;
obj.x = std::round(coords[0] * (scaler_crops[1].width - 1));
obj.y = std::round(coords[1] * (scaler_crops[1].height - 1));
obj.width = std::round(coords[2] * (scaler_crops[1].width - 1));
obj.height = std::round(coords[3] * (scaler_crops[1].height - 1));

// Object on the low res scaler crop -> translated by the start of the low res crop offset
const Rectangle obj_translated_l = obj.translatedBy(scaler_crops[1].topLeft());
// -> bounded to the high res output crop
const Rectangle obj_bounded = obj_translated_l.boundedTo(scaler_crops[0]);
// -> translated to the start of the high res crop offset
const Rectangle obj_translated_h = obj_bounded.translatedBy(-scaler_crops[0].topLeft());
// -> and scaled to the ISP output.
const Rectangle obj_scaled = obj_translated_h.scaledBy(isp_output_size, scaler_crops[0].size());

return obj_scaled;
}
5 changes: 5 additions & 0 deletions post_processing_stages/hailo/hailo_postprocessing_stage.hpp
@@ -13,6 +13,8 @@
#include <string>
#include <vector>

#include <libcamera/geometry.h>

#include <hailo/hailort.hpp>
#include "hailo_objects.hpp"

@@ -110,6 +112,9 @@ class HailoPostProcessingStage : public PostProcessingStage
hailo_status DispatchJob(const uint8_t *input, hailort::AsyncInferJob &job, std::vector<OutTensor> &output_tensors);
HailoROIPtr MakeROI(const std::vector<OutTensor> &output_tensors) const;

libcamera::Rectangle ConvertInferenceCoordinates(const std::vector<float> &coords,
const std::vector<libcamera::Rectangle> &scaler_crops) const;

libcamera::Stream *low_res_stream_;
libcamera::Stream *output_stream_;
libcamera::Stream *raw_stream_;
38 changes: 28 additions & 10 deletions post_processing_stages/hailo/hailo_yolo_inference.cpp
@@ -13,6 +13,7 @@
#include <string>
#include <vector>

#include <libcamera/controls.h>
#include <libcamera/geometry.h>

#include "core/rpicam_app.hpp"
@@ -28,6 +29,8 @@ using PostProcFuncPtrNms = void (*)(HailoROIPtr);
using InitFuncPtr = YoloParams *(*)(std::string, std::string);
using FreeFuncPtr = void (*)(void *);

using Rectangle = libcamera::Rectangle;

namespace fs = std::filesystem;

#define NAME "hailo_yolo_inference"
Expand All @@ -49,7 +52,7 @@ class YoloInference : public HailoPostProcessingStage
bool Process(CompletedRequestPtr &completed_request) override;

private:
std::vector<Detection> runInference(const uint8_t *frame);
std::vector<Detection> runInference(const uint8_t *frame, const std::vector<libcamera::Rectangle> &scaler_crops);
void filterOutputObjects(std::vector<Detection> &objects);

struct LtObject
@@ -187,7 +190,23 @@ bool YoloInference::Process(CompletedRequestPtr &completed_request)
return false;
}

std::vector<Detection> objects = runInference(input_ptr);
std::vector<Rectangle> scaler_crops;
auto scaler_crop = completed_request->metadata.get(controls::ScalerCrop);
auto rpi_scaler_crop = completed_request->metadata.get(controls::rpi::ScalerCrops);

if (rpi_scaler_crop)
{
for (unsigned int i = 0; i < rpi_scaler_crop->size(); i++)
scaler_crops.push_back(rpi_scaler_crop->data()[i]);
}
else if (scaler_crop)
{
// Push-back twice, once for main, once for low res.
scaler_crops.push_back(*scaler_crop);
scaler_crops.push_back(*scaler_crop);
}

std::vector<Detection> objects = runInference(input_ptr, scaler_crops);
if (objects.size())
{
if (temporal_filtering_)
@@ -215,7 +234,7 @@ bool YoloInference::Process(CompletedRequestPtr &completed_request)
return false;
}

std::vector<Detection> YoloInference::runInference(const uint8_t *frame)
std::vector<Detection> YoloInference::runInference(const uint8_t *frame, const std::vector<Rectangle> &scaler_crops)
{
hailort::AsyncInferJob job;
std::vector<OutTensor> output_tensors;
@@ -265,20 +284,19 @@ std::vector<Detection> YoloInference::runInference(const uint8_t *frame)

// Translate results to the rpicam-apps Detection objects
std::vector<Detection> results;
Size output_size = output_stream_->configuration().size;
for (auto const &d : detections)
{
if (d->get_confidence() < threshold_)
continue;

// Extract bounding box co-ordinates in the output image co-ordinates.
auto const &box = d->get_bbox();
int x0 = std::round(std::max(box.xmin(), 0.0f) * (output_size.width - 1));
int x1 = std::round(std::min(box.xmax(), 1.0f) * (output_size.width - 1));
int y0 = std::round(std::max(box.ymin(), 0.0f) * (output_size.height - 1));
int y1 = std::round(std::min(box.ymax(), 1.0f) * (output_size.height - 1));
results.emplace_back(d->get_class_id(), d->get_label(), d->get_confidence(), x0, y0,
x1 - x0, y1 - y0);
const float x0 = std::max(box.xmin(), 0.0f);
const float x1 = std::min(box.xmax(), 1.0f);
const float y0 = std::max(box.ymin(), 0.0f);
const float y1 = std::min(box.ymax(), 1.0f);
libcamera::Rectangle r = ConvertInferenceCoordinates({ x0, y0, x1 - x0, y1 - y0 }, scaler_crops);
results.emplace_back(d->get_class_id(), d->get_label(), d->get_confidence(), r.x, r.y, r.width, r.height);
LOG(2, "Object: " << results.back().toString());

if (--max_detections_ == 0)
51 changes: 32 additions & 19 deletions post_processing_stages/hailo/hailo_yolov8_pose.cpp
@@ -18,6 +18,7 @@
#include "core/rpicam_app.hpp"
#include "hailo_postprocessing_stage.hpp"

using Rectangle = libcamera::Rectangle;
using Size = libcamera::Size;
using PostProcFuncPtr = std::pair<std::vector<KeyPt>, std::vector<PairPairs>>(*)(HailoROIPtr);

@@ -38,7 +39,7 @@ class YoloPose : public HailoPostProcessingStage
bool Process(CompletedRequestPtr &completed_request) override;

private:
void runInference(const uint8_t *input, uint32_t *output);
void runInference(const uint8_t *input, uint32_t *output, const std::vector<Rectangle> &scaler_crops);

PostProcessingLib postproc_;
};
@@ -113,16 +114,32 @@ bool YoloPose::Process(CompletedRequestPtr &completed_request)
return false;
}

std::vector<Rectangle> scaler_crops;
auto scaler_crop = completed_request->metadata.get(controls::ScalerCrop);
auto rpi_scaler_crop = completed_request->metadata.get(controls::rpi::ScalerCrops);

if (rpi_scaler_crop)
{
for (unsigned int i = 0; i < rpi_scaler_crop->size(); i++)
scaler_crops.push_back(rpi_scaler_crop->data()[i]);
}
else if (scaler_crop)
{
// Push-back twice, once for main, once for low res.
scaler_crops.push_back(*scaler_crop);
scaler_crops.push_back(*scaler_crop);
}

BufferWriteSync w(app_, completed_request->buffers[output_stream_]);
libcamera::Span<uint8_t> buffer = w.Get()[0];
uint32_t *output = (uint32_t *)buffer.data();

runInference(input_ptr, output);
runInference(input_ptr, output, scaler_crops);

return false;
}

void YoloPose::runInference(const uint8_t *input, uint32_t *output)
void YoloPose::runInference(const uint8_t *input, uint32_t *output, const std::vector<Rectangle> &scaler_crops)
{
hailort::AsyncInferJob job;
std::vector<OutTensor> output_tensors;
@@ -159,30 +176,26 @@ void YoloPose::runInference(const uint8_t *input, uint32_t *output)
continue;

HailoBBox bbox = detection->get_bbox();

cv::rectangle(image,
cv::Point2f(bbox.xmin() * float(output_stream_info_.width),
bbox.ymin() * float(output_stream_info_.height)),
cv::Point2f(bbox.xmax() * float(output_stream_info_.width),
bbox.ymax() * float(output_stream_info_.height)),
const float x0 = std::max(bbox.xmin(), 0.0f);
const float x1 = std::min(bbox.xmax(), 1.0f);
const float y0 = std::max(bbox.ymin(), 0.0f);
const float y1 = std::min(bbox.ymax(), 1.0f);
Rectangle r = ConvertInferenceCoordinates({ x0, y0, x1 - x0, y1 - y0 }, scaler_crops);
cv::rectangle(image, cv::Point2f(r.x, r.y), cv::Point2f(r.x + r.width, r.y + r.height),
cv::Scalar(0, 0, 255), 1);
}

for (auto &keypoint : keypoints_and_pairs.first)
{
cv::circle(image,
cv::Point(keypoint.xs * float(output_stream_info_.width),
keypoint.ys * float(output_stream_info_.height)),
3, cv::Scalar(255, 0, 255), -1);
Rectangle r = ConvertInferenceCoordinates({ keypoint.xs, keypoint.ys, 1, 1 }, scaler_crops);
cv::circle(image, cv::Point(r.x, r.y), 3, cv::Scalar(255, 0, 255), -1);
}

for (PairPairs &p : keypoints_and_pairs.second)
for (const PairPairs &p : keypoints_and_pairs.second)
{
auto pt1 =
cv::Point(p.pt1.first * float(output_stream_info_.width), p.pt1.second * float(output_stream_info_.height));
auto pt2 =
cv::Point(p.pt2.first * float(output_stream_info_.width), p.pt2.second * float(output_stream_info_.height));
cv::line(image, pt1, pt2, cv::Scalar(255, 0, 255), 3);
Rectangle r1 = ConvertInferenceCoordinates({ p.pt1.first, p.pt1.second, 1, 1 }, scaler_crops);
Rectangle r2 = ConvertInferenceCoordinates({ p.pt2.first, p.pt2.second, 1, 1 }, scaler_crops);
cv::line(image, cv::Point(r1.x, r1.y), cv::Point(r2.x, r2.y), cv::Scalar(255, 0, 255), 3);
}
}

