From ea6be06be799dcea46fbe8321e4dbf257ed2e0bc Mon Sep 17 00:00:00 2001 From: Nicola Cabiddu Date: Wed, 20 Mar 2024 14:34:41 +0000 Subject: [PATCH] some code cleanup and heuristic for packed --- src/realm/array_direct.hpp | 2 +- src/realm/array_encode.cpp | 2 +- src/realm/array_flex.cpp | 42 ----------- src/realm/array_flex.hpp | 138 +++++++++++++++++++++++++++---------- src/realm/array_packed.cpp | 72 ------------------- src/realm/array_packed.hpp | 123 +++++++++++++++++++++++++++++++-- 6 files changed, 223 insertions(+), 156 deletions(-) diff --git a/src/realm/array_direct.hpp b/src/realm/array_direct.hpp index 66920221483..04c99d464ac 100644 --- a/src/realm/array_direct.hpp +++ b/src/realm/array_direct.hpp @@ -346,7 +346,7 @@ class bf_iterator { } field_position = next_field_position; } - + inline void move(size_t index, size_t initial_offset = 0) { field_position = initial_offset + index * step_size; diff --git a/src/realm/array_encode.cpp b/src/realm/array_encode.cpp index e58ed3287e4..4ead75b45dd 100644 --- a/src/realm/array_encode.cpp +++ b/src/realm/array_encode.cpp @@ -148,7 +148,7 @@ bool ArrayEncode::always_encode(const Array& origin, Array& arr, bool packed) co bool ArrayEncode::encode(const Array& origin, Array& arr) const { // return false; - // return always_encode(origin, arr, false); // true packed, false flex + // return always_encode(origin, arr, true); // true packed, false flex std::vector values; std::vector indices; diff --git a/src/realm/array_flex.cpp b/src/realm/array_flex.cpp index e5765d12cba..8a03c19224f 100644 --- a/src/realm/array_flex.cpp +++ b/src/realm/array_flex.cpp @@ -30,16 +30,6 @@ using namespace realm; -inline bool run_eq_neq_parallel_subscan(size_t w, size_t range) -{ - return w < 32 && range >= 50; -} - -inline bool run_lt_gt_parallel_subscan(size_t w, size_t range) -{ - return w < 16 && range >= 50; -} - void ArrayFlex::init_array(char* h, uint8_t flags, size_t v_width, size_t ndx_width, size_t v_size, size_t ndx_size) const { @@ -136,38 +126,6 @@ void ArrayFlex::get_chunk(const Array& arr, size_t ndx, int64_t res[8]) const } } -bool ArrayFlex::find_eq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, - QueryStateBase* state) const -{ - if(!run_eq_neq_parallel_subscan(arr.m_width, end-start)) - return find_linear(arr, value, start, end, baseindex, state); - return find_parallel(arr, value, start, end, baseindex, state); -} - -bool ArrayFlex::find_neq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, - QueryStateBase* state) const -{ - if(!run_eq_neq_parallel_subscan(arr.m_width, end-start)) - return find_linear(arr, value, start, end, baseindex, state); - return find_parallel(arr, value, start, end, baseindex, state); -} - -bool ArrayFlex::find_lt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, - QueryStateBase* state) const -{ - if(!run_lt_gt_parallel_subscan(arr.m_width, end-start)) - return find_linear(arr, value, start, end, baseindex, state); - return find_parallel(arr, value, start, end, baseindex, state); -} - -bool ArrayFlex::find_gt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, - QueryStateBase* state) const -{ - if(!run_lt_gt_parallel_subscan(arr.m_width, end-start)) - return find_linear(arr, value, start, end, baseindex, state); - return find_parallel(arr, value, start, end, baseindex, state); -} - int64_t ArrayFlex::sum(const Array& arr, size_t start, size_t end) const { const auto& encoder = arr.m_encoder; diff --git a/src/realm/array_flex.hpp b/src/realm/array_flex.hpp index 05f6f42c234..c5ee6d14585 100644 --- a/src/realm/array_flex.hpp +++ b/src/realm/array_flex.hpp @@ -54,22 +54,26 @@ class ArrayFlex { private: int64_t do_get(uint64_t*, size_t, size_t, size_t, size_t, size_t, uint64_t) const; bool find_all_match(size_t, size_t, size_t, QueryStateBase*) const; - - template + + template inline bool find_linear(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; - - template + + template inline bool find_parallel(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; - - bool find_eq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; - bool find_neq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; - bool find_lt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; - bool find_gt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + + inline bool find_eq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + inline bool find_neq(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + inline bool find_lt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + inline bool find_gt(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + + inline bool run_eq_neq_parallel_subscan(size_t, size_t, size_t, size_t) const; + inline bool run_lt_gt_parallel_subscan(size_t, size_t, size_t, size_t) const; }; -template -inline bool ArrayFlex::find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const +template +inline bool ArrayFlex::find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const { REALM_ASSERT_DEBUG(start <= arr.m_size && (end <= arr.m_size || end == size_t(-1)) && start <= end); Cond c; @@ -91,7 +95,7 @@ inline bool ArrayFlex::find_all(const Array& arr, int64_t value, size_t start, s } REALM_ASSERT_3(arr.m_width, !=, 0); - + if constexpr (std::is_same_v) { return find_eq(arr, value, start, end, baseindex, state); } @@ -107,35 +111,36 @@ inline bool ArrayFlex::find_all(const Array& arr, int64_t value, size_t start, s return true; } -template -inline bool ArrayFlex::find_linear(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const +template +inline bool ArrayFlex::find_linear(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const { - const auto cmp = [](int64_t item, int64_t key){ - if constexpr(std::is_same_v) + const auto cmp = [](int64_t item, int64_t key) { + if constexpr (std::is_same_v) return item == key; - if constexpr(std::is_same_v) { + if constexpr (std::is_same_v) { return item != key; } - if constexpr(std::is_same_v) { + if constexpr (std::is_same_v) { return item < key; } - if constexpr(std::is_same_v) { + if constexpr (std::is_same_v) { return item > key; } REALM_UNREACHABLE(); }; - + auto data = (uint64_t*)arr.m_data; const auto& encoder = arr.get_encoder(); const auto offset = encoder.width() * encoder.v_size(); const auto v_width = encoder.width(); const auto ndx_width = encoder.ndx_width(); - + bf_iterator ndx_it((uint64_t*)data, offset, ndx_width, ndx_width, start); bf_iterator val_it((uint64_t*)data, 0, v_width, v_width, *ndx_it); - while(start < end) { + while (start < end) { const auto sv = sign_extend_field_by_mask(encoder.width_mask(), *val_it); - if(cmp(sv, value) && !state->match(start + baseindex)) + if (cmp(sv, value) && !state->match(start + baseindex)) return false; ++start; ++ndx_it; @@ -151,12 +156,12 @@ inline uint64_t vector_compare(uint64_t MSBs, uint64_t a, uint64_t b) return find_all_fields_EQ(MSBs, a, b); if constexpr (std::is_same_v) return find_all_fields_NE(MSBs, a, b); - - if constexpr (std::is_same_v){ - if(std::is_same_v) + + if constexpr (std::is_same_v) { + if (std::is_same_v) return find_all_fields_signed_GT(MSBs, a, b); - if(std::is_same_v) - return find_all_fields_unsigned_GT(MSBs,a, b); + if (std::is_same_v) + return find_all_fields_unsigned_GT(MSBs, a, b); REALM_UNREACHABLE(); } if constexpr (std::is_same_v) { @@ -180,11 +185,11 @@ inline uint64_t vector_compare(uint64_t MSBs, uint64_t a, uint64_t b) return find_all_fields_unsigned_LE(MSBs, a, b); REALM_UNREACHABLE(); } - } -template -inline bool ArrayFlex::find_parallel(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, QueryStateBase* state) const +template +inline bool ArrayFlex::find_parallel(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const { const auto& encoder = arr.m_encoder; const auto v_width = encoder.width(); @@ -192,26 +197,87 @@ inline bool ArrayFlex::find_parallel(const Array& arr, int64_t value, size_t sta const auto ndx_width = encoder.ndx_width(); const auto offset = v_size * v_width; uint64_t* data = (uint64_t*)arr.m_data; - + auto MSBs = encoder.msb(); auto search_vector = populate(v_width, value); auto v_start = parallel_subword_find(vector_compare, data, 0, v_width, MSBs, search_vector, 0, v_size); if (v_start == v_size) return true; - + MSBs = encoder.ndx_msb(); search_vector = populate(ndx_width, v_start); while (start < end) { - start = - parallel_subword_find(vector_compare, data, offset, ndx_width, MSBs, search_vector, start, end); + start = parallel_subword_find(vector_compare, data, offset, ndx_width, MSBs, + search_vector, start, end); if (start < end) if (!state->match(start + baseindex)) return false; - + ++start; } return true; } +inline bool ArrayFlex::find_eq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ + const auto v_width = arr.m_width; + const auto v_range = arr.get_encoder().v_size(); + const auto ndx_width = arr.get_encoder().ndx_width(); + const auto ndx_range = end - start; + if (!run_eq_neq_parallel_subscan(v_width, v_range, ndx_width, ndx_range)) + return find_linear(arr, value, start, end, baseindex, state); + return find_parallel(arr, value, start, end, baseindex, state); +} + +inline bool ArrayFlex::find_neq(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ + const auto v_width = arr.m_width; + const auto v_range = arr.get_encoder().v_size(); + const auto ndx_width = arr.get_encoder().ndx_width(); + const auto ndx_range = end - start; + if (!run_eq_neq_parallel_subscan(v_width, v_range, ndx_width, ndx_range)) + return find_linear(arr, value, start, end, baseindex, state); + return find_parallel(arr, value, start, end, baseindex, state); +} + +inline bool ArrayFlex::find_lt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ + const auto v_width = arr.m_width; + const auto v_range = arr.get_encoder().v_size(); + const auto ndx_width = arr.get_encoder().ndx_width(); + const auto ndx_range = end - start; + if (!run_lt_gt_parallel_subscan(v_width, v_range, ndx_width, ndx_range)) + return find_linear(arr, value, start, end, baseindex, state); + return find_parallel(arr, value, start, end, baseindex, state); +} + +inline bool ArrayFlex::find_gt(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ + const auto v_width = arr.m_width; + const auto v_range = arr.get_encoder().v_size(); + const auto ndx_width = arr.get_encoder().ndx_width(); + const auto ndx_range = end - start; + if (!run_lt_gt_parallel_subscan(v_width, v_range, ndx_width, ndx_range)) + return find_linear(arr, value, start, end, baseindex, state); + return find_parallel(arr, value, start, end, baseindex, state); +} + +inline bool ArrayFlex::run_eq_neq_parallel_subscan(size_t v_width, size_t v_range, size_t ndx_width, + size_t ndx_range) const +{ + return v_width < 32 && ndx_width < 32 && v_range >= 16 && ndx_range >= 16; +} + +inline bool ArrayFlex::run_lt_gt_parallel_subscan(size_t v_width, size_t v_range, size_t ndx_width, + size_t ndx_range) const +{ + return v_width < 16 && ndx_width < 16 && v_range >= 16 && ndx_range >= 16; +} + + } // namespace realm #endif // REALM_ARRAY_COMPRESS_HPP diff --git a/src/realm/array_packed.cpp b/src/realm/array_packed.cpp index d7925fa8cd3..432eec5f766 100644 --- a/src/realm/array_packed.cpp +++ b/src/realm/array_packed.cpp @@ -32,12 +32,6 @@ using namespace realm; -template bool ArrayPacked::find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; -template bool ArrayPacked::find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; -template bool ArrayPacked::find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; -template bool ArrayPacked::find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; - - void ArrayPacked::init_array(char* h, uint8_t flags, size_t v_width, size_t v_size) const { using Encoding = NodeHeader::Encoding; @@ -112,72 +106,6 @@ void ArrayPacked::get_chunk(const Array& arr, size_t ndx, int64_t res[8]) const } } -template -uint64_t vector_compare(uint64_t MSBs, uint64_t a, uint64_t b) -{ - if constexpr (std::is_same_v) - return find_all_fields_EQ(MSBs, a, b); - if constexpr (std::is_same_v) - return find_all_fields_NE(MSBs, a, b); - if constexpr (std::is_same_v) - return find_all_fields_signed_GT(MSBs, a, b); - if constexpr (std::is_same_v) - return find_all_fields_signed_LT(MSBs, a, b); -} - -template -bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, - QueryStateBase* state) const -{ - REALM_ASSERT_DEBUG(start <= arr.m_size && (end <= arr.m_size || end == size_t(-1)) && start <= end); - Cond c; - - if (end == npos) - end = arr.m_size; - - if (!(arr.m_size > start && start < end)) - return true; - - const auto lbound = arr.m_lbound; - const auto ubound = arr.m_ubound; - - if (!c.can_match(value, lbound, ubound)) - return true; - - if (c.will_match(value, lbound, ubound)) { - return find_all_match(start, end, baseindex, state); - } - - REALM_ASSERT_3(arr.m_width, !=, 0); - // NOTE: this is one of the most important functions in the whole codebase, since it determines how fast the - // queries run. - // - // Main idea around find. - // Try to find the starting point where the condition can be met, comparing as many values as a single 64bit can - // contain in parallel. Once we have found the starting point, keep matching values as much as we can between - // start and end. - // - // EG: we store the value 6, with width 4bits (0110), 6 is 4 bits because, 110 (6) + sign bit 0. - // Inside 64bits we can fit max 16 times 6. If we go from index 0 to 15 throughout the same 64 bits, we need to - // apply a mask and a shift bits every time, then compare the values. - // This is not the cheapest thing to do. Instead we can compare all values contained within 64 bits in one go and - // see if there is a match with what we are looking for. Reducing the number of comparison by ~logk(N) where K is - // the width of each single value within a 64 bit word and N is the total number of values stored in the array. - - // in packed format a parallel subword find pays off also for width >= 32 - const auto MSBs = arr.get_encoder().msb(); - const auto search_vector = populate(arr.m_width, value); - while (start < end) { - start = parallel_subword_find(vector_compare, (const uint64_t*)arr.m_data, 0, arr.m_width, MSBs, - search_vector, start, end); - if (start < end) - if (!state->match(start + baseindex)) - return false; - ++start; - } - return true; -} - bool ArrayPacked::find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const { REALM_ASSERT_DEBUG(state->match_count() < state->limit()); diff --git a/src/realm/array_packed.hpp b/src/realm/array_packed.hpp index a7bf7ddd020..aa43ccd8700 100644 --- a/src/realm/array_packed.hpp +++ b/src/realm/array_packed.hpp @@ -19,6 +19,9 @@ #ifndef REALM_ARRAY_PACKED_HPP #define REALM_ARRAY_PACKED_HPP +#include +#include + #include #include @@ -28,9 +31,6 @@ namespace realm { // Compress array in Packed format // Decompress array in WTypeBits formats // -class Array; -class ArrayEncode; -class QueryStateBase; class ArrayPacked { public: // encoding/decoding @@ -43,13 +43,128 @@ class ArrayPacked { void set_direct(const Array&, size_t, int64_t) const; template - bool find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + inline bool find_all(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; int64_t sum(const Array&, size_t, size_t) const; private: int64_t do_get(uint64_t*, size_t, size_t, size_t, uint64_t) const; bool find_all_match(size_t start, size_t end, size_t baseindex, QueryStateBase* state) const; + + template + inline bool find_parallel(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + + template + inline bool find_linear(const Array&, int64_t, size_t, size_t, size_t, QueryStateBase*) const; + + inline bool run_parallel_scan(size_t width, size_t range) const; }; + +template +inline bool ArrayPacked::find_all(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ + REALM_ASSERT_DEBUG(start <= arr.m_size && (end <= arr.m_size || end == size_t(-1)) && start <= end); + Cond c; + + if (end == npos) + end = arr.m_size; + + if (!(arr.m_size > start && start < end)) + return true; + + const auto lbound = arr.m_lbound; + const auto ubound = arr.m_ubound; + + if (!c.can_match(value, lbound, ubound)) + return true; + + if (c.will_match(value, lbound, ubound)) { + return find_all_match(start, end, baseindex, state); + } + + REALM_ASSERT_3(arr.m_width, !=, 0); + + if (!run_parallel_scan(arr.m_width, end - start)) + return find_linear(arr, value, start, end, baseindex, state); + + return find_parallel(arr, value, start, end, baseindex, state); +} + +template +inline bool ArrayPacked::find_parallel(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ + // + // Main idea around find parallel (applicable to flex arrays too). + // Try to find the starting point where the condition can be met, comparing as many values as a single 64bit can + // contain in parallel. Once we have found the starting point, keep matching values as much as we can between + // start and end. + // + // EG: we store the value 6, with width 4bits (0110), 6 is 4 bits because, 110 (6) + sign bit 0. + // Inside 64bits we can fit max 16 times 6. If we go from index 0 to 15 throughout the same 64 bits, we need to + // apply a mask and a shift bits every time, then compare the values. + // This is not the cheapest thing to do. Instead we can compare all values contained within 64 bits in one go and + // see if there is a match with what we are looking for. Reducing the number of comparison by ~logk(N) where K is + // the width of each single value within a 64 bit word and N is the total number of values stored in the array. + + // apparently the compiler is not able to deduce the type of a global function after moving stuff in the header + // (no so sure why) + static auto vector_compare = [](uint64_t MSBs, uint64_t a, uint64_t b) { + if constexpr (std::is_same_v) + return find_all_fields_EQ(MSBs, a, b); + if constexpr (std::is_same_v) + return find_all_fields_NE(MSBs, a, b); + if constexpr (std::is_same_v) + return find_all_fields_signed_GT(MSBs, a, b); + if constexpr (std::is_same_v) + return find_all_fields_signed_LT(MSBs, a, b); + REALM_UNREACHABLE(); + }; + + const auto data = (const uint64_t*)arr.m_data; + const auto width = arr.m_width; + const auto MSBs = arr.get_encoder().msb(); + const auto search_vector = populate(arr.m_width, value); + while (start < end) { + start = parallel_subword_find(vector_compare, data, 0, width, MSBs, search_vector, start, end); + if (start < end) + if (!state->match(start + baseindex)) + return false; + ++start; + } + return true; +} + +template +inline bool ArrayPacked::find_linear(const Array& arr, int64_t value, size_t start, size_t end, size_t baseindex, + QueryStateBase* state) const +{ + auto compare = [](int64_t a, int64_t b) { + if constexpr (std::is_same_v) + return a == b; + if constexpr (std::is_same_v) + return a != b; + if constexpr (std::is_same_v) + return a > b; + if constexpr (std::is_same_v) + return a < b; + }; + bf_iterator it((uint64_t*)arr.m_data, 0, arr.m_width, arr.m_width, start); + while (start < end) { + const auto sv = sign_extend_field_by_mask(arr.get_encoder().width_mask(), *it); + if (compare(sv, value) && !state->match(start + baseindex)) + return false; + ++start; + ++it; + } + return true; +} + +inline bool ArrayPacked::run_parallel_scan(size_t width, size_t range) const +{ + return width < 32 && range >= 16; +} + } // namespace realm #endif // REALM_ARRAY_PACKED_HPP