Skip to content

Commit

Permalink
Use ref struct instead of ReadOnlySpan for PopcntAvx2
Browse files Browse the repository at this point in the history
  • Loading branch information
ogxd committed Sep 6, 2024
1 parent 8ac9b1a commit 90b7fad
Showing 1 changed file with 24 additions and 18 deletions.
42 changes: 24 additions & 18 deletions Equativ.RoaringBitmaps/PopcntAvx2.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@ internal static class PopcntAvx2
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static ulong Popcnt(ReadOnlySpan<ulong> longs)
{
ulong total = Popcnt(MemoryMarshal.Cast<ulong, Vector256<long>>(longs), longs.Length / 4);
ref Vector256<long> start = ref Unsafe.As<ulong, Vector256<long>>(ref MemoryMarshal.GetReference(longs));
ulong total = Popcnt(ref start, longs.Length / 4);

// Handle remaining bytes
for (int i = longs.Length - longs.Length % 4; i < longs.Length; i++)
Expand All @@ -25,54 +26,59 @@ internal static ulong Popcnt(ReadOnlySpan<ulong> longs)
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static Vector256<long> Popcnt(Vector256<byte> v)
private static Vector256<long> PopcntVec(ref Vector256<long> v)
{
Vector256<byte> m1 = Vector256.Create((byte)0x55);
Vector256<byte> m2 = Vector256.Create((byte)0x33);
Vector256<byte> m4 = Vector256.Create((byte)0x0F);

Vector256<byte> t1 = Avx2.Subtract(v, Avx2.And(Avx2.ShiftRightLogical(v.AsUInt16(), 1).AsByte(), m1));
Vector256<byte> t1 = Avx2.Subtract(v.AsByte(), Avx2.And(Avx2.ShiftRightLogical(v.AsUInt16(), 1).AsByte(), m1));
Vector256<byte> t2 = Avx2.Add(Avx2.And(t1, m2), Avx2.And(Avx2.ShiftRightLogical(t1.AsUInt16(), 2).AsByte(), m2));
Vector256<byte> t3 = Avx2.And(Avx2.Add(t2, Avx2.ShiftRightLogical(t2.AsUInt16(), 4).AsByte()), m4);
return Avx2.SumAbsoluteDifferences(t3, Vector256<byte>.Zero).AsInt64();
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static void CSA(out Vector256<long> h, out Vector256<long> l, Vector256<long> a, Vector256<long> b, Vector256<long> c)
private static void CSA(out Vector256<long> h, out Vector256<long> l, ref Vector256<long> a, ref Vector256<long> b, ref Vector256<long> c)
{
Vector256<long> u = Avx2.Xor(a, b);
h = Avx2.Or(Avx2.And(a, b), Avx2.And(u, c));
l = Avx2.Xor(u, c);
}

[MethodImpl(MethodImplOptions.AggressiveInlining)]
private static ulong Popcnt(ReadOnlySpan<Vector256<long>> data, int size)
private static ulong Popcnt(ref Vector256<long> start, int size)
{
Vector256<long> total = Vector256<long>.Zero;
Vector256<long> ones = Vector256<long>.Zero;
Vector256<long> twos = Vector256<long>.Zero;
Vector256<long> fours = Vector256<long>.Zero;
Vector256<long> twosA, twosB;

int limit = size - size % 4;
int i = 0;

ref var end = ref Unsafe.Add(ref start, limit);

for (; i < limit; i += 4)
while (Unsafe.IsAddressLessThan(ref start, ref end))
{
CSA(out twosA, out ones, ones, data[i + 0], data[i + 1]);
CSA(out twosB, out ones, ones, data[i + 2], data[i + 3]);
CSA(out fours, out twos, twos, twosA, twosB);
CSA(out var twosA, out ones, ref ones, ref start, ref Unsafe.Add(ref start, 1));
CSA(out var twosB, out ones, ref ones, ref Unsafe.Add(ref start, 2), ref Unsafe.Add(ref start, 3));
CSA(out var fours, out twos, ref twos, ref twosA, ref twosB);

total = Avx2.Add(total, Popcnt(fours.AsByte()));
total = Avx2.Add(total, PopcntVec(ref fours));

start = ref Unsafe.Add(ref start, 4);
}

total = Avx2.ShiftLeftLogical(total, 2);
total = Avx2.Add(total, Avx2.ShiftLeftLogical(Popcnt(twos.AsByte()), 1));
total = Avx2.Add(total, Popcnt(ones.AsByte()));
total = Avx2.Add(total, Avx2.ShiftLeftLogical(PopcntVec(ref twos), 1));
total = Avx2.Add(total, PopcntVec(ref ones));

ref var end2 = ref Unsafe.Add(ref start, size % 4);

for (; i < size; i++)
// Handle remaining vectors
while (Unsafe.IsAddressLessThan(ref start, ref end2))
{
total = Avx2.Add(total, Popcnt(data[i].AsByte()));
total = Avx2.Add(total, PopcntVec(ref start));
start = ref Unsafe.Add(ref start, 1);
}

return (ulong)total.GetElement(0) +
Expand Down

0 comments on commit 90b7fad

Please sign in to comment.