Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ fully reproducible.

| processor and base freq. | SimdBase64 (GB/s) | .NET speed (GB/s) | speed up |
|:----------------|:------------------------|:-------------------|:-------------------|
| Apple M2 processor (ARM, 3.5 GHz) | 6.5 | 3.8 | 1.7 x |
| AWS Graviton 3 (ARM, 2.6 GHz) | 3.6 | 2.0 | 1.8 x |
| Intel Ice Lake (2.0 GHz) | 6.5 | 3.4 | 1.9 x |
| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.8 | 2.9 | 2.3 x |
| Apple M2 processor (ARM, 3.5 GHz) | 10 | 3.8 | 2.6 x |
| AWS Graviton 3 (ARM, 2.6 GHz) | 5.1 | 2.0 | 2.6 x |
| Intel Ice Lake (2.0 GHz) | 7.6 | 3.4 | 2.2 x |
| AMD EPYC 7R32 (Zen 2, 2.8 GHz) | 6.9 | 3.0 | 2.3 x |

## Results (SimdBase64 vs. string .NET functions)

Expand Down
6 changes: 1 addition & 5 deletions benchmark/Benchmark.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,9 @@
using BenchmarkDotNet.Running;
using BenchmarkDotNet.Configs;
using BenchmarkDotNet.Reports;
using BenchmarkDotNet.Filters;
using BenchmarkDotNet.Jobs;
using System.Text;
using System.Runtime.InteropServices;
using BenchmarkDotNet.Columns;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace SimdUnicodeBenchmarks
{
Expand Down Expand Up @@ -464,7 +460,7 @@ public unsafe void RunOurDecodingBenchmarkWithAllocUTF16(string[] data, int[] le

if (dataoutput.Length != lengths[i])
{
Console.WriteLine($"Error: {dataoutput.Length } != {lengths[i]}");
Console.WriteLine($"Error: {dataoutput.Length} != {lengths[i]}");
#pragma warning disable CA2201
throw new Exception("Error");
}
Expand Down
7 changes: 4 additions & 3 deletions src/Base64.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ public static int MaximalBinaryLengthFromBase64<T>(ReadOnlySpan<T> input)
{
return Scalar.Base64.MaximalBinaryLengthFromBase64Scalar(input);
}
public static byte[] FromBase64String(string s) {
public static byte[] FromBase64String(string s)
{
ReadOnlySpan<char> base64 = s.AsSpan();
byte[] newBytes = new byte[SimdBase64.Base64.MaximalBinaryLengthFromBase64<char>(base64)];
int bytesConsumed = 0;
Expand All @@ -35,7 +36,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<byte> source,
//if (Vector512.IsHardwareAccelerated && Avx512Vbmi2.IsSupported)
//{
//}
if (Avx2.IsSupported)
if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
{
return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
}
Expand All @@ -60,7 +61,7 @@ public unsafe static OperationStatus DecodeFromBase64(ReadOnlySpan<char> source,
//{
// return GetPointerToFirstInvalidByteAvx512(pInputBuffer, inputLength, out Utf16CodeUnitCountAdjustment, out ScalarCodeUnitCountAdjustment);
//}
if (Avx2.IsSupported)
if (Avx2.IsSupported && Popcnt.IsSupported && Bmi1.IsSupported)
{
return AVX2.Base64.DecodeFromBase64AVX2(source, dest, out bytesConsumed, out bytesWritten, isUrl);
}
Expand Down
59 changes: 59 additions & 0 deletions src/Base64ARM.cs
Original file line number Diff line number Diff line change
Expand Up @@ -219,6 +219,65 @@ private static unsafe ulong ToBase64MaskUrl(Block64* b, ref bool error)
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
{
// if mask is a power of 2, we can use a simpler version
if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
{
int pos64 = ArmBase.Arm64.LeadingZeroCount(mask);
int pos = pos64 & 0xf;
Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
Vector128<byte> v0 = Vector128.Create((byte)(0xe - pos));
switch (pos64 >> 4)
{
case 3:
{
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk0, sh);
Vector128.Store(compressed, output + 0 * 16);
Vector128.Store(b.chunk1, output + 1 * 16 - 1);
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
}
break;

case 2:
{
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk1, sh);
Vector128.Store(b.chunk0, output + 0 * 16);
Vector128.Store(compressed, output + 1 * 16);
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
}
break;

case 1:
{
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk2, sh);
Vector128.Store(b.chunk0, output + 0 * 16);
Vector128.Store(b.chunk1, output + 1 * 16);
Vector128.Store(compressed, output + 2 * 16);
Vector128.Store(b.chunk3, output + 3 * 16 - 1);
}
break;

case 0:
{
Vector128<byte> v2 = AdvSimd.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
Vector128<byte> sh = AdvSimd.Subtract(v1, v2);
Vector128<byte> compressed = AdvSimd.Arm64.VectorTableLookup(b.chunk3, sh);
Vector128.Store(b.chunk0, output + 0 * 16);
Vector128.Store(b.chunk1, output + 1 * 16);
Vector128.Store(b.chunk2, output + 2 * 16);
Vector128.Store(compressed, output + 3 * 16);
}
break;
}
return 63;
}
ulong nmask = ~mask;
Compress(b.chunk0, (ushort)mask, output, tablePtr);
Compress(b.chunk1, (ushort)(mask >> 16), output + UInt64.PopCount(nmask & 0xFFFF), tablePtr);
Expand Down
63 changes: 63 additions & 0 deletions src/Base64AVX2UTF8.cs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,69 @@ private static UInt64 ToBase64Mask(bool base64Url, ref Vector256<byte> src, ref
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
{
// Fast path: the mask has a single set bit, i.e. exactly one of the 64 input bytes must be
// dropped. The 63 surviving bytes are written with a handful of (overlapping) stores.
// NOTE(review): (mask & (mask - 1)) == 0 is also true for mask == 0; in that case no switch
// arm matches, nothing is written and 63 is still returned. Presumably callers only pass a
// non-zero mask — confirm against the call site.
if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
{
    // Index (0..63) of the byte to remove, and its index within its 16-byte lane.
    ulong pos64 = Bmi1.X64.TrailingZeroCount(mask);
    ulong pos = pos64 & 0xf;

    // Build the shuffle control: indices below pos map to themselves, indices at or after
    // pos map to the next byte (index + 1), which closes the one-byte gap.
    // v2 is 0xFF (-1 as sbyte) wherever index > pos - 1, so v1 - v2 adds 1 on those positions.
    // When pos == 0 the cast wraps to 0xFF (-1 as sbyte), so every index is shifted.
    // The last byte of the shuffled lane is filler; it is overwritten by the following
    // store, or lies past the 63 bytes reported as written.
    Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    Vector128<byte> v0 = Vector128.Create((byte)(pos - 1));
    Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
    Vector128<byte> sh = Sse2.Subtract(v1, v2);

    // b.chunk0 holds input bytes 0..31 and b.chunk1 holds bytes 32..63; pos64 >> 4 selects
    // which 16-byte lane contains the byte to drop. Store order matters: the compressed lane
    // must be written before the store that overlaps its last (filler) byte.
    switch (pos64 >> 4)
    {
        case 0: // byte to drop is in bytes 0..15 (low lane of chunk0)
            {
                Vector128<byte> lo = Avx2.ExtractVector128(b.chunk0, 0);
                Vector128<byte> hi = Avx2.ExtractVector128(b.chunk0, 1);
                Vector128<byte> compressed = Ssse3.Shuffle(lo, sh);
                Vector128.Store(compressed, output + 0 * 16);
                Vector128.Store(hi, output + 1 * 16 - 1);
                Vector256.Store(b.chunk1, output + 2 * 16 - 1);
            }
            break;

        case 1: // byte to drop is in bytes 16..31 (high lane of chunk0)
            {
                Vector128<byte> lo = Avx2.ExtractVector128(b.chunk0, 0);
                Vector128<byte> hi = Avx2.ExtractVector128(b.chunk0, 1);
                Vector128<byte> compressed = Ssse3.Shuffle(hi, sh);
                Vector128.Store(lo, output + 0 * 16);
                Vector128.Store(compressed, output + 1 * 16);
                Vector256.Store(b.chunk1, output + 2 * 16 - 1);
            }
            break;

        case 2: // byte to drop is in bytes 32..47 (low lane of chunk1)
            {
                Vector128<byte> lo = Avx2.ExtractVector128(b.chunk1, 0);
                // Fix: the trailing lane (bytes 48..63) is the HIGH lane of chunk1, not of chunk0.
                Vector128<byte> hi = Avx2.ExtractVector128(b.chunk1, 1);
                Vector128<byte> compressed = Ssse3.Shuffle(lo, sh);
                Vector256.Store(b.chunk0, output + 0 * 16);
                Vector128.Store(compressed, output + 2 * 16);
                Vector128.Store(hi, output + 3 * 16 - 1);
            }
            break;

        case 3: // byte to drop is in bytes 48..63 (high lane of chunk1)
            {
                Vector128<byte> lo = Avx2.ExtractVector128(b.chunk1, 0);
                // Fix: the lane being compressed (bytes 48..63) is the HIGH lane of chunk1, not of chunk0.
                Vector128<byte> hi = Avx2.ExtractVector128(b.chunk1, 1);
                Vector128<byte> compressed = Ssse3.Shuffle(hi, sh);
                Vector256.Store(b.chunk0, output + 0 * 16);
                Vector128.Store(lo, output + 2 * 16);
                Vector128.Store(compressed, output + 3 * 16);
            }
            break;
    }
    // Exactly one byte was removed from the 64-byte block.
    return 63;
}
ulong nmask = ~mask;
Compress(b.chunk0, (UInt32)mask, output, tablePtr);
Compress(b.chunk1, (UInt32)(mask >> 32), output + Popcnt.X64.PopCount(nmask & 0xFFFFFFFF), tablePtr);
Expand Down
63 changes: 63 additions & 0 deletions src/Base64SSEUTF8.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
using System;
using System.Numerics;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Runtime.CompilerServices;
Expand Down Expand Up @@ -131,6 +132,68 @@ private static ushort ToBase64Mask(bool base64Url, ref Vector128<byte> src, ref
[MethodImpl(MethodImplOptions.AggressiveInlining)]
private unsafe static ulong CompressBlock(ref Block64 b, ulong mask, byte* output, byte* tablePtr)
{
// if mask is a power of 2, we can use a simpler version
if ((mask & (mask - 1)) == 0) // check if mask is a power of 2
{
int pos64 = BitOperations.TrailingZeroCount(mask);
int pos = pos64 & 0xf;
Vector128<byte> v1 = Vector128.Create((byte)0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
Vector128<byte> v0 = Vector128.Create((byte)(pos-1));
switch (pos64 >> 4)
{
case 0:
{
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
Vector128<byte> sh = Sse2.Subtract(v1, v2);
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk0, sh);
Vector128.Store(compressed, output + 0 * 16);
Vector128.Store(b.chunk1, output + 1 * 16 - 1);
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
Vector128.Store(b.chunk3, output + 3 * 16 - 1);

}
break;

case 1:
{
Vector128<byte> v2 = Sse2.CompareGreaterThan(v1.AsSByte(), v0.AsSByte()).AsByte();
Vector128<byte> sh = Sse2.Subtract(v1, v2);
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk1, sh);
Vector128.Store(b.chunk0, output + 0 * 16);
Vector128.Store(compressed, output + 1 * 16);
Vector128.Store(b.chunk2, output + 2 * 16 - 1);
Vector128.Store(b.chunk3, output + 3 * 16 - 1);

}
break;

case 2:
{
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
Vector128<byte> sh = Sse2.Subtract(v1, v2);
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk2, sh);
Vector128.Store(b.chunk0, output + 0 * 16);
Vector128.Store(b.chunk1, output + 1 * 16);
Vector128.Store(compressed, output + 2 * 16);
Vector128.Store(b.chunk3, output + 3 * 16 - 1);

}
break;

case 3:
{
Vector128<byte> v2 = Sse2.CompareGreaterThan (v1.AsSByte(), v0.AsSByte()).AsByte();
Vector128<byte> sh = Sse2.Subtract(v1, v2);
Vector128<byte> compressed = Ssse3.Shuffle(b.chunk3, sh);
Vector128.Store(b.chunk0, output + 0 * 16);
Vector128.Store(b.chunk1, output + 1 * 16);
Vector128.Store(b.chunk2, output + 2 * 16);
Vector128.Store(compressed, output + 3 * 16);
}
break;
}
return 63;
}
ulong nmask = ~mask;
Compress(b.chunk0, (ushort)mask, output, tablePtr);
Compress(b.chunk1, (ushort)(mask >> 16), output + Popcnt.X64.PopCount(nmask & 0xFFFF), tablePtr);
Expand Down