From 14a04be7e23ee4d02746550d09573f9167df581c Mon Sep 17 00:00:00 2001 From: elasota Date: Wed, 4 Apr 2018 21:51:08 -0400 Subject: [PATCH 1/7] Integrate Convection BC7 codec --- DirectXTex/BC.h | 4 + DirectXTex/BC7Parallel.cpp | 2074 +++++++++++++++++ DirectXTex/DirectXTexCompress.cpp | 200 +- DirectXTex/DirectXTex_Desktop_2013.vcxproj | 1 + .../DirectXTex_Desktop_2013.vcxproj.filters | 3 + DirectXTex/DirectXTex_Desktop_2015.vcxproj | 1 + .../DirectXTex_Desktop_2015.vcxproj.filters | 3 + .../DirectXTex_Desktop_2015_Win10.vcxproj | 1 + ...ectXTex_Desktop_2015_Win10.vcxproj.filters | 3 + DirectXTex/DirectXTex_Desktop_2017.vcxproj | 1 + .../DirectXTex_Desktop_2017.vcxproj.filters | 3 + .../DirectXTex_Desktop_2017_Win10.vcxproj | 1 + ...ectXTex_Desktop_2017_Win10.vcxproj.filters | 3 + DirectXTex/DirectXTex_Windows10.vcxproj | 1 + .../DirectXTex_Windows10.vcxproj.filters | 3 + DirectXTex/DirectXTex_Windows10_2015.vcxproj | 1 + .../DirectXTex_Windows10_2015.vcxproj.filters | 3 + DirectXTex/DirectXTex_Windows81.vcxproj | 1 + .../DirectXTex_Windows81.vcxproj.filters | 3 + DirectXTex/DirectXTex_WindowsPhone81.vcxproj | 1 + .../DirectXTex_WindowsPhone81.vcxproj.filters | 3 + DirectXTex/DirectXTex_XboxOneXDK_2015.vcxproj | 1 + ...DirectXTex_XboxOneXDK_2015.vcxproj.filters | 3 + DirectXTex/DirectXTex_XboxOneXDK_2017.vcxproj | 1 + ...DirectXTex_XboxOneXDK_2017.vcxproj.filters | 3 + 25 files changed, 2254 insertions(+), 68 deletions(-) create mode 100644 DirectXTex/BC7Parallel.cpp diff --git a/DirectXTex/BC.h b/DirectXTex/BC.h index 82a3b2e5..f85aca7c 100644 --- a/DirectXTex/BC.h +++ b/DirectXTex/BC.h @@ -23,6 +23,9 @@ namespace DirectX // Because these are used in SAL annotations, they need to remain macros rather than const values #define NUM_PIXELS_PER_BLOCK 16 +#define MAX_PARALLEL_BLOCKS 8 +#define MAX_BLOCK_SIZE 16 +#define BC7_NUM_PARALLEL_BLOCKS 8 //------------------------------------------------------------------------------------- // Constants @@ -322,5 +325,6 @@ void D3DXEncodeBC5S(_Out_writes_(16) uint8_t *pBC, _In_reads_(NUM_PIXELS_PER_BLO void D3DXEncodeBC6HU(_Out_writes_(16) uint8_t *pBC, _In_reads_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); void D3DXEncodeBC6HS(_Out_writes_(16) uint8_t *pBC, _In_reads_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); void D3DXEncodeBC7(_Out_writes_(16) uint8_t *pBC, _In_reads_(NUM_PIXELS_PER_BLOCK) const XMVECTOR *pColor, _In_ DWORD flags); +void D3DXEncodeBC7Parallel(_Out_writes_(16 * BC7_NUM_PARALLEL_BLOCKS) uint8_t *pBC, _In_reads_(NUM_PIXELS_PER_BLOCK * BC7_NUM_PARALLEL_BLOCKS) const XMVECTOR *pColor, _In_ DWORD flags); } // namespace diff --git a/DirectXTex/BC7Parallel.cpp b/DirectXTex/BC7Parallel.cpp new file mode 100644 index 00000000..6854474a --- /dev/null +++ b/DirectXTex/BC7Parallel.cpp @@ -0,0 +1,2074 @@ +/* + Based on codec from Convection Texture Tools + Copyright (c) 2018 Eric Lasota + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject + to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. 
+ + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + ------------------------------------------------------------------------------------- + + Copyright (c) Microsoft Corporation. All rights reserved. + Licensed under the MIT License. + + http://go.microsoft.com/fwlink/?LinkId=248926 + + ------------------------------------------------------------------------------------- + + Contains portions of libsquish + + Copyright (c) 2006 Simon Brown si@sjbrown.co.uk + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+*/ +#include "directxtexp.h" + +#include "BC.h" + +using namespace DirectX; +using namespace DirectX::PackedVector; + +namespace +{ + enum AlphaMode + { + AlphaMode_Combined, + AlphaMode_Separate, + AlphaMode_None, + }; + + enum PBitMode + { + PBitMode_PerEndpoint, + PBitMode_PerSubset, + PBitMode_None + }; + + struct BC7ModeInfo + { + PBitMode m_pBitMode; + AlphaMode m_alphaMode; + int m_rgbBits; + int m_alphaBits; + int m_partitionBits; + int m_numSubsets; + int m_indexBits; + int m_alphaIndexBits; + bool m_hasIndexSelector; + }; + + BC7ModeInfo g_modes[] = + { + { PBitMode_PerEndpoint, AlphaMode_None, 4, 0, 4, 3, 3, 0, false }, // 0 + { PBitMode_PerSubset, AlphaMode_None, 6, 0, 6, 2, 3, 0, false }, // 1 + { PBitMode_None, AlphaMode_None, 5, 0, 6, 3, 2, 0, false }, // 2 + { PBitMode_PerEndpoint, AlphaMode_None, 7, 0, 6, 2, 2, 0, false }, // 3 (Mode reference has an error, P-bit is really per-endpoint) + + { PBitMode_None, AlphaMode_Separate, 5, 6, 0, 1, 2, 3, true }, // 4 + { PBitMode_None, AlphaMode_Separate, 7, 8, 0, 1, 2, 2, false }, // 5 + { PBitMode_PerEndpoint, AlphaMode_Combined, 7, 7, 0, 1, 4, 0, false }, // 6 + { PBitMode_PerEndpoint, AlphaMode_Combined, 5, 5, 6, 2, 2, 0, false } // 7 + }; + + static uint16_t g_partitionMap[64] = + { + 0xCCCC, 0x8888, 0xEEEE, 0xECC8, + 0xC880, 0xFEEC, 0xFEC8, 0xEC80, + 0xC800, 0xFFEC, 0xFE80, 0xE800, + 0xFFE8, 0xFF00, 0xFFF0, 0xF000, + 0xF710, 0x008E, 0x7100, 0x08CE, + 0x008C, 0x7310, 0x3100, 0x8CCE, + 0x088C, 0x3110, 0x6666, 0x366C, + 0x17E8, 0x0FF0, 0x718E, 0x399C, + 0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, + 0x3c3c, 0x55aa, 0x9696, 0xa55a, + 0x73ce, 0x13c8, 0x324c, 0x3bdc, + 0x6996, 0xc33c, 0x9966, 0x660, + 0x272, 0x4e4, 0x4e40, 0x2720, + 0xc936, 0x936c, 0x39c6, 0x639c, + 0x9336, 0x9cc6, 0x817e, 0xe718, + 0xccf0, 0xfcc, 0x7744, 0xee22, + }; + + static uint32_t g_partitionMap2[64] = + { + 0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8, + 0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050, + 0xaa550000, 0xaa555500, 0xaaaa5500, 0x90909090, + 0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250, + 0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0, + 0xa8a85454, 0x6a6a4040, 0xa4a45000, 0x1a1a0500, + 0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400, + 0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200, + 0xa9a58000, 0x5090a0a8, 0xa8a09050, 0x24242424, + 0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50, + 0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0, + 0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600, + 0xaa444444, 0x54a854a8, 0x95809580, 0x96969600, + 0xa85454a8, 0x80959580, 0xaa141414, 0x96960000, + 0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000, + 0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254, + }; + + static int g_fixupIndexes2[64] = + { + 15,15,15,15, + 15,15,15,15, + 15,15,15,15, + 15,15,15,15, + 15, 2, 8, 2, + 2, 8, 8,15, + 2, 8, 2, 2, + 8, 8, 2, 2, + + 15,15, 6, 8, + 2, 8,15,15, + 2, 8, 2, 2, + 2,15,15, 6, + 6, 2, 6, 8, + 15,15, 2, 2, + 15,15,15,15, + 15, 2, 2,15, + }; + + static int g_fixupIndexes3[64][2] = + { + { 3,15 },{ 3, 8 },{ 15, 8 },{ 15, 3 }, + { 8,15 },{ 3,15 },{ 15, 3 },{ 15, 8 }, + { 8,15 },{ 8,15 },{ 6,15 },{ 6,15 }, + { 6,15 },{ 5,15 },{ 3,15 },{ 3, 8 }, + { 3,15 },{ 3, 8 },{ 8,15 },{ 15, 3 }, + { 3,15 },{ 3, 8 },{ 6,15 },{ 10, 8 }, + { 5, 3 },{ 8,15 },{ 8, 6 },{ 6,10 }, + { 8,15 },{ 5,15 },{ 15,10 },{ 15, 8 }, + + { 8,15 },{ 15, 3 },{ 3,15 },{ 5,10 }, + { 6,10 },{ 10, 8 },{ 8, 9 },{ 15,10 }, + { 15, 6 },{ 3,15 },{ 15, 8 },{ 5,15 }, + { 15, 3 },{ 15, 6 },{ 15, 6 },{ 15, 8 }, + { 3,15 },{ 15, 3 },{ 5,15 },{ 5,15 }, + { 5,15 },{ 8,15 },{ 5,15 },{ 10,15 }, 
+ { 5,15 },{ 10,15 },{ 8,15 },{ 13,15 }, + { 15, 3 },{ 12,15 },{ 3,15 },{ 3, 8 }, + }; + + struct InputBlock + { + int32_t m_pixels[16]; + }; + +#if (defined(_M_IX86_FP) && _M_IX86_FP >= 2) || defined(_M_X64) + // SSE2 version + + struct ParallelMath + { + static const int ParallelSize = 8; + + struct Int16 + { + __m128i m_value; + + inline Int16 operator+(int16_t other) const + { + Int16 result; + result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(other)); + return result; + } + + inline Int16 operator+(Int16 other) const + { + Int16 result; + result.m_value = _mm_add_epi16(m_value, other.m_value); + return result; + } + + inline Int16 operator|(Int16 other) const + { + Int16 result; + result.m_value = _mm_or_si128(m_value, other.m_value); + return result; + } + + inline Int16 operator-(Int16 other) const + { + Int16 result; + result.m_value = _mm_sub_epi16(m_value, other.m_value); + return result; + } + + inline Int16 operator*(const Int16& other) const + { + Int16 result; + result.m_value = _mm_mullo_epi16(m_value, other.m_value); + return result; + } + + inline Int16 operator<<(int bits) const + { + Int16 result; + result.m_value = _mm_slli_epi16(m_value, bits); + return result; + } + }; + + struct Int32 + { + __m128i m_values[2]; + }; + + struct Float + { + __m128 m_values[2]; + + inline Float operator+(const Float& other) const + { + Float result; + result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]); + return result; + } + + inline Float operator-(const Float& other) const + { + Float result; + result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]); + return result; + } + + inline Float operator*(const Float& other) const + { + Float result; + result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]); + return result; + } + + inline Float operator*(float other) const + { + Float result; + result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other)); + result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other)); + return result; + } + + inline Float operator/(const Float& other) const + { + Float result; + result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]); + result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]); + return result; + } + + inline Float operator/(float other) const + { + Float result; + result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other)); + result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other)); + return result; + } + }; + + struct Int16CompFlag + { + __m128i m_value; + }; + + struct FloatCompFlag + { + __m128 m_values[2]; + }; + + static Float Select(FloatCompFlag flag, Float a, Float b) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i])); + return result; + } + + static Int16 Select(Int16CompFlag flag, Int16 a, Int16 b) + { + Int16 result; + result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value)); + return result; + } + + static void ConditionalSet(Int16& dest, Int16CompFlag flag, const Int16 src) + { + dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value)); + } + + static void ConditionalSet(Float& dest, FloatCompFlag flag, const Float src) + { + 
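// Branchless conditional move: lanes where the flag is set take src, the rest keep dest; applied to both __m128 halves of the 8-wide float vector. +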
for (int i = 0; i < 2; i++) + dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i])); + } + + static Int16 Min(Int16 a, Int16 b) + { + Int16 result; + result.m_value = _mm_min_epi16(a.m_value, b.m_value); + return result; + } + + static Float Min(Float a, Float b) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]); + return result; + } + + static Int16 Max(Int16 a, Int16 b) + { + Int16 result; + result.m_value = _mm_max_epi16(a.m_value, b.m_value); + return result; + } + + static Float Max(Float a, Float b) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]); + return result; + } + + static Float Clamp(Float v, float min, float max) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min)); + return result; + } + + static void ReadPackedInputs(const InputBlock* inputBlocks, int pxOffset, Int32& outPackedPx) + { + for (int i = 0; i < 4; i++) + reinterpret_cast(&outPackedPx.m_values[0])[i] = inputBlocks[i].m_pixels[pxOffset]; + for (int i = 0; i < 4; i++) + reinterpret_cast(&outPackedPx.m_values[1])[i] = inputBlocks[i + 4].m_pixels[pxOffset]; + } + + static void UnpackChannel(Int32 inputPx, int ch, Int16& chOut) + { + __m128i ch0 = _mm_srli_epi32(inputPx.m_values[0], ch * 8); + __m128i ch1 = _mm_srli_epi32(inputPx.m_values[1], ch * 8); + ch0 = _mm_and_si128(ch0, _mm_set1_epi32(0xff)); + ch1 = _mm_and_si128(ch1, _mm_set1_epi32(0xff)); + + chOut.m_value = _mm_packs_epi32(ch0, ch1); + } + + static Float MakeFloat(float v) + { + Float f; + f.m_values[0] = f.m_values[1] = _mm_set1_ps(v); + return f; + } + + static Float MakeFloatZero() + { + Float f; + f.m_values[0] = f.m_values[1] = _mm_setzero_ps(); + return f; + } + + static Int16 MakeUInt16(uint16_t v) + { + Int16 result; + result.m_value = _mm_set1_epi16(static_cast(v)); + return result; + } + + static uint16_t ExtractUInt16(const Int16& v, int offset) + { + return reinterpret_cast(&v)[offset]; + } + + static float ExtractFloat(float v, int offset) + { + return reinterpret_cast(&v)[offset]; + } + + static Int16CompFlag Less(Int16 a, Int16 b) + { + Int16CompFlag result; + result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value); + return result; + } + + static FloatCompFlag Less(Float a, Float b) + { + FloatCompFlag result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]); + return result; + } + + static Int16CompFlag Equal(Int16 a, Int16 b) + { + Int16CompFlag result; + result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value); + return result; + } + + static FloatCompFlag Equal(Float a, Float b) + { + FloatCompFlag result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]); + return result; + } + + static Float UInt16ToFloat(Int16 v) + { + Float result; + result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128())); + result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128())); + return result; + } + + static Int16CompFlag FloatFlagToInt16(FloatCompFlag v) + { + __m128i lo = _mm_castps_si128(v.m_values[0]); + __m128i hi = _mm_castps_si128(v.m_values[1]); + + Int16CompFlag result; + result.m_value = _mm_packs_epi32(lo, hi); + return result; + } + + static Int16 FloatToUInt16(Float v) + { + __m128 half = 
_mm_set1_ps(0.5f); + __m128i lo = _mm_cvttps_epi32(_mm_add_ps(v.m_values[0], half)); + __m128i hi = _mm_cvttps_epi32(_mm_add_ps(v.m_values[1], half)); + + Int16 result; + result.m_value = _mm_packs_epi32(lo, hi); + return result; + } + + static Float Sqrt(Float f) + { + Float result; + for (int i = 0; i < 2; i++) + result.m_values[i] = _mm_sqrt_ps(f.m_values[i]); + return result; + } + + static Int16 SqDiff(Int16 a, Int16 b) + { + __m128i diff = _mm_sub_epi16(a.m_value, b.m_value); + + Int16 result; + result.m_value = _mm_mullo_epi16(diff, diff); + return result; + } + + static Int16 UnsignedRightShift(Int16 v, int bits) + { + Int16 result; + result.m_value = _mm_srli_epi16(v.m_value, bits); + return result; + } + + static bool AnySet(Int16CompFlag v) + { + return _mm_movemask_epi8(v.m_value) != 0; + } + }; + +#else + // Scalar version + + struct ParallelMath + { + static const int ParallelSize = 1; + + typedef float Float; + typedef int16_t Int16; + typedef int32_t Int32; + typedef bool Int16CompFlag; + typedef bool FloatCompFlag; + + template + inline static void ConditionalSet(T& dest, bool flag, const T src) + { + if (flag) + dest = src; + } + + template + inline static T Select(bool flag, T a, T b) + { + return flag ? a : b; + } + + template + inline static T Min(T a, T b) + { + if (a < b) + return a; + return b; + } + + template + inline static T Max(T a, T b) + { + if (a > b) + return a; + return b; + } + + template + inline static T Clamp(T v, T min, T max) + { + return Max(Min(v, max), min); + } + + inline static void ReadPackedInputs(const InputBlock* inputBlocks, int pxOffset, Int32& outPackedPx) + { + outPackedPx = inputBlocks[0].m_pixels[pxOffset]; + } + + inline static void UnpackChannel(Int32 inputPx, int ch, Int16& chOut) + { + chOut = static_cast((inputPx >> (ch * 8)) & 0xff); + } + + inline static float MakeFloat(float v) + { + return v; + } + + inline static float MakeFloatZero() + { + return 0.f; + } + + inline static int16_t MakeUInt16(int16_t v) + { + return v; + } + + inline static int16_t ExtractUInt16(int16_t v, int offset) + { + return v; + } + + inline static float ExtractFloat(float v, int offset) + { + return v; + } + + template + inline static bool Less(T a, T b) + { + return a < b; + } + + template + inline static bool Equal(T a, T b) + { + return a == b; + } + + inline static float UInt16ToFloat(uint16_t v) + { + return static_cast(v); + } + + inline static Int16CompFlag FloatFlagToInt16(FloatCompFlag v) + { + return v; + } + + inline static uint16_t FloatToUInt16(float v) + { + return static_cast(floorf(v + 0.5f)); + } + + inline static float Sqrt(float f) + { + return sqrtf(f); + } + + inline static uint16_t SqDiff(uint16_t a, uint16_t b) + { + int diff = static_cast(a) - static_cast(b); + return static_cast(diff * diff); + } + + inline static bool AnySet(bool b) + { + return b; + } + + inline static int16_t UnsignedRightShift(int16_t v, int bits) + { + uint32_t i = static_cast(v) & 0xffff; + return static_cast(i >> bits); + } + }; + +#endif + + struct PackingVector + { + uint32_t m_vector[4]; + int m_offset; + + void Init() + { + for (int i = 0; i < 4; i++) + m_vector[i] = 0; + + m_offset = 0; + } + + inline void Pack(uint16_t value, int bits) + { + int vOffset = m_offset >> 5; + int bitOffset = m_offset & 0x1f; + + m_vector[vOffset] |= (static_cast(value) << bitOffset) & static_cast(0xffffffff); + + int overflowBits = bitOffset + bits - 32; + if (overflowBits > 0) + m_vector[vOffset + 1] |= (static_cast(value) >> (bits - overflowBits)); + + m_offset += 
bits; + } + + inline void Flush(uint8_t* output) + { + assert(m_offset == 128); + + for (int v = 0; v < 4; v++) + { + uint32_t chunk = m_vector[v]; + for (int b = 0; b < 4; b++) + output[v * 4 + b] = static_cast((chunk >> (b * 8)) & 0xff); + } + } + }; + + void ComputeTweakFactors(int tweak, int bits, float* outFactors) + { + int totalUnits = (1 << bits) - 1; + int minOutsideUnits = ((tweak >> 1) & 1); + int maxOutsideUnits = (tweak & 1); + int insideUnits = totalUnits - minOutsideUnits - maxOutsideUnits; + + outFactors[0] = -static_cast(minOutsideUnits) / static_cast(insideUnits); + outFactors[1] = static_cast(maxOutsideUnits) / static_cast(insideUnits) + 1.0f; + } + + template + class UnfinishedEndpoints + { + public: + typedef ParallelMath::Float MFloat; + typedef ParallelMath::Int16 MInt16; + + MFloat m_base[TVectorSize]; + MFloat m_offset[TVectorSize]; + + void Finish(int tweak, int bits, MInt16* outEP0, MInt16* outEP1) + { + float tweakFactors[2]; + ComputeTweakFactors(tweak, bits, tweakFactors); + + for (int ch = 0; ch < TVectorSize; ch++) + { + MFloat ep0f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[0], 0.0f, 255.0f); + MFloat ep1f = ParallelMath::Clamp(m_base[ch] + m_offset[ch] * tweakFactors[1], 0.0f, 255.0f); + outEP0[ch] = ParallelMath::FloatToUInt16(ep0f); + outEP1[ch] = ParallelMath::FloatToUInt16(ep1f); + } + } + }; + + class EndpointSelectorRGBA + { + public: + static const int NumPasses = 3; + static const int NumPowerIterations = 8; + + typedef ParallelMath::Float MFloat; + typedef ParallelMath::Int16 MInt16; + + MFloat m_total[4]; + MFloat m_ctr[4]; + MFloat m_axis[4]; + MFloat m_xx; + MFloat m_xy; + MFloat m_xz; + MFloat m_xw; + MFloat m_yy; + MFloat m_yz; + MFloat m_yw; + MFloat m_zz; + MFloat m_zw; + MFloat m_ww; + MFloat m_minDist; + MFloat m_maxDist; + + EndpointSelectorRGBA() + { + for (int i = 0; i < 4; i++) + { + m_total[i] = ParallelMath::MakeFloatZero(); + m_ctr[i] = ParallelMath::MakeFloatZero(); + m_axis[i] = ParallelMath::MakeFloatZero(); + } + m_xx = ParallelMath::MakeFloatZero(); + m_xy = ParallelMath::MakeFloatZero(); + m_xz = ParallelMath::MakeFloatZero(); + m_xw = ParallelMath::MakeFloatZero(); + m_yy = ParallelMath::MakeFloatZero(); + m_yz = ParallelMath::MakeFloatZero(); + m_yw = ParallelMath::MakeFloatZero(); + m_zz = ParallelMath::MakeFloatZero(); + m_zw = ParallelMath::MakeFloatZero(); + m_ww = ParallelMath::MakeFloatZero(); + m_minDist = ParallelMath::MakeFloat(1000.0f); + m_maxDist = ParallelMath::MakeFloat(-1000.0f); + } + + void InitPass(int step) + { + if (step == 1) + { + for (int i = 0; i < 4; i++) + m_ctr[i] = m_ctr[i] / ParallelMath::Max(m_total[i], ParallelMath::MakeFloat(0.0001f)); + } + else if (step == 2) + { + MFloat matrix[4][4] = + { + { m_xx, m_xy, m_xz, m_xw }, + { m_xy, m_yy, m_yz, m_yw }, + { m_xz, m_yz, m_zz, m_zw }, + { m_xw, m_yw, m_zw, m_ww } + }; + + MFloat v[4] = { ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(1.0f) }; + for (int p = 0; p < NumPowerIterations; p++) + { + // matrix multiply + MFloat w[4]; + for (int i = 0; i < 4; i++) + { + w[i] = matrix[0][i] * v[0]; + for (int row = 1; row < 4; row++) + w[i] = w[i] + matrix[row][i] * v[row]; + } + + MFloat a = ParallelMath::Max(w[0], ParallelMath::Max(w[1], ParallelMath::Max(w[2], w[3]))); + + ParallelMath::FloatCompFlag aZero = ParallelMath::Equal(a, ParallelMath::MakeFloatZero()); + + ParallelMath::ConditionalSet(a, aZero, ParallelMath::MakeFloat(1.0f)); + + for (int c = 0; c < 4; 
c++) + v[c] = w[c] / a; + } + + MFloat vlen = ParallelMath::Sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2] + v[3] * v[3]); + + ParallelMath::FloatCompFlag vZero = ParallelMath::Equal(vlen, ParallelMath::MakeFloatZero()); + ParallelMath::ConditionalSet(vlen, vZero, ParallelMath::MakeFloat(1.0f)); + + for (int i = 0; i < 4; i++) + m_axis[i] = v[i] / vlen; + } + } + + void Contribute(int step, const MInt16* pixel, MFloat weight) + { + MFloat pt[4]; + for (int i = 0; i < 4; i++) + pt[i] = ParallelMath::UInt16ToFloat(pixel[i]); + + if (step == 0) + { + for (int i = 0; i < 4; i++) + { + m_total[i] = m_total[i] + weight; + m_ctr[i] = m_ctr[i] + weight * pt[i]; + } + } + else if (step == 1) + { + MFloat a[4]; + MFloat b[4]; + + for (int i = 0; i < 4; i++) + { + a[i] = pt[i] - m_ctr[i]; + b[i] = weight * a[i]; + } + + m_xx = m_xx + a[0] * b[0]; + m_xy = m_xy + a[0] * b[1]; + m_xz = m_xz + a[0] * b[2]; + m_xw = m_xw + a[0] * b[3]; + m_yy = m_yy + a[1] * b[1]; + m_yz = m_yz + a[1] * b[2]; + m_yw = m_yw + a[1] * b[3]; + m_zz = m_zz + a[2] * b[2]; + m_zw = m_zw + a[2] * b[3]; + m_ww = m_ww + a[3] * b[3]; + } + else if (step == 2) + { + MFloat diff[4]; + for (int i = 0; i < 4; i++) + diff[i] = pt[i] - m_ctr[i]; + + MFloat dist = diff[0] * m_axis[0] + diff[1] * m_axis[1] + diff[2] * m_axis[2] + diff[3] * m_axis[3]; + m_minDist = ParallelMath::Min(dist, m_minDist); + m_maxDist = ParallelMath::Max(dist, m_maxDist); + } + } + + UnfinishedEndpoints<4> GetEndpoints() const + { + MFloat len = m_maxDist - m_minDist; + + UnfinishedEndpoints<4> result; + for (int i = 0; i < 4; i++) + { + result.m_base[i] = m_ctr[i] + m_axis[i] * m_minDist; + result.m_offset[i] = m_axis[i] * len; + } + return result; + } + }; + + + class EndpointSelectorRGB + { + public: + static const int NumPasses = 3; + static const int NumPowerIterations = 8; + + typedef ParallelMath::Float MFloat; + typedef ParallelMath::Int16 MInt16; + + MFloat m_total[3]; + MFloat m_ctr[3]; + MFloat m_axis[3]; + MFloat m_xx; + MFloat m_xy; + MFloat m_xz; + MFloat m_xw; + MFloat m_yy; + MFloat m_yz; + MFloat m_yw; + MFloat m_zz; + MFloat m_zw; + MFloat m_ww; + MFloat m_minDist; + MFloat m_maxDist; + + EndpointSelectorRGB() + { + for (int i = 0; i < 3; i++) + { + m_total[i] = ParallelMath::MakeFloatZero(); + m_ctr[i] = ParallelMath::MakeFloatZero(); + m_axis[i] = ParallelMath::MakeFloatZero(); + } + m_xx = ParallelMath::MakeFloatZero(); + m_xy = ParallelMath::MakeFloatZero(); + m_xz = ParallelMath::MakeFloatZero(); + m_xw = ParallelMath::MakeFloatZero(); + m_yy = ParallelMath::MakeFloatZero(); + m_yz = ParallelMath::MakeFloatZero(); + m_yw = ParallelMath::MakeFloatZero(); + m_zz = ParallelMath::MakeFloatZero(); + m_zw = ParallelMath::MakeFloatZero(); + m_ww = ParallelMath::MakeFloatZero(); + m_minDist = ParallelMath::MakeFloat(1000.0f); + m_maxDist = ParallelMath::MakeFloat(-1000.0f); + } + + void InitPass(int step) + { + if (step == 1) + { + for (int i = 0; i < 3; i++) + m_ctr[i] = m_ctr[i] / ParallelMath::Max(m_total[i], ParallelMath::MakeFloat(0.0001f)); + } + else if (step == 2) + { + MFloat matrix[3][3] = + { + { m_xx, m_xy, m_xz }, + { m_xy, m_yy, m_yz }, + { m_xz, m_yz, m_zz }, + }; + + MFloat v[3] = { ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(1.0f) }; + for (int p = 0; p < NumPowerIterations; p++) + { + // matrix multiply + MFloat w[3]; + for (int i = 0; i < 3; i++) + { + w[i] = matrix[0][i] * v[0]; + for (int row = 1; row < 3; row++) + w[i] = w[i] + matrix[row][i] * v[row]; + } + + MFloat a = 
ParallelMath::Max(w[0], ParallelMath::Max(w[1], w[2])); + + ParallelMath::FloatCompFlag aZero = ParallelMath::Equal(a, ParallelMath::MakeFloatZero()); + + ParallelMath::ConditionalSet(a, aZero, ParallelMath::MakeFloat(1.0f)); + + for (int c = 0; c < 3; c++) + v[c] = w[c] / a; + } + + MFloat vlen = ParallelMath::Sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + + ParallelMath::FloatCompFlag vZero = ParallelMath::Equal(vlen, ParallelMath::MakeFloatZero()); + ParallelMath::ConditionalSet(vlen, vZero, ParallelMath::MakeFloat(1.0f)); + + for (int i = 0; i < 3; i++) + m_axis[i] = v[i] / vlen; + } + } + + void Contribute(int step, const MInt16* pixel, MFloat weight) + { + MFloat pt[3]; + for (int i = 0; i < 3; i++) + pt[i] = ParallelMath::UInt16ToFloat(pixel[i]); + + if (step == 0) + { + for (int i = 0; i < 3; i++) + { + m_total[i] = m_total[i] + weight; + m_ctr[i] = m_ctr[i] + weight * pt[i]; + } + } + else if (step == 1) + { + MFloat a[3]; + MFloat b[3]; + + for (int i = 0; i < 3; i++) + { + a[i] = pt[i] - m_ctr[i]; + b[i] = weight * a[i]; + } + + m_xx = m_xx + a[0] * b[0]; + m_xy = m_xy + a[0] * b[1]; + m_xz = m_xz + a[0] * b[2]; + m_yy = m_yy + a[1] * b[1]; + m_yz = m_yz + a[1] * b[2]; + m_zz = m_zz + a[2] * b[2]; + } + else if (step == 2) + { + MFloat diff[3]; + for (int i = 0; i < 3; i++) + diff[i] = pt[i] - m_ctr[i]; + + MFloat dist = diff[0] * m_axis[0] + diff[1] * m_axis[1] + diff[2] * m_axis[2]; + m_minDist = ParallelMath::Min(dist, m_minDist); + m_maxDist = ParallelMath::Max(dist, m_maxDist); + } + } + + UnfinishedEndpoints<3> GetEndpoints() const + { + MFloat len = m_maxDist - m_minDist; + + UnfinishedEndpoints<3> result; + for (int i = 0; i < 3; i++) + { + result.m_base[i] = m_ctr[i] + m_axis[i] * m_minDist; + result.m_offset[i] = m_axis[i] * len; + } + return result; + } + }; + + template + class IndexSelector + { + public: + typedef ParallelMath::Float MFloat; + typedef ParallelMath::Int16 MInt16; + + MInt16 m_endPoint[2][TVectorSize]; + int m_prec; + float m_maxValue; + MFloat m_origin[TVectorSize]; + MFloat m_axis[TVectorSize]; + + void Init(MInt16 endPoint[2][TVectorSize], int prec) + { + for (int ep = 0; ep < 2; ep++) + for (int ch = 0; ch < TVectorSize; ch++) + m_endPoint[ep][ch] = endPoint[ep][ch]; + + m_prec = prec; + m_maxValue = static_cast((1 << m_prec) - 1); + + MFloat axis[TVectorSize]; + for (int ch = 0; ch < TVectorSize; ch++) + { + m_origin[ch] = ParallelMath::UInt16ToFloat(endPoint[0][ch]); + + axis[ch] = ParallelMath::UInt16ToFloat(endPoint[1][ch]) - m_origin[ch]; + } + + MFloat lenSquared = axis[0] * axis[0]; + for (int ch = 1; ch < TVectorSize; ch++) + lenSquared = lenSquared + axis[ch] * axis[ch]; + + ParallelMath::FloatCompFlag lenSquaredZero = ParallelMath::Equal(lenSquared, ParallelMath::MakeFloatZero()); + + ParallelMath::ConditionalSet(lenSquared, lenSquaredZero, ParallelMath::MakeFloat(1.0f)); + + for (int ch = 0; ch < TVectorSize; ch++) + m_axis[ch] = (axis[ch] / lenSquared) * m_maxValue; + } + + void Reconstruct(MInt16 index, MInt16* pixel) + { + MInt16 weightRcp = ParallelMath::MakeUInt16(0); + if (m_prec == 2) + weightRcp = ParallelMath::MakeUInt16(10923); + else if (m_prec == 3) + weightRcp = ParallelMath::MakeUInt16(4681); + else if (m_prec == 4) + weightRcp = ParallelMath::MakeUInt16(2184); + + MInt16 weight = ParallelMath::UnsignedRightShift(index * weightRcp + 256, 9); + + for (int ch = 0; ch < TVectorSize; ch++) + pixel[ch] = ParallelMath::UnsignedRightShift(((ParallelMath::MakeUInt16(64) - weight) * m_endPoint[0][ch] + weight * m_endPoint[1][ch] 
+ ParallelMath::MakeUInt16(32)), 6); + } + + MInt16 SelectIndex(const MInt16* pixel) + { + MFloat diff[TVectorSize]; + for (int ch = 0; ch < TVectorSize; ch++) + diff[ch] = ParallelMath::UInt16ToFloat(pixel[ch]) - m_origin[ch]; + + MFloat dist = diff[0] * m_axis[0]; + for (int ch = 1; ch < TVectorSize; ch++) + dist = dist + diff[ch] * m_axis[ch]; + + return ParallelMath::FloatToUInt16(ParallelMath::Clamp(dist, 0.0f, m_maxValue)); + } + }; + + // Solve for a, b where v = a*t + b + // This allows endpoints to be mapped to where T=0 and T=1 + // Least squares from totals: + // a = (tv - t*v/w)/(tt - t*t/w) + // b = (v - a*t)/w + template + class EndpointRefiner + { + public: + typedef ParallelMath::Float MFloat; + typedef ParallelMath::Int16 MInt16; + + MFloat m_tv[TVectorSize]; + MFloat m_v[TVectorSize]; + MFloat m_tt; + MFloat m_t; + MFloat m_w; + + float m_maxIndex; + + void Init(int indexBits) + { + for (int ch = 0; ch < TVectorSize; ch++) + { + m_tv[ch] = ParallelMath::MakeFloatZero(); + m_v[ch] = ParallelMath::MakeFloatZero(); + } + m_tt = ParallelMath::MakeFloatZero(); + m_t = ParallelMath::MakeFloatZero(); + m_w = ParallelMath::MakeFloatZero(); + + m_maxIndex = static_cast((1 << indexBits) - 1); + } + + void Contribute(const MInt16* pixel, MInt16 index, MFloat weight) + { + MFloat v[TVectorSize]; + + for (int ch = 0; ch < TVectorSize; ch++) + v[ch] = ParallelMath::UInt16ToFloat(pixel[ch]); + + MFloat t = ParallelMath::UInt16ToFloat(index) / m_maxIndex; + + for (int ch = 0; ch < TVectorSize; ch++) + { + m_tv[ch] = m_tv[ch] + weight * t * v[ch]; + m_v[ch] = m_v[ch] + weight * v[ch]; + } + m_tt = m_tt + weight * t * t; + m_t = m_t + weight * t; + m_w = m_w + weight; + } + + void GetRefinedEndpoints(MInt16 endPoint[2][TVectorSize]) + { + // a = (tv - t*v/w)/(tt - t*t/w) + // b = (v - a*t)/w + ParallelMath::FloatCompFlag wZero = ParallelMath::Equal(m_w, ParallelMath::MakeFloatZero()); + + MFloat w = ParallelMath::Select(wZero, ParallelMath::MakeFloat(1.0f), m_w); + + MFloat adenom = (m_tt - m_t * m_t / w); + + ParallelMath::FloatCompFlag adenomZero = ParallelMath::Equal(adenom, ParallelMath::MakeFloatZero()); + ParallelMath::ConditionalSet(adenom, adenomZero, ParallelMath::MakeFloat(1.0f)); + + for (int ch = 0; ch < TVectorSize; ch++) + { + /* + if (adenom == 0.0) + p1 = p2 = er.v / er.w; + else + { + float4 a = (er.tv - er.t*er.v / er.w) / adenom; + float4 b = (er.v - a * er.t) / er.w; + p1 = b; + p2 = a + b; + } + */ + + MFloat a = (m_tv[ch] - m_t * m_v[ch] / w) / adenom; + MFloat b = (m_v[ch] - a * m_t) / w; + + MFloat p1 = ParallelMath::Clamp(b, 0.0f, 255.0f); + MFloat p2 = ParallelMath::Clamp(a + b, 0.0f, 255.0f); + + ParallelMath::ConditionalSet(p1, adenomZero, (m_v[ch] / w)); + ParallelMath::ConditionalSet(p2, adenomZero, p1); + + endPoint[0][ch] = ParallelMath::FloatToUInt16(p1); + endPoint[1][ch] = ParallelMath::FloatToUInt16(p2); + } + } + }; + + class BC7Computer + { + public: + static const int NumTweakRounds = 4; + static const int NumRefineRounds = 2; + + typedef ParallelMath::Int16 MInt16; + typedef ParallelMath::Int32 MInt32; + typedef ParallelMath::Float MFloat; + + struct WorkInfo + { + MInt16 m_mode; + MFloat m_error; + MInt16 m_ep[3][2][4]; + MInt16 m_indexes[16]; + MInt16 m_indexes2[16]; + + union + { + MInt16 m_partition; + struct IndexSelectorAndRotation + { + MInt16 m_indexSelector; + MInt16 m_rotation; + } m_isr; + }; + }; + + static void TweakAlpha(const MInt16 original[2], int tweak, int bits, MInt16 result[2]) + { + float tf[2]; + ComputeTweakFactors(tweak, 
bits, tf); + + MFloat base = ParallelMath::UInt16ToFloat(original[0]); + MFloat offs = ParallelMath::UInt16ToFloat(original[1]) - base; + + result[0] = ParallelMath::FloatToUInt16(ParallelMath::Clamp(base + offs * tf[0], 0.0f, 255.0f)); + result[1] = ParallelMath::FloatToUInt16(ParallelMath::Clamp(base + offs * tf[1], 0.0f, 255.0f)); + } + + static void Quantize(MInt16* color, int bits, int channels) + { + float maxColor = static_cast((1 << bits) - 1); + + for (int i = 0; i < channels; i++) + color[i] = ParallelMath::FloatToUInt16(ParallelMath::Clamp(ParallelMath::UInt16ToFloat(color[i]) * ParallelMath::MakeFloat(1.0f / 255.0f) * maxColor, 0.f, 255.f)); + } + + static void QuantizeP(MInt16* color, int bits, uint16_t p, int channels) + { + uint16_t pShift = static_cast(1 << (7 - bits)); + MInt16 pShiftV = ParallelMath::MakeUInt16(pShift); + + float maxColorF = static_cast(255 - (1 << (7 - bits))); + + float maxQuantized = static_cast((1 << bits) - 1); + + for (int ch = 0; ch < channels; ch++) + { + MInt16 clr = color[ch]; + if (p) + clr = ParallelMath::Max(clr, pShiftV) - pShiftV; + + MFloat rerangedColor = ParallelMath::UInt16ToFloat(clr) * maxQuantized / maxColorF; + + clr = ParallelMath::FloatToUInt16(ParallelMath::Clamp(rerangedColor, 0.0f, maxQuantized)) << 1; + if (p) + clr = clr | ParallelMath::MakeUInt16(1); + + color[ch] = clr; + } + } + + static void Unquantize(MInt16* color, int bits, int channels) + { + for (int ch = 0; ch < channels; ch++) + { + MInt16 clr = color[ch]; + clr = clr << (8 - bits); + color[ch] = clr | ParallelMath::UnsignedRightShift(clr, bits); + } + } + + static void CompressEndpoints0(MInt16 ep[2][4], uint16_t p[2]) + { + for (int j = 0; j < 2; j++) + { + QuantizeP(ep[j], 4, p[j], 3); + Unquantize(ep[j], 5, 3); + ep[j][3] = ParallelMath::MakeUInt16(255); + } + } + + static void CompressEndpoints1(MInt16 ep[2][4], uint16_t p) + { + for (int j = 0; j < 2; j++) + { + QuantizeP(ep[j], 6, p, 3); + Unquantize(ep[j], 7, 3); + ep[j][3] = ParallelMath::MakeUInt16(255); + } + } + + static void CompressEndpoints2(MInt16 ep[2][4]) + { + for (int j = 0; j < 2; j++) + { + Quantize(ep[j], 5, 3); + Unquantize(ep[j], 5, 3); + ep[j][3] = ParallelMath::MakeUInt16(255); + } + } + + static void CompressEndpoints3(MInt16 ep[2][4], uint16_t p[2]) + { + for (int j = 0; j < 2; j++) + QuantizeP(ep[j], 7, p[j], 3); + } + + static void CompressEndpoints4(MInt16 epRGB[2][3], MInt16 epA[2]) + { + for (int j = 0; j < 2; j++) + { + Quantize(epRGB[j], 5, 3); + Unquantize(epRGB[j], 5, 3); + + Quantize(epA + j, 6, 1); + Unquantize(epA + j, 6, 1); + } + } + + static void CompressEndpoints5(MInt16 epRGB[2][3], MInt16 epA[2]) + { + for (int j = 0; j < 2; j++) + { + Quantize(epRGB[j], 7, 3); + Unquantize(epRGB[j], 7, 3); + } + + // Alpha is full precision + (void)epA; + } + + static void CompressEndpoints6(MInt16 ep[2][4], uint16_t p[2]) + { + for (int j = 0; j < 2; j++) + QuantizeP(ep[j], 7, p[j], 4); + } + + static void CompressEndpoints7(MInt16 ep[2][4], uint16_t p[2]) + { + for (int j = 0; j < 2; j++) + { + QuantizeP(ep[j], 5, p[j], 4); + Unquantize(ep[j], 6, 4); + } + } + + static MFloat ComputeError(DWORD flags, const MInt16 reconstructed[4], const MInt16 original[4]) + { + MFloat error = ParallelMath::MakeFloatZero(); + if (flags & BC_FLAGS_UNIFORM) + { + for (int ch = 0; ch < 4; ch++) + error = error + ParallelMath::UInt16ToFloat(ParallelMath::SqDiff(reconstructed[ch], original[ch])); + } + else + { + const float perceptualWeights[4] = { 0.2125f / 0.7154f, 1.0f, 0.0721f / 0.7154f, 1.0f }; + 
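// Perceptual weighting: the R and B factors appear to be the BT.709 luma coefficients (0.2125, 0.0721) normalized against green (0.7154), while green and alpha keep unit weight. +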
for (int ch = 0; ch < 4; ch++) + error = error + ParallelMath::UInt16ToFloat(ParallelMath::SqDiff(reconstructed[ch], original[ch])) * ParallelMath::MakeFloat(perceptualWeights[ch]); + } + + return error; + } + + static void TrySinglePlane(DWORD flags, const MInt16 pixels[16][4], WorkInfo& work) + { + for (uint16_t mode = 0; mode <= 7; mode++) + { + if ((flags & BC_FLAGS_FORCE_BC7_MODE6) && mode != 6) + continue; + + if ((flags & BC_FLAGS_USE_3SUBSETS) && g_modes[mode].m_numSubsets == 3) + continue; + + if (mode == 4 || mode == 5) + continue; + + MInt16 rgbAdjustedPixels[16][4]; + for (int px = 0; px < 16; px++) + { + for (int ch = 0; ch < 3; ch++) + rgbAdjustedPixels[px][ch] = pixels[px][ch]; + + if (g_modes[mode].m_alphaMode == AlphaMode_None) + rgbAdjustedPixels[px][3] = ParallelMath::MakeUInt16(255); + else + rgbAdjustedPixels[px][3] = pixels[px][3]; + } + + unsigned int numPartitions = 1 << g_modes[mode].m_partitionBits; + int numSubsets = g_modes[mode].m_numSubsets; + int indexPrec = g_modes[mode].m_indexBits; + + int parityBitMax = 1; + if (g_modes[mode].m_pBitMode == PBitMode_PerEndpoint) + parityBitMax = 4; + else if (g_modes[mode].m_pBitMode == PBitMode_PerSubset) + parityBitMax = 2; + + for (uint16_t partition = 0; partition < numPartitions; partition++) + { + EndpointSelectorRGBA epSelectors[3]; + + for (int epPass = 0; epPass < EndpointSelectorRGBA::NumPasses; epPass++) + { + for (int subset = 0; subset < numSubsets; subset++) + epSelectors[subset].InitPass(epPass); + + for (int px = 0; px < 16; px++) + { + int subset = 0; + if (numSubsets == 2) + subset = (g_partitionMap[partition] >> px) & 1; + else if (numSubsets == 3) + subset = g_partitionMap2[partition] >> (px * 2) & 3; + + assert(subset < 3); + + epSelectors[subset].Contribute(epPass, rgbAdjustedPixels[px], ParallelMath::MakeFloat(1.0f)); + } + } + + UnfinishedEndpoints<4> unfinishedEPs[3]; + for (int subset = 0; subset < numSubsets; subset++) + unfinishedEPs[subset] = epSelectors[subset].GetEndpoints(); + + MInt16 bestIndexes[16]; + MInt16 bestEP[3][2][4]; + MFloat bestSubsetError[3] = { ParallelMath::MakeFloat(FLT_MAX), ParallelMath::MakeFloat(FLT_MAX), ParallelMath::MakeFloat(FLT_MAX) }; + + for (int px = 0; px < 16; px++) + bestIndexes[px] = ParallelMath::MakeUInt16(0); + + for (int tweak = 0; tweak < NumTweakRounds; tweak++) + { + MInt16 baseEP[3][2][4]; + + for (int subset = 0; subset < numSubsets; subset++) + unfinishedEPs[subset].Finish(tweak, indexPrec, baseEP[subset][0], baseEP[subset][1]); + + for (int pIter = 0; pIter < parityBitMax; pIter++) + { + uint16_t p[2]; + p[0] = (pIter & 1); + p[1] = ((pIter >> 1) & 1); + + MInt16 ep[3][2][4]; + + for (int subset = 0; subset < numSubsets; subset++) + for (int epi = 0; epi < 2; epi++) + for (int ch = 0; ch < 4; ch++) + ep[subset][epi][ch] = baseEP[subset][epi][ch]; + + for (int refine = 0; refine < NumRefineRounds; refine++) + { + switch (mode) + { + case 0: + for (int subset = 0; subset < 3; subset++) + CompressEndpoints0(ep[subset], p); + break; + case 1: + for (int subset = 0; subset < 2; subset++) + CompressEndpoints1(ep[subset], p[0]); + break; + case 2: + for (int subset = 0; subset < 3; subset++) + CompressEndpoints2(ep[subset]); + break; + case 3: + for (int subset = 0; subset < 2; subset++) + CompressEndpoints3(ep[subset], p); + break; + case 6: + CompressEndpoints6(ep[0], p); + break; + case 7: + for (int subset = 0; subset < 2; subset++) + CompressEndpoints7(ep[subset], p); + break; + default: + assert(false); + break; + }; + + IndexSelector<4> 
indexSelectors[3]; + + for (int subset = 0; subset < numSubsets; subset++) + indexSelectors[subset].Init(ep[subset], indexPrec); + + EndpointRefiner<4> epRefiners[3]; + + for (int subset = 0; subset < numSubsets; subset++) + epRefiners[subset].Init(indexPrec); + + MFloat subsetError[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() }; + + MInt16 indexes[16]; + + for (int px = 0; px < 16; px++) + { + int subset = 0; + if (numSubsets == 2) + subset = (g_partitionMap[partition] >> px) & 1; + else if (numSubsets == 3) + subset = g_partitionMap2[partition] >> (px * 2) & 3; + + assert(subset < 3); + + MInt16 index = indexSelectors[subset].SelectIndex(rgbAdjustedPixels[px]); + + epRefiners[subset].Contribute(rgbAdjustedPixels[px], index, ParallelMath::MakeFloat(1.0f)); + + MInt16 reconstructed[4]; + + indexSelectors[subset].Reconstruct(index, reconstructed); + + subsetError[subset] = subsetError[subset] + ComputeError(flags, reconstructed, pixels[px]); + + indexes[px] = index; + } + + ParallelMath::FloatCompFlag subsetErrorBetter[3]; + ParallelMath::Int16CompFlag subsetErrorBetter16[3]; + + bool anyImprovements = false; + for (int subset = 0; subset < numSubsets; subset++) + { + subsetErrorBetter[subset] = ParallelMath::Less(subsetError[subset], bestSubsetError[subset]); + subsetErrorBetter16[subset] = ParallelMath::FloatFlagToInt16(subsetErrorBetter[subset]); + + if (ParallelMath::AnySet(subsetErrorBetter16[subset])) + { + ParallelMath::ConditionalSet(bestSubsetError[subset], subsetErrorBetter[subset], subsetError[subset]); + for (int epi = 0; epi < 2; epi++) + for (int ch = 0; ch < 4; ch++) + ParallelMath::ConditionalSet(bestEP[subset][epi][ch], subsetErrorBetter16[subset], ep[subset][epi][ch]); + + anyImprovements = true; + } + } + + if (anyImprovements) + { + for (int px = 0; px < 16; px++) + { + int subset = 0; + if (numSubsets == 2) + subset = (g_partitionMap[partition] >> px) & 1; + else if (numSubsets == 3) + subset = g_partitionMap2[partition] >> (px * 2) & 3; + + ParallelMath::ConditionalSet(bestIndexes[px], subsetErrorBetter16[subset], indexes[px]); + } + } + + if (refine != NumRefineRounds - 1) + { + for (int subset = 0; subset < numSubsets; subset++) + epRefiners[subset].GetRefinedEndpoints(ep[subset]); + } + } // refine + } // p + } // tweak + + MFloat totalError = bestSubsetError[0]; + for (int subset = 1; subset < numSubsets; subset++) + totalError = totalError + bestSubsetError[subset]; + + ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(totalError, work.m_error); + ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); + + if (ParallelMath::AnySet(errorBetter16)) + { + work.m_error = ParallelMath::Min(totalError, work.m_error); + ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt16(mode)); + ParallelMath::ConditionalSet(work.m_partition, errorBetter16, ParallelMath::MakeUInt16(partition)); + + for (int px = 0; px < 16; px++) + ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, bestIndexes[px]); + + for (int subset = 0; subset < numSubsets; subset++) + for (int epi = 0; epi < 2; epi++) + for (int ch = 0; ch < 4; ch++) + ParallelMath::ConditionalSet(work.m_ep[subset][epi][ch], errorBetter16, bestEP[subset][epi][ch]); + } + } + } + } + + static void TryDualPlane(DWORD flags, const MInt16 pixels[16][4], WorkInfo& work) + { + if (flags & BC_FLAGS_FORCE_BC7_MODE6) + return; // Mode 6 is not a dual-plane mode, skip it + + for (uint16_t mode = 4; mode 
<= 5; mode++) + { + for (uint16_t rotation = 0; rotation < 4; rotation++) + { + int alphaChannel = (rotation + 3) & 3; + int redChannel = (rotation == 1) ? 3 : 0; + int greenChannel = (rotation == 2) ? 3 : 1; + int blueChannel = (rotation == 3) ? 3 : 2; + + MInt16 rotatedRGB[16][3]; + + for (int px = 0; px < 16; px++) + { + rotatedRGB[px][0] = pixels[px][redChannel]; + rotatedRGB[px][1] = pixels[px][greenChannel]; + rotatedRGB[px][2] = pixels[px][blueChannel]; + } + + uint16_t maxIndexSelector = (mode == 4) ? 2 : 1; + + for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++) + { + EndpointSelectorRGB rgbSelector; + + for (int epPass = 0; epPass < EndpointSelectorRGB::NumPasses; epPass++) + { + rgbSelector.InitPass(epPass); + for (int px = 0; px < 16; px++) + rgbSelector.Contribute(epPass, rotatedRGB[px], ParallelMath::MakeFloat(1.0f)); + } + + MInt16 alphaRange[2]; + + alphaRange[0] = alphaRange[1] = pixels[0][alphaChannel]; + for (int px = 1; px < 16; px++) + { + alphaRange[0] = ParallelMath::Min(pixels[px][alphaChannel], alphaRange[0]); + alphaRange[1] = ParallelMath::Max(pixels[px][alphaChannel], alphaRange[1]); + } + + int rgbPrec = 0; + int alphaPrec = 0; + + if (mode == 4) + { + rgbPrec = indexSelector ? 3 : 2; + alphaPrec = indexSelector ? 2 : 3; + } + else + rgbPrec = alphaPrec = 2; + + UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(); + + MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX); + MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX); + + MInt16 bestRGBIndexes[16]; + MInt16 bestAlphaIndexes[16]; + MInt16 bestEP[2][4]; + + for (int px = 0; px < 16; px++) + bestRGBIndexes[px] = bestAlphaIndexes[px] = ParallelMath::MakeUInt16(0); + + for (int tweak = 0; tweak < NumTweakRounds; tweak++) + { + MInt16 rgbEP[2][3]; + MInt16 alphaEP[2]; + + unfinishedRGB.Finish(tweak, rgbPrec, rgbEP[0], rgbEP[1]); + + TweakAlpha(alphaRange, tweak, alphaPrec, alphaEP); + + for (int refine = 0; refine < NumRefineRounds; refine++) + { + if (mode == 4) + CompressEndpoints4(rgbEP, alphaEP); + else + CompressEndpoints5(rgbEP, alphaEP); + + IndexSelector<1> alphaIndexSelector; + IndexSelector<3> rgbIndexSelector; + + { + MInt16 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } }; + alphaIndexSelector.Init(alphaEPTemp, alphaPrec); + } + rgbIndexSelector.Init(rgbEP, rgbPrec); + + EndpointRefiner<3> rgbRefiner; + EndpointRefiner<1> alphaRefiner; + + rgbRefiner.Init(rgbPrec); + alphaRefiner.Init(alphaPrec); + + MFloat errorRGB = ParallelMath::MakeFloatZero(); + MFloat errorA = ParallelMath::MakeFloatZero(); + + MInt16 rgbIndexes[16]; + MInt16 alphaIndexes[16]; + + for (int px = 0; px < 16; px++) + { + MInt16 rgbIndex = rgbIndexSelector.SelectIndex(rotatedRGB[px]); + MInt16 alphaIndex = alphaIndexSelector.SelectIndex(pixels[px] + alphaChannel); + + rgbRefiner.Contribute(rotatedRGB[px], rgbIndex, ParallelMath::MakeFloat(1.0f)); + alphaRefiner.Contribute(pixels[px] + alphaChannel, alphaIndex, ParallelMath::MakeFloat(1.0f)); + + MInt16 reconstructedRGB[3]; + MInt16 reconstructedAlpha[1]; + + rgbIndexSelector.Reconstruct(rgbIndex, reconstructedRGB); + alphaIndexSelector.Reconstruct(alphaIndex, reconstructedAlpha); + + MInt16 reconstructedRGBA[4]; + reconstructedRGBA[redChannel] = reconstructedRGB[0]; + reconstructedRGBA[greenChannel] = reconstructedRGB[1]; + reconstructedRGBA[blueChannel] = reconstructedRGB[2]; + reconstructedRGBA[alphaChannel] = pixels[px][alphaChannel]; + + errorRGB = errorRGB + ComputeError(flags, reconstructedRGBA, pixels[px]); + + 
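// The RGB-plane error above keeps the original alpha; below, the alpha-plane error substitutes only the reconstructed alpha into otherwise original RGB. +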
reconstructedRGBA[redChannel] = pixels[px][redChannel]; + reconstructedRGBA[greenChannel] = pixels[px][greenChannel]; + reconstructedRGBA[blueChannel] = pixels[px][blueChannel]; + reconstructedRGBA[alphaChannel] = reconstructedAlpha[0]; + + errorA = errorA + ComputeError(flags, reconstructedRGBA, pixels[px]); + + rgbIndexes[px] = rgbIndex; + alphaIndexes[px] = alphaIndex; + } + + ParallelMath::FloatCompFlag rgbBetter = ParallelMath::Less(errorRGB, bestRGBError); + ParallelMath::FloatCompFlag alphaBetter = ParallelMath::Less(errorA, bestAlphaError); + + ParallelMath::Int16CompFlag rgbBetterInt16 = ParallelMath::FloatFlagToInt16(rgbBetter); + ParallelMath::Int16CompFlag alphaBetterInt16 = ParallelMath::FloatFlagToInt16(alphaBetter); + + bestRGBError = ParallelMath::Min(errorRGB, bestRGBError); + bestAlphaError = ParallelMath::Min(errorA, bestAlphaError); + + for (int px = 0; px < 16; px++) + { + ParallelMath::ConditionalSet(bestRGBIndexes[px], rgbBetterInt16, rgbIndexes[px]); + ParallelMath::ConditionalSet(bestAlphaIndexes[px], alphaBetterInt16, alphaIndexes[px]); + } + + for (int ep = 0; ep < 2; ep++) + { + for (int ch = 0; ch < 3; ch++) + ParallelMath::ConditionalSet(bestEP[ep][ch], rgbBetterInt16, rgbEP[ep][ch]); + ParallelMath::ConditionalSet(bestEP[ep][3], alphaBetterInt16, alphaEP[ep]); + } + + if (refine != NumRefineRounds - 1) + { + rgbRefiner.GetRefinedEndpoints(rgbEP); + + MInt16 alphaEPTemp[2][1]; + alphaRefiner.GetRefinedEndpoints(alphaEPTemp); + + for (int i = 0; i < 2; i++) + alphaEP[i] = alphaEPTemp[i][0]; + } + } // refine + } // tweak + + MFloat combinedError = bestRGBError + bestAlphaError; + + ParallelMath::FloatCompFlag errorBetter = ParallelMath::Less(combinedError, work.m_error); + ParallelMath::Int16CompFlag errorBetter16 = ParallelMath::FloatFlagToInt16(errorBetter); + + work.m_error = ParallelMath::Min(combinedError, work.m_error); + + ParallelMath::ConditionalSet(work.m_mode, errorBetter16, ParallelMath::MakeUInt16(mode)); + ParallelMath::ConditionalSet(work.m_isr.m_rotation, errorBetter16, ParallelMath::MakeUInt16(rotation)); + ParallelMath::ConditionalSet(work.m_isr.m_indexSelector, errorBetter16, ParallelMath::MakeUInt16(indexSelector)); + + for (int px = 0; px < 16; px++) + { + ParallelMath::ConditionalSet(work.m_indexes[px], errorBetter16, indexSelector ? bestAlphaIndexes[px] : bestRGBIndexes[px]); + ParallelMath::ConditionalSet(work.m_indexes2[px], errorBetter16, indexSelector ? 
bestRGBIndexes[px] : bestAlphaIndexes[px]); + } + + for (int ep = 0; ep < 2; ep++) + for (int ch = 0; ch < 4; ch++) + ParallelMath::ConditionalSet(work.m_ep[0][ep][ch], errorBetter16, bestEP[ep][ch]); + } + } + } + } + + template + static void Swap(T& a, T& b) + { + T temp = a; + a = b; + b = temp; + } + + static void Pack(DWORD flags, const InputBlock* inputs, uint8_t* packedBlocks) + { + MInt16 pixels[16][4]; + + for (int px = 0; px < 16; px++) + { + MInt32 packedPx; + ParallelMath::ReadPackedInputs(inputs, px, packedPx); + + for (int ch = 0; ch < 4; ch++) + ParallelMath::UnpackChannel(packedPx, ch, pixels[px][ch]); + } + + WorkInfo work; + memset(&work, 0, sizeof(work)); + + work.m_error = ParallelMath::MakeFloat(FLT_MAX); + + TryDualPlane(flags, pixels, work); + TrySinglePlane(flags, pixels, work); + + for (int block = 0; block < ParallelMath::ParallelSize; block++) + { + PackingVector pv; + pv.Init(); + + uint16_t mode = ParallelMath::ExtractUInt16(work.m_mode, block); + uint16_t partition = ParallelMath::ExtractUInt16(work.m_partition, block); + uint16_t indexSelector = ParallelMath::ExtractUInt16(work.m_isr.m_indexSelector, block); + + const BC7ModeInfo& modeInfo = g_modes[mode]; + + uint16_t indexes[16]; + uint16_t indexes2[16]; + uint16_t endPoints[3][2][4]; + + for (int i = 0; i < 16; i++) + { + indexes[i] = ParallelMath::ExtractUInt16(work.m_indexes[i], block); + if (modeInfo.m_alphaMode == AlphaMode_Separate) + indexes2[i] = ParallelMath::ExtractUInt16(work.m_indexes2[i], block); + } + + for (int subset = 0; subset < 3; subset++) + { + for (int ep = 0; ep < 2; ep++) + { + for (int ch = 0; ch < 4; ch++) + endPoints[subset][ep][ch] = ParallelMath::ExtractUInt16(work.m_ep[subset][ep][ch], block); + } + } + + int fixups[3] = { 0, 0, 0 }; + + if (modeInfo.m_alphaMode == AlphaMode_Separate) + { + bool flipRGB = ((indexes[0] & (1 << (modeInfo.m_indexBits - 1))) != 0); + bool flipAlpha = ((indexes2[0] & (1 << (modeInfo.m_alphaIndexBits - 1))) != 0); + + if (flipRGB) + { + uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1; + for (int px = 0; px < 16; px++) + indexes[px] = highIndex - indexes[px]; + } + + if (flipAlpha) + { + uint16_t highIndex = (1 << modeInfo.m_alphaIndexBits) - 1; + for (int px = 0; px < 16; px++) + indexes2[px] = highIndex - indexes2[px]; + } + + if (indexSelector) + Swap(flipRGB, flipAlpha); + + if (flipRGB) + { + for (int ch = 0; ch < 3; ch++) + Swap(endPoints[0][0][ch], endPoints[0][1][ch]); + } + if (flipAlpha) + Swap(endPoints[0][0][3], endPoints[0][1][3]); + + } + else + { + if (modeInfo.m_numSubsets == 2) + fixups[1] = g_fixupIndexes2[partition]; + else if (modeInfo.m_numSubsets == 3) + { + fixups[1] = g_fixupIndexes3[partition][0]; + fixups[2] = g_fixupIndexes3[partition][1]; + } + + bool flip[3] = { false, false, false }; + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + flip[subset] = ((indexes[fixups[subset]] & (1 << (modeInfo.m_indexBits - 1))) != 0); + + if (flip[0] || flip[1] || flip[2]) + { + uint16_t highIndex = (1 << modeInfo.m_indexBits) - 1; + for (int px = 0; px < 16; px++) + { + int subset = 0; + if (modeInfo.m_numSubsets == 2) + subset = (g_partitionMap[partition] >> px) & 1; + else if (modeInfo.m_numSubsets == 3) + subset = (g_partitionMap2[partition] >> (px * 2)) & 3; + + if (flip[subset]) + indexes[px] = highIndex - indexes[px]; + } + + int maxCH = (modeInfo.m_alphaMode == AlphaMode_Combined) ? 
4 : 3; + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + if (flip[subset]) + for (int ch = 0; ch < maxCH; ch++) + Swap(endPoints[subset][0][ch], endPoints[subset][1][ch]); + } + } + } + + pv.Pack(static_cast(1 << mode), mode + 1); + + if (modeInfo.m_partitionBits) + pv.Pack(partition, modeInfo.m_partitionBits); + + if (modeInfo.m_alphaMode == AlphaMode_Separate) + { + uint16_t rotation = ParallelMath::ExtractUInt16(work.m_isr.m_rotation, block); + pv.Pack(rotation, 2); + } + + if (modeInfo.m_hasIndexSelector) + pv.Pack(indexSelector, 1); + + // Encode RGB + for (int ch = 0; ch < 3; ch++) + { + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + for (int ep = 0; ep < 2; ep++) + { + uint16_t epPart = endPoints[subset][ep][ch]; + epPart >>= (8 - modeInfo.m_rgbBits); + + pv.Pack(epPart, modeInfo.m_rgbBits); + } + } + } + + // Encode alpha + if (modeInfo.m_alphaMode != AlphaMode_None) + { + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + for (int ep = 0; ep < 2; ep++) + { + uint16_t epPart = endPoints[subset][ep][3]; + epPart >>= (8 - modeInfo.m_alphaBits); + + pv.Pack(epPart, modeInfo.m_alphaBits); + } + } + } + + // Encode parity bits + if (modeInfo.m_pBitMode == PBitMode_PerSubset) + { + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + uint16_t epPart = endPoints[subset][0][0]; + epPart >>= (7 - modeInfo.m_rgbBits); + epPart &= 1; + + pv.Pack(epPart, 1); + } + } + else if (modeInfo.m_pBitMode == PBitMode_PerEndpoint) + { + for (int subset = 0; subset < modeInfo.m_numSubsets; subset++) + { + for (int ep = 0; ep < 2; ep++) + { + uint16_t epPart = endPoints[subset][ep][0]; + epPart >>= (7 - modeInfo.m_rgbBits); + epPart &= 1; + + pv.Pack(epPart, 1); + } + } + } + + // Encode indexes + for (int px = 0; px < 16; px++) + { + int bits = modeInfo.m_indexBits; + if ((px == 0) || (px == fixups[1]) || (px == fixups[2])) + bits--; + + pv.Pack(indexes[px], bits); + } + + // Encode secondary indexes + if (modeInfo.m_alphaMode == AlphaMode_Separate) + { + for (int px = 0; px < 16; px++) + { + int bits = modeInfo.m_alphaIndexBits; + if (px == 0) + bits--; + + pv.Pack(indexes2[px], bits); + } + } + + pv.Flush(packedBlocks); + + packedBlocks += 16; + } + } + }; +} + + +_Use_decl_annotations_ +void DirectX::D3DXEncodeBC7Parallel(uint8_t *pBC, const XMVECTOR *pColor, DWORD flags) +{ + assert(pColor); + assert(pBC); + + for (size_t blockBase = 0; blockBase < BC7_NUM_PARALLEL_BLOCKS; blockBase += ParallelMath::ParallelSize) + { + InputBlock inputBlocks[BC7_NUM_PARALLEL_BLOCKS]; + + for (size_t block = 0; block < ParallelMath::ParallelSize; block++) + { + InputBlock& inputBlock = inputBlocks[block]; + + for (size_t i = 0; i < NUM_PIXELS_PER_BLOCK; ++i) + { + int32_t packedPixel = 0; + for (size_t ch = 0; ch < 4; ch++) + { + int32_t convertedValue = static_cast(std::max(0.0f, std::min(255.0f, reinterpret_cast(pColor)[ch] * 255.0f + 0.01f))); + packedPixel |= (convertedValue << (ch * 8)); + } + + inputBlock.m_pixels[i] = packedPixel; + pColor++; + } + } + + BC7Computer::Pack(flags, inputBlocks, pBC); + + pBC += ParallelMath::ParallelSize * 16; + } +} diff --git a/DirectXTex/DirectXTexCompress.cpp b/DirectXTex/DirectXTexCompress.cpp index f0b51f17..d2e2204c 100644 --- a/DirectXTex/DirectXTexCompress.cpp +++ b/DirectXTex/DirectXTexCompress.cpp @@ -41,25 +41,25 @@ namespace return (compress & TEX_COMPRESS_SRGB); } - inline bool DetermineEncoderSettings(_In_ DXGI_FORMAT format, _Out_ BC_ENCODE& pfEncode, _Out_ size_t& blocksize, _Out_ DWORD& 
cflags) + inline bool DetermineEncoderSettings(_In_ DXGI_FORMAT format, _Out_ BC_ENCODE& pfEncode, _Out_ size_t& blocksize, _Out_ DWORD& cflags, _Out_ int& nBlocksPerChunk) { switch (format) { case DXGI_FORMAT_BC1_UNORM: - case DXGI_FORMAT_BC1_UNORM_SRGB: pfEncode = nullptr; blocksize = 8; cflags = 0; break; + case DXGI_FORMAT_BC1_UNORM_SRGB: pfEncode = nullptr; blocksize = 8; cflags = 0; nBlocksPerChunk = 1; break; case DXGI_FORMAT_BC2_UNORM: - case DXGI_FORMAT_BC2_UNORM_SRGB: pfEncode = D3DXEncodeBC2; blocksize = 16; cflags = 0; break; + case DXGI_FORMAT_BC2_UNORM_SRGB: pfEncode = D3DXEncodeBC2; blocksize = 16; cflags = 0; nBlocksPerChunk = 1; break; case DXGI_FORMAT_BC3_UNORM: - case DXGI_FORMAT_BC3_UNORM_SRGB: pfEncode = D3DXEncodeBC3; blocksize = 16; cflags = 0; break; - case DXGI_FORMAT_BC4_UNORM: pfEncode = D3DXEncodeBC4U; blocksize = 8; cflags = TEX_FILTER_RGB_COPY_RED; break; - case DXGI_FORMAT_BC4_SNORM: pfEncode = D3DXEncodeBC4S; blocksize = 8; cflags = TEX_FILTER_RGB_COPY_RED; break; - case DXGI_FORMAT_BC5_UNORM: pfEncode = D3DXEncodeBC5U; blocksize = 16; cflags = TEX_FILTER_RGB_COPY_RED | TEX_FILTER_RGB_COPY_GREEN; break; - case DXGI_FORMAT_BC5_SNORM: pfEncode = D3DXEncodeBC5S; blocksize = 16; cflags = TEX_FILTER_RGB_COPY_RED | TEX_FILTER_RGB_COPY_GREEN; break; - case DXGI_FORMAT_BC6H_UF16: pfEncode = D3DXEncodeBC6HU; blocksize = 16; cflags = 0; break; - case DXGI_FORMAT_BC6H_SF16: pfEncode = D3DXEncodeBC6HS; blocksize = 16; cflags = 0; break; + case DXGI_FORMAT_BC3_UNORM_SRGB: pfEncode = D3DXEncodeBC3; blocksize = 16; cflags = 0; nBlocksPerChunk = 1; break; + case DXGI_FORMAT_BC4_UNORM: pfEncode = D3DXEncodeBC4U; blocksize = 8; cflags = TEX_FILTER_RGB_COPY_RED; nBlocksPerChunk = 1; break; + case DXGI_FORMAT_BC4_SNORM: pfEncode = D3DXEncodeBC4S; blocksize = 8; cflags = TEX_FILTER_RGB_COPY_RED; nBlocksPerChunk = 1; break; + case DXGI_FORMAT_BC5_UNORM: pfEncode = D3DXEncodeBC5U; blocksize = 16; cflags = TEX_FILTER_RGB_COPY_RED | TEX_FILTER_RGB_COPY_GREEN; nBlocksPerChunk = 1; break; + case DXGI_FORMAT_BC5_SNORM: pfEncode = D3DXEncodeBC5S; blocksize = 16; cflags = TEX_FILTER_RGB_COPY_RED | TEX_FILTER_RGB_COPY_GREEN; nBlocksPerChunk = 1; break; + case DXGI_FORMAT_BC6H_UF16: pfEncode = D3DXEncodeBC6HU; blocksize = 16; cflags = 0; nBlocksPerChunk = 1; break; + case DXGI_FORMAT_BC6H_SF16: pfEncode = D3DXEncodeBC6HS; blocksize = 16; cflags = 0; nBlocksPerChunk = 1; break; case DXGI_FORMAT_BC7_UNORM: - case DXGI_FORMAT_BC7_UNORM_SRGB: pfEncode = D3DXEncodeBC7; blocksize = 16; cflags = 0; break; - default: pfEncode = nullptr; blocksize = 0; cflags = 0; return false; + case DXGI_FORMAT_BC7_UNORM_SRGB: pfEncode = D3DXEncodeBC7Parallel; blocksize = 16; cflags = 0; nBlocksPerChunk = BC7_NUM_PARALLEL_BLOCKS; break; + default: pfEncode = nullptr; blocksize = 0; cflags = 0; nBlocksPerChunk = 1; return false; } return true; @@ -100,10 +100,11 @@ namespace BC_ENCODE pfEncode; size_t blocksize; DWORD cflags; - if (!DetermineEncoderSettings(result.format, pfEncode, blocksize, cflags)) + int nBlocksPerChunk = 0; + if (!DetermineEncoderSettings(result.format, pfEncode, blocksize, cflags, nBlocksPerChunk)) return HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED); - __declspec(align(16)) XMVECTOR temp[16]; + __declspec(align(16)) XMVECTOR tempBlocks[16 * MAX_PARALLEL_BLOCKS]; const uint8_t *pSrc = image.pixels; const uint8_t *pEnd = image.pixels + image.slicePitch; const size_t rowPitch = image.rowPitch; @@ -113,8 +114,12 @@ namespace uint8_t* dptr = pDest; size_t ph = std::min(4, image.height - h); size_t w = 
0; + + int nQueuedBlocks = 0; for (size_t count = 0; (count < result.rowPitch) && (w < image.width); count += blocksize, w += 4) { + XMVECTOR *temp = tempBlocks + nQueuedBlocks * 16; + size_t pw = std::min(4, image.width - w); assert(pw > 0 && ph > 0); @@ -177,13 +182,36 @@ namespace _ConvertScanline(temp, 16, result.format, format, cflags | srgb); + if (nQueuedBlocks == nBlocksPerChunk) + { + if (pfEncode) + pfEncode(dptr, tempBlocks, bcflags); + else + D3DXEncodeBC1(dptr, tempBlocks, threshold, bcflags); + + dptr += blocksize * nBlocksPerChunk; + nQueuedBlocks = 0; + } + + sptr += sbpp * 4; + } + + if (nQueuedBlocks != 0) + { + uint8_t scratch[MAX_BLOCK_SIZE * MAX_PARALLEL_BLOCKS]; + + for (int i = nQueuedBlocks; i < nBlocksPerChunk; i++) + for (int element = 0; element < NUM_PIXELS_PER_BLOCK; element++) + tempBlocks[i * NUM_PIXELS_PER_BLOCK + element] = XMVectorSet(0.f, 0.f, 0.f, 0.f); + if (pfEncode) - pfEncode(dptr, temp, bcflags); + pfEncode(scratch, tempBlocks, bcflags); else - D3DXEncodeBC1(dptr, temp, threshold, bcflags); + D3DXEncodeBC1(scratch, tempBlocks, threshold, bcflags); - sptr += sbpp * 4; - dptr += blocksize; + memcpy(dptr, scratch, blocksize * nQueuedBlocks); + dptr += blocksize * nQueuedBlocks; + nQueuedBlocks = 0; } pSrc += rowPitch * 4; @@ -229,7 +257,8 @@ namespace BC_ENCODE pfEncode; size_t blocksize; DWORD cflags; - if (!DetermineEncoderSettings(result.format, pfEncode, blocksize, cflags)) + int nBlocksPerChunk; + if (!DetermineEncoderSettings(result.format, pfEncode, blocksize, cflags, nBlocksPerChunk)) return HRESULT_FROM_WIN32(ERROR_NOT_SUPPORTED); // Refactored version of loop to support parallel independance @@ -238,89 +267,124 @@ namespace bool fail = false; #pragma omp parallel for - for (int nb = 0; nb < static_cast(nBlocks); ++nb) + for (int nbBase = 0; nbBase < static_cast(nBlocks); nbBase += nBlocksPerChunk) { - int nbWidth = std::max(1, int((image.width + 3) / 4)); + __declspec(align(16)) XMVECTOR tempBlocks[16 * MAX_PARALLEL_BLOCKS]; + + int numProcessableBlocks = std::min(static_cast(nBlocks) - nbBase, nBlocksPerChunk); - int y = nb / nbWidth; - int x = (nb - (y*nbWidth)) * 4; - y *= 4; + for (int subBlock = 0; subBlock < numProcessableBlocks; subBlock++) + { + XMVECTOR *temp = tempBlocks + subBlock * NUM_PIXELS_PER_BLOCK; + int nb = nbBase + subBlock; + if (nb >= static_cast(nBlocks)) + { + for (int i = 0; i < 16; i++) + temp[i] = XMVectorSet(0.f, 0.f, 0.f, 0.f); + continue; + } - assert((x >= 0) && (x < int(image.width))); - assert((y >= 0) && (y < int(image.height))); + int nbWidth = std::max(1, int((image.width + 3) / 4)); - size_t rowPitch = image.rowPitch; - const uint8_t *pSrc = image.pixels + (y*rowPitch) + (x*sbpp); + int y = nb / nbWidth; + int x = (nb - (y*nbWidth)) * 4; + y *= 4; - uint8_t *pDest = result.pixels + (nb*blocksize); + assert((x >= 0) && (x < int(image.width))); + assert((y >= 0) && (y < int(image.height))); - size_t ph = std::min(4, image.height - y); - size_t pw = std::min(4, image.width - x); - assert(pw > 0 && ph > 0); + size_t rowPitch = image.rowPitch; + const uint8_t *pSrc = image.pixels + (y*rowPitch) + (x*sbpp); - ptrdiff_t bytesLeft = pEnd - pSrc; - assert(bytesLeft > 0); - size_t bytesToRead = std::min(rowPitch, bytesLeft); + size_t ph = std::min(4, image.height - y); + size_t pw = std::min(4, image.width - x); + assert(pw > 0 && ph > 0); - __declspec(align(16)) XMVECTOR temp[16]; - if (!_LoadScanline(&temp[0], pw, pSrc, bytesToRead, format)) - fail = true; + ptrdiff_t bytesLeft = pEnd - pSrc; + assert(bytesLeft > 
0); + size_t bytesToRead = std::min(rowPitch, bytesLeft); - if (ph > 1) - { - bytesToRead = std::min(rowPitch, bytesLeft - rowPitch); - if (!_LoadScanline(&temp[4], pw, pSrc + rowPitch, bytesToRead, format)) + if (!_LoadScanline(&temp[0], pw, pSrc, bytesToRead, format)) fail = true; - if (ph > 2) + if (ph > 1) { - bytesToRead = std::min(rowPitch, bytesLeft - rowPitch * 2); - if (!_LoadScanline(&temp[8], pw, pSrc + rowPitch * 2, bytesToRead, format)) + bytesToRead = std::min(rowPitch, bytesLeft - rowPitch); + if (!_LoadScanline(&temp[4], pw, pSrc + rowPitch, bytesToRead, format)) fail = true; - if (ph > 3) + if (ph > 2) { - bytesToRead = std::min(rowPitch, bytesLeft - rowPitch * 3); - if (!_LoadScanline(&temp[12], pw, pSrc + rowPitch * 3, bytesToRead, format)) + bytesToRead = std::min(rowPitch, bytesLeft - rowPitch * 2); + if (!_LoadScanline(&temp[8], pw, pSrc + rowPitch * 2, bytesToRead, format)) fail = true; + + if (ph > 3) + { + bytesToRead = std::min(rowPitch, bytesLeft - rowPitch * 3); + if (!_LoadScanline(&temp[12], pw, pSrc + rowPitch * 3, bytesToRead, format)) + fail = true; + } } } - } - - if (pw != 4 || ph != 4) - { - // Replicate pixels for partial block - static const size_t uSrc[] = { 0, 0, 0, 1 }; - if (pw < 4) + if (pw != 4 || ph != 4) { - for (size_t t = 0; t < ph && t < 4; ++t) + // Replicate pixels for partial block + static const size_t uSrc[] = { 0, 0, 0, 1 }; + + if (pw < 4) { - for (size_t s = pw; s < 4; ++s) + for (size_t t = 0; t < ph && t < 4; ++t) { - temp[(t << 2) | s] = temp[(t << 2) | uSrc[s]]; + for (size_t s = pw; s < 4; ++s) + { + temp[(t << 2) | s] = temp[(t << 2) | uSrc[s]]; + } } } - } - if (ph < 4) - { - for (size_t t = ph; t < 4; ++t) + if (ph < 4) { - for (size_t s = 0; s < 4; ++s) + for (size_t t = ph; t < 4; ++t) { - temp[(t << 2) | s] = temp[(uSrc[t] << 2) | s]; + for (size_t s = 0; s < 4; ++s) + { + temp[(t << 2) | s] = temp[(uSrc[t] << 2) | s]; + } } } } + + _ConvertScanline(temp, 16, result.format, format, cflags | srgb); + } + + for (int fillBlock = numProcessableBlocks; fillBlock < nBlocksPerChunk; fillBlock++) + { + for (int element = 0; element < NUM_PIXELS_PER_BLOCK; element++) + tempBlocks[fillBlock * NUM_PIXELS_PER_BLOCK + element] = XMVectorSet(0.f, 0.f, 0.f, 0.f); } - _ConvertScanline(temp, 16, result.format, format, cflags | srgb); + uint8_t *pDest = result.pixels + (nbBase*blocksize); - if (pfEncode) - pfEncode(pDest, temp, bcflags); + if (numProcessableBlocks == nBlocksPerChunk) + { + if (pfEncode) + pfEncode(pDest, tempBlocks, bcflags); + else + D3DXEncodeBC1(pDest, tempBlocks, threshold, bcflags); + } else - D3DXEncodeBC1(pDest, temp, threshold, bcflags); + { + uint8_t scratch[MAX_BLOCK_SIZE * MAX_PARALLEL_BLOCKS]; + + if (pfEncode) + pfEncode(scratch, tempBlocks, bcflags); + else + D3DXEncodeBC1(scratch, tempBlocks, threshold, bcflags); + + memcpy(pDest, scratch, numProcessableBlocks * blocksize); + } } return (fail) ? 
E_FAIL : S_OK; diff --git a/DirectXTex/DirectXTex_Desktop_2013.vcxproj b/DirectXTex/DirectXTex_Desktop_2013.vcxproj index 41f0be84..2f9b6bc6 100644 --- a/DirectXTex/DirectXTex_Desktop_2013.vcxproj +++ b/DirectXTex/DirectXTex_Desktop_2013.vcxproj @@ -392,6 +392,7 @@ + diff --git a/DirectXTex/DirectXTex_Desktop_2013.vcxproj.filters b/DirectXTex/DirectXTex_Desktop_2013.vcxproj.filters index 1c9c27eb..5c908b71 100644 --- a/DirectXTex/DirectXTex_Desktop_2013.vcxproj.filters +++ b/DirectXTex/DirectXTex_Desktop_2013.vcxproj.filters @@ -50,6 +50,9 @@ Source Files + + Source Files + Source Files diff --git a/DirectXTex/DirectXTex_Desktop_2015.vcxproj b/DirectXTex/DirectXTex_Desktop_2015.vcxproj index 122b5b6a..89091049 100644 --- a/DirectXTex/DirectXTex_Desktop_2015.vcxproj +++ b/DirectXTex/DirectXTex_Desktop_2015.vcxproj @@ -383,6 +383,7 @@ + diff --git a/DirectXTex/DirectXTex_Desktop_2015.vcxproj.filters b/DirectXTex/DirectXTex_Desktop_2015.vcxproj.filters index 693d4d11..4a1f5abf 100644 --- a/DirectXTex/DirectXTex_Desktop_2015.vcxproj.filters +++ b/DirectXTex/DirectXTex_Desktop_2015.vcxproj.filters @@ -50,6 +50,9 @@ Source Files + + Source Files + Source Files diff --git a/DirectXTex/DirectXTex_Desktop_2015_Win10.vcxproj b/DirectXTex/DirectXTex_Desktop_2015_Win10.vcxproj index 96a9e7b9..85dd219f 100644 --- a/DirectXTex/DirectXTex_Desktop_2015_Win10.vcxproj +++ b/DirectXTex/DirectXTex_Desktop_2015_Win10.vcxproj @@ -392,6 +392,7 @@ + diff --git a/DirectXTex/DirectXTex_Desktop_2015_Win10.vcxproj.filters b/DirectXTex/DirectXTex_Desktop_2015_Win10.vcxproj.filters index 0b3fb322..50b6a2f0 100644 --- a/DirectXTex/DirectXTex_Desktop_2015_Win10.vcxproj.filters +++ b/DirectXTex/DirectXTex_Desktop_2015_Win10.vcxproj.filters @@ -53,6 +53,9 @@ Source Files + + Source Files + Source Files diff --git a/DirectXTex/DirectXTex_Desktop_2017.vcxproj b/DirectXTex/DirectXTex_Desktop_2017.vcxproj index f8df59de..55365b8c 100644 --- a/DirectXTex/DirectXTex_Desktop_2017.vcxproj +++ b/DirectXTex/DirectXTex_Desktop_2017.vcxproj @@ -384,6 +384,7 @@ + diff --git a/DirectXTex/DirectXTex_Desktop_2017.vcxproj.filters b/DirectXTex/DirectXTex_Desktop_2017.vcxproj.filters index 693d4d11..4a1f5abf 100644 --- a/DirectXTex/DirectXTex_Desktop_2017.vcxproj.filters +++ b/DirectXTex/DirectXTex_Desktop_2017.vcxproj.filters @@ -50,6 +50,9 @@ Source Files + + Source Files + Source Files diff --git a/DirectXTex/DirectXTex_Desktop_2017_Win10.vcxproj b/DirectXTex/DirectXTex_Desktop_2017_Win10.vcxproj index 16bc7916..06134523 100644 --- a/DirectXTex/DirectXTex_Desktop_2017_Win10.vcxproj +++ b/DirectXTex/DirectXTex_Desktop_2017_Win10.vcxproj @@ -392,6 +392,7 @@ + diff --git a/DirectXTex/DirectXTex_Desktop_2017_Win10.vcxproj.filters b/DirectXTex/DirectXTex_Desktop_2017_Win10.vcxproj.filters index 0b3fb322..50b6a2f0 100644 --- a/DirectXTex/DirectXTex_Desktop_2017_Win10.vcxproj.filters +++ b/DirectXTex/DirectXTex_Desktop_2017_Win10.vcxproj.filters @@ -53,6 +53,9 @@ Source Files + + Source Files + Source Files diff --git a/DirectXTex/DirectXTex_Windows10.vcxproj b/DirectXTex/DirectXTex_Windows10.vcxproj index db6dd963..9117a7c9 100644 --- a/DirectXTex/DirectXTex_Windows10.vcxproj +++ b/DirectXTex/DirectXTex_Windows10.vcxproj @@ -30,6 +30,7 @@ + diff --git a/DirectXTex/DirectXTex_Windows10.vcxproj.filters b/DirectXTex/DirectXTex_Windows10.vcxproj.filters index 83c5bb1c..7b925935 100644 --- a/DirectXTex/DirectXTex_Windows10.vcxproj.filters +++ b/DirectXTex/DirectXTex_Windows10.vcxproj.filters @@ -44,6 +44,9 @@ Source Files + + Source Files + Source 
Files diff --git a/DirectXTex/DirectXTex_Windows10_2015.vcxproj b/DirectXTex/DirectXTex_Windows10_2015.vcxproj index 60ee4110..ec3f2819 100644 --- a/DirectXTex/DirectXTex_Windows10_2015.vcxproj +++ b/DirectXTex/DirectXTex_Windows10_2015.vcxproj @@ -30,6 +30,7 @@ + diff --git a/DirectXTex/DirectXTex_Windows10_2015.vcxproj.filters b/DirectXTex/DirectXTex_Windows10_2015.vcxproj.filters index 83c5bb1c..7b925935 100644 --- a/DirectXTex/DirectXTex_Windows10_2015.vcxproj.filters +++ b/DirectXTex/DirectXTex_Windows10_2015.vcxproj.filters @@ -44,6 +44,9 @@ Source Files + + Source Files + Source Files diff --git a/DirectXTex/DirectXTex_Windows81.vcxproj b/DirectXTex/DirectXTex_Windows81.vcxproj index 9741a4f4..918a1155 100644 --- a/DirectXTex/DirectXTex_Windows81.vcxproj +++ b/DirectXTex/DirectXTex_Windows81.vcxproj @@ -610,6 +610,7 @@ + diff --git a/DirectXTex/DirectXTex_Windows81.vcxproj.filters b/DirectXTex/DirectXTex_Windows81.vcxproj.filters index db38ee66..d1a195a4 100644 --- a/DirectXTex/DirectXTex_Windows81.vcxproj.filters +++ b/DirectXTex/DirectXTex_Windows81.vcxproj.filters @@ -58,6 +58,9 @@ Source Files + + Source Files + Source Files diff --git a/DirectXTex/DirectXTex_WindowsPhone81.vcxproj b/DirectXTex/DirectXTex_WindowsPhone81.vcxproj index cabc65b8..2889c94b 100644 --- a/DirectXTex/DirectXTex_WindowsPhone81.vcxproj +++ b/DirectXTex/DirectXTex_WindowsPhone81.vcxproj @@ -156,6 +156,7 @@ + diff --git a/DirectXTex/DirectXTex_WindowsPhone81.vcxproj.filters b/DirectXTex/DirectXTex_WindowsPhone81.vcxproj.filters index c232e025..669e689f 100644 --- a/DirectXTex/DirectXTex_WindowsPhone81.vcxproj.filters +++ b/DirectXTex/DirectXTex_WindowsPhone81.vcxproj.filters @@ -10,6 +10,9 @@ Source Files + + Source Files + Source Files diff --git a/DirectXTex/DirectXTex_XboxOneXDK_2015.vcxproj b/DirectXTex/DirectXTex_XboxOneXDK_2015.vcxproj index 0cd492b7..efac4d0b 100644 --- a/DirectXTex/DirectXTex_XboxOneXDK_2015.vcxproj +++ b/DirectXTex/DirectXTex_XboxOneXDK_2015.vcxproj @@ -45,6 +45,7 @@ + diff --git a/DirectXTex/DirectXTex_XboxOneXDK_2015.vcxproj.filters b/DirectXTex/DirectXTex_XboxOneXDK_2015.vcxproj.filters index c06b9016..48e7f45b 100644 --- a/DirectXTex/DirectXTex_XboxOneXDK_2015.vcxproj.filters +++ b/DirectXTex/DirectXTex_XboxOneXDK_2015.vcxproj.filters @@ -108,6 +108,9 @@ Source Files + + Source Files + Source Files diff --git a/DirectXTex/DirectXTex_XboxOneXDK_2017.vcxproj b/DirectXTex/DirectXTex_XboxOneXDK_2017.vcxproj index 1dbc749a..5d81de4c 100644 --- a/DirectXTex/DirectXTex_XboxOneXDK_2017.vcxproj +++ b/DirectXTex/DirectXTex_XboxOneXDK_2017.vcxproj @@ -45,6 +45,7 @@ + diff --git a/DirectXTex/DirectXTex_XboxOneXDK_2017.vcxproj.filters b/DirectXTex/DirectXTex_XboxOneXDK_2017.vcxproj.filters index c06b9016..48e7f45b 100644 --- a/DirectXTex/DirectXTex_XboxOneXDK_2017.vcxproj.filters +++ b/DirectXTex/DirectXTex_XboxOneXDK_2017.vcxproj.filters @@ -108,6 +108,9 @@ Source Files + + Source Files + Source Files From 9078067d215e461a56735409c01ac29383f2342f Mon Sep 17 00:00:00 2001 From: elasota Date: Fri, 6 Apr 2018 00:42:29 -0400 Subject: [PATCH 2/7] Replaced Squish code, fixed BC_FLAG_USE_3SUBSETS working incorrectly, moved the many safe-denominator switches to a function, fixed a bunch of math to handle channel weights more accurately. 
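As a reading aid for this commit, the following is a minimal scalar sketch of the two ideas named above: the shared safe-denominator helper and channel-weighted squared error. WeightedSquaredError and the main() driver are names invented for the sketch; only MakeSafeDenominator and the perceptual weight constants correspond to code in the diff below, and the real implementations operate on ParallelMath's SIMD types rather than plain floats.

#include <cstdio>

// Scalar version of the safe-denominator idiom: a zero denominator is
// replaced with 1.0f so the following divide becomes a no-op instead of
// producing inf/NaN.
static void MakeSafeDenominator(float& v)
{
    if (v == 0.f)
        v = 1.0f;
}

// Channel-weighted squared error: each channel's squared difference is
// scaled by that channel's weight before being accumulated.
static float WeightedSquaredError(const float reconstructed[4], const float original[4], const float weights[4])
{
    float error = 0.f;
    for (int ch = 0; ch < 4; ch++)
    {
        float diff = reconstructed[ch] - original[ch];
        error += diff * diff * weights[ch];
    }
    return error;
}

int main()
{
    // Same perceptual weights the non-uniform path uses (alpha weighted as 1).
    const float perceptualWeights[4] = { 0.2125f / 0.7154f, 1.0f, 0.0721f / 0.7154f, 1.0f };

    const float original[4]      = { 128.f, 64.f, 32.f, 255.f };
    const float reconstructed[4] = { 126.f, 66.f, 30.f, 255.f };

    float denom = 0.f;          // e.g. the total weight of an empty partition subset
    MakeSafeDenominator(denom); // becomes 1.0f, so a later "centroid / denom" is harmless

    printf("weighted error = %f (denom = %f)\n",
        WeightedSquaredError(reconstructed, original, perceptualWeights), denom);
    return 0;
}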
--- DirectXTex/BC7Parallel.cpp | 635 +++++++++++++++++-------------------- 1 file changed, 287 insertions(+), 348 deletions(-) diff --git a/DirectXTex/BC7Parallel.cpp b/DirectXTex/BC7Parallel.cpp index 6854474a..75a22975 100644 --- a/DirectXTex/BC7Parallel.cpp +++ b/DirectXTex/BC7Parallel.cpp @@ -27,31 +27,6 @@ Licensed under the MIT License. http://go.microsoft.com/fwlink/?LinkId=248926 - - ------------------------------------------------------------------------------------- - - Contains portions of libsquish - - Copyright (c) 2006 Simon Brown si@sjbrown.co.uk - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be included - in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS - OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #include "directxtexp.h" @@ -337,6 +312,11 @@ namespace dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i])); } + static void MakeSafeDenominator(Float& v) + { + ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f)); + } + static Int16 Min(Int16 a, Int16 b) { Int16 result; @@ -533,6 +513,12 @@ namespace dest = src; } + static void MakeSafeDenominator(float& v) + { + if (v == 0.f) + v = 1.0f; + } + template inline static T Select(bool flag, T a, T b) { @@ -588,11 +574,13 @@ namespace inline static int16_t ExtractUInt16(int16_t v, int offset) { + (void)offset; return v; } inline static float ExtractFloat(float v, int offset) { + (void)offset; return v; } @@ -706,8 +694,22 @@ namespace typedef ParallelMath::Float MFloat; typedef ParallelMath::Int16 MInt16; - MFloat m_base[TVectorSize]; - MFloat m_offset[TVectorSize]; + UnfinishedEndpoints() + { + } + + UnfinishedEndpoints(const MFloat base[TVectorSize], const MFloat offset[TVectorSize]) + { + for (int ch = 0; ch < TVectorSize; ch++) + m_base[ch] = base[ch]; + for (int ch = 0; ch < TVectorSize; ch++) + m_offset[ch] = offset[ch]; + } + + UnfinishedEndpoints(const UnfinishedEndpoints& other) + : UnfinishedEndpoints(other.m_base, other.m_offset) + { + } void Finish(int tweak, int bits, MInt16* outEP0, MInt16* outEP1) { @@ -722,318 +724,204 @@ namespace outEP1[ch] = ParallelMath::FloatToUInt16(ep1f); } } + + private: + MFloat m_base[TVectorSize]; + MFloat m_offset[TVectorSize]; }; - class EndpointSelectorRGBA + template + class PackedCovarianceMatrix { public: - static const int NumPasses = 3; - static const int NumPowerIterations = 8; + // 0: xx, + // 1: xy, yy + // 3: xz, yz, zz + // 6: xw, yw, zw, ww + // ... etc. 
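+        // Worked example of the layout above: with TMatrixSize = 4 the packed
+        // storage holds the lower triangle of the symmetric covariance matrix,
+        //
+        //     m_values = { xx, xy, yy, xz, yz, zz, xw, yw, zw, ww }
+        //
+        // so element (row, col) with col <= row lives at index
+        // row * (row + 1) / 2 + col, and the mirrored element for col > row
+        // is read from index col * (col + 1) / 2 + row.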
+ static const int PyramidSize = (TMatrixSize * (TMatrixSize + 1)) / 2; typedef ParallelMath::Float MFloat; - typedef ParallelMath::Int16 MInt16; - MFloat m_total[4]; - MFloat m_ctr[4]; - MFloat m_axis[4]; - MFloat m_xx; - MFloat m_xy; - MFloat m_xz; - MFloat m_xw; - MFloat m_yy; - MFloat m_yz; - MFloat m_yw; - MFloat m_zz; - MFloat m_zw; - MFloat m_ww; - MFloat m_minDist; - MFloat m_maxDist; - - EndpointSelectorRGBA() + PackedCovarianceMatrix() { - for (int i = 0; i < 4; i++) - { - m_total[i] = ParallelMath::MakeFloatZero(); - m_ctr[i] = ParallelMath::MakeFloatZero(); - m_axis[i] = ParallelMath::MakeFloatZero(); - } - m_xx = ParallelMath::MakeFloatZero(); - m_xy = ParallelMath::MakeFloatZero(); - m_xz = ParallelMath::MakeFloatZero(); - m_xw = ParallelMath::MakeFloatZero(); - m_yy = ParallelMath::MakeFloatZero(); - m_yz = ParallelMath::MakeFloatZero(); - m_yw = ParallelMath::MakeFloatZero(); - m_zz = ParallelMath::MakeFloatZero(); - m_zw = ParallelMath::MakeFloatZero(); - m_ww = ParallelMath::MakeFloatZero(); - m_minDist = ParallelMath::MakeFloat(1000.0f); - m_maxDist = ParallelMath::MakeFloat(-1000.0f); - } - - void InitPass(int step) - { - if (step == 1) - { - for (int i = 0; i < 4; i++) - m_ctr[i] = m_ctr[i] / ParallelMath::Max(m_total[i], ParallelMath::MakeFloat(0.0001f)); - } - else if (step == 2) - { - MFloat matrix[4][4] = - { - { m_xx, m_xy, m_xz, m_xw }, - { m_xy, m_yy, m_yz, m_yw }, - { m_xz, m_yz, m_zz, m_zw }, - { m_xw, m_yw, m_zw, m_ww } - }; - - MFloat v[4] = { ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(1.0f) }; - for (int p = 0; p < NumPowerIterations; p++) - { - // matrix multiply - MFloat w[4]; - for (int i = 0; i < 4; i++) - { - w[i] = matrix[0][i] * v[0]; - for (int row = 1; row < 4; row++) - w[i] = w[i] + matrix[row][i] * v[row]; - } - - MFloat a = ParallelMath::Max(w[0], ParallelMath::Max(w[1], ParallelMath::Max(w[2], w[3]))); - - ParallelMath::FloatCompFlag aZero = ParallelMath::Equal(a, ParallelMath::MakeFloatZero()); - - ParallelMath::ConditionalSet(a, aZero, ParallelMath::MakeFloat(1.0f)); - - for (int c = 0; c < 4; c++) - v[c] = w[c] / a; - } - - MFloat vlen = ParallelMath::Sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2] + v[3] * v[3]); - - ParallelMath::FloatCompFlag vZero = ParallelMath::Equal(vlen, ParallelMath::MakeFloatZero()); - ParallelMath::ConditionalSet(vlen, vZero, ParallelMath::MakeFloat(1.0f)); - - for (int i = 0; i < 4; i++) - m_axis[i] = v[i] / vlen; - } + for (int i = 0; i < PyramidSize; i++) + m_values[i] = ParallelMath::MakeFloatZero(); } - void Contribute(int step, const MInt16* pixel, MFloat weight) + void Add(const ParallelMath::Float vec[TMatrixSize], ParallelMath::Float weight) { - MFloat pt[4]; - for (int i = 0; i < 4; i++) - pt[i] = ParallelMath::UInt16ToFloat(pixel[i]); - - if (step == 0) + int index = 0; + for (int row = 0; row < TMatrixSize; row++) { - for (int i = 0; i < 4; i++) + for (int col = 0; col <= row; col++) { - m_total[i] = m_total[i] + weight; - m_ctr[i] = m_ctr[i] + weight * pt[i]; + m_values[index] = m_values[index] + vec[row] * vec[col] * weight; + index++; } } - else if (step == 1) + } + + void Product(MFloat outVec[TMatrixSize], const MFloat inVec[TMatrixSize]) + { + for (int row = 0; row < TMatrixSize; row++) { - MFloat a[4]; - MFloat b[4]; + MFloat sum = ParallelMath::MakeFloatZero(); - for (int i = 0; i < 4; i++) + int index = (row * (row + 1)) >> 1; + for (int col = 0; col < TMatrixSize; col++) { - a[i] = pt[i] - m_ctr[i]; - b[i] = weight * a[i]; 
+ sum = sum + inVec[col] * m_values[index]; + if (col >= row) + index += col; + else + index++; } - m_xx = m_xx + a[0] * b[0]; - m_xy = m_xy + a[0] * b[1]; - m_xz = m_xz + a[0] * b[2]; - m_xw = m_xw + a[0] * b[3]; - m_yy = m_yy + a[1] * b[1]; - m_yz = m_yz + a[1] * b[2]; - m_yw = m_yw + a[1] * b[3]; - m_zz = m_zz + a[2] * b[2]; - m_zw = m_zw + a[2] * b[3]; - m_ww = m_ww + a[3] * b[3]; - } - else if (step == 2) - { - MFloat diff[4]; - for (int i = 0; i < 4; i++) - diff[i] = pt[i] - m_ctr[i]; - - MFloat dist = diff[0] * m_axis[0] + diff[1] * m_axis[1] + diff[2] * m_axis[2] + diff[3] * m_axis[3]; - m_minDist = ParallelMath::Min(dist, m_minDist); - m_maxDist = ParallelMath::Max(dist, m_maxDist); + outVec[row] = sum; } } - UnfinishedEndpoints<4> GetEndpoints() const - { - MFloat len = m_maxDist - m_minDist; - - UnfinishedEndpoints<4> result; - for (int i = 0; i < 4; i++) - { - result.m_base[i] = m_ctr[i] + m_axis[i] * m_minDist; - result.m_offset[i] = m_axis[i] * len; - } - return result; - } + private: + ParallelMath::Float m_values[PyramidSize]; }; + static const int NumEndpointSelectorPasses = 3; - class EndpointSelectorRGB + template + class EndpointSelector { public: - static const int NumPasses = 3; - static const int NumPowerIterations = 8; - typedef ParallelMath::Float MFloat; - typedef ParallelMath::Int16 MInt16; - MFloat m_total[3]; - MFloat m_ctr[3]; - MFloat m_axis[3]; - MFloat m_xx; - MFloat m_xy; - MFloat m_xz; - MFloat m_xw; - MFloat m_yy; - MFloat m_yz; - MFloat m_yw; - MFloat m_zz; - MFloat m_zw; - MFloat m_ww; - MFloat m_minDist; - MFloat m_maxDist; - - EndpointSelectorRGB() - { - for (int i = 0; i < 3; i++) - { - m_total[i] = ParallelMath::MakeFloatZero(); - m_ctr[i] = ParallelMath::MakeFloatZero(); - m_axis[i] = ParallelMath::MakeFloatZero(); - } - m_xx = ParallelMath::MakeFloatZero(); - m_xy = ParallelMath::MakeFloatZero(); - m_xz = ParallelMath::MakeFloatZero(); - m_xw = ParallelMath::MakeFloatZero(); - m_yy = ParallelMath::MakeFloatZero(); - m_yz = ParallelMath::MakeFloatZero(); - m_yw = ParallelMath::MakeFloatZero(); - m_zz = ParallelMath::MakeFloatZero(); - m_zw = ParallelMath::MakeFloatZero(); - m_ww = ParallelMath::MakeFloatZero(); - m_minDist = ParallelMath::MakeFloat(1000.0f); - m_maxDist = ParallelMath::MakeFloat(-1000.0f); - } - - void InitPass(int step) - { - if (step == 1) + EndpointSelector() + { + for (int ch = 0; ch < TVectorSize; ch++) { - for (int i = 0; i < 3; i++) - m_ctr[i] = m_ctr[i] / ParallelMath::Max(m_total[i], ParallelMath::MakeFloat(0.0001f)); + m_centroid[ch] = ParallelMath::MakeFloatZero(); + m_direction[ch] = ParallelMath::MakeFloatZero(); } - else if (step == 2) + m_weightTotal = ParallelMath::MakeFloatZero(); + m_minDist = ParallelMath::MakeFloat(FLT_MAX); + m_maxDist = ParallelMath::MakeFloat(-FLT_MAX); + } + + void ContributePass(const MFloat value[TVectorSize], int pass, MFloat weight) + { + if (pass == 0) + ContributeCentroid(value, weight); + else if (pass == 1) + ContributeDirection(value, weight); + else if (pass == 2) + ContributeMinMax(value); + } + + void FinishPass(int pass) + { + if (pass == 0) + FinishCentroid(); + else if (pass == 1) + FinishDirection(); + } + + UnfinishedEndpoints GetEndpoints(const float channelWeights[TVectorSize]) const + { + MFloat unweightedBase[TVectorSize]; + MFloat unweightedOffset[TVectorSize]; + + for (int ch = 0; ch < TVectorSize; ch++) { - MFloat matrix[3][3] = - { - { m_xx, m_xy, m_xz }, - { m_xy, m_yy, m_yz }, - { m_xz, m_yz, m_zz }, - }; + MFloat min = m_centroid[ch] + m_direction[ch] * 
m_minDist; + MFloat max = m_centroid[ch] + m_direction[ch] * (m_maxDist - m_minDist); - MFloat v[3] = { ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(1.0f), ParallelMath::MakeFloat(1.0f) }; - for (int p = 0; p < NumPowerIterations; p++) - { - // matrix multiply - MFloat w[3]; - for (int i = 0; i < 3; i++) - { - w[i] = matrix[0][i] * v[0]; - for (int row = 1; row < 3; row++) - w[i] = w[i] + matrix[row][i] * v[row]; - } + float safeWeight = channelWeights[ch]; + if (safeWeight == 0.f) + safeWeight = 1.0f; - MFloat a = ParallelMath::Max(w[0], ParallelMath::Max(w[1], w[2])); + unweightedBase[ch] = min / channelWeights[ch]; + unweightedOffset[ch] = (max - min) / channelWeights[ch]; + } - ParallelMath::FloatCompFlag aZero = ParallelMath::Equal(a, ParallelMath::MakeFloatZero()); + return UnfinishedEndpoints(unweightedBase, unweightedOffset); + } - ParallelMath::ConditionalSet(a, aZero, ParallelMath::MakeFloat(1.0f)); + private: + void ContributeCentroid(const MFloat value[TVectorSize], MFloat weight) + { + for (int ch = 0; ch < TVectorSize; ch++) + m_centroid[ch] = m_centroid[ch] + value[ch] * weight; + m_weightTotal = m_weightTotal + weight; + } - for (int c = 0; c < 3; c++) - v[c] = w[c] / a; - } + void FinishCentroid() + { + MFloat denom = m_weightTotal; + ParallelMath::MakeSafeDenominator(denom); - MFloat vlen = ParallelMath::Sqrt(v[0] * v[0] + v[1] * v[1] + v[2] * v[2]); + for (int ch = 0; ch < TVectorSize; ch++) + m_centroid[ch] = m_centroid[ch] / denom; + } - ParallelMath::FloatCompFlag vZero = ParallelMath::Equal(vlen, ParallelMath::MakeFloatZero()); - ParallelMath::ConditionalSet(vlen, vZero, ParallelMath::MakeFloat(1.0f)); + void ContributeDirection(const MFloat value[TVectorSize], MFloat weight) + { + MFloat diff[TVectorSize]; + for (int ch = 0; ch < TVectorSize; ch++) + diff[ch] = value[ch] - m_centroid[ch]; - for (int i = 0; i < 3; i++) - m_axis[i] = v[i] / vlen; - } + m_covarianceMatrix.Add(diff, weight); } - void Contribute(int step, const MInt16* pixel, MFloat weight) + void FinishDirection() { - MFloat pt[3]; - for (int i = 0; i < 3; i++) - pt[i] = ParallelMath::UInt16ToFloat(pixel[i]); + MFloat approx[TVectorSize]; + for (int ch = 0; ch < TVectorSize; ch++) + approx[ch] = ParallelMath::MakeFloat(1.0f); - if (step == 0) + for (int i = 0; i < TIterationCount; i++) { - for (int i = 0; i < 3; i++) - { - m_total[i] = m_total[i] + weight; - m_ctr[i] = m_ctr[i] + weight * pt[i]; - } - } - else if (step == 1) - { - MFloat a[3]; - MFloat b[3]; + MFloat product[TVectorSize]; + m_covarianceMatrix.Product(product, approx); - for (int i = 0; i < 3; i++) - { - a[i] = pt[i] - m_ctr[i]; - b[i] = weight * a[i]; - } + MFloat largestComponent = product[0]; + for (int ch = 1; ch < TVectorSize; ch++) + largestComponent = ParallelMath::Max(largestComponent, product[ch]); - m_xx = m_xx + a[0] * b[0]; - m_xy = m_xy + a[0] * b[1]; - m_xz = m_xz + a[0] * b[2]; - m_yy = m_yy + a[1] * b[1]; - m_yz = m_yz + a[1] * b[2]; - m_zz = m_zz + a[2] * b[2]; + // product = largestComponent*newApprox + ParallelMath::MakeSafeDenominator(largestComponent); + for (int ch = 0; ch < TVectorSize; ch++) + approx[ch] = product[ch] / largestComponent; } - else if (step == 2) - { - MFloat diff[3]; - for (int i = 0; i < 3; i++) - diff[i] = pt[i] - m_ctr[i]; - MFloat dist = diff[0] * m_axis[0] + diff[1] * m_axis[1] + diff[2] * m_axis[2]; - m_minDist = ParallelMath::Min(dist, m_minDist); - m_maxDist = ParallelMath::Max(dist, m_maxDist); - } + // Normalize + MFloat approxLen = ParallelMath::MakeFloatZero(); + for (int ch = 
0; ch < TVectorSize; ch++) + approxLen = approxLen + approx[ch] * approx[ch]; + + approxLen = ParallelMath::Sqrt(approxLen); + + ParallelMath::MakeSafeDenominator(approxLen); + + for (int ch = 0; ch < TVectorSize; ch++) + m_direction[ch] = approx[ch] / approxLen; } - UnfinishedEndpoints<3> GetEndpoints() const + void ContributeMinMax(const MFloat value[TVectorSize]) { - MFloat len = m_maxDist - m_minDist; + MFloat dist = ParallelMath::MakeFloatZero(); + for (int ch = 0; ch < TVectorSize; ch++) + dist = dist + m_direction[ch] * (value[ch] - m_centroid[ch]); - UnfinishedEndpoints<3> result; - for (int i = 0; i < 3; i++) - { - result.m_base[i] = m_ctr[i] + m_axis[i] * m_minDist; - result.m_offset[i] = m_axis[i] * len; - } - return result; + m_minDist = ParallelMath::Min(m_minDist, dist); + m_maxDist = ParallelMath::Max(m_maxDist, dist); } + + ParallelMath::Float m_centroid[TVectorSize]; + ParallelMath::Float m_direction[TVectorSize]; + PackedCovarianceMatrix m_covarianceMatrix; + ParallelMath::Float m_weightTotal; + + ParallelMath::Float m_minDist; + ParallelMath::Float m_maxDist; }; template @@ -1043,14 +931,28 @@ namespace typedef ParallelMath::Float MFloat; typedef ParallelMath::Int16 MInt16; - MInt16 m_endPoint[2][TVectorSize]; - int m_prec; - float m_maxValue; - MFloat m_origin[TVectorSize]; - MFloat m_axis[TVectorSize]; - - void Init(MInt16 endPoint[2][TVectorSize], int prec) + void Init(const float channelWeights[TVectorSize], MInt16 endPoint[2][TVectorSize], int prec) { + m_isUniform = true; + for (int ch = 1; ch < TVectorSize; ch++) + { + if (channelWeights[ch] != channelWeights[0]) + m_isUniform = false; + } + + // To work with channel weights, we need something where: + // pxDiff = px - ep[0] + // epDiff = ep[1] - ep[0] + // + // weightedEPDiff = epDiff * channelWeights + // normalizedWeightedAxis = weightedEPDiff / len(weightedEPDiff) + // normalizedIndex = dot(pxDiff * channelWeights, normalizedWeightedAxis) / len(weightedEPDiff) + // index = normalizedIndex * maxValue + // + // Equivalent to: + // axis = channelWeights * maxValue * epDiff * channelWeights / lenSquared(epDiff * channelWeights) + // index = dot(axis, pxDiff) + for (int ep = 0; ep < 2; ep++) for (int ch = 0; ch < TVectorSize; ch++) m_endPoint[ep][ch] = endPoint[ep][ch]; @@ -1058,24 +960,22 @@ namespace m_prec = prec; m_maxValue = static_cast((1 << m_prec) - 1); - MFloat axis[TVectorSize]; + MFloat epDiffWeighted[TVectorSize]; for (int ch = 0; ch < TVectorSize; ch++) { m_origin[ch] = ParallelMath::UInt16ToFloat(endPoint[0][ch]); - axis[ch] = ParallelMath::UInt16ToFloat(endPoint[1][ch]) - m_origin[ch]; + epDiffWeighted[ch] = (ParallelMath::UInt16ToFloat(endPoint[1][ch]) - m_origin[ch]) * channelWeights[ch]; } - MFloat lenSquared = axis[0] * axis[0]; + MFloat lenSquared = epDiffWeighted[0] * epDiffWeighted[0]; for (int ch = 1; ch < TVectorSize; ch++) - lenSquared = lenSquared + axis[ch] * axis[ch]; - - ParallelMath::FloatCompFlag lenSquaredZero = ParallelMath::Equal(lenSquared, ParallelMath::MakeFloatZero()); + lenSquared = lenSquared + epDiffWeighted[ch] * epDiffWeighted[ch]; - ParallelMath::ConditionalSet(lenSquared, lenSquaredZero, ParallelMath::MakeFloat(1.0f)); + ParallelMath::MakeSafeDenominator(lenSquared); for (int ch = 0; ch < TVectorSize; ch++) - m_axis[ch] = (axis[ch] / lenSquared) * m_maxValue; + m_axis[ch] = epDiffWeighted[ch] * (m_maxValue * channelWeights[ch]) / lenSquared; } void Reconstruct(MInt16 index, MInt16* pixel) @@ -1106,6 +1006,14 @@ namespace return 
ParallelMath::FloatToUInt16(ParallelMath::Clamp(dist, 0.0f, m_maxValue)); } + + private: + MInt16 m_endPoint[2][TVectorSize]; + MFloat m_origin[TVectorSize]; + MFloat m_axis[TVectorSize]; + int m_prec; + float m_maxValue; + bool m_isUniform; }; // Solve for a, b where v = a*t + b @@ -1127,8 +1035,9 @@ namespace MFloat m_w; float m_maxIndex; + float m_channelWeights[TVectorSize]; - void Init(int indexBits) + void Init(int indexBits, const float channelWeights[TVectorSize]) { for (int ch = 0; ch < TVectorSize; ch++) { @@ -1140,6 +1049,9 @@ namespace m_w = ParallelMath::MakeFloatZero(); m_maxIndex = static_cast((1 << indexBits) - 1); + + for (int ch = 0; ch < TVectorSize; ch++) + m_channelWeights[ch] = channelWeights[ch]; } void Contribute(const MInt16* pixel, MInt16 index, MFloat weight) @@ -1147,7 +1059,7 @@ namespace MFloat v[TVectorSize]; for (int ch = 0; ch < TVectorSize; ch++) - v[ch] = ParallelMath::UInt16ToFloat(pixel[ch]); + v[ch] = ParallelMath::UInt16ToFloat(pixel[ch]) * m_channelWeights[ch]; MFloat t = ParallelMath::UInt16ToFloat(index) / m_maxIndex; @@ -1165,9 +1077,9 @@ namespace { // a = (tv - t*v/w)/(tt - t*t/w) // b = (v - a*t)/w - ParallelMath::FloatCompFlag wZero = ParallelMath::Equal(m_w, ParallelMath::MakeFloatZero()); + MFloat w = m_w; - MFloat w = ParallelMath::Select(wZero, ParallelMath::MakeFloat(1.0f), m_w); + ParallelMath::MakeSafeDenominator(w); MFloat adenom = (m_tt - m_t * m_t / w); @@ -1191,14 +1103,19 @@ namespace MFloat a = (m_tv[ch] - m_t * m_v[ch] / w) / adenom; MFloat b = (m_v[ch] - a * m_t) / w; - MFloat p1 = ParallelMath::Clamp(b, 0.0f, 255.0f); - MFloat p2 = ParallelMath::Clamp(a + b, 0.0f, 255.0f); + MFloat p1 = b; + MFloat p2 = a + b; ParallelMath::ConditionalSet(p1, adenomZero, (m_v[ch] / w)); ParallelMath::ConditionalSet(p2, adenomZero, p1); - endPoint[0][ch] = ParallelMath::FloatToUInt16(p1); - endPoint[1][ch] = ParallelMath::FloatToUInt16(p2); + // Unweight + float inverseWeight = m_channelWeights[ch]; + if (inverseWeight == 0.f) + inverseWeight = 1.f; + + endPoint[0][ch] = ParallelMath::FloatToUInt16(ParallelMath::Clamp(p1 / inverseWeight, 0.f, 255.0f)); + endPoint[1][ch] = ParallelMath::FloatToUInt16(ParallelMath::Clamp(p2 / inverseWeight, 0.f, 255.0f)); } } }; @@ -1362,7 +1279,8 @@ namespace } } - static MFloat ComputeError(DWORD flags, const MInt16 reconstructed[4], const MInt16 original[4]) + template + static MFloat ComputeError(DWORD flags, const MInt16 reconstructed[TVectorSize], const MInt16 original[TVectorSize], const float channelWeights[TVectorSize]) { MFloat error = ParallelMath::MakeFloatZero(); if (flags & BC_FLAGS_UNIFORM) @@ -1372,22 +1290,31 @@ namespace } else { - const float perceptualWeights[4] = { 0.2125f / 0.7154f, 1.0f, 0.0721f / 0.7154f, 1.0f }; for (int ch = 0; ch < 4; ch++) - error = error + ParallelMath::UInt16ToFloat(ParallelMath::SqDiff(reconstructed[ch], original[ch])) * ParallelMath::MakeFloat(perceptualWeights[ch]); + error = error + ParallelMath::UInt16ToFloat(ParallelMath::SqDiff(reconstructed[ch], original[ch])) * ParallelMath::MakeFloat(channelWeights[ch]); } return error; } - static void TrySinglePlane(DWORD flags, const MInt16 pixels[16][4], WorkInfo& work) + template + static void PreWeightPixels(MFloat preWeightedPixels[16][TChannelCount], const MInt16 pixels[16][TChannelCount], const float channelWeights[TChannelCount]) + { + for (int px = 0; px < 16; px++) + { + for (int ch = 0; ch < TChannelCount; ch++) + preWeightedPixels[px][ch] = ParallelMath::UInt16ToFloat(pixels[px][ch]) * channelWeights[ch]; + } + 
} + + static void TrySinglePlane(DWORD flags, const MInt16 pixels[16][4], const float channelWeights[4], WorkInfo& work) { for (uint16_t mode = 0; mode <= 7; mode++) { if ((flags & BC_FLAGS_FORCE_BC7_MODE6) && mode != 6) continue; - if ((flags & BC_FLAGS_USE_3SUBSETS) && g_modes[mode].m_numSubsets == 3) + if (!(flags & BC_FLAGS_USE_3SUBSETS) && g_modes[mode].m_numSubsets == 3) continue; if (mode == 4 || mode == 5) @@ -1417,12 +1344,13 @@ namespace for (uint16_t partition = 0; partition < numPartitions; partition++) { - EndpointSelectorRGBA epSelectors[3]; + EndpointSelector<4, 8> epSelectors[3]; - for (int epPass = 0; epPass < EndpointSelectorRGBA::NumPasses; epPass++) + for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++) { - for (int subset = 0; subset < numSubsets; subset++) - epSelectors[subset].InitPass(epPass); + MFloat preWeightedPixels[16][4]; + + PreWeightPixels<4>(preWeightedPixels, rgbAdjustedPixels, channelWeights); for (int px = 0; px < 16; px++) { @@ -1434,13 +1362,16 @@ namespace assert(subset < 3); - epSelectors[subset].Contribute(epPass, rgbAdjustedPixels[px], ParallelMath::MakeFloat(1.0f)); + epSelectors[subset].ContributePass(preWeightedPixels[px], epPass, ParallelMath::MakeFloat(1.0f)); } + + for (int subset = 0; subset < numSubsets; subset++) + epSelectors[subset].FinishPass(epPass); } UnfinishedEndpoints<4> unfinishedEPs[3]; for (int subset = 0; subset < numSubsets; subset++) - unfinishedEPs[subset] = epSelectors[subset].GetEndpoints(); + unfinishedEPs[subset] = epSelectors[subset].GetEndpoints(channelWeights); MInt16 bestIndexes[16]; MInt16 bestEP[3][2][4]; @@ -1504,12 +1435,12 @@ namespace IndexSelector<4> indexSelectors[3]; for (int subset = 0; subset < numSubsets; subset++) - indexSelectors[subset].Init(ep[subset], indexPrec); + indexSelectors[subset].Init(channelWeights, ep[subset], indexPrec); EndpointRefiner<4> epRefiners[3]; for (int subset = 0; subset < numSubsets; subset++) - epRefiners[subset].Init(indexPrec); + epRefiners[subset].Init(indexPrec, channelWeights); MFloat subsetError[3] = { ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero(), ParallelMath::MakeFloatZero() }; @@ -1533,7 +1464,7 @@ namespace indexSelectors[subset].Reconstruct(index, reconstructed); - subsetError[subset] = subsetError[subset] + ComputeError(flags, reconstructed, pixels[px]); + subsetError[subset] = subsetError[subset] + ComputeError<4>(flags, reconstructed, pixels[px], channelWeights); indexes[px] = index; } @@ -1606,8 +1537,14 @@ namespace } } - static void TryDualPlane(DWORD flags, const MInt16 pixels[16][4], WorkInfo& work) + static void TryDualPlane(DWORD flags, const MInt16 pixels[16][4], const float channelWeights[4], WorkInfo& work) { + // TODO: These error calculations are not optimal for weight-by-alpha, but this routine needs to be mostly rewritten for that. + // The alpha/color solutions are co-dependent in that case, but a good way to solve it would probably be to + // solve the alpha channel first, then solve the RGB channels, which in turn breaks down into two cases: + // - Separate alpha channel, then weighted RGB + // - Alpha+2 other channels, then the independent channel + if (flags & BC_FLAGS_FORCE_BC7_MODE6) return; // Mode 6 is not a dual-plane mode, skip it @@ -1631,15 +1568,24 @@ namespace uint16_t maxIndexSelector = (mode == 4) ? 
2 : 1; + float rotatedRGBWeights[3] = { channelWeights[redChannel], channelWeights[greenChannel], channelWeights[blueChannel] }; + float rotatedAlphaWeight[1] = { channelWeights[alphaChannel] }; + + float uniformWeight[1] = { 1.0f }; // Since the alpha channel is independent, there's no need to bother with weights when doing refinement or selection, only error + + MFloat preWeightedRotatedRGB[16][3]; + PreWeightPixels<3>(preWeightedRotatedRGB, rotatedRGB, rotatedRGBWeights); + for (uint16_t indexSelector = 0; indexSelector < maxIndexSelector; indexSelector++) { - EndpointSelectorRGB rgbSelector; + EndpointSelector<3, 8> rgbSelector; - for (int epPass = 0; epPass < EndpointSelectorRGB::NumPasses; epPass++) + for (int epPass = 0; epPass < NumEndpointSelectorPasses; epPass++) { - rgbSelector.InitPass(epPass); for (int px = 0; px < 16; px++) - rgbSelector.Contribute(epPass, rotatedRGB[px], ParallelMath::MakeFloat(1.0f)); + rgbSelector.ContributePass(preWeightedRotatedRGB[px], epPass, ParallelMath::MakeFloat(1.0f)); + + rgbSelector.FinishPass(epPass); } MInt16 alphaRange[2]; @@ -1662,7 +1608,7 @@ namespace else rgbPrec = alphaPrec = 2; - UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(); + UnfinishedEndpoints<3> unfinishedRGB = rgbSelector.GetEndpoints(rotatedRGBWeights); MFloat bestRGBError = ParallelMath::MakeFloat(FLT_MAX); MFloat bestAlphaError = ParallelMath::MakeFloat(FLT_MAX); @@ -1690,20 +1636,21 @@ namespace else CompressEndpoints5(rgbEP, alphaEP); + IndexSelector<1> alphaIndexSelector; IndexSelector<3> rgbIndexSelector; { MInt16 alphaEPTemp[2][1] = { { alphaEP[0] },{ alphaEP[1] } }; - alphaIndexSelector.Init(alphaEPTemp, alphaPrec); + alphaIndexSelector.Init(uniformWeight, alphaEPTemp, alphaPrec); } - rgbIndexSelector.Init(rgbEP, rgbPrec); + rgbIndexSelector.Init(rotatedRGBWeights, rgbEP, rgbPrec); EndpointRefiner<3> rgbRefiner; EndpointRefiner<1> alphaRefiner; - rgbRefiner.Init(rgbPrec); - alphaRefiner.Init(alphaPrec); + rgbRefiner.Init(rgbPrec, rotatedRGBWeights); + alphaRefiner.Init(alphaPrec, uniformWeight); MFloat errorRGB = ParallelMath::MakeFloatZero(); MFloat errorA = ParallelMath::MakeFloatZero(); @@ -1725,20 +1672,9 @@ namespace rgbIndexSelector.Reconstruct(rgbIndex, reconstructedRGB); alphaIndexSelector.Reconstruct(alphaIndex, reconstructedAlpha); - MInt16 reconstructedRGBA[4]; - reconstructedRGBA[redChannel] = reconstructedRGB[0]; - reconstructedRGBA[greenChannel] = reconstructedRGB[1]; - reconstructedRGBA[blueChannel] = reconstructedRGB[2]; - reconstructedRGBA[alphaChannel] = pixels[px][alphaChannel]; - - errorRGB = errorRGB + ComputeError(flags, reconstructedRGBA, pixels[px]); + errorRGB = errorRGB + ComputeError<3>(flags, reconstructedRGB, rotatedRGB[px], rotatedRGBWeights); - reconstructedRGBA[redChannel] = pixels[px][redChannel]; - reconstructedRGBA[greenChannel] = pixels[px][greenChannel]; - reconstructedRGBA[blueChannel] = pixels[px][blueChannel]; - reconstructedRGBA[alphaChannel] = reconstructedAlpha[0]; - - errorA = errorA + ComputeError(flags, reconstructedRGBA, pixels[px]); + errorA = errorA + ComputeError<1>(flags, reconstructedAlpha, pixels[px] + alphaChannel, rotatedAlphaWeight); rgbIndexes[px] = rgbIndex; alphaIndexes[px] = alphaIndex; @@ -1812,7 +1748,7 @@ namespace b = temp; } - static void Pack(DWORD flags, const InputBlock* inputs, uint8_t* packedBlocks) + static void Pack(DWORD flags, const InputBlock* inputs, uint8_t* packedBlocks, const float channelWeights[4]) { MInt16 pixels[16][4]; @@ -1830,8 +1766,8 @@ namespace work.m_error = 
ParallelMath::MakeFloat(FLT_MAX); - TryDualPlane(flags, pixels, work); - TrySinglePlane(flags, pixels, work); + TryDualPlane(flags, pixels, channelWeights, work); + TrySinglePlane(flags, pixels, channelWeights, work); for (int block = 0; block < ParallelMath::ParallelSize; block++) { @@ -2067,7 +2003,10 @@ void DirectX::D3DXEncodeBC7Parallel(uint8_t *pBC, const XMVECTOR *pColor, DWORD } } - BC7Computer::Pack(flags, inputBlocks, pBC); + const float perceptualWeights[4] = { 0.2125f / 0.7154f, 1.0f, 0.0721f / 0.7154f, 1.0f }; + const float uniformWeights[4] = { 1.0f, 1.0f, 1.0f, 1.0f }; + + BC7Computer::Pack(flags, inputBlocks, pBC, (flags & BC_FLAGS_UNIFORM) ? uniformWeights : perceptualWeights); pBC += ParallelMath::ParallelSize * 16; } From 2cb41d3e0421a3afd776f077d8793de1616c21d6 Mon Sep 17 00:00:00 2001 From: elasota Date: Sat, 14 Apr 2018 00:02:10 -0400 Subject: [PATCH 3/7] Skip modes 0-3 if all blocks have alpha, skip mode 7 if no blocks have alpha --- DirectXTex/BC7Parallel.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/DirectXTex/BC7Parallel.cpp b/DirectXTex/BC7Parallel.cpp index 75a22975..3bc0cea4 100644 --- a/DirectXTex/BC7Parallel.cpp +++ b/DirectXTex/BC7Parallel.cpp @@ -1309,6 +1309,23 @@ namespace static void TrySinglePlane(DWORD flags, const MInt16 pixels[16][4], const float channelWeights[4], WorkInfo& work) { + MInt16 maxAlpha = ParallelMath::MakeUInt16(0); + MInt16 minAlpha = ParallelMath::MakeUInt16(255); + for (int px = 0; px < 16; px++) + { + maxAlpha = ParallelMath::Max(maxAlpha, pixels[px][3]); + minAlpha = ParallelMath::Min(minAlpha, pixels[px][3]); + } + + // Try RGB modes if any block has a min alpha 251 or higher + bool allowRGBModes = ParallelMath::AnySet(ParallelMath::Less(ParallelMath::MakeUInt16(250), minAlpha)); + + // Try mode 7 if any block has alpha. + // Mode 7 is almost never selected for RGB blocks because mode 4 has very accurate 7.7.7.1 endpoints + // and its parity bit doesn't affect alpha, meaning mode 7 can only be better in extremely specific + // situations, and only by at most 1 unit of error per pixel. 
+ bool allowMode7 = ParallelMath::AnySet(ParallelMath::Less(maxAlpha, ParallelMath::MakeUInt16(255))); + for (uint16_t mode = 0; mode <= 7; mode++) { if ((flags & BC_FLAGS_FORCE_BC7_MODE6) && mode != 6) @@ -1320,6 +1337,12 @@ namespace if (mode == 4 || mode == 5) continue; + if (mode < 4 && !allowRGBModes) + continue; + + if (mode == 7 && !allowMode7) + continue; + MInt16 rgbAdjustedPixels[16][4]; for (int px = 0; px < 16; px++) { From 03a29bcf301ef84bd6438a69a8d8f15b7d60fd7d Mon Sep 17 00:00:00 2001 From: elasota Date: Sat, 21 Apr 2018 01:32:18 -0400 Subject: [PATCH 4/7] Fix -singleproc regression --- DirectXTex/DirectXTexCompress.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/DirectXTex/DirectXTexCompress.cpp b/DirectXTex/DirectXTexCompress.cpp index d2e2204c..9e518b72 100644 --- a/DirectXTex/DirectXTexCompress.cpp +++ b/DirectXTex/DirectXTexCompress.cpp @@ -182,6 +182,8 @@ namespace _ConvertScanline(temp, 16, result.format, format, cflags | srgb); + nQueuedBlocks++; + if (nQueuedBlocks == nBlocksPerChunk) { if (pfEncode) From 2df2e49e72531b10926a4730717ea969633a96ae Mon Sep 17 00:00:00 2001 From: elasota Date: Sat, 21 Apr 2018 04:30:13 -0400 Subject: [PATCH 5/7] Fixed OOB read --- DirectXTex/BC7Parallel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DirectXTex/BC7Parallel.cpp b/DirectXTex/BC7Parallel.cpp index 3bc0cea4..73e0d300 100644 --- a/DirectXTex/BC7Parallel.cpp +++ b/DirectXTex/BC7Parallel.cpp @@ -1285,12 +1285,12 @@ namespace MFloat error = ParallelMath::MakeFloatZero(); if (flags & BC_FLAGS_UNIFORM) { - for (int ch = 0; ch < 4; ch++) + for (int ch = 0; ch < TVectorSize; ch++) error = error + ParallelMath::UInt16ToFloat(ParallelMath::SqDiff(reconstructed[ch], original[ch])); } else { - for (int ch = 0; ch < 4; ch++) + for (int ch = 0; ch < TVectorSize; ch++) error = error + ParallelMath::UInt16ToFloat(ParallelMath::SqDiff(reconstructed[ch], original[ch])) * ParallelMath::MakeFloat(channelWeights[ch]); } From cca48f8e31852b04bb7a92195e981eca02028fb7 Mon Sep 17 00:00:00 2001 From: elasota Date: Fri, 27 Apr 2018 02:03:54 -0400 Subject: [PATCH 6/7] Fixed PCA math bugs --- DirectXTex/BC7Parallel.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DirectXTex/BC7Parallel.cpp b/DirectXTex/BC7Parallel.cpp index 73e0d300..5a22c79d 100644 --- a/DirectXTex/BC7Parallel.cpp +++ b/DirectXTex/BC7Parallel.cpp @@ -773,7 +773,7 @@ namespace { sum = sum + inVec[col] * m_values[index]; if (col >= row) - index += col; + index += col + 1; else index++; } @@ -832,7 +832,7 @@ namespace for (int ch = 0; ch < TVectorSize; ch++) { MFloat min = m_centroid[ch] + m_direction[ch] * m_minDist; - MFloat max = m_centroid[ch] + m_direction[ch] * (m_maxDist - m_minDist); + MFloat max = m_centroid[ch] + m_direction[ch] * m_maxDist; float safeWeight = channelWeights[ch]; if (safeWeight == 0.f) From 8404155d05c2bd0dd179f2fd52af422e5a378134 Mon Sep 17 00:00:00 2001 From: elasota Date: Sun, 13 May 2018 21:01:22 -0400 Subject: [PATCH 7/7] Fixed CompressEndpoints3 not initializing alpha --- DirectXTex/BC7Parallel.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/DirectXTex/BC7Parallel.cpp b/DirectXTex/BC7Parallel.cpp index 5a22c79d..47b3faf4 100644 --- a/DirectXTex/BC7Parallel.cpp +++ b/DirectXTex/BC7Parallel.cpp @@ -1236,8 +1236,11 @@ namespace static void CompressEndpoints3(MInt16 ep[2][4], uint16_t p[2]) { - for (int j = 0; j < 2; j++) - QuantizeP(ep[j], 7, p[j], 3); + for (int j = 0; j < 2; j++) + { + QuantizeP(ep[j], 7, p[j], 3); + 
ep[j][3] = ParallelMath::MakeUInt16(255); + } } static void CompressEndpoints4(MInt16 epRGB[2][3], MInt16 epA[2])
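To make the intent of this last fix concrete, here is a minimal scalar sketch of endpoint compression for the opaque BC7 modes, which store no alpha endpoints and always decode alpha as 255. Quantize8To7WithPBit and CompressOpaqueEndpoint are names invented for the sketch and only approximate the patch's QuantizeP/CompressEndpoints3 pair (which operate on ParallelMath vectors); the point being illustrated is that the working endpoint's alpha channel has to be pinned to 255 so later reconstruction and error math see the value a decoder would actually produce.

#include <cstdint>
#include <cstdio>

// Approximate a 7-bit-plus-parity endpoint channel: keep the top 7 bits of
// the 8-bit value and place the shared parity bit in the low bit, which is
// how such an endpoint expands back to 8 bits on decode.
static uint8_t Quantize8To7WithPBit(uint8_t value, uint8_t pBit)
{
    return static_cast<uint8_t>((value & 0xFEu) | (pBit & 1u));
}

static void CompressOpaqueEndpoint(uint8_t endpoint[4], uint8_t pBit)
{
    for (int ch = 0; ch < 3; ch++)
        endpoint[ch] = Quantize8To7WithPBit(endpoint[ch], pBit);

    // The fix above: pin the working alpha to 255, matching what a decoder
    // outputs for modes that carry no alpha endpoints.
    endpoint[3] = 255;
}

int main()
{
    uint8_t endpoint[4] = { 200, 100, 50, 0 }; // alpha slot starts out meaningless
    CompressOpaqueEndpoint(endpoint, 1);
    printf("%d %d %d %d\n", endpoint[0], endpoint[1], endpoint[2], endpoint[3]);
    return 0;
}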