From 7cfb7ce652d1c13ab72392d95dc93d967bf505fb Mon Sep 17 00:00:00 2001
From: Fox Caminiti
Date: Sun, 14 Aug 2022 12:38:08 -0400
Subject: concave masking; software anti aliasing

---
 prenderer.cpp | 99 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 81 insertions(+), 18 deletions(-)

(limited to 'prenderer.cpp')

diff --git a/prenderer.cpp b/prenderer.cpp
index 7550d0f..940cb0a 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -376,6 +376,9 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
     LayerBounds.Min.x -= LayerBounds.Min.x % 4;
     LayerBounds.Min.y -= LayerBounds.Min.y % 4;
 
+    uint16 WidthP, HeightP;
+    Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
     uint8 *TexPTR = (uint8 *)T.SourceBuffer;
     Assert(LayerBounds.Max.x <= Buffer->Width);
     Assert(LayerBounds.Max.y <= Buffer->Height);
@@ -398,15 +401,23 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
     __m256 One = _mm256_set1_ps(1);
     __m256 Two = _mm256_set1_ps(2);
     __m256 Zero = _mm256_set1_ps(0);
+    // __m256 UMin = _mm256_set1_ps(0.0f - (1 / T.LayerWidth));
+    // __m256 VMin = _mm256_set1_ps(0.0f - (1 / T.LayerHeight));
+    // __m256 UMax = _mm256_set1_ps(1.0f + (1 / T.LayerWidth));
+    __m256 VMax = _mm256_set1_ps(1.0f - (1.0f / T.LayerHeight));
+
+    __m256 ZeroPoint25 = _mm256_set1_ps(0.25);
     __m256 ZeroPointFive = _mm256_set1_ps(0.5);
     __m256i Zeroi = _mm256_set1_epi32(0);
     __m256i Onei = _mm256_set1_epi32(1);
     __m256 Four = _mm256_set1_ps(4);
     __m256 Sixteen = _mm256_set1_ps(16);
     __m256i FF = _mm256_set1_epi32(0xFF);
+    __m256i Full = _mm256_set1_epi32(0xFFFFFFFF);
     __m256i BottomTwoBits = _mm256_set1_epi32(0x03);
     __m256i Fouri = _mm256_set1_epi32(4);
     __m256i Sixteeni = _mm256_set1_epi32(16);
+    __m256i SixtyFouri = _mm256_set1_epi32(64);
     __m256 Real255 = _mm256_set1_ps(255.0f);
     __m256i Int255 = _mm256_set1_epi32(255);
     __m256 Norm255 = _mm256_set1_ps(1/255.0f);
@@ -442,11 +453,25 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
             {
                 IACA_START;
-                __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
+                // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky.
+                __m256 X0 = _mm256_set1_ps(0.30);
+                __m256 Y0 = _mm256_set1_ps(0.10);
+                __m256 X1 = _mm256_set1_ps(0.80);
+                __m256 Y1 = _mm256_set1_ps(0.35);
+                __m256 X2 = _mm256_set1_ps(0.05);
+                __m256 Y2 = _mm256_set1_ps(0.60);
+                __m256 X3 = _mm256_set1_ps(0.55);
+                __m256 Y3 = _mm256_set1_ps(0.85);
 
-                // TODO(fox): Not unwraping this function may lose a few cycles!
-                uint16 WidthP, HeightP;
-                Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+                __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
+                __m256 StartVectorX0 = _mm256_add_ps(StartVectorX, X0);
+                __m256 StartVectorY0 = _mm256_add_ps(StartVectorY, Y0);
+                __m256 StartVectorX1 = _mm256_add_ps(StartVectorX, X1);
+                __m256 StartVectorY1 = _mm256_add_ps(StartVectorY, Y1);
+                __m256 StartVectorX2 = _mm256_add_ps(StartVectorX, X2);
+                __m256 StartVectorY2 = _mm256_add_ps(StartVectorY, Y2);
+                __m256 StartVectorX3 = _mm256_add_ps(StartVectorX, X3);
+                __m256 StartVectorY3 = _mm256_add_ps(StartVectorY, Y3);
 
                 uint32 XLookup = (X >> 2)*16 + (X % 4);
                 uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
@@ -456,13 +481,39 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                 __m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY));
                 __m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY));
 
-                __m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)),
-                                                        _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2))));
+                __m256 U0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, XAxisPX), _mm256_mul_ps(StartVectorY0, XAxisPY));
+                __m256 V0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, YAxisPX), _mm256_mul_ps(StartVectorY0, YAxisPY));
+                __m256 U1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, XAxisPX), _mm256_mul_ps(StartVectorY1, XAxisPY));
+                __m256 V1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, YAxisPX), _mm256_mul_ps(StartVectorY1, YAxisPY));
+                __m256 U2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, XAxisPX), _mm256_mul_ps(StartVectorY2, XAxisPY));
+                __m256 V2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, YAxisPX), _mm256_mul_ps(StartVectorY2, YAxisPY));
+                __m256 U3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, XAxisPX), _mm256_mul_ps(StartVectorY3, XAxisPY));
+                __m256 V3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, YAxisPX), _mm256_mul_ps(StartVectorY3, YAxisPY));
+
+                __m256 LayerMask0 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U0, Zero, 13), _mm256_cmp_ps(U0, One, 1)),
+                                                  _mm256_and_ps(_mm256_cmp_ps(V0, Zero, 13), _mm256_cmp_ps(V0, One, 1)));
+                __m256 LayerMask1 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U1, Zero, 13), _mm256_cmp_ps(U1, One, 1)),
+                                                  _mm256_and_ps(_mm256_cmp_ps(V1, Zero, 13), _mm256_cmp_ps(V1, One, 1)));
+                __m256 LayerMask2 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U2, Zero, 13), _mm256_cmp_ps(U2, One, 1)),
+                                                  _mm256_and_ps(_mm256_cmp_ps(V2, Zero, 13), _mm256_cmp_ps(V2, One, 1)));
+                __m256 LayerMask3 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U3, Zero, 13), _mm256_cmp_ps(U3, One, 1)),
+                                                  _mm256_and_ps(_mm256_cmp_ps(V3, Zero, 13), _mm256_cmp_ps(V3, One, 1)));
+
+                // Each point that passes adds .25
+                __m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)),
+                                           _mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25)));
+
+                // Zero - no points pass
+                // One - all points pass; not an edge
+                __m256i Mask = _mm256_castps_si256(_mm256_cmp_ps(Avg, Zero, 14));
+                __m256i NonEdge = _mm256_castps_si256(_mm256_cmp_ps(Avg, One, 13));
 
                 // If all of the pixels are zeroed in the mask (aka fall outside
                 // the UV lookup), we can skip the iteration.
-                if (_mm256_movemask_epi8(LayerMask))
+                if (_mm256_movemask_epi8(Mask))
                 {
+                    __m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask);
+
                     U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
                     V = _mm256_max_ps(_mm256_min_ps(One, V), Zero);
@@ -472,6 +523,8 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                     __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
                     __m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
                     __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
+                    // NOTE(review): removed a hard-coded debug breakpoint probe here
+                    // (LayerWidth == 50 && TexYIntPlusOne == 49); it had no runtime effect.
 
                     // NOTE(fox): The comparison is for when we're on the last pixel of the texel.
                     __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
@@ -540,6 +593,12 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                                                _mm256_add_ps(_mm256_mul_ps(TexBothXInv, A_TexBL),
                                                _mm256_mul_ps(TexBoth, A_TexBR)));
 
+                    // Apply anti-aliasing to edges if there are any
+                    if (_mm256_movemask_epi8(EdgeMask))
+                    {
+                        A_Col = _mm256_blendv_ps(A_Col, _mm256_mul_ps(A_Col, Avg), _mm256_castsi256_ps(EdgeMask));
+                    }
+
                     __m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity);
                     __m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha);
@@ -559,7 +618,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                     __m256 A_Blend = LayerAlpha;
 
                     // Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it).
-                    if (!_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 0)) || T.BlendMode != blend_normal)
+                    if (_mm256_movemask_epi8(_mm256_castps_si256(_mm256_cmp_ps(LayerAlpha, One, 1))) || T.BlendMode != blend_normal)
                     {
                         __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
                         __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255);
@@ -691,8 +750,8 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                                        _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
                                        _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
 
-                    // __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask);
-                    _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel);
+
+                    _mm256_maskstore_epi32((int *)Pixel, Mask, OutputPixel);
                 }
                 PixelX = _mm256_add_ps(PixelX, Four);
             }
@@ -708,6 +767,9 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
     LayerBounds.Min.x -= LayerBounds.Min.x % 4;
     LayerBounds.Min.y -= LayerBounds.Min.y % 4;
 
+    uint16 WidthP, HeightP;
+    Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
     uint8 *TexPTR = (uint8 *)T.SourceBuffer;
     Assert(LayerBounds.Max.x <= Buffer->Width);
     Assert(LayerBounds.Max.y <= Buffer->Height);
@@ -761,11 +823,6 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
 
                 __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);
 
-
-                // TODO(fox): Not unwraping this function may lose a few cycles!
-                uint16 WidthP, HeightP;
-                Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
-
                 uint32 XLookup = (X >> 2)*16 + (X % 4);
                 uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
                 uint32 PixelToSeek = XLookup + YLookup;
@@ -774,8 +831,8 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                 __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY));
                 __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY));
 
-                __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)),
-                                    _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One))));
+                __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmplt_ps(U, One)),
+                                    _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmplt_ps(V, One))));
 
                 if (_mm_movemask_epi8(LayerMask))
                 {
@@ -1082,7 +1139,7 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
             real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
             real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY);
 
-            if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) {
+            if (U < 1.0f && U >= 0.0f && V < 1.0f && V >= 0.0f) {
 
                 real32 TexXFull = U * T.LayerWidth;
                 uint32 TexXInt = (uint32)TexXFull;
@@ -1092,6 +1149,12 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
                 uint32 TexYInt = (uint32)TexYFull;
                 real32 TexY = TexYFull - TexYInt;
 
+                // NOTE(review): two hard-coded debug probes were removed here
+                // (if(T.LayerWidth == 50) and if(TexYInt > 47 && T.LayerWidth == 50),
+                // each declaring an unused local); they only served as breakpoint
+                // anchors during development and had no runtime effect.
+
+
                 real32 TexXInv = 1 - TexX;
                 real32 TexYInv = 1 - TexY;
                 real32 TexBothXInv = TexXInv * TexY;
-- 
cgit v1.2.3