From 7cfb7ce652d1c13ab72392d95dc93d967bf505fb Mon Sep 17 00:00:00 2001
From: Fox Caminiti
Date: Sun, 14 Aug 2022 12:38:08 -0400
Subject: concave masking; software anti aliasing

---
 prenderer.cpp | 99 ++++++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 81 insertions(+), 18 deletions(-)

(limited to 'prenderer.cpp')

diff --git a/prenderer.cpp b/prenderer.cpp
index 7550d0f..940cb0a 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -376,6 +376,9 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
     LayerBounds.Min.x -= LayerBounds.Min.x % 4;
     LayerBounds.Min.y -= LayerBounds.Min.y % 4;
 
+    uint16 WidthP, HeightP;
+    Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
     uint8 *TexPTR = (uint8 *)T.SourceBuffer;
     Assert(LayerBounds.Max.x <= Buffer->Width);
     Assert(LayerBounds.Max.y <= Buffer->Height);
@@ -398,15 +401,23 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
     __m256 One = _mm256_set1_ps(1);
     __m256 Two = _mm256_set1_ps(2);
     __m256 Zero = _mm256_set1_ps(0);
+    // __m256 UMin = _mm256_set1_ps(0.0f - (1 / T.LayerWidth));
+    // __m256 VMin = _mm256_set1_ps(0.0f - (1 / T.LayerHeight));
+    // __m256 UMax = _mm256_set1_ps(1.0f + (1 / T.LayerWidth));
+    __m256 VMax = _mm256_set1_ps(1.0f - (1.0f / T.LayerHeight));
+
+    __m256 ZeroPoint25 = _mm256_set1_ps(0.25);
     __m256 ZeroPointFive = _mm256_set1_ps(0.5);
     __m256i Zeroi = _mm256_set1_epi32(0);
     __m256i Onei = _mm256_set1_epi32(1);
     __m256 Four = _mm256_set1_ps(4);
     __m256 Sixteen = _mm256_set1_ps(16);
     __m256i FF = _mm256_set1_epi32(0xFF);
+    __m256i Full = _mm256_set1_epi32(0xFFFFFFFF);
     __m256i BottomTwoBits = _mm256_set1_epi32(0x03);
     __m256i Fouri = _mm256_set1_epi32(4);
     __m256i Sixteeni = _mm256_set1_epi32(16);
+    __m256i SixtyFouri = _mm256_set1_epi32(64);
     __m256 Real255 = _mm256_set1_ps(255.0f);
     __m256i Int255 = _mm256_set1_epi32(255);
     __m256 Norm255 = _mm256_set1_ps(1/255.0f);
@@ -442,11 +453,25 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
             {
                 IACA_START;
-                __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
+                // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky.
+                __m256 X0 = _mm256_set1_ps(0.30);
+                __m256 Y0 = _mm256_set1_ps(0.10);
+                __m256 X1 = _mm256_set1_ps(0.80);
+                __m256 Y1 = _mm256_set1_ps(0.35);
+                __m256 X2 = _mm256_set1_ps(0.05);
+                __m256 Y2 = _mm256_set1_ps(0.60);
+                __m256 X3 = _mm256_set1_ps(0.55);
+                __m256 Y3 = _mm256_set1_ps(0.85);
 
-                // TODO(fox): Not unwraping this function may lose a few cycles!
-                uint16 WidthP, HeightP;
-                Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+                __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
+                __m256 StartVectorX0 = _mm256_add_ps(StartVectorX, X0);
+                __m256 StartVectorY0 = _mm256_add_ps(StartVectorY, Y0);
+                __m256 StartVectorX1 = _mm256_add_ps(StartVectorX, X1);
+                __m256 StartVectorY1 = _mm256_add_ps(StartVectorY, Y1);
+                __m256 StartVectorX2 = _mm256_add_ps(StartVectorX, X2);
+                __m256 StartVectorY2 = _mm256_add_ps(StartVectorY, Y2);
+                __m256 StartVectorX3 = _mm256_add_ps(StartVectorX, X3);
+                __m256 StartVectorY3 = _mm256_add_ps(StartVectorY, Y3);
 
                 uint32 XLookup = (X >> 2)*16 + (X % 4);
                 uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
@@ -456,13 +481,39 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                 __m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY));
                 __m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY));
 
-                __m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)),
-                                                        _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2))));
+                __m256 U0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, XAxisPX), _mm256_mul_ps(StartVectorY0, XAxisPY));
+                __m256 V0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, YAxisPX), _mm256_mul_ps(StartVectorY0, YAxisPY));
+                __m256 U1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, XAxisPX), _mm256_mul_ps(StartVectorY1, XAxisPY));
+                __m256 V1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, YAxisPX), _mm256_mul_ps(StartVectorY1, YAxisPY));
+                __m256 U2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, XAxisPX), _mm256_mul_ps(StartVectorY2, XAxisPY));
+                __m256 V2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, YAxisPX), _mm256_mul_ps(StartVectorY2, YAxisPY));
+                __m256 U3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, XAxisPX), _mm256_mul_ps(StartVectorY3, XAxisPY));
+                __m256 V3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, YAxisPX), _mm256_mul_ps(StartVectorY3, YAxisPY));
+
+                __m256 LayerMask0 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U0, Zero, 13), _mm256_cmp_ps(U0, One, 1)),
+                                                  _mm256_and_ps(_mm256_cmp_ps(V0, Zero, 13), _mm256_cmp_ps(V0, One, 1)));
+                __m256 LayerMask1 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U1, Zero, 13), _mm256_cmp_ps(U1, One, 1)),
+                                                  _mm256_and_ps(_mm256_cmp_ps(V1, Zero, 13), _mm256_cmp_ps(V1, One, 1)));
+                __m256 LayerMask2 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U2, Zero, 13), _mm256_cmp_ps(U2, One, 1)),
+                                                  _mm256_and_ps(_mm256_cmp_ps(V2, Zero, 13), _mm256_cmp_ps(V2, One, 1)));
+                __m256 LayerMask3 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U3, Zero, 13), _mm256_cmp_ps(U3, One, 1)),
+                                                  _mm256_and_ps(_mm256_cmp_ps(V3, Zero, 13), _mm256_cmp_ps(V3, One, 1)));
+
+                // Each point that passes adds .25
+                __m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)),
+                                           _mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25)));
+
+                // Zero - no points pass
+                // One - all points pass; not an edge
+                __m256i Mask = _mm256_castps_si256(_mm256_cmp_ps(Avg, Zero, 14));
+                __m256i NonEdge = _mm256_castps_si256(_mm256_cmp_ps(Avg, One, 13));
 
                 // If all of the pixels are zeroed in the mask (aka fall outside
                 // the UV lookup), we can skip the iteration.
-                if (_mm256_movemask_epi8(LayerMask))
+                if (_mm256_movemask_epi8(Mask))
                 {
+                    __m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask);
+
                     U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
                     V = _mm256_max_ps(_mm256_min_ps(One, V), Zero);
@@ -472,6 +523,8 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                     __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
                     __m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
                     __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
+                    // NOTE(review): removed a hard-coded debug breakpoint probe here
+                    // (LayerWidth == 50 && TexYIntPlusOne == 49); it had no runtime effect.
 
                     // NOTE(fox): The comparison is for when we're on the last pixel of the texel.
                     __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
@@ -540,6 +593,12 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                                                _mm256_add_ps(_mm256_mul_ps(TexBothXInv, A_TexBL),
                                                _mm256_mul_ps(TexBoth, A_TexBR)));
 
+                    // Apply anti-aliasing to edges if there are any
+                    if (_mm256_movemask_epi8(EdgeMask))
+                    {
+                        A_Col = _mm256_blendv_ps(A_Col, _mm256_mul_ps(A_Col, Avg), _mm256_castsi256_ps(EdgeMask));
+                    }
+
                     __m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity);
                     __m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha);
@@ -559,7 +618,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                     __m256 A_Blend = LayerAlpha;
 
                     // Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it).
-                    if (!_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 0)) || T.BlendMode != blend_normal)
+                    if (_mm256_movemask_epi8(_mm256_castps_si256(_mm256_cmp_ps(LayerAlpha, One, 1))) || T.BlendMode != blend_normal)
                     {
                         __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
                         __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255);
@@ -691,8 +750,8 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                                        _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
                                        _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
 
-                    // __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask);
-                    _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel);
+
+                    _mm256_maskstore_epi32((int *)Pixel, Mask, OutputPixel);
                 }
                 PixelX = _mm256_add_ps(PixelX, Four);
             }
@@ -708,6 +767,9 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
     LayerBounds.Min.x -= LayerBounds.Min.x % 4;
     LayerBounds.Min.y -= LayerBounds.Min.y % 4;
 
+    uint16 WidthP, HeightP;
+    Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
     uint8 *TexPTR = (uint8 *)T.SourceBuffer;
     Assert(LayerBounds.Max.x <= Buffer->Width);
     Assert(LayerBounds.Max.y <= Buffer->Height);
@@ -761,11 +823,6 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
 
                 __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);
 
-
-                // TODO(fox): Not unwraping this function may lose a few cycles!
-                uint16 WidthP, HeightP;
-                Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
-
                 uint32 XLookup = (X >> 2)*16 + (X % 4);
                 uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
                 uint32 PixelToSeek = XLookup + YLookup;
@@ -774,8 +831,8 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                 __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY));
                 __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY));
 
-                __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)),
-                                    _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One))));
+                __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmplt_ps(U, One)),
+                                    _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmplt_ps(V, One))));
 
                 if (_mm_movemask_epi8(LayerMask))
                 {
@@ -1082,7 +1139,7 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
             real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
             real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY);
 
-            if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) {
+            if (U < 1.0f && U >= 0.0f && V < 1.0f && V >= 0.0f) {
 
                 real32 TexXFull = U * T.LayerWidth;
                 uint32 TexXInt = (uint32)TexXFull;
@@ -1092,6 +1149,12 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
                 uint32 TexYInt = (uint32)TexYFull;
                 real32 TexY = TexYFull - TexYInt;
 
+                // NOTE(review): two hard-coded debug probes were removed here
+                // (if(T.LayerWidth == 50) and if(TexYInt > 47 && T.LayerWidth == 50),
+                // each declaring an unused local); they only served as breakpoint
+                // anchors during development and had no runtime effect.
+
+
                 real32 TexXInv = 1 - TexX;
                 real32 TexYInv = 1 - TexY;
                 real32 TexBothXInv = TexXInv * TexY;
-- 
cgit v1.2.3