summary refs log tree commit diff
path: root/prenderer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'prenderer.cpp')
-rw-r--r-- prenderer.cpp 99
1 files changed, 81 insertions, 18 deletions
diff --git a/prenderer.cpp b/prenderer.cpp
index 7550d0f..940cb0a 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -376,6 +376,9 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
LayerBounds.Min.x -= LayerBounds.Min.x % 4;
LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+ uint16 WidthP, HeightP;
+ Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
uint8 *TexPTR = (uint8 *)T.SourceBuffer;
Assert(LayerBounds.Max.x <= Buffer->Width);
Assert(LayerBounds.Max.y <= Buffer->Height);
@@ -398,15 +401,23 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 One = _mm256_set1_ps(1);
__m256 Two = _mm256_set1_ps(2);
__m256 Zero = _mm256_set1_ps(0);
+ // __m256 UMin = _mm256_set1_ps(0.0f - (1 / T.LayerWidth));
+ // __m256 VMin = _mm256_set1_ps(0.0f - (1 / T.LayerHeight));
+ // __m256 UMax = _mm256_set1_ps(1.0f + (1 / T.LayerWidth));
+ __m256 VMax = _mm256_set1_ps(1.0f - (1 / T.LayerHeight));
+
+ __m256 ZeroPoint25 = _mm256_set1_ps(0.25);
__m256 ZeroPointFive = _mm256_set1_ps(0.5);
__m256i Zeroi = _mm256_set1_epi32(0);
__m256i Onei = _mm256_set1_epi32(1);
__m256 Four = _mm256_set1_ps(4);
__m256 Sixteen = _mm256_set1_ps(16);
__m256i FF = _mm256_set1_epi32(0xFF);
+ __m256i Full = _mm256_set1_epi32(0xFFFFFFFF);
__m256i BottomTwoBits = _mm256_set1_epi32(0x03);
__m256i Fouri = _mm256_set1_epi32(4);
__m256i Sixteeni = _mm256_set1_epi32(16);
+ __m256i SixtyFouri = _mm256_set1_epi32(64);
__m256 Real255 = _mm256_set1_ps(255.0f);
__m256i Int255 = _mm256_set1_epi32(255);
__m256 Norm255 = _mm256_set1_ps(1/255.0f);
@@ -442,11 +453,25 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
{
IACA_START;
- __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
+ // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky.
+ __m256 X0 = _mm256_set1_ps(0.30);
+ __m256 Y0 = _mm256_set1_ps(0.10);
+ __m256 X1 = _mm256_set1_ps(0.80);
+ __m256 Y1 = _mm256_set1_ps(0.35);
+ __m256 X2 = _mm256_set1_ps(0.05);
+ __m256 Y2 = _mm256_set1_ps(0.60);
+ __m256 X3 = _mm256_set1_ps(0.55);
+ __m256 Y3 = _mm256_set1_ps(0.85);
- // TODO(fox): Not unwraping this function may lose a few cycles!
- uint16 WidthP, HeightP;
- Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+ __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
+ __m256 StartVectorX0 = _mm256_add_ps(StartVectorX, X0);
+ __m256 StartVectorY0 = _mm256_add_ps(StartVectorY, Y0);
+ __m256 StartVectorX1 = _mm256_add_ps(StartVectorX, X1);
+ __m256 StartVectorY1 = _mm256_add_ps(StartVectorY, Y1);
+ __m256 StartVectorX2 = _mm256_add_ps(StartVectorX, X2);
+ __m256 StartVectorY2 = _mm256_add_ps(StartVectorY, Y2);
+ __m256 StartVectorX3 = _mm256_add_ps(StartVectorX, X3);
+ __m256 StartVectorY3 = _mm256_add_ps(StartVectorY, Y3);
uint32 XLookup = (X >> 2)*16 + (X % 4);
uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
@@ -456,13 +481,39 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY));
__m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY));
- __m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)),
- _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2))));
+ __m256 U0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, XAxisPX), _mm256_mul_ps(StartVectorY0, XAxisPY));
+ __m256 V0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, YAxisPX), _mm256_mul_ps(StartVectorY0, YAxisPY));
+ __m256 U1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, XAxisPX), _mm256_mul_ps(StartVectorY1, XAxisPY));
+ __m256 V1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, YAxisPX), _mm256_mul_ps(StartVectorY1, YAxisPY));
+ __m256 U2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, XAxisPX), _mm256_mul_ps(StartVectorY2, XAxisPY));
+ __m256 V2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, YAxisPX), _mm256_mul_ps(StartVectorY2, YAxisPY));
+ __m256 U3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, XAxisPX), _mm256_mul_ps(StartVectorY3, XAxisPY));
+ __m256 V3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, YAxisPX), _mm256_mul_ps(StartVectorY3, YAxisPY));
+
+ __m256 LayerMask0 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U0, Zero, 13), _mm256_cmp_ps(U0, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V0, Zero, 13), _mm256_cmp_ps(V0, One, 1)));
+ __m256 LayerMask1 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U1, Zero, 13), _mm256_cmp_ps(U1, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V1, Zero, 13), _mm256_cmp_ps(V1, One, 1)));
+ __m256 LayerMask2 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U2, Zero, 13), _mm256_cmp_ps(U2, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V2, Zero, 13), _mm256_cmp_ps(V2, One, 1)));
+ __m256 LayerMask3 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U3, Zero, 13), _mm256_cmp_ps(U3, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V3, Zero, 13), _mm256_cmp_ps(V3, One, 1)));
+
+ // Each point that passes adds .25
+ __m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)),
+ _mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25)));
+
+ // Zero - no points pass
+ // One - all points pass; not an edge
+ __m256i Mask = _mm256_cmp_ps(Avg, Zero, 14);
+ __m256i NonEdge = _mm256_cmp_ps(Avg, One, 13);
// If all of the pixels are zeroed in the mask (aka fall outside
// the UV lookup), we can skip the iteration.
- if (_mm256_movemask_epi8(LayerMask))
+ if (_mm256_movemask_epi8(Mask))
{
+ __m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask);
+
U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
V = _mm256_max_ps(_mm256_min_ps(One, V), Zero);
@@ -472,6 +523,8 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
__m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
__m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
+ if (T.LayerWidth == 50 && _mm256_cvtsi256_si32(TexYIntPlusOne) == 49)
+ int pp = 0;
// NOTE(fox): The comparison is for when we're on the last pixel of the texel.
__m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
@@ -540,6 +593,12 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
_mm256_add_ps(_mm256_mul_ps(TexBothXInv, A_TexBL),
_mm256_mul_ps(TexBoth, A_TexBR)));
+ // Apply anti-aliasing to edges if there are any
+ if (_mm256_movemask_epi8(EdgeMask))
+ {
+ A_Col = _mm256_blendv_ps(A_Col, _mm256_mul_ps(A_Col, Avg), EdgeMask);
+ }
+
__m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity);
__m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha);
@@ -559,7 +618,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 A_Blend = LayerAlpha;
// Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it).
- if (!_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 0)) || T.BlendMode != blend_normal)
+ if (_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 2)) || T.BlendMode != blend_normal)
{
__m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
__m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255);
@@ -691,8 +750,8 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
_mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
_mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
- // __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask);
- _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel);
+
+ _mm256_maskstore_epi32((int *)Pixel, Mask, OutputPixel);
}
PixelX = _mm256_add_ps(PixelX, Four);
}
@@ -708,6 +767,9 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
LayerBounds.Min.x -= LayerBounds.Min.x % 4;
LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+ uint16 WidthP, HeightP;
+ Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
uint8 *TexPTR = (uint8 *)T.SourceBuffer;
Assert(LayerBounds.Max.x <= Buffer->Width);
Assert(LayerBounds.Max.y <= Buffer->Height);
@@ -761,11 +823,6 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);
-
- // TODO(fox): Not unwraping this function may lose a few cycles!
- uint16 WidthP, HeightP;
- Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
-
uint32 XLookup = (X >> 2)*16 + (X % 4);
uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
uint32 PixelToSeek = XLookup + YLookup;
@@ -774,8 +831,8 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY));
__m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY));
- __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)),
- _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One))));
+ __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmplt_ps(U, One)),
+ _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmplt_ps(V, One))));
if (_mm_movemask_epi8(LayerMask))
{
@@ -1082,7 +1139,7 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY);
- if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) {
+ if (U < 1.0f && U >= 0.0f && V < 1.0f && V >= 0.0f) {
real32 TexXFull = U * T.LayerWidth;
uint32 TexXInt = (uint32)TexXFull;
@@ -1092,6 +1149,12 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
uint32 TexYInt = (uint32)TexYFull;
real32 TexY = TexYFull - TexYInt;
+ if(T.LayerWidth == 50)
+ real32 pp = 0;
+
+ if(TexYInt > 47 && T.LayerWidth == 50)
+ real32 pp = 0;
+
real32 TexXInv = 1 - TexX;
real32 TexYInv = 1 - TexY;
real32 TexBothXInv = TexXInv * TexY;