summaryrefslogtreecommitdiff
path: root/prenderer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'prenderer.cpp')
-rw-r--r--prenderer.cpp15
1 files changed, 11 insertions, 4 deletions
diff --git a/prenderer.cpp b/prenderer.cpp
index 909fc4c..ccacbe8 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -355,6 +355,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 YAxisPY = _mm256_set1_ps(T.YAxisPY);
__m256 LayerWidth = _mm256_set1_ps(T.LayerWidth);
+ __m256 LayerBoundsMaxX = _mm256_set1_ps(LayerBounds.Max.x);
__m256i FullLayerWidth4i = _mm256_set1_epi32(T.FullLayerWidth*4);
__m256i LayerWidthMinusOne = _mm256_set1_epi32(T.LayerWidth - 1);
__m256i LayerHeightMinusOne = _mm256_set1_epi32(T.LayerHeight - 1);
@@ -489,17 +490,23 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)),
_mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25)));
+ // Prevent overlap between threads in the non-packed path by masking off
+ // lanes whose PixelX is at or beyond LayerBounds.Max.x. With packed, the
+ // 4-padded bitmap lets us set up the boundaries so this check is never needed.
+ __m256i TileBarrier = _mm256_cmp_ps(PixelX, LayerBoundsMaxX, 13);
+
// Zero - no points pass
// One - all points pass; not an edge
__m256i Mask = _mm256_cmp_ps(Avg, Zero, 14);
__m256i NonEdge = _mm256_cmp_ps(Avg, One, 13);
+ __m256i TotalMask = _mm256_andnot_si256(TileBarrier, _mm256_and_si256(Mask, NonEdge));
- __m256 LayerMask = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 1)),
- _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 1)));
+ // __m256 LayerMask = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 1)),
+ // _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 1)));
// If all of the pixels are zeroed in the mask (aka fall outside
// the UV lookup), we can skip the iteration.
- if (_mm256_movemask_epi8(LayerMask))
+ if (_mm256_movemask_epi8(TotalMask))
{
__m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask);
@@ -745,7 +752,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
_mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
_mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
- _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel);
+ _mm256_maskstore_epi32((int *)Pixel, TotalMask, OutputPixel);
}
#if PACKEDRGB
PixelX = _mm256_add_ps(PixelX, Four);