From ed51dab429e467fc144f0bfbed70a5291c8a0a27 Mon Sep 17 00:00:00 2001
From: Fox Caminiti
Date: Sun, 21 Aug 2022 10:20:31 -0400
Subject: multisampled gl masks

---
 prenderer.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'prenderer.cpp')

diff --git a/prenderer.cpp b/prenderer.cpp
index 909fc4c..ccacbe8 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -355,6 +355,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
 
     __m256 YAxisPY = _mm256_set1_ps(T.YAxisPY);
     __m256 LayerWidth = _mm256_set1_ps(T.LayerWidth);
+    __m256 LayerBoundsMaxX = _mm256_set1_ps(LayerBounds.Max.x);
     __m256i FullLayerWidth4i = _mm256_set1_epi32(T.FullLayerWidth*4);
     __m256i LayerWidthMinusOne = _mm256_set1_epi32(T.LayerWidth - 1);
     __m256i LayerHeightMinusOne = _mm256_set1_epi32(T.LayerHeight - 1);
@@ -489,17 +490,23 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                 __m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)),
                                            _mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25)));
 
+                // Preventing overlap between threads for non-packed. One nice thing
+                // about packed is that the 4-padded bitmap means we can set up the
+                // boundaries so we don't have to check this ever.
+                __m256i TileBarrier = _mm256_cmp_ps(PixelX, LayerBoundsMaxX, 13);
+
                 // Zero - no points pass
                 // One - all points pass; not an edge
                 __m256i Mask = _mm256_cmp_ps(Avg, Zero, 14);
                 __m256i NonEdge = _mm256_cmp_ps(Avg, One, 13);
+                __m256i TotalMask = _mm256_andnot_si256(TileBarrier, _mm256_and_si256(Mask, NonEdge));
 
-                __m256 LayerMask = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 1)),
-                                                 _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 1)));
+                // __m256 LayerMask = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 1)),
+                //                                  _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 1)));
 
                 // If all of the pixels are zeroed in the mask (aka fall outside
                 // the UV lookup), we can skip the iteration.
-                if (_mm256_movemask_epi8(LayerMask))
+                if (_mm256_movemask_epi8(TotalMask))
                 {
 
                     __m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask);
@@ -745,7 +752,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
                                                           _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
                                                           _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
 
-                    _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel);
+                    _mm256_maskstore_epi32((int *)Pixel, TotalMask, OutputPixel);
                 }
 #if PACKEDRGB
                 PixelX = _mm256_add_ps(PixelX, Four);
--
cgit v1.2.3
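
Note (not part of the commit): the sketch below is a minimal standalone illustration of how the new gating is meant to combine -- the coverage average Avg built from four subsample masks, the TileBarrier test against LayerBoundsMaxX, and the TotalMask that drives both the early-out check and the final _mm256_maskstore_epi32. The lane values, the 0xAB fill pattern, and the explicit _mm256_castps_si256 casts are illustrative assumptions rather than code from prenderer.cpp (the patch assigns _mm256_cmp_ps results directly, which relies on the project's compiler settings). Build with something like g++ -mavx2 -O2.

// Standalone sketch of the masking path above -- illustrative only.
#include <immintrin.h>
#include <cstdio>

int main()
{
    __m256 Zero        = _mm256_set1_ps(0.0f);
    __m256 One         = _mm256_set1_ps(1.0f);
    __m256 ZeroPoint25 = _mm256_set1_ps(0.25f);

    // Per-subsample hit masks (all-ones or all-zeros per lane), standing in for
    // the UV range tests earlier in AVX2_RenderLayer. Values are made up.
    __m256 LayerMask0 = _mm256_cmp_ps(_mm256_setr_ps(0, 1, 1, 1, 1, 1, 0, 1), Zero, _CMP_GT_OS);
    __m256 LayerMask1 = _mm256_cmp_ps(_mm256_setr_ps(0, 0, 1, 1, 1, 1, 0, 1), Zero, _CMP_GT_OS);
    __m256 LayerMask2 = _mm256_cmp_ps(_mm256_setr_ps(0, 0, 0, 1, 1, 1, 0, 1), Zero, _CMP_GT_OS);
    __m256 LayerMask3 = _mm256_cmp_ps(_mm256_setr_ps(0, 0, 0, 1, 1, 0, 0, 1), Zero, _CMP_GT_OS);

    // ANDing an all-ones mask with the bit pattern of 0.25f yields 0.25f, so the
    // sum is the fraction of the four subsamples that landed inside the layer.
    __m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)),
                               _mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25)));

    // Lanes whose pixel X sits at or past the layer bound are knocked out so a
    // neighbouring tile/thread never writes the same pixels.
    __m256 PixelX          = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
    __m256 LayerBoundsMaxX = _mm256_set1_ps(6.0f);

    // Predicate 13 is _CMP_GE_OS and 14 is _CMP_GT_OS; the casts are explicit
    // here so the sketch compiles without vector-conversion extensions.
    __m256i TileBarrier = _mm256_castps_si256(_mm256_cmp_ps(PixelX, LayerBoundsMaxX, _CMP_GE_OS));
    __m256i Mask        = _mm256_castps_si256(_mm256_cmp_ps(Avg, Zero, _CMP_GT_OS)); // any subsample hit
    __m256i NonEdge     = _mm256_castps_si256(_mm256_cmp_ps(Avg, One,  _CMP_GE_OS)); // every subsample hit
    __m256i TotalMask   = _mm256_andnot_si256(TileBarrier, _mm256_and_si256(Mask, NonEdge));
    __m256i EdgeMask    = _mm256_andnot_si256(NonEdge, Mask); // partial coverage lanes

    alignas(32) int Stored[8] = {};
    alignas(32) int Edge[8];
    _mm256_store_si256((__m256i *)Edge, EdgeMask);

    if (_mm256_movemask_epi8(TotalMask))
    {
        // Only lanes whose mask MSB is set are written; the rest keep 0 here,
        // just as the real maskstore leaves the framebuffer pixel untouched.
        _mm256_maskstore_epi32(Stored, TotalMask, _mm256_set1_epi32(0xAB));
    }

    for (int i = 0; i < 8; ++i)
        std::printf("lane %d: written=%d edge=%d\n", i, Stored[i] != 0, Edge[i] != 0);
    return 0;
}

With these made-up inputs, lanes 3 and 4 come out written (fully covered and in bounds), lane 7 is culled by the tile barrier even though it is fully covered, and lanes 1, 2 and 5 appear only in EdgeMask.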