From 83ce428d8bb5f4a762abf879adec076bc34cf36a Mon Sep 17 00:00:00 2001 From: Fox Caminiti Date: Wed, 27 Jul 2022 11:00:45 -0400 Subject: full support for odd-dimension bitmaps and comps --- prenderer.cpp | 379 +++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 281 insertions(+), 98 deletions(-) (limited to 'prenderer.cpp') diff --git a/prenderer.cpp b/prenderer.cpp index 4d4152d..356ecd7 100644 --- a/prenderer.cpp +++ b/prenderer.cpp @@ -7,11 +7,14 @@ RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegi internal void AVX2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); internal void +SSE2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); +internal void Fallback_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); internal bool32 CheckQueue(render_queue RenderInfo, uint16 Index); +// for the anchor point moving UI internal void CalculateAnchorOffset(project_layer *Layer, real32 Value, uint16 Dir) { @@ -76,12 +79,14 @@ CalculateTransforms(project_layer *Layer, pixel_buffer *Buffer) TransformInfo.YAxisPY = YLengthSq*YAxis.y; TransformInfo.LayerWidth = (real32)Source->Raster.Width; TransformInfo.LayerHeight = (real32)Source->Raster.Height; + TransformInfo.FullLayerWidth = Source->Raster.FullWidth; + TransformInfo.FullLayerHeight = Source->Raster.FullHeight; TransformInfo.LayerOpacity = 1.0f - Layer->opacity.CurrentValue.f; TransformInfo.OriginX = Origin.x; TransformInfo.OriginY = Origin.y; TransformInfo.BufferPitch = Buffer->Pitch; TransformInfo.LayerPitch = Source->Raster.Pitch; - TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX, MaxY}; + TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX + 1, MaxY + 1}; TransformInfo.SourceBuffer = Source->Raster.EffectBuffer; @@ -114,6 +119,19 @@ EndRenderState(project_state *State) } +internal void +RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) { + for (int16 i = 0; i < RenderInfo->State->NumberOfLayersToRender; i++) { + int16 Idx = RenderInfo->State->LayersToRender[i]; + if (InstructionMode == avx_enabled) + AVX2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); + else if (InstructionMode == sse_enabled) + SSE2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); + else + Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); + } +} + internal void QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *State) { @@ -163,18 +181,7 @@ QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *S // DEBUG_CycleCountStart(3); rectangle RenderRegion = {0, 0, (int32)CompBuffer->Width, (int32)CompBuffer->Height}; - for (int16 i = 0; i < RenderInfo.State->NumberOfLayersToRender; i++) { - int16 Idx = RenderInfo.State->LayersToRender[i]; -#if ARM - RenderLayerNeon(RenderInfo.File->Layer[Idx], RenderInfo.CompBuffer, RenderRegion); -#else - // RenderLayerSSE(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion); - if (AVXEnabled) - AVX2_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion); - else - Fallback_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion); -#endif - } + RenderLayers(&RenderInfo, RenderRegion); // DEBUG_CycleCountEnd(3); // Debug.ExecutionAmount[4] += 1280*720; @@ -378,6 +385,7 @@ RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegi } #else + internal void AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) { @@ -397,7 +405,9 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256 YAxisPY = _mm256_set1_ps(T.YAxisPY); __m256 LayerWidth = _mm256_set1_ps(T.LayerWidth); - __m256i LayerWidth4i = _mm256_set1_epi32(T.LayerWidth*4); + __m256i FullLayerWidth4i = _mm256_set1_epi32(T.FullLayerWidth*4); + __m256i LayerWidthMinusOne = _mm256_set1_epi32(T.LayerWidth - 1); + __m256i LayerHeightMinusOne = _mm256_set1_epi32(T.LayerHeight - 1); __m256 LayerHeight = _mm256_set1_ps(T.LayerHeight); __m256 LayerOpacity = _mm256_set1_ps(T.LayerOpacity); __m256 OriginX = _mm256_set1_ps(T.OriginX); @@ -451,7 +461,7 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX); uint32 XLookup = (X >> 2)*16 + (X % 4); - uint32 YLookup = (Y >> 2)*(Buffer->Width*4) + (Y % 4)*4; + uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4; uint32 PixelToSeek = XLookup + YLookup; uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel; @@ -461,6 +471,8 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)), _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2)))); + // If all of the pixels are zeroed in the mask (aka fall outside + // the UV lookup), we can skip the iteration. if (_mm256_movemask_epi8(LayerMask)) { U = _mm256_max_ps(_mm256_min_ps(One, U), Zero); @@ -469,9 +481,10 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256 TexXFull = _mm256_mul_ps(U, LayerWidth); __m256 TexYFull = _mm256_mul_ps(V, LayerHeight); __m256i TexXInt = _mm256_cvttps_epi32(TexXFull); - __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, Onei); + __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei)); __m256i TexYInt = _mm256_cvttps_epi32(TexYFull); - __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, Onei); + __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei)); + // NOTE(fox): The comparison is for when we're on the last pixel. __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt)); __m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt)); @@ -484,11 +497,11 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) __m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni), _mm256_and_si256(TexXInt, BottomTwoBits)); - __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), LayerWidth4i), + __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), FullLayerWidth4i), _mm256_mullo_epi32(_mm256_and_si256(TexYInt, BottomTwoBits), Fouri)); __m256i XLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXIntPlusOne, 2), Sixteeni), _mm256_and_si256(TexXIntPlusOne, BottomTwoBits)); - __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), LayerWidth4i), + __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i), _mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri)); __m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup); @@ -571,13 +584,239 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)), _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24))); - __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask); - _mm256_storeu_si256((__m256i *)Pixel, PixelsMask); + // __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask); + _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel); } PixelX = _mm256_add_ps(PixelX, Four); } } } + +internal void +SSE2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) +{ + rectangle LayerBounds = ClipRectangle( T.ClipRect, + RenderRegion ); + // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned. + LayerBounds.Min.x -= LayerBounds.Min.x % 4; + LayerBounds.Min.y -= LayerBounds.Min.y % 4; + + uint8 *TexPTR = (uint8 *)T.SourceBuffer; + Assert(LayerBounds.Max.x <= Buffer->Width); + Assert(LayerBounds.Max.y <= Buffer->Height); + + __m128 XAxisPX = _mm_set1_ps(T.XAxisPX); + __m128 XAxisPY = _mm_set1_ps(T.XAxisPY); + __m128 YAxisPX = _mm_set1_ps(T.YAxisPX); + __m128 YAxisPY = _mm_set1_ps(T.YAxisPY); + + __m128 LayerWidth = _mm_set1_ps(T.LayerWidth); + __m128i LayerWidthMinusOne = _mm_set1_epi32(T.LayerWidth - 1); + __m128i FullLayerWidth4i = _mm_set1_epi32(T.FullLayerWidth*4); + __m128 LayerHeight = _mm_set1_ps(T.LayerHeight); + __m128i LayerHeightMinusOne = _mm_set1_epi32(T.LayerHeight - 1); + __m128 LayerOpacity = _mm_set1_ps(T.LayerOpacity); + __m128 OriginX = _mm_set1_ps(T.OriginX); + __m128 OriginY = _mm_set1_ps(T.OriginY); + + __m128 One = _mm_set1_ps(1); + __m128 Zero = _mm_set1_ps(0); + __m128i Zeroi = _mm_set1_epi32(0); + __m128i Onei = _mm_set1_epi32(1); + __m128 Four = _mm_set1_ps(4); + __m128 Sixteen = _mm_set1_ps(16); + __m128i FF = _mm_set1_epi32(0xFF); + __m128i BottomTwoBits = _mm_set1_epi32(0x03); + __m128i Fouri = _mm_set1_epi32(4); + __m128i Sixteeni = _mm_set1_epi32(16); + __m128 Reg255 = _mm_set1_ps(255.0f); + __m128i Int255 = _mm_set1_epi32(255); + __m128 Norm255 = _mm_set1_ps(1/255.0f); + + // NOTE(fox): Each loop operates on 4 pixels, 4 horizontal by 1 vertical. + + for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++) + { + __m128 PixelX = _mm_setr_ps((real32)LayerBounds.Min.x, + (real32)LayerBounds.Min.x+1, + (real32)LayerBounds.Min.x+2, + (real32)LayerBounds.Min.x+3); + + __m128 PixelY = _mm_set1_ps((real32)Y); + __m128 StartVectorY = _mm_sub_ps(PixelY, OriginY); + + for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4) + { + IACA_START; + + __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX); + + uint32 XLookup = (X >> 2)*16 + (X % 4); + uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4; + uint32 PixelToSeek = XLookup + YLookup; + uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel; + + __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY)); + __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY)); + + __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)), + _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One)))); + + if (_mm_movemask_epi8(LayerMask)) + { + U = _mm_max_ps(_mm_min_ps(One, U), Zero); + V = _mm_max_ps(_mm_min_ps(One, V), Zero); + + __m128 TexXFull = _mm_mul_ps(U, LayerWidth); + __m128 TexYFull = _mm_mul_ps(V, LayerHeight); + __m128i TexXInt = _mm_cvttps_epi32(TexXFull); + __m128i TexXIntPlusOne = _mm_add_epi32(TexXInt, _mm_and_si128(_mm_cmplt_epi32(TexXInt, LayerWidthMinusOne), Onei)); + __m128i TexYInt = _mm_cvttps_epi32(TexYFull); + __m128i TexYIntPlusOne = _mm_add_epi32(TexYInt, _mm_and_si128(_mm_cmplt_epi32(TexYInt, LayerHeightMinusOne), Onei)); + + __m128 TexX = _mm_sub_ps(TexXFull, _mm_cvtepi32_ps(TexXInt)); + __m128 TexY = _mm_sub_ps(TexYFull, _mm_cvtepi32_ps(TexYInt)); + __m128 TexXInv = _mm_sub_ps(One, TexX); + __m128 TexYInv = _mm_sub_ps(One, TexY); + __m128 TexBothXInv = _mm_mul_ps(TexXInv, TexY); + __m128 TexBothYInv = _mm_mul_ps(TexX, TexYInv); + __m128 TexBoth = _mm_mul_ps(TexY, TexX); + __m128 TexBothInv = _mm_mul_ps(TexXInv, TexYInv); + + __m128i XLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXInt, 2), Sixteeni), + _mm_and_si128(TexXInt, BottomTwoBits)); + __m128i YLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYInt, 2), FullLayerWidth4i), + _mm_mullo_epi32(_mm_and_si128(TexYInt, BottomTwoBits), Fouri)); + __m128i XLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXIntPlusOne, 2), Sixteeni), + _mm_and_si128(TexXIntPlusOne, BottomTwoBits)); + __m128i YLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i), + _mm_mullo_epi32(_mm_and_si128(TexYIntPlusOne, BottomTwoBits), Fouri)); + + __m128i PixelLookupTL = _mm_add_epi32(XLookup, YLookup); + __m128i PixelLookupTR = _mm_add_epi32(XLookupPlusOne, YLookup); + __m128i PixelLookupBL = _mm_add_epi32(XLookup, YLookupPlusOne); + __m128i PixelLookupBR = _mm_add_epi32(XLookupPlusOne, YLookupPlusOne); + + // SSE lacks gathering, so we have no choice but to manually + // look up each pixel's four bilinear samples in scalar. + + uint32 S_PixelLookupTL0 = _mm_cvtsi128_si32(PixelLookupTL); + uint32 S_PixelLookupTR0 = _mm_cvtsi128_si32(PixelLookupTR); + uint32 S_PixelLookupBL0 = _mm_cvtsi128_si32(PixelLookupBL); + uint32 S_PixelLookupBR0 = _mm_cvtsi128_si32(PixelLookupBR); + uint32 S_PixelsTL0 = *(uint32 *)(TexPTR + S_PixelLookupTL0*4); + uint32 S_PixelsTR0 = *(uint32 *)(TexPTR + S_PixelLookupTR0*4); + uint32 S_PixelsBL0 = *(uint32 *)(TexPTR + S_PixelLookupBL0*4); + uint32 S_PixelsBR0 = *(uint32 *)(TexPTR + S_PixelLookupBR0*4); + + uint32 S_PixelLookupTL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 4)); + uint32 S_PixelLookupTR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 4)); + uint32 S_PixelLookupBL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 4)); + uint32 S_PixelLookupBR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 4)); + uint32 S_PixelsTL1 = *(uint32 *)(TexPTR + S_PixelLookupTL1*4); + uint32 S_PixelsTR1 = *(uint32 *)(TexPTR + S_PixelLookupTR1*4); + uint32 S_PixelsBL1 = *(uint32 *)(TexPTR + S_PixelLookupBL1*4); + uint32 S_PixelsBR1 = *(uint32 *)(TexPTR + S_PixelLookupBR1*4); + + uint32 S_PixelLookupTL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 8)); + uint32 S_PixelLookupTR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 8)); + uint32 S_PixelLookupBL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 8)); + uint32 S_PixelLookupBR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 8)); + uint32 S_PixelsTL2 = *(uint32 *)(TexPTR + S_PixelLookupTL2*4); + uint32 S_PixelsTR2 = *(uint32 *)(TexPTR + S_PixelLookupTR2*4); + uint32 S_PixelsBL2 = *(uint32 *)(TexPTR + S_PixelLookupBL2*4); + uint32 S_PixelsBR2 = *(uint32 *)(TexPTR + S_PixelLookupBR2*4); + + uint32 S_PixelLookupTL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 12)); + uint32 S_PixelLookupTR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 12)); + uint32 S_PixelLookupBL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 12)); + uint32 S_PixelLookupBR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 12)); + uint32 S_PixelsTL3 = *(uint32 *)(TexPTR + S_PixelLookupTL3*4); + uint32 S_PixelsTR3 = *(uint32 *)(TexPTR + S_PixelLookupTR3*4); + uint32 S_PixelsBL3 = *(uint32 *)(TexPTR + S_PixelLookupBL3*4); + uint32 S_PixelsBR3 = *(uint32 *)(TexPTR + S_PixelLookupBR3*4); + + __m128i PixelsTL = _mm_setr_epi32(S_PixelsTL0, S_PixelsTL1, S_PixelsTL2, S_PixelsTL3); + __m128i PixelsTR = _mm_setr_epi32(S_PixelsTR0, S_PixelsTR1, S_PixelsTR2, S_PixelsTR3); + __m128i PixelsBL = _mm_setr_epi32(S_PixelsBL0, S_PixelsBL1, S_PixelsBL2, S_PixelsBL3); + __m128i PixelsBR = _mm_setr_epi32(S_PixelsBR0, S_PixelsBR1, S_PixelsBR2, S_PixelsBR3); + + __m128i R_TexTL = _mm_and_si128( PixelsTL, FF); + __m128i G_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 8), FF); + __m128i B_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF); + __m128i A_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF); + + __m128i R_TexTR = _mm_and_si128( PixelsTR, FF); + __m128i G_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 8), FF); + __m128i B_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF); + __m128i A_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF); + + __m128i R_TexBL = _mm_and_si128( PixelsBL, FF); + __m128i G_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 8), FF); + __m128i B_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF); + __m128i A_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF); + + __m128i R_TexBR = _mm_and_si128( PixelsBR, FF); + __m128i G_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 8), FF); + __m128i B_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF); + __m128i A_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF); + + __m128 R_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(R_TexTL)), + _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(R_TexTR))), + _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(R_TexBL)), + _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(R_TexBR)))); + __m128 G_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(G_TexTL)), + _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(G_TexTR))), + _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(G_TexBL)), + _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(G_TexBR)))); + __m128 B_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(B_TexTL)), + _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(B_TexTR))), + _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(B_TexBL)), + _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(B_TexBR)))); + __m128 A_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(A_TexTL)), + _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(A_TexTR))), + _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(A_TexBL)), + _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(A_TexBR)))); + + A_PixelBlend = _mm_sub_ps(A_PixelBlend, _mm_mul_ps(A_PixelBlend, LayerOpacity)); + + __m128i R_Out, G_Out, B_Out, A_Out; + // Only do alpha blending if a pixel's value doesn't equal 255 + if (_mm_movemask_epi8(_mm_sub_epi32(_mm_cvtps_epi32(A_PixelBlend), Int255))) + { + __m128 LayerAlpha = _mm_mul_ps(A_PixelBlend, Norm255); + __m128 LayerAlphaInv = _mm_mul_ps(_mm_sub_ps(Reg255, A_PixelBlend), Norm255); + + __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel); + __m128i R_Dest = _mm_and_si128( DestPixel, FF); + __m128i G_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 8), FF); + __m128i B_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF); + __m128i A_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF); + + R_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm_mul_ps(R_PixelBlend, LayerAlpha))); + G_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm_mul_ps(G_PixelBlend, LayerAlpha))); + B_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm_mul_ps(B_PixelBlend, LayerAlpha))); + A_Out = _mm_cvtps_epi32(_mm_min_ps(_mm_add_ps(_mm_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255)); + } + else + { + R_Out = _mm_cvtps_epi32(R_PixelBlend); + G_Out = _mm_cvtps_epi32(G_PixelBlend); + B_Out = _mm_cvtps_epi32(B_PixelBlend); + A_Out = _mm_cvtps_epi32(A_PixelBlend); + } + + __m128i OutputPixel = _mm_or_si128( + _mm_or_si128(R_Out, _mm_slli_epi32(G_Out, 8)), + _mm_or_si128(_mm_slli_epi32(B_Out, 16), _mm_slli_epi32(A_Out, 24))); + _mm_maskmoveu_si128(OutputPixel, LayerMask, (char *)Pixel); + } + PixelX = _mm_add_ps(PixelX, Four); + } + } +} + + #endif internal void @@ -595,25 +834,17 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg // uint32 pp2 = 3; // bool32 real = true; - for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y += 2) + for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++) { -#if PACKEDRGB -#else - uint8 *Pixel = (uint8 *)Row + (uint16)LayerBounds.Min.x; -#endif - real32 StartVectorY[2]; - StartVectorY[0] = (real32)Y - T.OriginY; - StartVectorY[1] = (real32)(Y+1) - T.OriginY; + real32 StartVectorY = (real32)Y - T.OriginY; for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++) { - for (int16 i = 0; i < 2; i++) - { IACA_START; real32 StartVectorX = X - T.OriginX; - real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY[i] * T.XAxisPY); - real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY[i] * T.YAxisPY); + real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY); + real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY); if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) { real32 TexXFull = U * T.LayerWidth; @@ -631,7 +862,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg real32 TexBoth = TexY * TexX; real32 TexBothInv = TexXInv * TexYInv; -#if PACKEDRGB #if 0 uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt*Buffer->BytesPerPixel); uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt*Buffer->BytesPerPixel); @@ -641,35 +871,34 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg uint32 PixelC = *(uint32 *)TexPTR1; uint32 PixelD = *((uint32 *)TexPTR1 + 1); #else - uint16 LX, LY; uint32 XLookup, YLookup, PixelToSeek; - // TODO(fox): Be careful with the BytesPerPixel here! It's the buffer's, not the layer's! - LX = TexXInt; - LY = TexYInt; + // TODO(fox): Anti-aliasing on edges + uint16 LX = TexXInt; + uint16 LY = TexYInt; + uint16 LXPlus = Ceil(TexXInt+1, (uint32)T.LayerWidth - 1); + uint16 LYPlus = Ceil(TexYInt+1, (uint32)T.LayerHeight - 1); + + // TODO(fox): Be careful with the BytesPerPixel here! It's the + // buffer's, not the layer's (currently everything is 4 bytes + // per pixel). XLookup = (LX >> 2)*16 + (LX % 4); - YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelA = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); - LX = TexXInt+1; - LY = TexYInt; - XLookup = (LX >> 2)*16 + (LX % 4); - YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + XLookup = (LXPlus >> 2)*16 + (LXPlus % 4); + YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelB = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); - LX = TexXInt; - LY = TexYInt+1; XLookup = (LX >> 2)*16 + (LX % 4); - YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelC = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); - LX = TexXInt+1; - LY = TexYInt+1; - XLookup = (LX >> 2)*16 + (LX % 4); - YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + XLookup = (LXPlus >> 2)*16 + (LXPlus % 4); + YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); #endif @@ -693,30 +922,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg uint8 TexAB = ((PixelB >> 24) & 0xFF); uint8 TexAC = ((PixelC >> 24) & 0xFF); uint8 TexAD = ((PixelD >> 24) & 0xFF); -#else - uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt); - uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt); - - uint8 TexRA = *TexPTR0; - uint8 TexRB = *(TexPTR0 + 1); - uint8 TexRC = *TexPTR1; - uint8 TexRD = *(TexPTR1 + 1); - - uint8 TexGA = *(TexPTR0 + Channel); - uint8 TexGB = *(TexPTR0 + 1 + Channel); - uint8 TexGC = *(TexPTR1 + Channel); - uint8 TexGD = *(TexPTR1 + 1 + Channel); - - uint8 TexBA = *(TexPTR0 + Channel*2); - uint8 TexBB = *(TexPTR0 + 1 + Channel*2); - uint8 TexBC = *(TexPTR1 + Channel*2); - uint8 TexBD = *(TexPTR1 + 1 + Channel*2); - - uint8 TexAA = *(TexPTR0 + Channel*3); - uint8 TexAB = *(TexPTR0 + 1 + Channel*3); - uint8 TexAC = *(TexPTR1 + Channel*3); - uint8 TexAD = *(TexPTR1 + 1 + Channel*3); -#endif real32 PixelBlendR = (TexBothInv * TexRA) + (TexBothYInv * TexRB) + (TexBothXInv * TexRC) + (TexBoth * TexRD); @@ -733,9 +938,9 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg uint8 B = (uint8)PixelBlendB; uint8 A = (uint8)PixelBlendA; -#if PACKEDRGB XLookup = (X >> 2)*16 + (X % 4); - YLookup = ((Y+i) >> 2)*(Buffer->Width*4) + ((Y+i) % 4)*4; + YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4; + // if (real) { // real = false; // printf("XLook: %i, YLook: %i\n", XLookup, YLookup); @@ -748,16 +953,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg uint8 G1 = (*Pixel >> 8); uint8 B1 = (*Pixel >> 16); uint8 A1 = (*Pixel >> 24); -#else - uint8 *RD = Pixel; - uint8 *GD = Pixel + Buffer->Channel; - uint8 *BD = Pixel + Buffer->Channel*2; - uint8 *AD = Pixel + Buffer->Channel*3; - uint8 R1 = *RD; - uint8 G1 = *GD; - uint8 B1 = *BD; - uint8 A1 = *AD; -#endif if (A != 255) { real32 LayerAlpha = (255 - A) / 255.0f; @@ -767,23 +962,11 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg A = ClipAdd(A1, A); } -#if PACKEDRGB *Pixel = ((A << 24) | (B << 16) | (G << 8) | (R << 0)); } - } - } -#else - *RD = R; - *GD = G; - *BD = B; - *AD = A; - } - Pixel++; } - Row += Buffer->Pitch*2; -#endif } } -- cgit v1.2.3