summaryrefslogtreecommitdiff
path: root/prenderer.cpp
diff options
context:
space:
mode:
author Fox Caminiti <fox@foxcam.net> 2022-07-27 11:00:45 -0400
committer Fox Caminiti <fox@foxcam.net> 2022-07-27 11:00:45 -0400
commit 83ce428d8bb5f4a762abf879adec076bc34cf36a (patch)
tree c1500f027d9eec514ba1a2912e7a4763e7be26b2 /prenderer.cpp
parent c6bd84c356b6aaa029b9708d7b99a4aba1673b6b (diff)
full support for odd-dimension bitmaps and comps
Diffstat (limited to 'prenderer.cpp')
-rw-r--r-- prenderer.cpp 379
1 files changed, 281 insertions, 98 deletions
diff --git a/prenderer.cpp b/prenderer.cpp
index 4d4152d..356ecd7 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -7,11 +7,14 @@ RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegi
internal void
AVX2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion);
internal void
+SSE2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion);
+internal void
Fallback_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion);
internal bool32
CheckQueue(render_queue RenderInfo, uint16 Index);
+// for the anchor point moving UI
internal void
CalculateAnchorOffset(project_layer *Layer, real32 Value, uint16 Dir)
{
@@ -76,12 +79,14 @@ CalculateTransforms(project_layer *Layer, pixel_buffer *Buffer)
TransformInfo.YAxisPY = YLengthSq*YAxis.y;
TransformInfo.LayerWidth = (real32)Source->Raster.Width;
TransformInfo.LayerHeight = (real32)Source->Raster.Height;
+ TransformInfo.FullLayerWidth = Source->Raster.FullWidth;
+ TransformInfo.FullLayerHeight = Source->Raster.FullHeight;
TransformInfo.LayerOpacity = 1.0f - Layer->opacity.CurrentValue.f;
TransformInfo.OriginX = Origin.x;
TransformInfo.OriginY = Origin.y;
TransformInfo.BufferPitch = Buffer->Pitch;
TransformInfo.LayerPitch = Source->Raster.Pitch;
- TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX, MaxY};
+ TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX + 1, MaxY + 1};
TransformInfo.SourceBuffer = Source->Raster.EffectBuffer;
@@ -115,6 +120,19 @@ EndRenderState(project_state *State)
}
internal void
+RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) { // Draws each queued layer into RenderInfo->CompBuffer, in LayersToRender order, restricted to RenderRegion.
+ for (int16 i = 0; i < RenderInfo->State->NumberOfLayersToRender; i++) {
+ int16 Idx = RenderInfo->State->LayersToRender[i]; // index into File->Layer for the i-th queued layer
+ if (InstructionMode == avx_enabled) // InstructionMode is defined outside this function; it selects the rasterizer
+ AVX2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+ else if (InstructionMode == sse_enabled)
+ SSE2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+ else // any other mode takes the scalar path
+ Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+ } // NOTE(review): the inline loop this replaces had an `#if ARM` branch calling RenderLayerNeon; none exists here - confirm ARM builds are handled elsewhere
+}
+
+internal void
QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *State)
{
IsRendering = true;
@@ -163,18 +181,7 @@ QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *S
// DEBUG_CycleCountStart(3);
rectangle RenderRegion = {0, 0, (int32)CompBuffer->Width, (int32)CompBuffer->Height};
- for (int16 i = 0; i < RenderInfo.State->NumberOfLayersToRender; i++) {
- int16 Idx = RenderInfo.State->LayersToRender[i];
-#if ARM
- RenderLayerNeon(RenderInfo.File->Layer[Idx], RenderInfo.CompBuffer, RenderRegion);
-#else
- // RenderLayerSSE(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion);
- if (AVXEnabled)
- AVX2_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion);
- else
- Fallback_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion);
-#endif
- }
+ RenderLayers(&RenderInfo, RenderRegion);
// DEBUG_CycleCountEnd(3);
// Debug.ExecutionAmount[4] += 1280*720;
@@ -378,6 +385,7 @@ RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegi
}
#else
+
internal void
AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
{
@@ -397,7 +405,9 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256 YAxisPY = _mm256_set1_ps(T.YAxisPY);
__m256 LayerWidth = _mm256_set1_ps(T.LayerWidth);
- __m256i LayerWidth4i = _mm256_set1_epi32(T.LayerWidth*4);
+ __m256i FullLayerWidth4i = _mm256_set1_epi32(T.FullLayerWidth*4);
+ __m256i LayerWidthMinusOne = _mm256_set1_epi32(T.LayerWidth - 1);
+ __m256i LayerHeightMinusOne = _mm256_set1_epi32(T.LayerHeight - 1);
__m256 LayerHeight = _mm256_set1_ps(T.LayerHeight);
__m256 LayerOpacity = _mm256_set1_ps(T.LayerOpacity);
__m256 OriginX = _mm256_set1_ps(T.OriginX);
@@ -451,7 +461,7 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
uint32 XLookup = (X >> 2)*16 + (X % 4);
- uint32 YLookup = (Y >> 2)*(Buffer->Width*4) + (Y % 4)*4;
+ uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4;
uint32 PixelToSeek = XLookup + YLookup;
uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel;
@@ -461,6 +471,8 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)),
_mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2))));
+ // If all of the pixels are zeroed in the mask (aka fall outside
+ // the UV lookup), we can skip the iteration.
if (_mm256_movemask_epi8(LayerMask))
{
U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
@@ -469,9 +481,10 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256 TexXFull = _mm256_mul_ps(U, LayerWidth);
__m256 TexYFull = _mm256_mul_ps(V, LayerHeight);
__m256i TexXInt = _mm256_cvttps_epi32(TexXFull);
- __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, Onei);
+ __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
__m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
- __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, Onei);
+ __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
+ // NOTE(fox): The comparison is for when we're on the last pixel.
__m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
__m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt));
@@ -484,11 +497,11 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
__m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni),
_mm256_and_si256(TexXInt, BottomTwoBits));
- __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), LayerWidth4i),
+ __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), FullLayerWidth4i),
_mm256_mullo_epi32(_mm256_and_si256(TexYInt, BottomTwoBits), Fouri));
__m256i XLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
_mm256_and_si256(TexXIntPlusOne, BottomTwoBits));
- __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), LayerWidth4i),
+ __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
_mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri));
__m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup);
@@ -571,13 +584,239 @@ AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
_mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
_mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
- __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask);
- _mm256_storeu_si256((__m256i *)Pixel, PixelsMask);
+ // __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask);
+ _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel);
}
PixelX = _mm256_add_ps(PixelX, Four);
}
}
}
+
+internal void
+SSE2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) // 128-bit SIMD rasterizer: bilinear-samples T.SourceBuffer and alpha-blends it into Buffer, 4 pixels per iteration.
+{
+ rectangle LayerBounds = ClipRectangle( T.ClipRect,
+ RenderRegion );
+ // Remember: since bitmaps are packed in 4x4 blocks, we always need to be aligned.
+ LayerBounds.Min.x -= LayerBounds.Min.x % 4;
+ LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+
+ uint8 *TexPTR = (uint8 *)T.SourceBuffer; // byte pointer to the layer's source texels
+ Assert(LayerBounds.Max.x <= Buffer->Width);
+ Assert(LayerBounds.Max.y <= Buffer->Height);
+
+ __m128 XAxisPX = _mm_set1_ps(T.XAxisPX); // the four axis terms map a pixel offset from the origin to U/V below
+ __m128 XAxisPY = _mm_set1_ps(T.XAxisPY);
+ __m128 YAxisPX = _mm_set1_ps(T.YAxisPX);
+ __m128 YAxisPY = _mm_set1_ps(T.YAxisPY);
+
+ __m128 LayerWidth = _mm_set1_ps(T.LayerWidth);
+ __m128i LayerWidthMinusOne = _mm_set1_epi32(T.LayerWidth - 1); // float expression, implicitly converted to int for the intrinsic
+ __m128i FullLayerWidth4i = _mm_set1_epi32(T.FullLayerWidth*4); // pixels per row of 4x4 tiles in the source
+ __m128 LayerHeight = _mm_set1_ps(T.LayerHeight);
+ __m128i LayerHeightMinusOne = _mm_set1_epi32(T.LayerHeight - 1);
+ __m128 LayerOpacity = _mm_set1_ps(T.LayerOpacity); // CalculateTransforms stores this as 1.0f - opacity
+ __m128 OriginX = _mm_set1_ps(T.OriginX);
+ __m128 OriginY = _mm_set1_ps(T.OriginY);
+
+ __m128 One = _mm_set1_ps(1);
+ __m128 Zero = _mm_set1_ps(0);
+ __m128i Zeroi = _mm_set1_epi32(0); // NOTE(review): not referenced anywhere in this function
+ __m128i Onei = _mm_set1_epi32(1);
+ __m128 Four = _mm_set1_ps(4);
+ __m128 Sixteen = _mm_set1_ps(16); // NOTE(review): not referenced anywhere in this function
+ __m128i FF = _mm_set1_epi32(0xFF);
+ __m128i BottomTwoBits = _mm_set1_epi32(0x03);
+ __m128i Fouri = _mm_set1_epi32(4);
+ __m128i Sixteeni = _mm_set1_epi32(16);
+ __m128 Reg255 = _mm_set1_ps(255.0f);
+ __m128i Int255 = _mm_set1_epi32(255);
+ __m128 Norm255 = _mm_set1_ps(1/255.0f);
+
+ // NOTE(fox): Each loop operates on 4 pixels, 4 horizontal by 1 vertical.
+
+ for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
+ {
+ __m128 PixelX = _mm_setr_ps((real32)LayerBounds.Min.x, // lane i holds the X coordinate Min.x + i
+ (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2,
+ (real32)LayerBounds.Min.x+3);
+
+ __m128 PixelY = _mm_set1_ps((real32)Y);
+ __m128 StartVectorY = _mm_sub_ps(PixelY, OriginY);
+
+ for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4) // Min.x was aligned above, so X stays a multiple of 4
+ {
+ IACA_START;
+
+ __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);
+
+ uint32 XLookup = (X >> 2)*16 + (X % 4); // destination index in the 4x4-tiled layout: 16 pixels per tile
+ uint32 YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4; // tile row, then row within the tile (4 pixels each)
+ uint32 PixelToSeek = XLookup + YLookup;
+ uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel;
+
+ __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY));
+ __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY));
+
+ __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)), // all-ones in lanes where 0 <= U <= 1 and 0 <= V <= 1
+ _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One))));
+
+ if (_mm_movemask_epi8(LayerMask)) // skip the group when no lane lands inside the layer
+ {
+ U = _mm_max_ps(_mm_min_ps(One, U), Zero); // clamp to [0,1]; the texel loads below run for every lane, masked or not
+ V = _mm_max_ps(_mm_min_ps(One, V), Zero);
+
+ __m128 TexXFull = _mm_mul_ps(U, LayerWidth);
+ __m128 TexYFull = _mm_mul_ps(V, LayerHeight);
+ __m128i TexXInt = _mm_cvttps_epi32(TexXFull); // NOTE(review): U == 1 yields TexXInt == LayerWidth, one past the last column - confirm the source padding covers it
+ __m128i TexXIntPlusOne = _mm_add_epi32(TexXInt, _mm_and_si128(_mm_cmplt_epi32(TexXInt, LayerWidthMinusOne), Onei)); // +1 only while TexXInt < LayerWidth - 1
+ __m128i TexYInt = _mm_cvttps_epi32(TexYFull);
+ __m128i TexYIntPlusOne = _mm_add_epi32(TexYInt, _mm_and_si128(_mm_cmplt_epi32(TexYInt, LayerHeightMinusOne), Onei)); // +1 only while TexYInt < LayerHeight - 1
+
+ __m128 TexX = _mm_sub_ps(TexXFull, _mm_cvtepi32_ps(TexXInt)); // fractional parts, used as the bilinear weights
+ __m128 TexY = _mm_sub_ps(TexYFull, _mm_cvtepi32_ps(TexYInt));
+ __m128 TexXInv = _mm_sub_ps(One, TexX);
+ __m128 TexYInv = _mm_sub_ps(One, TexY);
+ __m128 TexBothXInv = _mm_mul_ps(TexXInv, TexY); // weight applied to the BL sample
+ __m128 TexBothYInv = _mm_mul_ps(TexX, TexYInv); // weight applied to the TR sample
+ __m128 TexBoth = _mm_mul_ps(TexY, TexX); // weight applied to the BR sample
+ __m128 TexBothInv = _mm_mul_ps(TexXInv, TexYInv); // weight applied to the TL sample
+
+ __m128i XLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXInt, 2), Sixteeni), // NOTE(review): _mm_mullo_epi32 is an SSE4.1 intrinsic - confirm sse_enabled implies SSE4.1
+ _mm_and_si128(TexXInt, BottomTwoBits));
+ __m128i YLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYInt, 2), FullLayerWidth4i),
+ _mm_mullo_epi32(_mm_and_si128(TexYInt, BottomTwoBits), Fouri));
+ __m128i XLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
+ _mm_and_si128(TexXIntPlusOne, BottomTwoBits));
+ __m128i YLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
+ _mm_mullo_epi32(_mm_and_si128(TexYIntPlusOne, BottomTwoBits), Fouri));
+
+ __m128i PixelLookupTL = _mm_add_epi32(XLookup, YLookup); // tiled pixel indices of the four bilinear neighbours
+ __m128i PixelLookupTR = _mm_add_epi32(XLookupPlusOne, YLookup);
+ __m128i PixelLookupBL = _mm_add_epi32(XLookup, YLookupPlusOne);
+ __m128i PixelLookupBR = _mm_add_epi32(XLookupPlusOne, YLookupPlusOne);
+
+ // SSE lacks gathering, so we have no choice but to manually
+ // look up each pixel's four bilinear samples in scalar.
+
+ uint32 S_PixelLookupTL0 = _mm_cvtsi128_si32(PixelLookupTL); // lane 0
+ uint32 S_PixelLookupTR0 = _mm_cvtsi128_si32(PixelLookupTR);
+ uint32 S_PixelLookupBL0 = _mm_cvtsi128_si32(PixelLookupBL);
+ uint32 S_PixelLookupBR0 = _mm_cvtsi128_si32(PixelLookupBR);
+ uint32 S_PixelsTL0 = *(uint32 *)(TexPTR + S_PixelLookupTL0*4); // *4: source texels are read as 4 bytes each
+ uint32 S_PixelsTR0 = *(uint32 *)(TexPTR + S_PixelLookupTR0*4);
+ uint32 S_PixelsBL0 = *(uint32 *)(TexPTR + S_PixelLookupBL0*4);
+ uint32 S_PixelsBR0 = *(uint32 *)(TexPTR + S_PixelLookupBR0*4);
+
+ uint32 S_PixelLookupTL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 4)); // lane 1: byte-shift it down to lane 0 first
+ uint32 S_PixelLookupTR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 4));
+ uint32 S_PixelLookupBL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 4));
+ uint32 S_PixelLookupBR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 4));
+ uint32 S_PixelsTL1 = *(uint32 *)(TexPTR + S_PixelLookupTL1*4);
+ uint32 S_PixelsTR1 = *(uint32 *)(TexPTR + S_PixelLookupTR1*4);
+ uint32 S_PixelsBL1 = *(uint32 *)(TexPTR + S_PixelLookupBL1*4);
+ uint32 S_PixelsBR1 = *(uint32 *)(TexPTR + S_PixelLookupBR1*4);
+
+ uint32 S_PixelLookupTL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 8)); // lane 2
+ uint32 S_PixelLookupTR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 8));
+ uint32 S_PixelLookupBL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 8));
+ uint32 S_PixelLookupBR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 8));
+ uint32 S_PixelsTL2 = *(uint32 *)(TexPTR + S_PixelLookupTL2*4);
+ uint32 S_PixelsTR2 = *(uint32 *)(TexPTR + S_PixelLookupTR2*4);
+ uint32 S_PixelsBL2 = *(uint32 *)(TexPTR + S_PixelLookupBL2*4);
+ uint32 S_PixelsBR2 = *(uint32 *)(TexPTR + S_PixelLookupBR2*4);
+
+ uint32 S_PixelLookupTL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 12)); // lane 3
+ uint32 S_PixelLookupTR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 12));
+ uint32 S_PixelLookupBL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 12));
+ uint32 S_PixelLookupBR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 12));
+ uint32 S_PixelsTL3 = *(uint32 *)(TexPTR + S_PixelLookupTL3*4);
+ uint32 S_PixelsTR3 = *(uint32 *)(TexPTR + S_PixelLookupTR3*4);
+ uint32 S_PixelsBL3 = *(uint32 *)(TexPTR + S_PixelLookupBL3*4);
+ uint32 S_PixelsBR3 = *(uint32 *)(TexPTR + S_PixelLookupBR3*4);
+
+ __m128i PixelsTL = _mm_setr_epi32(S_PixelsTL0, S_PixelsTL1, S_PixelsTL2, S_PixelsTL3); // repack the scalar loads, lane i = pixel i
+ __m128i PixelsTR = _mm_setr_epi32(S_PixelsTR0, S_PixelsTR1, S_PixelsTR2, S_PixelsTR3);
+ __m128i PixelsBL = _mm_setr_epi32(S_PixelsBL0, S_PixelsBL1, S_PixelsBL2, S_PixelsBL3);
+ __m128i PixelsBR = _mm_setr_epi32(S_PixelsBR0, S_PixelsBR1, S_PixelsBR2, S_PixelsBR3);
+
+ __m128i R_TexTL = _mm_and_si128( PixelsTL, FF); // unpack channels: R in bits 0-7, G 8-15, B 16-23, A 24-31
+ __m128i G_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 8), FF);
+ __m128i B_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF);
+ __m128i A_TexTL = _mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF);
+
+ __m128i R_TexTR = _mm_and_si128( PixelsTR, FF);
+ __m128i G_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 8), FF);
+ __m128i B_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF);
+ __m128i A_TexTR = _mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF);
+
+ __m128i R_TexBL = _mm_and_si128( PixelsBL, FF);
+ __m128i G_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 8), FF);
+ __m128i B_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF);
+ __m128i A_TexBL = _mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF);
+
+ __m128i R_TexBR = _mm_and_si128( PixelsBR, FF);
+ __m128i G_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 8), FF);
+ __m128i B_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF);
+ __m128i A_TexBR = _mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF);
+
+ __m128 R_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(R_TexTL)), // bilinear blend: weighted sum of the four samples, per channel
+ _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(R_TexTR))),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(R_TexBL)),
+ _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(R_TexBR))));
+ __m128 G_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(G_TexTL)),
+ _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(G_TexTR))),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(G_TexBL)),
+ _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(G_TexBR))));
+ __m128 B_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(B_TexTL)),
+ _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(B_TexTR))),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(B_TexBL)),
+ _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(B_TexBR))));
+ __m128 A_PixelBlend = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, _mm_cvtepi32_ps(A_TexTL)),
+ _mm_mul_ps(TexBothYInv, _mm_cvtepi32_ps(A_TexTR))),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, _mm_cvtepi32_ps(A_TexBL)),
+ _mm_mul_ps(TexBoth, _mm_cvtepi32_ps(A_TexBR))));
+
+ A_PixelBlend = _mm_sub_ps(A_PixelBlend, _mm_mul_ps(A_PixelBlend, LayerOpacity)); // A *= (1 - LayerOpacity)
+
+ __m128i R_Out, G_Out, B_Out, A_Out;
+ // Only do alpha blending if a pixel's value doesn't equal 255
+ if (_mm_movemask_epi8(_mm_sub_epi32(_mm_cvtps_epi32(A_PixelBlend), Int255))) // movemask reads each byte's top bit: alpha below 255 makes (alpha - 255) negative and sets it
+ {
+ __m128 LayerAlpha = _mm_mul_ps(A_PixelBlend, Norm255); // layer alpha scaled to 0..1
+ __m128 LayerAlphaInv = _mm_mul_ps(_mm_sub_ps(Reg255, A_PixelBlend), Norm255);
+
+ __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel); // the 4 destination pixels already in the buffer
+ __m128i R_Dest = _mm_and_si128( DestPixel, FF);
+ __m128i G_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 8), FF);
+ __m128i B_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF);
+ __m128i A_Dest = _mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF);
+
+ R_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm_mul_ps(R_PixelBlend, LayerAlpha))); // dest*(1 - a) + layer*a
+ G_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm_mul_ps(G_PixelBlend, LayerAlpha)));
+ B_Out = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm_mul_ps(B_PixelBlend, LayerAlpha)));
+ A_Out = _mm_cvtps_epi32(_mm_min_ps(_mm_add_ps(_mm_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255)); // alphas add, saturating at 255
+ }
+ else
+ {
+ R_Out = _mm_cvtps_epi32(R_PixelBlend); // every lane has alpha 255: write the layer color as-is
+ G_Out = _mm_cvtps_epi32(G_PixelBlend);
+ B_Out = _mm_cvtps_epi32(B_PixelBlend);
+ A_Out = _mm_cvtps_epi32(A_PixelBlend);
+ }
+
+ __m128i OutputPixel = _mm_or_si128( // repack channels into one 32-bit value per lane
+ _mm_or_si128(R_Out, _mm_slli_epi32(G_Out, 8)),
+ _mm_or_si128(_mm_slli_epi32(B_Out, 16), _mm_slli_epi32(A_Out, 24)));
+ _mm_maskmoveu_si128(OutputPixel, LayerMask, (char *)Pixel); // byte-masked store: only lanes set in LayerMask are written
+ }
+ PixelX = _mm_add_ps(PixelX, Four); // advance all four X coordinates to the next group
+ }
+ }
+}
+
+
#endif
internal void
@@ -595,25 +834,17 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
// uint32 pp2 = 3;
// bool32 real = true;
- for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y += 2)
+ for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
{
-#if PACKEDRGB
-#else
- uint8 *Pixel = (uint8 *)Row + (uint16)LayerBounds.Min.x;
-#endif
- real32 StartVectorY[2];
- StartVectorY[0] = (real32)Y - T.OriginY;
- StartVectorY[1] = (real32)(Y+1) - T.OriginY;
+ real32 StartVectorY = (real32)Y - T.OriginY;
for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++)
{
- for (int16 i = 0; i < 2; i++)
- {
IACA_START;
real32 StartVectorX = X - T.OriginX;
- real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY[i] * T.XAxisPY);
- real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY[i] * T.YAxisPY);
+ real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
+ real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY);
if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) {
real32 TexXFull = U * T.LayerWidth;
@@ -631,7 +862,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
real32 TexBoth = TexY * TexX;
real32 TexBothInv = TexXInv * TexYInv;
-#if PACKEDRGB
#if 0
uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt*Buffer->BytesPerPixel);
uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt*Buffer->BytesPerPixel);
@@ -641,35 +871,34 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
uint32 PixelC = *(uint32 *)TexPTR1;
uint32 PixelD = *((uint32 *)TexPTR1 + 1);
#else
- uint16 LX, LY;
uint32 XLookup, YLookup, PixelToSeek;
- // TODO(fox): Be careful with the BytesPerPixel here! It's the buffer's, not the layer's!
- LX = TexXInt;
- LY = TexYInt;
+ // TODO(fox): Anti-aliasing on edges
+ uint16 LX = TexXInt;
+ uint16 LY = TexYInt;
+ uint16 LXPlus = Ceil(TexXInt+1, (uint32)T.LayerWidth - 1);
+ uint16 LYPlus = Ceil(TexYInt+1, (uint32)T.LayerHeight - 1);
+
+ // TODO(fox): Be careful with the BytesPerPixel here! It's the
+ // buffer's, not the layer's (currently everything is 4 bytes
+ // per pixel).
XLookup = (LX >> 2)*16 + (LX % 4);
- YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4;
PixelToSeek = XLookup + YLookup;
uint32 PixelA = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
- LX = TexXInt+1;
- LY = TexYInt;
- XLookup = (LX >> 2)*16 + (LX % 4);
- YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ XLookup = (LXPlus >> 2)*16 + (LXPlus % 4);
+ YLookup = (LY >> 2)*(T.FullLayerWidth*4) + (LY % 4)*4;
PixelToSeek = XLookup + YLookup;
uint32 PixelB = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
- LX = TexXInt;
- LY = TexYInt+1;
XLookup = (LX >> 2)*16 + (LX % 4);
- YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4;
PixelToSeek = XLookup + YLookup;
uint32 PixelC = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
- LX = TexXInt+1;
- LY = TexYInt+1;
- XLookup = (LX >> 2)*16 + (LX % 4);
- YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ XLookup = (LXPlus >> 2)*16 + (LXPlus % 4);
+ YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4;
PixelToSeek = XLookup + YLookup;
uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
#endif
@@ -693,30 +922,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
uint8 TexAB = ((PixelB >> 24) & 0xFF);
uint8 TexAC = ((PixelC >> 24) & 0xFF);
uint8 TexAD = ((PixelD >> 24) & 0xFF);
-#else
- uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt);
- uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt);
-
- uint8 TexRA = *TexPTR0;
- uint8 TexRB = *(TexPTR0 + 1);
- uint8 TexRC = *TexPTR1;
- uint8 TexRD = *(TexPTR1 + 1);
-
- uint8 TexGA = *(TexPTR0 + Channel);
- uint8 TexGB = *(TexPTR0 + 1 + Channel);
- uint8 TexGC = *(TexPTR1 + Channel);
- uint8 TexGD = *(TexPTR1 + 1 + Channel);
-
- uint8 TexBA = *(TexPTR0 + Channel*2);
- uint8 TexBB = *(TexPTR0 + 1 + Channel*2);
- uint8 TexBC = *(TexPTR1 + Channel*2);
- uint8 TexBD = *(TexPTR1 + 1 + Channel*2);
-
- uint8 TexAA = *(TexPTR0 + Channel*3);
- uint8 TexAB = *(TexPTR0 + 1 + Channel*3);
- uint8 TexAC = *(TexPTR1 + Channel*3);
- uint8 TexAD = *(TexPTR1 + 1 + Channel*3);
-#endif
real32 PixelBlendR = (TexBothInv * TexRA) + (TexBothYInv * TexRB)
+ (TexBothXInv * TexRC) + (TexBoth * TexRD);
@@ -733,9 +938,9 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
uint8 B = (uint8)PixelBlendB;
uint8 A = (uint8)PixelBlendA;
-#if PACKEDRGB
XLookup = (X >> 2)*16 + (X % 4);
- YLookup = ((Y+i) >> 2)*(Buffer->Width*4) + ((Y+i) % 4)*4;
+ YLookup = (Y >> 2)*(Buffer->FullWidth*4) + (Y % 4)*4;
+
// if (real) {
// real = false;
// printf("XLook: %i, YLook: %i\n", XLookup, YLookup);
@@ -748,16 +953,6 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
uint8 G1 = (*Pixel >> 8);
uint8 B1 = (*Pixel >> 16);
uint8 A1 = (*Pixel >> 24);
-#else
- uint8 *RD = Pixel;
- uint8 *GD = Pixel + Buffer->Channel;
- uint8 *BD = Pixel + Buffer->Channel*2;
- uint8 *AD = Pixel + Buffer->Channel*3;
- uint8 R1 = *RD;
- uint8 G1 = *GD;
- uint8 B1 = *BD;
- uint8 A1 = *AD;
-#endif
if (A != 255) {
real32 LayerAlpha = (255 - A) / 255.0f;
@@ -767,23 +962,11 @@ Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderReg
A = ClipAdd(A1, A);
}
-#if PACKEDRGB
*Pixel = ((A << 24) |
(B << 16) |
(G << 8) |
(R << 0));
}
- }
- }
-#else
- *RD = R;
- *GD = G;
- *BD = B;
- *AD = A;
- }
- Pixel++;
}
- Row += Buffer->Pitch*2;
-#endif
}
}