summary | refs | log | tree | commit | diff
path: root/prenderer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'prenderer.cpp')
-rw-r--r-- prenderer.cpp 263
1 files changed, 94 insertions, 169 deletions
diff --git a/prenderer.cpp b/prenderer.cpp
index a93fa90..e755fe7 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -1,39 +1,17 @@
-
-static void
-PushRect(rectangle RenderRegion);
-
-static void
-RenderLayerNeon(project_layer *Layer, comp_buffer *Buffer, rectangle RenderRegion);
static void
-AVX2_RenderLayer(transform_info TransformInfo, comp_buffer *Buffer, rectangle RenderRegion);
-static void
-SSE2_RenderLayer(transform_info TransformInfo, comp_buffer *Buffer, rectangle RenderRegion);
-static void
-Fallback_RenderLayer(transform_info TransformInfo, comp_buffer *Buffer, rectangle RenderRegion);
-
-static bool32
-CheckQueue(render_queue RenderInfo, uint16 Index);
-
-// for the anchor point moving UI
-static void
-CalculateAnchorOffset(project_layer *Layer, real32 Value, uint16 Dir)
+Layer_CalcRotatedOffset(project_layer *Layer, v2 Increment, v2 Divisor, real32 *ValueX, real32 *ValueY)
{
- source *Source = Layer->Source;
real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180));
real32 s = Layer->scale.CurrentValue.f;
- if (Dir == 0) {
- v2 XAxis = V2(cos(Rad), sin(Rad)) * (Value / s);
- Layer->x.CurrentValue.f += Value;
- Layer->ax.CurrentValue.f += XAxis.x/Source->Info.Width;
- Layer->ay.CurrentValue.f -= XAxis.y/Source->Info.Height;
- } else {
- v2 YAxis = V2(sin(Rad), -cos(Rad)) * (Value / -s);
- Layer->y.CurrentValue.f += Value;
- Layer->ax.CurrentValue.f -= YAxis.x/Source->Info.Width;
- Layer->ay.CurrentValue.f += YAxis.y/Source->Info.Height;
- }
+ v2 XAxis = V2(cos(Rad), sin(Rad)) * (Increment.x / s);
+ v2 YAxis = V2(sin(Rad), -cos(Rad)) * (Increment.y / -s);
+
+ *ValueX += XAxis.x/Divisor.x;
+ *ValueY -= XAxis.y/Divisor.y;
+ *ValueX -= YAxis.x/Divisor.x;
+ *ValueY += YAxis.y/Divisor.y;
}
static transform_info
@@ -121,10 +99,15 @@ static void
RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) {
for (int16 i = 0; i < RenderInfo->State->NumberOfLayersToRender; i++) {
int16 Idx = RenderInfo->State->LayersToRender[i];
+#if ARM
+ if (InstructionMode == instruction_mode_neon)
+ Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+#else
if (InstructionMode == instruction_mode_avx)
AVX2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
else if (InstructionMode == instruction_mode_sse)
SSE2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+#endif
else
Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
}
@@ -134,7 +117,7 @@ static void
QueueCurrentFrame(project_data *File, comp_buffer *CompBuffer, project_state *State)
{
IsRendering = true;
- // render_queue RenderInfo = {File, State, CompBuffer};
+ render_queue RenderInfo = {File, State, CompBuffer};
for (int16 i = 0; i < File->NumberOfLayers; i++)
{
@@ -182,74 +165,100 @@ QueueCurrentFrame(project_data *File, comp_buffer *CompBuffer, project_state *St
#endif
}
-
#if ARM
+
static void
-RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegion)
+NEON_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
{
- float32x4_t XAxisPX = vdupq_n_f32(XAxisP.x);
- float32x4_t XAxisPY = vdupq_n_f32(XAxisP.y);
- float32x4_t YAxisPX = vdupq_n_f32(YAxisP.x);
- float32x4_t YAxisPY = vdupq_n_f32(YAxisP.y);
- float32x4_t LayerWidth = vdupq_n_f32();
- float32x4_t LayerHeight = vdupq_n_f32();
- float32x4_t LayerOpacity = vdupq_n_f32();
- float32x4_t OriginX = vdupq_n_f32(Origin.x);
- float32x4_t OriginY = vdupq_n_f32(Origin.y);
+ rectangle LayerBounds = ClipRectangle( T.ClipRect,
+ RenderRegion );
+ // Remember: since bitmaps are packed in 4x4 pixel blocks, the region's Min corner must be aligned down to a multiple of 4.
+ LayerBounds.Min.x -= LayerBounds.Min.x % 4;
+ LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+ uint16 WidthP, HeightP;
+ Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+ uint8 *TexPTR = (uint8 *)T.SourceBuffer;
+ Assert(LayerBounds.Max.x <= Buffer->Width);
+ Assert(LayerBounds.Max.y <= Buffer->Height);
+
+ float32x4_t XAxisPX = vdupq_n_f32(T.XAxisPX);
+ float32x4_t XAxisPY = vdupq_n_f32(T.XAxisPY);
+ float32x4_t YAxisPX = vdupq_n_f32(T.YAxisPX);
+ float32x4_t YAxisPY = vdupq_n_f32(T.YAxisPY);
+
+ float32x4_t LayerWidth = vdupq_n_f32(T.LayerWidth);
+ int32x4_t FullLayerWidth4i = vdupq_n_s32(T.FullLayerWidth*4);
+ int32x4_t LayerWidthMinusOne = vdupq_n_s32(T.LayerWidth - 1);
+ int32x4_t LayerHeightMinusOne = vdupq_n_s32(T.LayerHeight - 1);
+ float32x4_t LayerHeight = vdupq_n_f32(T.LayerHeight);
+ float32x4_t LayerOpacity = vdupq_n_f32(T.LayerOpacity);
+ float32x4_t OriginX = vdupq_n_f32(T.OriginX);
+ float32x4_t OriginY = vdupq_n_f32(T.OriginY);
+
+ float32x4_t ClipPrevent = vdupq_n_f32(0.001f);
float32x4_t One = vdupq_n_f32(1);
+ float32x4_t Two = vdupq_n_f32(2);
float32x4_t Zero = vdupq_n_f32(0);
+
+ float32x4_t ZeroPoint25 = vdupq_n_f32(0.25);
+ float32x4_t ZeroPointFive = vdupq_n_f32(0.5);
+ int32x4_t Onei = vdupq_n_s32(1);
float32x4_t Four = vdupq_n_f32(4);
- int32x4_t FourInt = vdupq_n_s32(4);
- int32x4_t EightInt = vdupq_n_s32(8);
- int32x4_t SixteenInt = vdupq_n_s32(16);
- int32x4_t TwentyFourInt = vdupq_n_s32(24);
- float32x4_t Float255 = vdupq_n_f32(255.0f);
- int32x4_t Int255 = vdupq_n_s32(255);
+ int32x4_t FF = vdupq_n_s32(0xFF);
+ int32x4_t BottomTwoBits = vdupq_n_s32(0x03);
+ int32x4_t Fouri = vdupq_n_s32(4);
+ int32x4_t Sixteeni = vdupq_n_s32(16);
+ float32x4_t Real255 = vdupq_n_f32(255.0f);
float32x4_t Norm255 = vdupq_n_f32(1/255.0f);
- for(int16 Y = LayerBounds.Min.y;
- Y < LayerBounds.Max.y;
- Y++)
+ // NOTE(fox): Each loop iteration operates on 4 pixels: 4 horizontal by 1 vertical.
+
+ // TODO(fox): A possible optimization could be made by using the 32x4x4
+ // load intrinsic and a loop that repeats four times.
+
+ for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
{
- uint32 *Pixel = (uint32 *)Row + LayerBounds.Min.x;
+ real32 xvals[4] = { (real32)LayerBounds.Min.x, (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2, (real32)LayerBounds.Min.x+3 };
+ float32x4_t PixelX = vld1q_f32(xvals);
- real32 ScalarPixelX[4] = {(real32)LayerBounds.Min.x,
- (real32)LayerBounds.Min.x+1,
- (real32)LayerBounds.Min.x+2,
- (real32)LayerBounds.Min.x+3};
- float32x4_t PixelX = vld1q_f32(ScalarPixelX);
float32x4_t PixelY = vdupq_n_f32((real32)Y);
float32x4_t StartVectorY = vsubq_f32(PixelY, OriginY);
- for(int16 XI = LayerBounds.Min.x;
- XI < LayerBounds.Max.x;
- XI += 1)
+ for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
{
+
float32x4_t StartVectorX = vsubq_f32(PixelX, OriginX);
+
+ uint32 XLookup = (X >> 2)*16 + (X % 4);
+ uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
+ uint32 PixelToSeek = XLookup + YLookup;
+ uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel;
+
float32x4_t U = vaddq_f32(vmulq_f32(StartVectorX, XAxisPX), vmulq_f32(StartVectorY, XAxisPY));
float32x4_t V = vaddq_f32(vmulq_f32(StartVectorX, YAxisPX), vmulq_f32(StartVectorY, YAxisPY));
- uint32x4_t R = vandq_u32(vandq_u32(vcleq_f32(U, One), vcgezq_f32(U)),
- vandq_u32(vcleq_f32(V, One), vcgezq_f32(V)));
+ uint32x4_t LayerMask = vandq_u32(vandq_u32(vcgeq_f32(U, Zero), vcltq_f32(U, One)),
+ vandq_u32(vcgeq_f32(V, Zero), vcltq_f32(V, One)));
// TODO(fox): Make more efficient with some sort of truncation
uint32 comp[4];
- vst1q_u32(comp, R);
+ vst1q_u32(comp, LayerMask);
if (comp[0] || comp[1] || comp[2] || comp[3]) {
U = vmaxq_f32(vminq_f32(One, U), Zero);
V = vmaxq_f32(vminq_f32(One, V), Zero);
float32x4_t TexXFull = vmulq_f32(U, LayerWidth);
float32x4_t TexYFull = vmulq_f32(V, LayerHeight);
-
int32x4_t TexXInt = vcvtq_s32_f32(TexXFull);
+ int32x4_t TexXIntPlusOne = vaddq_f32(TexXInt, vandq_u32(vcltq_u32(TexXInt, LayerWidthMinusOne), Onei));
int32x4_t TexYInt = vcvtq_s32_f32(TexYFull);
+ int32x4_t TexYIntPlusOne = vaddq_f32(TexYInt, vandq_u32(vcltq_u32(TexYInt, LayerWidthMinusOne), Onei));
- // fractions
- float32x4_t TexX = vsubq_f32(TexXFull, vcvtq_f32_s32(TexXInt));
- float32x4_t TexY = vsubq_f32(TexYFull, vcvtq_f32_s32(TexYInt));
+ float32x4_t TexX = vsubq_f32(TexXFull, vcvtq_f32_u32(TexXInt));
+ float32x4_t TexY = vsubq_f32(TexYFull, vcvtq_f32_u32(TexYInt));
float32x4_t TexXInv = vsubq_f32(One, TexX);
float32x4_t TexYInv = vsubq_f32(One, TexY);
float32x4_t TexBothXInv = vmulq_f32(TexXInv, TexY);
@@ -257,112 +266,28 @@ RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegi
float32x4_t TexBoth = vmulq_f32(TexY, TexX);
float32x4_t TexBothInv = vmulq_f32(TexXInv, TexYInv);
- int32 TexXP[4];
- vst1q_s32(TexXP, TexXInt);
- int32 TexYP[4];
- vst1q_s32(TexYP, TexYInt);
-
- uint8 *TexPTR0 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[0] + TexXP[0]*sizeof(uint32));
- uint8 *TexPTR1 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[1] + TexXP[1]*sizeof(uint32));
- uint8 *TexPTR2 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[2] + TexXP[2]*sizeof(uint32));
- uint8 *TexPTR3 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[3] + TexXP[3]*sizeof(uint32));
-
- // TexRGBA = vld4_u8(TexPTR0);
- // TexRGBA = vld4q_lane_u8(TexPTR0, TexRGBA, 0);
- // TexRGBA = vld4q_lane_u8(TexPTR1, TexRGBA, 4);
- // TexRGBA = vld4q_lane_u8(TexPTR2, TexRGBA, 8);
- // TexRGBA = vld4q_lane_u8(TexPTR3, TexRGBA, 12);
- // TexRGBA = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA, 1);
- // TexRGBA = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA, 5);
- // TexRGBA = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA, 9);
- // TexRGBA = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA, 13);
- // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA, 2);
- // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA, 6);
- // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA, 10);
- // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA, 14);
- // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA, 3);
- // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA, 7);
- // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA, 11);
- // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA, 15);
- uint8x16x4_t TexRGBA_A = {};
- uint8x16x4_t TexRGBA_B = {};
- uint8x16x4_t TexRGBA_C = {};
- uint8x16x4_t TexRGBA_D = {};
- TexRGBA_A = vld4q_lane_u8(TexPTR0, TexRGBA_A, 0);
- TexRGBA_B = vld4q_lane_u8(TexPTR1, TexRGBA_B, 0);
- TexRGBA_C = vld4q_lane_u8(TexPTR2, TexRGBA_C, 0);
- TexRGBA_D = vld4q_lane_u8(TexPTR3, TexRGBA_D, 0);
- TexRGBA_A = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA_A, 4);
- TexRGBA_B = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA_B, 4);
- TexRGBA_C = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA_C, 4);
- TexRGBA_D = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA_D, 4);
- TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA_A, 8);
- TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA_B, 8);
- TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA_C, 8);
- TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA_D, 8);
- TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA_A, 12);
- TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA_B, 12);
- TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA_C, 12);
- TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA_D, 12);
-
- uint32x4_t test = (uint32x4_t)TexRGBA_A.val[0];
-
- float32x4_t asd = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]);
- float32x4_t pp = vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])),
- vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0])));
-
- uint32x4_t test2 = (uint32x4_t)TexRGBA_A.val[0];
-
-#if 0
- float32x4_t PixelBlendR = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])),
- vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0]))),
- vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[0])),
- vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[0]))));
-
- float32x4_t PixelBlendG = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1])),
- vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[1]))),
- vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[1])),
- vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[1]))));
-
- float32x4_t PixelBlendB = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2])),
- vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[2]))),
- vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[2])),
- vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[2]))));
-
- float32x4_t PixelBlendA = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3])),
- vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[3]))),
- vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[3])),
- vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[3]))));
-#endif
- float32x4_t PixelBlendR = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]);
- float32x4_t PixelBlendG = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1]);
- float32x4_t PixelBlendB = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2]);
- float32x4_t PixelBlendA = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3]);
-
- // __m128 PixelBlendR = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, TexARx4),
- // _mm_mul_ps(TexBothYInv, TexBRx4)),
- // _mm_add_ps(_mm_mul_ps(TexBothXInv, TexCRx4),
- // _mm_mul_ps(TexBoth, TexDRx4)));
-
- PixelBlendA = vsubq_f32(PixelBlendA, vmulq_f32(PixelBlendA, LayerOpacity));
- uint32x4_t Output = vorrq_u32(vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendR), 16),
- vshlq_n_u32(vcvtq_u32_f32(PixelBlendA), 24)),
- (vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendG), 8),
- vcvtq_u32_f32(PixelBlendB))));
-
- uint32 ma[4] = {0xFFFFFFFF, 0, 0, 0};
- uint32x4_t mask = vld1q_u32(ma);
- Output = vandq_u32(Output, mask);
- vst1q_u32(Pixel, Output);
-
+ int32x4_t XLookup = vaddq_u32(vmulq_u32(vshrq_n_u32(TexXInt, 2), Sixteeni),
+ vandq_u32(TexXInt, BottomTwoBits));
+ int32x4_t YLookup = vaddq_u32(vmulq_u32(vshrq_n_u32(TexYInt, 2), FullLayerWidth4i),
+ vmulq_u32(vandq_u32(TexYInt, BottomTwoBits), Fouri));
+ int32x4_t XLookupPlusOne = vaddq_u32(vmulq_u32(vshrq_n_u32(TexXIntPlusOne, 2), Sixteeni),
+ vandq_u32(TexXIntPlusOne, BottomTwoBits));
+ int32x4_t YLookupPlusOne = vaddq_u32(vmulq_u32(vshrq_n_u32(TexYIntPlusOne, 2), FullLayerWidth4i),
+ vmulq_u32(vandq_u32(TexYIntPlusOne, BottomTwoBits), Fouri));
+
+ int32x4_t PixelLookupTL = vaddq_u32(XLookup, YLookup);
+ int32x4_t PixelLookupTR = vaddq_u32(XLookupPlusOne, YLookup);
+ int32x4_t PixelLookupBL = vaddq_u32(XLookup, YLookupPlusOne);
+ int32x4_t PixelLookupBR = vaddq_u32(XLookupPlusOne, YLookupPlusOne);
+
+ // I thought NEON had gather/scatter, but it appears it doesn't...
}
- Pixel++;
- PixelX = vaddq_f32(PixelX, One);
+
+ PixelX = vaddq_f32(PixelX, Four);
}
- Row += BufferPitch;
}
-
}
+
#else
static void