From 9062e0aae9f2f576b7a237c28028aa6b09feee5e Mon Sep 17 00:00:00 2001 From: Fox Caminiti Date: Wed, 17 Aug 2022 23:41:08 -0400 Subject: undo and pen development --- prenderer.cpp | 263 +++++++++++++++++++++------------------------------------- 1 file changed, 94 insertions(+), 169 deletions(-) (limited to 'prenderer.cpp') diff --git a/prenderer.cpp b/prenderer.cpp index a93fa90..e755fe7 100644 --- a/prenderer.cpp +++ b/prenderer.cpp @@ -1,39 +1,17 @@ - -static void -PushRect(rectangle RenderRegion); - -static void -RenderLayerNeon(project_layer *Layer, comp_buffer *Buffer, rectangle RenderRegion); static void -AVX2_RenderLayer(transform_info TransformInfo, comp_buffer *Buffer, rectangle RenderRegion); -static void -SSE2_RenderLayer(transform_info TransformInfo, comp_buffer *Buffer, rectangle RenderRegion); -static void -Fallback_RenderLayer(transform_info TransformInfo, comp_buffer *Buffer, rectangle RenderRegion); - -static bool32 -CheckQueue(render_queue RenderInfo, uint16 Index); - -// for the anchor point moving UI -static void -CalculateAnchorOffset(project_layer *Layer, real32 Value, uint16 Dir) +Layer_CalcRotatedOffset(project_layer *Layer, v2 Increment, v2 Divisor, real32 *ValueX, real32 *ValueY) { - source *Source = Layer->Source; real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180)); real32 s = Layer->scale.CurrentValue.f; - if (Dir == 0) { - v2 XAxis = V2(cos(Rad), sin(Rad)) * (Value / s); - Layer->x.CurrentValue.f += Value; - Layer->ax.CurrentValue.f += XAxis.x/Source->Info.Width; - Layer->ay.CurrentValue.f -= XAxis.y/Source->Info.Height; - } else { - v2 YAxis = V2(sin(Rad), -cos(Rad)) * (Value / -s); - Layer->y.CurrentValue.f += Value; - Layer->ax.CurrentValue.f -= YAxis.x/Source->Info.Width; - Layer->ay.CurrentValue.f += YAxis.y/Source->Info.Height; - } + v2 XAxis = V2(cos(Rad), sin(Rad)) * (Increment.x / s); + v2 YAxis = V2(sin(Rad), -cos(Rad)) * (Increment.y / -s); + + *ValueX += XAxis.x/Divisor.x; + *ValueY -= XAxis.y/Divisor.y; + *ValueX -= YAxis.x/Divisor.x; + *ValueY += YAxis.y/Divisor.y; } static transform_info @@ -121,10 +99,15 @@ static void RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) { for (int16 i = 0; i < RenderInfo->State->NumberOfLayersToRender; i++) { int16 Idx = RenderInfo->State->LayersToRender[i]; +#if ARM + if (InstructionMode == instruction_mode_neon) + Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); +#else if (InstructionMode == instruction_mode_avx) AVX2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); else if (InstructionMode == instruction_mode_sse) SSE2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); +#endif else Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); } @@ -134,7 +117,7 @@ static void QueueCurrentFrame(project_data *File, comp_buffer *CompBuffer, project_state *State) { IsRendering = true; - // render_queue RenderInfo = {File, State, CompBuffer}; + render_queue RenderInfo = {File, State, CompBuffer}; for (int16 i = 0; i < File->NumberOfLayers; i++) { @@ -182,74 +165,100 @@ QueueCurrentFrame(project_data *File, comp_buffer *CompBuffer, project_state *St #endif } - #if ARM + static void -RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegion) +NEON_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) { - float32x4_t XAxisPX = vdupq_n_f32(XAxisP.x); - float32x4_t XAxisPY = vdupq_n_f32(XAxisP.y); - float32x4_t YAxisPX = vdupq_n_f32(YAxisP.x); - float32x4_t YAxisPY = vdupq_n_f32(YAxisP.y); - float32x4_t LayerWidth = vdupq_n_f32(); - float32x4_t LayerHeight = vdupq_n_f32(); - float32x4_t LayerOpacity = vdupq_n_f32(); - float32x4_t OriginX = vdupq_n_f32(Origin.x); - float32x4_t OriginY = vdupq_n_f32(Origin.y); + rectangle LayerBounds = ClipRectangle( T.ClipRect, + RenderRegion ); + // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned. + LayerBounds.Min.x -= LayerBounds.Min.x % 4; + LayerBounds.Min.y -= LayerBounds.Min.y % 4; + uint16 WidthP, HeightP; + Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP); + uint8 *TexPTR = (uint8 *)T.SourceBuffer; + Assert(LayerBounds.Max.x <= Buffer->Width); + Assert(LayerBounds.Max.y <= Buffer->Height); + + float32x4_t XAxisPX = vdupq_n_f32(T.XAxisPX); + float32x4_t XAxisPY = vdupq_n_f32(T.XAxisPY); + float32x4_t YAxisPX = vdupq_n_f32(T.YAxisPX); + float32x4_t YAxisPY = vdupq_n_f32(T.YAxisPY); + + float32x4_t LayerWidth = vdupq_n_f32(T.LayerWidth); + int32x4_t FullLayerWidth4i = vdupq_n_s32(T.FullLayerWidth*4); + int32x4_t LayerWidthMinusOne = vdupq_n_s32(T.LayerWidth - 1); + int32x4_t LayerHeightMinusOne = vdupq_n_s32(T.LayerHeight - 1); + float32x4_t LayerHeight = vdupq_n_f32(T.LayerHeight); + float32x4_t LayerOpacity = vdupq_n_f32(T.LayerOpacity); + float32x4_t OriginX = vdupq_n_f32(T.OriginX); + float32x4_t OriginY = vdupq_n_f32(T.OriginY); + + float32x4_t ClipPrevent = vdupq_n_f32(0.001f); float32x4_t One = vdupq_n_f32(1); + float32x4_t Two = vdupq_n_f32(2); float32x4_t Zero = vdupq_n_f32(0); + + float32x4_t ZeroPoint25 = vdupq_n_f32(0.25); + float32x4_t ZeroPointFive = vdupq_n_f32(0.5); + int32x4_t Onei = vdupq_n_s32(1); float32x4_t Four = vdupq_n_f32(4); - int32x4_t FourInt = vdupq_n_s32(4); - int32x4_t EightInt = vdupq_n_s32(8); - int32x4_t SixteenInt = vdupq_n_s32(16); - int32x4_t TwentyFourInt = vdupq_n_s32(24); - float32x4_t Float255 = vdupq_n_f32(255.0f); - int32x4_t Int255 = vdupq_n_s32(255); + int32x4_t FF = vdupq_n_s32(0xFF); + int32x4_t BottomTwoBits = vdupq_n_s32(0x03); + int32x4_t Fouri = vdupq_n_s32(4); + int32x4_t Sixteeni = vdupq_n_s32(16); + float32x4_t Real255 = vdupq_n_f32(255.0f); float32x4_t Norm255 = vdupq_n_f32(1/255.0f); - for(int16 Y = LayerBounds.Min.y; - Y < LayerBounds.Max.y; - Y++) + // NOTE(fox): Each loop operates on 4 pixels, 4 horizontal by 1 vertical. + + // TODO(fox): A possible optimization could be made by using the 32x4x4 + // load intrinsic and a loop that repeats four times. + + for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++) { - uint32 *Pixel = (uint32 *)Row + LayerBounds.Min.x; + real32 xvals[4] = { (real32)LayerBounds.Min.x, (real32)LayerBounds.Min.x+1, + (real32)LayerBounds.Min.x+2, (real32)LayerBounds.Min.x+3 }; + float32x4_t PixelX = vld1q_f32(xvals); - real32 ScalarPixelX[4] = {(real32)LayerBounds.Min.x, - (real32)LayerBounds.Min.x+1, - (real32)LayerBounds.Min.x+2, - (real32)LayerBounds.Min.x+3}; - float32x4_t PixelX = vld1q_f32(ScalarPixelX); float32x4_t PixelY = vdupq_n_f32((real32)Y); float32x4_t StartVectorY = vsubq_f32(PixelY, OriginY); - for(int16 XI = LayerBounds.Min.x; - XI < LayerBounds.Max.x; - XI += 1) + for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4) { + float32x4_t StartVectorX = vsubq_f32(PixelX, OriginX); + + uint32 XLookup = (X >> 2)*16 + (X % 4); + uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4; + uint32 PixelToSeek = XLookup + YLookup; + uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel; + float32x4_t U = vaddq_f32(vmulq_f32(StartVectorX, XAxisPX), vmulq_f32(StartVectorY, XAxisPY)); float32x4_t V = vaddq_f32(vmulq_f32(StartVectorX, YAxisPX), vmulq_f32(StartVectorY, YAxisPY)); - uint32x4_t R = vandq_u32(vandq_u32(vcleq_f32(U, One), vcgezq_f32(U)), - vandq_u32(vcleq_f32(V, One), vcgezq_f32(V))); + uint32x4_t LayerMask = vandq_u32(vandq_u32(vcgeq_f32(U, Zero), vcltq_f32(U, One)), + vandq_u32(vcgeq_f32(V, Zero), vcltq_f32(V, One))); // TODO(fox): Make more efficient with some sort of truncation uint32 comp[4]; - vst1q_u32(comp, R); + vst1q_u32(comp, LayerMask); if (comp[0] || comp[1] || comp[2] || comp[3]) { U = vmaxq_f32(vminq_f32(One, U), Zero); V = vmaxq_f32(vminq_f32(One, V), Zero); float32x4_t TexXFull = vmulq_f32(U, LayerWidth); float32x4_t TexYFull = vmulq_f32(V, LayerHeight); - int32x4_t TexXInt = vcvtq_s32_f32(TexXFull); + int32x4_t TexXIntPlusOne = vaddq_f32(TexXInt, vandq_u32(vcltq_u32(TexXInt, LayerWidthMinusOne), Onei)); int32x4_t TexYInt = vcvtq_s32_f32(TexYFull); + int32x4_t TexYIntPlusOne = vaddq_f32(TexYInt, vandq_u32(vcltq_u32(TexYInt, LayerWidthMinusOne), Onei)); - // fractions - float32x4_t TexX = vsubq_f32(TexXFull, vcvtq_f32_s32(TexXInt)); - float32x4_t TexY = vsubq_f32(TexYFull, vcvtq_f32_s32(TexYInt)); + float32x4_t TexX = vsubq_f32(TexXFull, vcvtq_f32_u32(TexXInt)); + float32x4_t TexY = vsubq_f32(TexYFull, vcvtq_f32_u32(TexYInt)); float32x4_t TexXInv = vsubq_f32(One, TexX); float32x4_t TexYInv = vsubq_f32(One, TexY); float32x4_t TexBothXInv = vmulq_f32(TexXInv, TexY); @@ -257,112 +266,28 @@ RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegi float32x4_t TexBoth = vmulq_f32(TexY, TexX); float32x4_t TexBothInv = vmulq_f32(TexXInv, TexYInv); - int32 TexXP[4]; - vst1q_s32(TexXP, TexXInt); - int32 TexYP[4]; - vst1q_s32(TexYP, TexYInt); - - uint8 *TexPTR0 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[0] + TexXP[0]*sizeof(uint32)); - uint8 *TexPTR1 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[1] + TexXP[1]*sizeof(uint32)); - uint8 *TexPTR2 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[2] + TexXP[2]*sizeof(uint32)); - uint8 *TexPTR3 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[3] + TexXP[3]*sizeof(uint32)); - - // TexRGBA = vld4_u8(TexPTR0); - // TexRGBA = vld4q_lane_u8(TexPTR0, TexRGBA, 0); - // TexRGBA = vld4q_lane_u8(TexPTR1, TexRGBA, 4); - // TexRGBA = vld4q_lane_u8(TexPTR2, TexRGBA, 8); - // TexRGBA = vld4q_lane_u8(TexPTR3, TexRGBA, 12); - // TexRGBA = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA, 1); - // TexRGBA = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA, 5); - // TexRGBA = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA, 9); - // TexRGBA = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA, 13); - // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA, 2); - // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA, 6); - // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA, 10); - // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA, 14); - // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA, 3); - // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA, 7); - // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA, 11); - // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA, 15); - uint8x16x4_t TexRGBA_A = {}; - uint8x16x4_t TexRGBA_B = {}; - uint8x16x4_t TexRGBA_C = {}; - uint8x16x4_t TexRGBA_D = {}; - TexRGBA_A = vld4q_lane_u8(TexPTR0, TexRGBA_A, 0); - TexRGBA_B = vld4q_lane_u8(TexPTR1, TexRGBA_B, 0); - TexRGBA_C = vld4q_lane_u8(TexPTR2, TexRGBA_C, 0); - TexRGBA_D = vld4q_lane_u8(TexPTR3, TexRGBA_D, 0); - TexRGBA_A = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA_A, 4); - TexRGBA_B = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA_B, 4); - TexRGBA_C = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA_C, 4); - TexRGBA_D = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA_D, 4); - TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA_A, 8); - TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA_B, 8); - TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA_C, 8); - TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA_D, 8); - TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA_A, 12); - TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA_B, 12); - TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA_C, 12); - TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA_D, 12); - - uint32x4_t test = (uint32x4_t)TexRGBA_A.val[0]; - - float32x4_t asd = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]); - float32x4_t pp = vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])), - vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0]))); - - uint32x4_t test2 = (uint32x4_t)TexRGBA_A.val[0]; - -#if 0 - float32x4_t PixelBlendR = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])), - vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0]))), - vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[0])), - vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[0])))); - - float32x4_t PixelBlendG = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1])), - vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[1]))), - vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[1])), - vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[1])))); - - float32x4_t PixelBlendB = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2])), - vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[2]))), - vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[2])), - vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[2])))); - - float32x4_t PixelBlendA = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3])), - vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[3]))), - vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[3])), - vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[3])))); -#endif - float32x4_t PixelBlendR = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]); - float32x4_t PixelBlendG = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1]); - float32x4_t PixelBlendB = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2]); - float32x4_t PixelBlendA = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3]); - - // __m128 PixelBlendR = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, TexARx4), - // _mm_mul_ps(TexBothYInv, TexBRx4)), - // _mm_add_ps(_mm_mul_ps(TexBothXInv, TexCRx4), - // _mm_mul_ps(TexBoth, TexDRx4))); - - PixelBlendA = vsubq_f32(PixelBlendA, vmulq_f32(PixelBlendA, LayerOpacity)); - uint32x4_t Output = vorrq_u32(vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendR), 16), - vshlq_n_u32(vcvtq_u32_f32(PixelBlendA), 24)), - (vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendG), 8), - vcvtq_u32_f32(PixelBlendB)))); - - uint32 ma[4] = {0xFFFFFFFF, 0, 0, 0}; - uint32x4_t mask = vld1q_u32(ma); - Output = vandq_u32(Output, mask); - vst1q_u32(Pixel, Output); - + int32x4_t XLookup = vaddq_u32(vmulq_u32(vshrq_n_u32(TexXInt, 2), Sixteeni), + vandq_u32(TexXInt, BottomTwoBits)); + int32x4_t YLookup = vaddq_u32(vmulq_u32(vshrq_n_u32(TexYInt, 2), FullLayerWidth4i), + vmulq_u32(vandq_u32(TexYInt, BottomTwoBits), Fouri)); + int32x4_t XLookupPlusOne = vaddq_u32(vmulq_u32(vshrq_n_u32(TexXIntPlusOne, 2), Sixteeni), + vandq_u32(TexXIntPlusOne, BottomTwoBits)); + int32x4_t YLookupPlusOne = vaddq_u32(vmulq_u32(vshrq_n_u32(TexYIntPlusOne, 2), FullLayerWidth4i), + vmulq_u32(vandq_u32(TexYIntPlusOne, BottomTwoBits), Fouri)); + + int32x4_t PixelLookupTL = vaddq_u32(XLookup, YLookup); + int32x4_t PixelLookupTR = vaddq_u32(XLookupPlusOne, YLookup); + int32x4_t PixelLookupBL = vaddq_u32(XLookup, YLookupPlusOne); + int32x4_t PixelLookupBR = vaddq_u32(XLookupPlusOne, YLookupPlusOne); + + // I thought NEON had gather/scatter, but it appears it doesn't... } - Pixel++; - PixelX = vaddq_f32(PixelX, One); + + PixelX = vaddq_f32(PixelX, Four); } - Row += BufferPitch; } - } + #else static void -- cgit v1.2.3