From fc8040d695644aaca4596adebeca4ea1369ef630 Mon Sep 17 00:00:00 2001 From: Fox Caminiti Date: Fri, 22 Jul 2022 20:45:08 -0400 Subject: first --- prenderer.cpp | 788 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 788 insertions(+) create mode 100644 prenderer.cpp (limited to 'prenderer.cpp') diff --git a/prenderer.cpp b/prenderer.cpp new file mode 100644 index 0000000..4080af1 --- /dev/null +++ b/prenderer.cpp @@ -0,0 +1,788 @@ + +internal void +PushRect(rectangle RenderRegion); + +internal void +RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegion); +internal void +AVX2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); +internal void +RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion); + +internal bool32 +CheckQueue(render_queue RenderInfo, uint16 Index); + +internal void +CalculateAnchorOffset(project_layer *Layer, real32 Value, uint16 Dir) +{ + v2 Result = {}; + transform_info TransformInfo; + image_source *Source = (image_source *)Layer->RenderInfo; + + real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180)); + real32 s = Layer->scale.CurrentValue.f; + + if (Dir == 0) { + v2 XAxis = V2(cos(Rad), sin(Rad)) * (Value / s); + Layer->x.CurrentValue.f += Value; + Layer->ax.CurrentValue.f += XAxis.x/Source->Raster.Width; + Layer->ay.CurrentValue.f -= XAxis.y/Source->Raster.Height; + } else { + v2 YAxis = V2(sin(Rad), -cos(Rad)) * (Value / -s); + Layer->y.CurrentValue.f += Value; + Layer->ax.CurrentValue.f -= YAxis.x/Source->Raster.Width; + Layer->ay.CurrentValue.f += YAxis.y/Source->Raster.Height; + } +} + +internal transform_info +CalculateTransforms(project_layer *Layer, pixel_buffer *Buffer) +{ + transform_info TransformInfo; + image_source *Source = (image_source *)Layer->RenderInfo; + + real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180)); + real32 s = Layer->scale.CurrentValue.f; + // v2 Scale = {Source->Raster.Width * s, Source->Raster.Height * s}; + + v2 XAxis = (Source->Raster.Width * s)*V2(cos(Rad), sin(Rad)); + v2 YAxis = (Source->Raster.Height * -s)*V2(sin(Rad), -cos(Rad)); + + real32 AnchorX = Layer->ax.CurrentValue.f; + real32 AnchorY = Layer->ay.CurrentValue.f; + + v2 Pos = {Layer->x.CurrentValue.f, Layer->y.CurrentValue.f}; + v2 Origin = Pos - (XAxis * AnchorX) - (YAxis * AnchorY); + + real32 XLengthSq = 1.0f / LengthSq(XAxis); + real32 YLengthSq = 1.0f / LengthSq(YAxis); + + int32 MaxX = 0; + int32 MaxY = 0; + int32 MinX = Buffer->Width; + int32 MinY = Buffer->Height; + + v2 Points[4] = {Origin, Origin + XAxis, Origin + YAxis, Origin + XAxis + YAxis}; + for (int i = 0; i < 4; i++) { + if (Points[i].x < MinX) { MinX = Points[i].x; } + if (Points[i].y < MinY) { MinY = Points[i].y; } + if (Points[i].x > MaxX) { MaxX = Points[i].x; } + if (Points[i].y > MaxY) { MaxY = Points[i].y; } + } + + TransformInfo.XAxisPX = XLengthSq*XAxis.x; + TransformInfo.XAxisPY = XLengthSq*XAxis.y; + TransformInfo.YAxisPX = YLengthSq*YAxis.x; + TransformInfo.YAxisPY = YLengthSq*YAxis.y; + TransformInfo.LayerWidth = (real32)Source->Raster.Width; + TransformInfo.LayerHeight = (real32)Source->Raster.Height; + TransformInfo.LayerOpacity = 1.0f - Layer->opacity.CurrentValue.f; + TransformInfo.OriginX = Origin.x; + TransformInfo.OriginY = Origin.y; + TransformInfo.BufferPitch = Buffer->Pitch; + TransformInfo.LayerPitch = Source->Raster.Pitch; + TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX, MaxY}; + + TransformInfo.SourceBuffer = Source->Raster.EffectBuffer; + + return TransformInfo; +} + +internal void +EndRenderState(project_state *State) +{ + IsRendering = false; + DEBUG_CycleCountEnd(3); + //TODO(fox): proper pixel accounting + // Debug.ExecutionAmount[4] += 1280*720; + + // printf("%lu %lu, avg %lu\n", Debug.EndCycleCount[3], Debug.ExecutionAmount[4], + // Debug.EndCycleCount[3] / Debug.ExecutionAmount[4]); + // Debug = {}; + + for (int16 i = 0; i < State->NumberOfLayersToRender; i++) + { + State->LayersToRender[i] = 0; + } + + State->NumberOfLayersToRender = 0; + + __atomic_store_n(&EntryCount, 0, __ATOMIC_SEQ_CST); + __atomic_store_n(&NextEntryToDo, 0, __ATOMIC_SEQ_CST); + __atomic_store_n(&CompletedJobs, 0, __ATOMIC_SEQ_CST); + +} + +internal void +QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *State) +{ + IsRendering = true; + render_queue RenderInfo = {File, State, CompBuffer}; + + uint16 TileWidth = (CompBuffer->Width - (CompBuffer->Width & 3)) / 4; + uint16 TileHeight = (CompBuffer->Height - (CompBuffer->Height & 3)) / 4; + + for (int16 i = 0; i < File->NumberOfLayers; i++) + { + if (File->Layer[i]->StartFrame <= File->CurrentFrame && + File->Layer[i]->EndFrame >= File->CurrentFrame) + { + File->Layer[i]->TransformInfo = CalculateTransforms(File->Layer[i], CompBuffer); + State->LayersToRender[State->NumberOfLayersToRender] = i; + State->NumberOfLayersToRender++; + } + } + +#if THREADED + DEBUG_CycleCountStart(3); + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + // if (x == y) { + rectangle RenderRegion = {TileWidth*x, TileHeight*y, TileWidth + TileWidth*x, TileHeight + TileHeight*y}; + PushRect(RenderRegion); + // } + } + } + + // while (CompletedJobs != 16) { + // // CheckQueue(RenderInfo, 8); + // } + // DEBUG_CycleCountEnd(3); + // // //TODO(fox): proper pixel accounting + // Debug.ExecutionAmount[4] += 1280*720; + + // for (int16 i = 0; i < State->NumberOfLayersToRender; i++) + // { + // State->LayersToRender[i] = 0; + // } + + // State->NumberOfLayersToRender = 0; + +#else + DEBUG_CycleCountStart(3); + + rectangle RenderRegion = {0, 0, (int32)CompBuffer->Width, (int32)CompBuffer->Height}; + for (int16 i = 0; i < RenderInfo.State->NumberOfLayersToRender; i++) { + int16 Idx = RenderInfo.State->LayersToRender[i]; +#if ARM + RenderLayerNeon(RenderInfo.File->Layer[Idx], RenderInfo.CompBuffer, RenderRegion); +#else + // RenderLayerSSE(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion); + if (Old) + RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion); + else + AVX2_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion); +#endif + } + + DEBUG_CycleCountEnd(3); + Debug.ExecutionAmount[4] += 1280*720; + + for (int16 i = 0; i < State->NumberOfLayersToRender; i++) + { + State->LayersToRender[i] = 0; + } + + State->NumberOfLayersToRender = 0; + +#endif + + // printf("Completed jobs: %i\n", CompletedJobs); + // printf("Next: %i\n", NextEntryToDo); + // Assert(CompletedJobs == 4*4); + // __atomic_store_n(&EntryCount, 0, __ATOMIC_SEQ_CST); + // __atomic_store_n(&NextEntryToDo, 0, __ATOMIC_SEQ_CST); + // __atomic_store_n(&CompletedJobs, 0, __ATOMIC_SEQ_CST); +} + + +#if ARM +internal void +RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegion) +{ + float32x4_t XAxisPX = vdupq_n_f32(XAxisP.x); + float32x4_t XAxisPY = vdupq_n_f32(XAxisP.y); + float32x4_t YAxisPX = vdupq_n_f32(YAxisP.x); + float32x4_t YAxisPY = vdupq_n_f32(YAxisP.y); + float32x4_t LayerWidth = vdupq_n_f32(); + float32x4_t LayerHeight = vdupq_n_f32(); + float32x4_t LayerOpacity = vdupq_n_f32(); + float32x4_t OriginX = vdupq_n_f32(Origin.x); + float32x4_t OriginY = vdupq_n_f32(Origin.y); + + + float32x4_t One = vdupq_n_f32(1); + float32x4_t Zero = vdupq_n_f32(0); + float32x4_t Four = vdupq_n_f32(4); + int32x4_t FourInt = vdupq_n_s32(4); + int32x4_t EightInt = vdupq_n_s32(8); + int32x4_t SixteenInt = vdupq_n_s32(16); + int32x4_t TwentyFourInt = vdupq_n_s32(24); + float32x4_t Float255 = vdupq_n_f32(255.0f); + int32x4_t Int255 = vdupq_n_s32(255); + float32x4_t Norm255 = vdupq_n_f32(1/255.0f); + + for(int16 Y = LayerBounds.Min.y; + Y < LayerBounds.Max.y; + Y++) + { + uint32 *Pixel = (uint32 *)Row + LayerBounds.Min.x; + + real32 ScalarPixelX[4] = {(real32)LayerBounds.Min.x, + (real32)LayerBounds.Min.x+1, + (real32)LayerBounds.Min.x+2, + (real32)LayerBounds.Min.x+3}; + float32x4_t PixelX = vld1q_f32(ScalarPixelX); + float32x4_t PixelY = vdupq_n_f32((real32)Y); + float32x4_t StartVectorY = vsubq_f32(PixelY, OriginY); + + for(int16 XI = LayerBounds.Min.x; + XI < LayerBounds.Max.x; + XI += 1) + { + float32x4_t StartVectorX = vsubq_f32(PixelX, OriginX); + float32x4_t U = vaddq_f32(vmulq_f32(StartVectorX, XAxisPX), vmulq_f32(StartVectorY, XAxisPY)); + float32x4_t V = vaddq_f32(vmulq_f32(StartVectorX, YAxisPX), vmulq_f32(StartVectorY, YAxisPY)); + + uint32x4_t R = vandq_u32(vandq_u32(vcleq_f32(U, One), vcgezq_f32(U)), + vandq_u32(vcleq_f32(V, One), vcgezq_f32(V))); + + // TODO(fox): Make more efficient with some sort of truncation + uint32 comp[4]; + vst1q_u32(comp, R); + if (comp[0] || comp[1] || comp[2] || comp[3]) { + U = vmaxq_f32(vminq_f32(One, U), Zero); + V = vmaxq_f32(vminq_f32(One, V), Zero); + + float32x4_t TexXFull = vmulq_f32(U, LayerWidth); + float32x4_t TexYFull = vmulq_f32(V, LayerHeight); + + int32x4_t TexXInt = vcvtq_s32_f32(TexXFull); + int32x4_t TexYInt = vcvtq_s32_f32(TexYFull); + + // fractions + float32x4_t TexX = vsubq_f32(TexXFull, vcvtq_f32_s32(TexXInt)); + float32x4_t TexY = vsubq_f32(TexYFull, vcvtq_f32_s32(TexYInt)); + float32x4_t TexXInv = vsubq_f32(One, TexX); + float32x4_t TexYInv = vsubq_f32(One, TexY); + float32x4_t TexBothXInv = vmulq_f32(TexXInv, TexY); + float32x4_t TexBothYInv = vmulq_f32(TexX, TexYInv); + float32x4_t TexBoth = vmulq_f32(TexY, TexX); + float32x4_t TexBothInv = vmulq_f32(TexXInv, TexYInv); + + int32 TexXP[4]; + vst1q_s32(TexXP, TexXInt); + int32 TexYP[4]; + vst1q_s32(TexYP, TexYInt); + + uint8 *TexPTR0 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[0] + TexXP[0]*sizeof(uint32)); + uint8 *TexPTR1 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[1] + TexXP[1]*sizeof(uint32)); + uint8 *TexPTR2 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[2] + TexXP[2]*sizeof(uint32)); + uint8 *TexPTR3 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[3] + TexXP[3]*sizeof(uint32)); + + // TexRGBA = vld4_u8(TexPTR0); + // TexRGBA = vld4q_lane_u8(TexPTR0, TexRGBA, 0); + // TexRGBA = vld4q_lane_u8(TexPTR1, TexRGBA, 4); + // TexRGBA = vld4q_lane_u8(TexPTR2, TexRGBA, 8); + // TexRGBA = vld4q_lane_u8(TexPTR3, TexRGBA, 12); + // TexRGBA = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA, 1); + // TexRGBA = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA, 5); + // TexRGBA = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA, 9); + // TexRGBA = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA, 13); + // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA, 2); + // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA, 6); + // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA, 10); + // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA, 14); + // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA, 3); + // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA, 7); + // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA, 11); + // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA, 15); + uint8x16x4_t TexRGBA_A = {}; + uint8x16x4_t TexRGBA_B = {}; + uint8x16x4_t TexRGBA_C = {}; + uint8x16x4_t TexRGBA_D = {}; + TexRGBA_A = vld4q_lane_u8(TexPTR0, TexRGBA_A, 0); + TexRGBA_B = vld4q_lane_u8(TexPTR1, TexRGBA_B, 0); + TexRGBA_C = vld4q_lane_u8(TexPTR2, TexRGBA_C, 0); + TexRGBA_D = vld4q_lane_u8(TexPTR3, TexRGBA_D, 0); + TexRGBA_A = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA_A, 4); + TexRGBA_B = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA_B, 4); + TexRGBA_C = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA_C, 4); + TexRGBA_D = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA_D, 4); + TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA_A, 8); + TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA_B, 8); + TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA_C, 8); + TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA_D, 8); + TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA_A, 12); + TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA_B, 12); + TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA_C, 12); + TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA_D, 12); + + uint32x4_t test = (uint32x4_t)TexRGBA_A.val[0]; + + float32x4_t asd = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]); + float32x4_t pp = vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])), + vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0]))); + + uint32x4_t test2 = (uint32x4_t)TexRGBA_A.val[0]; + +#if 0 + float32x4_t PixelBlendR = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])), + vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0]))), + vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[0])), + vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[0])))); + + float32x4_t PixelBlendG = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1])), + vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[1]))), + vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[1])), + vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[1])))); + + float32x4_t PixelBlendB = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2])), + vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[2]))), + vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[2])), + vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[2])))); + + float32x4_t PixelBlendA = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3])), + vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[3]))), + vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[3])), + vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[3])))); +#endif + float32x4_t PixelBlendR = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]); + float32x4_t PixelBlendG = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1]); + float32x4_t PixelBlendB = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2]); + float32x4_t PixelBlendA = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3]); + + // __m128 PixelBlendR = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, TexARx4), + // _mm_mul_ps(TexBothYInv, TexBRx4)), + // _mm_add_ps(_mm_mul_ps(TexBothXInv, TexCRx4), + // _mm_mul_ps(TexBoth, TexDRx4))); + + PixelBlendA = vsubq_f32(PixelBlendA, vmulq_f32(PixelBlendA, LayerOpacity)); + uint32x4_t Output = vorrq_u32(vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendR), 16), + vshlq_n_u32(vcvtq_u32_f32(PixelBlendA), 24)), + (vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendG), 8), + vcvtq_u32_f32(PixelBlendB)))); + + uint32 ma[4] = {0xFFFFFFFF, 0, 0, 0}; + uint32x4_t mask = vld1q_u32(ma); + Output = vandq_u32(Output, mask); + vst1q_u32(Pixel, Output); + + } + Pixel++; + PixelX = vaddq_f32(PixelX, One); + } + Row += BufferPitch; + } + +} +#else +internal void +AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) +{ + rectangle LayerBounds = ClipRectangle( T.ClipRect, + RenderRegion ); + // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned. + LayerBounds.Min.x -= LayerBounds.Min.x % 4; + LayerBounds.Min.y -= LayerBounds.Min.y % 4; + + uint8 *TexPTR = (uint8 *)T.SourceBuffer; + Assert(LayerBounds.Max.x <= Buffer->Width); + Assert(LayerBounds.Max.y <= Buffer->Height); + + __m256 XAxisPX = _mm256_set1_ps(T.XAxisPX); + __m256 XAxisPY = _mm256_set1_ps(T.XAxisPY); + __m256 YAxisPX = _mm256_set1_ps(T.YAxisPX); + __m256 YAxisPY = _mm256_set1_ps(T.YAxisPY); + + __m256 LayerWidth = _mm256_set1_ps(T.LayerWidth); + __m256i LayerWidth4i = _mm256_set1_epi32(T.LayerWidth*4); + __m256 LayerHeight = _mm256_set1_ps(T.LayerHeight); + __m256 LayerOpacity = _mm256_set1_ps(T.LayerOpacity); + __m256 OriginX = _mm256_set1_ps(T.OriginX); + __m256 OriginY = _mm256_set1_ps(T.OriginY); + + __m256 One = _mm256_set1_ps(1); + __m256 Zero = _mm256_set1_ps(0); + __m256i Zeroi = _mm256_set1_epi32(0); + __m256i Onei = _mm256_set1_epi32(1); + __m256 Four = _mm256_set1_ps(4); + __m256 Sixteen = _mm256_set1_ps(16); + __m256i FF = _mm256_set1_epi32(0xFF); + __m256i BottomTwoBits = _mm256_set1_epi32(0x03); + __m256i Fouri = _mm256_set1_epi32(4); + __m256i Sixteeni = _mm256_set1_epi32(16); + __m256 Reg255 = _mm256_set1_ps(255.0f); + __m256i Int255 = _mm256_set1_epi32(255); + __m256 Norm255 = _mm256_set1_ps(1/255.0f); + // __m256i White = _mm256_setr_epi32(0xFFFFFFFF, 0, 0, 0, 0xFFFFFFFF, 0, 0, 0); + // __m256i White2 = _mm256_set1_epi32(0xFFFFFFFF); + + // NOTE(fox): Each loop operates on 8 pixels, 4 horizontal by 2 vertical, + // as per the bitmap packing scheme in memory. + + for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y+=2) + { + __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x, + (real32)LayerBounds.Min.x+1, + (real32)LayerBounds.Min.x+2, + (real32)LayerBounds.Min.x+3, + (real32)LayerBounds.Min.x, + (real32)LayerBounds.Min.x+1, + (real32)LayerBounds.Min.x+2, + (real32)LayerBounds.Min.x+3); + + __m256 PixelY = _mm256_setr_ps((real32)Y, + (real32)Y, + (real32)Y, + (real32)Y, + (real32)Y+1, + (real32)Y+1, + (real32)Y+1, + (real32)Y+1); + + __m256 StartVectorY = _mm256_sub_ps(PixelY, OriginY); + + for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4) + { + IACA_START; + + __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX); + + uint32 XLookup = (X >> 2)*16 + (X % 4); + uint32 YLookup = (Y >> 2)*(Buffer->Width*4) + (Y % 4)*4; + uint32 PixelToSeek = XLookup + YLookup; + uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel; + + __m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY)); + __m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY)); + + __m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)), + _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2)))); + + if (_mm256_movemask_epi8(LayerMask)) + { + U = _mm256_max_ps(_mm256_min_ps(One, U), Zero); + V = _mm256_max_ps(_mm256_min_ps(One, V), Zero); + + __m256 TexXFull = _mm256_mul_ps(U, LayerWidth); + __m256 TexYFull = _mm256_mul_ps(V, LayerHeight); + __m256i TexXInt = _mm256_cvttps_epi32(TexXFull); + __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, Onei); + __m256i TexYInt = _mm256_cvttps_epi32(TexYFull); + __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, Onei); + + __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt)); + __m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt)); + __m256 TexXInv = _mm256_sub_ps(One, TexX); + __m256 TexYInv = _mm256_sub_ps(One, TexY); + __m256 TexBothXInv = _mm256_mul_ps(TexXInv, TexY); + __m256 TexBothYInv = _mm256_mul_ps(TexX, TexYInv); + __m256 TexBoth = _mm256_mul_ps(TexY, TexX); + __m256 TexBothInv = _mm256_mul_ps(TexXInv, TexYInv); + + __m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni), + _mm256_and_si256(TexXInt, BottomTwoBits)); + __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), LayerWidth4i), + _mm256_mullo_epi32(_mm256_and_si256(TexYInt, BottomTwoBits), Fouri)); + __m256i XLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXIntPlusOne, 2), Sixteeni), + _mm256_and_si256(TexXIntPlusOne, BottomTwoBits)); + __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), LayerWidth4i), + _mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri)); + + __m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup); + __m256i PixelLookupTR = _mm256_add_epi32(XLookupPlusOne, YLookup); + __m256i PixelLookupBL = _mm256_add_epi32(XLookup, YLookupPlusOne); + __m256i PixelLookupBR = _mm256_add_epi32(XLookupPlusOne, YLookupPlusOne); + + // The big feature of AVX2: gathering. + __m256i PixelsTL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTL, 4); + __m256i PixelsTR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTR, 4); + __m256i PixelsBL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBL, 4); + __m256i PixelsBR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBR, 4); + + __m256i R_TexTL = _mm256_and_si256( PixelsTL, FF); + __m256i G_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 8), FF); + __m256i B_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 16), FF); + __m256i A_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 24), FF); + + __m256i R_TexTR = _mm256_and_si256( PixelsTR, FF); + __m256i G_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 8), FF); + __m256i B_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 16), FF); + __m256i A_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 24), FF); + + __m256i R_TexBL = _mm256_and_si256( PixelsBL, FF); + __m256i G_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 8), FF); + __m256i B_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 16), FF); + __m256i A_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 24), FF); + + __m256i R_TexBR = _mm256_and_si256( PixelsBR, FF); + __m256i G_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 8), FF); + __m256i B_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 16), FF); + __m256i A_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 24), FF); + + __m256 R_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(R_TexTL)), + _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(R_TexTR))), + _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(R_TexBL)), + _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(R_TexBR)))); + __m256 G_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(G_TexTL)), + _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(G_TexTR))), + _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(G_TexBL)), + _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(G_TexBR)))); + __m256 B_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(B_TexTL)), + _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(B_TexTR))), + _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(B_TexBL)), + _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(B_TexBR)))); + __m256 A_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(A_TexTL)), + _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(A_TexTR))), + _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(A_TexBL)), + _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(A_TexBR)))); + + A_PixelBlend = _mm256_sub_ps(A_PixelBlend, _mm256_mul_ps(A_PixelBlend, LayerOpacity)); + + __m256i R_Out, G_Out, B_Out, A_Out; + // Only do alpha blending if a pixel's value doesn't equal 255 + if (_mm256_movemask_epi8(_mm256_sub_epi32(_mm256_cvtps_epi32(A_PixelBlend), Int255))) + { + __m256 LayerAlpha = _mm256_mul_ps(A_PixelBlend, Norm255); + __m256 LayerAlphaInv = _mm256_mul_ps(_mm256_sub_ps(Reg255, A_PixelBlend), Norm255); + + __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel); + __m256i R_Dest = _mm256_and_si256( DestPixel, FF); + __m256i G_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 8), FF); + __m256i B_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF); + __m256i A_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF); + + R_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm256_mul_ps(R_PixelBlend, LayerAlpha))); + G_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm256_mul_ps(G_PixelBlend, LayerAlpha))); + B_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm256_mul_ps(B_PixelBlend, LayerAlpha))); + A_Out = _mm256_cvtps_epi32(_mm256_min_ps(_mm256_add_ps(_mm256_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255)); + } + else + { + R_Out = _mm256_cvtps_epi32(R_PixelBlend); + G_Out = _mm256_cvtps_epi32(G_PixelBlend); + B_Out = _mm256_cvtps_epi32(B_PixelBlend); + A_Out = _mm256_cvtps_epi32(A_PixelBlend); + } + + __m256i OutputPixel = _mm256_or_si256( + _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)), + _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24))); + + __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask); + _mm256_storeu_si256((__m256i *)Pixel, PixelsMask); + } + PixelX = _mm256_add_ps(PixelX, Four); + } + } +} +#endif + +internal void +Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion) +{ + rectangle LayerBounds = ClipRectangle( T.ClipRect, RenderRegion); + + Assert(LayerBounds.Max.x <= Buffer->Width); + Assert(LayerBounds.Max.y <= Buffer->Height); + + uint8 *Row = ((uint8 *)Buffer->OriginalBuffer + Buffer->Pitch*(int16)(LayerBounds.Min.y) ); + + uint32 Channel = (T.LayerWidth * T.LayerHeight); + // uint32 pp1 = 2; + // uint32 pp2 = 3; + // bool32 real = true; + + for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y += 2) + { +#if PACKEDRGB +#else + uint8 *Pixel = (uint8 *)Row + (uint16)LayerBounds.Min.x; +#endif + real32 StartVectorY[2]; + StartVectorY[0] = (real32)Y - T.OriginY; + StartVectorY[1] = (real32)(Y+1) - T.OriginY; + + for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++) + { + for (int16 i = 0; i < 2; i++) + { + IACA_START; + + real32 StartVectorX = X - T.OriginX; + real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY[i] * T.XAxisPY); + real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY[i] * T.YAxisPY); + + if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) { + real32 TexXFull = U * T.LayerWidth; + uint32 TexXInt = (uint32)TexXFull; + real32 TexX = TexXFull - TexXInt; + + real32 TexYFull = V * T.LayerHeight; + uint32 TexYInt = (uint32)TexYFull; + real32 TexY = TexYFull - TexYInt; + + real32 TexXInv = 1 - TexX; + real32 TexYInv = 1 - TexY; + real32 TexBothXInv = TexXInv * TexY; + real32 TexBothYInv = TexX * TexYInv; + real32 TexBoth = TexY * TexX; + real32 TexBothInv = TexXInv * TexYInv; + +#if PACKEDRGB +#if 0 + uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt*Buffer->BytesPerPixel); + uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt*Buffer->BytesPerPixel); + + uint32 PixelA = *(uint32 *)TexPTR0; + uint32 PixelB = *((uint32 *)TexPTR0 + 1); + uint32 PixelC = *(uint32 *)TexPTR1; + uint32 PixelD = *((uint32 *)TexPTR1 + 1); +#else + uint16 LX, LY; + uint32 XLookup, YLookup, PixelToSeek; + + // TODO(fox): Be careful with the BytesPerPixel here! It's the buffer's, not the layer's! + LX = TexXInt; + LY = TexYInt; + XLookup = (LX >> 2)*16 + (LX % 4); + YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + PixelToSeek = XLookup + YLookup; + uint32 PixelA = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); + + LX = TexXInt+1; + LY = TexYInt; + XLookup = (LX >> 2)*16 + (LX % 4); + YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + PixelToSeek = XLookup + YLookup; + uint32 PixelB = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); + + LX = TexXInt; + LY = TexYInt+1; + XLookup = (LX >> 2)*16 + (LX % 4); + YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + PixelToSeek = XLookup + YLookup; + uint32 PixelC = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); + + LX = TexXInt+1; + LY = TexYInt+1; + XLookup = (LX >> 2)*16 + (LX % 4); + YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4; + PixelToSeek = XLookup + YLookup; + uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); +#endif + + uint8 TexRA = (PixelA & 0xFF); + uint8 TexRB = (PixelB & 0xFF); + uint8 TexRC = (PixelC & 0xFF); + uint8 TexRD = (PixelD & 0xFF); + + uint8 TexGA = ((PixelA >> 8) & 0xFF); + uint8 TexGB = ((PixelB >> 8) & 0xFF); + uint8 TexGC = ((PixelC >> 8) & 0xFF); + uint8 TexGD = ((PixelD >> 8) & 0xFF); + + uint8 TexBA = ((PixelA >> 16) & 0xFF); + uint8 TexBB = ((PixelB >> 16) & 0xFF); + uint8 TexBC = ((PixelC >> 16) & 0xFF); + uint8 TexBD = ((PixelD >> 16) & 0xFF); + + uint8 TexAA = ((PixelA >> 24) & 0xFF); + uint8 TexAB = ((PixelB >> 24) & 0xFF); + uint8 TexAC = ((PixelC >> 24) & 0xFF); + uint8 TexAD = ((PixelD >> 24) & 0xFF); +#else + uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt); + uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt); + + uint8 TexRA = *TexPTR0; + uint8 TexRB = *(TexPTR0 + 1); + uint8 TexRC = *TexPTR1; + uint8 TexRD = *(TexPTR1 + 1); + + uint8 TexGA = *(TexPTR0 + Channel); + uint8 TexGB = *(TexPTR0 + 1 + Channel); + uint8 TexGC = *(TexPTR1 + Channel); + uint8 TexGD = *(TexPTR1 + 1 + Channel); + + uint8 TexBA = *(TexPTR0 + Channel*2); + uint8 TexBB = *(TexPTR0 + 1 + Channel*2); + uint8 TexBC = *(TexPTR1 + Channel*2); + uint8 TexBD = *(TexPTR1 + 1 + Channel*2); + + uint8 TexAA = *(TexPTR0 + Channel*3); + uint8 TexAB = *(TexPTR0 + 1 + Channel*3); + uint8 TexAC = *(TexPTR1 + Channel*3); + uint8 TexAD = *(TexPTR1 + 1 + Channel*3); +#endif + + real32 PixelBlendR = (TexBothInv * TexRA) + (TexBothYInv * TexRB) + + (TexBothXInv * TexRC) + (TexBoth * TexRD); + real32 PixelBlendG = (TexBothInv * TexGA) + (TexBothYInv * TexGB) + + (TexBothXInv * TexGC) + (TexBoth * TexGD); + real32 PixelBlendB = (TexBothInv * TexBA) + (TexBothYInv * TexBB) + + (TexBothXInv * TexBC) + (TexBoth * TexBD); + real32 PixelBlendA = (TexBothInv * TexAA) + (TexBothYInv * TexAB) + + (TexBothXInv * TexAC) + (TexBoth * TexAD); + PixelBlendA = PixelBlendA - (PixelBlendA * T.LayerOpacity); + + uint8 R = (uint8)PixelBlendR; + uint8 G = (uint8)PixelBlendG; + uint8 B = (uint8)PixelBlendB; + uint8 A = (uint8)PixelBlendA; + +#if PACKEDRGB + XLookup = (X >> 2)*16 + (X % 4); + YLookup = ((Y+i) >> 2)*(Buffer->Width*4) + ((Y+i) % 4)*4; + // if (real) { + // real = false; + // printf("XLook: %i, YLook: %i\n", XLookup, YLookup); + // printf("X: %i, Y: %i\n", X, Y); + // } + PixelToSeek = XLookup + YLookup; + uint32 *Pixel = (uint32 *)((uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel); + + uint8 R1 = (*Pixel >> 0); + uint8 G1 = (*Pixel >> 8); + uint8 B1 = (*Pixel >> 16); + uint8 A1 = (*Pixel >> 24); +#else + uint8 *RD = Pixel; + uint8 *GD = Pixel + Buffer->Channel; + uint8 *BD = Pixel + Buffer->Channel*2; + uint8 *AD = Pixel + Buffer->Channel*3; + uint8 R1 = *RD; + uint8 G1 = *GD; + uint8 B1 = *BD; + uint8 A1 = *AD; +#endif + + if (A != 255) { + real32 LayerAlpha = (255 - A) / 255.0f; + R = (R1 * LayerAlpha) - (R * LayerAlpha) + R; + G = (G1 * LayerAlpha) - (G * LayerAlpha) + G; + B = (B1 * LayerAlpha) - (B * LayerAlpha) + B; + A = ClipAdd(A1, A); + } + +#if PACKEDRGB + *Pixel = ((A << 24) | + (B << 16) | + (G << 8) | + (R << 0)); + } + } + } +#else + *RD = R; + *GD = G; + *BD = B; + *AD = A; + } + Pixel++; + } + Row += Buffer->Pitch*2; +#endif + } +} -- cgit v1.2.3