summaryrefslogtreecommitdiff
path: root/prenderer.cpp
diff options
context:
space:
mode:
authorFox Caminiti <fox@foxcam.net>2022-07-22 20:45:08 -0400
committerFox Caminiti <fox@foxcam.net>2022-07-22 20:45:08 -0400
commitfc8040d695644aaca4596adebeca4ea1369ef630 (patch)
treeaea6979da97c43df8f03f3a2d7b421ee71bef370 /prenderer.cpp
first
Diffstat (limited to 'prenderer.cpp')
-rw-r--r--prenderer.cpp788
1 files changed, 788 insertions, 0 deletions
diff --git a/prenderer.cpp b/prenderer.cpp
new file mode 100644
index 0000000..4080af1
--- /dev/null
+++ b/prenderer.cpp
@@ -0,0 +1,788 @@
+
+internal void
+PushRect(rectangle RenderRegion);
+
+internal void
+RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegion);
+internal void
+AVX2_RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion);
+internal void
+RenderLayer(transform_info TransformInfo, pixel_buffer *Buffer, rectangle RenderRegion);
+
+internal bool32
+CheckQueue(render_queue RenderInfo, uint16 Index);
+
+internal void
+CalculateAnchorOffset(project_layer *Layer, real32 Value, uint16 Dir)
+{
+ v2 Result = {};
+ transform_info TransformInfo;
+ image_source *Source = (image_source *)Layer->RenderInfo;
+
+ real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180));
+ real32 s = Layer->scale.CurrentValue.f;
+
+ if (Dir == 0) {
+ v2 XAxis = V2(cos(Rad), sin(Rad)) * (Value / s);
+ Layer->x.CurrentValue.f += Value;
+ Layer->ax.CurrentValue.f += XAxis.x/Source->Raster.Width;
+ Layer->ay.CurrentValue.f -= XAxis.y/Source->Raster.Height;
+ } else {
+ v2 YAxis = V2(sin(Rad), -cos(Rad)) * (Value / -s);
+ Layer->y.CurrentValue.f += Value;
+ Layer->ax.CurrentValue.f -= YAxis.x/Source->Raster.Width;
+ Layer->ay.CurrentValue.f += YAxis.y/Source->Raster.Height;
+ }
+}
+
+internal transform_info
+CalculateTransforms(project_layer *Layer, pixel_buffer *Buffer)
+{
+ transform_info TransformInfo;
+ image_source *Source = (image_source *)Layer->RenderInfo;
+
+ real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180));
+ real32 s = Layer->scale.CurrentValue.f;
+ // v2 Scale = {Source->Raster.Width * s, Source->Raster.Height * s};
+
+ v2 XAxis = (Source->Raster.Width * s)*V2(cos(Rad), sin(Rad));
+ v2 YAxis = (Source->Raster.Height * -s)*V2(sin(Rad), -cos(Rad));
+
+ real32 AnchorX = Layer->ax.CurrentValue.f;
+ real32 AnchorY = Layer->ay.CurrentValue.f;
+
+ v2 Pos = {Layer->x.CurrentValue.f, Layer->y.CurrentValue.f};
+ v2 Origin = Pos - (XAxis * AnchorX) - (YAxis * AnchorY);
+
+ real32 XLengthSq = 1.0f / LengthSq(XAxis);
+ real32 YLengthSq = 1.0f / LengthSq(YAxis);
+
+ int32 MaxX = 0;
+ int32 MaxY = 0;
+ int32 MinX = Buffer->Width;
+ int32 MinY = Buffer->Height;
+
+ v2 Points[4] = {Origin, Origin + XAxis, Origin + YAxis, Origin + XAxis + YAxis};
+ for (int i = 0; i < 4; i++) {
+ if (Points[i].x < MinX) { MinX = Points[i].x; }
+ if (Points[i].y < MinY) { MinY = Points[i].y; }
+ if (Points[i].x > MaxX) { MaxX = Points[i].x; }
+ if (Points[i].y > MaxY) { MaxY = Points[i].y; }
+ }
+
+ TransformInfo.XAxisPX = XLengthSq*XAxis.x;
+ TransformInfo.XAxisPY = XLengthSq*XAxis.y;
+ TransformInfo.YAxisPX = YLengthSq*YAxis.x;
+ TransformInfo.YAxisPY = YLengthSq*YAxis.y;
+ TransformInfo.LayerWidth = (real32)Source->Raster.Width;
+ TransformInfo.LayerHeight = (real32)Source->Raster.Height;
+ TransformInfo.LayerOpacity = 1.0f - Layer->opacity.CurrentValue.f;
+ TransformInfo.OriginX = Origin.x;
+ TransformInfo.OriginY = Origin.y;
+ TransformInfo.BufferPitch = Buffer->Pitch;
+ TransformInfo.LayerPitch = Source->Raster.Pitch;
+ TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX, MaxY};
+
+ TransformInfo.SourceBuffer = Source->Raster.EffectBuffer;
+
+ return TransformInfo;
+}
+
+internal void
+EndRenderState(project_state *State)
+{
+ IsRendering = false;
+ DEBUG_CycleCountEnd(3);
+ //TODO(fox): proper pixel accounting
+ // Debug.ExecutionAmount[4] += 1280*720;
+
+ // printf("%lu %lu, avg %lu\n", Debug.EndCycleCount[3], Debug.ExecutionAmount[4],
+ // Debug.EndCycleCount[3] / Debug.ExecutionAmount[4]);
+ // Debug = {};
+
+ for (int16 i = 0; i < State->NumberOfLayersToRender; i++)
+ {
+ State->LayersToRender[i] = 0;
+ }
+
+ State->NumberOfLayersToRender = 0;
+
+ __atomic_store_n(&EntryCount, 0, __ATOMIC_SEQ_CST);
+ __atomic_store_n(&NextEntryToDo, 0, __ATOMIC_SEQ_CST);
+ __atomic_store_n(&CompletedJobs, 0, __ATOMIC_SEQ_CST);
+
+}
+
+internal void
+QueueCurrentFrame(project_data *File, pixel_buffer *CompBuffer, project_state *State)
+{
+ IsRendering = true;
+ render_queue RenderInfo = {File, State, CompBuffer};
+
+ uint16 TileWidth = (CompBuffer->Width - (CompBuffer->Width & 3)) / 4;
+ uint16 TileHeight = (CompBuffer->Height - (CompBuffer->Height & 3)) / 4;
+
+ for (int16 i = 0; i < File->NumberOfLayers; i++)
+ {
+ if (File->Layer[i]->StartFrame <= File->CurrentFrame &&
+ File->Layer[i]->EndFrame >= File->CurrentFrame)
+ {
+ File->Layer[i]->TransformInfo = CalculateTransforms(File->Layer[i], CompBuffer);
+ State->LayersToRender[State->NumberOfLayersToRender] = i;
+ State->NumberOfLayersToRender++;
+ }
+ }
+
+#if THREADED
+ DEBUG_CycleCountStart(3);
+ for (int y = 0; y < 4; y++) {
+ for (int x = 0; x < 4; x++) {
+ // if (x == y) {
+ rectangle RenderRegion = {TileWidth*x, TileHeight*y, TileWidth + TileWidth*x, TileHeight + TileHeight*y};
+ PushRect(RenderRegion);
+ // }
+ }
+ }
+
+ // while (CompletedJobs != 16) {
+ // // CheckQueue(RenderInfo, 8);
+ // }
+ // DEBUG_CycleCountEnd(3);
+ // // //TODO(fox): proper pixel accounting
+ // Debug.ExecutionAmount[4] += 1280*720;
+
+ // for (int16 i = 0; i < State->NumberOfLayersToRender; i++)
+ // {
+ // State->LayersToRender[i] = 0;
+ // }
+
+ // State->NumberOfLayersToRender = 0;
+
+#else
+ DEBUG_CycleCountStart(3);
+
+ rectangle RenderRegion = {0, 0, (int32)CompBuffer->Width, (int32)CompBuffer->Height};
+ for (int16 i = 0; i < RenderInfo.State->NumberOfLayersToRender; i++) {
+ int16 Idx = RenderInfo.State->LayersToRender[i];
+#if ARM
+ RenderLayerNeon(RenderInfo.File->Layer[Idx], RenderInfo.CompBuffer, RenderRegion);
+#else
+ // RenderLayerSSE(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion);
+ if (Old)
+ RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion);
+ else
+ AVX2_RenderLayer(RenderInfo.File->Layer[Idx]->TransformInfo, RenderInfo.CompBuffer, RenderRegion);
+#endif
+ }
+
+ DEBUG_CycleCountEnd(3);
+ Debug.ExecutionAmount[4] += 1280*720;
+
+ for (int16 i = 0; i < State->NumberOfLayersToRender; i++)
+ {
+ State->LayersToRender[i] = 0;
+ }
+
+ State->NumberOfLayersToRender = 0;
+
+#endif
+
+ // printf("Completed jobs: %i\n", CompletedJobs);
+ // printf("Next: %i\n", NextEntryToDo);
+ // Assert(CompletedJobs == 4*4);
+ // __atomic_store_n(&EntryCount, 0, __ATOMIC_SEQ_CST);
+ // __atomic_store_n(&NextEntryToDo, 0, __ATOMIC_SEQ_CST);
+ // __atomic_store_n(&CompletedJobs, 0, __ATOMIC_SEQ_CST);
+}
+
+
+#if ARM
+internal void
+RenderLayerNeon(project_layer *Layer, pixel_buffer *Buffer, rectangle RenderRegion)
+{
+ float32x4_t XAxisPX = vdupq_n_f32(XAxisP.x);
+ float32x4_t XAxisPY = vdupq_n_f32(XAxisP.y);
+ float32x4_t YAxisPX = vdupq_n_f32(YAxisP.x);
+ float32x4_t YAxisPY = vdupq_n_f32(YAxisP.y);
+ float32x4_t LayerWidth = vdupq_n_f32();
+ float32x4_t LayerHeight = vdupq_n_f32();
+ float32x4_t LayerOpacity = vdupq_n_f32();
+ float32x4_t OriginX = vdupq_n_f32(Origin.x);
+ float32x4_t OriginY = vdupq_n_f32(Origin.y);
+
+
+ float32x4_t One = vdupq_n_f32(1);
+ float32x4_t Zero = vdupq_n_f32(0);
+ float32x4_t Four = vdupq_n_f32(4);
+ int32x4_t FourInt = vdupq_n_s32(4);
+ int32x4_t EightInt = vdupq_n_s32(8);
+ int32x4_t SixteenInt = vdupq_n_s32(16);
+ int32x4_t TwentyFourInt = vdupq_n_s32(24);
+ float32x4_t Float255 = vdupq_n_f32(255.0f);
+ int32x4_t Int255 = vdupq_n_s32(255);
+ float32x4_t Norm255 = vdupq_n_f32(1/255.0f);
+
+ for(int16 Y = LayerBounds.Min.y;
+ Y < LayerBounds.Max.y;
+ Y++)
+ {
+ uint32 *Pixel = (uint32 *)Row + LayerBounds.Min.x;
+
+ real32 ScalarPixelX[4] = {(real32)LayerBounds.Min.x,
+ (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2,
+ (real32)LayerBounds.Min.x+3};
+ float32x4_t PixelX = vld1q_f32(ScalarPixelX);
+ float32x4_t PixelY = vdupq_n_f32((real32)Y);
+ float32x4_t StartVectorY = vsubq_f32(PixelY, OriginY);
+
+ for(int16 XI = LayerBounds.Min.x;
+ XI < LayerBounds.Max.x;
+ XI += 1)
+ {
+ float32x4_t StartVectorX = vsubq_f32(PixelX, OriginX);
+ float32x4_t U = vaddq_f32(vmulq_f32(StartVectorX, XAxisPX), vmulq_f32(StartVectorY, XAxisPY));
+ float32x4_t V = vaddq_f32(vmulq_f32(StartVectorX, YAxisPX), vmulq_f32(StartVectorY, YAxisPY));
+
+ uint32x4_t R = vandq_u32(vandq_u32(vcleq_f32(U, One), vcgezq_f32(U)),
+ vandq_u32(vcleq_f32(V, One), vcgezq_f32(V)));
+
+ // TODO(fox): Make more efficient with some sort of truncation
+ uint32 comp[4];
+ vst1q_u32(comp, R);
+ if (comp[0] || comp[1] || comp[2] || comp[3]) {
+ U = vmaxq_f32(vminq_f32(One, U), Zero);
+ V = vmaxq_f32(vminq_f32(One, V), Zero);
+
+ float32x4_t TexXFull = vmulq_f32(U, LayerWidth);
+ float32x4_t TexYFull = vmulq_f32(V, LayerHeight);
+
+ int32x4_t TexXInt = vcvtq_s32_f32(TexXFull);
+ int32x4_t TexYInt = vcvtq_s32_f32(TexYFull);
+
+ // fractions
+ float32x4_t TexX = vsubq_f32(TexXFull, vcvtq_f32_s32(TexXInt));
+ float32x4_t TexY = vsubq_f32(TexYFull, vcvtq_f32_s32(TexYInt));
+ float32x4_t TexXInv = vsubq_f32(One, TexX);
+ float32x4_t TexYInv = vsubq_f32(One, TexY);
+ float32x4_t TexBothXInv = vmulq_f32(TexXInv, TexY);
+ float32x4_t TexBothYInv = vmulq_f32(TexX, TexYInv);
+ float32x4_t TexBoth = vmulq_f32(TexY, TexX);
+ float32x4_t TexBothInv = vmulq_f32(TexXInv, TexYInv);
+
+ int32 TexXP[4];
+ vst1q_s32(TexXP, TexXInt);
+ int32 TexYP[4];
+ vst1q_s32(TexYP, TexYInt);
+
+ uint8 *TexPTR0 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[0] + TexXP[0]*sizeof(uint32));
+ uint8 *TexPTR1 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[1] + TexXP[1]*sizeof(uint32));
+ uint8 *TexPTR2 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[2] + TexXP[2]*sizeof(uint32));
+ uint8 *TexPTR3 = ((uint8 *)Source->Raster.MainBuffer + LayerPitch*TexYP[3] + TexXP[3]*sizeof(uint32));
+
+ // TexRGBA = vld4_u8(TexPTR0);
+ // TexRGBA = vld4q_lane_u8(TexPTR0, TexRGBA, 0);
+ // TexRGBA = vld4q_lane_u8(TexPTR1, TexRGBA, 4);
+ // TexRGBA = vld4q_lane_u8(TexPTR2, TexRGBA, 8);
+ // TexRGBA = vld4q_lane_u8(TexPTR3, TexRGBA, 12);
+ // TexRGBA = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA, 1);
+ // TexRGBA = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA, 5);
+ // TexRGBA = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA, 9);
+ // TexRGBA = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA, 13);
+ // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA, 2);
+ // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA, 6);
+ // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA, 10);
+ // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA, 14);
+ // TexRGBA = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA, 3);
+ // TexRGBA = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA, 7);
+ // TexRGBA = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA, 11);
+ // TexRGBA = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA, 15);
+ uint8x16x4_t TexRGBA_A = {};
+ uint8x16x4_t TexRGBA_B = {};
+ uint8x16x4_t TexRGBA_C = {};
+ uint8x16x4_t TexRGBA_D = {};
+ TexRGBA_A = vld4q_lane_u8(TexPTR0, TexRGBA_A, 0);
+ TexRGBA_B = vld4q_lane_u8(TexPTR1, TexRGBA_B, 0);
+ TexRGBA_C = vld4q_lane_u8(TexPTR2, TexRGBA_C, 0);
+ TexRGBA_D = vld4q_lane_u8(TexPTR3, TexRGBA_D, 0);
+ TexRGBA_A = vld4q_lane_u8(TexPTR0 + sizeof(uint32), TexRGBA_A, 4);
+ TexRGBA_B = vld4q_lane_u8(TexPTR1 + sizeof(uint32), TexRGBA_B, 4);
+ TexRGBA_C = vld4q_lane_u8(TexPTR2 + sizeof(uint32), TexRGBA_C, 4);
+ TexRGBA_D = vld4q_lane_u8(TexPTR3 + sizeof(uint32), TexRGBA_D, 4);
+ TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch, TexRGBA_A, 8);
+ TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch, TexRGBA_B, 8);
+ TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch, TexRGBA_C, 8);
+ TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch, TexRGBA_D, 8);
+ TexRGBA_A = vld4q_lane_u8(TexPTR0 + LayerPitch + sizeof(uint32), TexRGBA_A, 12);
+ TexRGBA_B = vld4q_lane_u8(TexPTR1 + LayerPitch + sizeof(uint32), TexRGBA_B, 12);
+ TexRGBA_C = vld4q_lane_u8(TexPTR2 + LayerPitch + sizeof(uint32), TexRGBA_C, 12);
+ TexRGBA_D = vld4q_lane_u8(TexPTR3 + LayerPitch + sizeof(uint32), TexRGBA_D, 12);
+
+ uint32x4_t test = (uint32x4_t)TexRGBA_A.val[0];
+
+ float32x4_t asd = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]);
+ float32x4_t pp = vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])),
+ vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0])));
+
+ uint32x4_t test2 = (uint32x4_t)TexRGBA_A.val[0];
+
+#if 0
+ float32x4_t PixelBlendR = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0])),
+ vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[0]))),
+ vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[0])),
+ vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[0]))));
+
+ float32x4_t PixelBlendG = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1])),
+ vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[1]))),
+ vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[1])),
+ vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[1]))));
+
+ float32x4_t PixelBlendB = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2])),
+ vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[2]))),
+ vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[2])),
+ vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[2]))));
+
+ float32x4_t PixelBlendA = vaddq_f32(vaddq_f32(vmulq_f32(TexBothInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3])),
+ vmulq_f32(TexBothYInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_B.val[3]))),
+ vaddq_f32(vmulq_f32(TexBothXInv, vcvtq_f32_u32((uint32x4_t)TexRGBA_C.val[3])),
+ vmulq_f32(TexBoth, vcvtq_f32_u32((uint32x4_t)TexRGBA_D.val[3]))));
+#endif
+ float32x4_t PixelBlendR = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[0]);
+ float32x4_t PixelBlendG = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[1]);
+ float32x4_t PixelBlendB = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[2]);
+ float32x4_t PixelBlendA = vcvtq_f32_u32((uint32x4_t)TexRGBA_A.val[3]);
+
+ // __m128 PixelBlendR = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, TexARx4),
+ // _mm_mul_ps(TexBothYInv, TexBRx4)),
+ // _mm_add_ps(_mm_mul_ps(TexBothXInv, TexCRx4),
+ // _mm_mul_ps(TexBoth, TexDRx4)));
+
+ PixelBlendA = vsubq_f32(PixelBlendA, vmulq_f32(PixelBlendA, LayerOpacity));
+ uint32x4_t Output = vorrq_u32(vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendR), 16),
+ vshlq_n_u32(vcvtq_u32_f32(PixelBlendA), 24)),
+ (vorrq_u32(vshlq_n_u32(vcvtq_u32_f32(PixelBlendG), 8),
+ vcvtq_u32_f32(PixelBlendB))));
+
+ uint32 ma[4] = {0xFFFFFFFF, 0, 0, 0};
+ uint32x4_t mask = vld1q_u32(ma);
+ Output = vandq_u32(Output, mask);
+ vst1q_u32(Pixel, Output);
+
+ }
+ Pixel++;
+ PixelX = vaddq_f32(PixelX, One);
+ }
+ Row += BufferPitch;
+ }
+
+}
+#else
+internal void
+AVX2_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
+{
+ rectangle LayerBounds = ClipRectangle( T.ClipRect,
+ RenderRegion );
+ // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
+ LayerBounds.Min.x -= LayerBounds.Min.x % 4;
+ LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+
+ uint8 *TexPTR = (uint8 *)T.SourceBuffer;
+ Assert(LayerBounds.Max.x <= Buffer->Width);
+ Assert(LayerBounds.Max.y <= Buffer->Height);
+
+ __m256 XAxisPX = _mm256_set1_ps(T.XAxisPX);
+ __m256 XAxisPY = _mm256_set1_ps(T.XAxisPY);
+ __m256 YAxisPX = _mm256_set1_ps(T.YAxisPX);
+ __m256 YAxisPY = _mm256_set1_ps(T.YAxisPY);
+
+ __m256 LayerWidth = _mm256_set1_ps(T.LayerWidth);
+ __m256i LayerWidth4i = _mm256_set1_epi32(T.LayerWidth*4);
+ __m256 LayerHeight = _mm256_set1_ps(T.LayerHeight);
+ __m256 LayerOpacity = _mm256_set1_ps(T.LayerOpacity);
+ __m256 OriginX = _mm256_set1_ps(T.OriginX);
+ __m256 OriginY = _mm256_set1_ps(T.OriginY);
+
+ __m256 One = _mm256_set1_ps(1);
+ __m256 Zero = _mm256_set1_ps(0);
+ __m256i Zeroi = _mm256_set1_epi32(0);
+ __m256i Onei = _mm256_set1_epi32(1);
+ __m256 Four = _mm256_set1_ps(4);
+ __m256 Sixteen = _mm256_set1_ps(16);
+ __m256i FF = _mm256_set1_epi32(0xFF);
+ __m256i BottomTwoBits = _mm256_set1_epi32(0x03);
+ __m256i Fouri = _mm256_set1_epi32(4);
+ __m256i Sixteeni = _mm256_set1_epi32(16);
+ __m256 Reg255 = _mm256_set1_ps(255.0f);
+ __m256i Int255 = _mm256_set1_epi32(255);
+ __m256 Norm255 = _mm256_set1_ps(1/255.0f);
+ // __m256i White = _mm256_setr_epi32(0xFFFFFFFF, 0, 0, 0, 0xFFFFFFFF, 0, 0, 0);
+ // __m256i White2 = _mm256_set1_epi32(0xFFFFFFFF);
+
+ // NOTE(fox): Each loop operates on 8 pixels, 4 horizontal by 2 vertical,
+ // as per the bitmap packing scheme in memory.
+
+ for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y+=2)
+ {
+ __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x,
+ (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2,
+ (real32)LayerBounds.Min.x+3,
+ (real32)LayerBounds.Min.x,
+ (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2,
+ (real32)LayerBounds.Min.x+3);
+
+ __m256 PixelY = _mm256_setr_ps((real32)Y,
+ (real32)Y,
+ (real32)Y,
+ (real32)Y,
+ (real32)Y+1,
+ (real32)Y+1,
+ (real32)Y+1,
+ (real32)Y+1);
+
+ __m256 StartVectorY = _mm256_sub_ps(PixelY, OriginY);
+
+ for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
+ {
+ IACA_START;
+
+ __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
+
+ uint32 XLookup = (X >> 2)*16 + (X % 4);
+ uint32 YLookup = (Y >> 2)*(Buffer->Width*4) + (Y % 4)*4;
+ uint32 PixelToSeek = XLookup + YLookup;
+ uint8 *Pixel = (uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel;
+
+ __m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY));
+ __m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY));
+
+ __m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)),
+ _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2))));
+
+ if (_mm256_movemask_epi8(LayerMask))
+ {
+ U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
+ V = _mm256_max_ps(_mm256_min_ps(One, V), Zero);
+
+ __m256 TexXFull = _mm256_mul_ps(U, LayerWidth);
+ __m256 TexYFull = _mm256_mul_ps(V, LayerHeight);
+ __m256i TexXInt = _mm256_cvttps_epi32(TexXFull);
+ __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, Onei);
+ __m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
+ __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, Onei);
+
+ __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
+ __m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt));
+ __m256 TexXInv = _mm256_sub_ps(One, TexX);
+ __m256 TexYInv = _mm256_sub_ps(One, TexY);
+ __m256 TexBothXInv = _mm256_mul_ps(TexXInv, TexY);
+ __m256 TexBothYInv = _mm256_mul_ps(TexX, TexYInv);
+ __m256 TexBoth = _mm256_mul_ps(TexY, TexX);
+ __m256 TexBothInv = _mm256_mul_ps(TexXInv, TexYInv);
+
+ __m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni),
+ _mm256_and_si256(TexXInt, BottomTwoBits));
+ __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), LayerWidth4i),
+ _mm256_mullo_epi32(_mm256_and_si256(TexYInt, BottomTwoBits), Fouri));
+ __m256i XLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
+ _mm256_and_si256(TexXIntPlusOne, BottomTwoBits));
+ __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), LayerWidth4i),
+ _mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri));
+
+ __m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup);
+ __m256i PixelLookupTR = _mm256_add_epi32(XLookupPlusOne, YLookup);
+ __m256i PixelLookupBL = _mm256_add_epi32(XLookup, YLookupPlusOne);
+ __m256i PixelLookupBR = _mm256_add_epi32(XLookupPlusOne, YLookupPlusOne);
+
+ // The big feature of AVX2: gathering.
+ __m256i PixelsTL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTL, 4);
+ __m256i PixelsTR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTR, 4);
+ __m256i PixelsBL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBL, 4);
+ __m256i PixelsBR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBR, 4);
+
+ __m256i R_TexTL = _mm256_and_si256( PixelsTL, FF);
+ __m256i G_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 8), FF);
+ __m256i B_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 16), FF);
+ __m256i A_TexTL = _mm256_and_si256(_mm256_srli_epi32(PixelsTL, 24), FF);
+
+ __m256i R_TexTR = _mm256_and_si256( PixelsTR, FF);
+ __m256i G_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 8), FF);
+ __m256i B_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 16), FF);
+ __m256i A_TexTR = _mm256_and_si256(_mm256_srli_epi32(PixelsTR, 24), FF);
+
+ __m256i R_TexBL = _mm256_and_si256( PixelsBL, FF);
+ __m256i G_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 8), FF);
+ __m256i B_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 16), FF);
+ __m256i A_TexBL = _mm256_and_si256(_mm256_srli_epi32(PixelsBL, 24), FF);
+
+ __m256i R_TexBR = _mm256_and_si256( PixelsBR, FF);
+ __m256i G_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 8), FF);
+ __m256i B_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 16), FF);
+ __m256i A_TexBR = _mm256_and_si256(_mm256_srli_epi32(PixelsBR, 24), FF);
+
+ __m256 R_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(R_TexTL)),
+ _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(R_TexTR))),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(R_TexBL)),
+ _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(R_TexBR))));
+ __m256 G_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(G_TexTL)),
+ _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(G_TexTR))),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(G_TexBL)),
+ _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(G_TexBR))));
+ __m256 B_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(B_TexTL)),
+ _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(B_TexTR))),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(B_TexBL)),
+ _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(B_TexBR))));
+ __m256 A_PixelBlend = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, _mm256_cvtepi32_ps(A_TexTL)),
+ _mm256_mul_ps(TexBothYInv, _mm256_cvtepi32_ps(A_TexTR))),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, _mm256_cvtepi32_ps(A_TexBL)),
+ _mm256_mul_ps(TexBoth, _mm256_cvtepi32_ps(A_TexBR))));
+
+ A_PixelBlend = _mm256_sub_ps(A_PixelBlend, _mm256_mul_ps(A_PixelBlend, LayerOpacity));
+
+ __m256i R_Out, G_Out, B_Out, A_Out;
+ // Only do alpha blending if a pixel's value doesn't equal 255
+ if (_mm256_movemask_epi8(_mm256_sub_epi32(_mm256_cvtps_epi32(A_PixelBlend), Int255)))
+ {
+ __m256 LayerAlpha = _mm256_mul_ps(A_PixelBlend, Norm255);
+ __m256 LayerAlphaInv = _mm256_mul_ps(_mm256_sub_ps(Reg255, A_PixelBlend), Norm255);
+
+ __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
+ __m256i R_Dest = _mm256_and_si256( DestPixel, FF);
+ __m256i G_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 8), FF);
+ __m256i B_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF);
+ __m256i A_Dest = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF);
+
+ R_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(R_Dest), LayerAlphaInv), _mm256_mul_ps(R_PixelBlend, LayerAlpha)));
+ G_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(G_Dest), LayerAlphaInv), _mm256_mul_ps(G_PixelBlend, LayerAlpha)));
+ B_Out = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(_mm256_cvtepi32_ps(B_Dest), LayerAlphaInv), _mm256_mul_ps(B_PixelBlend, LayerAlpha)));
+ A_Out = _mm256_cvtps_epi32(_mm256_min_ps(_mm256_add_ps(_mm256_cvtepi32_ps(A_Dest), A_PixelBlend), Reg255));
+ }
+ else
+ {
+ R_Out = _mm256_cvtps_epi32(R_PixelBlend);
+ G_Out = _mm256_cvtps_epi32(G_PixelBlend);
+ B_Out = _mm256_cvtps_epi32(B_PixelBlend);
+ A_Out = _mm256_cvtps_epi32(A_PixelBlend);
+ }
+
+ __m256i OutputPixel = _mm256_or_si256(
+ _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
+ _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
+
+ __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask);
+ _mm256_storeu_si256((__m256i *)Pixel, PixelsMask);
+ }
+ PixelX = _mm256_add_ps(PixelX, Four);
+ }
+ }
+}
+#endif
+
+internal void
+Fallback_RenderLayer(transform_info T, pixel_buffer *Buffer, rectangle RenderRegion)
+{
+ rectangle LayerBounds = ClipRectangle( T.ClipRect, RenderRegion);
+
+ Assert(LayerBounds.Max.x <= Buffer->Width);
+ Assert(LayerBounds.Max.y <= Buffer->Height);
+
+ uint8 *Row = ((uint8 *)Buffer->OriginalBuffer + Buffer->Pitch*(int16)(LayerBounds.Min.y) );
+
+ uint32 Channel = (T.LayerWidth * T.LayerHeight);
+ // uint32 pp1 = 2;
+ // uint32 pp2 = 3;
+ // bool32 real = true;
+
+ for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y += 2)
+ {
+#if PACKEDRGB
+#else
+ uint8 *Pixel = (uint8 *)Row + (uint16)LayerBounds.Min.x;
+#endif
+ real32 StartVectorY[2];
+ StartVectorY[0] = (real32)Y - T.OriginY;
+ StartVectorY[1] = (real32)(Y+1) - T.OriginY;
+
+ for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++)
+ {
+ for (int16 i = 0; i < 2; i++)
+ {
+ IACA_START;
+
+ real32 StartVectorX = X - T.OriginX;
+ real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY[i] * T.XAxisPY);
+ real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY[i] * T.YAxisPY);
+
+ if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) {
+ real32 TexXFull = U * T.LayerWidth;
+ uint32 TexXInt = (uint32)TexXFull;
+ real32 TexX = TexXFull - TexXInt;
+
+ real32 TexYFull = V * T.LayerHeight;
+ uint32 TexYInt = (uint32)TexYFull;
+ real32 TexY = TexYFull - TexYInt;
+
+ real32 TexXInv = 1 - TexX;
+ real32 TexYInv = 1 - TexY;
+ real32 TexBothXInv = TexXInv * TexY;
+ real32 TexBothYInv = TexX * TexYInv;
+ real32 TexBoth = TexY * TexX;
+ real32 TexBothInv = TexXInv * TexYInv;
+
+#if PACKEDRGB
+#if 0
+ uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt*Buffer->BytesPerPixel);
+ uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt*Buffer->BytesPerPixel);
+
+ uint32 PixelA = *(uint32 *)TexPTR0;
+ uint32 PixelB = *((uint32 *)TexPTR0 + 1);
+ uint32 PixelC = *(uint32 *)TexPTR1;
+ uint32 PixelD = *((uint32 *)TexPTR1 + 1);
+#else
+ uint16 LX, LY;
+ uint32 XLookup, YLookup, PixelToSeek;
+
+ // TODO(fox): Be careful with the BytesPerPixel here! It's the buffer's, not the layer's!
+ LX = TexXInt;
+ LY = TexYInt;
+ XLookup = (LX >> 2)*16 + (LX % 4);
+ YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ PixelToSeek = XLookup + YLookup;
+ uint32 PixelA = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
+
+ LX = TexXInt+1;
+ LY = TexYInt;
+ XLookup = (LX >> 2)*16 + (LX % 4);
+ YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ PixelToSeek = XLookup + YLookup;
+ uint32 PixelB = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
+
+ LX = TexXInt;
+ LY = TexYInt+1;
+ XLookup = (LX >> 2)*16 + (LX % 4);
+ YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ PixelToSeek = XLookup + YLookup;
+ uint32 PixelC = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
+
+ LX = TexXInt+1;
+ LY = TexYInt+1;
+ XLookup = (LX >> 2)*16 + (LX % 4);
+ YLookup = (LY >> 2)*(T.LayerWidth*4) + (LY % 4)*4;
+ PixelToSeek = XLookup + YLookup;
+ uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
+#endif
+
+ uint8 TexRA = (PixelA & 0xFF);
+ uint8 TexRB = (PixelB & 0xFF);
+ uint8 TexRC = (PixelC & 0xFF);
+ uint8 TexRD = (PixelD & 0xFF);
+
+ uint8 TexGA = ((PixelA >> 8) & 0xFF);
+ uint8 TexGB = ((PixelB >> 8) & 0xFF);
+ uint8 TexGC = ((PixelC >> 8) & 0xFF);
+ uint8 TexGD = ((PixelD >> 8) & 0xFF);
+
+ uint8 TexBA = ((PixelA >> 16) & 0xFF);
+ uint8 TexBB = ((PixelB >> 16) & 0xFF);
+ uint8 TexBC = ((PixelC >> 16) & 0xFF);
+ uint8 TexBD = ((PixelD >> 16) & 0xFF);
+
+ uint8 TexAA = ((PixelA >> 24) & 0xFF);
+ uint8 TexAB = ((PixelB >> 24) & 0xFF);
+ uint8 TexAC = ((PixelC >> 24) & 0xFF);
+ uint8 TexAD = ((PixelD >> 24) & 0xFF);
+#else
+ uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt);
+ uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt);
+
+ uint8 TexRA = *TexPTR0;
+ uint8 TexRB = *(TexPTR0 + 1);
+ uint8 TexRC = *TexPTR1;
+ uint8 TexRD = *(TexPTR1 + 1);
+
+ uint8 TexGA = *(TexPTR0 + Channel);
+ uint8 TexGB = *(TexPTR0 + 1 + Channel);
+ uint8 TexGC = *(TexPTR1 + Channel);
+ uint8 TexGD = *(TexPTR1 + 1 + Channel);
+
+ uint8 TexBA = *(TexPTR0 + Channel*2);
+ uint8 TexBB = *(TexPTR0 + 1 + Channel*2);
+ uint8 TexBC = *(TexPTR1 + Channel*2);
+ uint8 TexBD = *(TexPTR1 + 1 + Channel*2);
+
+ uint8 TexAA = *(TexPTR0 + Channel*3);
+ uint8 TexAB = *(TexPTR0 + 1 + Channel*3);
+ uint8 TexAC = *(TexPTR1 + Channel*3);
+ uint8 TexAD = *(TexPTR1 + 1 + Channel*3);
+#endif
+
+ real32 PixelBlendR = (TexBothInv * TexRA) + (TexBothYInv * TexRB)
+ + (TexBothXInv * TexRC) + (TexBoth * TexRD);
+ real32 PixelBlendG = (TexBothInv * TexGA) + (TexBothYInv * TexGB)
+ + (TexBothXInv * TexGC) + (TexBoth * TexGD);
+ real32 PixelBlendB = (TexBothInv * TexBA) + (TexBothYInv * TexBB)
+ + (TexBothXInv * TexBC) + (TexBoth * TexBD);
+ real32 PixelBlendA = (TexBothInv * TexAA) + (TexBothYInv * TexAB)
+ + (TexBothXInv * TexAC) + (TexBoth * TexAD);
+ PixelBlendA = PixelBlendA - (PixelBlendA * T.LayerOpacity);
+
+ uint8 R = (uint8)PixelBlendR;
+ uint8 G = (uint8)PixelBlendG;
+ uint8 B = (uint8)PixelBlendB;
+ uint8 A = (uint8)PixelBlendA;
+
+#if PACKEDRGB
+ XLookup = (X >> 2)*16 + (X % 4);
+ YLookup = ((Y+i) >> 2)*(Buffer->Width*4) + ((Y+i) % 4)*4;
+ // if (real) {
+ // real = false;
+ // printf("XLook: %i, YLook: %i\n", XLookup, YLookup);
+ // printf("X: %i, Y: %i\n", X, Y);
+ // }
+ PixelToSeek = XLookup + YLookup;
+ uint32 *Pixel = (uint32 *)((uint8 *)Buffer->OriginalBuffer + PixelToSeek*Buffer->BytesPerPixel);
+
+ uint8 R1 = (*Pixel >> 0);
+ uint8 G1 = (*Pixel >> 8);
+ uint8 B1 = (*Pixel >> 16);
+ uint8 A1 = (*Pixel >> 24);
+#else
+ uint8 *RD = Pixel;
+ uint8 *GD = Pixel + Buffer->Channel;
+ uint8 *BD = Pixel + Buffer->Channel*2;
+ uint8 *AD = Pixel + Buffer->Channel*3;
+ uint8 R1 = *RD;
+ uint8 G1 = *GD;
+ uint8 B1 = *BD;
+ uint8 A1 = *AD;
+#endif
+
+ if (A != 255) {
+ real32 LayerAlpha = (255 - A) / 255.0f;
+ R = (R1 * LayerAlpha) - (R * LayerAlpha) + R;
+ G = (G1 * LayerAlpha) - (G * LayerAlpha) + G;
+ B = (B1 * LayerAlpha) - (B * LayerAlpha) + B;
+ A = ClipAdd(A1, A);
+ }
+
+#if PACKEDRGB
+ *Pixel = ((A << 24) |
+ (B << 16) |
+ (G << 8) |
+ (R << 0));
+ }
+ }
+ }
+#else
+ *RD = R;
+ *GD = G;
+ *BD = B;
+ *AD = A;
+ }
+ Pixel++;
+ }
+ Row += Buffer->Pitch*2;
+#endif
+ }
+}