path: root/src/prenderer.cpp
Diffstat (limited to 'src/prenderer.cpp')
-rw-r--r--  src/prenderer.cpp  1914
1 file changed, 1914 insertions(+), 0 deletions(-)
diff --git a/src/prenderer.cpp b/src/prenderer.cpp
new file mode 100644
index 0000000..54b19cf
--- /dev/null
+++ b/src/prenderer.cpp
@@ -0,0 +1,1914 @@
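+// prenderer.cpp: the CPU render path. Contains the comp<->layer coordinate
+// transforms, the interactive viewport-transform helpers, the blend-mode
+// macro, and the scalar fallback rasterizers; an earlier packed-RGB/SIMD
+// version of the renderer is preserved under #if 0 further down.
+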
+static v2
+T_CompUVToLayerUV(layer_transforms T, uint32 FileWidth, uint32 FileHeight, uint32 SourceWidth, uint32 SourceHeight, v2 CompUV)
+{
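+ // Inverse of the layer transform: take the comp-space point relative to the
+ // layer's origin and project it onto the rotated/scaled axes (the 1/LengthSq
+ // factors normalize the dot products), giving UVs in 0..1 across the layer.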
+ real32 X = CompUV.x*FileWidth;
+ real32 Y = CompUV.y*FileHeight;
+
+ real32 Rad = (T.rotation* (PI / 180));
+ v2 XAxis = (SourceWidth * T.scale)*V2(cos(Rad), sin(Rad));
+ v2 YAxis = (SourceHeight * -T.scale)*V2(sin(Rad), -cos(Rad));
+
+ v2 Pos = {T.x, T.y};
+ v2 Origin = Pos - (XAxis * T.ax) - (YAxis * T.ay);
+
+ v2 XAxisPerp = (1.0f / LengthSq(XAxis))*XAxis;
+ v2 YAxisPerp = (1.0f / LengthSq(YAxis))*YAxis;
+
+ real32 StartVectorX = X - Origin.x;
+ real32 StartVectorY = Y - Origin.y;
+ real32 LayerU = (StartVectorX * XAxisPerp.x) + (StartVectorY * XAxisPerp.y);
+ real32 LayerV = (StartVectorX * YAxisPerp.x) + (StartVectorY * YAxisPerp.y);
+ return V2(LayerU, LayerV);
+}
+
+static v2
+T_CompPosToLayerPos(layer_transforms T, uint32 FileWidth, uint32 FileHeight, uint32 SourceWidth, uint32 SourceHeight, v2 CompUV)
+{
+ v2 UV = T_CompUVToLayerUV(T, FileWidth, FileHeight, SourceWidth, SourceHeight, CompUV/V2(FileWidth, FileHeight));
+ return UV*V2(SourceWidth, SourceHeight);
+}
+
+static v2
+Transform_ScreenSpaceToLocal(layer_transforms T, uint32 FileWidth, uint32 FileHeight, uint32 SourceWidth, uint32 SourceHeight,
+ ImVec2 CompPos, ImVec2 CompZoom, ImVec2 ViewportMin, ImVec2 Point)
+{
+ v2 CompUV = ImGui_ScreenPointToCompUV(ViewportMin, CompPos, CompZoom, Point);
+ v2 LayerUV = T_CompUVToLayerUV(T, FileWidth, FileHeight, SourceWidth, SourceHeight, CompUV);
+ return V2(LayerUV.x * SourceWidth, LayerUV.y * SourceHeight);
+}
+
+static void
+Layer_GetDimensions(memory *Memory, block_layer *Layer, int *Width, int *Height)
+{
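+ // Block_Source_Index points into F_Sources for footage layers and into
+ // F_Precomps for precomp layers, so the dimensions come from different blocks.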
+ if (!Layer->IsPrecomp) {
+ block_source *Source = (block_source *)Memory_Block_AddressAtIndex(Memory, F_Sources, Layer->Block_Source_Index);
+ *Width = Source->Width;
+ *Height = Source->Height;
+ } else {
+ block_composition *Comp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, Layer->Block_Source_Index);
+ *Width = Comp->Width;
+ *Height = Comp->Height;
+ }
+}
+
+// Transform given data based on state's Interact data.
+static void
+Transform_ApplyInteractive(interact_transform Interact, real32 *OutputX, real32 *OutputY, real32 *OutputRotation, real32 *OutputScale)
+{
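+ // Rotate and scale the layer's position about the center of the selection's
+ // bounding box, add the drag offset, and accumulate the rotation/scale deltas
+ // onto the layer's own values.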
+ v2 BoxLength = Interact.Max - Interact.Min;
+ v2 Center = Interact.Max - (BoxLength/2);
+
+ real32 Point0X = Center.x - *OutputX;
+ real32 Point0Y = Center.y - *OutputY;
+
+ real32 Rad = Interact.Radians;
+ real32 Rotation = Interact.Radians / (PI / 180);
+
+ v2 XAxis = (Point0X * Interact.Scale)*V2(cos(Rad), sin(Rad));
+ v2 YAxis = (Point0Y * -Interact.Scale)*V2(sin(Rad), -cos(Rad));
+
+ real32 X0 = -XAxis.x - YAxis.x + Center.x;
+ real32 Y0 = -XAxis.y - YAxis.y + Center.y;
+
+ *OutputX = X0 + Interact.Position.x;
+ *OutputY = Y0 + Interact.Position.y;
+ *OutputRotation += Rotation;
+ *OutputScale += Interact.Scale - 1.0f;
+}
+
+static void
+Transform_IterateOuterBounds(block_layer *Layer, uint32 Width, uint32 Height, real32 *MinX, real32 *MinY, real32 *MaxX, real32 *MaxY)
+{
+ real32 Rad = (Layer->rotation.CurrentValue * (PI / 180));
+ real32 s = Layer->scale.CurrentValue;
+
+ v2 XAxis = (Width * s)*V2(cos(Rad), sin(Rad));
+ v2 YAxis = (Height * -s)*V2(sin(Rad), -cos(Rad));
+
+ real32 AnchorX = Layer->ax.CurrentValue;
+ real32 AnchorY = Layer->ay.CurrentValue;
+
+ v2 Pos = {Layer->x.CurrentValue, Layer->y.CurrentValue};
+ v2 Origin = Pos - (XAxis * AnchorX) - (YAxis * AnchorY);
+
+ real32 XLengthSq = 1.0f / LengthSq(XAxis);
+ real32 YLengthSq = 1.0f / LengthSq(YAxis);
+
+ v2 Points[4] = {Origin, Origin + XAxis, Origin + YAxis, Origin + XAxis + YAxis};
+ for (int i = 0; i < 4; i++) {
+ if (Points[i].x < *MinX) { *MinX = Points[i].x; }
+ if (Points[i].y < *MinY) { *MinY = Points[i].y; }
+ if (Points[i].x > *MaxX) { *MaxX = Points[i].x; }
+ if (Points[i].y > *MaxY) { *MaxY = Points[i].y; }
+ }
+}
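+
+// Walks the layer tree of the given comp and accumulates the bounding box
+// (MinX/MinY/MaxX/MaxY), in the principal comp's pixel space, of every
+// selected layer, transforming each layer's corners through its chain of
+// parent precomp layers.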
+static void
+Transform_Recurse(project_state *State, memory *Memory, block_composition *MainComp, uint32 CompIndex, block_layer *ParentLayer[4], uint32 Recursions,
+ sorted_comp_array *SortedCompArray, sorted_layer_array *SortedLayerArray,
+ real32 *MinX, real32 *MinY, real32 *MaxX, real32 *MaxY)
+{
+ sorted_comp_array *SortedCompStart = &SortedCompArray[CompIndex];
+ sorted_layer_array *SortedLayerStart = Sorted_GetLayerStart(SortedLayerArray, SortedCompArray, CompIndex);
+ for (int i = 0; i < SortedCompStart->LayerCount; i++)
+ {
+ sorted_layer_array SortEntry = SortedLayerStart[i];
+ uint32 Index_Physical = SortEntry.Block_Layer_Index;
+ block_layer *Layer = (block_layer *)Memory_Block_AddressAtIndex(Memory, F_Layers, Index_Physical);
+ if (Layer->IsPrecomp) {
+ ParentLayer[Recursions] = Layer;
+ Transform_Recurse(State, Memory, MainComp, Layer->Block_Source_Index, ParentLayer, Recursions + 1, SortedCompArray, SortedLayerArray,
+ MinX, MinY, MaxX, MaxY);
+ }
+ if (Layer->IsSelected) {
+ uint32 Width = 0, Height = 0;
+ if (!Layer->IsPrecomp) {
+ block_source *Source = (block_source *)Memory_Block_AddressAtIndex(Memory, F_Sources, Layer->Block_Source_Index);
+ Width = Source->Width;
+ Height = Source->Height;
+ } else {
+ block_composition *Comp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, Layer->Block_Source_Index);
+ Width = Comp->Width;
+ Height = Comp->Height;
+ }
+
+ v2 Point[5] = { V2(Width*Layer->ax.CurrentValue, Height*Layer->ay.CurrentValue), V2(0, 0), V2(Width, 0), V2(0, Height), V2(Width, Height) };
+
+ layer_transforms T = Layer_GetTransforms(Layer);
+
+ v2 NewPos[5];
+ for (int p = 0; p < 5; p++) {
+ NewPos[p] = TransformPoint(T, Width, Height, Point[p]);
+ }
+
+ for (uint32 r = 0; r < Recursions; r++) {
+ T = Layer_GetTransforms(ParentLayer[r]);
+ block_composition *Comp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, ParentLayer[r]->Block_Source_Index);
+ Width = Comp->Width;
+ Height = Comp->Height;
+ for (int p = 0; p < 5; p++) {
+ NewPos[p] = TransformPoint(T, Width, Height, NewPos[p]);
+ }
+ }
+
+ for (int p = 0; p < 4; p++) {
+ if (NewPos[p+1].x < *MinX) { *MinX = NewPos[p+1].x; }
+ if (NewPos[p+1].y < *MinY) { *MinY = NewPos[p+1].y; }
+ if (NewPos[p+1].x > *MaxX) { *MaxX = NewPos[p+1].x; }
+ if (NewPos[p+1].y > *MaxY) { *MaxY = NewPos[p+1].y; }
+ }
+ }
+ }
+}
+
+// IMPORTANT(fox): The selection state and ordering of layers cannot change
+// until this action is exited/committed!
+static void
+Interact_Transform_Begin(project_data *File, memory *Memory, project_state *State, ImVec2 OGPos,
+ sorted_comp_array *SortedCompArray, sorted_layer_array *SortedLayerArray)
+{
+ real32 MinX = 100000;
+ real32 MinY = 100000;
+ real32 MaxX = -100000;
+ real32 MaxY = -100000;
+ block_composition *MainComp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, File->PrincipalCompIndex);
+ block_layer *ParentLayer[4];
+ Transform_Recurse(State, Memory, MainComp, File->PrincipalCompIndex, ParentLayer, 0,
+ SortedCompArray, SortedLayerArray,
+ &MinX, &MinY, &MaxX, &MaxY);
+ if (MinX != 100000) {
+ State->Interact_Active = interact_type_viewport_transform;
+ interact_transform *Interact = (interact_transform *)&State->Interact_Offset[0];
+ Interact->Min = V2(MinX, MinY);
+ Interact->Max = V2(MaxX, MaxY);
+ Interact->Position = V2(0);
+ Interact->Radians = 0;
+ Interact->Scale = 1.0f;
+ Interact->OGPos = OGPos;
+ }
+ /*
+ bool32 Activate = false;
+ // Find the max dimensions of all the selected layers.
+ for (int i = 0; i < File->Layer_Count; i++) {
+ block_layer *Layer = (block_layer *)Memory_Block_AddressAtIndex(Memory, F_Layers, i);
+ if (!Layer->IsSelected)
+ continue;
+ uint32 Width = 0, Height = 0;
+ if (!Layer->IsPrecomp) {
+ block_source *Source = (block_source *)Memory_Block_AddressAtIndex(Memory, F_Sources, Layer->Block_Source_Index);
+ Width = Source->Width;
+ Height = Source->Height;
+ } else {
+ block_composition *Comp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, Layer->Block_Source_Index);
+ Width = Comp->Width;
+ Height = Comp->Height;
+ }
+ Transform_IterateOuterBounds(Layer, Width, Height, &MinX, &MinY, &MaxX, &MaxY);
+ Activate = true;
+ }
+ if (Activate) {
+ State->Interact_Active = interact_type_viewport_transform;
+ interact_transform *Interact = (interact_transform *)&State->Interact_Offset[0];
+ Interact->Min = V2(MinX, MinY);
+ Interact->Max = V2(MaxX, MaxY);
+ Interact->Position = V2(0);
+ Interact->Radians = 0;
+ Interact->Scale = 1.0f;
+ Interact->OGPos = OGPos;
+ }
+ */
+}
+
+static v2
+TransformPoint(layer_transforms T, real32 Width, real32 Height, v2 Point)
+{
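+ // Forward mapping (layer pixels -> comp pixels): offset the point by the
+ // anchor, rotate/scale it along the layer axes, then translate by the layer
+ // position. T_CompPosToLayerPos above is the corresponding inverse.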
+ real32 Rad = (T.rotation * (PI / 180));
+ v2 XAxis = (Point.x - T.ax*Width) * T.scale * V2(cos(Rad), sin(Rad));
+ v2 YAxis = (Point.y - T.ay*Height) * -T.scale * V2(sin(Rad), -cos(Rad));
+ v2 LocalPoint = XAxis + YAxis;
+ return V2(T.x + LocalPoint.x, T.y + LocalPoint.y);
+}
+
+
+static ImVec2
+Layer_LocalToScreenSpace(project_state *State, memory *Memory, block_layer *Layer, ui *UI, uint32 PrincipalCompIndex, v2 Point)
+{
+ block_composition *MainComp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, PrincipalCompIndex);
+
+ uint32 Width = 0, Height = 0;
+ if (!Layer->IsPrecomp) {
+ block_source *Source = (block_source *)Memory_Block_AddressAtIndex(Memory, F_Sources, Layer->Block_Source_Index);
+ Width = Source->Width;
+ Height = Source->Height;
+ } else {
+ block_composition *Comp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, Layer->Block_Source_Index);
+ Width = Comp->Width;
+ Height = Comp->Height;
+ }
+
+ layer_transforms T = Layer_GetTransforms(Layer);
+
+ if (State->Interact_Active == interact_type_viewport_transform && Layer->IsSelected == 1) {
+ Transform_ApplyInteractive(*(interact_transform *)&State->Interact_Offset[0], &T.x, &T.y, &T.rotation, &T.scale);
+ }
+
+ v2 NewPos = TransformPoint(T, Width, Height, Point);
+
+ if (Layer->Block_Composition_Index != PrincipalCompIndex) {
+ layer_transforms T = Layer_GetTransforms(Layer);
+ NewPos = TransformPoint(T, Width, Height, NewPos);
+ }
+
+ v2 CompUV = NewPos / V2(MainComp->Width, MainComp->Height);
+
+ v2 ScreenPoint = V2(UI->CompPos.x + CompUV.x * UI->CompZoom.x,
+ UI->CompPos.y + CompUV.y * UI->CompZoom.y);
+
+ return ImVec2(ScreenPoint.x, ScreenPoint.y);
+}
+
+static void
+Fallback_RenderLayer(transform_info T, void *OutputBuffer, rectangle RenderRegion);
+static void
+Fallback_RenderDirect(direct_info T, void *OutputBuffer, rectangle RenderRegion);
+
+static void
+RenderLayers(render_entry Entry) {
+ switch (Entry.RenderType)
+ {
+ case render_type_main:
+ {
+ Fallback_RenderLayer(*(transform_info *)Entry.RenderData, Entry.OutputBuffer, Entry.RenderRegion);
+ } break;
+ case render_type_notransform:
+ {
+ Fallback_RenderDirect(*(direct_info *)Entry.RenderData, Entry.OutputBuffer, Entry.RenderRegion);
+ } break;
+ case render_type_notransform_swap:
+ {
+ Fallback_RenderDirect(*(direct_info *)Entry.RenderData, Entry.OutputBuffer, Entry.RenderRegion);
+ } break;
+ case render_type_brush:
+ {
+ PaintTest(*(brush_info *)Entry.RenderData, Entry.OutputBuffer, Entry.RenderRegion);
+ } break;
+ default:
+ {
+ Assert(0);
+ }
+ }
+#if 0
+#if ARM
+ Fallback_RenderLayer(RenderData->TransformInfo[i], RenderInfo->CompBuffer, RenderRegion);
+#else
+ if (InstructionMode == instruction_mode_avx)
+ AVX2_RenderLayer(Entry.T, Entry.OutputBuffer, Entry.RenderRegion);
+ else
+ Fallback_RenderLayer(Entry.T, Entry.OutputBuffer, Entry.RenderRegion);
+#endif
+#endif
+}
+
+static void
+Renderer_Start(void *Data, void *OutputBuffer, render_type RenderType, rectangle RenderRegion)
+{
+#if DEBUG
+ if (Debug.NoThreading) {
+ render_entry Entry = { Data, OutputBuffer, RenderType, RenderRegion };
+ RenderLayers(Entry);
+ return;
+ }
+#endif
+ // CPU
+ Threading_BitmapOp(Data, OutputBuffer, RenderType, RenderRegion);
+}
+
+static void
+Renderer_Check(bool32 *Test, render_type RenderType)
+{
+#if DEBUG
+ if (Debug.NoThreading) {
+ *Test = true;
+ return;
+ }
+#endif
+ // CPU
+ *Test = Threading_IsActive(RenderType);
+}
+
+
+static transform_info
+Transform_Calculate(project_state *State, memory *Memory, project_data *File, block_layer *Layer, block_composition *Comp,
+ int Width, int Height, int BytesPerPixel)
+{
+ transform_info TransformInfo;
+
+ real32 Rotation = Layer->rotation.CurrentValue;
+ real32 X = Layer->x.CurrentValue;
+ real32 Y = Layer->y.CurrentValue;
+ real32 s = Layer->scale.CurrentValue;
+ blend_mode BlendMode = Layer->BlendMode;
+
+ if (State->Interact_Active == interact_type_viewport_transform && Layer->IsSelected == 1) {
+ Transform_ApplyInteractive(*(interact_transform *)&State->Interact_Offset[0], &X, &Y, &Rotation, &s);
+ }
+
+ /*
+ state_file_ui *UI = &State->Context[State->CurrentFileIndex].UI;
+ if (UI->IsInteracting == true && UI->InteractMode == interact_transforms && Layer->IsSelected && !Layer->IsAdjustment)
+ Transform_ApplyInteractive(UI, &X, &Y, &Rotation, &s);
+
+ if (UI->IsInteractingBlendmode == true && Layer->IsSelected)
+ BlendMode = UI->InteractBlendmode;
+ */
+
+ real32 Rad = (Rotation * (PI / 180));
+ // v2 Scale = {Source->Raster.Width * s, Source->Raster.Height * s};
+
+ v2 XAxis = (Width * s)*V2(cos(Rad), sin(Rad));
+ v2 YAxis = (Height * -s)*V2(sin(Rad), -cos(Rad));
+
+ real32 AnchorX = Layer->ax.CurrentValue;
+ real32 AnchorY = Layer->ay.CurrentValue;
+
+ v2 Pos = {X, Y};
+ v2 Origin = Pos - (XAxis * AnchorX) - (YAxis * AnchorY);
+
+ real32 XLengthSq = 1.0f / LengthSq(XAxis);
+ real32 YLengthSq = 1.0f / LengthSq(YAxis);
+
+ int32 MaxX = 0;
+ int32 MaxY = 0;
+ int32 MinX = Comp->Width;
+ int32 MinY = Comp->Height;
+
+ v2 Points[4] = {Origin, Origin + XAxis, Origin + YAxis, Origin + XAxis + YAxis};
+ for (int i = 0; i < 4; i++) {
+ if (Points[i].x < MinX) { MinX = Points[i].x; }
+ if (Points[i].y < MinY) { MinY = Points[i].y; }
+ if (Points[i].x > MaxX) { MaxX = Points[i].x; }
+ if (Points[i].y > MaxY) { MaxY = Points[i].y; }
+ }
+ TransformInfo.XAxisPX = XLengthSq*XAxis.x;
+ TransformInfo.XAxisPY = XLengthSq*XAxis.y;
+ TransformInfo.YAxisPX = YLengthSq*YAxis.x;
+ TransformInfo.YAxisPY = YLengthSq*YAxis.y;
+
+ TransformInfo.BufferWidth = Comp->Width;
+ TransformInfo.BufferHeight = Comp->Height;
+ TransformInfo.BufferBytesPerPixel = Comp->BytesPerPixel;
+ TransformInfo.BufferBits = Bitmap_ByteInfo(Comp->BytesPerPixel);
+
+ TransformInfo.LayerWidth = Width;
+ TransformInfo.LayerHeight = Height;
+ TransformInfo.LayerBytesPerPixel = BytesPerPixel;
+ TransformInfo.LayerBits = Bitmap_ByteInfo(BytesPerPixel);
+
+ TransformInfo.LayerOpacity = Layer->opacity.CurrentValue;
+ TransformInfo.BlendMode = BlendMode;
+ TransformInfo.OriginX = Origin.x;
+ TransformInfo.OriginY = Origin.y;
+ TransformInfo.BufferPitch = Comp->Width*Comp->BytesPerPixel;
+ TransformInfo.LayerPitch = Width*BytesPerPixel;
+ TransformInfo.ClipRect = {MinX, MinY, MaxX, MaxY};
+
+ TransformInfo.IsAdjustment = Layer->IsAdjustment;
+
+ return TransformInfo;
+}
+
+// NOTE(fox): is this too ridiculous? i don't trust inline
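+// The macro expects the source color in R_Col/G_Col/B_Col and the destination
+// in R_Dest/G_Dest/B_Dest (all 0..1) and writes the result to
+// R_Blend/G_Blend/B_Blend; blend_normal leaves the pre-seeded source values
+// untouched.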
+#define Fallback_Blend() \
+ switch (T.BlendMode)\
+ {\
+ case blend_normal:\
+ {\
+ } break;\
+ case blend_multiply:\
+ {\
+ R_Blend = R_Dest * R_Col;\
+ G_Blend = G_Dest * G_Col;\
+ B_Blend = B_Dest * B_Col;\
+ } break;\
+ case blend_colorburn:\
+ {\
+ /* NOTE(fox): Padding to prevent actual crashing from zero division */ \
+ R_Blend = 1.0f - ((1.0f - R_Dest) / (R_Col + 0.001f));\
+ G_Blend = 1.0f - ((1.0f - G_Dest) / (G_Col + 0.001f));\
+ B_Blend = 1.0f - ((1.0f - B_Dest) / (B_Col + 0.001f));\
+ } break;\
+ case blend_linearburn:\
+ {\
+ R_Blend = (R_Dest + R_Col) - 1.0f;\
+ G_Blend = (G_Dest + G_Col) - 1.0f;\
+ B_Blend = (B_Dest + B_Col) - 1.0f;\
+ } break;\
+ case blend_add:\
+ {\
+ R_Blend = R_Dest + R_Col;\
+ G_Blend = G_Dest + G_Col;\
+ B_Blend = B_Dest + B_Col;\
+ } break;\
+ case blend_screen:\
+ {\
+ R_Blend = 1.0f - ((1.0f - R_Dest) * (1.0f - R_Col));\
+ G_Blend = 1.0f - ((1.0f - G_Dest) * (1.0f - G_Col));\
+ B_Blend = 1.0f - ((1.0f - B_Dest) * (1.0f - B_Col));\
+ } break;\
+ case blend_overlay:\
+ {\
+ if (R_Dest < 0.5) {\
+ R_Blend = 2.0f * R_Dest * R_Col;\
+ } else {\
+ R_Blend = 1.0f - (2.0f * (1.0f - R_Dest) * (1.0f - R_Col));\
+ }\
+ if (G_Dest < 0.5) {\
+ G_Blend = 2.0f * G_Dest * G_Col;\
+ } else {\
+ G_Blend = 1.0f - (2.0f * (1.0f - G_Dest) * (1.0f - G_Col));\
+ }\
+ if (B_Dest < 0.5) {\
+ B_Blend = 2.0f * B_Dest * B_Col;\
+ } else {\
+ B_Blend = 1.0f - (2.0f * (1.0f - B_Dest) * (1.0f - B_Col));\
+ }\
+ } break;\
+ case blend_softlight:\
+ {\
+ /* using Pegtop's equation */ \
+ R_Blend = ((1.0f - R_Col * 2) * R_Dest * R_Dest) + (R_Col * 2 * R_Dest);\
+ G_Blend = ((1.0f - G_Col * 2) * G_Dest * G_Dest) + (G_Col * 2 * G_Dest);\
+ B_Blend = ((1.0f - B_Col * 2) * B_Dest * B_Dest) + (B_Col * 2 * B_Dest);\
+ } break;\
+ case blend_hardlight:\
+ {\
+ if (R_Dest > 0.5) {\
+ R_Blend = 2.0f * R_Dest * R_Col;\
+ } else {\
+ R_Blend = 1.0f - (2.0f * (1.0f - R_Dest) * (1.0f - R_Col));\
+ }\
+ if (G_Dest > 0.5) {\
+ G_Blend = 2.0f * G_Dest * G_Col;\
+ } else {\
+ G_Blend = 1.0f - (2.0f * (1.0f - G_Dest) * (1.0f - G_Col));\
+ }\
+ if (B_Dest > 0.5) {\
+ B_Blend = 2.0f * B_Dest * B_Col;\
+ } else {\
+ B_Blend = 1.0f - (2.0f * (1.0f - B_Dest) * (1.0f - B_Col));\
+ }\
+ } break;\
+ case blend_subtract:\
+ {\
+ R_Blend = R_Dest - R_Col;\
+ G_Blend = G_Dest - G_Col;\
+ B_Blend = B_Dest - B_Col;\
+ } break;\
+ case blend_divide:\
+ {\
+ R_Blend = R_Dest / (R_Col + 0.001f);\
+ G_Blend = G_Dest / (G_Col + 0.001f);\
+ B_Blend = B_Dest / (B_Col + 0.001f);\
+ } break;\
+ case blend_difference:\
+ {\
+ if (R_Col - R_Dest > 0) {\
+ R_Blend = R_Col - R_Dest;\
+ } else {\
+ R_Blend = R_Dest - R_Col;\
+ }\
+ if (G_Col - G_Dest > 0) {\
+ G_Blend = G_Col - G_Dest;\
+ } else {\
+ G_Blend = G_Dest - G_Col;\
+ }\
+ if (B_Col - B_Dest > 0) {\
+ B_Blend = B_Col - B_Dest;\
+ } else {\
+ B_Blend = B_Dest - B_Col;\
+ }\
+ } break;\
+ }\
+
+static void
+Fallback_RenderDirect(direct_info T, void *OutputBuffer, rectangle RenderRegion)
+{
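+ // Direct (no-transform) path: the source buffer matches the output buffer's
+ // dimensions and layout, so pixels are blended 1:1. When SwapActive is set,
+ // the previous destination values are also written back into the source.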
+ rectangle LayerBounds = ClipRectangle( T.ClipRect, RenderRegion);
+
+ Assert(LayerBounds.Max.x <= T.BufferWidth);
+ Assert(LayerBounds.Max.y <= T.BufferHeight);
+
+ for (int16 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
+ {
+ for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++)
+ {
+ uint16 LX = X;
+ uint16 LY = Y;
+
+ uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + ((uint16)T.BufferPitch * LY) + (LX * (uint16)T.BufferBytesPerPixel));
+
+ uint32 *R_SrcAddress = (uint32 *)(TexPTR0 + T.BufferBits.ByteOffset * 0);
+ uint32 *G_SrcAddress = (uint32 *)(TexPTR0 + T.BufferBits.ByteOffset * 1);
+ uint32 *B_SrcAddress = (uint32 *)(TexPTR0 + T.BufferBits.ByteOffset * 2);
+ uint32 *A_SrcAddress = (uint32 *)(TexPTR0 + T.BufferBits.ByteOffset * 3);
+
+ real32 R_Col = (real32)(*R_SrcAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
+ real32 G_Col = (real32)(*G_SrcAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
+ real32 B_Col = (real32)(*B_SrcAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
+ real32 A_Col = (real32)(*A_SrcAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
+
+ if (A_Col == 0)
+ continue;
+
+ real32 LayerAlpha = A_Col * T.Opacity;
+
+ uint8 *DestPixel = ((uint8 *)OutputBuffer + ((uint16)Y * (uint16)T.BufferPitch) + ((uint16)X * (uint16)T.BufferBytesPerPixel));
+
+ uint32 *R_DestAddress = (uint32 *)(DestPixel + T.BufferBits.ByteOffset * 0);
+ uint32 *G_DestAddress = (uint32 *)(DestPixel + T.BufferBits.ByteOffset * 1);
+ uint32 *B_DestAddress = (uint32 *)(DestPixel + T.BufferBits.ByteOffset * 2);
+ uint32 *A_DestAddress = (uint32 *)(DestPixel + T.BufferBits.ByteOffset * 3);
+
+ uint32 R_DestInt = (*R_DestAddress & T.BufferBits.MaskPixel);
+ uint32 G_DestInt = (*G_DestAddress & T.BufferBits.MaskPixel);
+ uint32 B_DestInt = (*B_DestAddress & T.BufferBits.MaskPixel);
+ uint32 A_DestInt = (*A_DestAddress & T.BufferBits.MaskPixel);
+
+ real32 R_Dest = (real32)(R_DestInt) * T.BufferBits.Normalized;
+ real32 G_Dest = (real32)(G_DestInt) * T.BufferBits.Normalized;
+ real32 B_Dest = (real32)(B_DestInt) * T.BufferBits.Normalized;
+ real32 A_Dest = (real32)(A_DestInt) * T.BufferBits.Normalized;
+ real32 Test = (A_Dest > 0.01) ? 1 : 0;
+
+ real32 R_Blend = R_Col;
+ real32 G_Blend = G_Col;
+ real32 B_Blend = B_Col;
+ real32 A_Blend = A_Col;
+ // A_Blend = (A_Blend >= 0.04045) ? pow((A_Blend + 0.055) / (1 + 0.055), 2.4) : A_Blend / 12.92;
+
+ if (LayerAlpha != 1.0f || T.BlendMode != blend_normal) {
+
+ Fallback_Blend();
+
+ if (A_Dest == 0) {
+ A_Blend = LayerAlpha;
+ } else {
+ A_Blend = A_Dest + ((1.0f - A_Dest) * LayerAlpha);
+ real32 Alpha = pow(LayerAlpha, A_Dest);
+ R_Blend = (R_Dest * (1.0f - Alpha)) + (R_Blend * Alpha);
+ G_Blend = (G_Dest * (1.0f - Alpha)) + (G_Blend * Alpha);
+ B_Blend = (B_Dest * (1.0f - Alpha)) + (B_Blend * Alpha);
+ }
+ }
+
+ uint32 R_Out = (uint32)(Normalize(R_Blend) * T.BufferBits.Bits);
+ uint32 G_Out = (uint32)(Normalize(G_Blend) * T.BufferBits.Bits);
+ uint32 B_Out = (uint32)(Normalize(B_Blend) * T.BufferBits.Bits);
+ uint32 A_Out = (uint32)(Normalize(A_Blend) * T.BufferBits.Bits);
+
+ if (T.SwapActive)
+ {
+ *R_SrcAddress = (*R_SrcAddress & ~T.BufferBits.MaskPixel) | R_DestInt;
+ *G_SrcAddress = (*G_SrcAddress & ~T.BufferBits.MaskPixel) | G_DestInt;
+ *B_SrcAddress = (*B_SrcAddress & ~T.BufferBits.MaskPixel) | B_DestInt;
+ *A_SrcAddress = (*A_SrcAddress & ~T.BufferBits.MaskPixel) | A_DestInt;
+ }
+ *R_DestAddress = (*R_DestAddress & ~T.BufferBits.MaskPixel) | R_Out;
+ *G_DestAddress = (*G_DestAddress & ~T.BufferBits.MaskPixel) | G_Out;
+ *B_DestAddress = (*B_DestAddress & ~T.BufferBits.MaskPixel) | B_Out;
+ *A_DestAddress = (*A_DestAddress & ~T.BufferBits.MaskPixel) | A_Out;
+ }
+ }
+}
+
+static void
+Fallback_RenderLayer(transform_info T, void *OutputBuffer, rectangle RenderRegion)
+{
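+ // Scalar reference path: for each output pixel in the clipped bounds,
+ // inverse-map it into the layer's UV space, bilinearly sample the four
+ // surrounding texels, and blend the result into the destination buffer.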
+ rectangle LayerBounds = ClipRectangle( T.ClipRect, RenderRegion);
+
+ Assert(LayerBounds.Max.x <= T.BufferWidth);
+ Assert(LayerBounds.Max.y <= T.BufferHeight);
+
+ for (int Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
+ {
+ real32 StartVectorY = (real32)Y - T.OriginY;
+
+ for (int X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++)
+ {
+ real32 StartVectorX = X - T.OriginX;
+ real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
+ real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY);
+
+ if (U < 1.0f && U >= 0.0f && V < 1.0f && V >= 0.0f) {
+
+ real32 TexXFull = U * T.LayerWidth;
+ uint32 TexXInt = (uint32)TexXFull;
+ real32 TexX = TexXFull - TexXInt;
+
+ real32 TexYFull = V * T.LayerHeight;
+ uint32 TexYInt = (uint32)TexYFull;
+ real32 TexY = TexYFull - TexYInt;
+
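+ // Bilinear weights: TexBothInv -> top-left, TexBothYInv -> top-right,
+ // TexBothXInv -> bottom-left, TexBoth -> bottom-right.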
+ real32 TexXInv = 1 - TexX;
+ real32 TexYInv = 1 - TexY;
+ real32 TexBothXInv = TexXInv * TexY;
+ real32 TexBothYInv = TexX * TexYInv;
+ real32 TexBoth = TexY * TexX;
+ real32 TexBothInv = TexXInv * TexYInv;
+
+
+ uint32 LX = TexXInt;
+ uint32 LY = TexYInt;
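+ // The +1 neighbors are clamped to the last row/column so the bilinear
+ // fetch never reads past the edge of the bitmap.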
+ uint32 LXPlus = Ceil(TexXInt+1, (uint32)T.LayerWidth - 1);
+ uint32 LYPlus = Ceil(TexYInt+1, (uint32)T.LayerHeight - 1);
+
+ uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + ((uint32)T.LayerPitch * LY) + (LX * (uint32)T.LayerBytesPerPixel));
+ uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + ((uint32)T.LayerPitch * LY) + (LXPlus * (uint32)T.LayerBytesPerPixel));
+ uint8 *TexPTR2 = ((uint8 *)T.SourceBuffer + ((uint32)T.LayerPitch * LYPlus) + (LX * (uint32)T.LayerBytesPerPixel));
+ uint8 *TexPTR3 = ((uint8 *)T.SourceBuffer + ((uint32)T.LayerPitch * LYPlus) + (LXPlus * (uint32)T.LayerBytesPerPixel));
+
+ uint32 PixelA = *(uint32 *)TexPTR0;
+ uint32 PixelB = *(uint32 *)TexPTR1;
+ uint32 PixelC = *(uint32 *)TexPTR2;
+ uint32 PixelD = *(uint32 *)TexPTR3;
+
+
+#if 0
+ real32 TexRA = (real32)(PixelA & 0xFF) * Normalized255;
+ real32 TexRB = (real32)(PixelB & 0xFF) * Normalized255;
+ real32 TexRC = (real32)(PixelC & 0xFF) * Normalized255;
+ real32 TexRD = (real32)(PixelD & 0xFF) * Normalized255;
+
+ real32 TexGA = (real32)((PixelA >> 8) & 0xFF) * Normalized255;
+ real32 TexGB = (real32)((PixelB >> 8) & 0xFF) * Normalized255;
+ real32 TexGC = (real32)((PixelC >> 8) & 0xFF) * Normalized255;
+ real32 TexGD = (real32)((PixelD >> 8) & 0xFF) * Normalized255;
+
+ real32 TexBA = (real32)((PixelA >> 16) & 0xFF) * Normalized255;
+ real32 TexBB = (real32)((PixelB >> 16) & 0xFF) * Normalized255;
+ real32 TexBC = (real32)((PixelC >> 16) & 0xFF) * Normalized255;
+ real32 TexBD = (real32)((PixelD >> 16) & 0xFF) * Normalized255;
+
+ real32 TexAA = (real32)((PixelA >> 24) & 0xFF) * Normalized255;
+ real32 TexAB = (real32)((PixelB >> 24) & 0xFF) * Normalized255;
+ real32 TexAC = (real32)((PixelC >> 24) & 0xFF) * Normalized255;
+ real32 TexAD = (real32)((PixelD >> 24) & 0xFF) * Normalized255;
+#else
+ real32 TexRA = (real32)(*(uint32 *)(TexPTR0 + T.LayerBits.ByteOffset * 0) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexGA = (real32)(*(uint32 *)(TexPTR0 + T.LayerBits.ByteOffset * 1) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexBA = (real32)(*(uint32 *)(TexPTR0 + T.LayerBits.ByteOffset * 2) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexAA = (real32)(*(uint32 *)(TexPTR0 + T.LayerBits.ByteOffset * 3) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+
+ real32 TexRB = (real32)(*(uint32 *)(TexPTR1 + T.LayerBits.ByteOffset * 0) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexGB = (real32)(*(uint32 *)(TexPTR1 + T.LayerBits.ByteOffset * 1) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexBB = (real32)(*(uint32 *)(TexPTR1 + T.LayerBits.ByteOffset * 2) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexAB = (real32)(*(uint32 *)(TexPTR1 + T.LayerBits.ByteOffset * 3) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+
+ real32 TexRC = (real32)(*(uint32 *)(TexPTR2 + T.LayerBits.ByteOffset * 0) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexGC = (real32)(*(uint32 *)(TexPTR2 + T.LayerBits.ByteOffset * 1) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexBC = (real32)(*(uint32 *)(TexPTR2 + T.LayerBits.ByteOffset * 2) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexAC = (real32)(*(uint32 *)(TexPTR2 + T.LayerBits.ByteOffset * 3) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+
+ real32 TexRD = (real32)(*(uint32 *)(TexPTR3 + T.LayerBits.ByteOffset * 0) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexGD = (real32)(*(uint32 *)(TexPTR3 + T.LayerBits.ByteOffset * 1) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexBD = (real32)(*(uint32 *)(TexPTR3 + T.LayerBits.ByteOffset * 2) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+ real32 TexAD = (real32)(*(uint32 *)(TexPTR3 + T.LayerBits.ByteOffset * 3) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
+
+#endif
+
+ real32 R_Col = (TexBothInv * TexRA) + (TexBothYInv * TexRB)
+ + (TexBothXInv * TexRC) + (TexBoth * TexRD);
+ real32 G_Col = (TexBothInv * TexGA) + (TexBothYInv * TexGB)
+ + (TexBothXInv * TexGC) + (TexBoth * TexGD);
+ real32 B_Col = (TexBothInv * TexBA) + (TexBothYInv * TexBB)
+ + (TexBothXInv * TexBC) + (TexBoth * TexBD);
+ real32 A_Col = (TexBothInv * TexAA) + (TexBothYInv * TexAB)
+ + (TexBothXInv * TexAC) + (TexBoth * TexAD);
+
+ real32 LayerAlpha = A_Col * T.LayerOpacity;
+
+#if DEBUG
+ if (Debug.DisableAlpha == 1) {
+ A_Col = 1;
+ LayerAlpha = 1;
+ }
+#endif
+
+ real32 R_Blend = R_Col;
+ real32 G_Blend = G_Col;
+ real32 B_Blend = B_Col;
+ real32 A_Blend = A_Col;
+
+ uint8 *DestPixel = ((uint8 *)OutputBuffer + ((uint32)Y * (uint32)T.BufferPitch) + ((uint32)X * (uint32)T.BufferBytesPerPixel));
+ Assert(X != (T.BufferWidth));
+
+ uint8 *R_DestAddress = (DestPixel + T.BufferBits.ByteOffset * 0);
+ uint8 *G_DestAddress = (DestPixel + T.BufferBits.ByteOffset * 1);
+ uint8 *B_DestAddress = (DestPixel + T.BufferBits.ByteOffset * 2);
+ uint8 *A_DestAddress = (DestPixel + T.BufferBits.ByteOffset * 3);
+
+ if (LayerAlpha != 1.0f || T.BlendMode != blend_normal) {
+
+ real32 R_Dest = (real32)(*R_DestAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
+ real32 G_Dest = (real32)(*G_DestAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
+ real32 B_Dest = (real32)(*B_DestAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
+ real32 A_Dest = (real32)(*A_DestAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
+
+ Fallback_Blend();
+
+ R_Blend = (R_Dest * (1.0f - LayerAlpha)) + (R_Blend * LayerAlpha);
+ G_Blend = (G_Dest * (1.0f - LayerAlpha)) + (G_Blend * LayerAlpha);
+ B_Blend = (B_Dest * (1.0f - LayerAlpha)) + (B_Blend * LayerAlpha);
+
+ if (T.BlendMode == blend_normal)
+ A_Blend = A_Dest + LayerAlpha;
+ // A_Blend = A_Dest + ((1.0f - A_Dest) * LayerAlpha);
+ else
+ A_Blend = A_Dest;
+#if DEBUG
+ if (Debug.DisableAlpha == 1) {
+ G_Blend = R_Blend;
+ B_Blend = R_Blend;
+ } else
+ if (Debug.DisableAlpha == 2) {
+ R_Blend = LayerAlpha;
+ G_Blend = LayerAlpha;
+ B_Blend = LayerAlpha;
+ }
+#endif
+ }
+
+ uint8 R_Out = (uint8)(Normalize(R_Blend) * T.BufferBits.Bits);
+ uint8 G_Out = (uint8)(Normalize(G_Blend) * T.BufferBits.Bits);
+ uint8 B_Out = (uint8)(Normalize(B_Blend) * T.BufferBits.Bits);
+ uint8 A_Out = (uint8)(Normalize(A_Blend) * T.BufferBits.Bits);
+
+ *R_DestAddress = R_Out;
+ *G_DestAddress = G_Out;
+ *B_DestAddress = B_Out;
+ *A_DestAddress = A_Out;
+ // *R_DestAddress = 255;
+ // *G_DestAddress = 255;
+ // *B_DestAddress = 255;
+ // *A_DestAddress = 255;
+ }
+ }
+ }
+}
+
+#if 0
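+// The code below this #if 0 (an earlier packed-RGB version of the renderer
+// with scalar, NEON, and AVX2 paths) is currently compiled out.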
+static void
+Layer_CalcRotatedOffset(project_layer *Layer, v2 Increment, v2 Divisor, real32 *ValueX, real32 *ValueY)
+{
+
+ real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180));
+ real32 s = Layer->scale.CurrentValue.f;
+
+ v2 XAxis = V2(cos(Rad), sin(Rad)) * (Increment.x / s);
+ v2 YAxis = V2(sin(Rad), -cos(Rad)) * (Increment.y / -s);
+
+ *ValueX += XAxis.x/Divisor.x;
+ *ValueY -= XAxis.y/Divisor.y;
+ *ValueX -= YAxis.x/Divisor.x;
+ *ValueY += YAxis.y/Divisor.y;
+}
+
+static transform_info
+CalculateTransforms(project_layer *Layer, comp_buffer *CompBuffer)
+{
+ transform_info TransformInfo;
+ source *Source = Layer->Source;
+
+ real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180));
+ real32 s = Layer->scale.CurrentValue.f;
+ // v2 Scale = {Source->Raster.Width * s, Source->Raster.Height * s};
+
+ v2 XAxis = (Source->Info.Width * s)*V2(cos(Rad), sin(Rad));
+ v2 YAxis = (Source->Info.Height * -s)*V2(sin(Rad), -cos(Rad));
+
+ real32 AnchorX = Layer->ax.CurrentValue.f;
+ real32 AnchorY = Layer->ay.CurrentValue.f;
+
+ v2 Pos = {Layer->x.CurrentValue.f, Layer->y.CurrentValue.f};
+ v2 Origin = Pos - (XAxis * AnchorX) - (YAxis * AnchorY);
+
+ real32 XLengthSq = 1.0f / LengthSq(XAxis);
+ real32 YLengthSq = 1.0f / LengthSq(YAxis);
+
+ int32 MaxX = 0;
+ int32 MaxY = 0;
+ int32 MinX = CompBuffer->Width;
+ int32 MinY = CompBuffer->Height;
+
+ v2 Points[4] = {Origin, Origin + XAxis, Origin + YAxis, Origin + XAxis + YAxis};
+ for (int i = 0; i < 4; i++) {
+ if (Points[i].x < MinX) { MinX = Points[i].x; }
+ if (Points[i].y < MinY) { MinY = Points[i].y; }
+ if (Points[i].x > MaxX) { MaxX = Points[i].x; }
+ if (Points[i].y > MaxY) { MaxY = Points[i].y; }
+ }
+
+ TransformInfo.XAxisPX = XLengthSq*XAxis.x;
+ TransformInfo.XAxisPY = XLengthSq*XAxis.y;
+ TransformInfo.YAxisPX = YLengthSq*YAxis.x;
+ TransformInfo.YAxisPY = YLengthSq*YAxis.y;
+
+ uint16 Width = Source->Info.Width;
+ uint16 Height = Source->Info.Height;
+ uint16 WidthP, HeightP;
+ Bitmap_CalcPackedDimensions(Width, Height, &WidthP, &HeightP);
+
+ TransformInfo.LayerWidth = Width;
+ TransformInfo.LayerHeight = Height;
+ TransformInfo.FullLayerWidth = WidthP;
+ TransformInfo.FullLayerHeight = HeightP;
+ TransformInfo.LayerOpacity = Layer->opacity.CurrentValue.f;
+ TransformInfo.BlendMode =Layer->BlendMode;
+ TransformInfo.OriginX = Origin.x;
+ TransformInfo.OriginY = Origin.y;
+ TransformInfo.BufferPitch = CompBuffer->Width*CompBuffer->BytesPerPixel;
+ TransformInfo.LayerPitch = Source->Info.Width*Source->Info.BytesPerPixel;
+ TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX + 1, MaxY + 1};
+
+ TransformInfo.SourceBuffer = Layer->BitmapInfo.BitmapBuffer;
+
+ return TransformInfo;
+}
+
+static void
+EndRenderState(project_state *State)
+{
+ IsRendering = false;
+
+ for (int16 i = 0; i < State->NumberOfLayersToRender; i++)
+ {
+ State->LayersToRender[i] = 0;
+ }
+
+ State->NumberOfLayersToRender = 0;
+ SDL_AtomicSet(&CurrentEntry, 0);
+ SDL_AtomicSet(&QueuedEntries, 0);
+ SDL_AtomicSet(&CompletedEntries, 0);
+
+}
+
+static void
+RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) {
+ for (int16 i = 0; i < RenderInfo->State->NumberOfLayersToRender; i++) {
+ int16 Idx = RenderInfo->State->LayersToRender[i];
+
+#if ARM
+ if (InstructionMode == instruction_mode_neon)
+ Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+#else
+ if (InstructionMode == instruction_mode_avx)
+ AVX2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+ else if (InstructionMode == instruction_mode_sse)
+ SSE2_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+#endif
+ else
+ Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
+ }
+}
+
+static void
+FinishRenderAndUpload(project_state *State, comp_buffer *CompBuffer, GLuint textureID)
+{
+#if PERF
+ Test = __rdtsc() - Test;
+
+ Debug.PixelCountRendered = 1280*720*5;
+ printf("Cycles per pixel rendered: %li ", Test / Debug.PixelCountRendered);
+ printf("Pixels rendered: %li ", Debug.PixelCountRendered);
+ printf("Cycles: %li\n", Test);
+
+ Test = 0;
+ Debug.PixelCountTransparent = 0;
+ Debug.PixelCountRendered = 0;
+ Debug.PixelCountChecked = 0;
+#endif
+
+
+#if PACKEDRGB
+ Bitmap_ConvertPacking(CompBuffer->PackedBuffer, CompBuffer->UnpackedBuffer,
+ CompBuffer->Width, CompBuffer->Height, CompBuffer->BytesPerPixel, 1);
+#endif
+ EndRenderState(State);
+ glBindTexture(GL_TEXTURE_2D, textureID);
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer->Width, CompBuffer->Height, GL_RGBA, GL_UNSIGNED_BYTE,
+ CompBuffer->UnpackedBuffer);
+
+ // shmp->shared_framenumber = File.CurrentFrame;
+ // if (sem_post(&shmp->sem2) == -1)
+ // Assert(0);
+}
+
+static void
+QueueCurrentFrame(project_data *File, comp_buffer *CompBuffer, project_state *State)
+{
+ IsRendering = true;
+ render_queue RenderInfo = {File, State, CompBuffer};
+
+#if PERF
+ Test = __rdtsc();
+#endif
+
+ for (int16 i = 0; i < File->NumberOfLayers; i++)
+ {
+ if (File->Layer[i]->StartFrame <= File->CurrentFrame &&
+ File->Layer[i]->EndFrame >= File->CurrentFrame)
+ {
+ File->Layer[i]->TransformInfo = CalculateTransforms(File->Layer[i], CompBuffer);
+ State->LayersToRender[State->NumberOfLayersToRender] = i;
+ State->NumberOfLayersToRender++;
+ }
+ }
+
+
+ uint16 TileWidth = CompBuffer->Width / 4;
+ uint16 TileHeight = CompBuffer->Height / 4;
+
+ for (int y = 0; y < 4; y++) {
+ for (int x = 0; x < 4; x++) {
+ // if (x == y) {
+ rectangle RenderRegion = {TileWidth*x, TileHeight*y, TileWidth + TileWidth*x, TileHeight + TileHeight*y};
+ // The render regions always have to be aligned to the top left of
+ // a 4x4 chunk (at least for AVX2) and cannot exceed the bounds of
+ // the comp.
+ // It seems we don't need any special math to guarantee this aside
+ // from rounding each edge down to a multiple of 4 and clamping to the comp.
+ RenderRegion.Min.x -= RenderRegion.Min.x % 4;
+ RenderRegion.Min.y -= RenderRegion.Min.y % 4;
+ RenderRegion.Max.x -= RenderRegion.Max.x % 4;
+ RenderRegion.Max.y -= RenderRegion.Max.y % 4;
+ if (RenderRegion.Max.x > CompBuffer->Width)
+ RenderRegion.Max.x = CompBuffer->Width;
+ if (RenderRegion.Max.y > CompBuffer->Height)
+ RenderRegion.Max.y = CompBuffer->Height;
+ PushRect(RenderRegion);
+ // }
+ }
+ }
+
+
+ rectangle RenderRegion = {0, 0, (int32)CompBuffer->Width, (int32)CompBuffer->Height};
+ RenderLayers(&RenderInfo, RenderRegion);
+
+}
+
+#if ARM
+
+static void
+NEON_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
+{
+ rectangle LayerBounds = ClipRectangle( T.ClipRect,
+ RenderRegion );
+ // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
+ LayerBounds.Min.x -= LayerBounds.Min.x % 4;
+ LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+
+ uint16 WidthP, HeightP;
+ Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
+ uint8 *TexPTR = (uint8 *)T.SourceBuffer;
+ Assert(LayerBounds.Max.x <= Buffer->Width);
+ Assert(LayerBounds.Max.y <= Buffer->Height);
+
+ float32x4_t XAxisPX = vdupq_n_f32(T.XAxisPX);
+ float32x4_t XAxisPY = vdupq_n_f32(T.XAxisPY);
+ float32x4_t YAxisPX = vdupq_n_f32(T.YAxisPX);
+ float32x4_t YAxisPY = vdupq_n_f32(T.YAxisPY);
+
+ float32x4_t LayerWidth = vdupq_n_f32(T.LayerWidth);
+ int32x4_t FullLayerWidth4i = vdupq_n_s32(T.FullLayerWidth*4);
+ int32x4_t LayerWidthMinusOne = vdupq_n_s32(T.LayerWidth - 1);
+ int32x4_t LayerHeightMinusOne = vdupq_n_s32(T.LayerHeight - 1);
+ float32x4_t LayerHeight = vdupq_n_f32(T.LayerHeight);
+ float32x4_t LayerOpacity = vdupq_n_f32(T.LayerOpacity);
+ float32x4_t OriginX = vdupq_n_f32(T.OriginX);
+ float32x4_t OriginY = vdupq_n_f32(T.OriginY);
+
+ float32x4_t ClipPrevent = vdupq_n_f32(0.001f);
+ float32x4_t One = vdupq_n_f32(1);
+ float32x4_t Two = vdupq_n_f32(2);
+ float32x4_t Zero = vdupq_n_f32(0);
+
+ float32x4_t ZeroPoint25 = vdupq_n_f32(0.25);
+ float32x4_t ZeroPointFive = vdupq_n_f32(0.5);
+ int32x4_t Onei = vdupq_n_s32(1);
+ float32x4_t Four = vdupq_n_f32(4);
+ int32x4_t FF = vdupq_n_s32(0xFF);
+ int32x4_t BottomTwoBits = vdupq_n_s32(0x03);
+ int32x4_t Fouri = vdupq_n_s32(4);
+ int32x4_t Sixteeni = vdupq_n_s32(16);
+ float32x4_t Real255 = vdupq_n_f32(255.0f);
+ float32x4_t Norm255 = vdupq_n_f32(1/255.0f);
+
+ // NOTE(fox): Each loop operates on 4 pixels, 4 horizontal by 1 vertical.
+
+ // TODO(fox): A possible optimization could be made by using the 32x4x4
+ // load intrinsic and a loop that repeats four times.
+
+ for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
+ {
+ real32 xvals[4] = { (real32)LayerBounds.Min.x, (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2, (real32)LayerBounds.Min.x+3 };
+ float32x4_t PixelX = vld1q_f32(xvals);
+
+ float32x4_t PixelY = vdupq_n_f32((real32)Y);
+ float32x4_t StartVectorY = vsubq_f32(PixelY, OriginY);
+
+ for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
+ {
+
+ float32x4_t StartVectorX = vsubq_f32(PixelX, OriginX);
+
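+ // Packed layout: pixels live in 4x4 tiles of 16 pixels, with WidthP*4 pixels
+ // per row of tiles. E.g. with WidthP = 8, pixel (X=5, Y=6) sits in tile (1,1)
+ // at element (Y%4)*4 + (X%4) = 9, so XLookup = 17, YLookup = 40, and
+ // PixelToSeek = 3*16 + 9 = 57.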
+ uint32 XLookup = (X >> 2)*16 + (X % 4);
+ uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
+ uint32 PixelToSeek = XLookup + YLookup;
+ uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel;
+
+ float32x4_t U = vaddq_f32(vmulq_f32(StartVectorX, XAxisPX), vmulq_f32(StartVectorY, XAxisPY));
+ float32x4_t V = vaddq_f32(vmulq_f32(StartVectorX, YAxisPX), vmulq_f32(StartVectorY, YAxisPY));
+
+ uint32x4_t LayerMask = vandq_u32(vandq_u32(vcgeq_f32(U, Zero), vcltq_f32(U, One)),
+ vandq_u32(vcgeq_f32(V, Zero), vcltq_f32(V, One)));
+
+ // TODO(fox): Make more efficient with some sort of truncation
+ uint32 comp[4];
+ vst1q_u32(comp, LayerMask);
+ if (comp[0] || comp[1] || comp[2] || comp[3]) {
+ U = vmaxq_f32(vminq_f32(One, U), Zero);
+ V = vmaxq_f32(vminq_f32(One, V), Zero);
+
+ float32x4_t TexXFull = vmulq_f32(U, LayerWidth);
+ float32x4_t TexYFull = vmulq_f32(V, LayerHeight);
+ int32x4_t TexXInt = vcvtq_s32_f32(TexXFull);
+ int32x4_t TexXIntPlusOne = vaddq_f32(TexXInt, vandq_u32(vcltq_u32(TexXInt, LayerWidthMinusOne), Onei));
+ int32x4_t TexYInt = vcvtq_s32_f32(TexYFull);
+ int32x4_t TexYIntPlusOne = vaddq_f32(TexYInt, vandq_u32(vcltq_u32(TexYInt, LayerHeightMinusOne), Onei));
+
+ float32x4_t TexX = vsubq_f32(TexXFull, vcvtq_f32_u32(TexXInt));
+ float32x4_t TexY = vsubq_f32(TexYFull, vcvtq_f32_u32(TexYInt));
+ float32x4_t TexXInv = vsubq_f32(One, TexX);
+ float32x4_t TexYInv = vsubq_f32(One, TexY);
+ float32x4_t TexBothXInv = vmulq_f32(TexXInv, TexY);
+ float32x4_t TexBothYInv = vmulq_f32(TexX, TexYInv);
+ float32x4_t TexBoth = vmulq_f32(TexY, TexX);
+ float32x4_t TexBothInv = vmulq_f32(TexXInv, TexYInv);
+
+ int32x4_t XLookup = vaddq_u32(vmulq_u32(vshrq_n_u32(TexXInt, 2), Sixteeni),
+ vandq_u32(TexXInt, BottomTwoBits));
+ int32x4_t YLookup = vaddq_u32(vmulq_u32(vshrq_n_u32(TexYInt, 2), FullLayerWidth4i),
+ vmulq_u32(vandq_u32(TexYInt, BottomTwoBits), Fouri));
+ int32x4_t XLookupPlusOne = vaddq_u32(vmulq_u32(vshrq_n_u32(TexXIntPlusOne, 2), Sixteeni),
+ vandq_u32(TexXIntPlusOne, BottomTwoBits));
+ int32x4_t YLookupPlusOne = vaddq_u32(vmulq_u32(vshrq_n_u32(TexYIntPlusOne, 2), FullLayerWidth4i),
+ vmulq_u32(vandq_u32(TexYIntPlusOne, BottomTwoBits), Fouri));
+
+ int32x4_t PixelLookupTL = vaddq_u32(XLookup, YLookup);
+ int32x4_t PixelLookupTR = vaddq_u32(XLookupPlusOne, YLookup);
+ int32x4_t PixelLookupBL = vaddq_u32(XLookup, YLookupPlusOne);
+ int32x4_t PixelLookupBR = vaddq_u32(XLookupPlusOne, YLookupPlusOne);
+
+ // I thought NEON had gather/scatter, but it appears it doesn't...
+ }
+
+ PixelX = vaddq_f32(PixelX, Four);
+ }
+ }
+}
+
+#else
+
+#if 0
+#include "iacaMarks.h"
+#else
+#define IACA_START
+#define IACA_END
+#endif
+
+static void
+AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
+{
+ rectangle LayerBounds = ClipRectangle( T.ClipRect,
+ RenderRegion );
+ // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
+ LayerBounds.Min.x -= LayerBounds.Min.x % 4;
+ LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+
+ uint16 WidthP, HeightP;
+ Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
+ uint8 *TexPTR = (uint8 *)T.SourceBuffer;
+ Assert(LayerBounds.Max.x <= Buffer->Width);
+ Assert(LayerBounds.Max.y <= Buffer->Height);
+
+ __m256 XAxisPX = _mm256_set1_ps(T.XAxisPX);
+ __m256 XAxisPY = _mm256_set1_ps(T.XAxisPY);
+ __m256 YAxisPX = _mm256_set1_ps(T.YAxisPX);
+ __m256 YAxisPY = _mm256_set1_ps(T.YAxisPY);
+
+ __m256 LayerWidth = _mm256_set1_ps(T.LayerWidth);
+ __m256 LayerBoundsMaxX = _mm256_set1_ps(LayerBounds.Max.x);
+ __m256i FullLayerWidth4i = _mm256_set1_epi32(T.FullLayerWidth*4);
+ __m256i LayerWidthMinusOne = _mm256_set1_epi32(T.LayerWidth - 1);
+ __m256i LayerHeightMinusOne = _mm256_set1_epi32(T.LayerHeight - 1);
+ __m256 LayerHeight = _mm256_set1_ps(T.LayerHeight);
+ __m256 LayerOpacity = _mm256_set1_ps(T.LayerOpacity);
+ __m256 OriginX = _mm256_set1_ps(T.OriginX);
+ __m256 OriginY = _mm256_set1_ps(T.OriginY);
+
+ __m256 ClipPrevent = _mm256_set1_ps(0.001f);
+ __m256 One = _mm256_set1_ps(1);
+ __m256 Two = _mm256_set1_ps(2);
+ __m256 Zero = _mm256_set1_ps(0);
+
+ __m256 ZeroPoint25 = _mm256_set1_ps(0.25);
+ __m256 ZeroPointFive = _mm256_set1_ps(0.5);
+ __m256i Onei = _mm256_set1_epi32(1);
+ __m256 Four = _mm256_set1_ps(4);
+ __m256 Eight = _mm256_set1_ps(8);
+ __m256i FF = _mm256_set1_epi32(0xFF);
+ __m256i BottomTwoBits = _mm256_set1_epi32(0x03);
+ __m256i Fouri = _mm256_set1_epi32(4);
+ __m256i Sixteeni = _mm256_set1_epi32(16);
+ __m256 Real255 = _mm256_set1_ps(255.0f);
+ __m256 Norm255 = _mm256_set1_ps(1/255.0f);
+ // __m256i White = _mm256_setr_epi32(0xFFFFFFFF, 0, 0, 0, 0xFFFFFFFF, 0, 0, 0);
+ // __m256i White2 = _mm256_set1_epi32(0xFFFFFFFF);
+
+ // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky.
+ __m256 X0 = _mm256_set1_ps(0.30);
+ __m256 Y0 = _mm256_set1_ps(0.10);
+ __m256 X1 = _mm256_set1_ps(0.80);
+ __m256 Y1 = _mm256_set1_ps(0.35);
+ __m256 X2 = _mm256_set1_ps(0.05);
+ __m256 Y2 = _mm256_set1_ps(0.60);
+ __m256 X3 = _mm256_set1_ps(0.55);
+ __m256 Y3 = _mm256_set1_ps(0.85);
+
+
+#if PACKEDRGB
+#else
+ __m256i LayerPitch = _mm256_set1_epi32(T.LayerPitch);
+ __m256i BytesPerPixel = _mm256_set1_epi32(Buffer->BytesPerPixel);
+#endif
+
+#if PACKEDRGB
+ for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y+=2)
+ {
+ __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x,
+ (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2,
+ (real32)LayerBounds.Min.x+3,
+ (real32)LayerBounds.Min.x,
+ (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2,
+ (real32)LayerBounds.Min.x+3);
+
+ __m256 PixelY = _mm256_setr_ps((real32)Y,
+ (real32)Y,
+ (real32)Y,
+ (real32)Y,
+ (real32)Y+1,
+ (real32)Y+1,
+ (real32)Y+1,
+ (real32)Y+1);
+#else
+ for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
+ {
+ __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x,
+ (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2,
+ (real32)LayerBounds.Min.x+3,
+ (real32)LayerBounds.Min.x+4,
+ (real32)LayerBounds.Min.x+5,
+ (real32)LayerBounds.Min.x+6,
+ (real32)LayerBounds.Min.x+7);
+
+ __m256 PixelY = _mm256_set1_ps((real32)Y);
+#endif
+
+ __m256 StartVectorY = _mm256_sub_ps(PixelY, OriginY);
+
+#if PACKEDRGB
+ for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
+#else
+ for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 8)
+#endif
+ {
+
+ IACA_START;
+
+ __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
+ __m256 StartVectorX0 = _mm256_add_ps(StartVectorX, X0);
+ __m256 StartVectorY0 = _mm256_add_ps(StartVectorY, Y0);
+ __m256 StartVectorX1 = _mm256_add_ps(StartVectorX, X1);
+ __m256 StartVectorY1 = _mm256_add_ps(StartVectorY, Y1);
+ __m256 StartVectorX2 = _mm256_add_ps(StartVectorX, X2);
+ __m256 StartVectorY2 = _mm256_add_ps(StartVectorY, Y2);
+ __m256 StartVectorX3 = _mm256_add_ps(StartVectorX, X3);
+ __m256 StartVectorY3 = _mm256_add_ps(StartVectorY, Y3);
+
+#if PACKEDRGB
+ uint32 XLookup = (X >> 2)*16 + (X % 4);
+ uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
+ uint32 PixelToSeek = XLookup + YLookup;
+ uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel;
+#else
+ uint8 *Pixel = (uint8 *)Buffer->UnpackedBuffer + Y*T.BufferPitch + X*Buffer->BytesPerPixel;
+#endif
+
+ __m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY));
+ __m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY));
+
+ __m256 U0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, XAxisPX), _mm256_mul_ps(StartVectorY0, XAxisPY));
+ __m256 V0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, YAxisPX), _mm256_mul_ps(StartVectorY0, YAxisPY));
+ __m256 U1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, XAxisPX), _mm256_mul_ps(StartVectorY1, XAxisPY));
+ __m256 V1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, YAxisPX), _mm256_mul_ps(StartVectorY1, YAxisPY));
+ __m256 U2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, XAxisPX), _mm256_mul_ps(StartVectorY2, XAxisPY));
+ __m256 V2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, YAxisPX), _mm256_mul_ps(StartVectorY2, YAxisPY));
+ __m256 U3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, XAxisPX), _mm256_mul_ps(StartVectorY3, XAxisPY));
+ __m256 V3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, YAxisPX), _mm256_mul_ps(StartVectorY3, YAxisPY));
+
+ __m256 LayerMask0 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U0, Zero, 13), _mm256_cmp_ps(U0, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V0, Zero, 13), _mm256_cmp_ps(V0, One, 1)));
+ __m256 LayerMask1 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U1, Zero, 13), _mm256_cmp_ps(U1, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V1, Zero, 13), _mm256_cmp_ps(V1, One, 1)));
+ __m256 LayerMask2 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U2, Zero, 13), _mm256_cmp_ps(U2, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V2, Zero, 13), _mm256_cmp_ps(V2, One, 1)));
+ __m256 LayerMask3 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U3, Zero, 13), _mm256_cmp_ps(U3, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V3, Zero, 13), _mm256_cmp_ps(V3, One, 1)));
+
+ // Each point that passes adds .25
+ __m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)),
+ _mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25)));
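+ // Avg is the sub-sample coverage (0, .25, .5, .75, or 1); fully covered
+ // pixels take the normal path and partially covered edge pixels get their
+ // alpha scaled by Avg further down.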
+
+ // Preventing overlap between threads for non-packed. One nice thing
+ // about packed is that the 4-padded bitmap means we can set up the
+ // boundaries so we don't have to check this ever.
+ __m256i TileBarrier = _mm256_cvtps_epi32(_mm256_cmp_ps(PixelX, LayerBoundsMaxX, 13));
+
+ // Zero - no points pass
+ // One - all points pass; not an edge
+ __m256i Mask = _mm256_cvtps_epi32(_mm256_cmp_ps(Avg, Zero, 14));
+ __m256i NonEdge = _mm256_cvtps_epi32(_mm256_cmp_ps(Avg, One, 13));
+ __m256i TotalMask = _mm256_andnot_si256(TileBarrier, _mm256_and_si256(Mask, NonEdge));
+
+ // __m256 LayerMask = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 1)),
+ // _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 1)));
+
+ // If all of the pixels are zeroed in the mask (aka fall outside
+ // the UV lookup), we can skip the iteration.
+ if (_mm256_movemask_epi8(TotalMask))
+ {
+ __m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask);
+
+ U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
+ V = _mm256_max_ps(_mm256_min_ps(One, V), Zero);
+
+ __m256 TexXFull = _mm256_mul_ps(U, LayerWidth);
+ __m256 TexYFull = _mm256_mul_ps(V, LayerHeight);
+ __m256i TexXInt = _mm256_cvttps_epi32(TexXFull);
+ __m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
+ __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
+ __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
+ // NOTE(fox): The comparison keeps the +1 from being added when we're already on the last texel, so the neighbor lookup never reads out of bounds.
+
+ __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
+ __m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt));
+ __m256 TexXInv = _mm256_sub_ps(One, TexX);
+ __m256 TexYInv = _mm256_sub_ps(One, TexY);
+ __m256 TexBothXInv = _mm256_mul_ps(TexXInv, TexY);
+ __m256 TexBothYInv = _mm256_mul_ps(TexX, TexYInv);
+ __m256 TexBoth = _mm256_mul_ps(TexY, TexX);
+ __m256 TexBothInv = _mm256_mul_ps(TexXInv, TexYInv);
+
+#if PACKEDRGB
+ __m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni),
+ _mm256_and_si256(TexXInt, BottomTwoBits));
+ __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), FullLayerWidth4i),
+ _mm256_mullo_epi32(_mm256_and_si256(TexYInt, BottomTwoBits), Fouri));
+ __m256i XLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
+ _mm256_and_si256(TexXIntPlusOne, BottomTwoBits));
+ __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
+ _mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri));
+#else
+ __m256i XLookup = TexXInt;
+ __m256i YLookup = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(TexYInt), LayerWidth));
+ __m256i XLookupPlusOne = TexXIntPlusOne;
+ __m256i YLookupPlusOne = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(TexYIntPlusOne), LayerWidth));
+#endif
+
+ __m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup);
+ __m256i PixelLookupTR = _mm256_add_epi32(XLookupPlusOne, YLookup);
+ __m256i PixelLookupBL = _mm256_add_epi32(XLookup, YLookupPlusOne);
+ __m256i PixelLookupBR = _mm256_add_epi32(XLookupPlusOne, YLookupPlusOne);
+
+ // The big feature of AVX2: gathering.
+ __m256i PixelsTL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTL, 4);
+ __m256i PixelsTR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTR, 4);
+ __m256i PixelsBL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBL, 4);
+ __m256i PixelsBR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBR, 4);
+
+ __m256 R_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsTL, FF)), Norm255);
+ __m256 G_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 8), FF)), Norm255);
+ __m256 B_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 16), FF)), Norm255);
+ __m256 A_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 24), FF)), Norm255);
+
+ __m256 R_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsTR, FF)), Norm255);
+ __m256 G_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 8), FF)), Norm255);
+ __m256 B_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 16), FF)), Norm255);
+ __m256 A_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 24), FF)), Norm255);
+
+ __m256 R_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsBL, FF)), Norm255);
+ __m256 G_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 8), FF)), Norm255);
+ __m256 B_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 16), FF)), Norm255);
+ __m256 A_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 24), FF)), Norm255);
+
+ __m256 R_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsBR, FF)), Norm255);
+ __m256 G_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 8), FF)), Norm255);
+ __m256 B_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 16), FF)), Norm255);
+ __m256 A_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 24), FF)), Norm255);
+
+ __m256 R_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, R_TexTL),
+ _mm256_mul_ps(TexBothYInv, R_TexTR)),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, R_TexBL),
+ _mm256_mul_ps(TexBoth, R_TexBR)));
+ __m256 G_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, G_TexTL),
+ _mm256_mul_ps(TexBothYInv, G_TexTR)),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, G_TexBL),
+ _mm256_mul_ps(TexBoth, G_TexBR)));
+ __m256 B_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, B_TexTL),
+ _mm256_mul_ps(TexBothYInv, B_TexTR)),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, B_TexBL),
+ _mm256_mul_ps(TexBoth, B_TexBR)));
+ __m256 A_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, A_TexTL),
+ _mm256_mul_ps(TexBothYInv, A_TexTR)),
+ _mm256_add_ps(_mm256_mul_ps(TexBothXInv, A_TexBL),
+ _mm256_mul_ps(TexBoth, A_TexBR)));
+
+ // Apply anti-aliasing to edges if there are any
+ if (_mm256_movemask_epi8(EdgeMask))
+ {
+ A_Col = _mm256_blendv_ps(A_Col, _mm256_mul_ps(A_Col, Avg), _mm256_cvtepi32_ps(EdgeMask));
+ }
+
+ IACA_END;
+ __m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity);
+ __m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha);
+
+ // Hoisted out of some blend modes; maybe it'd be better to just keep them in there.
+ __m256 R_Colx2 = _mm256_mul_ps(R_Col, Two);
+ __m256 R_ColInv = _mm256_sub_ps(One, R_Col);
+
+ __m256 G_Colx2 = _mm256_mul_ps(G_Col, Two);
+ __m256 G_ColInv = _mm256_sub_ps(One, G_Col);
+
+ __m256 B_Colx2 = _mm256_mul_ps(B_Col, Two);
+ __m256 B_ColInv = _mm256_sub_ps(One, B_Col);
+
+ __m256 R_Blend = R_Col;
+ __m256 G_Blend = G_Col;
+ __m256 B_Blend = B_Col;
+ __m256 A_Blend = LayerAlpha;
+
+ // Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it).
+ if (T.BlendMode != blend_normal || _mm256_movemask_epi8(_mm256_cvtps_epi32(_mm256_cmp_ps(LayerAlpha, One, 2))))
+ {
+ __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
+ __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255);
+ __m256 G_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 8), FF)), Norm255);
+ __m256 B_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF)), Norm255);
+ __m256 A_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF)), Norm255);
+
+ switch (T.BlendMode)
+ {
+ case blend_normal:
+ {
+ } break;
+ case blend_multiply:
+ {
+ R_Blend = _mm256_mul_ps(R_Dest, R_Col);
+ G_Blend = _mm256_mul_ps(G_Dest, G_Col);
+ B_Blend = _mm256_mul_ps(B_Dest, B_Col);
+ } break;
+ case blend_colorburn:
+ {
+ // NOTE(fox): A small amount is added to Col since images with zero for alpha may also zero out the
+ // color channels, causing black clipping.
+ R_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, R_Dest), _mm256_add_ps(R_Col, ClipPrevent)));
+ G_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, G_Dest), _mm256_add_ps(G_Col, ClipPrevent)));
+ B_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, B_Dest), _mm256_add_ps(B_Col, ClipPrevent)));
+ } break;
+ case blend_linearburn:
+ {
+ R_Blend = _mm256_sub_ps(_mm256_add_ps(R_Dest, R_Col), One);
+ G_Blend = _mm256_sub_ps(_mm256_add_ps(G_Dest, G_Col), One);
+ B_Blend = _mm256_sub_ps(_mm256_add_ps(B_Dest, B_Col), One);
+ } break;
+ case blend_add:
+ {
+ R_Blend = _mm256_add_ps(R_Dest, R_Col);
+ G_Blend = _mm256_add_ps(G_Dest, G_Col);
+ B_Blend = _mm256_add_ps(B_Dest, B_Col);
+ } break;
+ case blend_screen:
+ {
+ R_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv));
+ G_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv));
+ B_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv));
+ } break;
+ case blend_overlay:
+ {
+ __m256 R_Mask = _mm256_cmp_ps(R_Dest, ZeroPointFive, 1);
+ __m256 G_Mask = _mm256_cmp_ps(G_Dest, ZeroPointFive, 1);
+ __m256 B_Mask = _mm256_cmp_ps(B_Dest, ZeroPointFive, 1);
+ __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
+ __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
+ __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
+ __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv)));
+ __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv)));
+ __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv)));
+ R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ case blend_softlight:
+ {
+ // using Pegtop's equation
+ R_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, R_Colx2), _mm256_mul_ps(R_Dest, R_Dest)), _mm256_mul_ps(R_Colx2, R_Dest));
+ G_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, G_Colx2), _mm256_mul_ps(G_Dest, G_Dest)), _mm256_mul_ps(G_Colx2, G_Dest));
+ B_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, B_Colx2), _mm256_mul_ps(B_Dest, B_Dest)), _mm256_mul_ps(B_Colx2, B_Dest));
+ } break;
+ case blend_hardlight:
+ {
+ // Hard light is overlay with the roles swapped: the layer color, not the dest, picks which half of the curve is used.
+ __m256 R_Mask = _mm256_cmp_ps(R_Col, ZeroPointFive, 1);
+ __m256 G_Mask = _mm256_cmp_ps(G_Col, ZeroPointFive, 1);
+ __m256 B_Mask = _mm256_cmp_ps(B_Col, ZeroPointFive, 1);
+ __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
+ __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
+ __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
+ __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv)));
+ __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv)));
+ __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv)));
+ R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ case blend_subtract:
+ {
+ R_Blend = _mm256_sub_ps(R_Dest, R_Col);
+ G_Blend = _mm256_sub_ps(G_Dest, G_Col);
+ B_Blend = _mm256_sub_ps(B_Dest, B_Col);
+ } break;
+ case blend_divide:
+ {
+ R_Blend = _mm256_div_ps(R_Dest, _mm256_add_ps(R_Col, ClipPrevent));
+ G_Blend = _mm256_div_ps(G_Dest, _mm256_add_ps(G_Col, ClipPrevent));
+ B_Blend = _mm256_div_ps(B_Dest, _mm256_add_ps(B_Col, ClipPrevent));
+ } break;
+ case blend_difference:
+ {
+ __m256 R_Lower = _mm256_sub_ps(R_Col, R_Dest);
+ __m256 G_Lower = _mm256_sub_ps(G_Col, G_Dest);
+ __m256 B_Lower = _mm256_sub_ps(B_Col, B_Dest);
+ __m256 R_Upper = _mm256_sub_ps(R_Dest, R_Col);
+ __m256 G_Upper = _mm256_sub_ps(G_Dest, G_Col);
+ __m256 B_Upper = _mm256_sub_ps(B_Dest, B_Col);
+ __m256 R_Mask = _mm256_cmp_ps(R_Lower, Zero, 14);
+ __m256 G_Mask = _mm256_cmp_ps(G_Lower, Zero, 14);
+ __m256 B_Mask = _mm256_cmp_ps(B_Lower, Zero, 14);
+ R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ }
+
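+ // Source-over composite: interpolate between the destination and the blended color by the layer's alpha.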
+ R_Blend = _mm256_add_ps(_mm256_mul_ps(R_Dest, LayerAlphaInv), _mm256_mul_ps(R_Blend, LayerAlpha));
+ G_Blend = _mm256_add_ps(_mm256_mul_ps(G_Dest, LayerAlphaInv), _mm256_mul_ps(G_Blend, LayerAlpha));
+ B_Blend = _mm256_add_ps(_mm256_mul_ps(B_Dest, LayerAlphaInv), _mm256_mul_ps(B_Blend, LayerAlpha));
+
+ // Standard behavior in photo apps is for blend modes to
+ // inherit underlying opacity instead of adding to it.
+ if (T.BlendMode == blend_normal)
+ A_Blend = _mm256_add_ps(A_Dest, LayerAlpha);
+ else
+ A_Blend = A_Dest;
+ }
+
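+ // Clamp to [0,1], convert back to 8-bit, and repack into R | G<<8 | B<<16 | A<<24 before the masked store.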
+ __m256i R_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, R_Blend), Zero), Real255));
+ __m256i G_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, G_Blend), Zero), Real255));
+ __m256i B_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, B_Blend), Zero), Real255));
+ __m256i A_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, A_Blend), Zero), Real255));
+
+ __m256i OutputPixel = _mm256_or_si256(
+ _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
+ _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
+
+ _mm256_maskstore_epi32((int *)Pixel, TotalMask, OutputPixel);
+ }
+#if PACKEDRGB
+ PixelX = _mm256_add_ps(PixelX, Four);
+#else
+ PixelX = _mm256_add_ps(PixelX, Eight);
+#endif
+ }
+ }
+}
+
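+ // 4-wide fallback that mirrors the AVX render loop above. Despite the SSE2 name
+ // it also uses SSE4.1 intrinsics (_mm_blendv_ps, _mm_mullo_epi32).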
+static void
+SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
+{
+ rectangle LayerBounds = ClipRectangle(T.ClipRect, RenderRegion);
+ // Remember: since bitmaps are packed in 4x4 blocks, the start of the region always needs to be aligned down to a block boundary.
+ LayerBounds.Min.x -= LayerBounds.Min.x % 4;
+ LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+
+ uint16 WidthP, HeightP;
+ Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
+ uint8 *TexPTR = (uint8 *)T.SourceBuffer;
+ Assert(LayerBounds.Max.x <= Buffer->Width);
+ Assert(LayerBounds.Max.y <= Buffer->Height);
+
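+ // Broadcast the layer's transform axes, dimensions, and opacity into 4-wide registers.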
+ __m128 XAxisPX = _mm_set1_ps(T.XAxisPX);
+ __m128 XAxisPY = _mm_set1_ps(T.XAxisPY);
+ __m128 YAxisPX = _mm_set1_ps(T.YAxisPX);
+ __m128 YAxisPY = _mm_set1_ps(T.YAxisPY);
+
+ __m128 LayerWidth = _mm_set1_ps(T.LayerWidth);
+ __m128i LayerWidthMinusOne = _mm_set1_epi32(T.LayerWidth - 1);
+ __m128i FullLayerWidth4i = _mm_set1_epi32(T.FullLayerWidth*4);
+ __m128 LayerHeight = _mm_set1_ps(T.LayerHeight);
+ __m128i LayerHeightMinusOne = _mm_set1_epi32(T.LayerHeight - 1);
+ __m128 LayerOpacity = _mm_set1_ps(T.LayerOpacity);
+ __m128 OriginX = _mm_set1_ps(T.OriginX);
+ __m128 OriginY = _mm_set1_ps(T.OriginY);
+
+ __m128 ClipPrevent = _mm_set1_ps(0.001f);
+ __m128 One = _mm_set1_ps(1);
+ __m128 Two = _mm_set1_ps(2);
+ __m128 Zero = _mm_set1_ps(0);
+ __m128 ZeroPointFive = _mm_set1_ps(0.5);
+ __m128i Onei = _mm_set1_epi32(1);
+ __m128 Four = _mm_set1_ps(4);
+ __m128i FF = _mm_set1_epi32(0xFF);
+ __m128i BottomTwoBits = _mm_set1_epi32(0x03);
+ __m128i Fouri = _mm_set1_epi32(4);
+ __m128i Sixteeni = _mm_set1_epi32(16);
+ __m128 Reg255 = _mm_set1_ps(255.0f);
+ __m128 Norm255 = _mm_set1_ps(1/255.0f);
+
+ // NOTE(fox): Each iteration of the inner loop operates on 4 pixels, 4 horizontal by 1 vertical.
+
+ for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
+ {
+ __m128 PixelX = _mm_setr_ps((real32)LayerBounds.Min.x,
+ (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2,
+ (real32)LayerBounds.Min.x+3);
+
+ __m128 PixelY = _mm_set1_ps((real32)Y);
+ __m128 StartVectorY = _mm_sub_ps(PixelY, OriginY);
+
+ for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
+ {
+
+ __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);
+
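+ // Address of the destination pixel in the 4x4-packed comp buffer: each 4x4 tile
+ // is stored as 16 contiguous pixels, and tiles are laid out row-major.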
+ uint32 XLookup = (X >> 2)*16 + (X % 4);
+ uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
+ uint32 PixelToSeek = XLookup + YLookup;
+ uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel;
+
+ __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY));
+ __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY));
+
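+ // Mask of lanes whose UVs land inside the layer; skip the whole group when none do.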
+ __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmplt_ps(U, One)),
+ _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmplt_ps(V, One))));
+
+ if (_mm_movemask_epi8(LayerMask))
+ {
+ U = _mm_max_ps(_mm_min_ps(One, U), Zero);
+ V = _mm_max_ps(_mm_min_ps(One, V), Zero);
+
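+ // Convert UV to texel coordinates and split into integer and fractional parts for
+ // bilinear sampling; the +1 neighbors are clamped at the last texel.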
+ __m128 TexXFull = _mm_mul_ps(U, LayerWidth);
+ __m128 TexYFull = _mm_mul_ps(V, LayerHeight);
+ __m128i TexXInt = _mm_cvttps_epi32(TexXFull);
+ __m128i TexXIntPlusOne = _mm_add_epi32(TexXInt, _mm_and_si128(_mm_cmplt_epi32(TexXInt, LayerWidthMinusOne), Onei));
+ __m128i TexYInt = _mm_cvttps_epi32(TexYFull);
+ __m128i TexYIntPlusOne = _mm_add_epi32(TexYInt, _mm_and_si128(_mm_cmplt_epi32(TexYInt, LayerHeightMinusOne), Onei));
+
+ __m128 TexX = _mm_sub_ps(TexXFull, _mm_cvtepi32_ps(TexXInt));
+ __m128 TexY = _mm_sub_ps(TexYFull, _mm_cvtepi32_ps(TexYInt));
+ __m128 TexXInv = _mm_sub_ps(One, TexX);
+ __m128 TexYInv = _mm_sub_ps(One, TexY);
+ __m128 TexBothXInv = _mm_mul_ps(TexXInv, TexY);
+ __m128 TexBothYInv = _mm_mul_ps(TexX, TexYInv);
+ __m128 TexBoth = _mm_mul_ps(TexY, TexX);
+ __m128 TexBothInv = _mm_mul_ps(TexXInv, TexYInv);
+
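+ // Convert the four corner texel coordinates into indices into the 4x4-packed source bitmap.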
+ __m128i XLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXInt, 2), Sixteeni),
+ _mm_and_si128(TexXInt, BottomTwoBits));
+ __m128i YLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYInt, 2), FullLayerWidth4i),
+ _mm_mullo_epi32(_mm_and_si128(TexYInt, BottomTwoBits), Fouri));
+ __m128i XLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
+ _mm_and_si128(TexXIntPlusOne, BottomTwoBits));
+ __m128i YLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
+ _mm_mullo_epi32(_mm_and_si128(TexYIntPlusOne, BottomTwoBits), Fouri));
+
+ __m128i PixelLookupTL = _mm_add_epi32(XLookup, YLookup);
+ __m128i PixelLookupTR = _mm_add_epi32(XLookupPlusOne, YLookup);
+ __m128i PixelLookupBL = _mm_add_epi32(XLookup, YLookupPlusOne);
+ __m128i PixelLookupBR = _mm_add_epi32(XLookupPlusOne, YLookupPlusOne);
+
+ // SSE lacks gathering, so we have no choice but to manually
+ // look up each pixel's four bilinear samples in scalar.
+
+ uint32 S_PixelLookupTL0 = _mm_cvtsi128_si32(PixelLookupTL);
+ uint32 S_PixelLookupTR0 = _mm_cvtsi128_si32(PixelLookupTR);
+ uint32 S_PixelLookupBL0 = _mm_cvtsi128_si32(PixelLookupBL);
+ uint32 S_PixelLookupBR0 = _mm_cvtsi128_si32(PixelLookupBR);
+ uint32 S_PixelsTL0 = *(uint32 *)(TexPTR + S_PixelLookupTL0*4);
+ uint32 S_PixelsTR0 = *(uint32 *)(TexPTR + S_PixelLookupTR0*4);
+ uint32 S_PixelsBL0 = *(uint32 *)(TexPTR + S_PixelLookupBL0*4);
+ uint32 S_PixelsBR0 = *(uint32 *)(TexPTR + S_PixelLookupBR0*4);
+
+ uint32 S_PixelLookupTL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 4));
+ uint32 S_PixelLookupTR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 4));
+ uint32 S_PixelLookupBL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 4));
+ uint32 S_PixelLookupBR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 4));
+ uint32 S_PixelsTL1 = *(uint32 *)(TexPTR + S_PixelLookupTL1*4);
+ uint32 S_PixelsTR1 = *(uint32 *)(TexPTR + S_PixelLookupTR1*4);
+ uint32 S_PixelsBL1 = *(uint32 *)(TexPTR + S_PixelLookupBL1*4);
+ uint32 S_PixelsBR1 = *(uint32 *)(TexPTR + S_PixelLookupBR1*4);
+
+ uint32 S_PixelLookupTL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 8));
+ uint32 S_PixelLookupTR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 8));
+ uint32 S_PixelLookupBL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 8));
+ uint32 S_PixelLookupBR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 8));
+ uint32 S_PixelsTL2 = *(uint32 *)(TexPTR + S_PixelLookupTL2*4);
+ uint32 S_PixelsTR2 = *(uint32 *)(TexPTR + S_PixelLookupTR2*4);
+ uint32 S_PixelsBL2 = *(uint32 *)(TexPTR + S_PixelLookupBL2*4);
+ uint32 S_PixelsBR2 = *(uint32 *)(TexPTR + S_PixelLookupBR2*4);
+
+ uint32 S_PixelLookupTL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 12));
+ uint32 S_PixelLookupTR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 12));
+ uint32 S_PixelLookupBL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 12));
+ uint32 S_PixelLookupBR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 12));
+ uint32 S_PixelsTL3 = *(uint32 *)(TexPTR + S_PixelLookupTL3*4);
+ uint32 S_PixelsTR3 = *(uint32 *)(TexPTR + S_PixelLookupTR3*4);
+ uint32 S_PixelsBL3 = *(uint32 *)(TexPTR + S_PixelLookupBL3*4);
+ uint32 S_PixelsBR3 = *(uint32 *)(TexPTR + S_PixelLookupBR3*4);
+
+ __m128i PixelsTL = _mm_setr_epi32(S_PixelsTL0, S_PixelsTL1, S_PixelsTL2, S_PixelsTL3);
+ __m128i PixelsTR = _mm_setr_epi32(S_PixelsTR0, S_PixelsTR1, S_PixelsTR2, S_PixelsTR3);
+ __m128i PixelsBL = _mm_setr_epi32(S_PixelsBL0, S_PixelsBL1, S_PixelsBL2, S_PixelsBL3);
+ __m128i PixelsBR = _mm_setr_epi32(S_PixelsBR0, S_PixelsBR1, S_PixelsBR2, S_PixelsBR3);
+
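+ // Unpack each corner's RGBA bytes and normalize to [0,1].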
+ __m128 R_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsTL, FF)), Norm255);
+ __m128 G_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 8), FF)), Norm255);
+ __m128 B_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF)), Norm255);
+ __m128 A_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF)), Norm255);
+
+ __m128 R_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsTR, FF)), Norm255);
+ __m128 G_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 8), FF)), Norm255);
+ __m128 B_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF)), Norm255);
+ __m128 A_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF)), Norm255);
+
+ __m128 R_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsBL, FF)), Norm255);
+ __m128 G_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 8), FF)), Norm255);
+ __m128 B_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF)), Norm255);
+ __m128 A_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF)), Norm255);
+
+ __m128 R_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( PixelsBR, FF)), Norm255);
+ __m128 G_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 8), FF)), Norm255);
+ __m128 B_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF)), Norm255);
+ __m128 A_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF)), Norm255);
+
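+ // Bilinearly blend the four corner samples with the fractional weights, as in the AVX path.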
+ __m128 R_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, R_TexTL),
+ _mm_mul_ps(TexBothYInv, R_TexTR)),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, R_TexBL),
+ _mm_mul_ps(TexBoth, R_TexBR)));
+ __m128 G_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, G_TexTL),
+ _mm_mul_ps(TexBothYInv, G_TexTR)),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, G_TexBL),
+ _mm_mul_ps(TexBoth, G_TexBR)));
+ __m128 B_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, B_TexTL),
+ _mm_mul_ps(TexBothYInv, B_TexTR)),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, B_TexBL),
+ _mm_mul_ps(TexBoth, B_TexBR)));
+ __m128 A_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, A_TexTL),
+ _mm_mul_ps(TexBothYInv, A_TexTR)),
+ _mm_add_ps(_mm_mul_ps(TexBothXInv, A_TexBL),
+ _mm_mul_ps(TexBoth, A_TexBR)));
+
+
+ __m128i R_Out, G_Out, B_Out, A_Out;
+
+ __m128 LayerAlpha = _mm_mul_ps(A_Col, LayerOpacity);
+ __m128 LayerAlphaInv = _mm_sub_ps(One, LayerAlpha);
+
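+ // Terms shared by several blend modes, hoisted out of the switch as in the AVX path.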
+ __m128 R_Colx2 = _mm_mul_ps(R_Col, Two);
+ __m128 R_ColInv = _mm_sub_ps(One, R_Col);
+
+ __m128 G_Colx2 = _mm_mul_ps(G_Col, Two);
+ __m128 G_ColInv = _mm_sub_ps(One, G_Col);
+
+ __m128 B_Colx2 = _mm_mul_ps(B_Col, Two);
+ __m128 B_ColInv = _mm_sub_ps(One, B_Col);
+
+ __m128 R_Blend = R_Col;
+ __m128 G_Blend = G_Col;
+ __m128 B_Blend = B_Col;
+ __m128 A_Blend = LayerAlpha;
+
+ // Only load the dest pixel when it's actually needed: some lane's alpha is below 1.0, or the blend mode requires the destination.
+ if (T.BlendMode != blend_normal || _mm_movemask_ps(_mm_cmplt_ps(LayerAlpha, One)))
+ {
+ __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel);
+ __m128 R_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128( DestPixel, FF)), Norm255);
+ __m128 G_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 8), FF)), Norm255);
+ __m128 B_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF)), Norm255);
+ __m128 A_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF)), Norm255);
+
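+ // Same blend-mode formulas as the AVX path, just 4-wide.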
+ switch (T.BlendMode)
+ {
+ case blend_normal:
+ {
+ } break;
+ case blend_multiply:
+ {
+ R_Blend = _mm_mul_ps(R_Dest, R_Col);
+ G_Blend = _mm_mul_ps(G_Dest, G_Col);
+ B_Blend = _mm_mul_ps(B_Dest, B_Col);
+ } break;
+ case blend_colorburn:
+ {
+ // NOTE(fox): A small amount is added to Col since images with zero for alpha may also zero out the
+ // color channels, causing black clipping.
+ R_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, R_Dest), _mm_add_ps(R_Col, ClipPrevent)));
+ G_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, G_Dest), _mm_add_ps(G_Col, ClipPrevent)));
+ B_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, B_Dest), _mm_add_ps(B_Col, ClipPrevent)));
+ } break;
+ case blend_linearburn:
+ {
+ R_Blend = _mm_sub_ps(_mm_add_ps(R_Dest, R_Col), One);
+ G_Blend = _mm_sub_ps(_mm_add_ps(G_Dest, G_Col), One);
+ B_Blend = _mm_sub_ps(_mm_add_ps(B_Dest, B_Col), One);
+ } break;
+ case blend_add:
+ {
+ R_Blend = _mm_add_ps(R_Dest, R_Col);
+ G_Blend = _mm_add_ps(G_Dest, G_Col);
+ B_Blend = _mm_add_ps(B_Dest, B_Col);
+ } break;
+ case blend_screen:
+ {
+ R_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv));
+ G_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv));
+ B_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv));
+ } break;
+ case blend_overlay:
+ {
+ __m128 R_Mask = _mm_cmplt_ps(R_Dest, ZeroPointFive);
+ __m128 G_Mask = _mm_cmplt_ps(G_Dest, ZeroPointFive);
+ __m128 B_Mask = _mm_cmplt_ps(B_Dest, ZeroPointFive);
+ __m128 R_Lower = _mm_mul_ps(Two, _mm_mul_ps(R_Dest, R_Col));
+ __m128 G_Lower = _mm_mul_ps(Two, _mm_mul_ps(G_Dest, G_Col));
+ __m128 B_Lower = _mm_mul_ps(Two, _mm_mul_ps(B_Dest, B_Col));
+ __m128 R_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv)));
+ __m128 G_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv)));
+ __m128 B_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv)));
+ R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ case blend_softlight:
+ {
+ // using Pegtop's equation
+ R_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, R_Colx2), _mm_mul_ps(R_Dest, R_Dest)), _mm_mul_ps(R_Colx2, R_Dest));
+ G_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, G_Colx2), _mm_mul_ps(G_Dest, G_Dest)), _mm_mul_ps(G_Colx2, G_Dest));
+ B_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, B_Colx2), _mm_mul_ps(B_Dest, B_Dest)), _mm_mul_ps(B_Colx2, B_Dest));
+ } break;
+ case blend_hardlight:
+ {
+ // Hard light is overlay with the roles swapped: the layer color, not the dest, picks which half of the curve is used.
+ __m128 R_Mask = _mm_cmplt_ps(R_Col, ZeroPointFive);
+ __m128 G_Mask = _mm_cmplt_ps(G_Col, ZeroPointFive);
+ __m128 B_Mask = _mm_cmplt_ps(B_Col, ZeroPointFive);
+ __m128 R_Lower = _mm_mul_ps(Two, _mm_mul_ps(R_Dest, R_Col));
+ __m128 G_Lower = _mm_mul_ps(Two, _mm_mul_ps(G_Dest, G_Col));
+ __m128 B_Lower = _mm_mul_ps(Two, _mm_mul_ps(B_Dest, B_Col));
+ __m128 R_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv)));
+ __m128 G_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv)));
+ __m128 B_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv)));
+ R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ case blend_subtract:
+ {
+ R_Blend = _mm_sub_ps(R_Dest, R_Col);
+ G_Blend = _mm_sub_ps(G_Dest, G_Col);
+ B_Blend = _mm_sub_ps(B_Dest, B_Col);
+ } break;
+ case blend_divide:
+ {
+ R_Blend = _mm_div_ps(R_Dest, _mm_add_ps(R_Col, ClipPrevent));
+ G_Blend = _mm_div_ps(G_Dest, _mm_add_ps(G_Col, ClipPrevent));
+ B_Blend = _mm_div_ps(B_Dest, _mm_add_ps(B_Col, ClipPrevent));
+ } break;
+ case blend_difference:
+ {
+ __m128 R_Lower = _mm_sub_ps(R_Col, R_Dest);
+ __m128 G_Lower = _mm_sub_ps(G_Col, G_Dest);
+ __m128 B_Lower = _mm_sub_ps(B_Col, B_Dest);
+ __m128 R_Upper = _mm_sub_ps(R_Dest, R_Col);
+ __m128 G_Upper = _mm_sub_ps(G_Dest, G_Col);
+ __m128 B_Upper = _mm_sub_ps(B_Dest, B_Col);
+ __m128 R_Mask = _mm_cmpgt_ps(R_Lower, Zero);
+ __m128 G_Mask = _mm_cmpgt_ps(G_Lower, Zero);
+ __m128 B_Mask = _mm_cmpgt_ps(B_Lower, Zero);
+ R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
+ G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
+ B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
+ } break;
+ }
+
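+ // Source-over composite: interpolate between the destination and the blended color by the layer's alpha.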
+ R_Blend = _mm_add_ps(_mm_mul_ps(R_Dest, LayerAlphaInv), _mm_mul_ps(R_Blend, LayerAlpha));
+ G_Blend = _mm_add_ps(_mm_mul_ps(G_Dest, LayerAlphaInv), _mm_mul_ps(G_Blend, LayerAlpha));
+ B_Blend = _mm_add_ps(_mm_mul_ps(B_Dest, LayerAlphaInv), _mm_mul_ps(B_Blend, LayerAlpha));
+
+ // Standard behavior in photo apps is for blend modes to
+ // inherit underlying opacity instead of adding to it.
+ if (T.BlendMode == blend_normal)
+ A_Blend = _mm_add_ps(A_Dest, LayerAlpha);
+ else
+ A_Blend = A_Dest;
+ }
+
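+ // Clamp to [0,1], convert back to 8-bit, repack the channels, and store only the lanes inside the layer.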
+ R_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, R_Blend), Zero), Reg255));
+ G_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, G_Blend), Zero), Reg255));
+ B_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, B_Blend), Zero), Reg255));
+ A_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, A_Blend), Zero), Reg255));
+
+ __m128i OutputPixel = _mm_or_si128(
+ _mm_or_si128(R_Out, _mm_slli_epi32(G_Out, 8)),
+ _mm_or_si128(_mm_slli_epi32(B_Out, 16), _mm_slli_epi32(A_Out, 24)));
+ _mm_maskmoveu_si128(OutputPixel, LayerMask, (char *)Pixel);
+ }
+ PixelX = _mm_add_ps(PixelX, Four);
+ }
+ }
+}
+
+#endif
+#endif