summaryrefslogtreecommitdiff
path: root/prenderer.cpp
diff options
context:
space:
mode:
authorFox Caminiti <fox@foxcam.net>2022-12-16 20:16:43 -0500
committerFox Caminiti <fox@foxcam.net>2022-12-16 20:16:43 -0500
commitbedd6906eabdd513042d6a178d4dc56a3a41d1d3 (patch)
tree2bcbd3e46ae61e583707a2ccc5b3f5cfeacb61a8 /prenderer.cpp
parentcdb9e1f7240cb0716b7d99df5e1fd7c3fc3407a8 (diff)
v3, file/build organization
Diffstat (limited to 'prenderer.cpp')
-rw-r--r--prenderer.cpp1914
1 files changed, 0 insertions, 1914 deletions
diff --git a/prenderer.cpp b/prenderer.cpp
deleted file mode 100644
index 0940d16..0000000
--- a/prenderer.cpp
+++ /dev/null
@@ -1,1914 +0,0 @@
-// Converts a UV coordinate expressed in composition space into the UV
-// coordinate of a layer whose placement is described by T.
-//
-// FileWidth/FileHeight are the composition dimensions used to turn CompUV into
-// a pixel position; SourceWidth/SourceHeight are the layer's own dimensions.
-// The result is not clamped, so values outside [0, 1) are returned as-is.
-static v2
-T_CompUVToLayerUV(layer_transforms T, uint32 FileWidth, uint32 FileHeight, uint32 SourceWidth, uint32 SourceHeight, v2 CompUV)
-{
-    // Composition UV -> composition pixel position.
-    real32 CompX = CompUV.x*FileWidth;
-    real32 CompY = CompUV.y*FileHeight;
-
-    // Rotated and scaled edge vectors of the layer, in composition pixels.
-    real32 Angle = (T.rotation* (PI / 180));
-    v2 EdgeU = (SourceWidth * T.scale)*V2(cos(Angle), sin(Angle));
-    v2 EdgeV = (SourceHeight * -T.scale)*V2(sin(Angle), -cos(Angle));
-
-    // T.x/T.y locate the anchor; step back along both edges to the layer's
-    // (0, 0) corner.
-    v2 AnchorPos = {T.x, T.y};
-    v2 Corner = AnchorPos - (EdgeU * T.ax) - (EdgeV * T.ay);
-
-    // Edges divided by their squared length, so a dot product with them
-    // yields a 0..1 coordinate along each edge.
-    v2 InvEdgeU = (1.0f / LengthSq(EdgeU))*EdgeU;
-    v2 InvEdgeV = (1.0f / LengthSq(EdgeV))*EdgeV;
-
-    real32 DeltaX = CompX - Corner.x;
-    real32 DeltaY = CompY - Corner.y;
-    real32 U = (DeltaX * InvEdgeU.x) + (DeltaY * InvEdgeU.y);
-    real32 V = (DeltaX * InvEdgeV.x) + (DeltaY * InvEdgeV.y);
-    return V2(U, V);
-}
-
-// Pixel-space variant of T_CompUVToLayerUV. Note that the CompUV argument is
-// divided by the composition dimensions before use, i.e. it is consumed as a
-// composition pixel position; the result is scaled by the layer dimensions.
-static v2
-T_CompPosToLayerPos(layer_transforms T, uint32 FileWidth, uint32 FileHeight, uint32 SourceWidth, uint32 SourceHeight, v2 CompUV)
-{
-    v2 NormalizedComp = CompUV/V2(FileWidth, FileHeight);
-    v2 LayerUV = T_CompUVToLayerUV(T, FileWidth, FileHeight, SourceWidth, SourceHeight, NormalizedComp);
-    return LayerUV*V2(SourceWidth, SourceHeight);
-}
-
-// Maps a screen-space point to a position in the layer's own pixel space:
-// screen point -> composition UV (via ImGui_ScreenPointToCompUV) -> layer UV
-// -> layer pixels.
-static v2
-Transform_ScreenSpaceToLocal(layer_transforms T, uint32 FileWidth, uint32 FileHeight, uint32 SourceWidth, uint32 SourceHeight,
-                             ImVec2 CompPos, ImVec2 CompZoom, ImVec2 ViewportMin, ImVec2 Point)
-{
-    v2 UVInComp = ImGui_ScreenPointToCompUV(ViewportMin, CompPos, CompZoom, Point);
-    v2 UVInLayer = T_CompUVToLayerUV(T, FileWidth, FileHeight, SourceWidth, SourceHeight, UVInComp);
-
-    real32 LocalX = UVInLayer.x * SourceWidth;
-    real32 LocalY = UVInLayer.y * SourceHeight;
-    return V2(LocalX, LocalY);
-}
-
-// Writes the pixel dimensions of whatever the layer displays into
-// *Width / *Height: the referenced composition for a precomp layer, otherwise
-// the referenced source. Layer->Block_Source_Index is the lookup index in
-// both cases.
-static void
-Layer_GetDimensions(memory *Memory, block_layer *Layer, int *Width, int *Height)
-{
-    if (Layer->IsPrecomp) {
-        block_composition *Precomp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, Layer->Block_Source_Index);
-        *Width = Precomp->Width;
-        *Height = Precomp->Height;
-        return;
-    }
-
-    block_source *Footage = (block_source *)Memory_Block_AddressAtIndex(Memory, F_Sources, Layer->Block_Source_Index);
-    *Width = Footage->Width;
-    *Height = Footage->Height;
-}
-
-// Applies the in-progress interactive transform held in Interact to a set of
-// layer transform values, in place.
-//
-// *OutputX / *OutputY are read as the current position and overwritten with
-// that position scaled and rotated about the centre of the Interact.Min/Max
-// box, then offset by Interact.Position. *OutputRotation has Interact.Radians
-// (converted to degrees) added to it; *OutputScale has (Interact.Scale - 1)
-// added to it.
-static void
-Transform_ApplyInteractive(interact_transform Interact, real32 *OutputX, real32 *OutputY, real32 *OutputRotation, real32 *OutputScale)
-{
-    v2 BoxSize = Interact.Max - Interact.Min;
-    v2 BoxCenter = Interact.Max - (BoxSize/2);
-
-    // Offset of the incoming position from the box centre.
-    real32 FromCenterX = BoxCenter.x - *OutputX;
-    real32 FromCenterY = BoxCenter.y - *OutputY;
-
-    real32 Angle = Interact.Radians;
-    real32 AngleInDegrees = Interact.Radians / (PI / 180);
-
-    v2 RotatedX = (FromCenterX * Interact.Scale)*V2(cos(Angle), sin(Angle));
-    v2 RotatedY = (FromCenterY * -Interact.Scale)*V2(sin(Angle), -cos(Angle));
-
-    real32 PivotedX = -RotatedX.x - RotatedY.x + BoxCenter.x;
-    real32 PivotedY = -RotatedX.y - RotatedY.y + BoxCenter.y;
-
-    *OutputX = PivotedX + Interact.Position.x;
-    *OutputY = PivotedY + Interact.Position.y;
-    *OutputRotation += AngleInDegrees;
-    *OutputScale += Interact.Scale - 1.0f;
-}
-
-// Grows the axis-aligned box (*MinX, *MinY)-(*MaxX, *MaxY) so that it contains
-// the four transformed corners of the given layer.
-//
-// Width/Height are the dimensions of the layer's content. The box is only
-// ever enlarged, so the caller is expected to seed it (large Min, small Max)
-// before the first call.
-static void
-Transform_IterateOuterBounds(block_layer *Layer, uint32 Width, uint32 Height, real32 *MinX, real32 *MinY, real32 *MaxX, real32 *MaxY)
-{
-    real32 Rad = (Layer->rotation.CurrentValue * (PI / 180));
-    real32 s = Layer->scale.CurrentValue;
-
-    // Rotated and scaled edge vectors of the layer.
-    v2 XAxis = (Width * s)*V2(cos(Rad), sin(Rad));
-    v2 YAxis = (Height * -s)*V2(sin(Rad), -cos(Rad));
-
-    real32 AnchorX = Layer->ax.CurrentValue;
-    real32 AnchorY = Layer->ay.CurrentValue;
-
-    // The layer position refers to its anchor; step back to the (0, 0) corner.
-    v2 Pos = {Layer->x.CurrentValue, Layer->y.CurrentValue};
-    v2 Origin = Pos - (XAxis * AnchorX) - (YAxis * AnchorY);
-
-    // The inverse squared axis lengths that used to be computed here were
-    // never read; they are only needed for UV lookups, not for bounds.
-
-    v2 Points[4] = {Origin, Origin + XAxis, Origin + YAxis, Origin + XAxis + YAxis};
-    for (int i = 0; i < 4; i++) {
-        if (Points[i].x < *MinX) { *MinX = Points[i].x; }
-        if (Points[i].y < *MinY) { *MinY = Points[i].y; }
-        if (Points[i].x > *MaxX) { *MaxX = Points[i].x; }
-        if (Points[i].y > *MaxY) { *MaxY = Points[i].y; }
-    }
-}
-// Walks the layers of composition CompIndex (descending into precomp layers)
-// and grows the box (*MinX, *MinY)-(*MaxX, *MaxY) so that it contains the
-// corners of every selected layer, expressed in the space of the composition
-// the walk started from.
-//
-// ParentLayer is a caller-provided stack of the precomp layers entered so far;
-// Recursions is the number of valid entries in it (0 for the top-level call).
-// Nesting deeper than the stack's four slots is not descended into.
-// State and MainComp are accepted but not used here.
-static void
-Transform_Recurse(project_state *State, memory *Memory, block_composition *MainComp, uint32 CompIndex, block_layer *ParentLayer[4], uint32 Recursions,
-                  sorted_comp_info *SortedCompArray, sorted_layer *SortedLayerArray,
-                  real32 *MinX, real32 *MinY, real32 *MaxX, real32 *MaxY)
-{
-    // Capacity of ParentLayer, as fixed by the parameter declaration.
-    const uint32 MaxParentDepth = 4;
-
-    sorted_comp_info *SortedCompInfo = &SortedCompArray[CompIndex];
-    sorted_layer *SortedLayerInfo = Layer_GetSortedArray(SortedLayerArray, SortedCompArray, CompIndex);
-    for (int LayerIdx = 0; LayerIdx < SortedCompInfo->LayerCount; LayerIdx++)
-    {
-        sorted_layer SortEntry = SortedLayerInfo[LayerIdx];
-        uint32 Index_Physical = SortEntry.Block_Layer_Index;
-        block_layer *Layer = (block_layer *)Memory_Block_AddressAtIndex(Memory, F_Layers, Index_Physical);
-
-        if (Layer->IsPrecomp) {
-            if (Recursions < MaxParentDepth) {
-                ParentLayer[Recursions] = Layer;
-                Transform_Recurse(State, Memory, MainComp, Layer->Block_Source_Index, ParentLayer, Recursions + 1, SortedCompArray, SortedLayerArray,
-                                  MinX, MinY, MaxX, MaxY);
-            } else {
-                // Writing ParentLayer[Recursions] here would run past the end
-                // of the stack; flag it and skip the nested composition.
-                Assert(0);
-            }
-        }
-
-        if (!Layer->IsSelected)
-            continue;
-
-        uint32 Width = 0, Height = 0;
-        if (!Layer->IsPrecomp) {
-            block_source *Source = (block_source *)Memory_Block_AddressAtIndex(Memory, F_Sources, Layer->Block_Source_Index);
-            Width = Source->Width;
-            Height = Source->Height;
-        } else {
-            block_composition *Comp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, Layer->Block_Source_Index);
-            Width = Comp->Width;
-            Height = Comp->Height;
-        }
-
-        // The four corners of the layer in its own pixel space.
-        v2 Corner[4] = { V2(0, 0), V2(Width, 0), V2(0, Height), V2(Width, Height) };
-
-        // Layer space -> space of the composition that contains the layer.
-        layer_transforms T = Layer_GetTransforms(Layer);
-        for (int c = 0; c < 4; c++) {
-            Corner[c] = TransformPoint(T, Width, Height, Corner[c]);
-        }
-
-        // Walk outwards through the enclosing precomp layers. The most
-        // recently entered parent (highest index) owns the composition the
-        // corners are currently expressed in, so it has to be applied first;
-        // ParentLayer[0] is applied last.
-        for (uint32 Depth = Recursions; Depth > 0; Depth--) {
-            block_layer *Parent = ParentLayer[Depth - 1];
-            layer_transforms ParentT = Layer_GetTransforms(Parent);
-            block_composition *ParentComp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, Parent->Block_Source_Index);
-            uint32 ParentWidth = ParentComp->Width;
-            uint32 ParentHeight = ParentComp->Height;
-            for (int c = 0; c < 4; c++) {
-                Corner[c] = TransformPoint(ParentT, ParentWidth, ParentHeight, Corner[c]);
-            }
-        }
-
-        for (int c = 0; c < 4; c++) {
-            if (Corner[c].x < *MinX) { *MinX = Corner[c].x; }
-            if (Corner[c].y < *MinY) { *MinY = Corner[c].y; }
-            if (Corner[c].x > *MaxX) { *MaxX = Corner[c].x; }
-            if (Corner[c].y > *MaxY) { *MaxY = Corner[c].y; }
-        }
-    }
-}
-
-// IMPORTANT(fox): The selection state and ordering of layers cannot change
-// until this action is exited/committed!
-//
-// Starts a viewport transform interaction. The bounds of all selected layers
-// are gathered by Transform_Recurse starting at the principal composition; if
-// anything was selected, the interaction state is activated and reset to an
-// identity transform (zero position, zero rotation, scale 1) around those
-// bounds, remembering OGPos.
-static void
-Interact_Transform_Begin(project_data *File, memory *Memory, project_state *State, ImVec2 OGPos,
-                         sorted_comp_info *SortedCompArray, sorted_layer *SortedLayerArray)
-{
-    // Seed value for the bounds; MinX still holding it afterwards means no
-    // selected layer contributed a corner.
-    const real32 Unset = 100000;
-
-    real32 BoundsMinX = Unset;
-    real32 BoundsMinY = Unset;
-    real32 BoundsMaxX = -Unset;
-    real32 BoundsMaxY = -Unset;
-
-    block_composition *MainComp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, File->PrincipalCompIndex);
-    block_layer *ParentStack[4];
-    Transform_Recurse(State, Memory, MainComp, File->PrincipalCompIndex, ParentStack, 0,
-                      SortedCompArray, SortedLayerArray,
-                      &BoundsMinX, &BoundsMinY, &BoundsMaxX, &BoundsMaxY);
-
-    if (BoundsMinX == Unset)
-        return;
-
-    State->Interact_Active = interact_type_viewport_transform;
-
-    // The interaction payload lives in the state's Interact_Offset storage.
-    interact_transform *Interact = (interact_transform *)&State->Interact_Offset[0];
-    Interact->Min = V2(BoundsMinX, BoundsMinY);
-    Interact->Max = V2(BoundsMaxX, BoundsMaxY);
-    Interact->Position = V2(0);
-    Interact->Radians = 0;
-    Interact->Scale = 1.0f;
-    Interact->OGPos = OGPos;
-}
-
-// Transforms Point, given in a layer's own pixel space, into the space the
-// layer is placed in: the point is taken relative to the anchor
-// (T.ax*Width, T.ay*Height), scaled by T.scale, rotated by T.rotation
-// (degrees) and translated to (T.x, T.y).
-static v2
-TransformPoint(layer_transforms T, real32 Width, real32 Height, v2 Point)
-{
-    real32 Angle = (T.rotation * (PI / 180));
-
-    // Contribution of the point's X and Y offsets from the anchor.
-    v2 AlongX = (Point.x - T.ax*Width) * T.scale * V2(cos(Angle), sin(Angle));
-    v2 AlongY = (Point.y - T.ay*Height) * -T.scale * V2(sin(Angle), -cos(Angle));
-
-    v2 Offset = AlongX + AlongY;
-    return V2(T.x + Offset.x, T.y + Offset.y);
-}
-
-
-// Converts Point, given in the layer's own pixel space, to a screen position
-// using the UI's composition position and zoom. When a viewport transform
-// interaction is active and the layer is selected, the in-progress
-// interactive transform is applied to the layer's transform first.
-static ImVec2
-Layer_LocalToScreenSpace(project_state *State, memory *Memory, block_layer *Layer, ui *UI, uint32 PrincipalCompIndex, v2 Point)
-{
-    block_composition *MainComp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, PrincipalCompIndex);
-
-    // Dimensions of whatever the layer displays.
-    uint32 Width = 0, Height = 0;
-    if (Layer->IsPrecomp) {
-        block_composition *Comp = (block_composition *)Memory_Block_AddressAtIndex(Memory, F_Precomps, Layer->Block_Source_Index);
-        Width = Comp->Width;
-        Height = Comp->Height;
-    } else {
-        block_source *Source = (block_source *)Memory_Block_AddressAtIndex(Memory, F_Sources, Layer->Block_Source_Index);
-        Width = Source->Width;
-        Height = Source->Height;
-    }
-
-    layer_transforms LayerT = Layer_GetTransforms(Layer);
-
-    bool32 Interacting = (State->Interact_Active == interact_type_viewport_transform && Layer->IsSelected == 1);
-    if (Interacting) {
-        Transform_ApplyInteractive(*(interact_transform *)&State->Interact_Offset[0], &LayerT.x, &LayerT.y, &LayerT.rotation, &LayerT.scale);
-    }
-
-    v2 CompPoint = TransformPoint(LayerT, Width, Height, Point);
-
-    // NOTE(review): for a layer outside the principal composition this
-    // applies the layer's own (non-interactive) transform a second time with
-    // the same dimensions. It looks like the enclosing precomp layer's
-    // transform was intended here -- confirm before relying on the result for
-    // nested layers.
-    if (Layer->Block_Composition_Index != PrincipalCompIndex) {
-        layer_transforms PlainT = Layer_GetTransforms(Layer);
-        CompPoint = TransformPoint(PlainT, Width, Height, CompPoint);
-    }
-
-    // Composition pixels -> composition UV -> screen.
-    v2 CompUV = CompPoint / V2(MainComp->Width, MainComp->Height);
-    v2 OnScreen = V2(UI->CompPos.x + CompUV.x * UI->CompZoom.x,
-                     UI->CompPos.y + CompUV.y * UI->CompZoom.y);
-
-    return ImVec2(OnScreen.x, OnScreen.y);
-}
-
-// Forward declarations of the two scalar renderers. They are defined further
-// down in this file but are called from RenderLayers before that point.
-static void
-Fallback_RenderLayer(transform_info T, void *OutputBuffer, rectangle RenderRegion);
-static void
-Fallback_RenderDirect(direct_info T, void *OutputBuffer, rectangle RenderRegion);
-
-// Executes a single render entry by dispatching on its RenderType.
-// Entry.RenderData is reinterpreted as the info struct matching the type and
-// passed by value to the corresponding renderer together with the entry's
-// output buffer and region. An unrecognised type trips an assert.
-static void
-RenderLayers(render_entry Entry) {
-    void *Output = Entry.OutputBuffer;
-    rectangle Region = Entry.RenderRegion;
-
-    switch (Entry.RenderType)
-    {
-        case render_type_main:
-        {
-            Fallback_RenderLayer(*(transform_info *)Entry.RenderData, Output, Region);
-        } break;
-        // Both no-transform variants go through the same direct renderer.
-        case render_type_notransform:
-        case render_type_notransform_swap:
-        {
-            Fallback_RenderDirect(*(direct_info *)Entry.RenderData, Output, Region);
-        } break;
-        case render_type_brush:
-        {
-            PaintTest(*(brush_info *)Entry.RenderData, Output, Region);
-        } break;
-        default:
-        {
-            Assert(0);
-        } break;
-    }
-}
-
-// Kicks off a render of the given type over RenderRegion. Normally the work
-// is handed to Threading_BitmapOp; in DEBUG builds with Debug.NoThreading set
-// it is instead executed immediately on the calling thread via RenderLayers.
-static void
-Renderer_Start(void *Data, void *OutputBuffer, render_type RenderType, rectangle RenderRegion)
-{
-#if DEBUG
-    if (Debug.NoThreading) {
-        render_entry InlineEntry = { Data, OutputBuffer, RenderType, RenderRegion };
-        RenderLayers(InlineEntry);
-    } else
-#endif
-    {
-        // CPU
-        Threading_BitmapOp(Data, OutputBuffer, RenderType, RenderRegion);
-    }
-}
-
-// Stores the result of Threading_IsActive(RenderType) in *Test. In DEBUG
-// builds with Debug.NoThreading set, *Test is unconditionally set to true and
-// the threading layer is not queried.
-static void
-Renderer_Check(bool32 *Test, render_type RenderType)
-{
-    bool32 Result;
-#if DEBUG
-    if (Debug.NoThreading)
-        Result = true;
-    else
-#endif
-        // CPU
-        Result = Threading_IsActive(RenderType);
-
-    *Test = Result;
-}
-
-
-// Builds the transform_info consumed by the layer renderer for one layer.
-//
-// Width/Height/BytesPerPixel describe the layer's bitmap; Comp describes the
-// buffer being rendered into. If a viewport transform interaction is active
-// and the layer is selected, the in-progress interactive transform is applied
-// on top of the layer's stored position, rotation and scale.
-//
-// Members that are not assigned here (for example the source buffer pointer
-// read by the renderer) are returned zeroed; previously they were left
-// indeterminate. Memory and File are accepted but not used.
-static transform_info
-Transform_Calculate(project_state *State, memory *Memory, project_data *File, block_layer *Layer, block_composition *Comp,
-                    int Width, int Height, int BytesPerPixel)
-{
-    transform_info TransformInfo = {};
-
-    real32 Rotation = Layer->rotation.CurrentValue;
-    real32 X = Layer->x.CurrentValue;
-    real32 Y = Layer->y.CurrentValue;
-    real32 s = Layer->scale.CurrentValue;
-    blend_mode BlendMode = Layer->BlendMode;
-
-    if (State->Interact_Active == interact_type_viewport_transform && Layer->IsSelected == 1) {
-        Transform_ApplyInteractive(*(interact_transform *)&State->Interact_Offset[0], &X, &Y, &Rotation, &s);
-    }
-
-    real32 Rad = (Rotation * (PI / 180));
-
-    // Rotated and scaled edge vectors of the layer in composition pixels.
-    v2 XAxis = (Width * s)*V2(cos(Rad), sin(Rad));
-    v2 YAxis = (Height * -s)*V2(sin(Rad), -cos(Rad));
-
-    real32 AnchorX = Layer->ax.CurrentValue;
-    real32 AnchorY = Layer->ay.CurrentValue;
-
-    // (X, Y) locates the anchor; Origin is the layer's (0, 0) corner.
-    v2 Pos = {X, Y};
-    v2 Origin = Pos - (XAxis * AnchorX) - (YAxis * AnchorY);
-
-    // Inverse squared edge lengths; multiplied into the edges below so the
-    // renderer can obtain layer UVs with a dot product.
-    real32 XLengthSq = 1.0f / LengthSq(XAxis);
-    real32 YLengthSq = 1.0f / LengthSq(YAxis);
-
-    // Bounding box of the four corners. The seeds mean Min never ends up
-    // above the composition size and Max never below zero. Corner
-    // coordinates are truncated towards zero when stored.
-    int32 MaxX = 0;
-    int32 MaxY = 0;
-    int32 MinX = Comp->Width;
-    int32 MinY = Comp->Height;
-
-    v2 Points[4] = {Origin, Origin + XAxis, Origin + YAxis, Origin + XAxis + YAxis};
-    for (int i = 0; i < 4; i++) {
-        if (Points[i].x < MinX) { MinX = (int32)Points[i].x; }
-        if (Points[i].y < MinY) { MinY = (int32)Points[i].y; }
-        if (Points[i].x > MaxX) { MaxX = (int32)Points[i].x; }
-        if (Points[i].y > MaxY) { MaxY = (int32)Points[i].y; }
-    }
-
-    TransformInfo.XAxisPX = XLengthSq*XAxis.x;
-    TransformInfo.XAxisPY = XLengthSq*XAxis.y;
-    TransformInfo.YAxisPX = YLengthSq*YAxis.x;
-    TransformInfo.YAxisPY = YLengthSq*YAxis.y;
-
-    TransformInfo.BufferWidth = Comp->Width;
-    TransformInfo.BufferHeight = Comp->Height;
-    TransformInfo.BufferBytesPerPixel = Comp->BytesPerPixel;
-    TransformInfo.BufferBits = Bitmap_ByteInfo(Comp->BytesPerPixel);
-
-    TransformInfo.LayerWidth = Width;
-    TransformInfo.LayerHeight = Height;
-    TransformInfo.LayerBytesPerPixel = BytesPerPixel;
-    TransformInfo.LayerBits = Bitmap_ByteInfo(BytesPerPixel);
-
-    TransformInfo.LayerOpacity = Layer->opacity.CurrentValue;
-    TransformInfo.BlendMode = BlendMode;
-    TransformInfo.OriginX = Origin.x;
-    TransformInfo.OriginY = Origin.y;
-    TransformInfo.BufferPitch = Comp->Width*Comp->BytesPerPixel;
-    TransformInfo.LayerPitch = Width*BytesPerPixel;
-    TransformInfo.ClipRect = {MinX, MinY, MaxX, MaxY};
-
-    TransformInfo.IsAdjustment = Layer->IsAdjustment;
-
-    return TransformInfo;
-}
-
-// NOTE(fox): is this too ridiculous? i don't trust inline
-//
-// Per-channel blend step shared by the scalar renderers. The macro expands to
-// a bare switch statement and relies on these names being in scope at the
-// expansion site:
-//   T.BlendMode             - selects the formula
-//   R_Dest, G_Dest, B_Dest  - destination colour (read)
-//   R_Col,  G_Col,  B_Col   - layer colour (read)
-//   R_Blend, G_Blend, B_Blend - result (written)
-// blend_normal, and any mode without a case (there is no default), leave the
-// *_Blend values as the caller initialised them. Alpha is not touched, and
-// results are not clamped here.
-#define Fallback_Blend() \
- switch (T.BlendMode)\
- {\
- case blend_normal:\
- {\
- } break;\
- case blend_multiply:\
- {\
- R_Blend = R_Dest * R_Col;\
- G_Blend = G_Dest * G_Col;\
- B_Blend = B_Dest * B_Col;\
- } break;\
- case blend_colorburn:\
- {\
- /* NOTE(fox): Padding to prevent actual crashing from zero division */ \
- R_Blend = 1.0f - ((1.0f - R_Dest) / (R_Col + 0.001f));\
- G_Blend = 1.0f - ((1.0f - G_Dest) / (G_Col + 0.001f));\
- B_Blend = 1.0f - ((1.0f - B_Dest) / (B_Col + 0.001f));\
- } break;\
- case blend_linearburn:\
- {\
- R_Blend = (R_Dest + R_Col) - 1.0f;\
- G_Blend = (G_Dest + G_Col) - 1.0f;\
- B_Blend = (B_Dest + B_Col) - 1.0f;\
- } break;\
- case blend_add:\
- {\
- R_Blend = R_Dest + R_Col;\
- G_Blend = G_Dest + G_Col;\
- B_Blend = B_Dest + B_Col;\
- } break;\
- case blend_screen:\
- {\
- R_Blend = 1.0f - ((1.0f - R_Dest) * (1.0f - R_Col));\
- G_Blend = 1.0f - ((1.0f - G_Dest) * (1.0f - G_Col));\
- B_Blend = 1.0f - ((1.0f - B_Dest) * (1.0f - B_Col));\
- } break;\
- case blend_overlay:\
- {\
- /* Multiply-style formula for dark destinations, screen-style otherwise. */ \
- if (R_Dest < 0.5) {\
- R_Blend = 2.0f * R_Dest * R_Col;\
- } else {\
- R_Blend = 1.0f - (2.0f * (1.0f - R_Dest) * (1.0f - R_Col));\
- }\
- if (G_Dest < 0.5) {\
- G_Blend = 2.0f * G_Dest * G_Col;\
- } else {\
- G_Blend = 1.0f - (2.0f * (1.0f - G_Dest) * (1.0f - G_Col));\
- }\
- if (B_Dest < 0.5) {\
- B_Blend = 2.0f * B_Dest * B_Col;\
- } else {\
- B_Blend = 1.0f - (2.0f * (1.0f - B_Dest) * (1.0f - B_Col));\
- }\
- } break;\
- case blend_softlight:\
- {\
- /* using Pegtop's equation */ \
- R_Blend = ((1.0f - R_Col * 2) * R_Dest * R_Dest) + (R_Col * 2 * R_Dest);\
- G_Blend = ((1.0f - G_Col * 2) * G_Dest * G_Dest) + (G_Col * 2 * G_Dest);\
- B_Blend = ((1.0f - B_Col * 2) * B_Dest * B_Dest) + (B_Col * 2 * B_Dest);\
- } break;\
- case blend_hardlight:\
- {\
- /* NOTE(review): this branches on the destination channel with the */ \
- /* comparison inverted relative to overlay above. The usual hard-light */ \
- /* definition branches on the layer colour (*_Col < 0.5) instead -- */ \
- /* confirm whether this is intentional. */ \
- if (R_Dest > 0.5) {\
- R_Blend = 2.0f * R_Dest * R_Col;\
- } else {\
- R_Blend = 1.0f - (2.0f * (1.0f - R_Dest) * (1.0f - R_Col));\
- }\
- if (G_Dest > 0.5) {\
- G_Blend = 2.0f * G_Dest * G_Col;\
- } else {\
- G_Blend = 1.0f - (2.0f * (1.0f - G_Dest) * (1.0f - G_Col));\
- }\
- if (B_Dest > 0.5) {\
- B_Blend = 2.0f * B_Dest * B_Col;\
- } else {\
- B_Blend = 1.0f - (2.0f * (1.0f - B_Dest) * (1.0f - B_Col));\
- }\
- } break;\
- case blend_subtract:\
- {\
- R_Blend = R_Dest - R_Col;\
- G_Blend = G_Dest - G_Col;\
- B_Blend = B_Dest - B_Col;\
- } break;\
- case blend_divide:\
- {\
- /* Same 0.001 padding as colour burn to avoid dividing by zero. */ \
- R_Blend = R_Dest / (R_Col + 0.001f);\
- G_Blend = G_Dest / (G_Col + 0.001f);\
- B_Blend = B_Dest / (B_Col + 0.001f);\
- } break;\
- case blend_difference:\
- {\
- /* Absolute difference of the two channels, spelled out per channel. */ \
- if (R_Col - R_Dest > 0) {\
- R_Blend = R_Col - R_Dest;\
- } else {\
- R_Blend = R_Dest - R_Col;\
- }\
- if (G_Col - G_Dest > 0) {\
- G_Blend = G_Col - G_Dest;\
- } else {\
- G_Blend = G_Dest - G_Col;\
- }\
- if (B_Col - B_Dest > 0) {\
- B_Blend = B_Col - B_Dest;\
- } else {\
- B_Blend = B_Dest - B_Col;\
- }\
- } break;\
- }\
-
-// Composites T.SourceBuffer onto OutputBuffer without any geometric
-// transform: both buffers are addressed with the same pitch and
-// bytes-per-pixel, pixel for pixel, inside the intersection of T.ClipRect and
-// RenderRegion.
-//
-// Source pixels whose alpha is zero are skipped. When T.SwapActive is set,
-// the destination values read before blending are written back into the
-// source buffer, so the two buffers effectively exchange that pixel.
-//
-// Coordinates and byte offsets use int/uint32 (as Fallback_RenderLayer does);
-// the previous 16-bit counters and casts truncated the pitch and coordinates
-// once they no longer fit in 16 bits.
-//
-// NOTE(review): each channel is accessed through a uint32 pointer and masked
-// with BufferBits.MaskPixel, so the load/store at the last channel of a pixel
-// touches bytes beyond that pixel -- presumably harmless given how the
-// buffers are allocated, but worth confirming for the final pixel.
-static void
-Fallback_RenderDirect(direct_info T, void *OutputBuffer, rectangle RenderRegion)
-{
-    rectangle LayerBounds = ClipRectangle( T.ClipRect, RenderRegion);
-
-    Assert(LayerBounds.Max.x <= T.BufferWidth);
-    Assert(LayerBounds.Max.y <= T.BufferHeight);
-
-    uint32 Pitch = (uint32)T.BufferPitch;
-    uint32 BytesPerPixel = (uint32)T.BufferBytesPerPixel;
-
-    for (int Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
-    {
-        for (int X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++)
-        {
-            // Source and destination share the same layout, hence one offset.
-            uint32 PixelOffset = ((uint32)Y * Pitch) + ((uint32)X * BytesPerPixel);
-
-            uint8 *TexPTR0 = (uint8 *)T.SourceBuffer + PixelOffset;
-
-            uint32 *R_SrcAddress = (uint32 *)(TexPTR0 + T.BufferBits.ByteOffset * 0);
-            uint32 *G_SrcAddress = (uint32 *)(TexPTR0 + T.BufferBits.ByteOffset * 1);
-            uint32 *B_SrcAddress = (uint32 *)(TexPTR0 + T.BufferBits.ByteOffset * 2);
-            uint32 *A_SrcAddress = (uint32 *)(TexPTR0 + T.BufferBits.ByteOffset * 3);
-
-            real32 R_Col = (real32)(*R_SrcAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
-            real32 G_Col = (real32)(*G_SrcAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
-            real32 B_Col = (real32)(*B_SrcAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
-            real32 A_Col = (real32)(*A_SrcAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
-
-            // Fully transparent source pixels leave the destination untouched.
-            if (A_Col == 0)
-                continue;
-
-            real32 LayerAlpha = A_Col * T.Opacity;
-
-            uint8 *DestPixel = (uint8 *)OutputBuffer + PixelOffset;
-
-            uint32 *R_DestAddress = (uint32 *)(DestPixel + T.BufferBits.ByteOffset * 0);
-            uint32 *G_DestAddress = (uint32 *)(DestPixel + T.BufferBits.ByteOffset * 1);
-            uint32 *B_DestAddress = (uint32 *)(DestPixel + T.BufferBits.ByteOffset * 2);
-            uint32 *A_DestAddress = (uint32 *)(DestPixel + T.BufferBits.ByteOffset * 3);
-
-            // Raw destination values are kept for the optional swap below.
-            uint32 R_DestInt = (*R_DestAddress & T.BufferBits.MaskPixel);
-            uint32 G_DestInt = (*G_DestAddress & T.BufferBits.MaskPixel);
-            uint32 B_DestInt = (*B_DestAddress & T.BufferBits.MaskPixel);
-            uint32 A_DestInt = (*A_DestAddress & T.BufferBits.MaskPixel);
-
-            real32 R_Dest = (real32)(R_DestInt) * T.BufferBits.Normalized;
-            real32 G_Dest = (real32)(G_DestInt) * T.BufferBits.Normalized;
-            real32 B_Dest = (real32)(B_DestInt) * T.BufferBits.Normalized;
-            real32 A_Dest = (real32)(A_DestInt) * T.BufferBits.Normalized;
-
-            real32 R_Blend = R_Col;
-            real32 G_Blend = G_Col;
-            real32 B_Blend = B_Col;
-            real32 A_Blend = A_Col;
-
-            // Opaque, normally-blended pixels are copied straight through.
-            if (LayerAlpha != 1.0f || T.BlendMode != blend_normal) {
-
-                // Reads *_Dest / *_Col and updates *_Blend per T.BlendMode.
-                Fallback_Blend();
-
-                if (A_Dest == 0) {
-                    // Nothing underneath: keep the blended colour as-is.
-                    A_Blend = LayerAlpha;
-                } else {
-                    A_Blend = A_Dest + ((1.0f - A_Dest) * LayerAlpha);
-                    // Mix weight is the layer alpha raised to the destination
-                    // alpha.
-                    real32 Alpha = pow(LayerAlpha, A_Dest);
-                    R_Blend = (R_Dest * (1.0f - Alpha)) + (R_Blend * Alpha);
-                    G_Blend = (G_Dest * (1.0f - Alpha)) + (G_Blend * Alpha);
-                    B_Blend = (B_Dest * (1.0f - Alpha)) + (B_Blend * Alpha);
-                }
-            }
-
-            uint32 R_Out = (uint32)(Normalize(R_Blend) * T.BufferBits.Bits);
-            uint32 G_Out = (uint32)(Normalize(G_Blend) * T.BufferBits.Bits);
-            uint32 B_Out = (uint32)(Normalize(B_Blend) * T.BufferBits.Bits);
-            uint32 A_Out = (uint32)(Normalize(A_Blend) * T.BufferBits.Bits);
-
-            if (T.SwapActive)
-            {
-                *R_SrcAddress = (*R_SrcAddress & ~T.BufferBits.MaskPixel) | R_DestInt;
-                *G_SrcAddress = (*G_SrcAddress & ~T.BufferBits.MaskPixel) | G_DestInt;
-                *B_SrcAddress = (*B_SrcAddress & ~T.BufferBits.MaskPixel) | B_DestInt;
-                *A_SrcAddress = (*A_SrcAddress & ~T.BufferBits.MaskPixel) | A_DestInt;
-            }
-            *R_DestAddress = (*R_DestAddress & ~T.BufferBits.MaskPixel) | R_Out;
-            *G_DestAddress = (*G_DestAddress & ~T.BufferBits.MaskPixel) | G_Out;
-            *B_DestAddress = (*B_DestAddress & ~T.BufferBits.MaskPixel) | B_Out;
-            *A_DestAddress = (*A_DestAddress & ~T.BufferBits.MaskPixel) | A_Out;
-        }
-    }
-}
-
-// Scalar renderer for a transformed layer. For every output pixel inside the
-// intersection of T.ClipRect and RenderRegion, the pixel is projected onto
-// the layer's axes to obtain a UV; pixels whose UV falls inside [0, 1) are
-// bilinearly sampled from T.SourceBuffer, blended according to T.BlendMode
-// and T.LayerOpacity, and written to OutputBuffer.
-//
-// NOTE(review): destination channels are read and written through uint8
-// pointers, so this path appears to assume one byte per output channel even
-// though BufferBits is consulted for offsets and scaling -- confirm before
-// using it with wider output formats.
-static void
-Fallback_RenderLayer(transform_info T, void *OutputBuffer, rectangle RenderRegion)
-{
-    rectangle LayerBounds = ClipRectangle( T.ClipRect, RenderRegion);
-
-    Assert(LayerBounds.Max.x <= T.BufferWidth);
-    Assert(LayerBounds.Max.y <= T.BufferHeight);
-
-    for (int Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
-    {
-        real32 StartVectorY = (real32)Y - T.OriginY;
-
-        for (int X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++)
-        {
-            real32 StartVectorX = X - T.OriginX;
-            real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
-            real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY);
-
-            // Output pixel does not land on the layer.
-            if (!(U < 1.0f && U >= 0.0f && V < 1.0f && V >= 0.0f))
-                continue;
-
-            // Integer texel plus fractional position inside it.
-            real32 TexXFull = U * T.LayerWidth;
-            uint32 TexXInt = (uint32)TexXFull;
-            real32 TexX = TexXFull - TexXInt;
-
-            real32 TexYFull = V * T.LayerHeight;
-            uint32 TexYInt = (uint32)TexYFull;
-            real32 TexY = TexYFull - TexYInt;
-
-            // Bilinear weights for the four neighbouring texels.
-            real32 TexXInv = 1 - TexX;
-            real32 TexYInv = 1 - TexY;
-            real32 TexBothXInv = TexXInv * TexY;
-            real32 TexBothYInv = TexX * TexYInv;
-            real32 TexBoth = TexY * TexX;
-            real32 TexBothInv = TexXInv * TexYInv;
-
-            // Neighbour coordinates; the +1 texel is passed through Ceil
-            // together with the last valid index of the layer.
-            uint32 LX = TexXInt;
-            uint32 LY = TexYInt;
-            uint32 LXPlus = Ceil(TexXInt+1, (uint32)T.LayerWidth - 1);
-            uint32 LYPlus = Ceil(TexYInt+1, (uint32)T.LayerHeight - 1);
-
-            uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + ((uint32)T.LayerPitch * LY) + (LX * (uint32)T.LayerBytesPerPixel));
-            uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + ((uint32)T.LayerPitch * LY) + (LXPlus * (uint32)T.LayerBytesPerPixel));
-            uint8 *TexPTR2 = ((uint8 *)T.SourceBuffer + ((uint32)T.LayerPitch * LYPlus) + (LX * (uint32)T.LayerBytesPerPixel));
-            uint8 *TexPTR3 = ((uint8 *)T.SourceBuffer + ((uint32)T.LayerPitch * LYPlus) + (LXPlus * (uint32)T.LayerBytesPerPixel));
-
-            // Per-channel fetch of the four texels (A = top-left, B = top-right,
-            // C = bottom-left, D = bottom-right), normalised via LayerBits.
-            real32 TexRA = (real32)(*(uint32 *)(TexPTR0 + T.LayerBits.ByteOffset * 0) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexGA = (real32)(*(uint32 *)(TexPTR0 + T.LayerBits.ByteOffset * 1) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexBA = (real32)(*(uint32 *)(TexPTR0 + T.LayerBits.ByteOffset * 2) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexAA = (real32)(*(uint32 *)(TexPTR0 + T.LayerBits.ByteOffset * 3) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-
-            real32 TexRB = (real32)(*(uint32 *)(TexPTR1 + T.LayerBits.ByteOffset * 0) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexGB = (real32)(*(uint32 *)(TexPTR1 + T.LayerBits.ByteOffset * 1) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexBB = (real32)(*(uint32 *)(TexPTR1 + T.LayerBits.ByteOffset * 2) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexAB = (real32)(*(uint32 *)(TexPTR1 + T.LayerBits.ByteOffset * 3) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-
-            real32 TexRC = (real32)(*(uint32 *)(TexPTR2 + T.LayerBits.ByteOffset * 0) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexGC = (real32)(*(uint32 *)(TexPTR2 + T.LayerBits.ByteOffset * 1) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexBC = (real32)(*(uint32 *)(TexPTR2 + T.LayerBits.ByteOffset * 2) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexAC = (real32)(*(uint32 *)(TexPTR2 + T.LayerBits.ByteOffset * 3) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-
-            real32 TexRD = (real32)(*(uint32 *)(TexPTR3 + T.LayerBits.ByteOffset * 0) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexGD = (real32)(*(uint32 *)(TexPTR3 + T.LayerBits.ByteOffset * 1) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexBD = (real32)(*(uint32 *)(TexPTR3 + T.LayerBits.ByteOffset * 2) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-            real32 TexAD = (real32)(*(uint32 *)(TexPTR3 + T.LayerBits.ByteOffset * 3) & T.LayerBits.MaskPixel) * T.LayerBits.Normalized;
-
-            real32 R_Col = (TexBothInv * TexRA) + (TexBothYInv * TexRB)
-                         + (TexBothXInv * TexRC) + (TexBoth * TexRD);
-            real32 G_Col = (TexBothInv * TexGA) + (TexBothYInv * TexGB)
-                         + (TexBothXInv * TexGC) + (TexBoth * TexGD);
-            real32 B_Col = (TexBothInv * TexBA) + (TexBothYInv * TexBB)
-                         + (TexBothXInv * TexBC) + (TexBoth * TexBD);
-            real32 A_Col = (TexBothInv * TexAA) + (TexBothYInv * TexAB)
-                         + (TexBothXInv * TexAC) + (TexBoth * TexAD);
-
-            real32 LayerAlpha = A_Col * T.LayerOpacity;
-
-#if DEBUG
-            if (Debug.DisableAlpha == 1) {
-                A_Col = 1;
-                LayerAlpha = 1;
-            }
-#endif
-
-            real32 R_Blend = R_Col;
-            real32 G_Blend = G_Col;
-            real32 B_Blend = B_Col;
-            real32 A_Blend = A_Col;
-
-            uint8 *DestPixel =((uint8 *)OutputBuffer + ((uint32)Y * (uint32)T.BufferPitch) + ((uint32)X * (uint32)T.BufferBytesPerPixel));
-            Assert(X != (T.BufferWidth));
-
-            uint8 *R_DestAddress = (DestPixel + T.BufferBits.ByteOffset * 0);
-            uint8 *G_DestAddress = (DestPixel + T.BufferBits.ByteOffset * 1);
-            uint8 *B_DestAddress = (DestPixel + T.BufferBits.ByteOffset * 2);
-            uint8 *A_DestAddress = (DestPixel + T.BufferBits.ByteOffset * 3);
-
-            // Opaque, normally-blended samples overwrite the destination
-            // without reading it.
-            if (LayerAlpha != 1.0f || T.BlendMode != blend_normal) {
-
-                real32 R_Dest = (real32)(*R_DestAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
-                real32 G_Dest = (real32)(*G_DestAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
-                real32 B_Dest = (real32)(*B_DestAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
-                real32 A_Dest = (real32)(*A_DestAddress & T.BufferBits.MaskPixel) * T.BufferBits.Normalized;
-
-                // Reads *_Dest / *_Col and updates *_Blend per T.BlendMode.
-                Fallback_Blend();
-
-                R_Blend = (R_Dest * (1.0f - LayerAlpha)) + (R_Blend * LayerAlpha);
-                G_Blend = (G_Dest * (1.0f - LayerAlpha)) + (G_Blend * LayerAlpha);
-                B_Blend = (B_Dest * (1.0f - LayerAlpha)) + (B_Blend * LayerAlpha);
-
-                // Normal blending accumulates alpha; every other mode keeps
-                // the destination alpha unchanged.
-                if (T.BlendMode == blend_normal)
-                    A_Blend = A_Dest + LayerAlpha;
-                else
-                    A_Blend = A_Dest;
-#if DEBUG
-                if (Debug.DisableAlpha == 1) {
-                    G_Blend = R_Blend;
-                    B_Blend = R_Blend;
-                } else
-                if (Debug.DisableAlpha == 2) {
-                    R_Blend = LayerAlpha;
-                    G_Blend = LayerAlpha;
-                    B_Blend = LayerAlpha;
-                }
-#endif
-            }
-
-            uint8 R_Out = (uint8)(Normalize(R_Blend) * T.BufferBits.Bits);
-            uint8 G_Out = (uint8)(Normalize(G_Blend) * T.BufferBits.Bits);
-            uint8 B_Out = (uint8)(Normalize(B_Blend) * T.BufferBits.Bits);
-            uint8 A_Out = (uint8)(Normalize(A_Blend) * T.BufferBits.Bits);
-
-            *R_DestAddress = R_Out;
-            *G_DestAddress = G_Out;
-            *B_DestAddress = B_Out;
-            *A_DestAddress = A_Out;
-        }
-    }
-}
-
-#if 0
-// Disabled code (inside the surrounding #if 0).
-// Adjusts *ValueX / *ValueY by Increment after rotating it by the layer's
-// rotation and dividing it by the layer's scale; each component is further
-// divided by the matching component of Divisor.
-static void
-Layer_CalcRotatedOffset(project_layer *Layer, v2 Increment, v2 Divisor, real32 *ValueX, real32 *ValueY)
-{
-
- real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180));
- real32 s = Layer->scale.CurrentValue.f;
-
- // Unit axes of the rotated layer, scaled by the per-axis increment / scale.
- v2 XAxis = V2(cos(Rad), sin(Rad)) * (Increment.x / s);
- v2 YAxis = V2(sin(Rad), -cos(Rad)) * (Increment.y / -s);
-
- // Note the mixed signs: the X-axis term is added to ValueX but subtracted
- // from ValueY, and the Y-axis term the other way round.
- *ValueX += XAxis.x/Divisor.x;
- *ValueY -= XAxis.y/Divisor.y;
- *ValueX -= YAxis.x/Divisor.x;
- *ValueY += YAxis.y/Divisor.y;
-}
-
-// Disabled code (inside the surrounding #if 0).
-// Builds a transform_info for Layer from its source dimensions and current
-// transform values, for rendering into CompBuffer. Members not assigned below
-// are left uninitialised in the returned struct.
-static transform_info
-CalculateTransforms(project_layer *Layer, comp_buffer *CompBuffer)
-{
- transform_info TransformInfo;
- source *Source = Layer->Source;
-
- real32 Rad = (Layer->rotation.CurrentValue.f * (PI / 180));
- real32 s = Layer->scale.CurrentValue.f;
- // v2 Scale = {Source->Raster.Width * s, Source->Raster.Height * s};
-
- // Rotated and scaled edge vectors of the layer.
- v2 XAxis = (Source->Info.Width * s)*V2(cos(Rad), sin(Rad));
- v2 YAxis = (Source->Info.Height * -s)*V2(sin(Rad), -cos(Rad));
-
- real32 AnchorX = Layer->ax.CurrentValue.f;
- real32 AnchorY = Layer->ay.CurrentValue.f;
-
- // The layer position refers to its anchor; Origin is the (0, 0) corner.
- v2 Pos = {Layer->x.CurrentValue.f, Layer->y.CurrentValue.f};
- v2 Origin = Pos - (XAxis * AnchorX) - (YAxis * AnchorY);
-
- // Despite the names these hold the inverse of the squared lengths.
- real32 XLengthSq = 1.0f / LengthSq(XAxis);
- real32 YLengthSq = 1.0f / LengthSq(YAxis);
-
- // Bounding box of the four corners, seeded so that Min never exceeds the
- // buffer size and Max never drops below zero.
- int32 MaxX = 0;
- int32 MaxY = 0;
- int32 MinX = CompBuffer->Width;
- int32 MinY = CompBuffer->Height;
-
- v2 Points[4] = {Origin, Origin + XAxis, Origin + YAxis, Origin + XAxis + YAxis};
- for (int i = 0; i < 4; i++) {
- if (Points[i].x < MinX) { MinX = Points[i].x; }
- if (Points[i].y < MinY) { MinY = Points[i].y; }
- if (Points[i].x > MaxX) { MaxX = Points[i].x; }
- if (Points[i].y > MaxY) { MaxY = Points[i].y; }
- }
-
- TransformInfo.XAxisPX = XLengthSq*XAxis.x;
- TransformInfo.XAxisPY = XLengthSq*XAxis.y;
- TransformInfo.YAxisPX = YLengthSq*YAxis.x;
- TransformInfo.YAxisPY = YLengthSq*YAxis.y;
-
- uint16 Width = Source->Info.Width;
- uint16 Height = Source->Info.Height;
- uint16 WidthP, HeightP;
- Bitmap_CalcPackedDimensions(Width, Height, &WidthP, &HeightP);
-
- TransformInfo.LayerWidth = Width;
- TransformInfo.LayerHeight = Height;
- TransformInfo.FullLayerWidth = WidthP;
- TransformInfo.FullLayerHeight = HeightP;
- TransformInfo.LayerOpacity = Layer->opacity.CurrentValue.f;
- TransformInfo.BlendMode =Layer->BlendMode;
- TransformInfo.OriginX = Origin.x;
- TransformInfo.OriginY = Origin.y;
- TransformInfo.BufferPitch = CompBuffer->Width*CompBuffer->BytesPerPixel;
- TransformInfo.LayerPitch = Source->Info.Width*Source->Info.BytesPerPixel;
- // Left edge is rounded down to a multiple of four and the max edges are
- // extended by one pixel.
- TransformInfo.ClipRect = {MinX - (MinX & 3), MinY, MaxX + 1, MaxY + 1};
-
- TransformInfo.SourceBuffer = Layer->BitmapInfo.BitmapBuffer;
-
- return TransformInfo;
-}
-
-// Leaves the rendering state: clears IsRendering, zeroes the used part of
-// State->LayersToRender and its count, and resets the three render-queue
-// atomics (CurrentEntry, QueuedEntries, CompletedEntries) to 0.
-static void
-EndRenderState(project_state *State)
-{
-    IsRendering = false;
-
-    int16 Slot = 0;
-    while (Slot < State->NumberOfLayersToRender)
-    {
-        State->LayersToRender[Slot] = 0;
-        Slot++;
-    }
-    State->NumberOfLayersToRender = 0;
-
-    SDL_AtomicSet(&CurrentEntry, 0);
-    SDL_AtomicSet(&QueuedEntries, 0);
-    SDL_AtomicSet(&CompletedEntries, 0);
-}
-
-// Draws every layer listed in State->LayersToRender into the comp buffer,
-// restricted to RenderRegion, using the kernel selected by InstructionMode.
-// On ARM builds both branches currently call Fallback_RenderLayer.
-static void
-RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) {
-    for (int16 Slot = 0; Slot < RenderInfo->State->NumberOfLayersToRender; Slot++) {
-        int16 LayerIdx = RenderInfo->State->LayersToRender[Slot];
-        transform_info Transform = RenderInfo->File->Layer[LayerIdx]->TransformInfo;
-
-        // The trailing else below closes whichever if-chain the preprocessor keeps.
-#if ARM
-        if (InstructionMode == instruction_mode_neon) {
-            Fallback_RenderLayer(Transform, RenderInfo->CompBuffer, RenderRegion);
-        }
-#else
-        if (InstructionMode == instruction_mode_avx) {
-            AVX2_RenderLayer(Transform, RenderInfo->CompBuffer, RenderRegion);
-        }
-        else if (InstructionMode == instruction_mode_sse) {
-            SSE2_RenderLayer(Transform, RenderInfo->CompBuffer, RenderRegion);
-        }
-#endif
-        else {
-            Fallback_RenderLayer(Transform, RenderInfo->CompBuffer, RenderRegion);
-        }
-    }
-}
-
-// Finishes a frame: optionally prints cycle statistics (PERF), optionally
-// converts the packed comp buffer into the unpacked one (PACKEDRGB), resets
-// the render state, and uploads the unpacked buffer into the GL texture
-// textureID as RGBA / unsigned byte.
-static void
-FinishRenderAndUpload(project_state *State, comp_buffer *CompBuffer, GLuint textureID)
-{
-#if PERF
-    // Test holds the __rdtsc() value taken in QueueCurrentFrame; turn it into elapsed cycles.
-    Test = __rdtsc() - Test;
-
-    // The pixel count is a fixed constant here, not a measured value.
-    Debug.PixelCountRendered = 1280*720*5;
-    printf("Cycles per pixel rendered: %li ", Test / Debug.PixelCountRendered);
-    printf("Pixels rendered: %li ", Debug.PixelCountRendered);
-    printf("Cycles: %li\n", Test);
-
-    Test = 0;
-    Debug.PixelCountRendered = 0;
-    Debug.PixelCountTransparent = 0;
-    Debug.PixelCountChecked = 0;
-#endif
-
-#if PACKEDRGB
-    Bitmap_ConvertPacking(CompBuffer->PackedBuffer,
-                          CompBuffer->UnpackedBuffer,
-                          CompBuffer->Width,
-                          CompBuffer->Height,
-                          CompBuffer->BytesPerPixel,
-                          1);
-#endif
-
-    EndRenderState(State);
-
-    glBindTexture(GL_TEXTURE_2D, textureID);
-    glTexSubImage2D(GL_TEXTURE_2D,
-                    0,
-                    0, 0,
-                    CompBuffer->Width, CompBuffer->Height,
-                    GL_RGBA, GL_UNSIGNED_BYTE,
-                    CompBuffer->UnpackedBuffer);
-
-    // Disabled shared-memory hand-off, kept for reference:
-    // shmp->shared_framenumber = File.CurrentFrame;
-    // if (sem_post(&shmp->sem2) == -1)
-    //     Assert(0);
-}
-
-// Starts rendering the current frame: marks rendering as active, computes the
-// transform of every layer whose [StartFrame, EndFrame] range contains
-// File->CurrentFrame and records its index in State->LayersToRender, pushes a
-// 4x4 grid of render regions with PushRect, and finally renders all recorded
-// layers over the whole comp on the calling thread.
-static void
-QueueCurrentFrame(project_data *File, comp_buffer *CompBuffer, project_state *State)
-{
-    IsRendering = true;
-    render_queue RenderInfo = {File, State, CompBuffer};
-
-#if PERF
-    Test = __rdtsc();
-#endif
-
-    for (int16 LayerIdx = 0; LayerIdx < File->NumberOfLayers; LayerIdx++)
-    {
-        project_layer *Layer = File->Layer[LayerIdx];
-
-        // Skip layers that are not active on the current frame.
-        if (!(Layer->StartFrame <= File->CurrentFrame &&
-              Layer->EndFrame >= File->CurrentFrame))
-        {
-            continue;
-        }
-
-        Layer->TransformInfo = CalculateTransforms(Layer, CompBuffer);
-        State->LayersToRender[State->NumberOfLayersToRender] = LayerIdx;
-        State->NumberOfLayersToRender++;
-    }
-
-    uint16 TileWidth = CompBuffer->Width / 4;
-    uint16 TileHeight = CompBuffer->Height / 4;
-
-    // Sixteen tiles, pushed row by row.
-    for (int Tile = 0; Tile < 16; Tile++) {
-        int Column = Tile % 4;
-        int Row = Tile / 4;
-
-        rectangle TileRegion = {TileWidth*Column, TileHeight*Row,
-                                TileWidth + TileWidth*Column, TileHeight + TileHeight*Row};
-
-        // The render regions always have to be aligned to the top left of a
-        // 4x4 chunk (at least for AVX2) and cannot exceed the bounds of the
-        // comp, so every edge is rounded down to a multiple of 4 and the far
-        // edges are clamped to the comp size.
-        TileRegion.Min.x -= TileRegion.Min.x % 4;
-        TileRegion.Min.y -= TileRegion.Min.y % 4;
-        TileRegion.Max.x -= TileRegion.Max.x % 4;
-        TileRegion.Max.y -= TileRegion.Max.y % 4;
-        if (TileRegion.Max.x > CompBuffer->Width)
-            TileRegion.Max.x = CompBuffer->Width;
-        if (TileRegion.Max.y > CompBuffer->Height)
-            TileRegion.Max.y = CompBuffer->Height;
-
-        PushRect(TileRegion);
-    }
-
-    // The whole comp is also rendered right here, independent of the pushed tiles.
-    rectangle FullFrame = {0, 0, (int32)CompBuffer->Width, (int32)CompBuffer->Height};
-    RenderLayers(&RenderInfo, FullFrame);
-}
-
-#if ARM
-
-// NEON_RenderLayer: work-in-progress NEON port of the layer renderer.
-//
-// For each group of four horizontally adjacent destination pixels inside the
-// rectangle returned by ClipRectangle(T.ClipRect, RenderRegion) it computes
-// the layer-space U/V, and for groups that touch the layer it derives the
-// bilinear weights and the packed-bitmap indices of the four neighbouring
-// source texels. It stops there: nothing is sampled from T.SourceBuffer and
-// nothing is written to Buffer yet.
-//
-// T            per-layer transform data produced by CalculateTransforms
-// Buffer       destination comp buffer (PackedBuffer is addressed)
-// RenderRegion region of the comp this call is allowed to touch
-static void
-NEON_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
-{
-    rectangle LayerBounds = ClipRectangle( T.ClipRect,
-                                           RenderRegion );
-    // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
-    LayerBounds.Min.x -= LayerBounds.Min.x % 4;
-    LayerBounds.Min.y -= LayerBounds.Min.y % 4;
-
-    uint16 WidthP, HeightP;
-    Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
-
-    uint8 *TexPTR = (uint8 *)T.SourceBuffer;
-    Assert(LayerBounds.Max.x <= Buffer->Width);
-    Assert(LayerBounds.Max.y <= Buffer->Height);
-
-    float32x4_t XAxisPX = vdupq_n_f32(T.XAxisPX);
-    float32x4_t XAxisPY = vdupq_n_f32(T.XAxisPY);
-    float32x4_t YAxisPX = vdupq_n_f32(T.YAxisPX);
-    float32x4_t YAxisPY = vdupq_n_f32(T.YAxisPY);
-
-    float32x4_t LayerWidth = vdupq_n_f32(T.LayerWidth);
-    int32x4_t FullLayerWidth4i = vdupq_n_s32(T.FullLayerWidth*4);
-    int32x4_t LayerWidthMinusOne = vdupq_n_s32(T.LayerWidth - 1);
-    int32x4_t LayerHeightMinusOne = vdupq_n_s32(T.LayerHeight - 1);
-    float32x4_t LayerHeight = vdupq_n_f32(T.LayerHeight);
-    float32x4_t LayerOpacity = vdupq_n_f32(T.LayerOpacity);
-    float32x4_t OriginX = vdupq_n_f32(T.OriginX);
-    float32x4_t OriginY = vdupq_n_f32(T.OriginY);
-
-    // Several of the constants below are not referenced yet; they mirror the
-    // set declared by the AVX2/SSE2 kernels.
-    float32x4_t ClipPrevent = vdupq_n_f32(0.001f);
-    float32x4_t One = vdupq_n_f32(1);
-    float32x4_t Two = vdupq_n_f32(2);
-    float32x4_t Zero = vdupq_n_f32(0);
-
-    float32x4_t ZeroPoint25 = vdupq_n_f32(0.25);
-    float32x4_t ZeroPointFive = vdupq_n_f32(0.5);
-    int32x4_t Onei = vdupq_n_s32(1);
-    float32x4_t Four = vdupq_n_f32(4);
-    int32x4_t FF = vdupq_n_s32(0xFF);
-    int32x4_t BottomTwoBits = vdupq_n_s32(0x03);
-    int32x4_t Fouri = vdupq_n_s32(4);
-    int32x4_t Sixteeni = vdupq_n_s32(16);
-    float32x4_t Real255 = vdupq_n_f32(255.0f);
-    float32x4_t Norm255 = vdupq_n_f32(1/255.0f);
-
-    // NOTE(fox): Each loop operates on 4 pixels, 4 horizontal by 1 vertical.
-
-    // TODO(fox): A possible optimization could be made by using the 32x4x4
-    // load intrinsic and a loop that repeats four times.
-
-    for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
-    {
-        real32 xvals[4] = { (real32)LayerBounds.Min.x, (real32)LayerBounds.Min.x+1,
-                            (real32)LayerBounds.Min.x+2, (real32)LayerBounds.Min.x+3 };
-        float32x4_t PixelX = vld1q_f32(xvals);
-
-        float32x4_t PixelY = vdupq_n_f32((real32)Y);
-        float32x4_t StartVectorY = vsubq_f32(PixelY, OriginY);
-
-        for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
-        {
-            float32x4_t StartVectorX = vsubq_f32(PixelX, OriginX);
-
-            // Destination address inside the 4x4-packed comp buffer.
-            uint32 XLookup = (X >> 2)*16 + (X % 4);
-            uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
-            uint32 PixelToSeek = XLookup + YLookup;
-            uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel;
-
-            float32x4_t U = vaddq_f32(vmulq_f32(StartVectorX, XAxisPX), vmulq_f32(StartVectorY, XAxisPY));
-            float32x4_t V = vaddq_f32(vmulq_f32(StartVectorX, YAxisPX), vmulq_f32(StartVectorY, YAxisPY));
-
-            // Lanes whose U and V both fall in [0,1) are inside the layer.
-            uint32x4_t LayerMask = vandq_u32(vandq_u32(vcgeq_f32(U, Zero), vcltq_f32(U, One)),
-                                             vandq_u32(vcgeq_f32(V, Zero), vcltq_f32(V, One)));
-
-            // TODO(fox): Make more efficient with some sort of truncation
-            uint32 comp[4];
-            vst1q_u32(comp, LayerMask);
-            if (comp[0] || comp[1] || comp[2] || comp[3]) {
-                U = vmaxq_f32(vminq_f32(One, U), Zero);
-                V = vmaxq_f32(vminq_f32(One, V), Zero);
-
-                float32x4_t TexXFull = vmulq_f32(U, LayerWidth);
-                float32x4_t TexYFull = vmulq_f32(V, LayerHeight);
-                int32x4_t TexXInt = vcvtq_s32_f32(TexXFull);
-                int32x4_t TexYInt = vcvtq_s32_f32(TexYFull);
-
-                // Neighbour texel index: +1 unless already on the last column/row.
-                // These are integer vectors, so the compare and add use the
-                // signed-integer intrinsics, and Y is bounded by the layer
-                // height (as in the AVX2 and SSE2 kernels), not the width.
-                int32x4_t TexXIntPlusOne = vaddq_s32(TexXInt,
-                    vandq_s32(vreinterpretq_s32_u32(vcltq_s32(TexXInt, LayerWidthMinusOne)), Onei));
-                int32x4_t TexYIntPlusOne = vaddq_s32(TexYInt,
-                    vandq_s32(vreinterpretq_s32_u32(vcltq_s32(TexYInt, LayerHeightMinusOne)), Onei));
-
-                // Fractional position inside the texel and the four bilinear weights.
-                float32x4_t TexX = vsubq_f32(TexXFull, vcvtq_f32_s32(TexXInt));
-                float32x4_t TexY = vsubq_f32(TexYFull, vcvtq_f32_s32(TexYInt));
-                float32x4_t TexXInv = vsubq_f32(One, TexX);
-                float32x4_t TexYInv = vsubq_f32(One, TexY);
-                float32x4_t TexBothXInv = vmulq_f32(TexXInv, TexY);
-                float32x4_t TexBothYInv = vmulq_f32(TexX, TexYInv);
-                float32x4_t TexBoth = vmulq_f32(TexY, TexX);
-                float32x4_t TexBothInv = vmulq_f32(TexXInv, TexYInv);
-
-                // Packed-bitmap index, same formula as the scalar destination
-                // lookup above but per lane and with the source's packed width.
-                int32x4_t XLookup = vaddq_s32(vmulq_s32(vshrq_n_s32(TexXInt, 2), Sixteeni),
-                                              vandq_s32(TexXInt, BottomTwoBits));
-                int32x4_t YLookup = vaddq_s32(vmulq_s32(vshrq_n_s32(TexYInt, 2), FullLayerWidth4i),
-                                              vmulq_s32(vandq_s32(TexYInt, BottomTwoBits), Fouri));
-                int32x4_t XLookupPlusOne = vaddq_s32(vmulq_s32(vshrq_n_s32(TexXIntPlusOne, 2), Sixteeni),
-                                                     vandq_s32(TexXIntPlusOne, BottomTwoBits));
-                int32x4_t YLookupPlusOne = vaddq_s32(vmulq_s32(vshrq_n_s32(TexYIntPlusOne, 2), FullLayerWidth4i),
-                                                     vmulq_s32(vandq_s32(TexYIntPlusOne, BottomTwoBits), Fouri));
-
-                int32x4_t PixelLookupTL = vaddq_s32(XLookup, YLookup);
-                int32x4_t PixelLookupTR = vaddq_s32(XLookupPlusOne, YLookup);
-                int32x4_t PixelLookupBL = vaddq_s32(XLookup, YLookupPlusOne);
-                int32x4_t PixelLookupBR = vaddq_s32(XLookupPlusOne, YLookupPlusOne);
-
-                // I thought NEON had gather/scatter, but it appears it doesn't...
-            }
-
-            PixelX = vaddq_f32(PixelX, Four);
-        }
-    }
-}
-
-#else
- // IACA_START / IACA_END are defined empty here; switching the `#if 0` to 1 includes iacaMarks.h instead (presumably the header that supplies the real analyzer markers).
-#if 0
-#include "iacaMarks.h"
-#else
-#define IACA_START
-#define IACA_END
-#endif
-/*
- * AVX2_RenderLayer: draws one layer, described by T, into Buffer over the
- * rectangle returned by ClipRectangle(T.ClipRect, RenderRegion).
- *
- * For every destination pixel the offset from the layer origin is projected
- * onto T.XAxisPX/PY and T.YAxisPX/PY to get U and V. Pixels whose four
- * sub-samples all land in [0,1) are bilinearly sampled from T.SourceBuffer
- * with AVX2 gathers, combined with the destination according to T.BlendMode
- * and T.LayerOpacity, and written back with a masked store.
- *
- * With PACKEDRGB each iteration handles a 4-wide by 2-tall group of pixels in
- * Buffer->PackedBuffer; otherwise it handles 8 consecutive pixels of one row
- * in Buffer->UnpackedBuffer.
- */
-static void
-AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
-{
- rectangle LayerBounds = ClipRectangle( T.ClipRect,
- RenderRegion );
- // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
- LayerBounds.Min.x -= LayerBounds.Min.x % 4;
- LayerBounds.Min.y -= LayerBounds.Min.y % 4;
- // WidthP/HeightP are filled in by Bitmap_CalcPackedDimensions from the destination size; only WidthP is read below (PACKEDRGB addressing).
- uint16 WidthP, HeightP;
- Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
-
- uint8 *TexPTR = (uint8 *)T.SourceBuffer;
- Assert(LayerBounds.Max.x <= Buffer->Width);
- Assert(LayerBounds.Max.y <= Buffer->Height);
- // Broadcast the per-layer scalars to all eight lanes.
- __m256 XAxisPX = _mm256_set1_ps(T.XAxisPX);
- __m256 XAxisPY = _mm256_set1_ps(T.XAxisPY);
- __m256 YAxisPX = _mm256_set1_ps(T.YAxisPX);
- __m256 YAxisPY = _mm256_set1_ps(T.YAxisPY);
-
- __m256 LayerWidth = _mm256_set1_ps(T.LayerWidth);
- __m256 LayerBoundsMaxX = _mm256_set1_ps(LayerBounds.Max.x);
- __m256i FullLayerWidth4i = _mm256_set1_epi32(T.FullLayerWidth*4);
- __m256i LayerWidthMinusOne = _mm256_set1_epi32(T.LayerWidth - 1);
- __m256i LayerHeightMinusOne = _mm256_set1_epi32(T.LayerHeight - 1);
- __m256 LayerHeight = _mm256_set1_ps(T.LayerHeight);
- __m256 LayerOpacity = _mm256_set1_ps(T.LayerOpacity);
- __m256 OriginX = _mm256_set1_ps(T.OriginX);
- __m256 OriginY = _mm256_set1_ps(T.OriginY);
-
- __m256 ClipPrevent = _mm256_set1_ps(0.001f);
- __m256 One = _mm256_set1_ps(1);
- __m256 Two = _mm256_set1_ps(2);
- __m256 Zero = _mm256_set1_ps(0);
-
- __m256 ZeroPoint25 = _mm256_set1_ps(0.25);
- __m256 ZeroPointFive = _mm256_set1_ps(0.5);
- __m256i Onei = _mm256_set1_epi32(1);
- __m256 Four = _mm256_set1_ps(4);
- __m256 Eight = _mm256_set1_ps(8);
- __m256i FF = _mm256_set1_epi32(0xFF);
- __m256i BottomTwoBits = _mm256_set1_epi32(0x03);
- __m256i Fouri = _mm256_set1_epi32(4);
- __m256i Sixteeni = _mm256_set1_epi32(16);
- __m256 Real255 = _mm256_set1_ps(255.0f);
- __m256 Norm255 = _mm256_set1_ps(1/255.0f);
- // __m256i White = _mm256_setr_epi32(0xFFFFFFFF, 0, 0, 0, 0xFFFFFFFF, 0, 0, 0);
- // __m256i White2 = _mm256_set1_epi32(0xFFFFFFFF);
- // (X0,Y0)..(X3,Y3): sub-pixel offsets of the four coverage samples taken per pixel.
- // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky.
- __m256 X0 = _mm256_set1_ps(0.30);
- __m256 Y0 = _mm256_set1_ps(0.10);
- __m256 X1 = _mm256_set1_ps(0.80);
- __m256 Y1 = _mm256_set1_ps(0.35);
- __m256 X2 = _mm256_set1_ps(0.05);
- __m256 Y2 = _mm256_set1_ps(0.60);
- __m256 X3 = _mm256_set1_ps(0.55);
- __m256 Y3 = _mm256_set1_ps(0.85);
-
- // NOTE(review): LayerPitch and BytesPerPixel below are declared but never referenced in this function.
-#if PACKEDRGB
-#else
- __m256i LayerPitch = _mm256_set1_epi32(T.LayerPitch);
- __m256i BytesPerPixel = _mm256_set1_epi32(Buffer->BytesPerPixel);
-#endif
- // Lane layout: PACKEDRGB puts row Y in lanes 0-3 and row Y+1 in lanes 4-7 (same four X values); otherwise lanes 0-7 are eight consecutive X values on row Y.
-#if PACKEDRGB
- for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y+=2)
- {
- __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x,
- (real32)LayerBounds.Min.x+1,
- (real32)LayerBounds.Min.x+2,
- (real32)LayerBounds.Min.x+3,
- (real32)LayerBounds.Min.x,
- (real32)LayerBounds.Min.x+1,
- (real32)LayerBounds.Min.x+2,
- (real32)LayerBounds.Min.x+3);
-
- __m256 PixelY = _mm256_setr_ps((real32)Y,
- (real32)Y,
- (real32)Y,
- (real32)Y,
- (real32)Y+1,
- (real32)Y+1,
- (real32)Y+1,
- (real32)Y+1);
-#else
- for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
- {
- __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x,
- (real32)LayerBounds.Min.x+1,
- (real32)LayerBounds.Min.x+2,
- (real32)LayerBounds.Min.x+3,
- (real32)LayerBounds.Min.x+4,
- (real32)LayerBounds.Min.x+5,
- (real32)LayerBounds.Min.x+6,
- (real32)LayerBounds.Min.x+7);
-
- __m256 PixelY = _mm256_set1_ps((real32)Y);
-#endif
- // Offset of the pixel from the layer origin; the Y part does not change inside the X loop.
- __m256 StartVectorY = _mm256_sub_ps(PixelY, OriginY);
-
-#if PACKEDRGB
- for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
-#else
- for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 8)
-#endif
- {
-
- IACA_START;
-
- __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
- __m256 StartVectorX0 = _mm256_add_ps(StartVectorX, X0);
- __m256 StartVectorY0 = _mm256_add_ps(StartVectorY, Y0);
- __m256 StartVectorX1 = _mm256_add_ps(StartVectorX, X1);
- __m256 StartVectorY1 = _mm256_add_ps(StartVectorY, Y1);
- __m256 StartVectorX2 = _mm256_add_ps(StartVectorX, X2);
- __m256 StartVectorY2 = _mm256_add_ps(StartVectorY, Y2);
- __m256 StartVectorX3 = _mm256_add_ps(StartVectorX, X3);
- __m256 StartVectorY3 = _mm256_add_ps(StartVectorY, Y3);
- // Address of the first destination pixel of this group.
-#if PACKEDRGB
- uint32 XLookup = (X >> 2)*16 + (X % 4);
- uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
- uint32 PixelToSeek = XLookup + YLookup;
- uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel;
-#else
- uint8 *Pixel = (uint8 *)Buffer->UnpackedBuffer + Y*T.BufferPitch + X*Buffer->BytesPerPixel;
-#endif
- // U/V at the pixel position, followed by U/V at each of the four sub-sample positions.
- __m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY));
- __m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY));
-
- __m256 U0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, XAxisPX), _mm256_mul_ps(StartVectorY0, XAxisPY));
- __m256 V0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, YAxisPX), _mm256_mul_ps(StartVectorY0, YAxisPY));
- __m256 U1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, XAxisPX), _mm256_mul_ps(StartVectorY1, XAxisPY));
- __m256 V1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, YAxisPX), _mm256_mul_ps(StartVectorY1, YAxisPY));
- __m256 U2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, XAxisPX), _mm256_mul_ps(StartVectorY2, XAxisPY));
- __m256 V2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, YAxisPX), _mm256_mul_ps(StartVectorY2, YAxisPY));
- __m256 U3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, XAxisPX), _mm256_mul_ps(StartVectorY3, XAxisPY));
- __m256 V3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, YAxisPX), _mm256_mul_ps(StartVectorY3, YAxisPY));
- // Per-sample inside test 0 <= U < 1 and 0 <= V < 1 (compare immediates: 13 is greater-or-equal, 1 is less-than).
- __m256 LayerMask0 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U0, Zero, 13), _mm256_cmp_ps(U0, One, 1)),
- _mm256_and_ps(_mm256_cmp_ps(V0, Zero, 13), _mm256_cmp_ps(V0, One, 1)));
- __m256 LayerMask1 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U1, Zero, 13), _mm256_cmp_ps(U1, One, 1)),
- _mm256_and_ps(_mm256_cmp_ps(V1, Zero, 13), _mm256_cmp_ps(V1, One, 1)));
- __m256 LayerMask2 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U2, Zero, 13), _mm256_cmp_ps(U2, One, 1)),
- _mm256_and_ps(_mm256_cmp_ps(V2, Zero, 13), _mm256_cmp_ps(V2, One, 1)));
- __m256 LayerMask3 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U3, Zero, 13), _mm256_cmp_ps(U3, One, 1)),
- _mm256_and_ps(_mm256_cmp_ps(V3, Zero, 13), _mm256_cmp_ps(V3, One, 1)));
-
- // Each point that passes adds .25
- __m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)),
- _mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25)));
-
- // Preventing overlap between threads for non-packed. One nice thing
- // about packed is that the 4-padded bitmap means we can set up the
- // boundaries so we don't have to check this ever.
- __m256i TileBarrier = _mm256_cvtps_epi32(_mm256_cmp_ps(PixelX, LayerBoundsMaxX, 13));
- // NOTE(review): the float compare masks are run through _mm256_cvtps_epi32 rather than a cast; this seems to rely on all-ones lanes converting to 0x80000000 so the sign bit still drives movemask/maskstore - confirm.
- // Zero - no points pass
- // One - all points pass; not an edge
- __m256i Mask = _mm256_cvtps_epi32(_mm256_cmp_ps(Avg, Zero, 14));
- __m256i NonEdge = _mm256_cvtps_epi32(_mm256_cmp_ps(Avg, One, 13));
- __m256i TotalMask = _mm256_andnot_si256(TileBarrier, _mm256_and_si256(Mask, NonEdge));
- // NOTE(review): TotalMask keeps only lanes where all four samples passed and PixelX is below LayerBounds.Max.x; partially covered (edge) lanes are excluded from the store, so the EdgeMask alpha scaling further down never reaches memory - confirm intended.
- // __m256 LayerMask = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 1)),
- // _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 1)));
-
- // If all of the pixels are zeroed in the mask (aka fall outside
- // the UV lookup), we can skip the iteration.
- if (_mm256_movemask_epi8(TotalMask))
- {
- __m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask);
- // Clamp U/V to [0,1] and scale to source texel coordinates.
- U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
- V = _mm256_max_ps(_mm256_min_ps(One, V), Zero);
-
- __m256 TexXFull = _mm256_mul_ps(U, LayerWidth);
- __m256 TexYFull = _mm256_mul_ps(V, LayerHeight);
- __m256i TexXInt = _mm256_cvttps_epi32(TexXFull);
- __m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
- __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
- __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
- // NOTE(fox): The comparison is for when we're on the last pixel of the texel.
- // Fractional position inside the texel and the four bilinear weights derived from it.
- __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
- __m256 TexY = _mm256_sub_ps(TexYFull, _mm256_cvtepi32_ps(TexYInt));
- __m256 TexXInv = _mm256_sub_ps(One, TexX);
- __m256 TexYInv = _mm256_sub_ps(One, TexY);
- __m256 TexBothXInv = _mm256_mul_ps(TexXInv, TexY);
- __m256 TexBothYInv = _mm256_mul_ps(TexX, TexYInv);
- __m256 TexBoth = _mm256_mul_ps(TexY, TexX);
- __m256 TexBothInv = _mm256_mul_ps(TexXInv, TexYInv);
-
-#if PACKEDRGB
- __m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni),
- _mm256_and_si256(TexXInt, BottomTwoBits));
- __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), FullLayerWidth4i),
- _mm256_mullo_epi32(_mm256_and_si256(TexYInt, BottomTwoBits), Fouri));
- __m256i XLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
- _mm256_and_si256(TexXIntPlusOne, BottomTwoBits));
- __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
- _mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri));
-#else
- __m256i XLookup = TexXInt;
- __m256i YLookup = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(TexYInt), LayerWidth));
- __m256i XLookupPlusOne = TexXIntPlusOne;
- __m256i YLookupPlusOne = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(TexYIntPlusOne), LayerWidth));
-#endif
- // Source pixel indices of the four neighbours (TL/TR/BL/BR).
- __m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup);
- __m256i PixelLookupTR = _mm256_add_epi32(XLookupPlusOne, YLookup);
- __m256i PixelLookupBL = _mm256_add_epi32(XLookup, YLookupPlusOne);
- __m256i PixelLookupBR = _mm256_add_epi32(XLookupPlusOne, YLookupPlusOne);
-
- // The big feature of AVX2: gathering.
- __m256i PixelsTL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTL, 4);
- __m256i PixelsTR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupTR, 4);
- __m256i PixelsBL = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBL, 4);
- __m256i PixelsBR = _mm256_i32gather_epi32((const int32 *)TexPTR, PixelLookupBR, 4);
- // Split each gathered pixel into float channels scaled by 1/255; R is the low byte, A the high byte.
- __m256 R_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsTL, FF)), Norm255);
- __m256 G_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 8), FF)), Norm255);
- __m256 B_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 16), FF)), Norm255);
- __m256 A_TexTL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTL, 24), FF)), Norm255);
-
- __m256 R_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsTR, FF)), Norm255);
- __m256 G_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 8), FF)), Norm255);
- __m256 B_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 16), FF)), Norm255);
- __m256 A_TexTR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsTR, 24), FF)), Norm255);
-
- __m256 R_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsBL, FF)), Norm255);
- __m256 G_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 8), FF)), Norm255);
- __m256 B_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 16), FF)), Norm255);
- __m256 A_TexBL = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBL, 24), FF)), Norm255);
-
- __m256 R_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( PixelsBR, FF)), Norm255);
- __m256 G_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 8), FF)), Norm255);
- __m256 B_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 16), FF)), Norm255);
- __m256 A_TexBR = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(PixelsBR, 24), FF)), Norm255);
- // Bilinear interpolation of each channel.
- __m256 R_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, R_TexTL),
- _mm256_mul_ps(TexBothYInv, R_TexTR)),
- _mm256_add_ps(_mm256_mul_ps(TexBothXInv, R_TexBL),
- _mm256_mul_ps(TexBoth, R_TexBR)));
- __m256 G_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, G_TexTL),
- _mm256_mul_ps(TexBothYInv, G_TexTR)),
- _mm256_add_ps(_mm256_mul_ps(TexBothXInv, G_TexBL),
- _mm256_mul_ps(TexBoth, G_TexBR)));
- __m256 B_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, B_TexTL),
- _mm256_mul_ps(TexBothYInv, B_TexTR)),
- _mm256_add_ps(_mm256_mul_ps(TexBothXInv, B_TexBL),
- _mm256_mul_ps(TexBoth, B_TexBR)));
- __m256 A_Col = _mm256_add_ps(_mm256_add_ps(_mm256_mul_ps(TexBothInv, A_TexTL),
- _mm256_mul_ps(TexBothYInv, A_TexTR)),
- _mm256_add_ps(_mm256_mul_ps(TexBothXInv, A_TexBL),
- _mm256_mul_ps(TexBoth, A_TexBR)));
-
- // Apply anti-aliasing to edges if there are any
- if (_mm256_movemask_epi8(EdgeMask))
- {
- A_Col = _mm256_blendv_ps(A_Col, _mm256_mul_ps(A_Col, Avg), _mm256_cvtepi32_ps(EdgeMask));
- }
-
- IACA_END;
- __m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity);
- __m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha);
-
- // Hoisted out of some blend modes; maybe it'd be better to just keep them in there.
- __m256 R_Colx2 = _mm256_mul_ps(R_Col, Two);
- __m256 R_ColInv = _mm256_sub_ps(One, R_Col);
-
- __m256 G_Colx2 = _mm256_mul_ps(G_Col, Two);
- __m256 G_ColInv = _mm256_sub_ps(One, G_Col);
-
- __m256 B_Colx2 = _mm256_mul_ps(B_Col, Two);
- __m256 B_ColInv = _mm256_sub_ps(One, B_Col);
-
- __m256 R_Blend = R_Col;
- __m256 G_Blend = G_Col;
- __m256 B_Blend = B_Col;
- __m256 A_Blend = LayerAlpha;
- // NOTE(review): compare immediate 2 is less-or-equal, so lanes whose alpha is exactly 1 also take this path; the comment below suggests less-than (1) was intended - confirm.
- // Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it).
- if (T.BlendMode != blend_normal || _mm256_movemask_epi8(_mm256_cvtps_epi32(_mm256_cmp_ps(LayerAlpha, One, 2))))
- {
- __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
- __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255);
- __m256 G_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 8), FF)), Norm255);
- __m256 B_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF)), Norm255);
- __m256 A_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF)), Norm255);
-
- switch (T.BlendMode)
- {
- case blend_normal:
- {
- } break;
- case blend_multiply:
- {
- R_Blend = _mm256_mul_ps(R_Dest, R_Col);
- G_Blend = _mm256_mul_ps(G_Dest, G_Col);
- B_Blend = _mm256_mul_ps(B_Dest, B_Col);
- } break;
- case blend_colorburn:
- {
- // NOTE(fox): A small amount is added to Col since images with zero for alpha may also zero out the
- // color channels, causing black clipping.
- R_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, R_Dest), _mm256_add_ps(R_Col, ClipPrevent)));
- G_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, G_Dest), _mm256_add_ps(G_Col, ClipPrevent)));
- B_Blend = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, B_Dest), _mm256_add_ps(B_Col, ClipPrevent)));
- } break;
- case blend_linearburn:
- {
- R_Blend = _mm256_sub_ps(_mm256_add_ps(R_Dest, R_Col), One);
- G_Blend = _mm256_sub_ps(_mm256_add_ps(G_Dest, G_Col), One);
- B_Blend = _mm256_sub_ps(_mm256_add_ps(B_Dest, B_Col), One);
- } break;
- case blend_add:
- {
- R_Blend = _mm256_add_ps(R_Dest, R_Col);
- G_Blend = _mm256_add_ps(G_Dest, G_Col);
- B_Blend = _mm256_add_ps(B_Dest, B_Col);
- } break;
- case blend_screen:
- {
- R_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv));
- G_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv));
- B_Blend = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv));
- } break;
- case blend_overlay:
- {
- __m256 R_Mask = _mm256_cmp_ps(R_Dest, ZeroPointFive, 1);
- __m256 G_Mask = _mm256_cmp_ps(G_Dest, ZeroPointFive, 1);
- __m256 B_Mask = _mm256_cmp_ps(B_Dest, ZeroPointFive, 1);
- __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
- __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
- __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
- __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv)));
- __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv)));
- __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv)));
- R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
- G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
- B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
- } break;
- case blend_softlight:
- {
- // using Pegtop's equation
- R_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, R_Colx2), _mm256_mul_ps(R_Dest, R_Dest)), _mm256_mul_ps(R_Colx2, R_Dest));
- G_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, G_Colx2), _mm256_mul_ps(G_Dest, G_Dest)), _mm256_mul_ps(G_Colx2, G_Dest));
- B_Blend = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, B_Colx2), _mm256_mul_ps(B_Dest, B_Dest)), _mm256_mul_ps(B_Colx2, B_Dest));
- } break;
- case blend_hardlight:
- {
- __m256 R_Mask = _mm256_cmp_ps(R_Dest, ZeroPointFive, 13);
- __m256 G_Mask = _mm256_cmp_ps(G_Dest, ZeroPointFive, 13);
- __m256 B_Mask = _mm256_cmp_ps(B_Dest, ZeroPointFive, 13);
- __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
- __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
- __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
- __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv)));
- __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv)));
- __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv)));
- R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
- G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
- B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
- } break;
- case blend_subtract:
- {
- R_Blend = _mm256_sub_ps(R_Dest, R_Col);
- G_Blend = _mm256_sub_ps(G_Dest, G_Col);
- B_Blend = _mm256_sub_ps(B_Dest, B_Col);
- } break;
- case blend_divide:
- {
- R_Blend = _mm256_div_ps(R_Dest, _mm256_add_ps(R_Col, ClipPrevent));
- G_Blend = _mm256_div_ps(G_Dest, _mm256_add_ps(G_Col, ClipPrevent));
- B_Blend = _mm256_div_ps(B_Dest, _mm256_add_ps(B_Col, ClipPrevent));
- } break;
- case blend_difference:
- {
- __m256 R_Lower = _mm256_sub_ps(R_Col, R_Dest);
- __m256 G_Lower = _mm256_sub_ps(G_Col, G_Dest);
- __m256 B_Lower = _mm256_sub_ps(B_Col, B_Dest);
- __m256 R_Upper = _mm256_sub_ps(R_Dest, R_Col);
- __m256 G_Upper = _mm256_sub_ps(G_Dest, G_Col);
- __m256 B_Upper = _mm256_sub_ps(B_Dest, B_Col);
- __m256 R_Mask = _mm256_cmp_ps(R_Lower, Zero, 14);
- __m256 G_Mask = _mm256_cmp_ps(G_Lower, Zero, 14);
- __m256 B_Mask = _mm256_cmp_ps(B_Lower, Zero, 14);
- R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
- G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
- B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
- } break;
- }
- // Mix the blend result with the destination by layer alpha.
- R_Blend = _mm256_add_ps(_mm256_mul_ps(R_Dest, LayerAlphaInv), _mm256_mul_ps(R_Blend, LayerAlpha));
- G_Blend = _mm256_add_ps(_mm256_mul_ps(G_Dest, LayerAlphaInv), _mm256_mul_ps(G_Blend, LayerAlpha));
- B_Blend = _mm256_add_ps(_mm256_mul_ps(B_Dest, LayerAlphaInv), _mm256_mul_ps(B_Blend, LayerAlpha));
-
- // Standard behavior in photo apps is for blend modes to
- // inherit underlying opacity instead of adding to it.
- if (T.BlendMode == blend_normal)
- A_Blend = _mm256_add_ps(A_Dest, LayerAlpha);
- else
- A_Blend = A_Dest;
- }
- // Clamp to [0,1], scale to 0..255 and repack as R | G<<8 | B<<16 | A<<24.
- __m256i R_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, R_Blend), Zero), Real255));
- __m256i G_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, G_Blend), Zero), Real255));
- __m256i B_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, B_Blend), Zero), Real255));
- __m256i A_Out = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_max_ps(_mm256_min_ps(One, A_Blend), Zero), Real255));
-
- __m256i OutputPixel = _mm256_or_si256(
- _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
- _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
- // Only lanes selected by TotalMask are written.
- _mm256_maskstore_epi32((int *)Pixel, TotalMask, OutputPixel);
- }
-#if PACKEDRGB
- PixelX = _mm256_add_ps(PixelX, Four);
-#else
- PixelX = _mm256_add_ps(PixelX, Eight);
-#endif
- }
- }
-}
-
-// Renders one layer into Buffer over the rectangle returned by
-// ClipRectangle(T.ClipRect, RenderRegion), four horizontally adjacent
-// destination pixels per inner-loop iteration.
-//
-// For each group of four pixels the position relative to the layer origin is
-// projected onto the layer axes stored in T to obtain a UV. Lanes whose UV
-// lies in [0, 1) are bilinearly sampled from T.SourceBuffer, combined with the
-// destination according to T.BlendMode and T.LayerOpacity, and written back
-// with a per-lane masked store. Source texels are 4 bytes each and both
-// buffers are addressed as 4x4 tiles; channels are unpacked with red in the
-// low byte and alpha in the high byte.
-//
-// NOTE: despite the name, this path relies on SSE4.1 intrinsics
-// (_mm_mullo_epi32, _mm_blendv_ps, _mm_min_epi32). It no longer needs AVX.
-static void
-SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
-{
-    rectangle LayerBounds = ClipRectangle(T.ClipRect, RenderRegion);
-    // Remember: since bitmaps are packed in 4x4 cubes, we always need to be aligned.
-    LayerBounds.Min.x -= LayerBounds.Min.x % 4;
-    LayerBounds.Min.y -= LayerBounds.Min.y % 4;
-
-    uint16 WidthP, HeightP;
-    Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
-
-    uint8 *TexPTR = (uint8 *)T.SourceBuffer;
-    Assert(LayerBounds.Max.x <= Buffer->Width);
-    Assert(LayerBounds.Max.y <= Buffer->Height);
-
-    __m128 XAxisPX = _mm_set1_ps(T.XAxisPX);
-    __m128 XAxisPY = _mm_set1_ps(T.XAxisPY);
-    __m128 YAxisPX = _mm_set1_ps(T.YAxisPX);
-    __m128 YAxisPY = _mm_set1_ps(T.YAxisPY);
-
-    __m128 LayerWidth = _mm_set1_ps(T.LayerWidth);
-    __m128i LayerWidthMinusOne = _mm_set1_epi32(T.LayerWidth - 1);
-    __m128i FullLayerWidth4i = _mm_set1_epi32(T.FullLayerWidth*4);
-    __m128 LayerHeight = _mm_set1_ps(T.LayerHeight);
-    __m128i LayerHeightMinusOne = _mm_set1_epi32(T.LayerHeight - 1);
-    __m128 LayerOpacity = _mm_set1_ps(T.LayerOpacity);
-    __m128 OriginX = _mm_set1_ps(T.OriginX);
-    __m128 OriginY = _mm_set1_ps(T.OriginY);
-
-    __m128 ClipPrevent = _mm_set1_ps(0.001f);
-    __m128 One = _mm_set1_ps(1);
-    __m128 Two = _mm_set1_ps(2);
-    __m128 Zero = _mm_set1_ps(0);
-    __m128 ZeroPointFive = _mm_set1_ps(0.5);
-    __m128i Onei = _mm_set1_epi32(1);
-    __m128 Four = _mm_set1_ps(4);
-    __m128i FF = _mm_set1_epi32(0xFF);
-    __m128i BottomTwoBits = _mm_set1_epi32(0x03);
-    __m128i Fouri = _mm_set1_epi32(4);
-    __m128i Sixteeni = _mm_set1_epi32(16);
-    __m128 Reg255 = _mm_set1_ps(255.0f);
-    __m128 Norm255 = _mm_set1_ps(1/255.0f);
-
-    // NOTE(fox): Each loop operates on 4 pixels, 4 horizontal by 1 vertical.
-
-    for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
-    {
-        __m128 PixelX = _mm_setr_ps((real32)LayerBounds.Min.x,
-                                    (real32)LayerBounds.Min.x+1,
-                                    (real32)LayerBounds.Min.x+2,
-                                    (real32)LayerBounds.Min.x+3);
-
-        __m128 PixelY = _mm_set1_ps((real32)Y);
-        __m128 StartVectorY = _mm_sub_ps(PixelY, OriginY);
-
-        for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
-        {
-            __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);
-
-            // Destination address inside the 4x4-tiled buffer: 16 pixels per
-            // tile along X, WidthP*4 pixels per tile row, 4 pixels per row
-            // inside a tile.
-            uint32 DestXLookup = (X >> 2)*16 + (X % 4);
-            uint32 DestYLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
-            uint32 PixelToSeek = DestXLookup + DestYLookup;
-            uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel;
-
-            __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY));
-            __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY));
-
-            // Lanes whose UV falls inside the layer; only these are stored.
-            __m128 InsideLayer = _mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmplt_ps(U, One)),
-                                            _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmplt_ps(V, One)));
-            __m128i LayerMask = _mm_castps_si128(InsideLayer);
-
-            if (_mm_movemask_epi8(LayerMask))
-            {
-                U = _mm_max_ps(_mm_min_ps(One, U), Zero);
-                V = _mm_max_ps(_mm_min_ps(One, V), Zero);
-
-                __m128 TexXFull = _mm_mul_ps(U, LayerWidth);
-                __m128 TexYFull = _mm_mul_ps(V, LayerHeight);
-                // A lane clamped to U == 1 or V == 1 would otherwise index one
-                // texel past the last column/row; the scalar fetches below run
-                // for every lane, masked or not, so keep the index in range.
-                __m128i TexXInt = _mm_min_epi32(_mm_cvttps_epi32(TexXFull), LayerWidthMinusOne);
-                __m128i TexXIntPlusOne = _mm_add_epi32(TexXInt, _mm_and_si128(_mm_cmplt_epi32(TexXInt, LayerWidthMinusOne), Onei));
-                __m128i TexYInt = _mm_min_epi32(_mm_cvttps_epi32(TexYFull), LayerHeightMinusOne);
-                __m128i TexYIntPlusOne = _mm_add_epi32(TexYInt, _mm_and_si128(_mm_cmplt_epi32(TexYInt, LayerHeightMinusOne), Onei));
-
-                // Fractional parts and the four bilinear weights.
-                __m128 TexX = _mm_sub_ps(TexXFull, _mm_cvtepi32_ps(TexXInt));
-                __m128 TexY = _mm_sub_ps(TexYFull, _mm_cvtepi32_ps(TexYInt));
-                __m128 TexXInv = _mm_sub_ps(One, TexX);
-                __m128 TexYInv = _mm_sub_ps(One, TexY);
-                __m128 TexBothXInv = _mm_mul_ps(TexXInv, TexY);
-                __m128 TexBothYInv = _mm_mul_ps(TexX, TexYInv);
-                __m128 TexBoth = _mm_mul_ps(TexY, TexX);
-                __m128 TexBothInv = _mm_mul_ps(TexXInv, TexYInv);
-
-                // Same tiled addressing as the destination, per lane.
-                __m128i XLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXInt, 2), Sixteeni),
-                                                _mm_and_si128(TexXInt, BottomTwoBits));
-                __m128i YLookup = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYInt, 2), FullLayerWidth4i),
-                                                _mm_mullo_epi32(_mm_and_si128(TexYInt, BottomTwoBits), Fouri));
-                __m128i XLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexXIntPlusOne, 2), Sixteeni),
-                                                       _mm_and_si128(TexXIntPlusOne, BottomTwoBits));
-                __m128i YLookupPlusOne = _mm_add_epi32(_mm_mullo_epi32(_mm_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
-                                                       _mm_mullo_epi32(_mm_and_si128(TexYIntPlusOne, BottomTwoBits), Fouri));
-
-                __m128i PixelLookupTL = _mm_add_epi32(XLookup, YLookup);
-                __m128i PixelLookupTR = _mm_add_epi32(XLookupPlusOne, YLookup);
-                __m128i PixelLookupBL = _mm_add_epi32(XLookup, YLookupPlusOne);
-                __m128i PixelLookupBR = _mm_add_epi32(XLookupPlusOne, YLookupPlusOne);
-
-                // SSE lacks gathering, so we have no choice but to manually
-                // look up each pixel's four bilinear samples in scalar.
-
-                uint32 S_PixelLookupTL0 = _mm_cvtsi128_si32(PixelLookupTL);
-                uint32 S_PixelLookupTR0 = _mm_cvtsi128_si32(PixelLookupTR);
-                uint32 S_PixelLookupBL0 = _mm_cvtsi128_si32(PixelLookupBL);
-                uint32 S_PixelLookupBR0 = _mm_cvtsi128_si32(PixelLookupBR);
-                uint32 S_PixelsTL0 = *(uint32 *)(TexPTR + S_PixelLookupTL0*4);
-                uint32 S_PixelsTR0 = *(uint32 *)(TexPTR + S_PixelLookupTR0*4);
-                uint32 S_PixelsBL0 = *(uint32 *)(TexPTR + S_PixelLookupBL0*4);
-                uint32 S_PixelsBR0 = *(uint32 *)(TexPTR + S_PixelLookupBR0*4);
-
-                uint32 S_PixelLookupTL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 4));
-                uint32 S_PixelLookupTR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 4));
-                uint32 S_PixelLookupBL1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 4));
-                uint32 S_PixelLookupBR1 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 4));
-                uint32 S_PixelsTL1 = *(uint32 *)(TexPTR + S_PixelLookupTL1*4);
-                uint32 S_PixelsTR1 = *(uint32 *)(TexPTR + S_PixelLookupTR1*4);
-                uint32 S_PixelsBL1 = *(uint32 *)(TexPTR + S_PixelLookupBL1*4);
-                uint32 S_PixelsBR1 = *(uint32 *)(TexPTR + S_PixelLookupBR1*4);
-
-                uint32 S_PixelLookupTL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 8));
-                uint32 S_PixelLookupTR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 8));
-                uint32 S_PixelLookupBL2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 8));
-                uint32 S_PixelLookupBR2 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 8));
-                uint32 S_PixelsTL2 = *(uint32 *)(TexPTR + S_PixelLookupTL2*4);
-                uint32 S_PixelsTR2 = *(uint32 *)(TexPTR + S_PixelLookupTR2*4);
-                uint32 S_PixelsBL2 = *(uint32 *)(TexPTR + S_PixelLookupBL2*4);
-                uint32 S_PixelsBR2 = *(uint32 *)(TexPTR + S_PixelLookupBR2*4);
-
-                uint32 S_PixelLookupTL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTL, 12));
-                uint32 S_PixelLookupTR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupTR, 12));
-                uint32 S_PixelLookupBL3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBL, 12));
-                uint32 S_PixelLookupBR3 = _mm_cvtsi128_si32(_mm_srli_si128(PixelLookupBR, 12));
-                uint32 S_PixelsTL3 = *(uint32 *)(TexPTR + S_PixelLookupTL3*4);
-                uint32 S_PixelsTR3 = *(uint32 *)(TexPTR + S_PixelLookupTR3*4);
-                uint32 S_PixelsBL3 = *(uint32 *)(TexPTR + S_PixelLookupBL3*4);
-                uint32 S_PixelsBR3 = *(uint32 *)(TexPTR + S_PixelLookupBR3*4);
-
-                __m128i PixelsTL = _mm_setr_epi32(S_PixelsTL0, S_PixelsTL1, S_PixelsTL2, S_PixelsTL3);
-                __m128i PixelsTR = _mm_setr_epi32(S_PixelsTR0, S_PixelsTR1, S_PixelsTR2, S_PixelsTR3);
-                __m128i PixelsBL = _mm_setr_epi32(S_PixelsBL0, S_PixelsBL1, S_PixelsBL2, S_PixelsBL3);
-                __m128i PixelsBR = _mm_setr_epi32(S_PixelsBR0, S_PixelsBR1, S_PixelsBR2, S_PixelsBR3);
-
-                // Unpack each sample into normalized [0, 1] channels.
-                __m128 R_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(               PixelsTL,      FF)), Norm255);
-                __m128 G_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 8),  FF)), Norm255);
-                __m128 B_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 16), FF)), Norm255);
-                __m128 A_TexTL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTL, 24), FF)), Norm255);
-
-                __m128 R_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(               PixelsTR,      FF)), Norm255);
-                __m128 G_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 8),  FF)), Norm255);
-                __m128 B_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 16), FF)), Norm255);
-                __m128 A_TexTR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsTR, 24), FF)), Norm255);
-
-                __m128 R_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(               PixelsBL,      FF)), Norm255);
-                __m128 G_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 8),  FF)), Norm255);
-                __m128 B_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 16), FF)), Norm255);
-                __m128 A_TexBL = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBL, 24), FF)), Norm255);
-
-                __m128 R_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(               PixelsBR,      FF)), Norm255);
-                __m128 G_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 8),  FF)), Norm255);
-                __m128 B_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 16), FF)), Norm255);
-                __m128 A_TexBR = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(PixelsBR, 24), FF)), Norm255);
-
-                // Bilinear interpolation of the four samples.
-                __m128 R_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, R_TexTL),
-                                                     _mm_mul_ps(TexBothYInv, R_TexTR)),
-                                          _mm_add_ps(_mm_mul_ps(TexBothXInv, R_TexBL),
-                                                     _mm_mul_ps(TexBoth, R_TexBR)));
-                __m128 G_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, G_TexTL),
-                                                     _mm_mul_ps(TexBothYInv, G_TexTR)),
-                                          _mm_add_ps(_mm_mul_ps(TexBothXInv, G_TexBL),
-                                                     _mm_mul_ps(TexBoth, G_TexBR)));
-                __m128 B_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, B_TexTL),
-                                                     _mm_mul_ps(TexBothYInv, B_TexTR)),
-                                          _mm_add_ps(_mm_mul_ps(TexBothXInv, B_TexBL),
-                                                     _mm_mul_ps(TexBoth, B_TexBR)));
-                __m128 A_Col = _mm_add_ps(_mm_add_ps(_mm_mul_ps(TexBothInv, A_TexTL),
-                                                     _mm_mul_ps(TexBothYInv, A_TexTR)),
-                                          _mm_add_ps(_mm_mul_ps(TexBothXInv, A_TexBL),
-                                                     _mm_mul_ps(TexBoth, A_TexBR)));
-
-                __m128i R_Out, G_Out, B_Out, A_Out;
-
-                __m128 LayerAlpha = _mm_mul_ps(A_Col, LayerOpacity);
-                __m128 LayerAlphaInv = _mm_sub_ps(One, LayerAlpha);
-
-                __m128 R_Colx2 = _mm_mul_ps(R_Col, Two);
-                __m128 R_ColInv = _mm_sub_ps(One, R_Col);
-
-                __m128 G_Colx2 = _mm_mul_ps(G_Col, Two);
-                __m128 G_ColInv = _mm_sub_ps(One, G_Col);
-
-                __m128 B_Colx2 = _mm_mul_ps(B_Col, Two);
-                __m128 B_ColInv = _mm_sub_ps(One, B_Col);
-
-                __m128 R_Blend = R_Col;
-                __m128 G_Blend = G_Col;
-                __m128 B_Blend = B_Col;
-                __m128 A_Blend = LayerAlpha;
-
-                // The destination read/blend may only be skipped when every lane
-                // that will actually be stored is fully opaque. One opaque lane
-                // must not disable blending for translucent neighbours in the
-                // same group of four.
-                __m128 OpaqueLanes = _mm_cmpeq_ps(LayerAlpha, One);
-                int32 LanesNeedingBlend = _mm_movemask_ps(_mm_andnot_ps(OpaqueLanes, InsideLayer));
-
-                if (LanesNeedingBlend || T.BlendMode != blend_normal)
-                {
-                    __m128i DestPixel = _mm_loadu_si128((const __m128i *)Pixel);
-                    __m128 R_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(               DestPixel,      FF)), Norm255);
-                    __m128 G_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 8),  FF)), Norm255);
-                    __m128 B_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 16), FF)), Norm255);
-                    __m128 A_Dest = _mm_mul_ps(_mm_cvtepi32_ps(_mm_and_si128(_mm_srli_epi32(DestPixel, 24), FF)), Norm255);
-
-                    switch (T.BlendMode)
-                    {
-                        case blend_normal:
-                        {
-                        } break;
-                        case blend_multiply:
-                        {
-                            R_Blend = _mm_mul_ps(R_Dest, R_Col);
-                            G_Blend = _mm_mul_ps(G_Dest, G_Col);
-                            B_Blend = _mm_mul_ps(B_Dest, B_Col);
-                        } break;
-                        case blend_colorburn:
-                        {
-                            // NOTE(fox): A small amount is added to Col since images with zero for alpha may also zero out the
-                            // color channels, causing black clipping.
-                            R_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, R_Dest), _mm_add_ps(R_Col, ClipPrevent)));
-                            G_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, G_Dest), _mm_add_ps(G_Col, ClipPrevent)));
-                            B_Blend = _mm_sub_ps(One, _mm_div_ps(_mm_sub_ps(One, B_Dest), _mm_add_ps(B_Col, ClipPrevent)));
-                        } break;
-                        case blend_linearburn:
-                        {
-                            R_Blend = _mm_sub_ps(_mm_add_ps(R_Dest, R_Col), One);
-                            G_Blend = _mm_sub_ps(_mm_add_ps(G_Dest, G_Col), One);
-                            B_Blend = _mm_sub_ps(_mm_add_ps(B_Dest, B_Col), One);
-                        } break;
-                        case blend_add:
-                        {
-                            R_Blend = _mm_add_ps(R_Dest, R_Col);
-                            G_Blend = _mm_add_ps(G_Dest, G_Col);
-                            B_Blend = _mm_add_ps(B_Dest, B_Col);
-                        } break;
-                        case blend_screen:
-                        {
-                            R_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv));
-                            G_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv));
-                            B_Blend = _mm_sub_ps(One, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv));
-                        } break;
-                        case blend_overlay:
-                        {
-                            // Dest < 0.5 selects the multiply-style branch.
-                            __m128 R_Mask = _mm_cmplt_ps(R_Dest, ZeroPointFive);
-                            __m128 G_Mask = _mm_cmplt_ps(G_Dest, ZeroPointFive);
-                            __m128 B_Mask = _mm_cmplt_ps(B_Dest, ZeroPointFive);
-                            __m128 R_Lower = _mm_mul_ps(Two, _mm_mul_ps(R_Dest, R_Col));
-                            __m128 G_Lower = _mm_mul_ps(Two, _mm_mul_ps(G_Dest, G_Col));
-                            __m128 B_Lower = _mm_mul_ps(Two, _mm_mul_ps(B_Dest, B_Col));
-                            __m128 R_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv)));
-                            __m128 G_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv)));
-                            __m128 B_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv)));
-                            R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
-                            G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
-                            B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
-                        } break;
-                        case blend_softlight:
-                        {
-                            // using Pegtop's equation
-                            R_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, R_Colx2), _mm_mul_ps(R_Dest, R_Dest)), _mm_mul_ps(R_Colx2, R_Dest));
-                            G_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, G_Colx2), _mm_mul_ps(G_Dest, G_Dest)), _mm_mul_ps(G_Colx2, G_Dest));
-                            B_Blend = _mm_add_ps(_mm_mul_ps(_mm_sub_ps(One, B_Colx2), _mm_mul_ps(B_Dest, B_Dest)), _mm_mul_ps(B_Colx2, B_Dest));
-                        } break;
-                        case blend_hardlight:
-                        {
-                            // NOTE(review): this selects on Dest >= 0.5, whereas hard light is
-                            // conventionally keyed on the layer colour being below 0.5.
-                            // Left unchanged to preserve current output -- confirm intent.
-                            __m128 R_Mask = _mm_cmpge_ps(R_Dest, ZeroPointFive);
-                            __m128 G_Mask = _mm_cmpge_ps(G_Dest, ZeroPointFive);
-                            __m128 B_Mask = _mm_cmpge_ps(B_Dest, ZeroPointFive);
-                            __m128 R_Lower = _mm_mul_ps(Two, _mm_mul_ps(R_Dest, R_Col));
-                            __m128 G_Lower = _mm_mul_ps(Two, _mm_mul_ps(G_Dest, G_Col));
-                            __m128 B_Lower = _mm_mul_ps(Two, _mm_mul_ps(B_Dest, B_Col));
-                            __m128 R_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, R_Dest), R_ColInv)));
-                            __m128 G_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, G_Dest), G_ColInv)));
-                            __m128 B_Upper = _mm_sub_ps(One, _mm_mul_ps(Two, _mm_mul_ps(_mm_sub_ps(One, B_Dest), B_ColInv)));
-                            R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
-                            G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
-                            B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
-                        } break;
-                        case blend_subtract:
-                        {
-                            R_Blend = _mm_sub_ps(R_Dest, R_Col);
-                            G_Blend = _mm_sub_ps(G_Dest, G_Col);
-                            B_Blend = _mm_sub_ps(B_Dest, B_Col);
-                        } break;
-                        case blend_divide:
-                        {
-                            R_Blend = _mm_div_ps(R_Dest, _mm_add_ps(R_Col, ClipPrevent));
-                            G_Blend = _mm_div_ps(G_Dest, _mm_add_ps(G_Col, ClipPrevent));
-                            B_Blend = _mm_div_ps(B_Dest, _mm_add_ps(B_Col, ClipPrevent));
-                        } break;
-                        case blend_difference:
-                        {
-                            // Absolute difference: pick whichever subtraction is positive.
-                            __m128 R_Lower = _mm_sub_ps(R_Col, R_Dest);
-                            __m128 G_Lower = _mm_sub_ps(G_Col, G_Dest);
-                            __m128 B_Lower = _mm_sub_ps(B_Col, B_Dest);
-                            __m128 R_Upper = _mm_sub_ps(R_Dest, R_Col);
-                            __m128 G_Upper = _mm_sub_ps(G_Dest, G_Col);
-                            __m128 B_Upper = _mm_sub_ps(B_Dest, B_Col);
-                            __m128 R_Mask = _mm_cmpgt_ps(R_Lower, Zero);
-                            __m128 G_Mask = _mm_cmpgt_ps(G_Lower, Zero);
-                            __m128 B_Mask = _mm_cmpgt_ps(B_Lower, Zero);
-                            R_Blend = _mm_blendv_ps(R_Upper, R_Lower, R_Mask);
-                            G_Blend = _mm_blendv_ps(G_Upper, G_Lower, G_Mask);
-                            B_Blend = _mm_blendv_ps(B_Upper, B_Lower, B_Mask);
-                        } break;
-                        default:
-                        {
-                            // Unknown modes fall back to the unmodified layer colour.
-                        } break;
-                    }
-
-                    R_Blend = _mm_add_ps(_mm_mul_ps(R_Dest, LayerAlphaInv), _mm_mul_ps(R_Blend, LayerAlpha));
-                    G_Blend = _mm_add_ps(_mm_mul_ps(G_Dest, LayerAlphaInv), _mm_mul_ps(G_Blend, LayerAlpha));
-                    B_Blend = _mm_add_ps(_mm_mul_ps(B_Dest, LayerAlphaInv), _mm_mul_ps(B_Blend, LayerAlpha));
-
-                    // Standard behavior in photo apps is for blend modes to
-                    // inherit underlying opacity instead of adding to it.
-                    if (T.BlendMode == blend_normal)
-                    {
-                        A_Blend = _mm_add_ps(A_Dest, LayerAlpha);
-                    }
-                    else
-                    {
-                        A_Blend = A_Dest;
-                    }
-                }
-
-                // Clamp to [0, 1], rescale to 0..255 and repack.
-                R_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, R_Blend), Zero), Reg255));
-                G_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, G_Blend), Zero), Reg255));
-                B_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, B_Blend), Zero), Reg255));
-                A_Out = _mm_cvtps_epi32(_mm_mul_ps(_mm_max_ps(_mm_min_ps(One, A_Blend), Zero), Reg255));
-
-                __m128i OutputPixel = _mm_or_si128(
-                    _mm_or_si128(R_Out, _mm_slli_epi32(G_Out, 8)),
-                    _mm_or_si128(_mm_slli_epi32(B_Out, 16), _mm_slli_epi32(A_Out, 24)));
-                // Only lanes inside the layer are written back.
-                _mm_maskmoveu_si128(OutputPixel, LayerMask, (char *)Pixel);
-            }
-            PixelX = _mm_add_ps(PixelX, Four);
-        }
-    }
-}
-
-#endif
-#endif