summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFox Caminiti <fox@foxcam.net>2022-08-14 12:38:08 -0400
committerFox Caminiti <fox@foxcam.net>2022-08-14 12:38:08 -0400
commit7cfb7ce652d1c13ab72392d95dc93d967bf505fb (patch)
treeb68e0ddbcce7ee5c125c170920b815112c0c3d36
parentbc5375149c0ecb416848a2d3657ea41ae97177b3 (diff)
concave masking; software anti aliasing
-rw-r--r--bitmap_calls.cpp14
-rw-r--r--createcalls.cpp148
-rw-r--r--gl_calls.cpp4
-rw-r--r--main.cpp9
-rw-r--r--my_imgui_widgets.cpp17
-rw-r--r--prenderer.cpp99
6 files changed, 138 insertions, 153 deletions
diff --git a/bitmap_calls.cpp b/bitmap_calls.cpp
index 2031459..dd5c793 100644
--- a/bitmap_calls.cpp
+++ b/bitmap_calls.cpp
@@ -19,6 +19,9 @@ void Bitmap_ConvertPacking(void *Buffer, void *DestBuffer, uint16 Width, uint16
uint32 XLookup = (X >> 2)*16 + (X % 4);
uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
uint32 PixelToSeek = XLookup + YLookup;
+ if (Y == 48 && X == 0)
+ uint8 war = 0;
+ // if (YLookup == 2500 && XLookup == 1)
uint8 *DPixel, *Pixel;
if (Which == 0) {
DPixel = Temp + PixelToSeek*BytesPerPixel;
@@ -167,6 +170,8 @@ Bitmap_CopyToPointer(void *Input, void *Output, uint16 BytesPerPixel, uint64 Tot
uint64 RemainderBytes = TotalBytes % ByteOffset;
while (bytes <= TotalBytes - RemainderBytes) {
+ if (bytes > 2496*4)
+ int pp = 0;
uint8 *Pixel = (uint8 *)Row + bytes;
uint8 *Pixel2 = (uint8 *)Row2 + bytes;
if (InstructionMode == instruction_mode_avx) {
@@ -233,15 +238,6 @@ BitmapPackRGB(pixel_buffer *Buffer) {
}
static void
-OutputToViewport(pixel_buffer *CompBuffer, project_state *State, GLuint textureID) {
- Convert4x4Chunk(CompBuffer, 1);
- EndRenderState(State);
- glBindTexture(GL_TEXTURE_2D, textureID);
- glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer->Width, CompBuffer->Height, GL_RGBA, GL_UNSIGNED_BYTE,
- CompBuffer->EffectBuffer);
-}
-
-static void
DebugFillSolid(pixel_buffer *Raster, v4 Color)
{
uint32 ColS = ColToUint32(Color);
diff --git a/createcalls.cpp b/createcalls.cpp
index 4ddaa7e..89d881b 100644
--- a/createcalls.cpp
+++ b/createcalls.cpp
@@ -217,13 +217,16 @@ Mask_TriangulateAndRasterize(memory *Memory, project_layer *Layer, mask *Mask)
glBindFramebuffer(GL_FRAMEBUFFER, Test.FramebufferObject);
glEnable(GL_STENCIL_TEST);
- glStencilOp(GL_KEEP, GL_REPLACE, GL_REPLACE);
+ // glStencilOp(GL_KEEP, GL_REPLACE, GL_REPLACE);
+ glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP);
glClearColor(0.0f, 0.0f, 0.0f, 0.0f);
glClear(GL_COLOR_BUFFER_BIT | GL_STENCIL_BUFFER_BIT);
- glStencilFunc(GL_ALWAYS, 1, 0xFF); // always write
+ glStencilFunc(GL_ALWAYS, 0, 0xFF); // always write
glStencilMask(0xff); // allow writing; ANDs any writes to the stencil buffer with this
+ glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
+
glUseProgram(MaskShaderProgram);
// secondary VBO
@@ -239,11 +242,21 @@ Mask_TriangulateAndRasterize(memory *Memory, project_layer *Layer, mask *Mask)
int Scale = glGetUniformLocation(MaskShaderProgram, "CompDimensions");
glUniform3f(Scale, (real32)Layer->Source->Info.Width, (real32)Layer->Source->Info.Height, 0);
+
+ glStencilOpSeparate(GL_FRONT, GL_KEEP, GL_KEEP, GL_INCR_WRAP);
+ glStencilOpSeparate(GL_BACK, GL_KEEP, GL_KEEP, GL_DECR_WRAP);
+
+ glDisable(GL_CULL_FACE);
+
glDrawArrays(GL_TRIANGLE_FAN, 0, Mask->NumberOfVerts);
+ // glEnable(GL_CULL_FACE);
+
+ glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+
glBindVertexArray(0);
- glStencilFunc(GL_EQUAL, 1, 0xFF);
- glStencilMask(0x00); // disables stencil writing
+ // glStencilFunc(GL_EQUAL, 1, 0xFF);
+ // glStencilMask(0x00); // disables stencil writing
glBindRenderbuffer(GL_RENDERBUFFER, Test.Color_Renderbuffer);
glUseProgram(DefaultShaderProgram);
@@ -258,7 +271,13 @@ Mask_TriangulateAndRasterize(memory *Memory, project_layer *Layer, mask *Mask)
glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 5 * sizeof(float), (void*)(3 * sizeof(float)));
glEnableVertexAttribArray(1);
- glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0);
+ //glStencilFunc(GL_EQUAL, 0, 0xFF);
+ //glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP);
+ //glDrawElements(GL_TRIANGLE_STRIP, 6, GL_UNSIGNED_INT, 0);
+
+ glStencilFunc(GL_NOTEQUAL, 0, 0xFF);
+ glStencilOp(GL_ZERO, GL_ZERO, GL_ZERO);
+ glDrawElements(GL_TRIANGLE_STRIP, 6, GL_UNSIGNED_INT, 0);
glDisable(GL_STENCIL_TEST);
glStencilMask(0xFF);
@@ -287,8 +306,9 @@ Layer_UpdateBitmap(project_layer *Layer, memory *Memory, int32 CurrentFrame) {
uint16 Height = Source->Info.Height;
uint16 BytesPerPixel = Source->Info.BytesPerPixel;
void *DestBuffer = BitmapInfo->BitmapBuffer;
- uint64 Size = Bitmap_CalcUnpackedBytes(Source->Info.Width, Source->Info.Height, Source->Info.BytesPerPixel);
- Bitmap_CopyToPointer(Bitmap->Data, DestBuffer, BytesPerPixel, Size);
+ uint64 UnpackedSize = Bitmap_CalcUnpackedBytes(Source->Info.Width, Source->Info.Height, Source->Info.BytesPerPixel);
+ uint64 PackedSize = Bitmap_CalcTotalBytes(Source->Info.Width, Source->Info.Height, Source->Info.BytesPerPixel);
+ Bitmap_CopyToPointer(Bitmap->Data, DestBuffer, BytesPerPixel, UnpackedSize);
TestGL_InitTexture(&BitmapInfo->Test, DestBuffer, Width, Height);
@@ -305,7 +325,7 @@ Layer_UpdateBitmap(project_layer *Layer, memory *Memory, int32 CurrentFrame) {
Layer->Effect[i]->func(Source, BitmapInfo, Memory, Layer->Effect[i]->Property);
}
Bitmap_ConvertPacking(DestBuffer, Memory->Scratch, Width, Height, BytesPerPixel, 0);
- Bitmap_CopyToPointer(Memory->Scratch, DestBuffer, BytesPerPixel, Size);
+ Bitmap_CopyToPointer(Memory->Scratch, DestBuffer, BytesPerPixel, PackedSize);
}
static void
@@ -418,6 +438,7 @@ LoadTestFootage(project_data *File, project_state *State, memory *Memory)
PostMsg(State, "File open fail...");
source *Source = &File->Source[0];
Layer_CreateFromSource(File, State, Memory, Source);
+
SelectLayer(File->Layer[0], State, 0);
// AddEffect(File->Layer[0], Memory, 3);
@@ -447,9 +468,13 @@ LoadTestFootage(project_data *File, project_state *State, memory *Memory)
Mask->Point[3].HandleBezier = true;
Mask->Point[4].HandleBezier = true;
-
Mask->NumberOfPoints = 5;
+ // if (!Source_Generate(File, Memory, "../asset/test.png"))
+ // PostMsg(State, "File open fail...");
+ if (!Source_Generate(File, Memory, "../asset/debug.png"))
+ PostMsg(State, "File open fail...");
+
// property_channel *Property = &File->Layer[0]->x;
// ManualKeyframeInsertF(Property, Memory, 1, 500);
// ManualKeyframeInsertF(Property, Memory, 30, 800);
@@ -538,108 +563,6 @@ CreateGrid(project_data *File, memory *Memory) {
}
}
#endif
-#if 0
-static void
-DrawHistogram(project_layer *Layer, pixel_buffer *UIBuffer, void *Scratch, memory *Memory, sdl_input Input, project_state *State,
- rectangle Box)
-{
- uint16 Padding = 20; //UI->LayerPadding / 5;
- uint16 Margin = 100;
-
- uint16 *Levels = (uint16 *)Scratch;
-
- uint16 *Mean = (Levels + 256*7);
-
- uint32 Color = 0;
- uint32 AltColor = ColToUint32(V4(0.1,0.1,0.1,1.0));
-
- // this is a bad idea
- real32 *Zoom = (real32 *)(Levels + 256*6);
- if (*Zoom < 0.0f)
- *Zoom = 0.0f;
- uint16 *SelectedChannel = (uint16 *)(Levels + 256*6 + 3);
-
- if (*SelectedChannel == 0) {
- Color = ColToUint32(V4(0.6,0.6,0.6,1.0));
- } else if (*SelectedChannel == 1) {
- Levels += 256;
- Color = ColToUint32(V4(0.6,0.0,0.0,1.0));
- } else if (*SelectedChannel == 2) {
- Levels += 256*2;
- Color = ColToUint32(V4(0.0,0.6,0.0,1.0));
- } else if (*SelectedChannel == 3) {
- Levels += 256*3;
- Color = ColToUint32(V4(0.0,0.0,0.6,1.0));
- } else if (*SelectedChannel == 4) {
- Levels += 256*4;
- Color = ColToUint32(V4(0.9,0.9,0.9,1.0));
- }
-
-
- /*
- if (TestRectangle(Box, Input.Mouse) &&
- Input.MouseButton[0].IsDown)
- {
- State->ArbitrarySlide = 1;
- State->Sliding.RandomPointer = Zoom;
- }
- */
-
- uint8 *Row = ((uint8 *)UIBuffer->OriginalBuffer +
- UIBuffer->BytesPerPixel +
- UIBuffer->Pitch);
- for (int Y = 0;
- Y > Box.Min.y;
- Y--)
- {
- uint32 *Pixel = (uint32 *)Row + Box.Min.x;
- for(int X = Box.Min.x;
- X < Box.Max.x;
- ++X)
- {
- real32 Span = (Box.Max.x - Box.Min.x) / 256.0f;
- int16 XLocal = (X - Box.Min.x) / Span;
- int16 YLocal = -(Y - Box.Max.y);
- if (*(Levels + XLocal) > (YLocal * RoundReal32ToInt32(*Zoom)) && XLocal < 256)
- *Pixel++ = Color;
- else
- *Pixel++ = AltColor;
- }
- Row -= UIBuffer->Pitch;
- }
-}
-
-static pixel_buffer
-CreateSolidBitmap(memory *Memory, uint16 Height, uint16 Width, v4 Color) {
- pixel_buffer Buffer = {};
- Buffer.BytesPerPixel = 4;
- Buffer.Height = Height;
- Buffer.Width = Width;
- CalculateFull(&Buffer);
- Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel;
- Buffer.OriginalBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch);
- Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch);
- DebugFillSolid(&Buffer, Color);
- BitmapPackRGB(&Buffer);
- Buffer.ToUpdate = true;
- return Buffer;
-}
-
-static pixel_buffer
-CreateDebugBitmap(memory *Memory, uint16 Height, uint16 Width) {
- pixel_buffer Buffer = {};
- Buffer.BytesPerPixel = 4;
- Buffer.Height = Height;
- Buffer.Width = Width;
- CalculateFull(&Buffer);
- Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel;
- Buffer.OriginalBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch);
- Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch);
- DebugBitmap(&Buffer);
- BitmapPackRGB(&Buffer);
- Buffer.ToUpdate = true;
- return Buffer;
-}
/*
{
@@ -659,6 +582,3 @@ CreateDebugBitmap(memory *Memory, uint16 Height, uint16 Width) {
Layer->EndFrame = File.EndFrame;
}
*/
-
-
-#endif
diff --git a/gl_calls.cpp b/gl_calls.cpp
index 4474b44..68c2882 100644
--- a/gl_calls.cpp
+++ b/gl_calls.cpp
@@ -152,6 +152,8 @@ static void TestGL_InitDefaultVerts() {
1, 2, 3
};
+ glEnable(GL_MULTISAMPLE);
+
// Indices!
glGenVertexArrays(1, &DefaultVerts.VertexArrayObject);
@@ -194,7 +196,7 @@ TestGL_InitTexture(gl_effect_layer *Test, void *Data, uint16 Width, uint16 Heigh
GLuint Stencil_Renderbuffer = 0;
glGenRenderbuffers(1, &Stencil_Renderbuffer);
glBindRenderbuffer( GL_RENDERBUFFER, (GLuint)Stencil_Renderbuffer );
- glRenderbufferStorage( GL_RENDERBUFFER, GL_STENCIL_INDEX8, Width, Height );
+ glRenderbufferStorage(GL_RENDERBUFFER, GL_STENCIL_INDEX8, Width, Height );
glBindFramebuffer(GL_FRAMEBUFFER, Test->FramebufferObject);
diff --git a/main.cpp b/main.cpp
index 84d3559..2153ae6 100644
--- a/main.cpp
+++ b/main.cpp
@@ -237,7 +237,7 @@ int main(int argc, char *argv[]) {
InitMemoryTable(&GlobalMemory, &Memory, 10 * 1024 * 1024, F_Strings, "Strings");
InitMemoryTable(&GlobalMemory, &Memory, (uint64)200 * 1024 * 1024, B_LayerBitmaps, "Layer buffer");
- InitMemoryTable(&GlobalMemory, &Memory, (uint64)200 * 1024 * 1024, B_LoadedBitmaps, "Loaded bitmap buffer");
+ InitMemoryTable(&GlobalMemory, &Memory, (uint64)600 * 1024 * 1024, B_LoadedBitmaps, "Loaded bitmap buffer");
Memory.Scratch = AllocateMemory(&Memory, (uint64)64*1024*1024, B_LayerBitmaps);
@@ -509,7 +509,12 @@ int main(int argc, char *argv[]) {
}
}
#else
- OutputToViewport(&CompBuffer, &State, textureID);
+ Bitmap_ConvertPacking(CompBuffer.PackedBuffer, CompBuffer.UnpackedBuffer,
+ CompBuffer.Width, CompBuffer.Height, CompBuffer.BytesPerPixel, 1);
+ EndRenderState(&State);
+ glBindTexture(GL_TEXTURE_2D, textureID);
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer.Width, CompBuffer.Height, GL_RGBA, GL_UNSIGNED_BYTE,
+ CompBuffer.UnpackedBuffer);
#endif
ImGui::Render();
diff --git a/my_imgui_widgets.cpp b/my_imgui_widgets.cpp
index e334f63..656804d 100644
--- a/my_imgui_widgets.cpp
+++ b/my_imgui_widgets.cpp
@@ -597,6 +597,14 @@ ImGui_Viewport(project_data File, project_state *State, ui *UI, comp_buffer Comp
}
*/
+ ImGui::OpenPopupOnItemClick("context", ImGuiPopupFlags_MouseButtonMiddle);
+ if (ImGui::BeginPopup("context")) {
+ if (ImGui::MenuItem("Scalar", NULL, false, InstructionMode != instruction_mode_scalar)) { InstructionMode = instruction_mode_scalar; State->UpdateFrame = true; }
+ if (ImGui::MenuItem("SSE", NULL, false, InstructionMode != instruction_mode_sse)) { InstructionMode = instruction_mode_sse; State->UpdateFrame = true; }
+ if (ImGui::MenuItem("AVX2", NULL, false, InstructionMode != instruction_mode_avx)) { InstructionMode = instruction_mode_avx; State->UpdateFrame = true; }
+ ImGui::EndPopup();
+ }
+
if (IsHovered && IsActivated && ImGui::IsMouseDown(ImGuiMouseButton_Left))
{
// Point to zoom in on if Z is held
@@ -643,15 +651,6 @@ ImGui_Viewport(project_data File, project_state *State, ui *UI, comp_buffer Comp
}
- ImGui::OpenPopupOnItemClick("context", ImGuiPopupFlags_MouseButtonMiddle);
- if (ImGui::BeginPopup("context")) {
- if (ImGui::MenuItem("Scalar", NULL, false, InstructionMode != instruction_mode_scalar)) { InstructionMode = instruction_mode_scalar; State->UpdateFrame = true; }
- if (ImGui::MenuItem("SSE", NULL, false, InstructionMode != instruction_mode_sse)) { InstructionMode = instruction_mode_sse; State->UpdateFrame = true; }
- if (ImGui::MenuItem("AVX2", NULL, false, InstructionMode != instruction_mode_avx)) { InstructionMode = instruction_mode_avx; State->UpdateFrame = true; }
- ImGui::EndPopup();
- }
-
-
ImGui::End();
}
diff --git a/prenderer.cpp b/prenderer.cpp
index 7550d0f..940cb0a 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -376,6 +376,9 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
LayerBounds.Min.x -= LayerBounds.Min.x % 4;
LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+ uint16 WidthP, HeightP;
+ Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
uint8 *TexPTR = (uint8 *)T.SourceBuffer;
Assert(LayerBounds.Max.x <= Buffer->Width);
Assert(LayerBounds.Max.y <= Buffer->Height);
@@ -398,15 +401,23 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 One = _mm256_set1_ps(1);
__m256 Two = _mm256_set1_ps(2);
__m256 Zero = _mm256_set1_ps(0);
+ // __m256 UMin = _mm256_set1_ps(0.0f - (1 / T.LayerWidth));
+ // __m256 VMin = _mm256_set1_ps(0.0f - (1 / T.LayerHeight));
+ // __m256 UMax = _mm256_set1_ps(1.0f + (1 / T.LayerWidth));
+ __m256 VMax = _mm256_set1_ps(1.0f - (1 / T.LayerHeight));
+
+ __m256 ZeroPoint25 = _mm256_set1_ps(0.25);
__m256 ZeroPointFive = _mm256_set1_ps(0.5);
__m256i Zeroi = _mm256_set1_epi32(0);
__m256i Onei = _mm256_set1_epi32(1);
__m256 Four = _mm256_set1_ps(4);
__m256 Sixteen = _mm256_set1_ps(16);
__m256i FF = _mm256_set1_epi32(0xFF);
+ __m256i Full = _mm256_set1_epi32(0xFFFFFFFF);
__m256i BottomTwoBits = _mm256_set1_epi32(0x03);
__m256i Fouri = _mm256_set1_epi32(4);
__m256i Sixteeni = _mm256_set1_epi32(16);
+ __m256i SixtyFouri = _mm256_set1_epi32(64);
__m256 Real255 = _mm256_set1_ps(255.0f);
__m256i Int255 = _mm256_set1_epi32(255);
__m256 Norm255 = _mm256_set1_ps(1/255.0f);
@@ -442,11 +453,25 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
{
IACA_START;
- __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
+ // TODO(fox): Tried an MSAA technique for anti-aliasing, but the result still looks pretty rough.
+ __m256 X0 = _mm256_set1_ps(0.30);
+ __m256 Y0 = _mm256_set1_ps(0.10);
+ __m256 X1 = _mm256_set1_ps(0.80);
+ __m256 Y1 = _mm256_set1_ps(0.35);
+ __m256 X2 = _mm256_set1_ps(0.05);
+ __m256 Y2 = _mm256_set1_ps(0.60);
+ __m256 X3 = _mm256_set1_ps(0.55);
+ __m256 Y3 = _mm256_set1_ps(0.85);
- // TODO(fox): Not unwraping this function may lose a few cycles!
- uint16 WidthP, HeightP;
- Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+ __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
+ __m256 StartVectorX0 = _mm256_add_ps(StartVectorX, X0);
+ __m256 StartVectorY0 = _mm256_add_ps(StartVectorY, Y0);
+ __m256 StartVectorX1 = _mm256_add_ps(StartVectorX, X1);
+ __m256 StartVectorY1 = _mm256_add_ps(StartVectorY, Y1);
+ __m256 StartVectorX2 = _mm256_add_ps(StartVectorX, X2);
+ __m256 StartVectorY2 = _mm256_add_ps(StartVectorY, Y2);
+ __m256 StartVectorX3 = _mm256_add_ps(StartVectorX, X3);
+ __m256 StartVectorY3 = _mm256_add_ps(StartVectorY, Y3);
uint32 XLookup = (X >> 2)*16 + (X % 4);
uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
@@ -456,13 +481,39 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY));
__m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY));
- __m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)),
- _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2))));
+ __m256 U0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, XAxisPX), _mm256_mul_ps(StartVectorY0, XAxisPY));
+ __m256 V0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, YAxisPX), _mm256_mul_ps(StartVectorY0, YAxisPY));
+ __m256 U1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, XAxisPX), _mm256_mul_ps(StartVectorY1, XAxisPY));
+ __m256 V1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, YAxisPX), _mm256_mul_ps(StartVectorY1, YAxisPY));
+ __m256 U2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, XAxisPX), _mm256_mul_ps(StartVectorY2, XAxisPY));
+ __m256 V2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, YAxisPX), _mm256_mul_ps(StartVectorY2, YAxisPY));
+ __m256 U3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, XAxisPX), _mm256_mul_ps(StartVectorY3, XAxisPY));
+ __m256 V3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, YAxisPX), _mm256_mul_ps(StartVectorY3, YAxisPY));
+
+ __m256 LayerMask0 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U0, Zero, 13), _mm256_cmp_ps(U0, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V0, Zero, 13), _mm256_cmp_ps(V0, One, 1)));
+ __m256 LayerMask1 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U1, Zero, 13), _mm256_cmp_ps(U1, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V1, Zero, 13), _mm256_cmp_ps(V1, One, 1)));
+ __m256 LayerMask2 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U2, Zero, 13), _mm256_cmp_ps(U2, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V2, Zero, 13), _mm256_cmp_ps(V2, One, 1)));
+ __m256 LayerMask3 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U3, Zero, 13), _mm256_cmp_ps(U3, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V3, Zero, 13), _mm256_cmp_ps(V3, One, 1)));
+
+ // Each point that passes adds .25
+ __m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)),
+ _mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25)));
+
+ // Zero - no points pass
+ // One - all points pass; not an edge
+ __m256i Mask = _mm256_cmp_ps(Avg, Zero, 14);
+ __m256i NonEdge = _mm256_cmp_ps(Avg, One, 13);
// If all of the pixels are zeroed in the mask (aka fall outside
// the UV lookup), we can skip the iteration.
- if (_mm256_movemask_epi8(LayerMask))
+ if (_mm256_movemask_epi8(Mask))
{
+ __m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask);
+
U = _mm256_max_ps(_mm256_min_ps(One, U), Zero);
V = _mm256_max_ps(_mm256_min_ps(One, V), Zero);
@@ -472,6 +523,8 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
__m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
__m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
+ if (T.LayerWidth == 50 && _mm256_cvtsi256_si32(TexYIntPlusOne) == 49)
+ int pp = 0;
// NOTE(fox): The comparison is for when we're on the last pixel of the texel.
__m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
@@ -540,6 +593,12 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
_mm256_add_ps(_mm256_mul_ps(TexBothXInv, A_TexBL),
_mm256_mul_ps(TexBoth, A_TexBR)));
+ // Apply anti-aliasing to edges if there are any
+ if (_mm256_movemask_epi8(EdgeMask))
+ {
+ A_Col = _mm256_blendv_ps(A_Col, _mm256_mul_ps(A_Col, Avg), EdgeMask);
+ }
+
__m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity);
__m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha);
@@ -559,7 +618,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 A_Blend = LayerAlpha;
// Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it).
- if (!_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 0)) || T.BlendMode != blend_normal)
+ if (_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 2)) || T.BlendMode != blend_normal)
{
__m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
__m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255);
@@ -691,8 +750,8 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
_mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
_mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
- // __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask);
- _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel);
+
+ _mm256_maskstore_epi32((int *)Pixel, Mask, OutputPixel);
}
PixelX = _mm256_add_ps(PixelX, Four);
}
@@ -708,6 +767,9 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
LayerBounds.Min.x -= LayerBounds.Min.x % 4;
LayerBounds.Min.y -= LayerBounds.Min.y % 4;
+ uint16 WidthP, HeightP;
+ Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
+
uint8 *TexPTR = (uint8 *)T.SourceBuffer;
Assert(LayerBounds.Max.x <= Buffer->Width);
Assert(LayerBounds.Max.y <= Buffer->Height);
@@ -761,11 +823,6 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);
-
- // TODO(fox): Not unwraping this function may lose a few cycles!
- uint16 WidthP, HeightP;
- Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP);
-
uint32 XLookup = (X >> 2)*16 + (X % 4);
uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
uint32 PixelToSeek = XLookup + YLookup;
@@ -774,8 +831,8 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY));
__m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY));
- __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)),
- _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One))));
+ __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmplt_ps(U, One)),
+ _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmplt_ps(V, One))));
if (_mm_movemask_epi8(LayerMask))
{
@@ -1082,7 +1139,7 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY);
- if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) {
+ if (U < 1.0f && U >= 0.0f && V < 1.0f && V >= 0.0f) {
real32 TexXFull = U * T.LayerWidth;
uint32 TexXInt = (uint32)TexXFull;
@@ -1092,6 +1149,12 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
uint32 TexYInt = (uint32)TexYFull;
real32 TexY = TexYFull - TexYInt;
+ if(T.LayerWidth == 50)
+ real32 pp = 0;
+
+ if(TexYInt > 47 && T.LayerWidth == 50)
+ real32 pp = 0;
+
real32 TexXInv = 1 - TexX;
real32 TexYInv = 1 - TexY;
real32 TexBothXInv = TexXInv * TexY;