diff options
-rw-r--r-- | bitmap_calls.cpp | 14 | ||||
-rw-r--r-- | createcalls.cpp | 148 | ||||
-rw-r--r-- | gl_calls.cpp | 4 | ||||
-rw-r--r-- | main.cpp | 9 | ||||
-rw-r--r-- | my_imgui_widgets.cpp | 17 | ||||
-rw-r--r-- | prenderer.cpp | 99 |
6 files changed, 138 insertions, 153 deletions
diff --git a/bitmap_calls.cpp b/bitmap_calls.cpp index 2031459..dd5c793 100644 --- a/bitmap_calls.cpp +++ b/bitmap_calls.cpp @@ -19,6 +19,9 @@ void Bitmap_ConvertPacking(void *Buffer, void *DestBuffer, uint16 Width, uint16 uint32 XLookup = (X >> 2)*16 + (X % 4); uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4; uint32 PixelToSeek = XLookup + YLookup; + if (Y == 48 && X == 0) + uint8 war = 0; + // if (YLookup == 2500 && XLookup == 1) uint8 *DPixel, *Pixel; if (Which == 0) { DPixel = Temp + PixelToSeek*BytesPerPixel; @@ -167,6 +170,8 @@ Bitmap_CopyToPointer(void *Input, void *Output, uint16 BytesPerPixel, uint64 Tot uint64 RemainderBytes = TotalBytes % ByteOffset; while (bytes <= TotalBytes - RemainderBytes) { + if (bytes > 2496*4) + int pp = 0; uint8 *Pixel = (uint8 *)Row + bytes; uint8 *Pixel2 = (uint8 *)Row2 + bytes; if (InstructionMode == instruction_mode_avx) { @@ -233,15 +238,6 @@ BitmapPackRGB(pixel_buffer *Buffer) { } static void -OutputToViewport(pixel_buffer *CompBuffer, project_state *State, GLuint textureID) { - Convert4x4Chunk(CompBuffer, 1); - EndRenderState(State); - glBindTexture(GL_TEXTURE_2D, textureID); - glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer->Width, CompBuffer->Height, GL_RGBA, GL_UNSIGNED_BYTE, - CompBuffer->EffectBuffer); -} - -static void DebugFillSolid(pixel_buffer *Raster, v4 Color) { uint32 ColS = ColToUint32(Color); diff --git a/createcalls.cpp b/createcalls.cpp index 4ddaa7e..89d881b 100644 --- a/createcalls.cpp +++ b/createcalls.cpp @@ -217,13 +217,16 @@ Mask_TriangulateAndRasterize(memory *Memory, project_layer *Layer, mask *Mask) glBindFramebuffer(GL_FRAMEBUFFER, Test.FramebufferObject); glEnable(GL_STENCIL_TEST); - glStencilOp(GL_KEEP, GL_REPLACE, GL_REPLACE); + // glStencilOp(GL_KEEP, GL_REPLACE, GL_REPLACE); + glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP); glClearColor(0.0f, 0.0f, 0.0f, 0.0f); glClear(GL_COLOR_BUFFER_BIT | GL_STENCIL_BUFFER_BIT); - glStencilFunc(GL_ALWAYS, 1, 0xFF); // always write + glStencilFunc(GL_ALWAYS, 0, 0xFF); // always write glStencilMask(0xff); // allow writing; ANDs any writes to the stencil buffer with this + glColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE); + glUseProgram(MaskShaderProgram); // secondary VBO @@ -239,11 +242,21 @@ Mask_TriangulateAndRasterize(memory *Memory, project_layer *Layer, mask *Mask) int Scale = glGetUniformLocation(MaskShaderProgram, "CompDimensions"); glUniform3f(Scale, (real32)Layer->Source->Info.Width, (real32)Layer->Source->Info.Height, 0); + + glStencilOpSeparate(GL_FRONT, GL_KEEP, GL_KEEP, GL_INCR_WRAP); + glStencilOpSeparate(GL_BACK, GL_KEEP, GL_KEEP, GL_DECR_WRAP); + + glDisable(GL_CULL_FACE); + glDrawArrays(GL_TRIANGLE_FAN, 0, Mask->NumberOfVerts); + // glEnable(GL_CULL_FACE); + + glColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); + glBindVertexArray(0); - glStencilFunc(GL_EQUAL, 1, 0xFF); - glStencilMask(0x00); // disables stencil writing + // glStencilFunc(GL_EQUAL, 1, 0xFF); + // glStencilMask(0x00); // disables stencil writing glBindRenderbuffer(GL_RENDERBUFFER, Test.Color_Renderbuffer); glUseProgram(DefaultShaderProgram); @@ -258,7 +271,13 @@ Mask_TriangulateAndRasterize(memory *Memory, project_layer *Layer, mask *Mask) glVertexAttribPointer(1, 2, GL_FLOAT, GL_FALSE, 5 * sizeof(float), (void*)(3 * sizeof(float))); glEnableVertexAttribArray(1); - glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_INT, 0); + //glStencilFunc(GL_EQUAL, 0, 0xFF); + //glStencilOp(GL_KEEP, GL_KEEP, GL_KEEP); + //glDrawElements(GL_TRIANGLE_STRIP, 6, GL_UNSIGNED_INT, 0); + + glStencilFunc(GL_NOTEQUAL, 0, 0xFF); + glStencilOp(GL_ZERO, GL_ZERO, GL_ZERO); + glDrawElements(GL_TRIANGLE_STRIP, 6, GL_UNSIGNED_INT, 0); glDisable(GL_STENCIL_TEST); glStencilMask(0xFF); @@ -287,8 +306,9 @@ Layer_UpdateBitmap(project_layer *Layer, memory *Memory, int32 CurrentFrame) { uint16 Height = Source->Info.Height; uint16 BytesPerPixel = Source->Info.BytesPerPixel; void *DestBuffer = BitmapInfo->BitmapBuffer; - uint64 Size = Bitmap_CalcUnpackedBytes(Source->Info.Width, Source->Info.Height, Source->Info.BytesPerPixel); - Bitmap_CopyToPointer(Bitmap->Data, DestBuffer, BytesPerPixel, Size); + uint64 UnpackedSize = Bitmap_CalcUnpackedBytes(Source->Info.Width, Source->Info.Height, Source->Info.BytesPerPixel); + uint64 PackedSize = Bitmap_CalcTotalBytes(Source->Info.Width, Source->Info.Height, Source->Info.BytesPerPixel); + Bitmap_CopyToPointer(Bitmap->Data, DestBuffer, BytesPerPixel, UnpackedSize); TestGL_InitTexture(&BitmapInfo->Test, DestBuffer, Width, Height); @@ -305,7 +325,7 @@ Layer_UpdateBitmap(project_layer *Layer, memory *Memory, int32 CurrentFrame) { Layer->Effect[i]->func(Source, BitmapInfo, Memory, Layer->Effect[i]->Property); } Bitmap_ConvertPacking(DestBuffer, Memory->Scratch, Width, Height, BytesPerPixel, 0); - Bitmap_CopyToPointer(Memory->Scratch, DestBuffer, BytesPerPixel, Size); + Bitmap_CopyToPointer(Memory->Scratch, DestBuffer, BytesPerPixel, PackedSize); } static void @@ -418,6 +438,7 @@ LoadTestFootage(project_data *File, project_state *State, memory *Memory) PostMsg(State, "File open fail..."); source *Source = &File->Source[0]; Layer_CreateFromSource(File, State, Memory, Source); + SelectLayer(File->Layer[0], State, 0); // AddEffect(File->Layer[0], Memory, 3); @@ -447,9 +468,13 @@ LoadTestFootage(project_data *File, project_state *State, memory *Memory) Mask->Point[3].HandleBezier = true; Mask->Point[4].HandleBezier = true; - Mask->NumberOfPoints = 5; + // if (!Source_Generate(File, Memory, "../asset/test.png")) + // PostMsg(State, "File open fail..."); + if (!Source_Generate(File, Memory, "../asset/debug.png")) + PostMsg(State, "File open fail..."); + // property_channel *Property = &File->Layer[0]->x; // ManualKeyframeInsertF(Property, Memory, 1, 500); // ManualKeyframeInsertF(Property, Memory, 30, 800); @@ -538,108 +563,6 @@ CreateGrid(project_data *File, memory *Memory) { } } #endif -#if 0 -static void -DrawHistogram(project_layer *Layer, pixel_buffer *UIBuffer, void *Scratch, memory *Memory, sdl_input Input, project_state *State, - rectangle Box) -{ - uint16 Padding = 20; //UI->LayerPadding / 5; - uint16 Margin = 100; - - uint16 *Levels = (uint16 *)Scratch; - - uint16 *Mean = (Levels + 256*7); - - uint32 Color = 0; - uint32 AltColor = ColToUint32(V4(0.1,0.1,0.1,1.0)); - - // this is a bad idea - real32 *Zoom = (real32 *)(Levels + 256*6); - if (*Zoom < 0.0f) - *Zoom = 0.0f; - uint16 *SelectedChannel = (uint16 *)(Levels + 256*6 + 3); - - if (*SelectedChannel == 0) { - Color = ColToUint32(V4(0.6,0.6,0.6,1.0)); - } else if (*SelectedChannel == 1) { - Levels += 256; - Color = ColToUint32(V4(0.6,0.0,0.0,1.0)); - } else if (*SelectedChannel == 2) { - Levels += 256*2; - Color = ColToUint32(V4(0.0,0.6,0.0,1.0)); - } else if (*SelectedChannel == 3) { - Levels += 256*3; - Color = ColToUint32(V4(0.0,0.0,0.6,1.0)); - } else if (*SelectedChannel == 4) { - Levels += 256*4; - Color = ColToUint32(V4(0.9,0.9,0.9,1.0)); - } - - - /* - if (TestRectangle(Box, Input.Mouse) && - Input.MouseButton[0].IsDown) - { - State->ArbitrarySlide = 1; - State->Sliding.RandomPointer = Zoom; - } - */ - - uint8 *Row = ((uint8 *)UIBuffer->OriginalBuffer + - UIBuffer->BytesPerPixel + - UIBuffer->Pitch); - for (int Y = 0; - Y > Box.Min.y; - Y--) - { - uint32 *Pixel = (uint32 *)Row + Box.Min.x; - for(int X = Box.Min.x; - X < Box.Max.x; - ++X) - { - real32 Span = (Box.Max.x - Box.Min.x) / 256.0f; - int16 XLocal = (X - Box.Min.x) / Span; - int16 YLocal = -(Y - Box.Max.y); - if (*(Levels + XLocal) > (YLocal * RoundReal32ToInt32(*Zoom)) && XLocal < 256) - *Pixel++ = Color; - else - *Pixel++ = AltColor; - } - Row -= UIBuffer->Pitch; - } -} - -static pixel_buffer -CreateSolidBitmap(memory *Memory, uint16 Height, uint16 Width, v4 Color) { - pixel_buffer Buffer = {}; - Buffer.BytesPerPixel = 4; - Buffer.Height = Height; - Buffer.Width = Width; - CalculateFull(&Buffer); - Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel; - Buffer.OriginalBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch); - Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch); - DebugFillSolid(&Buffer, Color); - BitmapPackRGB(&Buffer); - Buffer.ToUpdate = true; - return Buffer; -} - -static pixel_buffer -CreateDebugBitmap(memory *Memory, uint16 Height, uint16 Width) { - pixel_buffer Buffer = {}; - Buffer.BytesPerPixel = 4; - Buffer.Height = Height; - Buffer.Width = Width; - CalculateFull(&Buffer); - Buffer.Pitch = Buffer.FullWidth*Buffer.BytesPerPixel; - Buffer.OriginalBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch); - Buffer.EffectBuffer = AllocateMemory(Memory, Buffer.FullHeight * Buffer.FullWidth * Buffer.BytesPerPixel, B_Scratch); - DebugBitmap(&Buffer); - BitmapPackRGB(&Buffer); - Buffer.ToUpdate = true; - return Buffer; -} /* { @@ -659,6 +582,3 @@ CreateDebugBitmap(memory *Memory, uint16 Height, uint16 Width) { Layer->EndFrame = File.EndFrame; } */ - - -#endif diff --git a/gl_calls.cpp b/gl_calls.cpp index 4474b44..68c2882 100644 --- a/gl_calls.cpp +++ b/gl_calls.cpp @@ -152,6 +152,8 @@ static void TestGL_InitDefaultVerts() { 1, 2, 3 }; + glEnable(GL_MULTISAMPLE); + // Indices! glGenVertexArrays(1, &DefaultVerts.VertexArrayObject); @@ -194,7 +196,7 @@ TestGL_InitTexture(gl_effect_layer *Test, void *Data, uint16 Width, uint16 Heigh GLuint Stencil_Renderbuffer = 0; glGenRenderbuffers(1, &Stencil_Renderbuffer); glBindRenderbuffer( GL_RENDERBUFFER, (GLuint)Stencil_Renderbuffer ); - glRenderbufferStorage( GL_RENDERBUFFER, GL_STENCIL_INDEX8, Width, Height ); + glRenderbufferStorage(GL_RENDERBUFFER, GL_STENCIL_INDEX8, Width, Height ); glBindFramebuffer(GL_FRAMEBUFFER, Test->FramebufferObject); @@ -237,7 +237,7 @@ int main(int argc, char *argv[]) { InitMemoryTable(&GlobalMemory, &Memory, 10 * 1024 * 1024, F_Strings, "Strings"); InitMemoryTable(&GlobalMemory, &Memory, (uint64)200 * 1024 * 1024, B_LayerBitmaps, "Layer buffer"); - InitMemoryTable(&GlobalMemory, &Memory, (uint64)200 * 1024 * 1024, B_LoadedBitmaps, "Loaded bitmap buffer"); + InitMemoryTable(&GlobalMemory, &Memory, (uint64)600 * 1024 * 1024, B_LoadedBitmaps, "Loaded bitmap buffer"); Memory.Scratch = AllocateMemory(&Memory, (uint64)64*1024*1024, B_LayerBitmaps); @@ -509,7 +509,12 @@ int main(int argc, char *argv[]) { } } #else - OutputToViewport(&CompBuffer, &State, textureID); + Bitmap_ConvertPacking(CompBuffer.PackedBuffer, CompBuffer.UnpackedBuffer, + CompBuffer.Width, CompBuffer.Height, CompBuffer.BytesPerPixel, 1); + EndRenderState(&State); + glBindTexture(GL_TEXTURE_2D, textureID); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer.Width, CompBuffer.Height, GL_RGBA, GL_UNSIGNED_BYTE, + CompBuffer.UnpackedBuffer); #endif ImGui::Render(); diff --git a/my_imgui_widgets.cpp b/my_imgui_widgets.cpp index e334f63..656804d 100644 --- a/my_imgui_widgets.cpp +++ b/my_imgui_widgets.cpp @@ -597,6 +597,14 @@ ImGui_Viewport(project_data File, project_state *State, ui *UI, comp_buffer Comp } */ + ImGui::OpenPopupOnItemClick("context", ImGuiPopupFlags_MouseButtonMiddle); + if (ImGui::BeginPopup("context")) { + if (ImGui::MenuItem("Scalar", NULL, false, InstructionMode != instruction_mode_scalar)) { InstructionMode = instruction_mode_scalar; State->UpdateFrame = true; } + if (ImGui::MenuItem("SSE", NULL, false, InstructionMode != instruction_mode_sse)) { InstructionMode = instruction_mode_sse; State->UpdateFrame = true; } + if (ImGui::MenuItem("AVX2", NULL, false, InstructionMode != instruction_mode_avx)) { InstructionMode = instruction_mode_avx; State->UpdateFrame = true; } + ImGui::EndPopup(); + } + if (IsHovered && IsActivated && ImGui::IsMouseDown(ImGuiMouseButton_Left)) { // Point to zoom in on if Z is held @@ -643,15 +651,6 @@ ImGui_Viewport(project_data File, project_state *State, ui *UI, comp_buffer Comp } - ImGui::OpenPopupOnItemClick("context", ImGuiPopupFlags_MouseButtonMiddle); - if (ImGui::BeginPopup("context")) { - if (ImGui::MenuItem("Scalar", NULL, false, InstructionMode != instruction_mode_scalar)) { InstructionMode = instruction_mode_scalar; State->UpdateFrame = true; } - if (ImGui::MenuItem("SSE", NULL, false, InstructionMode != instruction_mode_sse)) { InstructionMode = instruction_mode_sse; State->UpdateFrame = true; } - if (ImGui::MenuItem("AVX2", NULL, false, InstructionMode != instruction_mode_avx)) { InstructionMode = instruction_mode_avx; State->UpdateFrame = true; } - ImGui::EndPopup(); - } - - ImGui::End(); } diff --git a/prenderer.cpp b/prenderer.cpp index 7550d0f..940cb0a 100644 --- a/prenderer.cpp +++ b/prenderer.cpp @@ -376,6 +376,9 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) LayerBounds.Min.x -= LayerBounds.Min.x % 4; LayerBounds.Min.y -= LayerBounds.Min.y % 4; + uint16 WidthP, HeightP; + Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP); + uint8 *TexPTR = (uint8 *)T.SourceBuffer; Assert(LayerBounds.Max.x <= Buffer->Width); Assert(LayerBounds.Max.y <= Buffer->Height); @@ -398,15 +401,23 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m256 One = _mm256_set1_ps(1); __m256 Two = _mm256_set1_ps(2); __m256 Zero = _mm256_set1_ps(0); + // __m256 UMin = _mm256_set1_ps(0.0f - (1 / T.LayerWidth)); + // __m256 VMin = _mm256_set1_ps(0.0f - (1 / T.LayerHeight)); + // __m256 UMax = _mm256_set1_ps(1.0f + (1 / T.LayerWidth)); + __m256 VMax = _mm256_set1_ps(1.0f - (1 / T.LayerHeight)); + + __m256 ZeroPoint25 = _mm256_set1_ps(0.25); __m256 ZeroPointFive = _mm256_set1_ps(0.5); __m256i Zeroi = _mm256_set1_epi32(0); __m256i Onei = _mm256_set1_epi32(1); __m256 Four = _mm256_set1_ps(4); __m256 Sixteen = _mm256_set1_ps(16); __m256i FF = _mm256_set1_epi32(0xFF); + __m256i Full = _mm256_set1_epi32(0xFFFFFFFF); __m256i BottomTwoBits = _mm256_set1_epi32(0x03); __m256i Fouri = _mm256_set1_epi32(4); __m256i Sixteeni = _mm256_set1_epi32(16); + __m256i SixtyFouri = _mm256_set1_epi32(64); __m256 Real255 = _mm256_set1_ps(255.0f); __m256i Int255 = _mm256_set1_epi32(255); __m256 Norm255 = _mm256_set1_ps(1/255.0f); @@ -442,11 +453,25 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) { IACA_START; - __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX); + // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky. + __m256 X0 = _mm256_set1_ps(0.30); + __m256 Y0 = _mm256_set1_ps(0.10); + __m256 X1 = _mm256_set1_ps(0.80); + __m256 Y1 = _mm256_set1_ps(0.35); + __m256 X2 = _mm256_set1_ps(0.05); + __m256 Y2 = _mm256_set1_ps(0.60); + __m256 X3 = _mm256_set1_ps(0.55); + __m256 Y3 = _mm256_set1_ps(0.85); - // TODO(fox): Not unwraping this function may lose a few cycles! - uint16 WidthP, HeightP; - Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP); + __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX); + __m256 StartVectorX0 = _mm256_add_ps(StartVectorX, X0); + __m256 StartVectorY0 = _mm256_add_ps(StartVectorY, Y0); + __m256 StartVectorX1 = _mm256_add_ps(StartVectorX, X1); + __m256 StartVectorY1 = _mm256_add_ps(StartVectorY, Y1); + __m256 StartVectorX2 = _mm256_add_ps(StartVectorX, X2); + __m256 StartVectorY2 = _mm256_add_ps(StartVectorY, Y2); + __m256 StartVectorX3 = _mm256_add_ps(StartVectorX, X3); + __m256 StartVectorY3 = _mm256_add_ps(StartVectorY, Y3); uint32 XLookup = (X >> 2)*16 + (X % 4); uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4; @@ -456,13 +481,39 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY)); __m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY)); - __m256i LayerMask = _mm256_castps_si256(_mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 2)), - _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 2)))); + __m256 U0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, XAxisPX), _mm256_mul_ps(StartVectorY0, XAxisPY)); + __m256 V0 = _mm256_add_ps(_mm256_mul_ps(StartVectorX0, YAxisPX), _mm256_mul_ps(StartVectorY0, YAxisPY)); + __m256 U1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, XAxisPX), _mm256_mul_ps(StartVectorY1, XAxisPY)); + __m256 V1 = _mm256_add_ps(_mm256_mul_ps(StartVectorX1, YAxisPX), _mm256_mul_ps(StartVectorY1, YAxisPY)); + __m256 U2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, XAxisPX), _mm256_mul_ps(StartVectorY2, XAxisPY)); + __m256 V2 = _mm256_add_ps(_mm256_mul_ps(StartVectorX2, YAxisPX), _mm256_mul_ps(StartVectorY2, YAxisPY)); + __m256 U3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, XAxisPX), _mm256_mul_ps(StartVectorY3, XAxisPY)); + __m256 V3 = _mm256_add_ps(_mm256_mul_ps(StartVectorX3, YAxisPX), _mm256_mul_ps(StartVectorY3, YAxisPY)); + + __m256 LayerMask0 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U0, Zero, 13), _mm256_cmp_ps(U0, One, 1)), + _mm256_and_ps(_mm256_cmp_ps(V0, Zero, 13), _mm256_cmp_ps(V0, One, 1))); + __m256 LayerMask1 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U1, Zero, 13), _mm256_cmp_ps(U1, One, 1)), + _mm256_and_ps(_mm256_cmp_ps(V1, Zero, 13), _mm256_cmp_ps(V1, One, 1))); + __m256 LayerMask2 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U2, Zero, 13), _mm256_cmp_ps(U2, One, 1)), + _mm256_and_ps(_mm256_cmp_ps(V2, Zero, 13), _mm256_cmp_ps(V2, One, 1))); + __m256 LayerMask3 = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U3, Zero, 13), _mm256_cmp_ps(U3, One, 1)), + _mm256_and_ps(_mm256_cmp_ps(V3, Zero, 13), _mm256_cmp_ps(V3, One, 1))); + + // Each point that passes adds .25 + __m256 Avg = _mm256_add_ps(_mm256_add_ps(_mm256_and_ps(LayerMask0, ZeroPoint25), _mm256_and_ps(LayerMask1, ZeroPoint25)), + _mm256_add_ps(_mm256_and_ps(LayerMask2, ZeroPoint25), _mm256_and_ps(LayerMask3, ZeroPoint25))); + + // Zero - no points pass + // One - all points pass; not an edge + __m256i Mask = _mm256_cmp_ps(Avg, Zero, 14); + __m256i NonEdge = _mm256_cmp_ps(Avg, One, 13); // If all of the pixels are zeroed in the mask (aka fall outside // the UV lookup), we can skip the iteration. - if (_mm256_movemask_epi8(LayerMask)) + if (_mm256_movemask_epi8(Mask)) { + __m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask); + U = _mm256_max_ps(_mm256_min_ps(One, U), Zero); V = _mm256_max_ps(_mm256_min_ps(One, V), Zero); @@ -472,6 +523,8 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei)); __m256i TexYInt = _mm256_cvttps_epi32(TexYFull); __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei)); + if (T.LayerWidth == 50 && _mm256_cvtsi256_si32(TexYIntPlusOne) == 49) + int pp = 0; // NOTE(fox): The comparison is for when we're on the last pixel of the texel. __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt)); @@ -540,6 +593,12 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) _mm256_add_ps(_mm256_mul_ps(TexBothXInv, A_TexBL), _mm256_mul_ps(TexBoth, A_TexBR))); + // Apply anti-aliasing to edges if there are any + if (_mm256_movemask_epi8(EdgeMask)) + { + A_Col = _mm256_blendv_ps(A_Col, _mm256_mul_ps(A_Col, Avg), EdgeMask); + } + __m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity); __m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha); @@ -559,7 +618,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m256 A_Blend = LayerAlpha; // Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it). - if (!_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 0)) || T.BlendMode != blend_normal) + if (_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 2)) || T.BlendMode != blend_normal) { __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel); __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255); @@ -691,8 +750,8 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)), _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24))); - // __m256i PixelsMask = _mm256_blendv_epi8(Zeroi, OutputPixel, LayerMask); - _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel); + + _mm256_maskstore_epi32((int *)Pixel, Mask, OutputPixel); } PixelX = _mm256_add_ps(PixelX, Four); } @@ -708,6 +767,9 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) LayerBounds.Min.x -= LayerBounds.Min.x % 4; LayerBounds.Min.y -= LayerBounds.Min.y % 4; + uint16 WidthP, HeightP; + Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP); + uint8 *TexPTR = (uint8 *)T.SourceBuffer; Assert(LayerBounds.Max.x <= Buffer->Width); Assert(LayerBounds.Max.y <= Buffer->Height); @@ -761,11 +823,6 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX); - - // TODO(fox): Not unwraping this function may lose a few cycles! - uint16 WidthP, HeightP; - Bitmap_CalcPackedDimensions(Buffer->Width, Buffer->Height, &WidthP, &HeightP); - uint32 XLookup = (X >> 2)*16 + (X % 4); uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4; uint32 PixelToSeek = XLookup + YLookup; @@ -774,8 +831,8 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m128 U = _mm_add_ps(_mm_mul_ps(StartVectorX, XAxisPX), _mm_mul_ps(StartVectorY, XAxisPY)); __m128 V = _mm_add_ps(_mm_mul_ps(StartVectorX, YAxisPX), _mm_mul_ps(StartVectorY, YAxisPY)); - __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmple_ps(U, One)), - _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmple_ps(V, One)))); + __m128i LayerMask = _mm_castps_si128(_mm_and_ps(_mm_and_ps(_mm_cmpge_ps(U, Zero), _mm_cmplt_ps(U, One)), + _mm_and_ps(_mm_cmpge_ps(V, Zero), _mm_cmplt_ps(V, One)))); if (_mm_movemask_epi8(LayerMask)) { @@ -1082,7 +1139,7 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY); real32 V = (StartVectorX * T.YAxisPX) + (StartVectorY * T.YAxisPY); - if (U <= 1.0f && U >= 0.0f && V <= 1.0f && V >= 0.0f) { + if (U < 1.0f && U >= 0.0f && V < 1.0f && V >= 0.0f) { real32 TexXFull = U * T.LayerWidth; uint32 TexXInt = (uint32)TexXFull; @@ -1092,6 +1149,12 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi uint32 TexYInt = (uint32)TexYFull; real32 TexY = TexYFull - TexYInt; + if(T.LayerWidth == 50) + real32 pp = 0; + + if(TexYInt > 47 && T.LayerWidth == 50) + real32 pp = 0; + real32 TexXInv = 1 - TexX; real32 TexYInv = 1 - TexY; real32 TexBothXInv = TexXInv * TexY; |