From 8c5f06c37f3c267ecd8f867cd49765c366b5f47c Mon Sep 17 00:00:00 2001 From: Fox Caminiti Date: Thu, 18 Aug 2022 23:11:29 -0400 Subject: many additions --- prenderer.cpp | 149 ++++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 118 insertions(+), 31 deletions(-) (limited to 'prenderer.cpp') diff --git a/prenderer.cpp b/prenderer.cpp index e755fe7..909fc4c 100644 --- a/prenderer.cpp +++ b/prenderer.cpp @@ -99,6 +99,7 @@ static void RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) { for (int16 i = 0; i < RenderInfo->State->NumberOfLayersToRender; i++) { int16 Idx = RenderInfo->State->LayersToRender[i]; + #if ARM if (InstructionMode == instruction_mode_neon) Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion); @@ -113,11 +114,46 @@ RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) { } } +static void +FinishRenderAndUpload(project_state *State, comp_buffer *CompBuffer, GLuint textureID) +{ +#if PERF + Test = __rdtsc() - Test; + + Debug.PixelCountRendered = 1280*720*5; + printf("Cycles per pixel rendered: %li ", Test / Debug.PixelCountRendered); + printf("Pixels rendered: %li ", Debug.PixelCountRendered); + printf("Cycles: %li\n", Test); + + Test = 0; + Debug.PixelCountTransparent = 0; + Debug.PixelCountRendered = 0; + Debug.PixelCountChecked = 0; +#endif + + +#if PACKEDRGB + Bitmap_ConvertPacking(CompBuffer->PackedBuffer, CompBuffer->UnpackedBuffer, + CompBuffer->Width, CompBuffer->Height, CompBuffer->BytesPerPixel, 1); +#endif + EndRenderState(State); + glBindTexture(GL_TEXTURE_2D, textureID); + glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer->Width, CompBuffer->Height, GL_RGBA, GL_UNSIGNED_BYTE, + CompBuffer->UnpackedBuffer); + + // shmp->shared_framenumber = File.CurrentFrame; + // if (sem_post(&shmp->sem2) == -1) + // Assert(0); +} + static void QueueCurrentFrame(project_data *File, comp_buffer *CompBuffer, project_state *State) { IsRendering = true; render_queue RenderInfo = {File, State, CompBuffer}; +#if PERF + Test = __rdtsc(); +#endif for (int16 i = 0; i < File->NumberOfLayers; i++) { @@ -290,6 +326,13 @@ NEON_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) #else +#if 0 +#include "iacaMarks.h" +#else +#define IACA_START +#define IACA_END +#endif + static void AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) { @@ -329,6 +372,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m256 ZeroPointFive = _mm256_set1_ps(0.5); __m256i Onei = _mm256_set1_epi32(1); __m256 Four = _mm256_set1_ps(4); + __m256 Eight = _mm256_set1_ps(8); __m256i FF = _mm256_set1_epi32(0xFF); __m256i BottomTwoBits = _mm256_set1_epi32(0x03); __m256i Fouri = _mm256_set1_epi32(4); @@ -338,9 +382,24 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) // __m256i White = _mm256_setr_epi32(0xFFFFFFFF, 0, 0, 0, 0xFFFFFFFF, 0, 0, 0); // __m256i White2 = _mm256_set1_epi32(0xFFFFFFFF); - // NOTE(fox): Each loop operates on 8 pixels, 4 horizontal by 2 vertical, - // as per the bitmap packing scheme in memory. + // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky. + __m256 X0 = _mm256_set1_ps(0.30); + __m256 Y0 = _mm256_set1_ps(0.10); + __m256 X1 = _mm256_set1_ps(0.80); + __m256 Y1 = _mm256_set1_ps(0.35); + __m256 X2 = _mm256_set1_ps(0.05); + __m256 Y2 = _mm256_set1_ps(0.60); + __m256 X3 = _mm256_set1_ps(0.55); + __m256 Y3 = _mm256_set1_ps(0.85); + +#if PACKEDRGB +#else + __m256i LayerPitch = _mm256_set1_epi32(T.LayerPitch); + __m256i BytesPerPixel = _mm256_set1_epi32(Buffer->BytesPerPixel); +#endif + +#if PACKEDRGB for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y+=2) { __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x, @@ -360,22 +419,31 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) (real32)Y+1, (real32)Y+1, (real32)Y+1); +#else + for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++) + { + __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x, + (real32)LayerBounds.Min.x+1, + (real32)LayerBounds.Min.x+2, + (real32)LayerBounds.Min.x+3, + (real32)LayerBounds.Min.x+4, + (real32)LayerBounds.Min.x+5, + (real32)LayerBounds.Min.x+6, + (real32)LayerBounds.Min.x+7); + + __m256 PixelY = _mm256_set1_ps((real32)Y); +#endif __m256 StartVectorY = _mm256_sub_ps(PixelY, OriginY); +#if PACKEDRGB for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4) +#else + for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 8) +#endif { - IACA_START; - // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky. - __m256 X0 = _mm256_set1_ps(0.30); - __m256 Y0 = _mm256_set1_ps(0.10); - __m256 X1 = _mm256_set1_ps(0.80); - __m256 Y1 = _mm256_set1_ps(0.35); - __m256 X2 = _mm256_set1_ps(0.05); - __m256 Y2 = _mm256_set1_ps(0.60); - __m256 X3 = _mm256_set1_ps(0.55); - __m256 Y3 = _mm256_set1_ps(0.85); + IACA_START; __m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX); __m256 StartVectorX0 = _mm256_add_ps(StartVectorX, X0); @@ -387,10 +455,14 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m256 StartVectorX3 = _mm256_add_ps(StartVectorX, X3); __m256 StartVectorY3 = _mm256_add_ps(StartVectorY, Y3); +#if PACKEDRGB uint32 XLookup = (X >> 2)*16 + (X % 4); uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4; uint32 PixelToSeek = XLookup + YLookup; uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel; +#else + uint8 *Pixel = (uint8 *)Buffer->UnpackedBuffer + Y*T.BufferPitch + X*Buffer->BytesPerPixel; +#endif __m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY)); __m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY)); @@ -422,9 +494,12 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m256i Mask = _mm256_cmp_ps(Avg, Zero, 14); __m256i NonEdge = _mm256_cmp_ps(Avg, One, 13); + __m256 LayerMask = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 1)), + _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 1))); + // If all of the pixels are zeroed in the mask (aka fall outside // the UV lookup), we can skip the iteration. - if (_mm256_movemask_epi8(Mask)) + if (_mm256_movemask_epi8(LayerMask)) { __m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask); @@ -434,10 +509,9 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m256 TexXFull = _mm256_mul_ps(U, LayerWidth); __m256 TexYFull = _mm256_mul_ps(V, LayerHeight); __m256i TexXInt = _mm256_cvttps_epi32(TexXFull); - __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei)); __m256i TexYInt = _mm256_cvttps_epi32(TexYFull); + __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei)); __m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei)); - // NOTE(fox): The comparison is for when we're on the last pixel of the texel. __m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt)); @@ -449,6 +523,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m256 TexBoth = _mm256_mul_ps(TexY, TexX); __m256 TexBothInv = _mm256_mul_ps(TexXInv, TexYInv); +#if PACKEDRGB __m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni), _mm256_and_si256(TexXInt, BottomTwoBits)); __m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), FullLayerWidth4i), @@ -457,6 +532,12 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) _mm256_and_si256(TexXIntPlusOne, BottomTwoBits)); __m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i), _mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri)); +#else + __m256i XLookup = TexXInt; + __m256i YLookup = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(TexYInt), LayerWidth)); + __m256i XLookupPlusOne = TexXIntPlusOne; + __m256i YLookupPlusOne = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(TexYIntPlusOne), LayerWidth)); +#endif __m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup); __m256i PixelLookupTR = _mm256_add_epi32(XLookupPlusOne, YLookup); @@ -512,6 +593,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) A_Col = _mm256_blendv_ps(A_Col, _mm256_mul_ps(A_Col, Avg), EdgeMask); } + IACA_END; __m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity); __m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha); @@ -531,7 +613,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) __m256 A_Blend = LayerAlpha; // Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it). - if (_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 2)) || T.BlendMode != blend_normal) + if (T.BlendMode != blend_normal || _mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 2))) { __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel); __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255); @@ -663,10 +745,13 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)), _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24))); - - _mm256_maskstore_epi32((int *)Pixel, Mask, OutputPixel); + _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel); } +#if PACKEDRGB PixelX = _mm256_add_ps(PixelX, Four); +#else + PixelX = _mm256_add_ps(PixelX, Eight); +#endif } } } @@ -729,7 +814,6 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion) for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4) { - IACA_START; __m128 StartVectorX = _mm_sub_ps(PixelX, OriginX); @@ -1040,7 +1124,6 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++) { - IACA_START; real32 StartVectorX = X - T.OriginX; real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY); @@ -1063,23 +1146,14 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi real32 TexBoth = TexY * TexX; real32 TexBothInv = TexXInv * TexYInv; -#if 0 - uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt*Buffer->BytesPerPixel); - uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt*Buffer->BytesPerPixel); - - uint32 PixelA = *(uint32 *)TexPTR0; - uint32 PixelB = *((uint32 *)TexPTR0 + 1); - uint32 PixelC = *(uint32 *)TexPTR1; - uint32 PixelD = *((uint32 *)TexPTR1 + 1); -#else uint32 XLookup, YLookup, PixelToSeek; - // TODO(fox): Anti-aliasing on edges uint16 LX = TexXInt; uint16 LY = TexYInt; uint16 LXPlus = Ceil(TexXInt+1, (uint32)T.LayerWidth - 1); uint16 LYPlus = Ceil(TexYInt+1, (uint32)T.LayerHeight - 1); +#if PACKEDRGB // TODO(fox): Be careful with the BytesPerPixel here! It's the // buffer's, not the layer's (currently everything is 4 bytes // per pixel). @@ -1102,12 +1176,25 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4; PixelToSeek = XLookup + YLookup; uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel); -#endif + XLookup = (X >> 2)*16 + (X % 4); YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4; PixelToSeek = XLookup + YLookup; uint32 *Pixel = (uint32 *)((uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel); +#else + uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*LY + LX*Buffer->BytesPerPixel); + uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*LY + LXPlus*Buffer->BytesPerPixel); + uint8 *TexPTR2 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*LYPlus + LX*Buffer->BytesPerPixel); + uint8 *TexPTR3 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*LYPlus + LXPlus*Buffer->BytesPerPixel); + + uint32 PixelA = *(uint32 *)TexPTR0; + uint32 PixelB = *(uint32 *)TexPTR1; + uint32 PixelC = *(uint32 *)TexPTR2; + uint32 PixelD = *(uint32 *)TexPTR3; + + uint32 *Pixel = (uint32 *)((uint8 *)Buffer->UnpackedBuffer + Y*T.BufferPitch + X*Buffer->BytesPerPixel); +#endif real32 TexRA = (real32)(PixelA & 0xFF) * Normalized255; real32 TexRB = (real32)(PixelB & 0xFF) * Normalized255; real32 TexRC = (real32)(PixelC & 0xFF) * Normalized255; -- cgit v1.2.3