summary | refs | log | tree | commit | diff
path: root/prenderer.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'prenderer.cpp')
-rw-r--r-- prenderer.cpp 149
1 files changed, 118 insertions, 31 deletions
diff --git a/prenderer.cpp b/prenderer.cpp
index e755fe7..909fc4c 100644
--- a/prenderer.cpp
+++ b/prenderer.cpp
@@ -99,6 +99,7 @@ static void
RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) {
for (int16 i = 0; i < RenderInfo->State->NumberOfLayersToRender; i++) {
int16 Idx = RenderInfo->State->LayersToRender[i];
+
#if ARM
if (InstructionMode == instruction_mode_neon)
Fallback_RenderLayer(RenderInfo->File->Layer[Idx]->TransformInfo, RenderInfo->CompBuffer, RenderRegion);
@@ -114,10 +115,45 @@ RenderLayers(render_queue *RenderInfo, rectangle RenderRegion) {
}
static void
+FinishRenderAndUpload(project_state *State, comp_buffer *CompBuffer, GLuint textureID) // Finishes a rendered frame: optional perf report, optional packed->unpacked conversion, EndRenderState, then upload of CompBuffer->UnpackedBuffer into GL texture textureID.
+{
+#if PERF // Perf builds only: print cycle statistics for the frame, then reset the counters.
+ Test = __rdtsc() - Test; // Elapsed TSC cycles since QueueCurrentFrame stored __rdtsc() in Test.
+
+ Debug.PixelCountRendered = 1280*720*5; // NOTE(review): hard-coded value overwrites whatever the counter held; presumably a 1280x720 frame times 5 layers -- confirm.
+ printf("Cycles per pixel rendered: %li ", Test / Debug.PixelCountRendered); // NOTE(review): %li expects long; confirm the types of Test and Debug.PixelCountRendered match.
+ printf("Pixels rendered: %li ", Debug.PixelCountRendered);
+ printf("Cycles: %li\n", Test);
+
+ Test = 0; // Clear the timer and all debug pixel counters for the next frame.
+ Debug.PixelCountTransparent = 0;
+ Debug.PixelCountRendered = 0;
+ Debug.PixelCountChecked = 0;
+#endif
+
+
+#if PACKEDRGB // Packed builds only: the upload below reads UnpackedBuffer, so a conversion step runs first.
+ Bitmap_ConvertPacking(CompBuffer->PackedBuffer, CompBuffer->UnpackedBuffer,
+ CompBuffer->Width, CompBuffer->Height, CompBuffer->BytesPerPixel, 1); // presumably converts PackedBuffer into UnpackedBuffer; the meaning of the trailing 1 is not visible here -- TODO confirm.
+#endif
+ EndRenderState(State); // Defined elsewhere in the project; runs before the texture upload.
+ glBindTexture(GL_TEXTURE_2D, textureID); // Make textureID the 2D texture targeted by the upload below.
+ glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, CompBuffer->Width, CompBuffer->Height, GL_RGBA, GL_UNSIGNED_BYTE,
+ CompBuffer->UnpackedBuffer); // Replaces the Width x Height region at offset (0,0) of mip level 0 with RGBA data, one unsigned byte per channel; texture storage is assumed to be allocated elsewhere.
+
+ // shmp->shared_framenumber = File.CurrentFrame;
+ // if (sem_post(&shmp->sem2) == -1)
+ // Assert(0);
+}
+
+static void
QueueCurrentFrame(project_data *File, comp_buffer *CompBuffer, project_state *State)
{
IsRendering = true;
render_queue RenderInfo = {File, State, CompBuffer};
+#if PERF
+ Test = __rdtsc();
+#endif
for (int16 i = 0; i < File->NumberOfLayers; i++)
{
@@ -290,6 +326,13 @@ NEON_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
#else
+#if 0 // Change to 1 to use the real IACA marker macros from iacaMarks.h.
+#include "iacaMarks.h"
+#else // Markers disabled: IACA_START and IACA_END expand to nothing, so uses of them compile away.
+#define IACA_START
+#define IACA_END
+#endif
+
static void
AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
{
@@ -329,6 +372,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 ZeroPointFive = _mm256_set1_ps(0.5);
__m256i Onei = _mm256_set1_epi32(1);
__m256 Four = _mm256_set1_ps(4);
+ __m256 Eight = _mm256_set1_ps(8);
__m256i FF = _mm256_set1_epi32(0xFF);
__m256i BottomTwoBits = _mm256_set1_epi32(0x03);
__m256i Fouri = _mm256_set1_epi32(4);
@@ -338,9 +382,24 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
// __m256i White = _mm256_setr_epi32(0xFFFFFFFF, 0, 0, 0, 0xFFFFFFFF, 0, 0, 0);
// __m256i White2 = _mm256_set1_epi32(0xFFFFFFFF);
- // NOTE(fox): Each loop operates on 8 pixels, 4 horizontal by 2 vertical,
- // as per the bitmap packing scheme in memory.
+ // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky.
+ __m256 X0 = _mm256_set1_ps(0.30);
+ __m256 Y0 = _mm256_set1_ps(0.10);
+ __m256 X1 = _mm256_set1_ps(0.80);
+ __m256 Y1 = _mm256_set1_ps(0.35);
+ __m256 X2 = _mm256_set1_ps(0.05);
+ __m256 Y2 = _mm256_set1_ps(0.60);
+ __m256 X3 = _mm256_set1_ps(0.55);
+ __m256 Y3 = _mm256_set1_ps(0.85);
+
+#if PACKEDRGB
+#else
+ __m256i LayerPitch = _mm256_set1_epi32(T.LayerPitch);
+ __m256i BytesPerPixel = _mm256_set1_epi32(Buffer->BytesPerPixel);
+#endif
+
+#if PACKEDRGB
for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y+=2)
{
__m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x,
@@ -360,22 +419,31 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
(real32)Y+1,
(real32)Y+1,
(real32)Y+1);
+#else
+ for (int32 Y = LayerBounds.Min.y; Y < LayerBounds.Max.y; Y++)
+ {
+ __m256 PixelX = _mm256_setr_ps((real32)LayerBounds.Min.x,
+ (real32)LayerBounds.Min.x+1,
+ (real32)LayerBounds.Min.x+2,
+ (real32)LayerBounds.Min.x+3,
+ (real32)LayerBounds.Min.x+4,
+ (real32)LayerBounds.Min.x+5,
+ (real32)LayerBounds.Min.x+6,
+ (real32)LayerBounds.Min.x+7);
+
+ __m256 PixelY = _mm256_set1_ps((real32)Y);
+#endif
__m256 StartVectorY = _mm256_sub_ps(PixelY, OriginY);
+#if PACKEDRGB
for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
+#else
+ for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 8)
+#endif
{
- IACA_START;
- // TODO(fox): Tried an MSAA technique for anti aliasing, but it still looks pretty sucky.
- __m256 X0 = _mm256_set1_ps(0.30);
- __m256 Y0 = _mm256_set1_ps(0.10);
- __m256 X1 = _mm256_set1_ps(0.80);
- __m256 Y1 = _mm256_set1_ps(0.35);
- __m256 X2 = _mm256_set1_ps(0.05);
- __m256 Y2 = _mm256_set1_ps(0.60);
- __m256 X3 = _mm256_set1_ps(0.55);
- __m256 Y3 = _mm256_set1_ps(0.85);
+ IACA_START;
__m256 StartVectorX = _mm256_sub_ps(PixelX, OriginX);
__m256 StartVectorX0 = _mm256_add_ps(StartVectorX, X0);
@@ -387,10 +455,14 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 StartVectorX3 = _mm256_add_ps(StartVectorX, X3);
__m256 StartVectorY3 = _mm256_add_ps(StartVectorY, Y3);
+#if PACKEDRGB
uint32 XLookup = (X >> 2)*16 + (X % 4);
uint32 YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
uint32 PixelToSeek = XLookup + YLookup;
uint8 *Pixel = (uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel;
+#else
+ uint8 *Pixel = (uint8 *)Buffer->UnpackedBuffer + Y*T.BufferPitch + X*Buffer->BytesPerPixel;
+#endif
__m256 U = _mm256_add_ps(_mm256_mul_ps(StartVectorX, XAxisPX), _mm256_mul_ps(StartVectorY, XAxisPY));
__m256 V = _mm256_add_ps(_mm256_mul_ps(StartVectorX, YAxisPX), _mm256_mul_ps(StartVectorY, YAxisPY));
@@ -422,9 +494,12 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256i Mask = _mm256_cmp_ps(Avg, Zero, 14);
__m256i NonEdge = _mm256_cmp_ps(Avg, One, 13);
+ __m256 LayerMask = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(U, Zero, 13), _mm256_cmp_ps(U, One, 1)),
+ _mm256_and_ps(_mm256_cmp_ps(V, Zero, 13), _mm256_cmp_ps(V, One, 1)));
+
// If all of the pixels are zeroed in the mask (aka fall outside
// the UV lookup), we can skip the iteration.
- if (_mm256_movemask_epi8(Mask))
+ if (_mm256_movemask_epi8(LayerMask))
{
__m256i EdgeMask = _mm256_andnot_si256(NonEdge, Mask);
@@ -434,10 +509,9 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 TexXFull = _mm256_mul_ps(U, LayerWidth);
__m256 TexYFull = _mm256_mul_ps(V, LayerHeight);
__m256i TexXInt = _mm256_cvttps_epi32(TexXFull);
- __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
__m256i TexYInt = _mm256_cvttps_epi32(TexYFull);
+ __m256i TexXIntPlusOne = _mm256_add_epi32(TexXInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerWidthMinusOne, TexXInt), Onei));
__m256i TexYIntPlusOne = _mm256_add_epi32(TexYInt, _mm256_and_si256(_mm256_cmpgt_epi32(LayerHeightMinusOne, TexYInt), Onei));
-
// NOTE(fox): The comparison is for when we're on the last pixel of the texel.
__m256 TexX = _mm256_sub_ps(TexXFull, _mm256_cvtepi32_ps(TexXInt));
@@ -449,6 +523,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 TexBoth = _mm256_mul_ps(TexY, TexX);
__m256 TexBothInv = _mm256_mul_ps(TexXInv, TexYInv);
+#if PACKEDRGB
__m256i XLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexXInt, 2), Sixteeni),
_mm256_and_si256(TexXInt, BottomTwoBits));
__m256i YLookup = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYInt, 2), FullLayerWidth4i),
@@ -457,6 +532,12 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
_mm256_and_si256(TexXIntPlusOne, BottomTwoBits));
__m256i YLookupPlusOne = _mm256_add_epi32(_mm256_mullo_epi32(_mm256_srli_epi32(TexYIntPlusOne, 2), FullLayerWidth4i),
_mm256_mullo_epi32(_mm256_and_si256(TexYIntPlusOne, BottomTwoBits), Fouri));
+#else
+ __m256i XLookup = TexXInt;
+ __m256i YLookup = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(TexYInt), LayerWidth));
+ __m256i XLookupPlusOne = TexXIntPlusOne;
+ __m256i YLookupPlusOne = _mm256_cvtps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(TexYIntPlusOne), LayerWidth));
+#endif
__m256i PixelLookupTL = _mm256_add_epi32(XLookup, YLookup);
__m256i PixelLookupTR = _mm256_add_epi32(XLookupPlusOne, YLookup);
@@ -512,6 +593,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
A_Col = _mm256_blendv_ps(A_Col, _mm256_mul_ps(A_Col, Avg), EdgeMask);
}
+ IACA_END;
__m256 LayerAlpha = _mm256_mul_ps(A_Col, LayerOpacity);
__m256 LayerAlphaInv = _mm256_sub_ps(One, LayerAlpha);
@@ -531,7 +613,7 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
__m256 A_Blend = LayerAlpha;
// Only load the dest pixel if we actually need to (a pixel's opacity isn't 255 or the blend mode requires it).
- if (_mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 2)) || T.BlendMode != blend_normal)
+ if (T.BlendMode != blend_normal || _mm256_movemask_epi8(_mm256_cmp_ps(LayerAlpha, One, 2)))
{
__m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);
__m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256( DestPixel, FF)), Norm255);
@@ -663,10 +745,13 @@ AVX2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
_mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
_mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));
-
- _mm256_maskstore_epi32((int *)Pixel, Mask, OutputPixel);
+ _mm256_maskstore_epi32((int *)Pixel, LayerMask, OutputPixel);
}
+#if PACKEDRGB
PixelX = _mm256_add_ps(PixelX, Four);
+#else
+ PixelX = _mm256_add_ps(PixelX, Eight);
+#endif
}
}
}
@@ -729,7 +814,6 @@ SSE2_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegion)
for (int32 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X += 4)
{
- IACA_START;
__m128 StartVectorX = _mm_sub_ps(PixelX, OriginX);
@@ -1040,7 +1124,6 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
for (int16 X = LayerBounds.Min.x; X < LayerBounds.Max.x; X++)
{
- IACA_START;
real32 StartVectorX = X - T.OriginX;
real32 U = (StartVectorX * T.XAxisPX) + (StartVectorY * T.XAxisPY);
@@ -1063,23 +1146,14 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
real32 TexBoth = TexY * TexX;
real32 TexBothInv = TexXInv * TexYInv;
-#if 0
- uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*TexYInt + TexXInt*Buffer->BytesPerPixel);
- uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*(TexYInt+1) + TexXInt*Buffer->BytesPerPixel);
-
- uint32 PixelA = *(uint32 *)TexPTR0;
- uint32 PixelB = *((uint32 *)TexPTR0 + 1);
- uint32 PixelC = *(uint32 *)TexPTR1;
- uint32 PixelD = *((uint32 *)TexPTR1 + 1);
-#else
uint32 XLookup, YLookup, PixelToSeek;
- // TODO(fox): Anti-aliasing on edges
uint16 LX = TexXInt;
uint16 LY = TexYInt;
uint16 LXPlus = Ceil(TexXInt+1, (uint32)T.LayerWidth - 1);
uint16 LYPlus = Ceil(TexYInt+1, (uint32)T.LayerHeight - 1);
+#if PACKEDRGB
// TODO(fox): Be careful with the BytesPerPixel here! It's the
// buffer's, not the layer's (currently everything is 4 bytes
// per pixel).
@@ -1102,12 +1176,25 @@ Fallback_RenderLayer(transform_info T, comp_buffer *Buffer, rectangle RenderRegi
YLookup = (LYPlus >> 2)*(T.FullLayerWidth*4) + (LYPlus % 4)*4;
PixelToSeek = XLookup + YLookup;
uint32 PixelD = *(uint32 *)((uint8 *)T.SourceBuffer + PixelToSeek*Buffer->BytesPerPixel);
-#endif
+
XLookup = (X >> 2)*16 + (X % 4);
YLookup = (Y >> 2)*(WidthP*4) + (Y % 4)*4;
PixelToSeek = XLookup + YLookup;
uint32 *Pixel = (uint32 *)((uint8 *)Buffer->PackedBuffer + PixelToSeek*Buffer->BytesPerPixel);
+#else
+ uint8 *TexPTR0 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*LY + LX*Buffer->BytesPerPixel);
+ uint8 *TexPTR1 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*LY + LXPlus*Buffer->BytesPerPixel);
+ uint8 *TexPTR2 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*LYPlus + LX*Buffer->BytesPerPixel);
+ uint8 *TexPTR3 = ((uint8 *)T.SourceBuffer + (uint16)T.LayerPitch*LYPlus + LXPlus*Buffer->BytesPerPixel);
+
+ uint32 PixelA = *(uint32 *)TexPTR0;
+ uint32 PixelB = *(uint32 *)TexPTR1;
+ uint32 PixelC = *(uint32 *)TexPTR2;
+ uint32 PixelD = *(uint32 *)TexPTR3;
+
+ uint32 *Pixel = (uint32 *)((uint8 *)Buffer->UnpackedBuffer + Y*T.BufferPitch + X*Buffer->BytesPerPixel);
+#endif
real32 TexRA = (real32)(PixelA & 0xFF) * Normalized255;
real32 TexRB = (real32)(PixelB & 0xFF) * Normalized255;
real32 TexRC = (real32)(PixelC & 0xFF) * Normalized255;