summaryrefslogtreecommitdiff
path: root/effects_software.cpp
blob: 06e85430ccb0cc1be6d9699d20ece729d8f6aa8c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
// Blends one flat colour into every pixel of the layer bitmap, eight pixels
// per iteration, using AVX/AVX2 intrinsics.
//
//   Source      - only Source->Info.Width, .Height and .BytesPerPixel are read.
//   BitmapInfo  - BitmapInfo->BitmapBuffer is read and overwritten in place.
//   Memory      - not used by this effect.
//   Property[0] - colour: E[0], E[1], E[2] are paired with bytes 0, 1, 2 of each
//                 32-bit pixel (called R, G, B below); .a is the mix amount.
//   Property[1] - blend mode selecting how the colour combines with the pixel.
//
// Byte 3 of every pixel (called A below) is written back unchanged.
//
// The index arithmetic is consistent with a bitmap stored as contiguous 4x4
// pixel tiles (16 pixels per tile), so a single 32-byte access covers rows Y
// and Y+1 of one tile.
// NOTE(review): this assumes BytesPerPixel == 4 and that the buffer is padded
// to whole tiles in both dimensions - confirm against the allocator.
//
// When ARM is set the body is compiled out and the bitmap is left untouched.
static void
Effect_Software_DrawColor(source *Source, layer_bitmap_info *BitmapInfo, memory *Memory, property_channel Property[])
{
#if ARM
#else
    v4 FloatColor = Property[0].CurrentValue.col;
    blend_mode BlendMode = Property[1].CurrentValue.blendmode;

    // Broadcast constants shared by every iteration.
    __m256 ZeroReal = _mm256_set1_ps(0);
    __m256 ZeroPointFive = _mm256_set1_ps(0.5);
    __m256 One = _mm256_set1_ps(1);
    __m256 Two = _mm256_set1_ps(2);

    __m256 Fraction255 = _mm256_set1_ps(1/255.0f);
    __m256 Real255  = _mm256_set1_ps(255);

    __m256i FF = _mm256_set1_epi32(0xFF);

    // Mix amount and its complement for the final linear interpolation.
    __m256 Alpha  = _mm256_set1_ps(FloatColor.a);
    __m256 AlphaInv  = _mm256_set1_ps(1.0f - FloatColor.a);

    // Per-channel colour, doubled colour (soft light) and inverted colour
    // (screen / overlay / hard light).
    __m256 R_Col    = _mm256_set1_ps(FloatColor.E[0]);
    __m256 R_Colx2  = _mm256_mul_ps(R_Col, Two);
    __m256 R_ColInv = _mm256_set1_ps(1.0f - FloatColor.E[0]);

    __m256 G_Col    = _mm256_set1_ps(FloatColor.E[1]);
    __m256 G_Colx2  = _mm256_mul_ps(G_Col, Two);
    __m256 G_ColInv = _mm256_set1_ps(1.0f - FloatColor.E[1]);

    __m256 B_Col    = _mm256_set1_ps(FloatColor.E[2]);
    __m256 B_Colx2  = _mm256_mul_ps(B_Col, Two);
    __m256 B_ColInv = _mm256_set1_ps(1.0f - FloatColor.E[2]);

    // Each step handles a 4-wide, 2-tall group of pixels.
    for (int16 Y = 0; Y < Source->Info.Height; Y += 2)
    {
        for (int16 X = 0; X < Source->Info.Width; X += 4)
        {
            // Pixel index: 16 pixels per 4 columns, Width*4 pixels per 4 rows,
            // and 4 pixels per row inside that group.
            uint32 XLookup = (X >> 2)*16 + (X % 4);
            uint32 YLookup = (Y >> 2)*(Source->Info.Width*4) + (Y % 4)*4;
            uint32 PixelToSeek = XLookup + YLookup;
            uint8 *Pixel = (uint8 *)BitmapInfo->BitmapBuffer + PixelToSeek*Source->Info.BytesPerPixel;
            __m256i DestPixel = _mm256_loadu_si256((const __m256i *)Pixel);

            // Unpack bytes 0..2 of each 32-bit lane and scale them by 1/255.
            __m256 R_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(                  DestPixel,      FF)), Fraction255);
            __m256 G_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 8),  FF)), Fraction255);
            __m256 B_Dest = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(_mm256_srli_epi32(DestPixel, 16), FF)), Fraction255);
            // Byte 3 stays an integer; it is repacked untouched below.
            __m256i A_Out = _mm256_and_si256(_mm256_srli_epi32(DestPixel, 24), FF);

            // Starting point (and the result for blend_normal or any mode
            // not handled below) is the flat colour itself.
            __m256 R_Blend = R_Col;
            __m256 G_Blend = G_Col;
            __m256 B_Blend = B_Col;
            switch (BlendMode)
            {
                case blend_normal:
                {
                } break;
                case blend_multiply:
                {
                    R_Blend  = _mm256_mul_ps(R_Dest, R_Col);
                    G_Blend  = _mm256_mul_ps(G_Dest, G_Col);
                    B_Blend  = _mm256_mul_ps(B_Dest, B_Col);
                } break;
                case blend_colorburn:
                {
                    // 1 - (1 - Dest) / Col
                    // NOTE(review): a zero colour channel divides by zero here;
                    // the non-finite result only gets caught by the final clamp.
                    R_Blend  = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, R_Dest), R_Col));
                    G_Blend  = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, G_Dest), G_Col));
                    B_Blend  = _mm256_sub_ps(One, _mm256_div_ps(_mm256_sub_ps(One, B_Dest), B_Col));
                } break;
                case blend_linearburn:
                {
                    // Dest + Col - 1
                    R_Blend  = _mm256_sub_ps(_mm256_add_ps(R_Dest, R_Col), One);
                    G_Blend  = _mm256_sub_ps(_mm256_add_ps(G_Dest, G_Col), One);
                    B_Blend  = _mm256_sub_ps(_mm256_add_ps(B_Dest, B_Col), One);
                } break;
                case blend_add:
                {
                    R_Blend  = _mm256_add_ps(R_Dest, R_Col);
                    G_Blend  = _mm256_add_ps(G_Dest, G_Col);
                    B_Blend  = _mm256_add_ps(B_Dest, B_Col);
                } break;
                case blend_screen:
                {
                    // 1 - (1 - Dest) * (1 - Col)
                    R_Blend  = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest), R_ColInv));
                    G_Blend  = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest), G_ColInv));
                    B_Blend  = _mm256_sub_ps(One, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest), B_ColInv));
                } break;
                case blend_overlay:
                {
                    // Branch on the existing pixel: multiply-style below 0.5,
                    // screen-style otherwise.
                    __m256 R_Mask  = _mm256_cmp_ps(R_Dest,  ZeroPointFive, _CMP_LT_OS);
                    __m256 G_Mask  = _mm256_cmp_ps(G_Dest,  ZeroPointFive, _CMP_LT_OS);
                    __m256 B_Mask  = _mm256_cmp_ps(B_Dest,  ZeroPointFive, _CMP_LT_OS);
                    __m256 R_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
                    __m256 G_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
                    __m256 B_Lower = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
                    __m256 R_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest),  R_ColInv)));
                    __m256 G_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest),  G_ColInv)));
                    __m256 B_Upper = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest),  B_ColInv)));
                    // blendv takes the second operand where the mask is set.
                    R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
                    G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
                    B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
                } break;
                case blend_softlight:
                {
                    // using Pegtop's equation: (1 - 2*Col)*Dest^2 + 2*Col*Dest
                    R_Blend  = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, R_Colx2), _mm256_mul_ps(R_Dest, R_Dest)), _mm256_mul_ps(R_Colx2, R_Dest));
                    G_Blend  = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, G_Colx2), _mm256_mul_ps(G_Dest, G_Dest)), _mm256_mul_ps(G_Colx2, G_Dest));
                    B_Blend  = _mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(One, B_Colx2), _mm256_mul_ps(B_Dest, B_Dest)), _mm256_mul_ps(B_Colx2, B_Dest));
                } break;
                case blend_hardlight:
                {
                    // Hard light is overlay with the two inputs' roles swapped,
                    // so the branch is chosen by the blend colour rather than by
                    // the existing pixel: multiply-style when Col < 0.5,
                    // screen-style otherwise.
                    __m256 R_Mask   = _mm256_cmp_ps(R_Col,  ZeroPointFive, _CMP_LT_OS);
                    __m256 G_Mask   = _mm256_cmp_ps(G_Col,  ZeroPointFive, _CMP_LT_OS);
                    __m256 B_Mask   = _mm256_cmp_ps(B_Col,  ZeroPointFive, _CMP_LT_OS);
                    __m256 R_Lower  = _mm256_mul_ps(Two, _mm256_mul_ps(R_Dest, R_Col));
                    __m256 G_Lower  = _mm256_mul_ps(Two, _mm256_mul_ps(G_Dest, G_Col));
                    __m256 B_Lower  = _mm256_mul_ps(Two, _mm256_mul_ps(B_Dest, B_Col));
                    __m256 R_Upper  = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, R_Dest),  R_ColInv)));
                    __m256 G_Upper  = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, G_Dest),  G_ColInv)));
                    __m256 B_Upper  = _mm256_sub_ps(One, _mm256_mul_ps(Two, _mm256_mul_ps(_mm256_sub_ps(One, B_Dest),  B_ColInv)));
                    R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
                    G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
                    B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
                } break;
                case blend_subtract:
                {
                    R_Blend  = _mm256_sub_ps(R_Dest, R_Col);
                    G_Blend  = _mm256_sub_ps(G_Dest, G_Col);
                    B_Blend  = _mm256_sub_ps(B_Dest, B_Col);
                } break;
                case blend_divide:
                {
                    // NOTE(review): a zero colour channel divides by zero here;
                    // the non-finite result only gets caught by the final clamp.
                    R_Blend  = _mm256_div_ps(R_Dest, R_Col);
                    G_Blend  = _mm256_div_ps(G_Dest, G_Col);
                    B_Blend  = _mm256_div_ps(B_Dest, B_Col);
                } break;
                case blend_difference:
                {
                    // |Col - Dest|: keep whichever of the two subtractions is
                    // positive.
                    __m256 R_Lower  = _mm256_sub_ps(R_Col, R_Dest);
                    __m256 G_Lower  = _mm256_sub_ps(G_Col, G_Dest);
                    __m256 B_Lower  = _mm256_sub_ps(B_Col, B_Dest);
                    __m256 R_Upper  = _mm256_sub_ps(R_Dest, R_Col);
                    __m256 G_Upper  = _mm256_sub_ps(G_Dest, G_Col);
                    __m256 B_Upper  = _mm256_sub_ps(B_Dest, B_Col);
                    __m256 R_Mask  = _mm256_cmp_ps(R_Lower,  ZeroReal, _CMP_GT_OS);
                    __m256 G_Mask  = _mm256_cmp_ps(G_Lower,  ZeroReal, _CMP_GT_OS);
                    __m256 B_Mask  = _mm256_cmp_ps(B_Lower,  ZeroReal, _CMP_GT_OS);
                    R_Blend = _mm256_blendv_ps(R_Upper, R_Lower, R_Mask);
                    G_Blend = _mm256_blendv_ps(G_Upper, G_Lower, G_Mask);
                    B_Blend = _mm256_blendv_ps(B_Upper, B_Lower, B_Mask);
                } break;
                default:
                {
                    // Unhandled modes fall back to the flat colour.
                } break;
            }

            // Mix the blended value with the existing pixel by the colour's alpha.
            R_Blend = _mm256_add_ps(_mm256_mul_ps(R_Dest, AlphaInv),
                                 _mm256_mul_ps(R_Blend, Alpha));
            G_Blend = _mm256_add_ps(_mm256_mul_ps(G_Dest, AlphaInv),
                                 _mm256_mul_ps(G_Blend, Alpha));
            B_Blend = _mm256_add_ps(_mm256_mul_ps(B_Dest, AlphaInv),
                                 _mm256_mul_ps(B_Blend, Alpha));

            // Clamp to [0, 1] before converting back to 8-bit.
            R_Blend = _mm256_max_ps(_mm256_min_ps(One, R_Blend), ZeroReal);
            G_Blend = _mm256_max_ps(_mm256_min_ps(One, G_Blend), ZeroReal);
            B_Blend = _mm256_max_ps(_mm256_min_ps(One, B_Blend), ZeroReal);

            // NOTE(review): cvttps truncates toward zero, so values that land a
            // hair under an integer after the float round trip lose one level;
            // confirm whether rounding to nearest is wanted here.
            __m256i R_Out = _mm256_cvttps_epi32(_mm256_mul_ps(R_Blend, Real255));
            __m256i G_Out = _mm256_cvttps_epi32(_mm256_mul_ps(G_Blend, Real255));
            __m256i B_Out = _mm256_cvttps_epi32(_mm256_mul_ps(B_Blend, Real255));

            // Repack the four bytes of each lane in their original positions.
            __m256i OutputPixel = _mm256_or_si256(
                                  _mm256_or_si256(R_Out, _mm256_slli_epi32(G_Out, 8)),
                                  _mm256_or_si256(_mm256_slli_epi32(B_Out, 16), _mm256_slli_epi32(A_Out, 24)));

            _mm256_storeu_si256((__m256i *)Pixel, OutputPixel);
        }
    }
#endif
}