// scale 'width' samples at p by the constant f, 8 floats per iteration
__m256 factor = _mm256_set1_ps(f);
for (; width > 0; width -= 8, p += 8)
{
  __m256 s = _mm256_load_ps(p);
  _mm256_store_ps(p, _mm256_mul_ps(factor, s));
}
// split 16 interleaved samples into the 8 even-position samples (dpl)
// and the 8 odd-position samples (dph)
for (; width > 0; width -= 16, sp += 16, dpl += 8, dph += 8)
{
  __m256 a = _mm256_load_ps(sp);
  __m256 b = _mm256_load_ps(sp + 8);
  // gather the low 128-bit lanes of a and b into c, the high lanes into d
  __m256 c = _mm256_permute2f128_ps(a, b, (2 << 4) | (0));
  __m256 d = _mm256_permute2f128_ps(a, b, (3 << 4) | (1));
  // pick the even-position samples into e and the odd-position samples into f
  __m256 e = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(2, 0, 2, 0));
  __m256 f = _mm256_shuffle_ps(c, d, _MM_SHUFFLE(3, 1, 3, 1));
  _mm256_store_ps(dpl, e);
  _mm256_store_ps(dph, f);
}
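// illustrative scalar equivalent of one iteration of the loop above
// (assuming 16 valid samples at sp):
//   for (int k = 0; k < 8; ++k) {
//     dpl[k] = sp[2 * k];      // even positions
//     dph[k] = sp[2 * k + 1];  // odd positions
//   }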
// interleave 8 samples from spl (even positions) and 8 samples from sph
// (odd positions) into 16 output samples
for (; width > 0; width -= 16, dp += 16, spl += 8, sph += 8)
{
  __m256 a = _mm256_load_ps(spl);
  __m256 b = _mm256_load_ps(sph);
  // interleave pairs within each 128-bit lane
  __m256 c = _mm256_unpacklo_ps(a, b);
  __m256 d = _mm256_unpackhi_ps(a, b);
  // reorder the lanes so the 16 samples come out in order
  __m256 e = _mm256_permute2f128_ps(c, d, (2 << 4) | (0));
  __m256 f = _mm256_permute2f128_ps(c, d, (3 << 4) | (1));
  _mm256_store_ps(dp, e);
  _mm256_store_ps(dp + 8, f);
}
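// illustrative scalar equivalent of one iteration of the loop above
// (assuming 8 valid samples in each of spl and sph):
//   for (int k = 0; k < 8; ++k) {
//     dp[2 * k]     = spl[k];  // even positions
//     dp[2 * k + 1] = sph[k];  // odd positions
//   }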
ui32 repeat, bool synthesis)
__m256 factor = _mm256_set1_ps(a);   // lifting coefficient of this step

float* dst = aug->f32;
const float* src1 = sig->f32, * src2 = other->f32;
int i = (int)repeat;                 // number of samples to process
for ( ; i > 0; i -= 8, dst += 8, src1 += 8, src2 += 8)
{
  __m256 s1 = _mm256_load_ps(src1);
  __m256 s2 = _mm256_load_ps(src2);
  __m256 d = _mm256_load_ps(dst);
  // lifting update: dst[i] += a * (src1[i] + src2[i])
  d = _mm256_add_ps(d, _mm256_mul_ps(factor, _mm256_add_ps(s1, s2)));
  _mm256_store_ps(dst, d);
}
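// vertical lifting: the samples of 'aug' are updated in place from the
// co-located samples of the neighbouring lines 'sig' and 'other', i.e.
// aug[i] += a * (sig[i] + other[i]); the 'synthesis' flag selects the sign
// of the coefficient so the same loop also serves as the inverse update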
ui32 width, bool even)
// dpl receives the even-position samples and dph the odd-position samples;
// which band buffer each points at depends on whether the line starts at an
// even coordinate
float* dpl = even ? ldst->f32 : hdst->f32;
float* dph = even ? hdst->f32 : ldst->f32;
float* sp = src->f32;

// once the source line has been split into the two bands (the de-interleave
// loop above), the lifting steps run on the separated buffers
float* hp = hdst->f32, * lp = ldst->f32;
ui32 l_width = (width + (even ? 1 : 0)) >> 1;   // number of low-pass samples
ui32 h_width = (width + (even ? 0 : 1)) >> 1;   // number of high-pass samples
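// e.g. width = 9 with even = true gives l_width = 5 and h_width = 4,
// matching the 5 even and 4 odd positions of a 9-sample line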
// apply the kernel's lifting steps, one pass per step
for (ui32 j = num_steps; j > 0; --j)
{
  // boundary extension: replicate the edge samples so the lifting loop can
  // read one sample before and after the band
  lp[-1] = lp[0];
  lp[l_width] = lp[l_width - 1];
  const float* sp = lp;   // band supplying neighbour samples
  float* dp = hp;         // band updated in place
  int i = (int)h_width;
  __m256 f = _mm256_set1_ps(a);   // lifting coefficient of this step
  if (even)
  {
    // the neighbours of dp[i] are sp[i] and sp[i + 1]
    for (; i > 0; i -= 8, sp += 8, dp += 8)
    {
      __m256 m = _mm256_load_ps(sp);
      __m256 n = _mm256_loadu_ps(sp + 1);
      __m256 p = _mm256_load_ps(dp);
      p = _mm256_add_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
      _mm256_store_ps(dp, p);
    }
  }
  else
  {
    // the neighbours of dp[i] are sp[i - 1] and sp[i]
    for (; i > 0; i -= 8, sp += 8, dp += 8)
    {
      __m256 m = _mm256_load_ps(sp);
      __m256 n = _mm256_loadu_ps(sp - 1);
      __m256 p = _mm256_load_ps(dp);
      p = _mm256_add_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
      _mm256_store_ps(dp, p);
    }
  }
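  // in both phases the centre sample sp[i] is loaded aligned, while the
  // shifted neighbour (sp[i + 1] or sp[i - 1]) needs _mm256_loadu_ps, since a
  // one-float offset from a 32-byte aligned buffer is unaligned; the loops
  // also round the trip count up to a multiple of 8, which assumes the line
  // buffers carry enough padding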
  // swap the two bands (and the parity) for the next lifting step
  float* t = lp; lp = hp; hp = t;
  even = !even;
  ui32 w = l_width; l_width = h_width; h_width = w;
}
float K = atk->get_K();
float K_inv = 1.0f / K;
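// K and its reciprocal are the final subband gain factors: one band is scaled
// by K and the other by 1/K at the end of the analysis, while the synthesis
// path applies the reciprocal gains before its lifting steps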
// single-sample line: no lifting is needed
if (even)
  ldst->f32[0] = src->f32[0];
else
  hdst->f32[0] = src->f32[0] * 2.0f;
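// a lone sample that falls in the high-pass band is doubled here; the
// synthesis special case performs the matching halving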
ui32 width, bool even)
float* oth = hsrc->f32, * aug = lsrc->f32;
ui32 aug_width = (width + (even ? 1 : 0)) >> 1;   // number of low-pass samples
ui32 oth_width = (width + (even ? 0 : 1)) >> 1;   // number of high-pass samples
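// 'aug' is the band whose samples get augmented (updated in place) by the
// current lifting step; 'oth' only supplies neighbour samples; the two roles
// swap after every step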
float K = atk->get_K();
float K_inv = 1.0f / K;
// apply the inverse lifting steps, one pass per step
for (ui32 j = 0; j < num_steps; ++j)
{
  // boundary extension: replicate the edge samples of the neighbour band
  oth[-1] = oth[0];
  oth[oth_width] = oth[oth_width - 1];
  const float* sp = oth;   // band supplying neighbour samples
  float* dp = aug;         // band updated in place
  int i = (int)aug_width;
  __m256 f = _mm256_set1_ps(a);   // lifting coefficient of this step
  if (even)
  {
    // the neighbours of dp[i] are sp[i - 1] and sp[i]
    for (; i > 0; i -= 8, sp += 8, dp += 8)
    {
      __m256 m = _mm256_load_ps(sp);
      __m256 n = _mm256_loadu_ps(sp - 1);
      __m256 p = _mm256_load_ps(dp);
      p = _mm256_sub_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
      _mm256_store_ps(dp, p);
    }
  }
  else
  {
    // the neighbours of dp[i] are sp[i] and sp[i + 1]
    for (; i > 0; i -= 8, sp += 8, dp += 8)
    {
      __m256 m = _mm256_load_ps(sp);
      __m256 n = _mm256_loadu_ps(sp + 1);
      __m256 p = _mm256_load_ps(dp);
      p = _mm256_sub_ps(p, _mm256_mul_ps(f, _mm256_add_ps(m, n)));
      _mm256_store_ps(dp, p);
    }
  }
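  // each synthesis step subtracts exactly what the corresponding analysis
  // step added (dp[i] -= f * neighbour sum), and the steps run in the
  // opposite order, so the original samples are reconstructed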
  // swap the two bands (and the parity) for the next lifting step
  float* t = aug; aug = oth; oth = t;
  even = !even;
  ui32 w = aug_width; aug_width = oth_width; oth_width = w;
}
// interleave the two bands back into the destination line (the interleave
// loop above); spl feeds the even output positions and sph the odd ones
float* dp = dst->f32;
float* spl = even ? lsrc->f32 : hsrc->f32;
float* sph = even ? hsrc->f32 : lsrc->f32;
// single-sample line: copy the lone coefficient back to the output
if (even)
  dst->f32[0] = lsrc->f32[0];
else
  dst->f32[0] = hsrc->f32[0] * 0.5f;
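// the 0.5f undoes the 2.0f applied to a lone high-pass sample during
// analysis, so a one-sample line reconstructs exactly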