/* Force-inline helper; the x86 intrinsic headers and <stdint.h> are expected
 * to be provided by the including translation unit. */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif
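
/* Octo ACS butterfly:
 * Add-compare-select over eight 16-bit path metrics per register. M0 and M1
 * hold the two competing sets of accumulated metrics, M2 the branch metrics.
 * On exit M2 and M4 hold the selected (saturated) sums, while M3 and M1 hold
 * the path decisions, encoded as all-ones where the first candidate won. */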
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}
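
/* Two-lane deinterleave for K=5:
 * _I8_SHUFFLE_MASK is a byte-shuffle pattern that separates the even and odd
 * 16-bit elements within a register. SSE_DEINTERLEAVE_K5 applies it to the
 * two input registers M0:1 and regroups the 64-bit halves so that M2 ends up
 * with the even-indexed metrics and M3 with the odd-indexed ones. */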
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}
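
/* Two-lane deinterleave for K=7:
 * Same even/odd separation as above, widened to the eight registers M0:7
 * that hold all 64 path metrics; the results land in M8:15. M8 is clobbered
 * early on as the shuffle-mask register. */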
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
		M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}
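
/* Branch metrics, rate 1/2:
 * M4 carries the duplicated soft input symbols, M0:3 the per-transition
 * trellis output symbols. _mm_sign_epi16() applies each output's sign to the
 * inputs, and the horizontal saturating adds collapse every symbol pair into
 * one correlation value, leaving 16 branch metrics in M6:7. */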
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}
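
/* Branch metrics, rate 1/4:
 * Same correlation as above, but with four output symbols per branch two
 * rounds of horizontal adds reduce M0:3 down to eight metrics in M5. With
 * unused symbols zero-padded it can also serve lower rates such as 1/3. */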
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}
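
/* Horizontal minimum:
 * Reduce the eight 16-bit values in M0 to their minimum in the low word.
 * When SSE 4.1 is available at run time the dedicated PHMINPOSUW instruction
 * is used; otherwise a shuffle/min cascade does the reduction with M1 as
 * scratch. The sse41_supported flag is assumed to be a run-time CPU feature
 * variable defined elsewhere by the including code. */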
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif
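
/* Path metric normalization, K=5:
 * Subtract the smallest accumulated metric from all 16 metrics in M0:1 so
 * the saturating sums stay in range; M2:3 are scratch. SSE_BROADCAST(),
 * which splats the low 16-bit word across the register, is assumed to be
 * supplied by the including file. */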
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}
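
/* Path metric normalization, K=7:
 * Same idea over the 64 metrics in M0:7: a min-reduction tree feeds
 * SSE_MINPOS/SSE_BROADCAST and the broadcast minimum is subtracted from
 * every metric. M8:11 are scratch. */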
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}
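
/* Combined branch-metric (BMU) and path-metric (PMU) step, K=5 (16 states),
 * rate 1/2:
 * Correlates one pair of soft input symbols against the trellis outputs,
 * runs eight ACS butterflies over the accumulated metrics in sums[], writes
 * 16 path decisions to paths[] and stores the (optionally normalized)
 * metrics back to sums[] in place. (The _sse_metrics_* names used for these
 * four kernels are assumed.) */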
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load and duplicate the input symbols */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}
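
/* Combined BMU/PMU step, K=5, rate 1/4 (and zero-padded lower rates):
 * Identical structure to the rate 1/2 kernel above, but four output symbols
 * per branch are folded by SSE_BRANCH_METRIC_N4 before the butterflies. */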
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load and duplicate the input symbols */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}
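
/* Combined BMU/PMU step, K=7 (64 states), rate 1/2:
 * The 64 accumulated metrics occupy eight registers, so they are
 * deinterleaved first, the branch metrics are computed in two batches, and
 * 32 butterflies update sums[] and emit 64 decisions to paths[]. */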
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load and duplicate the input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}
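
/* Combined BMU/PMU step, K=7, rate 1/4 (and zero-padded lower rates):
 * Same flow as the K=7 rate 1/2 kernel, with four batches of trellis
 * outputs reduced by SSE_BRANCH_METRIC_N4. */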
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
		const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load and duplicate the input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}
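
/* Illustrative only: a minimal sketch of how a kernel of this shape could be
 * stepped across a frame. The stride of two soft symbols and 16 decisions per
 * step matches _sse_metrics_k5_n2() above; the normalization cadence and the
 * caller-side buffer handling are assumptions, not part of this file. Note
 * that out, sums and paths must be 16-byte aligned for the
 * _mm_load_si128()/_mm_store_si128() accesses. */
#if 0
static void example_k5_n2_forward(const int16_t *vals, const int16_t *out,
				  int16_t *sums, int16_t *paths, int len)
{
	int i;

	for (i = 0; i < len; i++, vals += 2, paths += 16) {
		/* Normalize every 32 steps (arbitrary choice for the example) */
		_sse_metrics_k5_n2(vals, out, sums, paths, !(i % 32));
	}
}
#endif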