#include <stdint.h>
#include <tmmintrin.h>  /* SSSE3: _mm_sign_epi16(), _mm_hadds_epi16() */

/* K = 5, rate 1/2 codes: 16 states, one SSE butterfly pass per symbol */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6;

        /* (BMU) Load and replicate the two input soft symbols */
        m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);

        /* (BMU) Compute branch metrics */
        m0 = _mm_sign_epi16(m2, m0);
        m1 = _mm_sign_epi16(m2, m1);
        m2 = _mm_hadds_epi16(m0, m1);

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        SSE_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                SSE_NORMALIZE_K5(m2, m6, m0, m1, m3, m4, m5)

        _mm_store_si128((__m128i *) &sums[0], m2);
        _mm_store_si128((__m128i *) &sums[8], m6);
        _mm_store_si128((__m128i *) &paths[0], m5);
        _mm_store_si128((__m128i *) &paths[8], m4);
}
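/*
 * Illustrative scalar sketch (an addition, not part of the original
 * header): the _mm_sign_epi16()/_mm_hadds_epi16() pair above computes,
 * for each trellis branch, the saturating correlation of the received
 * soft symbols in 'val' with the expected output symbols in 'out'. The
 * hypothetical helper below performs the same arithmetic for a single
 * rate-1/2 branch.
 */
static int16_t scalar_branch_metric_n2(const int16_t *val, const int16_t *out)
{
        int32_t sum = 0;
        int i;

        for (i = 0; i < 2; i++) {
                /* _mm_sign_epi16(): negate, zero, or keep 'val' based
                 * on the sign of the expected symbol */
                if (out[i] < 0)
                        sum -= val[i];
                else if (out[i] > 0)
                        sum += val[i];
        }

        /* _mm_hadds_epi16() saturates the pairwise sum to int16_t */
        if (sum > INT16_MAX)
                sum = INT16_MAX;
        else if (sum < INT16_MIN)
                sum = INT16_MIN;
        return (int16_t) sum;
}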
/* K = 5, n = 4 codes: 16 states, four output vectors per symbol */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6;

        /* (BMU) Load and replicate the four input soft symbols */
        m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs and compute branch metrics */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);
        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        SSE_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                SSE_NORMALIZE_K5(m2, m6, m0, m1, m3, m4, m5)

        _mm_store_si128((__m128i *) &sums[0], m2);
        _mm_store_si128((__m128i *) &sums[8], m6);
        _mm_store_si128((__m128i *) &paths[0], m5);
        _mm_store_si128((__m128i *) &paths[8], m4);
}
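/*
 * Buffer alignment note (an assumption, not from the original header):
 * _mm_load_si128()/_mm_store_si128() fault on pointers that are not
 * 16-byte aligned, so 'out', 'sums' and 'paths' must come from an
 * aligned allocator. A minimal C11 sketch with a hypothetical helper:
 */
#include <stdlib.h>

static int16_t *alloc_metric_buf(size_t nelem)
{
        /* aligned_alloc() requires the size to be a multiple of the
         * alignment; round up to a whole 16-byte vector */
        size_t bytes = (nelem * sizeof(int16_t) + 15) & ~(size_t) 15;

        return (int16_t *) aligned_alloc(16, bytes);
}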
/* K = 7, rate 1/2 codes: 64 states, 32 butterflies per symbol */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
                m9, m10, m11, m12, m13, m14, m15;

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        m2 = _mm_load_si128((__m128i *) &sums[16]);
        m3 = _mm_load_si128((__m128i *) &sums[24]);
        m4 = _mm_load_si128((__m128i *) &sums[32]);
        m5 = _mm_load_si128((__m128i *) &sums[40]);
        m6 = _mm_load_si128((__m128i *) &sums[48]);
        m7 = _mm_load_si128((__m128i *) &sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
                        m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load and replicate the input soft symbols */
        m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs and compute branch metrics */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);
        SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

        m0 = _mm_load_si128((__m128i *) &out[32]);
        m1 = _mm_load_si128((__m128i *) &out[40]);
        m2 = _mm_load_si128((__m128i *) &out[48]);
        m3 = _mm_load_si128((__m128i *) &out[56]);
        SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

        /* (PMU) Butterflies: 0-15 */
        SSE_BUTTERFLY(m8, m9, m4, m0, m1)
        SSE_BUTTERFLY(m10, m11, m5, m2, m3)

        _mm_store_si128((__m128i *) &paths[0], m0);
        _mm_store_si128((__m128i *) &paths[8], m2);
        _mm_store_si128((__m128i *) &paths[32], m9);
        _mm_store_si128((__m128i *) &paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        SSE_BUTTERFLY(m12, m13, m6, m0, m2)
        SSE_BUTTERFLY(m14, m15, m7, m9, m11)

        _mm_store_si128((__m128i *) &paths[16], m0);
        _mm_store_si128((__m128i *) &paths[24], m9);
        _mm_store_si128((__m128i *) &paths[48], m13);
        _mm_store_si128((__m128i *) &paths[56], m15);

        if (norm)
                SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
                                m7, m11, m0, m8, m9, m10)

        _mm_store_si128((__m128i *) &sums[0], m4);
        _mm_store_si128((__m128i *) &sums[8], m5);
        _mm_store_si128((__m128i *) &sums[16], m6);
        _mm_store_si128((__m128i *) &sums[24], m7);
        _mm_store_si128((__m128i *) &sums[32], m1);
        _mm_store_si128((__m128i *) &sums[40], m3);
        _mm_store_si128((__m128i *) &sums[48], m2);
        _mm_store_si128((__m128i *) &sums[56], m11);
}
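/*
 * Illustrative scalar sketch (an addition, not part of the original
 * header): each SSE_BUTTERFLY() above runs eight add-compare-select
 * steps in parallel. One scalar step looks roughly like this, ignoring
 * the saturating arithmetic of _mm_adds_epi16()/_mm_subs_epi16(); 'bm'
 * is the branch metric and 'd' receives the path decision consumed by
 * the traceback stage.
 */
static int16_t scalar_acs(int16_t sum0, int16_t sum1, int16_t bm, int16_t *d)
{
        int32_t hi = sum0 + bm;        /* upper butterfly arm */
        int32_t lo = sum1 - bm;        /* lower butterfly arm */

        /* All-ones mask mirrors the _mm_cmpgt/_mm_cmpeq decision mask */
        *d = (hi >= lo) ? -1 : 0;
        return (int16_t) ((hi >= lo) ? hi : lo);
}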
/* K = 7, n = 4 codes: 64 states, four branch metric groups per symbol */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6, m7;
        __m128i m8, m9, m10, m11, m12, m13, m14, m15;

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        m2 = _mm_load_si128((__m128i *) &sums[16]);
        m3 = _mm_load_si128((__m128i *) &sums[24]);
        m4 = _mm_load_si128((__m128i *) &sums[32]);
        m5 = _mm_load_si128((__m128i *) &sums[40]);
        m6 = _mm_load_si128((__m128i *) &sums[48]);
        m7 = _mm_load_si128((__m128i *) &sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
                        m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load and replicate the input soft symbols */
        m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs and compute branch metrics */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);
        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

        m0 = _mm_load_si128((__m128i *) &out[32]);
        m1 = _mm_load_si128((__m128i *) &out[40]);
        m2 = _mm_load_si128((__m128i *) &out[48]);
        m3 = _mm_load_si128((__m128i *) &out[56]);
        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

        m0 = _mm_load_si128((__m128i *) &out[64]);
        m1 = _mm_load_si128((__m128i *) &out[72]);
        m2 = _mm_load_si128((__m128i *) &out[80]);
        m3 = _mm_load_si128((__m128i *) &out[88]);
        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

        m0 = _mm_load_si128((__m128i *) &out[96]);
        m1 = _mm_load_si128((__m128i *) &out[104]);
        m2 = _mm_load_si128((__m128i *) &out[112]);
        m3 = _mm_load_si128((__m128i *) &out[120]);
        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

        /* (PMU) Butterflies: 0-15 */
        SSE_BUTTERFLY(m8, m9, m4, m0, m1)
        SSE_BUTTERFLY(m10, m11, m5, m2, m3)

        _mm_store_si128((__m128i *) &paths[0], m0);
        _mm_store_si128((__m128i *) &paths[8], m2);
        _mm_store_si128((__m128i *) &paths[32], m9);
        _mm_store_si128((__m128i *) &paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        SSE_BUTTERFLY(m12, m13, m6, m0, m2)
        SSE_BUTTERFLY(m14, m15, m7, m9, m11)

        _mm_store_si128((__m128i *) &paths[16], m0);
        _mm_store_si128((__m128i *) &paths[24], m9);
        _mm_store_si128((__m128i *) &paths[48], m13);
        _mm_store_si128((__m128i *) &paths[56], m15);

        if (norm)
                SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
                                m7, m11, m0, m8, m9, m10)

        _mm_store_si128((__m128i *) &sums[0], m4);
        _mm_store_si128((__m128i *) &sums[8], m5);
        _mm_store_si128((__m128i *) &sums[16], m6);
        _mm_store_si128((__m128i *) &sums[24], m7);
        _mm_store_si128((__m128i *) &sums[32], m1);
        _mm_store_si128((__m128i *) &sums[40], m3);
        _mm_store_si128((__m128i *) &sums[48], m2);
        _mm_store_si128((__m128i *) &sums[56], m11);
}
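/*
 * Hypothetical usage sketch (an assumption, not from the original
 * header): a Viterbi forward pass would invoke one of these kernels
 * once per received symbol, advancing the soft input by n values and
 * the decision buffer by one 64-state trellis column. The driver name
 * and buffer layout here are illustrative only.
 */
static void forward_pass_k7_n2(const int16_t *in, int nsyms,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        int i;

        /* Each call consumes n = 2 soft values and writes one column
         * of 64 path decisions */
        for (i = 0; i < nsyms; i++)
                _sse_metrics_k7_n2(&in[2 * i], out, sums,
                                   &paths[64 * i], norm);
}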