libosmocore  1.10.0.24-6791b.202410232026
Osmocom core library
conv_acc_sse_impl.h

/*
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * SPDX-License-Identifier: GPL-2.0+
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

/* Some distributions (notably Alpine Linux) for some strange reason
 * don't have this #define */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

extern int sse41_supported;

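The flag is defined by the translation unit that includes this header. As a minimal sketch of how it could be populated at startup, assuming a GCC/Clang toolchain (the detect_sse41() helper is hypothetical and not the detection logic libosmocore itself uses):

/* Sketch of a possible definition in the including .c file (illustrative
 * only; libosmocore performs its own CPU detection elsewhere). */
int sse41_supported = 0;

static void detect_sse41(void)
{
        /* GCC/Clang builtin: queries CPUID for SSE 4.1 support at run time */
        sse41_supported = __builtin_cpu_supports("sse4.1");
}

The fallback shuffle/min path of SSE_MINPOS below is taken whenever the flag is zero.
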
/* Octo-Viterbi butterfly
 * Compute an 8-wide butterfly generating 16 path decisions and 16 accumulated
 * sums. Inputs are packed 16-bit integers in three 128-bit XMM registers.
 * Two intermediate registers are used and results are set in the upper 4
 * registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
        M3 = _mm_adds_epi16(M0, M2); \
        M4 = _mm_subs_epi16(M1, M2); \
        M0 = _mm_subs_epi16(M0, M2); \
        M1 = _mm_adds_epi16(M1, M2); \
        M2 = _mm_max_epi16(M3, M4); \
        M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
        M4 = _mm_max_epi16(M0, M1); \
        M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}

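For reference, the add-compare-select step performed per lane can be written in scalar C as below. This is an illustrative sketch only: saturating arithmetic is omitted and the helper name acs_lane is not part of the library.

/* Scalar sketch of one butterfly lane: two candidate sums per successor
 * state, keep the larger one and record which branch won as an all-ones
 * mask, matching the cmpgt/cmpeq combination above. */
static inline void acs_lane(int16_t m0, int16_t m1, int16_t bm,
                            int16_t *new0, int16_t *sel0,
                            int16_t *new1, int16_t *sel1)
{
        int16_t a = m0 + bm, b = m1 - bm;       /* candidates for successor 0 */
        int16_t c = m0 - bm, d = m1 + bm;       /* candidates for successor 1 */

        *new0 = (a >= b) ? a : b;
        *sel0 = (a >= b) ? -1 : 0;
        *new1 = (c >= d) ? c : d;
        *sel1 = (c >= d) ? -1 : 0;
}
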
/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave them into 2 packed
 * 128-bit registers. The operation is summarized below. Four registers are
 * used, with the lower 2 as input and the upper 2 as output.
 *
 * In  - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
        M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
        M0 = _mm_shuffle_epi8(M0, M2); \
        M1 = _mm_shuffle_epi8(M1, M2); \
        M2 = _mm_unpacklo_epi64(M0, M1); \
        M3 = _mm_unpackhi_epi64(M0, M1); \
}

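Viewed element-wise, the byte shuffle and 64-bit unpack steps amount to the following scalar permutation. The sketch is illustrative only; the in/even/odd arrays stand for the two input and two output registers.

/* Scalar sketch: split 16 interleaved metrics into even and odd lanes. */
static void deinterleave_16(const int16_t in[16], int16_t even[8], int16_t odd[8])
{
        for (int i = 0; i < 8; i++) {
                even[i] = in[2 * i];            /* ends up in M2 */
                odd[i]  = in[2 * i + 1];        /* ends up in M3 */
        }
}
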
/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave them into 8 packed
 * 128-bit registers. The operation is summarized below. 16 registers are
 * used, with the lower 8 as input and the upper 8 as output.
 *
 * In  - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
                            M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
        M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
        M0 = _mm_shuffle_epi8(M0, M8); \
        M1 = _mm_shuffle_epi8(M1, M8); \
        M2 = _mm_shuffle_epi8(M2, M8); \
        M3 = _mm_shuffle_epi8(M3, M8); \
        M4 = _mm_shuffle_epi8(M4, M8); \
        M5 = _mm_shuffle_epi8(M5, M8); \
        M6 = _mm_shuffle_epi8(M6, M8); \
        M7 = _mm_shuffle_epi8(M7, M8); \
        M8 = _mm_unpacklo_epi64(M0, M1); \
        M9 = _mm_unpackhi_epi64(M0, M1); \
        M10 = _mm_unpacklo_epi64(M2, M3); \
        M11 = _mm_unpackhi_epi64(M2, M3); \
        M12 = _mm_unpacklo_epi64(M4, M5); \
        M13 = _mm_unpackhi_epi64(M4, M5); \
        M14 = _mm_unpacklo_epi64(M6, M7); \
        M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
        M0 = _mm_sign_epi16(M4, M0); \
        M1 = _mm_sign_epi16(M4, M1); \
        M2 = _mm_sign_epi16(M4, M2); \
        M3 = _mm_sign_epi16(M4, M3); \
        M6 = _mm_hadds_epi16(M0, M1); \
        M7 = _mm_hadds_epi16(M2, M3); \
}

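Per output pair, the sign and horizontal add steps compute a correlation of the received soft values with the expected encoder outputs. A scalar sketch of a single metric follows (saturation omitted; the names are illustrative and not part of the library):

/* Scalar sketch: branch metric for N = 2 as a signed correlation. A zero
 * trellis output contributes nothing, matching _mm_sign_epi16 semantics. */
static int16_t branch_metric_n2(const int16_t expected[2], const int16_t recv[2])
{
        int16_t bm = 0;
        int n;

        for (n = 0; n < 2; n++) {
                if (expected[n] > 0)
                        bm += recv[n];
                else if (expected[n] < 0)
                        bm -= recv[n];
        }
        return bm;
}
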
/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is reused for N less than 4 where the extra soft input bits are
 * padded.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M5 - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
        M0 = _mm_sign_epi16(M4, M0); \
        M1 = _mm_sign_epi16(M4, M1); \
        M2 = _mm_sign_epi16(M4, M2); \
        M3 = _mm_sign_epi16(M4, M3); \
        M0 = _mm_hadds_epi16(M0, M1); \
        M1 = _mm_hadds_epi16(M2, M3); \
        M5 = _mm_hadds_epi16(M0, M1); \
}

/* Horizontal minimum
 * Compute horizontal minimum of packed unsigned 16-bit integers and place
 * result in the low 16-bit element of the source register. Only SSE 4.1
 * has a dedicated minpos instruction. One intermediate register is used
 * if SSE 4.1 is not available. This is a destructive operation and the
 * source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
        if (sse41_supported) { \
                M0 = _mm_minpos_epu16(M0); \
        } else { \
                M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
                M0 = _mm_min_epi16(M0, M1); \
                M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
                M0 = _mm_min_epi16(M0, M1); \
                M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
                M0 = _mm_min_epi16(M0, M1); \
        } \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
        M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
        M0 = _mm_min_epi16(M0, M1); \
        M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
        M0 = _mm_min_epi16(M0, M1); \
        M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
        M0 = _mm_min_epi16(M0, M1); \
}
#endif

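The non-SSE4.1 path folds the register in halves of 4, 2 and 1 elements using three shuffle/min pairs. The same reduction in scalar form, for reference only (the helper name is illustrative):

/* Scalar sketch of the shuffle/min reduction: after three folding steps
 * the minimum of all eight metrics sits in element 0. The input array is
 * overwritten, mirroring the destructive register operation. */
static int16_t hmin8(int16_t v[8])
{
        int step, i;

        for (step = 4; step > 0; step >>= 1)
                for (i = 0; i < step; i++)
                        if (v[i + step] < v[i])
                                v[i] = v[i + step];
        return v[0];
}
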
/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
        M2 = _mm_min_epi16(M0, M1); \
        SSE_MINPOS(M2, M3) \
        SSE_BROADCAST(M2) \
        M0 = _mm_subs_epi16(M0, M2); \
        M1 = _mm_subs_epi16(M1, M2); \
}

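SSE_BROADCAST is not defined in this header; the including source file is expected to provide it so that the minimum found by SSE_MINPOS is replicated across all eight 16-bit lanes before the subtraction. One possible definition is sketched below, purely as an assumption about what such a macro could look like, not necessarily the one libosmocore uses:

/* Possible broadcast of the low 16-bit lane to all lanes (illustrative). */
#ifndef SSE_BROADCAST
#define SSE_BROADCAST(M0) \
{ \
        M0 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 0)); \
        M0 = _mm_unpacklo_epi64(M0, M0); \
}
#endif
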
/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
        M8 = _mm_min_epi16(M0, M1); \
        M9 = _mm_min_epi16(M2, M3); \
        M10 = _mm_min_epi16(M4, M5); \
        M11 = _mm_min_epi16(M6, M7); \
        M8 = _mm_min_epi16(M8, M9); \
        M10 = _mm_min_epi16(M10, M11); \
        M8 = _mm_min_epi16(M8, M10); \
        SSE_MINPOS(M8, M9) \
        SSE_BROADCAST(M8) \
        M0 = _mm_subs_epi16(M0, M8); \
        M1 = _mm_subs_epi16(M1, M8); \
        M2 = _mm_subs_epi16(M2, M8); \
        M3 = _mm_subs_epi16(M3, M8); \
        M4 = _mm_subs_epi16(M4, M8); \
        M5 = _mm_subs_epi16(M5, M8); \
        M6 = _mm_subs_epi16(M6, M8); \
        M7 = _mm_subs_epi16(M7, M8); \
}

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate 16-state
 * trellis. 8 butterflies are computed. Accumulated path sums are not
 * preserved; they are read from and written to the same memory location.
 * Sums are normalized if requested.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6;

        /* (BMU) Load input sequence */
        m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);

        /* (BMU) Compute branch metrics */
        m0 = _mm_sign_epi16(m2, m0);
        m1 = _mm_sign_epi16(m2, m1);
        m2 = _mm_hadds_epi16(m0, m1);

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);

        SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        SSE_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                SSE_NORMALIZE_K5(m2, m6, m0, m1)

        _mm_store_si128((__m128i *) &sums[0], m2);
        _mm_store_si128((__m128i *) &sums[8], m6);
        _mm_store_si128((__m128i *) &paths[0], m5);
        _mm_store_si128((__m128i *) &paths[8], m4);
}

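The trellis output, sums and path buffers are accessed with aligned 128-bit loads and stores, so callers must hand in 16-byte aligned memory. A hedged usage sketch for a single trellis step follows; the buffer names, sizes and contents are assumptions for illustration, not the driver code used elsewhere in libosmocore:

/* Illustrative single invocation of the K=5, N=2 kernel. */
static void example_step_k5_n2(const int16_t *trellis_out)
{
        /* Two received soft symbols for this step; the kernel loads four
         * 16-bit values, so the buffer is padded with zeros. */
        int16_t val[4] __attribute__((aligned(16))) = { 10, -12, 0, 0 };
        int16_t sums[16] __attribute__((aligned(16))) = { 0 };  /* path metrics */
        int16_t paths[16] __attribute__((aligned(16)));         /* decisions */

        /* trellis_out is assumed to point at 16 aligned output values;
         * the last argument requests normalization of the updated sums. */
        _sse_metrics_k5_n2(val, trellis_out, sums, paths, 1);
}
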
/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is read
 * four 16-bit values at a time, and extra values should be set to zero for
 * rates other than 1/4. Normally only rates 1/3 and 1/4 are used, as there
 * is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6;

        /* (BMU) Load input sequence */
        m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);

        SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        SSE_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                SSE_NORMALIZE_K5(m2, m6, m0, m1)

        _mm_store_si128((__m128i *) &sums[0], m2);
        _mm_store_si128((__m128i *) &sums[8], m6);
        _mm_store_si128((__m128i *) &paths[0], m5);
        _mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate 64-state
 * trellis. 32 butterfly operations are computed. Deinterleaving the path
 * metrics requires use of the full SSE register file, so the sums are
 * separated before the branch metrics are computed to avoid register
 * spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
                m9, m10, m11, m12, m13, m14, m15;

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        m2 = _mm_load_si128((__m128i *) &sums[16]);
        m3 = _mm_load_si128((__m128i *) &sums[24]);
        m4 = _mm_load_si128((__m128i *) &sums[32]);
        m5 = _mm_load_si128((__m128i *) &sums[40]);
        m6 = _mm_load_si128((__m128i *) &sums[48]);
        m7 = _mm_load_si128((__m128i *) &sums[56]);

        /* (PMU) Deinterleave to even-odd registers */
        SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
                            m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load input symbols */
        m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load trellis outputs */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

        m0 = _mm_load_si128((__m128i *) &out[32]);
        m1 = _mm_load_si128((__m128i *) &out[40]);
        m2 = _mm_load_si128((__m128i *) &out[48]);
        m3 = _mm_load_si128((__m128i *) &out[56]);

        SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

        /* (PMU) Butterflies: 0-15 */
        SSE_BUTTERFLY(m8, m9, m4, m0, m1)
        SSE_BUTTERFLY(m10, m11, m5, m2, m3)

        _mm_store_si128((__m128i *) &paths[0], m0);
        _mm_store_si128((__m128i *) &paths[8], m2);
        _mm_store_si128((__m128i *) &paths[32], m9);
        _mm_store_si128((__m128i *) &paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        SSE_BUTTERFLY(m12, m13, m6, m0, m2)
        SSE_BUTTERFLY(m14, m15, m7, m9, m11)

        _mm_store_si128((__m128i *) &paths[16], m0);
        _mm_store_si128((__m128i *) &paths[24], m9);
        _mm_store_si128((__m128i *) &paths[48], m13);
        _mm_store_si128((__m128i *) &paths[56], m15);

        if (norm)
                SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
                                 m7, m11, m0, m8, m9, m10)

        _mm_store_si128((__m128i *) &sums[0], m4);
        _mm_store_si128((__m128i *) &sums[8], m5);
        _mm_store_si128((__m128i *) &sums[16], m6);
        _mm_store_si128((__m128i *) &sums[24], m7);
        _mm_store_si128((__m128i *) &sums[32], m1);
        _mm_store_si128((__m128i *) &sums[40], m3);
        _mm_store_si128((__m128i *) &sums[48], m2);
        _mm_store_si128((__m128i *) &sums[56], m11);
}

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. Path metrics are
 * deinterleaved before the branch metrics are computed, as in the half rate
 * case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
        const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
        __m128i m0, m1, m2, m3, m4, m5, m6, m7;
        __m128i m8, m9, m10, m11, m12, m13, m14, m15;

        /* (PMU) Load accumulated path metrics */
        m0 = _mm_load_si128((__m128i *) &sums[0]);
        m1 = _mm_load_si128((__m128i *) &sums[8]);
        m2 = _mm_load_si128((__m128i *) &sums[16]);
        m3 = _mm_load_si128((__m128i *) &sums[24]);
        m4 = _mm_load_si128((__m128i *) &sums[32]);
        m5 = _mm_load_si128((__m128i *) &sums[40]);
        m6 = _mm_load_si128((__m128i *) &sums[48]);
        m7 = _mm_load_si128((__m128i *) &sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
                            m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load input sequence */
        m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

        /* (BMU) Load and compute branch metrics */
        m0 = _mm_load_si128((__m128i *) &out[0]);
        m1 = _mm_load_si128((__m128i *) &out[8]);
        m2 = _mm_load_si128((__m128i *) &out[16]);
        m3 = _mm_load_si128((__m128i *) &out[24]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

        m0 = _mm_load_si128((__m128i *) &out[32]);
        m1 = _mm_load_si128((__m128i *) &out[40]);
        m2 = _mm_load_si128((__m128i *) &out[48]);
        m3 = _mm_load_si128((__m128i *) &out[56]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

        m0 = _mm_load_si128((__m128i *) &out[64]);
        m1 = _mm_load_si128((__m128i *) &out[72]);
        m2 = _mm_load_si128((__m128i *) &out[80]);
        m3 = _mm_load_si128((__m128i *) &out[88]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

        m0 = _mm_load_si128((__m128i *) &out[96]);
        m1 = _mm_load_si128((__m128i *) &out[104]);
        m2 = _mm_load_si128((__m128i *) &out[112]);
        m3 = _mm_load_si128((__m128i *) &out[120]);

        SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

        /* (PMU) Butterflies: 0-15 */
        SSE_BUTTERFLY(m8, m9, m4, m0, m1)
        SSE_BUTTERFLY(m10, m11, m5, m2, m3)

        _mm_store_si128((__m128i *) &paths[0], m0);
        _mm_store_si128((__m128i *) &paths[8], m2);
        _mm_store_si128((__m128i *) &paths[32], m9);
        _mm_store_si128((__m128i *) &paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        SSE_BUTTERFLY(m12, m13, m6, m0, m2)
        SSE_BUTTERFLY(m14, m15, m7, m9, m11)

        _mm_store_si128((__m128i *) &paths[16], m0);
        _mm_store_si128((__m128i *) &paths[24], m9);
        _mm_store_si128((__m128i *) &paths[48], m13);
        _mm_store_si128((__m128i *) &paths[56], m15);

        if (norm)
                SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
                                 m7, m11, m0, m8, m9, m10)

        _mm_store_si128((__m128i *) &sums[0], m4);
        _mm_store_si128((__m128i *) &sums[8], m5);
        _mm_store_si128((__m128i *) &sums[16], m6);
        _mm_store_si128((__m128i *) &sums[24], m7);
        _mm_store_si128((__m128i *) &sums[32], m1);
        _mm_store_si128((__m128i *) &sums[40], m3);
        _mm_store_si128((__m128i *) &sums[48], m2);
        _mm_store_si128((__m128i *) &sums[56], m11);
}