libosmocore  1.5.1
Osmocom core library
conv_acc_sse_impl.h
/*
 * Copyright (C) 2013, 2014 Thomas Tsou <tom@tsou.cc>
 *
 * All Rights Reserved
 *
 * SPDX-License-Identifier: GPL-2.0+
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

/* Some distributions (notably Alpine Linux) for some strange reason
 * don't have this #define */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

extern int sse41_supported;

/* Octo-Viterbi butterfly
 * Compute 8-wide butterfly generating 16 path decisions and 16 accumulated
 * sums. Inputs are packed 16-bit integers in three 128-bit XMM registers.
 * Two intermediate registers are used and results are set in the upper 4
 * registers.
 *
 * Input:
 * M0 - Path metrics 0 (packed 16-bit integers)
 * M1 - Path metrics 1 (packed 16-bit integers)
 * M2 - Branch metrics (packed 16-bit integers)
 *
 * Output:
 * M2 - Selected and accumulated path metrics 0
 * M4 - Selected and accumulated path metrics 1
 * M3 - Path selections 0
 * M1 - Path selections 1
 */
#define SSE_BUTTERFLY(M0, M1, M2, M3, M4) \
{ \
	M3 = _mm_adds_epi16(M0, M2); \
	M4 = _mm_subs_epi16(M1, M2); \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_adds_epi16(M1, M2); \
	M2 = _mm_max_epi16(M3, M4); \
	M3 = _mm_or_si128(_mm_cmpgt_epi16(M3, M4), _mm_cmpeq_epi16(M3, M4)); \
	M4 = _mm_max_epi16(M0, M1); \
	M1 = _mm_or_si128(_mm_cmpgt_epi16(M0, M1), _mm_cmpeq_epi16(M0, M1)); \
}
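
/* Illustrative sketch, not part of libosmocore: the scalar equivalent of a
 * single butterfly lane, where the branch metric is added to one
 * predecessor metric and subtracted from the other, as in the macro above.
 * Decisions are recorded as all-ones/all-zeros values, mirroring the
 * cmpgt/cmpeq combination; saturation is omitted for brevity. */
static void scalar_butterfly_example(int16_t pm0, int16_t pm1, int16_t bm,
				     int16_t *new_pm0, int16_t *new_pm1,
				     int16_t *dec0, int16_t *dec1)
{
	int16_t a = pm0 + bm;	/* candidates for first survivor  */
	int16_t b = pm1 - bm;
	int16_t c = pm0 - bm;	/* candidates for second survivor */
	int16_t d = pm1 + bm;

	*new_pm0 = a > b ? a : b;
	*dec0 = (a >= b) ? -1 : 0;	/* all-ones mask when first path wins */
	*new_pm1 = c > d ? c : d;
	*dec1 = (c >= d) ? -1 : 0;
}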

/* Two lane deinterleaving K = 5:
 * Take 16 interleaved 16-bit integers and deinterleave to 2 packed 128-bit
 * registers. The operation is summarized below. Four registers are used
 * with the lower 2 as input and upper 2 as output.
 *
 * In - 10101010 10101010 10101010 10101010
 * Out - 00000000 11111111 00000000 11111111
 *
 * Input:
 * M0:1 - Packed 16-bit integers
 *
 * Output:
 * M2:3 - Deinterleaved packed 16-bit integers
 */
#define _I8_SHUFFLE_MASK 15, 14, 11, 10, 7, 6, 3, 2, 13, 12, 9, 8, 5, 4, 1, 0

#define SSE_DEINTERLEAVE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M2); \
	M1 = _mm_shuffle_epi8(M1, M2); \
	M2 = _mm_unpacklo_epi64(M0, M1); \
	M3 = _mm_unpackhi_epi64(M0, M1); \
}
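
/* Illustrative sketch, not part of libosmocore: reading the shuffle mask
 * element-wise, the net effect of SSE_DEINTERLEAVE_K5 appears to be a
 * plain even/odd split of the 16 interleaved metrics, with even indices
 * collected into the first output register and odd indices into the
 * second. */
static void scalar_deinterleave_example(const int16_t in[16],
					int16_t even[8], int16_t odd[8])
{
	int i;

	for (i = 0; i < 8; i++) {
		even[i] = in[2 * i];		/* goes to first output register  */
		odd[i] = in[2 * i + 1];		/* goes to second output register */
	}
}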

/* Two lane deinterleaving K = 7:
 * Take 64 interleaved 16-bit integers and deinterleave to 8 packed 128-bit
 * registers. The operation is summarized below. 16 registers are used with
 * the lower 8 as input and upper 8 as output.
 *
 * In - 10101010 10101010 10101010 10101010 ...
 * Out - 00000000 11111111 00000000 11111111 ...
 *
 * Input:
 * M0:7 - Packed 16-bit integers
 *
 * Output:
 * M8:15 - Deinterleaved packed 16-bit integers
 */
#define SSE_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, \
			    M8, M9, M10, M11, M12, M13, M14, M15) \
{ \
	M8 = _mm_set_epi8(_I8_SHUFFLE_MASK); \
	M0 = _mm_shuffle_epi8(M0, M8); \
	M1 = _mm_shuffle_epi8(M1, M8); \
	M2 = _mm_shuffle_epi8(M2, M8); \
	M3 = _mm_shuffle_epi8(M3, M8); \
	M4 = _mm_shuffle_epi8(M4, M8); \
	M5 = _mm_shuffle_epi8(M5, M8); \
	M6 = _mm_shuffle_epi8(M6, M8); \
	M7 = _mm_shuffle_epi8(M7, M8); \
	M8 = _mm_unpacklo_epi64(M0, M1); \
	M9 = _mm_unpackhi_epi64(M0, M1); \
	M10 = _mm_unpacklo_epi64(M2, M3); \
	M11 = _mm_unpackhi_epi64(M2, M3); \
	M12 = _mm_unpacklo_epi64(M4, M5); \
	M13 = _mm_unpackhi_epi64(M4, M5); \
	M14 = _mm_unpacklo_epi64(M6, M7); \
	M15 = _mm_unpackhi_epi64(M6, M7); \
}

/* Generate branch metrics N = 2:
 * Compute 16 branch metrics from trellis outputs and input values.
 *
 * Input:
 * M0:3 - 16 x 2 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M6:7 - 16 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N2(M0, M1, M2, M3, M4, M6, M7) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M6 = _mm_hadds_epi16(M0, M1); \
	M7 = _mm_hadds_epi16(M2, M3); \
}
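
/* Illustrative sketch, not part of libosmocore: one N = 2 branch metric is
 * the correlation of the received soft values with the expected trellis
 * output signs, which is what the _mm_sign_epi16/_mm_hadds_epi16 pair
 * computes lane-wise. The expected outputs are assumed to be +1/-1 here
 * and saturation is omitted. */
static int16_t scalar_branch_metric_n2_example(const int16_t val[2],
					       const int16_t out[2])
{
	return (out[0] < 0 ? -val[0] : val[0]) +
	       (out[1] < 0 ? -val[1] : val[1]);
}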

/* Generate branch metrics N = 4:
 * Compute 8 branch metrics from trellis outputs and input values. This
 * macro is reused for N less than 4, where the extra soft input values are
 * zero padded.
 *
 * Input:
 * M0:3 - 8 x 4 packed 16-bit trellis outputs
 * M4 - Expanded and packed 16-bit input value
 *
 * Output:
 * M5 - 8 computed 16-bit branch metrics
 */
#define SSE_BRANCH_METRIC_N4(M0, M1, M2, M3, M4, M5) \
{ \
	M0 = _mm_sign_epi16(M4, M0); \
	M1 = _mm_sign_epi16(M4, M1); \
	M2 = _mm_sign_epi16(M4, M2); \
	M3 = _mm_sign_epi16(M4, M3); \
	M0 = _mm_hadds_epi16(M0, M1); \
	M1 = _mm_hadds_epi16(M2, M3); \
	M5 = _mm_hadds_epi16(M0, M1); \
}
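
/* Illustrative sketch, not part of libosmocore: the zero padding mentioned
 * above for N < 4. For a rate 1/3 code, a hypothetical caller could widen
 * each step from three soft values to four so that the unused fourth
 * product drops out of the horizontal sums. */
static void pad_n3_to_n4_example(const int16_t *src, int16_t *dst, int nsteps)
{
	int i, j;

	for (i = 0; i < nsteps; i++) {
		for (j = 0; j < 3; j++)
			dst[4 * i + j] = src[3 * i + j];
		dst[4 * i + 3] = 0;	/* padded position contributes nothing */
	}
}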

/* Horizontal minimum
 * Compute horizontal minimum of packed unsigned 16-bit integers and place
 * result in the low 16-bit element of the source register. Only SSE 4.1
 * has a dedicated minpos instruction. One intermediate register is used
 * if SSE 4.1 is not available. This is a destructive operation and the
 * source register is overwritten.
 *
 * Input:
 * M0 - Packed unsigned 16-bit integers
 *
 * Output:
 * M0 - Minimum value placed in low 16-bit element
 */
#if defined(HAVE_SSE4_1) || defined(HAVE_SSE41)
#define SSE_MINPOS(M0, M1) \
{ \
	if (sse41_supported) { \
		M0 = _mm_minpos_epu16(M0); \
	} else { \
		M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
		M0 = _mm_min_epi16(M0, M1); \
		M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
		M0 = _mm_min_epi16(M0, M1); \
	} \
}
#else
#define SSE_MINPOS(M0, M1) \
{ \
	M1 = _mm_shuffle_epi32(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 3, 2)); \
	M0 = _mm_min_epi16(M0, M1); \
	M1 = _mm_shufflelo_epi16(M0, _MM_SHUFFLE(0, 0, 0, 1)); \
	M0 = _mm_min_epi16(M0, M1); \
}
#endif
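
/* Illustrative sketch, not part of libosmocore: the scalar equivalent of
 * the horizontal minimum over 8 packed 16-bit values. Note that the
 * SSE 4.1 path uses an unsigned minimum while the fallback uses a signed
 * one; presumably this is safe because normalized path metrics stay
 * non-negative. */
static int16_t scalar_hmin8_example(const int16_t v[8])
{
	int16_t min = v[0];
	int i;

	for (i = 1; i < 8; i++) {
		if (v[i] < min)
			min = v[i];
	}

	return min;
}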

/* Normalize state metrics K = 5:
 * Compute 16-wide normalization by subtracting the smallest value from
 * all values. Inputs are 16 packed 16-bit integers across 2 XMM registers.
 * Two intermediate registers are used and normalized results are placed
 * in the originating locations.
 *
 * Input:
 * M0:1 - Path metrics 0:1 (packed 16-bit integers)
 *
 * Output:
 * M0:1 - Normalized path metrics 0:1
 */
#define SSE_NORMALIZE_K5(M0, M1, M2, M3) \
{ \
	M2 = _mm_min_epi16(M0, M1); \
	SSE_MINPOS(M2, M3) \
	SSE_BROADCAST(M2) \
	M0 = _mm_subs_epi16(M0, M2); \
	M1 = _mm_subs_epi16(M1, M2); \
}

/* Normalize state metrics K = 7:
 * Compute 64-wide normalization by subtracting the smallest value from
 * all values. Inputs are 8 registers of accumulated sums and 4 temporary
 * registers. Normalized results are returned in the originating locations.
 *
 * Input:
 * M0:7 - Path metrics 0:7 (packed 16-bit integers)
 *
 * Output:
 * M0:7 - Normalized path metrics 0:7
 */
#define SSE_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) \
{ \
	M8 = _mm_min_epi16(M0, M1); \
	M9 = _mm_min_epi16(M2, M3); \
	M10 = _mm_min_epi16(M4, M5); \
	M11 = _mm_min_epi16(M6, M7); \
	M8 = _mm_min_epi16(M8, M9); \
	M10 = _mm_min_epi16(M10, M11); \
	M8 = _mm_min_epi16(M8, M10); \
	SSE_MINPOS(M8, M9) \
	SSE_BROADCAST(M8) \
	M0 = _mm_subs_epi16(M0, M8); \
	M1 = _mm_subs_epi16(M1, M8); \
	M2 = _mm_subs_epi16(M2, M8); \
	M3 = _mm_subs_epi16(M3, M8); \
	M4 = _mm_subs_epi16(M4, M8); \
	M5 = _mm_subs_epi16(M5, M8); \
	M6 = _mm_subs_epi16(M6, M8); \
	M7 = _mm_subs_epi16(M7, M8); \
}
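
/* Illustrative sketch, not part of libosmocore: both normalization macros
 * implement the same idea at different widths: subtract the smallest
 * metric from every metric so the running 16-bit sums stay in range over
 * long input sequences. */
static void scalar_normalize_example(int16_t *metrics, int num)
{
	int16_t min = metrics[0];
	int i;

	for (i = 1; i < num; i++) {
		if (metrics[i] < min)
			min = metrics[i];
	}

	for (i = 0; i < num; i++)
		metrics[i] -= min;
}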

/* Combined BMU/PMU (K=5, N=2)
 * Compute branch metrics followed by path metrics for the half rate
 * 16-state trellis. 8 butterflies are computed. Accumulated path sums are
 * not preserved; they are read from and written back to the same memory
 * location. Normalize sums if required.
 */
__always_inline static void _sse_metrics_k5_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m2 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);

	/* (BMU) Compute branch metrics */
	m0 = _mm_sign_epi16(m2, m0);
	m1 = _mm_sign_epi16(m2, m1);
	m2 = _mm_hadds_epi16(m0, m1);

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}
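
/* Illustrative sketch, not part of libosmocore: a hypothetical single-step
 * caller for the K = 5, N = 2 case. Buffer sizes follow the loads and
 * stores above (16 path metrics, 16 decisions, 16 trellis outputs); the
 * 16-byte alignment is needed because _mm_load_si128()/_mm_store_si128()
 * require aligned pointers, and 'out' must be 16-byte aligned as well
 * since it is loaded with _mm_load_si128(). */
static void example_k5_n2_step(const int16_t *val, const int16_t *out)
{
	int16_t sums[16] __attribute__((aligned(16)));
	int16_t paths[16] __attribute__((aligned(16)));
	int i;

	for (i = 0; i < 16; i++)
		sums[i] = 0;	/* illustrative starting metrics only */

	_sse_metrics_k5_n2(val, out, sums, paths, 1);
}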

/* Combined BMU/PMU (K=5, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 16-state trellis
 * at rates up to 1/4. 8 butterflies are computed. The input sequence is
 * read four 16-bit values at a time, and extra values should be set to
 * zero for rates other than 1/4. Normally only rates 1/3 and 1/4 are used,
 * as there is a dedicated implementation for rate 1/2.
 */
__always_inline static void _sse_metrics_k5_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6;

	/* (BMU) Load input sequence */
	m4 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);

	SSE_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	SSE_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		SSE_NORMALIZE_K5(m2, m6, m0, m1)

	_mm_store_si128((__m128i *) &sums[0], m2);
	_mm_store_si128((__m128i *) &sums[8], m6);
	_mm_store_si128((__m128i *) &paths[0], m5);
	_mm_store_si128((__m128i *) &paths[8], m4);
}

/* Combined BMU/PMU (K=7, N=2)
 * Compute branch metrics followed by path metrics for the half rate
 * 64-state trellis. 32 butterfly operations are computed. Deinterleaving
 * the path metrics requires the full SSE register file, so the accumulated
 * sums are deinterleaved before the branch metrics are computed to avoid
 * register spilling.
 */
__always_inline static void _sse_metrics_k7_n2(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7, m8,
		m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave to even-odd registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load trellis outputs */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}
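
/* Illustrative sketch, not part of libosmocore: hypothetical buffer sizing
 * for one K = 7, N = 2 step, matching the accesses above. The 64-state
 * trellis reads and writes 64 packed 16-bit path metrics and 64 decisions
 * per step, and reads 64 precomputed trellis outputs, all 16-byte
 * aligned. */
struct example_k7_n2_buffers {
	int16_t sums[64] __attribute__((aligned(16)));	/* sums[0..63]  */
	int16_t out[64] __attribute__((aligned(16)));	/* out[0..63]   */
	int16_t paths[64] __attribute__((aligned(16)));	/* paths[0..63] */
};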

/* Combined BMU/PMU (K=7, N=3 and N=4)
 * Compute branch metrics followed by path metrics for the 64-state trellis
 * at rates up to 1/4. 32 butterfly operations are computed. Deinterleave
 * the path metrics before computing branch metrics, as in the half rate
 * case.
 */
__always_inline static void _sse_metrics_k7_n4(const int16_t *val,
	const int16_t *out, int16_t *sums, int16_t *paths, int norm)
{
	__m128i m0, m1, m2, m3, m4, m5, m6, m7;
	__m128i m8, m9, m10, m11, m12, m13, m14, m15;

	/* (PMU) Load accumulated path metrics */
	m0 = _mm_load_si128((__m128i *) &sums[0]);
	m1 = _mm_load_si128((__m128i *) &sums[8]);
	m2 = _mm_load_si128((__m128i *) &sums[16]);
	m3 = _mm_load_si128((__m128i *) &sums[24]);
	m4 = _mm_load_si128((__m128i *) &sums[32]);
	m5 = _mm_load_si128((__m128i *) &sums[40]);
	m6 = _mm_load_si128((__m128i *) &sums[48]);
	m7 = _mm_load_si128((__m128i *) &sums[56]);

	/* (PMU) Deinterleave into even and odd packed registers */
	SSE_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7,
			    m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load input symbols */
	m7 = _mm_castpd_si128(_mm_loaddup_pd((double const *) val));

	/* (BMU) Load and compute branch metrics */
	m0 = _mm_load_si128((__m128i *) &out[0]);
	m1 = _mm_load_si128((__m128i *) &out[8]);
	m2 = _mm_load_si128((__m128i *) &out[16]);
	m3 = _mm_load_si128((__m128i *) &out[24]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = _mm_load_si128((__m128i *) &out[32]);
	m1 = _mm_load_si128((__m128i *) &out[40]);
	m2 = _mm_load_si128((__m128i *) &out[48]);
	m3 = _mm_load_si128((__m128i *) &out[56]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = _mm_load_si128((__m128i *) &out[64]);
	m1 = _mm_load_si128((__m128i *) &out[72]);
	m2 = _mm_load_si128((__m128i *) &out[80]);
	m3 = _mm_load_si128((__m128i *) &out[88]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = _mm_load_si128((__m128i *) &out[96]);
	m1 = _mm_load_si128((__m128i *) &out[104]);
	m2 = _mm_load_si128((__m128i *) &out[112]);
	m3 = _mm_load_si128((__m128i *) &out[120]);

	SSE_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	SSE_BUTTERFLY(m8, m9, m4, m0, m1)
	SSE_BUTTERFLY(m10, m11, m5, m2, m3)

	_mm_store_si128((__m128i *) &paths[0], m0);
	_mm_store_si128((__m128i *) &paths[8], m2);
	_mm_store_si128((__m128i *) &paths[32], m9);
	_mm_store_si128((__m128i *) &paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	SSE_BUTTERFLY(m12, m13, m6, m0, m2)
	SSE_BUTTERFLY(m14, m15, m7, m9, m11)

	_mm_store_si128((__m128i *) &paths[16], m0);
	_mm_store_si128((__m128i *) &paths[24], m9);
	_mm_store_si128((__m128i *) &paths[48], m13);
	_mm_store_si128((__m128i *) &paths[56], m15);

	if (norm)
		SSE_NORMALIZE_K7(m4, m1, m5, m3, m6, m2,
				 m7, m11, m0, m8, m9, m10)

	_mm_store_si128((__m128i *) &sums[0], m4);
	_mm_store_si128((__m128i *) &sums[8], m5);
	_mm_store_si128((__m128i *) &sums[16], m6);
	_mm_store_si128((__m128i *) &sums[24], m7);
	_mm_store_si128((__m128i *) &sums[32], m1);
	_mm_store_si128((__m128i *) &sums[40], m3);
	_mm_store_si128((__m128i *) &sums[48], m2);
	_mm_store_si128((__m128i *) &sums[56], m11);
}
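
/* Illustrative sketch, not part of libosmocore: a hypothetical forward
 * pass over 'nsteps' symbols for the K = 7, rate 1/4 case, showing how the
 * step primitive composes. It assumes a time-invariant trellis (one shared
 * 'out' table of 128 entries), four soft values consumed per step and 64
 * decisions recorded per step; a real decoder might normalize less often
 * than every step. */
static void example_forward_pass_k7_n4(const int16_t *val, const int16_t *out,
				       int16_t *sums, int16_t *paths,
				       int nsteps)
{
	int i;

	for (i = 0; i < nsteps; i++)
		_sse_metrics_k7_n4(&val[4 * i], out, sums, &paths[64 * i], 1);
}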