libosmocore: conv_acc_neon_impl.h
/*
 * (C) 2020 by sysmocom - s.f.m.c. GmbH
 * Author: Eric Wild
 *
 * All Rights Reserved
 *
 * SPDX-License-Identifier: GPL-2.0+
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

/* Some distributions (notably Alpine Linux) for some strange reason
 * don't have this #define */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

#define NEON_BUTTERFLY(M0,M1,M2,M3,M4) \
{ \
        M3 = vqaddq_s16(M0, M2); \
        M4 = vqsubq_s16(M1, M2); \
        M0 = vqsubq_s16(M0, M2); \
        M1 = vqaddq_s16(M1, M2); \
        M2 = vmaxq_s16(M3, M4); \
        M3 = vreinterpretq_s16_u16(vcgtq_s16(M3, M4)); \
        M4 = vmaxq_s16(M0, M1); \
        M1 = vreinterpretq_s16_u16(vcgtq_s16(M0, M1)); \
}
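
/*
 * The butterfly above is the Viterbi add-compare-select (ACS) step for one
 * pair of predecessor states per lane.  A rough scalar equivalent of a
 * single lane (illustrative sketch only, names are not part of this file):
 *
 *      sum0 = even + bm;  sum1 = odd - bm;
 *      new0 = max(sum0, sum1);  dec0 = (sum0 > sum1) ? 0xffff : 0;
 *      sum2 = even - bm;  sum3 = odd + bm;
 *      new1 = max(sum2, sum3);  dec1 = (sum2 > sum3) ? 0xffff : 0;
 *
 * The vector form does this for eight lanes at once; the saturating
 * vqaddq/vqsubq variants keep accumulated metrics from wrapping around.
 */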

#define NEON_DEINTERLEAVE_K5(M0,M1,M2,M3) \
{ \
        int16x8x2_t tmp; \
        tmp = vuzpq_s16(M0, M1); \
        M2 = tmp.val[0]; \
        M3 = tmp.val[1]; \
}

#define NEON_DEINTERLEAVE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11,M12,M13,M14,M15) \
{ \
        int16x8x2_t tmp; \
        tmp = vuzpq_s16(M0, M1); \
        M8 = tmp.val[0]; M9 = tmp.val[1]; \
        tmp = vuzpq_s16(M2, M3); \
        M10 = tmp.val[0]; M11 = tmp.val[1]; \
        tmp = vuzpq_s16(M4, M5); \
        M12 = tmp.val[0]; M13 = tmp.val[1]; \
        tmp = vuzpq_s16(M6, M7); \
        M14 = tmp.val[0]; M15 = tmp.val[1]; \
}
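
/*
 * vuzpq_s16 splits two interleaved vectors into their even and odd lanes,
 * e.g. (illustrative values, not part of this file):
 *
 *      in0 = { s0, s1, s2,  s3,  s4,  s5,  s6,  s7  }
 *      in1 = { s8, s9, s10, s11, s12, s13, s14, s15 }
 *      vuzpq_s16(in0, in1).val[0] = { s0, s2, s4, s6, s8, s10, s12, s14 }
 *      vuzpq_s16(in0, in1).val[1] = { s1, s3, s5, s7, s9, s11, s13, s15 }
 *
 * so the accumulated path metrics end up regrouped into even-numbered and
 * odd-numbered trellis states, which is the pairing the butterflies expect.
 */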

#define NEON_BRANCH_METRIC_N2(M0,M1,M2,M3,M4,M6,M7) \
{ \
        M0 = vmulq_s16(M4, M0); \
        M1 = vmulq_s16(M4, M1); \
        M2 = vmulq_s16(M4, M2); \
        M3 = vmulq_s16(M4, M3); \
        M6 = vcombine_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
        M7 = vcombine_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
}

#define NEON_BRANCH_METRIC_N4(M0,M1,M2,M3,M4,M5) \
{ \
        M0 = vmulq_s16(M4, M0); \
        M1 = vmulq_s16(M4, M1); \
        M2 = vmulq_s16(M4, M2); \
        M3 = vmulq_s16(M4, M3); \
        int16x4_t t1 = vpadd_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
        int16x4_t t2 = vpadd_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
        M5 = vcombine_s16(t1, t2); \
}
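
/*
 * Both branch metric macros correlate the received soft symbols with the
 * expected encoder output of each transition, i.e. roughly (scalar sketch,
 * not part of this file):
 *
 *      bm[i] = sum over the N output symbols b of in[b] * expected[i][b]
 *
 * vmulq_s16 forms the products and the vpadd_s16 calls perform the
 * horizontal sums: one pairwise-add level for N=2 (two symbols per metric),
 * two levels for N=4 (four symbols per metric).
 */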

#define NEON_NORMALIZE_K5(M0,M1,M2,M3) \
{ \
        M2 = vminq_s16(M0, M1); \
        int16x4_t t = vpmin_s16(vget_low_s16(M2), vget_high_s16(M2)); \
        t = vpmin_s16(t, t); \
        t = vpmin_s16(t, t); \
        M2 = vdupq_lane_s16(t, 0); \
        M0 = vqsubq_s16(M0, M2); \
        M1 = vqsubq_s16(M1, M2); \
}

#define NEON_NORMALIZE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11) \
{ \
        M8 = vminq_s16(M0, M1); \
        M9 = vminq_s16(M2, M3); \
        M10 = vminq_s16(M4, M5); \
        M11 = vminq_s16(M6, M7); \
        M8 = vminq_s16(M8, M9); \
        M10 = vminq_s16(M10, M11); \
        M8 = vminq_s16(M8, M10); \
        int16x4_t t = vpmin_s16(vget_low_s16(M8), vget_high_s16(M8)); \
        t = vpmin_s16(t, t); \
        t = vpmin_s16(t, t); \
        M8 = vdupq_lane_s16(t, 0); \
        M0 = vqsubq_s16(M0, M8); \
        M1 = vqsubq_s16(M1, M8); \
        M2 = vqsubq_s16(M2, M8); \
        M3 = vqsubq_s16(M3, M8); \
        M4 = vqsubq_s16(M4, M8); \
        M5 = vqsubq_s16(M5, M8); \
        M6 = vqsubq_s16(M6, M8); \
        M7 = vqsubq_s16(M7, M8); \
}
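
/*
 * Normalization keeps the accumulated path metrics inside the 16-bit range:
 * only differences between metrics matter for the survivor decisions, so the
 * common minimum can be subtracted from every state without changing the
 * decoded result.  The vpmin_s16 tree reduces eight lanes to the overall
 * minimum in three steps, e.g. with illustrative values:
 *
 *      { 9, 4, 7, 2, 8, 5, 6, 3 } -> { 4, 2, 5, 3 } -> { 2, 3, 2, 3 } -> { 2, 2, 2, 2 }
 *
 * which vdupq_lane_s16 then broadcasts before the saturating subtractions.
 */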

__always_inline void _neon_metrics_k5_n2(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
                                         int norm)
{
        int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
        int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
        int16x8_t m0, m1, m2, m3, m4, m5, m6;
        int16x4_t input;

        /* (BMU) Load the input symbols and replicate them across the vector */
        input = vld1_s16(val);
        m2 = vcombine_s16(input, input);

        /* (BMU) Load and compute branch metrics */
        m0 = vld1q_s16(&out[0]);
        m1 = vld1q_s16(&out[8]);

        m0 = vmulq_s16(m2, m0);
        m1 = vmulq_s16(m2, m1);
        m2 = vcombine_s16(vpadd_s16(vget_low_s16(m0), vget_high_s16(m0)),
                          vpadd_s16(vget_low_s16(m1), vget_high_s16(m1)));

        /* (PMU) Load accumulated path metrics */
        m0 = vld1q_s16(&sums[0]);
        m1 = vld1q_s16(&sums[8]);

        NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        NEON_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                NEON_NORMALIZE_K5(m2, m6, m0, m1)

        vst1q_s16(&sums[0], m2);
        vst1q_s16(&sums[8], m6);
        vst1q_s16(&paths[0], m5);
        vst1q_s16(&paths[8], m4);
}
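
/*
 * The K=5 kernels operate on 2^(5-1) = 16 trellis states held in two
 * int16x8_t vectors.  From the loads and stores above, a caller would
 * provide buffers shaped roughly like this (sizes inferred, names
 * hypothetical):
 *
 *      int16_t sums[16];        // accumulated path metrics, updated in place
 *      int16_t paths[16 * LEN]; // 16 survivor decisions stored per interval
 *      int16_t out[8 * 2];      // expected outputs: 8 branch metrics x 2 symbols (n2)
 *
 * The n4 variant below reads an 8 x 4 expected-output table (out[0..31])
 * instead.
 */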

__always_inline void _neon_metrics_k5_n4(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
                                         int norm)
{
        int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
        int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
        int16x8_t m0, m1, m2, m3, m4, m5, m6;
        int16x4_t input;

        /* (BMU) Load the input symbols and replicate them across the vector */
        input = vld1_s16(val);
        m4 = vcombine_s16(input, input);

        /* (BMU) Load and compute branch metrics */
        m0 = vld1q_s16(&out[0]);
        m1 = vld1q_s16(&out[8]);
        m2 = vld1q_s16(&out[16]);
        m3 = vld1q_s16(&out[24]);

        NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

        /* (PMU) Load accumulated path metrics */
        m0 = vld1q_s16(&sums[0]);
        m1 = vld1q_s16(&sums[8]);

        NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)

        /* (PMU) Butterflies: 0-7 */
        NEON_BUTTERFLY(m3, m4, m2, m5, m6)

        if (norm)
                NEON_NORMALIZE_K5(m2, m6, m0, m1)

        vst1q_s16(&sums[0], m2);
        vst1q_s16(&sums[8], m6);
        vst1q_s16(&paths[0], m5);
        vst1q_s16(&paths[8], m4);
}

__always_inline static void _neon_metrics_k7_n2(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
                                                int norm)
{
        int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
        int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
        int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
        int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
        int16x4_t input;

        /* (PMU) Load accumulated path metrics */
        m0 = vld1q_s16(&sums[0]);
        m1 = vld1q_s16(&sums[8]);
        m2 = vld1q_s16(&sums[16]);
        m3 = vld1q_s16(&sums[24]);
        m4 = vld1q_s16(&sums[32]);
        m5 = vld1q_s16(&sums[40]);
        m6 = vld1q_s16(&sums[48]);
        m7 = vld1q_s16(&sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load the input symbols and replicate them across the vector */
        input = vld1_s16(val);
        m7 = vcombine_s16(input, input);

        /* (BMU) Load and compute branch metrics */
        m0 = vld1q_s16(&out[0]);
        m1 = vld1q_s16(&out[8]);
        m2 = vld1q_s16(&out[16]);
        m3 = vld1q_s16(&out[24]);

        NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

        m0 = vld1q_s16(&out[32]);
        m1 = vld1q_s16(&out[40]);
        m2 = vld1q_s16(&out[48]);
        m3 = vld1q_s16(&out[56]);

        NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

        /* (PMU) Butterflies: 0-15 */
        NEON_BUTTERFLY(m8, m9, m4, m0, m1)
        NEON_BUTTERFLY(m10, m11, m5, m2, m3)

        vst1q_s16(&paths[0], m0);
        vst1q_s16(&paths[8], m2);
        vst1q_s16(&paths[32], m9);
        vst1q_s16(&paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        NEON_BUTTERFLY(m12, m13, m6, m0, m2)
        NEON_BUTTERFLY(m14, m15, m7, m9, m11)

        vst1q_s16(&paths[16], m0);
        vst1q_s16(&paths[24], m9);
        vst1q_s16(&paths[48], m13);
        vst1q_s16(&paths[56], m15);

        if (norm)
                NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

        vst1q_s16(&sums[0], m4);
        vst1q_s16(&sums[8], m5);
        vst1q_s16(&sums[16], m6);
        vst1q_s16(&sums[24], m7);
        vst1q_s16(&sums[32], m1);
        vst1q_s16(&sums[40], m3);
        vst1q_s16(&sums[48], m2);
        vst1q_s16(&sums[56], m11);
}
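
/*
 * The K=7 kernels scale the same scheme up to 2^(7-1) = 64 states: eight
 * int16x8_t vectors of path metrics, four butterfly invocations (32
 * butterflies) and 64 survivor decisions per interval.  The rate-1/2 variant
 * above reads out[0..63] (32 branch metrics x 2 symbols); the rate-1/4
 * variant below reads out[0..127].
 */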

__always_inline static void _neon_metrics_k7_n4(const int16_t *val, const int16_t *outa, int16_t *sumsa, int16_t *paths,
                                                int norm)
{
        int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
        int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
        int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
        int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
        int16x4_t input;

        /* (PMU) Load accumulated path metrics */
        m0 = vld1q_s16(&sums[0]);
        m1 = vld1q_s16(&sums[8]);
        m2 = vld1q_s16(&sums[16]);
        m3 = vld1q_s16(&sums[24]);
        m4 = vld1q_s16(&sums[32]);
        m5 = vld1q_s16(&sums[40]);
        m6 = vld1q_s16(&sums[48]);
        m7 = vld1q_s16(&sums[56]);

        /* (PMU) Deinterleave into even and odd packed registers */
        NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

        /* (BMU) Load the input symbols and replicate them across the vector */
        input = vld1_s16(val);
        m7 = vcombine_s16(input, input);

        /* (BMU) Load and compute branch metrics */
        m0 = vld1q_s16(&out[0]);
        m1 = vld1q_s16(&out[8]);
        m2 = vld1q_s16(&out[16]);
        m3 = vld1q_s16(&out[24]);

        NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

        m0 = vld1q_s16(&out[32]);
        m1 = vld1q_s16(&out[40]);
        m2 = vld1q_s16(&out[48]);
        m3 = vld1q_s16(&out[56]);

        NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

        m0 = vld1q_s16(&out[64]);
        m1 = vld1q_s16(&out[72]);
        m2 = vld1q_s16(&out[80]);
        m3 = vld1q_s16(&out[88]);

        NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

        m0 = vld1q_s16(&out[96]);
        m1 = vld1q_s16(&out[104]);
        m2 = vld1q_s16(&out[112]);
        m3 = vld1q_s16(&out[120]);

        NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

        /* (PMU) Butterflies: 0-15 */
        NEON_BUTTERFLY(m8, m9, m4, m0, m1)
        NEON_BUTTERFLY(m10, m11, m5, m2, m3)

        vst1q_s16(&paths[0], m0);
        vst1q_s16(&paths[8], m2);
        vst1q_s16(&paths[32], m9);
        vst1q_s16(&paths[40], m11);

        /* (PMU) Butterflies: 16-31 */
        NEON_BUTTERFLY(m12, m13, m6, m0, m2)
        NEON_BUTTERFLY(m14, m15, m7, m9, m11)

        vst1q_s16(&paths[16], m0);
        vst1q_s16(&paths[24], m9);
        vst1q_s16(&paths[48], m13);
        vst1q_s16(&paths[56], m15);

        if (norm)
                NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

        vst1q_s16(&sums[0], m4);
        vst1q_s16(&sums[8], m5);
        vst1q_s16(&sums[16], m6);
        vst1q_s16(&sums[24], m7);
        vst1q_s16(&sums[32], m1);
        vst1q_s16(&sums[40], m3);
        vst1q_s16(&sums[48], m2);
        vst1q_s16(&sums[56], m11);
}
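
/*
 * Usage sketch: these kernels appear intended to be called once per received
 * symbol interval by a trellis forward pass, with a traceback over the
 * stored path decisions afterwards.  A minimal, hypothetical caller for the
 * K=5, rate-1/2 case (names and strides are illustrative only, not the
 * actual libosmocore driver):
 *
 *      static void forward_pass_k5_n2(const int16_t *in, const int16_t *out_table,
 *                                     int16_t *sums, int16_t *paths, int len)
 *      {
 *              int i;
 *
 *              for (i = 0; i < len; i++)
 *                      _neon_metrics_k5_n2(&in[2 * i], out_table, sums,
 *                                          &paths[16 * i], !(i % 32));
 *      }
 *
 * Here normalization is requested every 32 intervals; a real caller would
 * choose its own normalization policy.
 */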