/*
 * NEON accelerated Viterbi decoder kernels: branch metric computation and
 * add-compare-select (ACS) path metric updates for K=5 and K=7 codes.
 * This header is meant to be included from a translation unit that already
 * includes <arm_neon.h> and <stdint.h>.
 */

#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif

/* Add-Compare-Select butterfly: from two path metric vectors (M0, M1) and a
 * branch metric vector (M2), compute the surviving metrics (left in M2 and
 * M4) and the per-lane path decisions as compare masks (M3 and M1). */
#define NEON_BUTTERFLY(M0,M1,M2,M3,M4) \
{ \
	M3 = vqaddq_s16(M0, M2); \
	M4 = vqsubq_s16(M1, M2); \
	M0 = vqsubq_s16(M0, M2); \
	M1 = vqaddq_s16(M1, M2); \
	M2 = vmaxq_s16(M3, M4); \
	M3 = vreinterpretq_s16_u16(vcgtq_s16(M3, M4)); \
	M4 = vmaxq_s16(M0, M1); \
	M1 = vreinterpretq_s16_u16(vcgtq_s16(M0, M1)); \
}

/* Deinterleave K=5 path metrics (M0, M1) into even (M2) and odd (M3) states */
#define NEON_DEINTERLEAVE_K5(M0,M1,M2,M3) \
{ \
	int16x8x2_t tmp; \
	tmp = vuzpq_s16(M0, M1); \
	M2 = tmp.val[0]; \
	M3 = tmp.val[1]; \
}

/* Deinterleave K=7 path metrics (M0-M7) into even/odd state registers (M8-M15) */
#define NEON_DEINTERLEAVE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11,M12,M13,M14,M15) \
{ \
	int16x8x2_t tmp; \
	tmp = vuzpq_s16(M0, M1); \
	M8 = tmp.val[0]; M9 = tmp.val[1]; \
	tmp = vuzpq_s16(M2, M3); \
	M10 = tmp.val[0]; M11 = tmp.val[1]; \
	tmp = vuzpq_s16(M4, M5); \
	M12 = tmp.val[0]; M13 = tmp.val[1]; \
	tmp = vuzpq_s16(M6, M7); \
	M14 = tmp.val[0]; M15 = tmp.val[1]; \
}

/* Branch metrics for rate 1/2 codes: correlate the duplicated input (M4)
 * with the expected outputs (M0-M3) and pairwise-add into M6 and M7 */
#define NEON_BRANCH_METRIC_N2(M0,M1,M2,M3,M4,M6,M7) \
{ \
	M0 = vmulq_s16(M4, M0); \
	M1 = vmulq_s16(M4, M1); \
	M2 = vmulq_s16(M4, M2); \
	M3 = vmulq_s16(M4, M3); \
	M6 = vcombine_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), \
			  vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
	M7 = vcombine_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), \
			  vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
}

/* Branch metrics for rate 1/4 codes: correlate the duplicated input (M4)
 * with the expected outputs (M0-M3) and reduce each 4-symbol group into M5 */
#define NEON_BRANCH_METRIC_N4(M0,M1,M2,M3,M4,M5) \
{ \
	M0 = vmulq_s16(M4, M0); \
	M1 = vmulq_s16(M4, M1); \
	M2 = vmulq_s16(M4, M2); \
	M3 = vmulq_s16(M4, M3); \
	int16x4_t t1 = vpadd_s16(vpadd_s16(vget_low_s16(M0), vget_high_s16(M0)), \
				 vpadd_s16(vget_low_s16(M1), vget_high_s16(M1))); \
	int16x4_t t2 = vpadd_s16(vpadd_s16(vget_low_s16(M2), vget_high_s16(M2)), \
				 vpadd_s16(vget_low_s16(M3), vget_high_s16(M3))); \
	M5 = vcombine_s16(t1, t2); \
}

/* Normalize K=5 path metrics (M0, M1) by subtracting the minimum metric;
 * M2 and M3 are scratch registers */
#define NEON_NORMALIZE_K5(M0,M1,M2,M3) \
{ \
	M2 = vminq_s16(M0, M1); \
	int16x4_t t = vpmin_s16(vget_low_s16(M2), vget_high_s16(M2)); \
	t = vpmin_s16(t, t); \
	t = vpmin_s16(t, t); \
	M2 = vdupq_lane_s16(t, 0); \
	M0 = vqsubq_s16(M0, M2); \
	M1 = vqsubq_s16(M1, M2); \
}

/* Normalize K=7 path metrics (M0-M7) by subtracting the minimum metric;
 * M8-M11 are scratch registers */
#define NEON_NORMALIZE_K7(M0,M1,M2,M3,M4,M5,M6,M7,M8,M9,M10,M11) \
{ \
	M8 = vminq_s16(M0, M1); \
	M9 = vminq_s16(M2, M3); \
	M10 = vminq_s16(M4, M5); \
	M11 = vminq_s16(M6, M7); \
	M8 = vminq_s16(M8, M9); \
	M10 = vminq_s16(M10, M11); \
	M8 = vminq_s16(M8, M10); \
	int16x4_t t = vpmin_s16(vget_low_s16(M8), vget_high_s16(M8)); \
	t = vpmin_s16(t, t); \
	t = vpmin_s16(t, t); \
	M8 = vdupq_lane_s16(t, 0); \
	M0 = vqsubq_s16(M0, M8); \
	M1 = vqsubq_s16(M1, M8); \
	M2 = vqsubq_s16(M2, M8); \
	M3 = vqsubq_s16(M3, M8); \
	M4 = vqsubq_s16(M4, M8); \
	M5 = vqsubq_s16(M5, M8); \
	M6 = vqsubq_s16(M6, M8); \
	M7 = vqsubq_s16(M7, M8); \
}

/* Combined branch metric (BMU) and path metric (PMU) update for
 * K=5 (16 states), rate 1/2 codes */
__always_inline void _neon_metrics_k5_n2(const int16_t *val, const int16_t *outa,
					 int16_t *sumsa, int16_t *paths, int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6;
	int16x4_t input;

	/* (BMU) Load the input soft symbols and duplicate them across the vector */
	input = vld1_s16(val);
	m2 = vcombine_s16(input, input);

	/* (BMU) Load expected outputs and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);

	m0 = vmulq_s16(m2, m0);
	m1 = vmulq_s16(m2, m1);
	m2 = vcombine_s16(vpadd_s16(vget_low_s16(m0), vget_high_s16(m0)),
			  vpadd_s16(vget_low_s16(m1), vget_high_s16(m1)));

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);

	/* (PMU) Deinterleave into even and odd state registers */
	NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	NEON_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		NEON_NORMALIZE_K5(m2, m6, m0, m1)

	vst1q_s16(&sums[0], m2);
	vst1q_s16(&sums[8], m6);
	vst1q_s16(&paths[0], m5);
	vst1q_s16(&paths[8], m4);
}
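
/*
 * Illustrative sketch only (not part of the original file): one possible way
 * a caller could drive the K=5, rate 1/2 kernel above across a frame.  The
 * names "example_run_acs_k5_n2" and "num_steps", the always-on normalization,
 * and the flat layout of 16 decision words per trellis step are assumptions
 * made for this example; the real driver logic lives elsewhere (conv_acc.c).
 */
static __attribute__((unused)) void example_run_acs_k5_n2(const int16_t *val,
					const int16_t *out_lut, int16_t *sums,
					int16_t *paths, int num_steps)
{
	int i;

	for (i = 0; i < num_steps; i++) {
		/* Rate 1/2: two soft symbols consumed per step; the kernel
		 * writes 16 updated path metrics and 16 decision masks. */
		_neon_metrics_k5_n2(&val[2 * i], out_lut, sums,
				    &paths[16 * i], 1);
	}
}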

/* Combined BMU/PMU update for K=5 (16 states), rate 1/4 codes */
__always_inline void _neon_metrics_k5_n4(const int16_t *val, const int16_t *outa,
					 int16_t *sumsa, int16_t *paths, int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6;
	int16x4_t input;

	/* (BMU) Load the input soft symbols and duplicate them across the vector */
	input = vld1_s16(val);
	m4 = vcombine_s16(input, input);

	/* (BMU) Load expected outputs and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);
	m2 = vld1q_s16(&out[16]);
	m3 = vld1q_s16(&out[24]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m4, m2)

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);

	/* (PMU) Deinterleave into even and odd state registers */
	NEON_DEINTERLEAVE_K5(m0, m1, m3, m4)

	/* (PMU) Butterflies: 0-7 */
	NEON_BUTTERFLY(m3, m4, m2, m5, m6)

	if (norm)
		NEON_NORMALIZE_K5(m2, m6, m0, m1)

	vst1q_s16(&sums[0], m2);
	vst1q_s16(&sums[8], m6);
	vst1q_s16(&paths[0], m5);
	vst1q_s16(&paths[8], m4);
}
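
/*
 * Scalar reference, added here purely for illustration (not part of the
 * original file): what NEON_BRANCH_METRIC_N4 computes for one output lane.
 * "sym" points at 4 received soft symbols and "expected" at the 4 expected
 * encoder outputs for one branch; the 16-bit wraparound of the vector code
 * is ignored here.
 */
static inline int16_t example_branch_metric_n4(const int16_t *sym,
					       const int16_t *expected)
{
	/* Correlate received symbols with the expected branch output; the
	 * vector code does the same with vmulq_s16 plus pairwise adds. */
	return sym[0] * expected[0] + sym[1] * expected[1] +
	       sym[2] * expected[2] + sym[3] * expected[3];
}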

/* Combined BMU/PMU update for K=7 (64 states), rate 1/2 codes */
static __always_inline void _neon_metrics_k7_n2(const int16_t *val, const int16_t *outa,
						int16_t *sumsa, int16_t *paths, int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
	int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
	int16x4_t input;

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);
	m2 = vld1q_s16(&sums[16]);
	m3 = vld1q_s16(&sums[24]);
	m4 = vld1q_s16(&sums[32]);
	m5 = vld1q_s16(&sums[40]);
	m6 = vld1q_s16(&sums[48]);
	m7 = vld1q_s16(&sums[56]);

	/* (PMU) Deinterleave into even and odd state registers */
	NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load the input soft symbols and duplicate them across the vector */
	input = vld1_s16(val);
	m7 = vcombine_s16(input, input);

	/* (BMU) Load expected outputs and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);
	m2 = vld1q_s16(&out[16]);
	m3 = vld1q_s16(&out[24]);

	NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m4, m5)

	m0 = vld1q_s16(&out[32]);
	m1 = vld1q_s16(&out[40]);
	m2 = vld1q_s16(&out[48]);
	m3 = vld1q_s16(&out[56]);

	NEON_BRANCH_METRIC_N2(m0, m1, m2, m3, m7, m6, m7)

	/* (PMU) Butterflies: 0-15 */
	NEON_BUTTERFLY(m8, m9, m4, m0, m1)
	NEON_BUTTERFLY(m10, m11, m5, m2, m3)

	vst1q_s16(&paths[0], m0);
	vst1q_s16(&paths[8], m2);
	vst1q_s16(&paths[32], m9);
	vst1q_s16(&paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	NEON_BUTTERFLY(m12, m13, m6, m0, m2)
	NEON_BUTTERFLY(m14, m15, m7, m9, m11)

	vst1q_s16(&paths[16], m0);
	vst1q_s16(&paths[24], m9);
	vst1q_s16(&paths[48], m13);
	vst1q_s16(&paths[56], m15);

	if (norm)
		NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

	vst1q_s16(&sums[0], m4);
	vst1q_s16(&sums[8], m5);
	vst1q_s16(&sums[16], m6);
	vst1q_s16(&sums[24], m7);
	vst1q_s16(&sums[32], m1);
	vst1q_s16(&sums[40], m3);
	vst1q_s16(&sums[48], m2);
	vst1q_s16(&sums[56], m11);
}

/* Combined BMU/PMU update for K=7 (64 states), rate 1/4 codes */
static __always_inline void _neon_metrics_k7_n4(const int16_t *val, const int16_t *outa,
						int16_t *sumsa, int16_t *paths, int norm)
{
	int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
	int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
	int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
	int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
	int16x4_t input;

	/* (PMU) Load accumulated path metrics */
	m0 = vld1q_s16(&sums[0]);
	m1 = vld1q_s16(&sums[8]);
	m2 = vld1q_s16(&sums[16]);
	m3 = vld1q_s16(&sums[24]);
	m4 = vld1q_s16(&sums[32]);
	m5 = vld1q_s16(&sums[40]);
	m6 = vld1q_s16(&sums[48]);
	m7 = vld1q_s16(&sums[56]);

	/* (PMU) Deinterleave into even and odd state registers */
	NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

	/* (BMU) Load the input soft symbols and duplicate them across the vector */
	input = vld1_s16(val);
	m7 = vcombine_s16(input, input);

	/* (BMU) Load expected outputs and compute branch metrics */
	m0 = vld1q_s16(&out[0]);
	m1 = vld1q_s16(&out[8]);
	m2 = vld1q_s16(&out[16]);
	m3 = vld1q_s16(&out[24]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m4)

	m0 = vld1q_s16(&out[32]);
	m1 = vld1q_s16(&out[40]);
	m2 = vld1q_s16(&out[48]);
	m3 = vld1q_s16(&out[56]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m5)

	m0 = vld1q_s16(&out[64]);
	m1 = vld1q_s16(&out[72]);
	m2 = vld1q_s16(&out[80]);
	m3 = vld1q_s16(&out[88]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m6)

	m0 = vld1q_s16(&out[96]);
	m1 = vld1q_s16(&out[104]);
	m2 = vld1q_s16(&out[112]);
	m3 = vld1q_s16(&out[120]);

	NEON_BRANCH_METRIC_N4(m0, m1, m2, m3, m7, m7)

	/* (PMU) Butterflies: 0-15 */
	NEON_BUTTERFLY(m8, m9, m4, m0, m1)
	NEON_BUTTERFLY(m10, m11, m5, m2, m3)

	vst1q_s16(&paths[0], m0);
	vst1q_s16(&paths[8], m2);
	vst1q_s16(&paths[32], m9);
	vst1q_s16(&paths[40], m11);

	/* (PMU) Butterflies: 16-31 */
	NEON_BUTTERFLY(m12, m13, m6, m0, m2)
	NEON_BUTTERFLY(m14, m15, m7, m9, m11)

	vst1q_s16(&paths[16], m0);
	vst1q_s16(&paths[24], m9);
	vst1q_s16(&paths[48], m13);
	vst1q_s16(&paths[56], m15);

	if (norm)
		NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

	vst1q_s16(&sums[0], m4);
	vst1q_s16(&sums[8], m5);
	vst1q_s16(&sums[16], m6);
	vst1q_s16(&sums[24], m7);
	vst1q_s16(&sums[32], m1);
	vst1q_s16(&sums[40], m3);
	vst1q_s16(&sums[48], m2);
	vst1q_s16(&sums[56], m11);
}
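
/*
 * Scalar reference, added here purely for illustration (not part of the
 * original file): the add-compare-select step that NEON_BUTTERFLY performs
 * on every lane.  "s_even"/"s_odd" are the deinterleaved predecessor path
 * metrics and "bm" is the branch metric for the pair; unlike the vector
 * code, this sketch uses plain (non-saturating) arithmetic.
 */
static inline void example_acs_butterfly(int16_t s_even, int16_t s_odd, int16_t bm,
					 int16_t *sum0, int16_t *sum1,
					 int16_t *dec0, int16_t *dec1)
{
	int a = s_even + bm;		/* candidate taking the '+bm' branch */
	int b = s_odd - bm;		/* competing candidate               */
	int c = s_even - bm;
	int d = s_odd + bm;

	*sum0 = (a > b) ? a : b;	/* surviving metric, first state     */
	*sum1 = (c > d) ? c : d;	/* surviving metric, second state    */
	*dec0 = (a > b) ? -1 : 0;	/* vcgtq_s16 produces an all-ones    */
	*dec1 = (c > d) ? -1 : 0;	/* mask when the first path wins     */
}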