/* Per-symbol path metric update for a 16-state (K = 5) code with two expected
 * output values per branch; portions elided from this listing are marked "...". */
int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
int16x8_t m0, m1, m2, m3, m4, m5, m6;
/* ... */

/* Load the input symbols and replicate them into both halves of a Q register */
input = vld1_s16(val);
m2 = vcombine_s16(input, input);

/* Load the 16 expected output symbols */
m0 = vld1q_s16(&out[0]);
m1 = vld1q_s16(&out[8]);

/* Multiply received against expected symbols and pairwise-add the products,
 * leaving the 8 branch metrics in m2 */
m0 = vmulq_s16(m2, m0);
m1 = vmulq_s16(m2, m1);
m2 = vcombine_s16(vpadd_s16(vget_low_s16(m0), vget_high_s16(m0)),
		  vpadd_s16(vget_low_s16(m1), vget_high_s16(m1)));

/* Load the accumulated path metrics */
m0 = vld1q_s16(&sums[0]);
m1 = vld1q_s16(&sums[8]);

/* ... add-compare-select and normalization not included in this listing ... */
/* Store the updated path metrics and the survivor-path decisions */
vst1q_s16(&sums[0], m2);
vst1q_s16(&sums[8], m6);
vst1q_s16(&paths[0], m5);
vst1q_s16(&paths[8], m4);
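Taken on its own, the multiply/pairwise-add sequence above is the entire branch metric step for this variant: the replicated input lanes are multiplied against the expected output table, and vpadd_s16 sums adjacent products, leaving one correlation value per branch. A scalar sketch of that step follows; the function and variable names (branch_metrics_n2_scalar, in, expected, bm) are illustrative and not taken from the header.

#include <stdint.h>

/* Scalar equivalent of the vmulq_s16/vpadd_s16 sequence above: 16 expected
 * output values and a replicated 4-lane input yield 8 branch metrics.
 * Illustrative names only. */
static void branch_metrics_n2_scalar(const int16_t *in,       /* replicated input, 4 values */
				     const int16_t *expected, /* out[0..15] */
				     int16_t *bm)             /* 8 branch metrics */
{
	for (int i = 0; i < 8; i++)
		bm[i] = in[(2 * i) % 4] * expected[2 * i] +
			in[(2 * i + 1) % 4] * expected[2 * i + 1];
}

If val holds the two received soft symbols replicated twice, this reduces to bm[i] = s0 * expected[2*i] + s1 * expected[2*i+1], a correlation of the received pair against each branch's expected output pair.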
/* Per-symbol path metric update for a 16-state code with four expected output
 * values per branch. */
int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
int16x8_t m0, m1, m2, m3, m4, m5, m6;
/* ... */

/* Load the input symbols and replicate them into both halves of a Q register */
input = vld1_s16(val);
m4 = vcombine_s16(input, input);

/* Load the 32 expected output symbols (four per branch) */
m0 = vld1q_s16(&out[0]);
m1 = vld1q_s16(&out[8]);
m2 = vld1q_s16(&out[16]);
m3 = vld1q_s16(&out[24]);

/* ... branch metric computation not included in this listing ... */

/* Load the accumulated path metrics */
m0 = vld1q_s16(&sums[0]);
m1 = vld1q_s16(&sums[8]);

/* ... add-compare-select and normalization not included in this listing ... */
vst1q_s16(&sums[0], m2);
vst1q_s16(&sums[8], m6);
vst1q_s16(&paths[0], m5);
vst1q_s16(&paths[8], m4);
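In this variant the branch metric arithmetic itself falls in a part of the source that is not shown, but the loads are telling: 32 expected output values for 8 branch groups means four values per metric. Below is a hedged scalar sketch of what such a four-output branch metric computes; all names are hypothetical.

#include <stdint.h>

/* Hypothetical scalar sketch of a four-output branch metric; the actual NEON
 * computation sits in lines not included in this listing. */
static void branch_metrics_n4_scalar(const int16_t *in,       /* 4 input symbols */
				     const int16_t *expected, /* out[0..31] */
				     int16_t *bm)             /* 8 branch metrics */
{
	for (int i = 0; i < 8; i++)
		bm[i] = in[0] * expected[4 * i + 0] +
			in[1] * expected[4 * i + 1] +
			in[2] * expected[4 * i + 2] +
			in[3] * expected[4 * i + 3];
}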
/* Per-symbol path metric update for the 64-state (K = 7) trellis, two
 * expected output values per branch. */
int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
/* ... */

/* Load the 64 accumulated path metrics */
m0 = vld1q_s16(&sums[0]);
m1 = vld1q_s16(&sums[8]);
m2 = vld1q_s16(&sums[16]);
m3 = vld1q_s16(&sums[24]);
m4 = vld1q_s16(&sums[32]);
m5 = vld1q_s16(&sums[40]);
m6 = vld1q_s16(&sums[48]);
m7 = vld1q_s16(&sums[56]);

/* Deinterleave the path metrics into butterfly order */
NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

/* Load the input symbols and replicate them into both halves of a Q register */
input = vld1_s16(val);
m7 = vcombine_s16(input, input);

/* Load expected output symbols 0-31 */
m0 = vld1q_s16(&out[0]);
m1 = vld1q_s16(&out[8]);
m2 = vld1q_s16(&out[16]);
m3 = vld1q_s16(&out[24]);

/* ... branch metrics for the first half not included in this listing ... */

/* Load expected output symbols 32-63 */
m0 = vld1q_s16(&out[32]);
m1 = vld1q_s16(&out[40]);
m2 = vld1q_s16(&out[48]);
m3 = vld1q_s16(&out[56]);

/* ... branch metrics and add-compare-select not included in this listing ... */

/* Store the survivor-path decisions */
vst1q_s16(&paths[0], m0);
vst1q_s16(&paths[8], m2);
vst1q_s16(&paths[32], m9);
vst1q_s16(&paths[40], m11);
/* ... */
vst1q_s16(&paths[16], m0);
vst1q_s16(&paths[24], m9);
vst1q_s16(&paths[48], m13);
vst1q_s16(&paths[56], m15);
/* ... */

/* Normalize the updated path metrics */
NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

/* Store the updated path metrics */
vst1q_s16(&sums[0], m4);
vst1q_s16(&sums[8], m5);
vst1q_s16(&sums[16], m6);
vst1q_s16(&sums[24], m7);
vst1q_s16(&sums[32], m1);
vst1q_s16(&sums[40], m3);
vst1q_s16(&sums[48], m2);
vst1q_s16(&sums[56], m11);
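NEON_NORMALIZE_K7 (its definition at conv_acc_neon_impl.h:49 onward is not shown here) runs over the updated metrics before they are stored. Going by the name, this is the usual Viterbi renormalization that keeps 16-bit accumulators from wrapping across many trellis steps; the following is a scalar sketch of that general technique, not of the macro's actual body.

#include <stdint.h>

/* Illustrative scalar form of path metric renormalization: subtract the
 * running minimum so accumulated metrics stay within int16_t range.
 * A sketch of the general technique, not the NEON_NORMALIZE_K7 body. */
static void normalize_metrics(int16_t *sums, int num_states)
{
	int16_t min = sums[0];

	for (int i = 1; i < num_states; i++)
		if (sums[i] < min)
			min = sums[i];
	for (int i = 0; i < num_states; i++)
		sums[i] -= min;
}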
/* Per-symbol path metric update for the 64-state (K = 7) trellis, four
 * expected output values per branch. */
int16_t *__restrict out = __builtin_assume_aligned(outa, 8);
int16_t *__restrict sums = __builtin_assume_aligned(sumsa, 8);
int16x8_t m0, m1, m2, m3, m4, m5, m6, m7;
int16x8_t m8, m9, m10, m11, m12, m13, m14, m15;
/* ... */

/* Load the 64 accumulated path metrics */
m0 = vld1q_s16(&sums[0]);
m1 = vld1q_s16(&sums[8]);
m2 = vld1q_s16(&sums[16]);
m3 = vld1q_s16(&sums[24]);
m4 = vld1q_s16(&sums[32]);
m5 = vld1q_s16(&sums[40]);
m6 = vld1q_s16(&sums[48]);
m7 = vld1q_s16(&sums[56]);

/* Deinterleave the path metrics into butterfly order */
NEON_DEINTERLEAVE_K7(m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14, m15)

/* Load the input symbols and replicate them into both halves of a Q register */
input = vld1_s16(val);
m7 = vcombine_s16(input, input);

/* Load expected output symbols 0-31 */
m0 = vld1q_s16(&out[0]);
m1 = vld1q_s16(&out[8]);
m2 = vld1q_s16(&out[16]);
m3 = vld1q_s16(&out[24]);
/* ... */

/* Load expected output symbols 32-63 */
m0 = vld1q_s16(&out[32]);
m1 = vld1q_s16(&out[40]);
m2 = vld1q_s16(&out[48]);
m3 = vld1q_s16(&out[56]);
/* ... */

/* Load expected output symbols 64-95 */
m0 = vld1q_s16(&out[64]);
m1 = vld1q_s16(&out[72]);
m2 = vld1q_s16(&out[80]);
m3 = vld1q_s16(&out[88]);
/* ... */

/* Load expected output symbols 96-127 */
m0 = vld1q_s16(&out[96]);
m1 = vld1q_s16(&out[104]);
m2 = vld1q_s16(&out[112]);
m3 = vld1q_s16(&out[120]);

/* ... branch metrics and add-compare-select not included in this listing ... */

/* Store the survivor-path decisions */
vst1q_s16(&paths[0], m0);
vst1q_s16(&paths[8], m2);
vst1q_s16(&paths[32], m9);
vst1q_s16(&paths[40], m11);
/* ... */
vst1q_s16(&paths[16], m0);
vst1q_s16(&paths[24], m9);
vst1q_s16(&paths[48], m13);
vst1q_s16(&paths[56], m15);
/* ... */

/* Normalize the updated path metrics */
NEON_NORMALIZE_K7(m4, m1, m5, m3, m6, m2, m7, m11, m0, m8, m9, m10)

/* Store the updated path metrics */
vst1q_s16(&sums[0], m4);
vst1q_s16(&sums[8], m5);
vst1q_s16(&sums[16], m6);
vst1q_s16(&sums[24], m7);
vst1q_s16(&sums[32], m1);
vst1q_s16(&sums[40], m3);
vst1q_s16(&sums[48], m2);
vst1q_s16(&sums[56], m11);
Macros referenced in the fragments above (defined earlier in the same header):
  NEON_DEINTERLEAVE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11, M12, M13, M14, M15) - definition at conv_acc_neon_impl.h:49
  NEON_NORMALIZE_K7(M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, M10, M11) - definition at conv_acc_neon_impl.h:94
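The fragments above each cover a single trellis step; the caller that walks a full block of received symbols is not part of this listing. Purely for orientation, a hypothetical driver for the K = 7, two-symbols-per-step case might look like the sketch below. Every name is invented, and the parameter list is only guessed from the identifiers visible in the fragments (val, outa, sumsa, paths); the real entry point may take additional arguments such as a normalization flag.

#include <stdint.h>

/* Hypothetical per-step kernel matching the K = 7, N = 2 fragment above. */
void k7_n2_step(const int16_t *val, const int16_t *outa,
		int16_t *sumsa, int16_t *paths);

/* Hypothetical driver: sums persists and is updated in place, while each
 * step appends one block of 64 survivor decisions to paths. */
void run_trellis(const int16_t *syms, int nsteps,
		 const int16_t *out_table, int16_t *sums, int16_t *paths)
{
	for (int i = 0; i < nsteps; i++)
		k7_n2_step(&syms[2 * i],     /* two soft symbols per step */
			   out_table,        /* expected outputs per branch */
			   sums,             /* 64 path metrics */
			   &paths[64 * i]);  /* decisions for this step */
}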