/*===---- arm_neon.h - NEON intrinsics --------------------------------------=== * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. * *===-----------------------------------------------------------------------=== */ #ifndef __ARM_NEON_H #define __ARM_NEON_H #ifndef __ARM_NEON__ #error "NEON support not enabled" #endif // NEON document appears to be specified in terms of stdint types. #include // Define some NEON-specific scalar types for floats and polynomials. 
// float32_t is plain float; poly8_t and poly16_t share the representation of
// the unsigned integer types of the same width.
typedef float float32_t;
typedef uint8_t poly8_t;
typedef uint16_t poly16_t;

// Stamps out one vector type in two layers:
//   __neon_<name>  - raw compiler vector, `bytes` wide, with lanes of `elt`
//   <name>         - public struct (tag __<name>) holding that vector in its
//                    single member `val`
// The macro is local to this section and is removed again once all types are
// declared.
#define __NEON_VECTOR_TYPE(elt, name, bytes)                                   \
  typedef __attribute__((__vector_size__(bytes))) elt __neon_##name;           \
  typedef struct __##name {                                                    \
    __neon_##name val;                                                         \
  } name

// Signed integer vectors.
__NEON_VECTOR_TYPE(int8_t, int8x8_t, 8);
__NEON_VECTOR_TYPE(int8_t, int8x16_t, 16);
__NEON_VECTOR_TYPE(int16_t, int16x4_t, 8);
__NEON_VECTOR_TYPE(int16_t, int16x8_t, 16);
__NEON_VECTOR_TYPE(int32_t, int32x2_t, 8);
__NEON_VECTOR_TYPE(int32_t, int32x4_t, 16);
__NEON_VECTOR_TYPE(int64_t, int64x1_t, 8);
__NEON_VECTOR_TYPE(int64_t, int64x2_t, 16);

// Unsigned integer vectors.
__NEON_VECTOR_TYPE(uint8_t, uint8x8_t, 8);
__NEON_VECTOR_TYPE(uint8_t, uint8x16_t, 16);
__NEON_VECTOR_TYPE(uint16_t, uint16x4_t, 8);
__NEON_VECTOR_TYPE(uint16_t, uint16x8_t, 16);
__NEON_VECTOR_TYPE(uint32_t, uint32x2_t, 8);
__NEON_VECTOR_TYPE(uint32_t, uint32x4_t, 16);
__NEON_VECTOR_TYPE(uint64_t, uint64x1_t, 8);
__NEON_VECTOR_TYPE(uint64_t, uint64x2_t, 16);

// Floating-point vectors. The float16 lanes are declared as uint16_t.
__NEON_VECTOR_TYPE(uint16_t, float16x4_t, 8);
__NEON_VECTOR_TYPE(uint16_t, float16x8_t, 16);
__NEON_VECTOR_TYPE(float32_t, float32x2_t, 8);
__NEON_VECTOR_TYPE(float32_t, float32x4_t, 16);

// Polynomial vectors.
__NEON_VECTOR_TYPE(poly8_t, poly8x8_t, 8);
__NEON_VECTOR_TYPE(poly8_t, poly8x16_t, 16);
__NEON_VECTOR_TYPE(poly16_t, poly16x4_t, 8);
__NEON_VECTOR_TYPE(poly16_t, poly16x8_t, 16);

#undef __NEON_VECTOR_TYPE

// FIXME: write tool to stamp out the structure-of-array types, possibly gen
// this whole file.
// Intrinsics, per ARM document DUI0348B #define _ATTRS_ai __attribute__((__always_inline__)) static _ATTRS_ai int8x8_t vadd_s8(int8x8_t a, int8x8_t b) { return (int8x8_t){a.val + b.val}; } static _ATTRS_ai int16x4_t vadd_s16(int16x4_t a, int16x4_t b) { return (int16x4_t){a.val + b.val}; } static _ATTRS_ai int32x2_t vadd_s32(int32x2_t a, int32x2_t b) { return (int32x2_t){a.val + b.val}; } static _ATTRS_ai int64x1_t vadd_s64(int64x1_t a, int64x1_t b) { return (int64x1_t){a.val + b.val}; } static _ATTRS_ai float32x2_t vadd_f32(float32x2_t a, float32x2_t b) { return (float32x2_t){a.val + b.val}; } static _ATTRS_ai uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b) { return (uint8x8_t){a.val + b.val}; } static _ATTRS_ai uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b) { return (uint16x4_t){a.val + b.val}; } static _ATTRS_ai uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b) { return (uint32x2_t){a.val + b.val}; } static _ATTRS_ai uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b) { return (uint64x1_t){a.val + b.val}; } static _ATTRS_ai int8x16_t vaddq_s8(int8x16_t a, int8x16_t b) { return (int8x16_t){a.val + b.val}; } static _ATTRS_ai int16x8_t vaddq_s16(int16x8_t a, int16x8_t b) { return (int16x8_t){a.val + b.val}; } static _ATTRS_ai int32x4_t vaddq_s32(int32x4_t a, int32x4_t b) { return (int32x4_t){a.val + b.val}; } static _ATTRS_ai int64x2_t vaddq_s64(int64x2_t a, int64x2_t b) { return (int64x2_t){a.val + b.val}; } static _ATTRS_ai float32x4_t vaddq_f32(float32x4_t a, float32x4_t b) { return (float32x4_t){a.val + b.val}; } static _ATTRS_ai uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b) { return (uint8x16_t){a.val + b.val}; } static _ATTRS_ai uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b) { return (uint16x8_t){a.val + b.val}; } static _ATTRS_ai uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b) { return (uint32x4_t){a.val + b.val}; } static _ATTRS_ai uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b) { return (uint64x2_t){a.val + b.val}; } // add // long add // wide add // halving 
add // rounding halving add // saturating add // add high half // rounding add high half // multiply // multiple accumulate // multiple accumulate long // multiple subtract // multiple subtract long // saturating doubling multiply high // saturating rounding doubling multiply high // saturating doubling multiply accumulate long // saturating doubling multiply subtract long // long multiply // saturating doubling long multiply // subtract // long subtract // wide subtract // saturating subtract // halving subtract // subtract high half // rounding subtract high half // compare eq // compare ge // compare le // compare gt // compare lt // compare abs ge // compare abs le // compare abs gt // compare abs lt // test bits // abs diff // abs diff long // abs diff accumulate // abs diff accumulate long // max // min // pairwise add // long pairwise add // long pairwise add accumulate // pairwise max // pairwise min // recip // recip sqrt // shl by vec // saturating shl by vec // rounding shl by vec // saturating rounding shl by vec // shr by constant // shl by constant // rounding shr by constant // shr by constant and accumulate // rounding shr by constant and accumulate // saturating shl by constant // s->u saturating shl by constant // narrowing saturating shr by constant // s->u narrowing saturating shr by constant // s->u rounding narrowing saturating shr by constant // narrowing saturating shr by constant // rounding narrowing shr by constant // rounding narrowing saturating shr by constant // widening shl by constant // shr and insert // shl and insert // loads and stores, single vector // loads and stores, lane // loads, dupe // loads and stores, arrays // vget,vgetq lane // vset, vsetq lane // vcreate // vdup, vdupq // vmov, vmovq // vdup_lane, vdupq_lane // vcombine // vget_high, vget_low // vcvt {u,s} <-> f, f <-> f16 // narrow // long move (unpack) // saturating narrow // saturating narrow s->u // table lookup // extended table lookup // mla with scalar // 
widening mla with scalar // widening saturating doubling mla with scalar // mls with scalar // widening mls with scalar // widening saturating doubling mls with scalar // mul by scalar // long mul with scalar // long mul by scalar // saturating doubling long mul with scalar // saturating doubling long mul by scalar // saturating doubling mul high with scalar // saturating doubling mul high by scalar // saturating rounding doubling mul high with scalar // saturating rounding doubling mul high by scalar // mla with scalar // widening mla with sclar // widening saturating doubling mla with scalar // mls with scalar // widening mls with scalar // widening saturating doubling mls with scalar // extract // endian swap (vrev) // abs // saturating abs // negate // saturating negate // count leading signs // count leading zeroes // popcount // recip_est // recip_sqrt_est // not // and // or // xor // andn // orn // bitselect // transpose elts // interleave elts // deinterleave elts // vreinterpret #endif /* __ARM_NEON_H */