// Univerxel/deps/FastNoise2/src/FastSIMD/Internal/NEON.h
#pragma once
#include <arm_neon.h>
#include "VecTools.h"
struct NEON_f32x4
{
FASTSIMD_INTERNAL_TYPE_SET( NEON_f32x4, float32x4_t );
constexpr FS_INLINE static uint8_t Size()
{
return 4;
}
FS_INLINE static NEON_f32x4 Zero()
{
return vdupq_n_f32( 0 );
}
FS_INLINE static NEON_f32x4 Incremented()
{
alignas(16) const float f[4]{ 0.0f, 1.0f, 2.0f, 3.0f };
return vld1q_f32( f );
}
FS_INLINE explicit NEON_f32x4( float f )
{
*this = vdupq_n_f32( f );
}
FS_INLINE explicit NEON_f32x4( float f0, float f1, float f2, float f3 )
{
alignas(16) const float f[4]{ f0, f1, f2, f3 };
*this = vld1q_f32( f );
}
FS_INLINE NEON_f32x4& operator+=( const NEON_f32x4& rhs )
{
*this = vaddq_f32( *this, rhs );
return *this;
}
FS_INLINE NEON_f32x4& operator-=( const NEON_f32x4& rhs )
{
*this = vsubq_f32( *this, rhs );
return *this;
}
FS_INLINE NEON_f32x4& operator*=( const NEON_f32x4& rhs )
{
*this = vmulq_f32( *this, rhs );
return *this;
}
FS_INLINE NEON_f32x4& operator/=( const NEON_f32x4& rhs )
{
float32x4_t reciprocal = vrecpeq_f32( rhs );
// use a couple of Newton-Raphson steps to refine the estimate. Depending on your
// application's accuracy requirements, you may be able to get away with only
// one refinement (instead of the two used here). Be sure to test!
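// Each vrecpsq_f32( d, x ) evaluates (2 - d*x), so multiplying it by the current
// estimate performs one Newton-Raphson step x' = x*(2 - d*x), roughly doubling
// the number of correct bits per iteration.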
reciprocal = vmulq_f32( vrecpsq_f32( rhs, reciprocal ), reciprocal );
reciprocal = vmulq_f32( vrecpsq_f32( rhs, reciprocal ), reciprocal );
// and finally, compute a/b = a*(1/b)
*this = vmulq_f32( *this, reciprocal );
return *this;
}
FS_INLINE NEON_f32x4 operator-() const
{
return vnegq_f32( *this );
}
};
FASTSIMD_INTERNAL_OPERATORS_FLOAT( NEON_f32x4 )
struct NEON_i32x4
{
FASTSIMD_INTERNAL_TYPE_SET( NEON_i32x4, int32x4_t );
constexpr FS_INLINE static uint8_t Size()
{
return 4;
}
FS_INLINE static NEON_i32x4 Zero()
{
return vdupq_n_s32( 0 );
}
FS_INLINE static NEON_i32x4 Incremented()
{
alignas(16) const int32_t f[4]{ 0, 1, 2, 3 };
return vld1q_s32( f );
}
FS_INLINE explicit NEON_i32x4( int32_t i )
{
*this = vdupq_n_s32( i );
}
FS_INLINE explicit NEON_i32x4( int32_t i0, int32_t i1, int32_t i2, int32_t i3 )
{
alignas(16) const int32_t f[4]{ i0, i1, i2, i3 };
*this = vld1q_s32( f );
}
FS_INLINE NEON_i32x4& operator+=( const NEON_i32x4& rhs )
{
*this = vaddq_s32( *this, rhs );
return *this;
}
FS_INLINE NEON_i32x4& operator-=( const NEON_i32x4& rhs )
{
*this = vsubq_s32( *this, rhs );
return *this;
}
FS_INLINE NEON_i32x4& operator*=( const NEON_i32x4& rhs )
{
*this = vmulq_s32( *this, rhs );
return *this;
}
FS_INLINE NEON_i32x4& operator&=( const NEON_i32x4& rhs )
{
*this = vandq_s32( *this, rhs );
return *this;
}
FS_INLINE NEON_i32x4& operator|=( const NEON_i32x4& rhs )
{
*this = vorrq_s32( *this, rhs );
return *this;
}
FS_INLINE NEON_i32x4& operator^=( const NEON_i32x4& rhs )
{
*this = veorq_s32( *this, rhs );
return *this;
}
FS_INLINE NEON_i32x4& operator>>=( const int32_t rhs )
{
*this = vshrq_n_s32( *this, rhs );
return *this;
}
FS_INLINE NEON_i32x4& operator<<=( const int32_t rhs )
{
*this = vshlq_n_s32( *this, rhs );
return *this;
}
FS_INLINE NEON_i32x4 operator~() const
{
return vmvnq_s32( *this );
}
FS_INLINE NEON_i32x4 operator-() const
{
return vnegq_s32( *this );
}
};
FASTSIMD_INTERNAL_OPERATORS_INT( NEON_i32x4, int32_t )
template<FastSIMD::eLevel LEVEL_T>
class FastSIMD_NEON_T
{
public:
static const FastSIMD::eLevel SIMD_Level = LEVEL_T;
static const size_t VectorSize = 128 / 8;
typedef NEON_f32x4 float32v;
typedef NEON_i32x4 int32v;
typedef NEON_i32x4 mask32v;
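// mask32v lanes are all-ones (true) or all-zeros (false). NEON comparison
// intrinsics return uint32x4_t, so results are reinterpreted to int32x4_t below.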
// Load
FS_INLINE static float32v Load_f32( void const* p )
{
return vld1q_f32( reinterpret_cast<float const*>(p) );
}
FS_INLINE static int32v Load_i32( void const* p )
{
return vld1q_s32( reinterpret_cast<int32_t const*>(p) );
}
// Store
FS_INLINE static void Store_f32( void* p, float32v a )
{
vst1q_f32( reinterpret_cast<float*>(p), a );
}
FS_INLINE static void Store_i32( void* p, int32v a )
{
vst1q_s32( reinterpret_cast<int32_t*>(p), a );
}
// Cast
FS_INLINE static float32v Casti32_f32( int32v a )
{
return vreinterpretq_f32_s32( a );
}
FS_INLINE static int32v Castf32_i32( float32v a )
{
return vreinterpretq_s32_f32( a );
}
// Convert
FS_INLINE static float32v Converti32_f32( int32v a )
{
return vcvtq_f32_s32( a );
}
FS_INLINE static int32v Convertf32_i32( float32v a )
{
return vcvtq_s32_f32( a );
}
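// Note: vcvtq_s32_f32 truncates toward zero. If round-to-nearest conversion is
// needed to match other backends, ARMv8 also provides vcvtnq_s32_f32.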
// Comparisons
FS_INLINE static mask32v Equal_f32( float32v a, float32v b )
{
return vreinterpretq_s32_u32( vceqq_f32( a, b ) );
}
FS_INLINE static mask32v GreaterThan_f32( float32v a, float32v b )
{
return vreinterpretq_s32_u32( vcgtq_f32( a, b ) );
}
FS_INLINE static mask32v LessThan_f32( float32v a, float32v b )
{
return vreinterpretq_s32_u32( vcltq_f32( a, b ) );
}
FS_INLINE static mask32v GreaterEqualThan_f32( float32v a, float32v b )
{
return vreinterpretq_s32_u32( vcgeq_f32( a, b ) );
}
FS_INLINE static mask32v LessEqualThan_f32( float32v a, float32v b )
{
return vreinterpretq_s32_u32( vcleq_f32( a, b ) );
}
FS_INLINE static mask32v Equal_i32( int32v a, int32v b )
{
return vreinterpretq_s32_u32( vceqq_s32( a, b ) );
}
FS_INLINE static mask32v GreaterThan_i32( int32v a, int32v b )
{
return vreinterpretq_s32_u32( vcgtq_s32( a, b ) );
}
FS_INLINE static mask32v LessThan_i32( int32v a, int32v b )
{
return vreinterpretq_s32_u32( vcltq_s32( a, b ) );
}
// Select
// vbslq picks bits from its second operand where the mask bit is set,
// so Select( m, a, b ) == m ? a : b
FS_INLINE static float32v Select_f32( mask32v m, float32v a, float32v b )
{
return vbslq_f32( vreinterpretq_u32_s32( m ), a, b );
}
FS_INLINE static int32v Select_i32( mask32v m, int32v a, int32v b )
{
return vbslq_s32( vreinterpretq_u32_s32( m ), a, b );
}
// Min, Max
FS_INLINE static float32v Min_f32( float32v a, float32v b )
{
return vminq_f32( a, b );
}
FS_INLINE static float32v Max_f32( float32v a, float32v b )
{
return vmaxq_f32( a, b );
}
FS_INLINE static int32v Min_i32( int32v a, int32v b )
{
return vminq_s32( a, b );
}
FS_INLINE static int32v Max_i32( int32v a, int32v b )
{
return vmaxq_s32( a, b );
}
// Bitwise
FS_INLINE static float32v BitwiseAnd_f32( float32v a, float32v b )
{
return vreinterpretq_f32_s32( vandq_s32( vreinterpretq_s32_f32( a ), vreinterpretq_s32_f32( b ) ) );
}
FS_INLINE static float32v BitwiseOr_f32( float32v a, float32v b )
{
return vreinterpretq_f32_s32( vorrq_s32( vreinterpretq_s32_f32( a ), vreinterpretq_s32_f32( b ) ) );
}
FS_INLINE static float32v BitwiseXor_f32( float32v a, float32v b )
{
return vreinterpretq_f32_s32( veorq_s32( vreinterpretq_s32_f32( a ), vreinterpretq_s32_f32( b ) ) );
}
FS_INLINE static float32v BitwiseNot_f32( float32v a )
{
return vreinterpretq_f32_s32( vmvnq_s32( vreinterpretq_s32_f32( a ) ) );
}
FS_INLINE static float32v BitwiseAndNot_f32( float32v a, float32v b )
{
// vbicq computes a & ~b in a single instruction
return vreinterpretq_f32_s32( vbicq_s32( vreinterpretq_s32_f32( a ), vreinterpretq_s32_f32( b ) ) );
}
FS_INLINE static int32v BitwiseAndNot_i32( int32v a, int32v b )
{
return vbicq_s32( a, b );
}
// Abs
FS_INLINE static float32v Abs_f32( float32v a )
{
return vabsq_f32( a );
}
FS_INLINE static int32v Abs_i32( int32v a )
{
return vabsq_s32( a );
}
// Float math
FS_INLINE static float32v Sqrt_f32( float32v a )
{
return vsqrtq_f32( a );
}
FS_INLINE static float32v InvSqrt_f32( float32v a )
{
return vrsqrteq_f32( a );
}
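// vrsqrteq_f32 only gives an ~8-bit estimate. If more precision is needed it can
// be refined like the reciprocal above, e.g. one step of:
//   r = vmulq_f32( vrsqrtsq_f32( vmulq_f32( a, r ), r ), r );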
// Floor, Ceil, Round: ARMv8 provides direct rounding instructions, so the
// bit-trick emulation from http://dss.stephanierct.com/DevBlog/?p=8 used by
// the SSE backends is not needed here.
FS_INLINE static float32v Floor_f32( float32v a )
{
return vrndmq_f32( a ); // round toward -infinity
}
FS_INLINE static float32v Ceil_f32( float32v a )
{
return vrndpq_f32( a ); // round toward +infinity
}
template<FastSIMD::eLevel L = LEVEL_T>
FS_INLINE static FS_ENABLE_IF( L < FastSIMD::ELevel_SSE41, float32v ) Round_f32( float32v a )
{
#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
const __m128 nearest2 = _mm_castsi128_ps( _mm_srli_epi32( _mm_cmpeq_epi32( _mm_setzero_si128(), _mm_setzero_si128() ), 2 ) );
#else
const __m128 nearest2 = vdupq_n_f32( 1.99999988079071044921875f );
#endif
__m128 aTrunc = _mm_cvtepi32_ps( _mm_cvttps_epi32( a ) ); // truncate a
__m128 rmd = _mm_sub_ps( a, aTrunc ); // get remainder
__m128 rmd2 = _mm_mul_ps( rmd, nearest2 ); // mul remainder by near 2 will yield the needed offset
__m128 rmd2Trunc = _mm_cvtepi32_ps( _mm_cvttps_epi32( rmd2 ) ); // after being truncated of course
return _mm_add_ps( aTrunc, rmd2Trunc );
}
template<FastSIMD::eLevel L = LEVEL_T>
FS_INLINE static FS_ENABLE_IF( L >= FastSIMD::ELevel_SSE41, float32v ) Round_f32( float32v a )
{
return vrndnq_f32( a );
}
// Mask
FS_INLINE static int32v Mask_i32( int32v a, mask32v m )
{
return a & m;
}
FS_INLINE static float32v Mask_f32( float32v a, mask32v m )
{
return BitwiseAnd_f32( a, vreinterpretq_f32_s32( m ) );
}
};
#if FASTSIMD_COMPILE_NEON
typedef FastSIMD_NEON_T<FastSIMD::ELevel_NEON> FastSIMD_NEON;
#endif
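// Illustrative usage sketch (not part of the library; the function name below is
// hypothetical, and the +/* operators are assumed to come from the
// FASTSIMD_INTERNAL_OPERATORS_FLOAT macro):
//
//   void AddScaled( float* out, const float* a, const float* b, float scale, size_t n )
//   {
//       typedef FastSIMD_NEON FS;
//       FS::float32v vScale( scale );
//       for( size_t i = 0; i < n; i += FS::float32v::Size() )
//       {
//           FS::float32v va = FS::Load_f32( a + i );
//           FS::float32v vb = FS::Load_f32( b + i );
//           FS::Store_f32( out + i, va + (vb * vScale) );
//       }
//   }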