449 lines
12 KiB
C++
449 lines
12 KiB
C++
#pragma once
|
|
|
|
#ifdef __GNUG__
|
|
#include <x86intrin.h>
|
|
#else
|
|
#include <intrin.h>
|
|
#endif
|
|
|
|
#include "VecTools.h"
|
|
|
|
namespace FastSIMD
|
|
{
|
|
struct AVX_f32x8
|
|
{
|
|
FASTSIMD_INTERNAL_TYPE_SET( AVX_f32x8, __m256 );
|
|
|
|
FS_INLINE static AVX_f32x8 Incremented()
|
|
{
|
|
return _mm256_set_ps( 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 0.0f );
|
|
}
|
|
|
|
FS_INLINE explicit AVX_f32x8( float f )
|
|
{
|
|
*this = _mm256_set1_ps( f );
|
|
}
|
|
|
|
FS_INLINE explicit AVX_f32x8( float f0, float f1, float f2, float f3, float f4, float f5, float f6, float f7 )
|
|
{
|
|
*this = _mm256_set_ps( f7, f6, f5, f4, f3, f2, f1, f0 );
|
|
}
|
|
|
|
FS_INLINE AVX_f32x8& operator+=( const AVX_f32x8& rhs )
|
|
{
|
|
*this = _mm256_add_ps( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX_f32x8& operator-=( const AVX_f32x8& rhs )
|
|
{
|
|
*this = _mm256_sub_ps( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX_f32x8& operator*=( const AVX_f32x8& rhs )
|
|
{
|
|
*this = _mm256_mul_ps( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX_f32x8& operator/=( const AVX_f32x8& rhs )
|
|
{
|
|
*this = _mm256_div_ps( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX_f32x8& operator&=( const AVX_f32x8& rhs )
|
|
{
|
|
*this = _mm256_and_ps( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX_f32x8& operator|=( const AVX_f32x8& rhs )
|
|
{
|
|
*this = _mm256_or_ps( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX_f32x8& operator^=( const AVX_f32x8& rhs )
|
|
{
|
|
*this = _mm256_xor_ps( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX_f32x8 operator~() const
|
|
{
|
|
#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
|
|
const __m256i neg1 = _mm256_cmpeq_epi32( _mm256_setzero_si256(), _mm256_setzero_si256() );
|
|
#else
|
|
const __m256i neg1 = _mm256_set1_epi32( -1 );
|
|
#endif
|
|
return _mm256_xor_ps( *this, _mm256_castsi256_ps( neg1 ) );
|
|
}
|
|
|
|
FS_INLINE AVX_f32x8 operator-() const
|
|
{
|
|
#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
|
|
const __m256i minInt = _mm256_slli_epi32( _mm256_cmpeq_epi32( _mm256_setzero_si256(), _mm256_setzero_si256() ), 31 );
|
|
#else
|
|
const __m256i minInt = _mm256_set1_epi32( 0x80000000 );
|
|
#endif
|
|
return _mm256_xor_ps( *this, _mm256_castsi256_ps( minInt ) );
|
|
}
|
|
|
|
FS_INLINE __m256i operator==( const AVX_f32x8& rhs )
|
|
{
|
|
return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_EQ_OS ) );
|
|
}
|
|
|
|
FS_INLINE __m256i operator!=( const AVX_f32x8& rhs )
|
|
{
|
|
return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_NEQ_OS ) );
|
|
}
|
|
|
|
FS_INLINE __m256i operator>( const AVX_f32x8& rhs )
|
|
{
|
|
return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_GT_OS ) );
|
|
}
|
|
|
|
FS_INLINE __m256i operator<( const AVX_f32x8& rhs )
|
|
{
|
|
return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_LT_OS ) );
|
|
}
|
|
|
|
FS_INLINE __m256i operator>=( const AVX_f32x8& rhs )
|
|
{
|
|
return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_GE_OS ) );
|
|
}
|
|
|
|
FS_INLINE __m256i operator<=( const AVX_f32x8& rhs )
|
|
{
|
|
return _mm256_castps_si256( _mm256_cmp_ps( *this, rhs, _CMP_LE_OS ) );
|
|
}
|
|
};
|
|
|
|
FASTSIMD_INTERNAL_OPERATORS_FLOAT( AVX_f32x8 )
|
|
|
|
|
|
struct AVX2_i32x8
|
|
{
|
|
FASTSIMD_INTERNAL_TYPE_SET( AVX2_i32x8, __m256i );
|
|
|
|
FS_INLINE static AVX2_i32x8 Incremented()
|
|
{
|
|
return _mm256_set_epi32( 7, 6, 5, 4, 3, 2, 1, 0 );
|
|
}
|
|
|
|
FS_INLINE explicit AVX2_i32x8( int32_t f )
|
|
{
|
|
*this = _mm256_set1_epi32( f );
|
|
}
|
|
|
|
FS_INLINE explicit AVX2_i32x8( int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4, int32_t i5, int32_t i6, int32_t i7 )
|
|
{
|
|
*this = _mm256_set_epi32( i7, i6, i5, i4, i3, i2, i1, i0 );
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8& operator+=( const AVX2_i32x8& rhs )
|
|
{
|
|
*this = _mm256_add_epi32( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8& operator-=( const AVX2_i32x8& rhs )
|
|
{
|
|
*this = _mm256_sub_epi32( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8& operator*=( const AVX2_i32x8& rhs )
|
|
{
|
|
*this = _mm256_mullo_epi32( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8& operator&=( const AVX2_i32x8& rhs )
|
|
{
|
|
*this = _mm256_and_si256( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8& operator|=( const AVX2_i32x8& rhs )
|
|
{
|
|
*this = _mm256_or_si256( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8& operator^=( const AVX2_i32x8& rhs )
|
|
{
|
|
*this = _mm256_xor_si256( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8& operator>>=( int32_t rhs )
|
|
{
|
|
*this = _mm256_srai_epi32( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8& operator<<=( int32_t rhs )
|
|
{
|
|
*this = _mm256_slli_epi32( *this, rhs );
|
|
return *this;
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8 operator~() const
|
|
{
|
|
#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
|
|
const __m256i neg1 = _mm256_cmpeq_epi32( _mm256_setzero_si256(), _mm256_setzero_si256() );
|
|
#else
|
|
const __m256i neg1 = _mm256_set1_epi32( -1 );
|
|
#endif
|
|
return _mm256_xor_si256( *this, neg1 );
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8 operator-() const
|
|
{
|
|
return _mm256_sub_epi32( _mm256_setzero_si256(), *this );
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8 operator==( const AVX2_i32x8& rhs )
|
|
{
|
|
return _mm256_cmpeq_epi32( *this, rhs );
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8 operator>( const AVX2_i32x8& rhs )
|
|
{
|
|
return _mm256_cmpgt_epi32( *this, rhs );
|
|
}
|
|
|
|
FS_INLINE AVX2_i32x8 operator<( const AVX2_i32x8& rhs )
|
|
{
|
|
return _mm256_cmpgt_epi32( rhs, *this );
|
|
}
|
|
};
|
|
|
|
FASTSIMD_INTERNAL_OPERATORS_INT( AVX2_i32x8, int32_t )
|
|
|
|
template<eLevel LEVEL_T>
|
|
class AVX_T
|
|
{
|
|
public:
|
|
static_assert( LEVEL_T >= Level_AVX && LEVEL_T <= Level_AVX2, "Cannot create template with unsupported SIMD level" );
|
|
|
|
static constexpr eLevel SIMD_Level = LEVEL_T;
|
|
|
|
template<size_t ElementSize = 8>
|
|
static constexpr size_t VectorSize = 256 / ElementSize;
|
|
|
|
typedef AVX_f32x8 float32v;
|
|
typedef AVX2_i32x8 int32v;
|
|
typedef AVX2_i32x8 mask32v;
|
|
|
|
// Load
|
|
|
|
FS_INLINE static float32v Load_f32( void const* p )
|
|
{
|
|
return _mm256_loadu_ps( reinterpret_cast<float const*>(p) );
|
|
}
|
|
|
|
FS_INLINE static int32v Load_i32( void const* p )
|
|
{
|
|
return _mm256_loadu_si256( reinterpret_cast<__m256i const*>(p) );
|
|
}
|
|
|
|
// Store
|
|
|
|
FS_INLINE static void Store_f32( void* p, float32v a )
|
|
{
|
|
_mm256_storeu_ps( reinterpret_cast<float*>(p), a );
|
|
}
|
|
|
|
FS_INLINE static void Store_i32( void* p, int32v a )
|
|
{
|
|
_mm256_storeu_si256( reinterpret_cast<__m256i*>(p), a );
|
|
}
|
|
|
|
// Cast
|
|
|
|
FS_INLINE static float32v Casti32_f32( int32v a )
|
|
{
|
|
return _mm256_castsi256_ps( a );
|
|
}
|
|
|
|
FS_INLINE static int32v Castf32_i32( float32v a )
|
|
{
|
|
return _mm256_castps_si256( a );
|
|
}
|
|
|
|
// Convert
|
|
|
|
FS_INLINE static float32v Converti32_f32( int32v a )
|
|
{
|
|
return _mm256_cvtepi32_ps( a );
|
|
}
|
|
|
|
FS_INLINE static int32v Convertf32_i32( float32v a )
|
|
{
|
|
return _mm256_cvtps_epi32( a );
|
|
}
|
|
|
|
// Select
|
|
|
|
FS_INLINE static float32v Select_f32( mask32v m, float32v a, float32v b )
|
|
{
|
|
return _mm256_blendv_ps( b, a, _mm256_castsi256_ps( m ) );
|
|
}
|
|
|
|
FS_INLINE static int32v Select_i32( mask32v m, int32v a, int32v b )
|
|
{
|
|
return _mm256_castps_si256( _mm256_blendv_ps( _mm256_castsi256_ps( b ), _mm256_castsi256_ps( a ), _mm256_castsi256_ps( m ) ) );
|
|
}
|
|
|
|
// Min, Max
|
|
|
|
FS_INLINE static float32v Min_f32( float32v a, float32v b )
|
|
{
|
|
return _mm256_min_ps( a, b );
|
|
}
|
|
|
|
FS_INLINE static float32v Max_f32( float32v a, float32v b )
|
|
{
|
|
return _mm256_max_ps( a, b );
|
|
}
|
|
|
|
FS_INLINE static int32v Min_i32( int32v a, int32v b )
|
|
{
|
|
return _mm256_min_epi32( a, b );
|
|
}
|
|
|
|
FS_INLINE static int32v Max_i32( int32v a, int32v b )
|
|
{
|
|
return _mm256_max_epi32( a, b );
|
|
}
|
|
|
|
// Bitwise
|
|
|
|
FS_INLINE static float32v BitwiseAndNot_f32( float32v a, float32v b )
|
|
{
|
|
return _mm256_andnot_ps( b, a );
|
|
}
|
|
|
|
FS_INLINE static int32v BitwiseAndNot_i32( int32v a, int32v b )
|
|
{
|
|
return _mm256_andnot_si256( b, a );
|
|
}
|
|
|
|
FS_INLINE static float32v BitwiseShiftRightZX_f32( float32v a, int32_t b )
|
|
{
|
|
return Casti32_f32( _mm256_srli_epi32( Castf32_i32( a ), b ) );
|
|
}
|
|
|
|
FS_INLINE static int32v BitwiseShiftRightZX_i32( int32v a, int32_t b )
|
|
{
|
|
return _mm256_srli_epi32( a, b );
|
|
}
|
|
|
|
// Abs
|
|
|
|
FS_INLINE static float32v Abs_f32( float32v a )
|
|
{
|
|
#if FASTSIMD_CONFIG_GENERATE_CONSTANTS
|
|
const __m256i intMax = _mm256_srli_epi32( _mm256_cmpeq_epi32( _mm256_setzero_si256(), _mm256_setzero_si256() ), 1 );
|
|
#else
|
|
const __m256i intMax = _mm256_set1_epi32( 0x7FFFFFFF );
|
|
#endif
|
|
return _mm256_and_ps( a, _mm256_castsi256_ps( intMax ) );
|
|
}
|
|
|
|
FS_INLINE static int32v Abs_i32( int32v a )
|
|
{
|
|
return _mm256_abs_epi32( a );
|
|
}
|
|
|
|
// Float math
|
|
|
|
FS_INLINE static float32v Sqrt_f32( float32v a )
|
|
{
|
|
return _mm256_sqrt_ps( a );
|
|
}
|
|
|
|
FS_INLINE static float32v InvSqrt_f32( float32v a )
|
|
{
|
|
return _mm256_rsqrt_ps( a );
|
|
}
|
|
|
|
FS_INLINE static float32v Reciprocal_f32( float32v a )
|
|
{
|
|
return _mm256_rcp_ps( a );
|
|
}
|
|
|
|
// Floor, Ceil, Round
|
|
|
|
FS_INLINE static float32v Floor_f32( float32v a )
|
|
{
|
|
return _mm256_round_ps( a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC );
|
|
}
|
|
|
|
FS_INLINE static float32v Ceil_f32( float32v a )
|
|
{
|
|
return _mm256_round_ps( a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC );
|
|
}
|
|
|
|
FS_INLINE static float32v Round_f32( float32v a )
|
|
{
|
|
return _mm256_round_ps( a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC );
|
|
}
|
|
|
|
//Mask
|
|
|
|
FS_INLINE static int32v Mask_i32( int32v a, mask32v m )
|
|
{
|
|
return a & m;
|
|
}
|
|
|
|
FS_INLINE static float32v Mask_f32( float32v a, mask32v m )
|
|
{
|
|
return _mm256_and_ps( a, _mm256_castsi256_ps( m ) );
|
|
}
|
|
|
|
FS_INLINE static int32v NMask_i32( int32v a, mask32v m )
|
|
{
|
|
return _mm256_andnot_si256( m, a );
|
|
}
|
|
|
|
FS_INLINE static float32v NMask_f32( float32v a, mask32v m )
|
|
{
|
|
return _mm256_andnot_ps( _mm256_castsi256_ps( m ), a );
|
|
}
|
|
|
|
FS_INLINE static bool AnyMask_bool( mask32v m )
|
|
{
|
|
return _mm256_movemask_ps( _mm256_castsi256_ps( m ) );
|
|
}
|
|
};
|
|
|
|
#if FASTSIMD_COMPILE_AVX
// Function set instantiated for the AVX level.
using AVX = AVX_T<Level_AVX>;
#endif
|
|
|
|
#if FASTSIMD_COMPILE_AVX2
|
|
// Function set instantiated for the AVX2 level.
using AVX2 = AVX_T<Level_AVX2>;
|
|
|
|
#if FASTSIMD_USE_FMA
|
|
template<>
|
|
FS_INLINE AVX2::float32v FMulAdd_f32<AVX2>( AVX2::float32v a, AVX2::float32v b, AVX2::float32v c )
|
|
{
|
|
return _mm256_fmadd_ps( a, b, c );
|
|
}
|
|
|
|
template<>
|
|
FS_INLINE AVX2::float32v FNMulAdd_f32<AVX2>( AVX2::float32v a, AVX2::float32v b, AVX2::float32v c )
|
|
{
|
|
return _mm256_fnmadd_ps( a, b, c );
|
|
}
|
|
#endif
|
|
#endif
|
|
|
|
}
|