#pragma once

#include <arm_neon.h>

#include "VecTools.h"

struct NEON_f32x4
{
    FASTSIMD_INTERNAL_TYPE_SET( NEON_f32x4, float32x4_t );

    constexpr FS_INLINE static uint8_t Size() { return 4; }

    FS_INLINE static NEON_f32x4 Zero() { return vdupq_n_f32( 0 ); }

    FS_INLINE static NEON_f32x4 Incremented()
    {
        alignas(16) const float f[4]{ 0.0f, 1.0f, 2.0f, 3.0f };
        return vld1q_f32( f );
    }

    FS_INLINE explicit NEON_f32x4( float f ) { *this = vdupq_n_f32( f ); }

    FS_INLINE explicit NEON_f32x4( float f0, float f1, float f2, float f3 )
    {
        alignas(16) const float f[4]{ f0, f1, f2, f3 };
        *this = vld1q_f32( f );
    }

    FS_INLINE NEON_f32x4& operator+=( const NEON_f32x4& rhs ) { *this = vaddq_f32( *this, rhs ); return *this; }
    FS_INLINE NEON_f32x4& operator-=( const NEON_f32x4& rhs ) { *this = vsubq_f32( *this, rhs ); return *this; }
    FS_INLINE NEON_f32x4& operator*=( const NEON_f32x4& rhs ) { *this = vmulq_f32( *this, rhs ); return *this; }

    FS_INLINE NEON_f32x4& operator/=( const NEON_f32x4& rhs )
    {
        float32x4_t reciprocal = vrecpeq_f32( rhs );
        // Use a couple of Newton-Raphson steps to refine the estimate. Depending on your
        // application's accuracy requirements, you may be able to get away with only
        // one refinement (instead of the two used here). Be sure to test!
        reciprocal = vmulq_f32( vrecpsq_f32( rhs, reciprocal ), reciprocal );
        reciprocal = vmulq_f32( vrecpsq_f32( rhs, reciprocal ), reciprocal );

        // And finally, compute a/b = a*(1/b)
        *this = vmulq_f32( *this, reciprocal );
        return *this;
    }

    FS_INLINE NEON_f32x4 operator-() const { return vnegq_f32( *this ); }
};

FASTSIMD_INTERNAL_OPERATORS_FLOAT( NEON_f32x4 )

struct NEON_i32x4
{
    FASTSIMD_INTERNAL_TYPE_SET( NEON_i32x4, int32x4_t );

    constexpr FS_INLINE static uint8_t Size() { return 4; }

    FS_INLINE static NEON_i32x4 Zero() { return vdupq_n_s32( 0 ); }

    FS_INLINE static NEON_i32x4 Incremented()
    {
        alignas(16) const int32_t f[4]{ 0, 1, 2, 3 };
        return vld1q_s32( f );
    }

    FS_INLINE explicit NEON_i32x4( int32_t i ) { *this = vdupq_n_s32( i ); }

    FS_INLINE explicit NEON_i32x4( int32_t i0, int32_t i1, int32_t i2, int32_t i3 )
    {
        alignas(16) const int32_t f[4]{ i0, i1, i2, i3 };
        *this = vld1q_s32( f );
    }

    FS_INLINE NEON_i32x4& operator+=( const NEON_i32x4& rhs ) { *this = vaddq_s32( *this, rhs ); return *this; }
    FS_INLINE NEON_i32x4& operator-=( const NEON_i32x4& rhs ) { *this = vsubq_s32( *this, rhs ); return *this; }
    FS_INLINE NEON_i32x4& operator*=( const NEON_i32x4& rhs ) { *this = vmulq_s32( *this, rhs ); return *this; }
    FS_INLINE NEON_i32x4& operator&=( const NEON_i32x4& rhs ) { *this = vandq_s32( *this, rhs ); return *this; }
    FS_INLINE NEON_i32x4& operator|=( const NEON_i32x4& rhs ) { *this = vorrq_s32( *this, rhs ); return *this; }
    FS_INLINE NEON_i32x4& operator^=( const NEON_i32x4& rhs ) { *this = veorq_s32( *this, rhs ); return *this; }

    // vshrq_n_s32/vshlq_n_s32 require immediate shift counts, so shift by a runtime count
    // using vshlq_s32 (a negative count shifts right)
    FS_INLINE NEON_i32x4& operator>>=( const int32_t rhs ) { *this = vshlq_s32( *this, vdupq_n_s32( -rhs ) ); return *this; }
    FS_INLINE NEON_i32x4& operator<<=( const int32_t rhs ) { *this = vshlq_s32( *this, vdupq_n_s32( rhs ) ); return *this; }

    FS_INLINE NEON_i32x4 operator~() const { return vmvnq_s32( *this ); }
    FS_INLINE NEON_i32x4 operator-() const { return vnegq_s32( *this ); }
};

FASTSIMD_INTERNAL_OPERATORS_INT( NEON_i32x4, int32_t )
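// Illustrative sketch (not part of the FastSIMD API; the helper name is hypothetical):
// the reciprocal refinement used in NEON_f32x4::operator/= above, written out in scalar
// form. vrecpsq_f32( b, r ) computes 2 - b * r, so each Newton-Raphson step evaluates
// r' = r * (2 - b * r), which roughly doubles the number of correct bits in the estimate.
static inline float FS_Example_RefinedReciprocal( float b )
{
    float r = 1.0f / ( b + b * 0.05f ); // stand-in for the coarse hardware estimate from vrecpeq_f32
    r = r * ( 2.0f - b * r );           // first refinement step
    r = r * ( 2.0f - b * r );           // second refinement step
    return r;
}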
template<FastSIMD::eLevel LEVEL_T>
class FastSIMD_NEON_T
{
public:
    static const FastSIMD::eLevel SIMD_Level = LEVEL_T;
    static const size_t VectorSize = 128 / 8;

    typedef NEON_f32x4 float32v;
    typedef NEON_i32x4 int32v;
    typedef NEON_i32x4 mask32v;

    // Load
    FS_INLINE static float32v Load_f32( void const* p ) { return vld1q_f32( reinterpret_cast<float const*>( p ) ); }
    FS_INLINE static int32v Load_i32( void const* p ) { return vld1q_s32( reinterpret_cast<int32_t const*>( p ) ); }

    // Store
    FS_INLINE static void Store_f32( void* p, float32v a ) { vst1q_f32( reinterpret_cast<float*>( p ), a ); }
    FS_INLINE static void Store_i32( void* p, int32v a ) { vst1q_s32( reinterpret_cast<int32_t*>( p ), a ); }

    // Cast
    FS_INLINE static float32v Casti32_f32( int32v a ) { return vreinterpretq_f32_s32( a ); }
    FS_INLINE static int32v Castf32_i32( float32v a ) { return vreinterpretq_s32_f32( a ); }

    // Convert
    FS_INLINE static float32v Converti32_f32( int32v a ) { return vcvtq_f32_s32( a ); }
    FS_INLINE static int32v Convertf32_i32( float32v a ) { return vcvtq_s32_f32( a ); }

    // Comparisons
    FS_INLINE static mask32v Equal_f32( float32v a, float32v b ) { return vreinterpretq_s32_u32( vceqq_f32( a, b ) ); }
    FS_INLINE static mask32v GreaterThan_f32( float32v a, float32v b ) { return vreinterpretq_s32_u32( vcgtq_f32( a, b ) ); }
    FS_INLINE static mask32v LessThan_f32( float32v a, float32v b ) { return vreinterpretq_s32_u32( vcltq_f32( a, b ) ); }
    FS_INLINE static mask32v GreaterEqualThan_f32( float32v a, float32v b ) { return vreinterpretq_s32_u32( vcgeq_f32( a, b ) ); }
    FS_INLINE static mask32v LessEqualThan_f32( float32v a, float32v b ) { return vreinterpretq_s32_u32( vcleq_f32( a, b ) ); }

    FS_INLINE static mask32v Equal_i32( int32v a, int32v b ) { return vreinterpretq_s32_u32( vceqq_s32( a, b ) ); }
    FS_INLINE static mask32v GreaterThan_i32( int32v a, int32v b ) { return vreinterpretq_s32_u32( vcgtq_s32( a, b ) ); }
    FS_INLINE static mask32v LessThan_i32( int32v a, int32v b ) { return vreinterpretq_s32_u32( vcltq_s32( a, b ) ); }

    // Select: returns a where the mask is set, b otherwise
    FS_INLINE static float32v Select_f32( mask32v m, float32v a, float32v b ) { return vbslq_f32( vreinterpretq_u32_s32( m ), a, b ); }
    FS_INLINE static int32v Select_i32( mask32v m, int32v a, int32v b ) { return vbslq_s32( vreinterpretq_u32_s32( m ), a, b ); }

    // Min, Max
    FS_INLINE static float32v Min_f32( float32v a, float32v b ) { return vminq_f32( a, b ); }
    FS_INLINE static float32v Max_f32( float32v a, float32v b ) { return vmaxq_f32( a, b ); }
    FS_INLINE static int32v Min_i32( int32v a, int32v b ) { return vminq_s32( a, b ); }
    FS_INLINE static int32v Max_i32( int32v a, int32v b ) { return vmaxq_s32( a, b ); }

    // Bitwise
    FS_INLINE static float32v BitwiseAnd_f32( float32v a, float32v b ) { return vreinterpretq_f32_s32( vandq_s32( vreinterpretq_s32_f32( a ), vreinterpretq_s32_f32( b ) ) ); }
    FS_INLINE static float32v BitwiseOr_f32( float32v a, float32v b ) { return vreinterpretq_f32_s32( vorrq_s32( vreinterpretq_s32_f32( a ), vreinterpretq_s32_f32( b ) ) ); }
    FS_INLINE static float32v BitwiseXor_f32( float32v a, float32v b ) { return vreinterpretq_f32_s32( veorq_s32( vreinterpretq_s32_f32( a ), vreinterpretq_s32_f32( b ) ) ); }
    FS_INLINE static float32v BitwiseNot_f32( float32v a ) { return vreinterpretq_f32_s32( vmvnq_s32( vreinterpretq_s32_f32( a ) ) ); }
    FS_INLINE static float32v BitwiseAndNot_f32( float32v a, float32v b ) { return vreinterpretq_f32_s32( vandq_s32( vreinterpretq_s32_f32( a ), vmvnq_s32( vreinterpretq_s32_f32( b ) ) ) ); }
    FS_INLINE static int32v BitwiseAndNot_i32( int32v a, int32v b ) { return vandq_s32( a, vmvnq_s32( b ) ); }

    // Abs
    FS_INLINE static float32v Abs_f32( float32v a ) { return vabsq_f32( a ); }
    FS_INLINE static int32v Abs_i32( int32v a ) { return vabsq_s32( a ); }

    // Float math
    FS_INLINE static float32v Sqrt_f32( float32v a ) { return vsqrtq_f32( a ); }
    FS_INLINE static float32v InvSqrt_f32( float32v a ) { return vrsqrteq_f32( a ); }
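    // Note: InvSqrt_f32 above returns only the raw vrsqrteq_f32 estimate (roughly 8-9 bits
    // of precision). If more accuracy is required, the estimate can be refined with
    // Newton-Raphson steps, for example (illustrative sketch, not part of this class):
    //   float32x4_t r = vrsqrteq_f32( a );
    //   r = vmulq_f32( r, vrsqrtsq_f32( vmulq_f32( a, r ), r ) ); // one refinement step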
    // Floor, Ceil, Round: http://dss.stephanierct.com/DevBlog/?p=8
    // (Where the ARMv8 rounding instructions are available, vrndmq_f32 / vrndpq_f32 /
    // vrndnq_f32 could be used directly instead of the truncation-based versions below.)
    FS_INLINE static float32v Floor_f32( float32v a )
    {
        const float32x4_t f1 = vdupq_n_f32( 1.0f );

        // Truncate toward zero, then subtract 1 where the truncation overshot (a < trunc(a))
        float32x4_t fval = vcvtq_f32_s32( vcvtq_s32_f32( a ) );

        return vsubq_f32( fval, BitwiseAnd_f32( vreinterpretq_f32_u32( vcltq_f32( a, fval ) ), f1 ) );
    }

    FS_INLINE static float32v Ceil_f32( float32v a )
    {
        const float32x4_t f1 = vdupq_n_f32( 1.0f );

        // Truncate toward zero, then add 1 where the truncation undershot (a > trunc(a))
        float32x4_t fval = vcvtq_f32_s32( vcvtq_s32_f32( a ) );

        return vaddq_f32( fval, BitwiseAnd_f32( vreinterpretq_f32_u32( vcgtq_f32( a, fval ) ), f1 ) );
    }

    FS_INLINE static float32v Round_f32( float32v a )
    {
        const float32x4_t nearest2 = vdupq_n_f32( 1.99999988079071044921875f );

        float32x4_t aTrunc = vcvtq_f32_s32( vcvtq_s32_f32( a ) );       // truncate a
        float32x4_t rmd = vsubq_f32( a, aTrunc );                        // get remainder
        float32x4_t rmd2 = vmulq_f32( rmd, nearest2 );                   // mul remainder by nearly 2 to get the needed offset
        float32x4_t rmd2Trunc = vcvtq_f32_s32( vcvtq_s32_f32( rmd2 ) );  // after being truncated of course

        return vaddq_f32( aTrunc, rmd2Trunc );
    }

    // Mask
    FS_INLINE static int32v Mask_i32( int32v a, mask32v m ) { return a & m; }
    FS_INLINE static float32v Mask_f32( float32v a, mask32v m ) { return BitwiseAnd_f32( a, vreinterpretq_f32_s32( m ) ); }
};

#if FASTSIMD_COMPILE_NEON
typedef FastSIMD_NEON_T<FastSIMD::Level_NEON> FastSIMD_NEON;
#endif
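// Example usage sketch (illustrative only; FASTSIMD_NEON_EXAMPLE_USAGE is a hypothetical
// guard, not a FastSIMD macro). Shows how a level-templated backend like the one above is
// typically driven: load, compute, and store in 4-wide blocks. Assumes the binary + and *
// operators for float32v are provided by FASTSIMD_INTERNAL_OPERATORS_FLOAT.
#if defined( FASTSIMD_NEON_EXAMPLE_USAGE ) && FASTSIMD_COMPILE_NEON
static void FS_Example_ScaleAndOffset( float* data, size_t count )
{
    typedef FastSIMD_NEON FS;

    FS::float32v scale( 0.5f );
    FS::float32v offset( 1.0f );

    // Process full 4-float blocks; any remainder would need a scalar tail loop
    for( size_t i = 0; i + FS::float32v::Size() <= count; i += FS::float32v::Size() )
    {
        FS::float32v v = FS::Load_f32( data + i );
        FS::Store_f32( data + i, v * scale + offset );
    }
}
#endif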