/** \file ducc0/infra/simd.h * Functionality which approximates future standard C++ SIMD classes. * * For details see section 9 of https://wg21.link/N4808 * * \copyright Copyright (C) 2019-2021 Max-Planck-Society * \author Martin Reinecke */ /* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ /* All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ /* * This code is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This code is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this code; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* NOTE(review): this copy of the header looks extraction-damaged: every angle-bracket span is missing (template parameter/argument lists, #include targets, and any code that sat between a less-than sign and a later greater-than sign), and the file is wrapped at arbitrary points. The comments added below only describe what is still visible and should be re-checked against an intact copy. This physical line holds: the include guard DUCC0_SIMD_H; a branch disabled by '#if 0' that maps the interface onto std::experimental simd types (simd_select, vectorizable, simd_exists_h/simd_exists, apply, sin, cos); and the start of the active '#else' branch, which defines DUCC0_NO_SIMD unless the user already did and then removes it again for sufficiently recent compilers (the AppleClang test continues on the next line). */ #ifndef DUCC0_SIMD_H #define DUCC0_SIMD_H #if 0 //__has_include() #include #include #include #include #include namespace ducc0 { namespace detail_simd { namespace stdx=std::experimental; using stdx::native_simd; template struct simd_select { using type = stdx::simd>; }; using stdx::element_aligned_tag; template constexpr inline bool vectorizable = native_simd::size()>1; template constexpr bool simd_exists_h() { if constexpr (N>1) if constexpr (vectorizable) if constexpr (!std::is_same_v>, stdx::fixed_size_simd>) return true; return false; } template constexpr inline bool simd_exists = simd_exists_h(); template inline stdx::simd apply(stdx::simd in, Func func) { stdx::simd res; for (size_t i=0; i inline stdx::simd sin(stdx::simd in) { return apply(in,[](T v){return sin(v);}); } template inline stdx::simd cos(stdx::simd in) { return apply(in,[](T v){return cos(v);}); } } using detail_simd::element_aligned_tag; using detail_simd::native_simd; using detail_simd::simd_select; using detail_simd::simd_exists; using detail_simd::vectorizable; } #else // only enable SIMD support for gcc>=5.0 and clang>=5.0 #ifndef DUCC0_NO_SIMD #define DUCC0_NO_SIMD #if defined(__clang__) // AppleClang has their own version numbering #ifdef __apple_build_version__ # if (__clang_major__ > 9) || 
/* This physical line holds: the rest of the compiler test (DUCC0_NO_SIMD is
 * undefined again for AppleClang 9.1 or later, other clang with major version
 * 5 or later, and GCC with major version 5 or later); the standard includes
 * (names lost); with SIMD enabled, the intrinsics include for __SSE2__ and,
 * on __aarch64__, the choice between DUCC0_USE_SVE (needs __ARM_FEATURE_SVE
 * and __ARM_FEATURE_SVE_BITS>0) and DUCC0_USE_NEON (needs __ARM_NEON, only
 * tried when SVE was not selected); then namespace ducc0::detail_simd with
 * the defaults vectorizable=false and simd_exists=false, two vectorizable
 * specialisations set to true when SSE2, SVE or NEON is in use, and
 * vectorlen, which evaluates to reglen/sizeof(T) when vectorizable is true
 * and to 1 otherwise. */
(__clang_major__ == 9 && __clang_minor__ >= 1) # undef DUCC0_NO_SIMD # endif #elif __clang_major__ >= 5 # undef DUCC0_NO_SIMD #endif #elif defined(__GNUC__) #if __GNUC__>=5 #undef DUCC0_NO_SIMD #endif #endif #endif #include #include #include #ifndef DUCC0_NO_SIMD #if defined(__SSE2__) // we are on an x86 platform and we have vector types #include #endif #if defined(__aarch64__) // let's check for SVE and Neon #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) #if __ARM_FEATURE_SVE_BITS>0 // OK, we can use SVE #define DUCC0_USE_SVE #include #endif #endif #ifndef DUCC0_USE_SVE // see if we can use Neon #if defined(__ARM_NEON) #define DUCC0_USE_NEON #include #endif #endif #endif #endif namespace ducc0 { namespace detail_simd { /// true iff SIMD support is provided for \a T. template constexpr inline bool vectorizable = false; #if (!defined(DUCC0_NO_SIMD)) #if defined(__SSE2__) || defined (DUCC0_USE_SVE) || defined (DUCC0_USE_NEON) template<> constexpr inline bool vectorizable = true; template<> constexpr inline bool vectorizable = true; #endif #endif /// true iff a SIMD type with vector length \a len exists for \a T. template constexpr inline bool simd_exists = false; template constexpr size_t vectorlen = vectorizable ? 
/* This physical line holds: the forward declaration of helper_, the backend
 * class that the two wrappers below delegate to;
 * vmask_, a thin wrapper around the backend mask type hlp::Tm that forwards
 * none()/any()/all() and the &, &=, |, |= operators to hlp::mask_none,
 * mask_any, mask_all, mask_and and mask_or (special members are spelled out
 * by hand under _MSC_VER and defaulted elsewhere);
 * the empty tag type element_aligned_tag;
 * and the first part of vtp, the vector wrapper around hlp::Tv: size()
 * returns len, construction/assignment from a scalar goes through
 * hlp::from_scalar, the (pointer, element_aligned_tag) constructor and
 * copy_to() use hlp::loadu/hlp::storeu, and unary minus and + - * / apply the
 * corresponding operators to the stored Tv. */
reglen/sizeof(T) : 1; template class helper_; template struct vmask_ { private: using hlp = helper_; using Tm = typename hlp::Tm; Tm v; public: #if defined(_MSC_VER) vmask_() {} vmask_(const vmask_ &other) : v(other.v) {} vmask_ &operator=(const vmask_ &other) { v = other.v; return *this; } #else vmask_() = default; vmask_(const vmask_ &other) = default; vmask_ &operator=(const vmask_ &other) = default; #endif vmask_(Tm v_): v(v_) {} operator Tm() const { return v; } bool none() const { return hlp::mask_none(v); } bool any() const { return hlp::mask_any(v); } bool all() const { return hlp::mask_all(v); } vmask_ operator& (const vmask_ &other) const { return hlp::mask_and(v,other.v); } vmask_ &operator&= (const vmask_ &other) { v=hlp::mask_and(v,other.v); return *this; } vmask_ operator| (const vmask_ &other) const { return hlp::mask_or(v,other.v); } vmask_ &operator|= (const vmask_ &other) { v=hlp::mask_or(v,other.v); return *this; } }; struct element_aligned_tag {}; template class vtp { private: using hlp = helper_; public: using value_type = T; using Tv = typename hlp::Tv; using Tm = vmask_; static constexpr size_t size() { return len; } private: Tv v; public: #if defined(_MSC_VER) vtp() {} vtp(const vtp &other): v(other.v) {} vtp &operator=(const vtp &other) { v=other.v; return *this; } #else vtp() = default; vtp(const vtp &other) = default; vtp &operator=(const vtp &other) = default; #endif vtp(T other): vtp(hlp::from_scalar(other)) {} vtp(const Tv &other) : v(other) {} vtp &operator=(const T &other) { v=hlp::from_scalar(other); return *this; } operator Tv() const { return v; } vtp(const T *ptr, element_aligned_tag) : v(hlp::loadu(ptr)) {} void copy_to(T *ptr, element_aligned_tag) const { hlp::storeu(ptr, v); } vtp operator-() const { return vtp(-v); } vtp operator+(vtp other) const { return vtp(v+other.v); } vtp operator-(vtp other) const { return vtp(v-other.v); } vtp operator*(vtp other) const { return vtp(v*other.v); } vtp operator/(vtp other) const { 
/* This physical line continues vtp: compound assignments; abs/sqrt/max/min
 * forwarded to the backend; comparisons that return the mask type Tm via
 * hlp::gt, ge, lt, le, eq, ne; static blend(mask, a, b) forwarded to
 * hlp::blend; the nested proxy class `reference` (refers to element i of a
 * vtp; supports =, *= and conversion to T) returned by the non-const
 * operator[]; Set() and the const operator[], which index the stored Tv
 * directly; and the nested class where_expr, whose =, *=, += and -= write
 * hlp::blend(m, new_value, old_value) back into the referenced vtp.
 * After the class: free abs() and the start of where(), which builds a
 * where_expr from a mask and a vtp reference. */
return vtp(v/other.v); } vtp &operator+=(vtp other) { v+=other.v; return *this; } vtp &operator-=(vtp other) { v-=other.v; return *this; } vtp &operator*=(vtp other) { v*=other.v; return *this; } vtp &operator/=(vtp other) { v/=other.v; return *this; } vtp abs() const { return hlp::abs(v); } inline vtp sqrt() const { return hlp::sqrt(v); } vtp max(const vtp &other) const { return hlp::max(v, other.v); } vtp min(const vtp &other) const { return hlp::min(v, other.v); } Tm operator>(const vtp &other) const { return hlp::gt(v, other.v); } Tm operator>=(const vtp &other) const { return hlp::ge(v, other.v); } Tm operator<(const vtp &other) const { return hlp::lt(v, other.v); } Tm operator<=(const vtp &other) const { return hlp::le(v, other.v); } Tm operator==(const vtp &other) const { return hlp::eq(v, other.v); } Tm operator!=(const vtp &other) const { return hlp::ne(v, other.v); } static vtp blend(Tm mask, const vtp &a, const vtp &b) { return hlp::blend(mask, a, b); } class reference { private: vtp &v; size_t i; public: reference (vtp &v_, size_t i_) : v(v_), i(i_) {} reference &operator= (T other) { v.v[i] = other; return *this; } reference &operator*= (T other) { v.v[i] *= other; return *this; } operator T() const { return v.v[i]; } }; void Set(size_t i, T val) { v[i] = val; } reference operator[](size_t i) { return reference(*this, i); } T operator[](size_t i) const { return v[i]; } class where_expr { private: vtp &v; Tm m; public: where_expr (Tm m_, vtp &v_) : v(v_), m(m_) {} where_expr &operator= (const vtp &other) { v=hlp::blend(m, other.v, v.v); return *this; } where_expr &operator*= (const vtp &other) { v=hlp::blend(m, v.v*other.v, v.v); return *this; } where_expr &operator+= (const vtp &other) { v=hlp::blend(m, v.v+other.v, v.v); return *this; } where_expr &operator-= (const vtp &other) { v=hlp::blend(m, v.v-other.v, v.v); return *this; } }; }; template inline vtp abs(vtp v) { return v.abs(); } template typename vtp::where_expr where(typename vtp::Tm m, vtp 
/* This physical line holds: the end of where(); free operators with a scalar
 * on the left (operator* returns b*a, operator+ returns b+a, operator-
 * returns vtp(a)-b); free max, min, sqrt, none_of, any_of, all_of and blend,
 * each forwarding to the corresponding member; reduce(), which starts from
 * v[0] (the rest of its loop is lost in this copy); apply(), whose body is
 * likewise truncated; and the first part of pseudoscalar, a wrapper around a
 * single T offering unary minus, + - * /, their compound forms, and abs()
 * implemented with std::abs. */
&v) { return typename vtp::where_expr(m, v); } template vtp operator*(T0 a, vtp b) { return b*a; } template vtp operator+(T a, vtp b) { return b+a; } template vtp operator-(T a, vtp b) { return vtp(a) - b; } template vtp max(vtp a, vtp b) { return a.max(b); } template vtp min(vtp a, vtp b) { return a.min(b); } template vtp sqrt(vtp v) { return v.sqrt(); } template inline bool none_of(const vmask_ &mask) { return mask.none(); } template inline bool any_of(const vmask_ &mask) { return mask.any(); } template inline bool all_of(const vmask_ &mask) { return mask.all(); } template inline vtp blend (const vmask_ &mask, const vtp &a, const vtp &b) { return vtp::blend(mask, a, b); } template T reduce(const vtp &v, Op op) { T res=v[0]; for (size_t i=1; i vtp apply(vtp in, Func func) { vtp res; for (size_t i=0; i class pseudoscalar { private: T v; public: #if defined(_MSC_VER) pseudoscalar() {} pseudoscalar(const pseudoscalar &other) : v(other.v) {} pseudoscalar & operator=(const pseudoscalar &other) { v=other.v; return *this; } #else pseudoscalar() = default; pseudoscalar(const pseudoscalar &other) = default; pseudoscalar & operator=(const pseudoscalar &other) = default; #endif pseudoscalar(T v_):v(v_) {} pseudoscalar operator-() const { return pseudoscalar(-v); } pseudoscalar operator+(pseudoscalar other) const { return pseudoscalar(v+other.v); } pseudoscalar operator-(pseudoscalar other) const { return pseudoscalar(v-other.v); } pseudoscalar operator*(pseudoscalar other) const { return pseudoscalar(v*other.v); } pseudoscalar operator/(pseudoscalar other) const { return pseudoscalar(v/other.v); } pseudoscalar &operator+=(pseudoscalar other) { v+=other.v; return *this; } pseudoscalar &operator-=(pseudoscalar other) { v-=other.v; return *this; } pseudoscalar &operator*=(pseudoscalar other) { v*=other.v; return *this; } pseudoscalar &operator/=(pseudoscalar other) { v/=other.v; return *this; } pseudoscalar abs() const { return std::abs(v); } inline pseudoscalar sqrt() const { 
/* This physical line continues pseudoscalar (sqrt/max/min via std::sqrt,
 * std::max, std::min; comparisons returning bool; the tail of the class is
 * lost in this copy), then the helper_ backend for len = 1 (Tv is
 * pseudoscalar, Tm is bool, blend is `m ? v1 : v2`; its tail is also lost),
 * then a helper_ specialisation with T = double, len = 8, Tv = __m512d and
 * Tm = __mmask8 built on _mm512_* intrinsics. NOTE(review): the preprocessor
 * guard in front of this specialisation is not visible here. */
return std::sqrt(v); } pseudoscalar max(const pseudoscalar &other) const { return std::max(v, other.v); } pseudoscalar min(const pseudoscalar &other) const { return std::min(v, other.v); } bool operator>(const pseudoscalar &other) const { return v>other.v; } bool operator>=(const pseudoscalar &other) const { return v>=other.v; } bool operator<(const pseudoscalar &other) const { return v class helper_ { private: static constexpr size_t len = 1; public: using Tv = pseudoscalar; using Tm = bool; static Tv loadu(const T *ptr) { return *ptr; } static void storeu(T *ptr, Tv v) { *ptr = v[0]; } static Tv from_scalar(T v) { return v; } static Tv abs(Tv v) { return v.abs(); } static Tv max(Tv v1, Tv v2) { return v1.max(v2); } static Tv min(Tv v1, Tv v2) { return v1.min(v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return m ? v1 : v2; } static Tv sqrt(Tv v) { return v.sqrt(); } static Tm gt (Tv v1, Tv v2) { return v1>v2; } static Tm ge (Tv v1, Tv v2) { return v1>=v2; } static Tm lt (Tv v1, Tv v2) { return v1 constexpr inline bool simd_exists = true; template<> class helper_ { private: using T = double; static constexpr size_t len = 8; public: using Tv = __m512d; using Tm = __mmask8; static Tv loadu(const T *ptr) { return _mm512_loadu_pd(ptr); } static void storeu(T *ptr, Tv v) { _mm512_storeu_pd(ptr, v); } static Tv from_scalar(T v) { return _mm512_set1_pd(v); } static Tv abs(Tv v) { return __m512d(_mm512_andnot_epi64(__m512i(_mm512_set1_pd(-0.)),__m512i(v))); } static Tv max(Tv v1, Tv v2) { return _mm512_max_pd(v1, v2); } static Tv min(Tv v1, Tv v2) { return _mm512_min_pd(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_pd(m, v2, v1); } static Tv sqrt(Tv v) { return _mm512_sqrt_pd(v); } static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GT_OQ); } static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GE_OQ); } static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_LT_OQ); } static Tm le (Tv v1, Tv v2) { return 
/* This physical line finishes the __m512d helper (le/eq/ne, mask_and/mask_or
 * as plain & and | on the integer mask, mask_none/mask_any as comparisons
 * with 0; mask_all is truncated), then gives the matching helper_ for
 * T = float, len = 16, Tv = __m512, Tm = __mmask16, and begins the helper_
 * for T = double whose vector type is __m256d. */
_mm512_cmp_pd_mask(v1,v2,_CMP_LE_OQ); } static Tm eq (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_EQ_OQ); } static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_NEQ_OQ); } static Tm mask_and (Tm v1, Tm v2) { return v1&v2; } static Tm mask_or (Tm v1, Tm v2) { return v1|v2; } static bool mask_none(Tm v) { return v==0; } static bool mask_any(Tm v) { return v!=0; } static bool mask_all(Tm v) { static constexpr auto fullmask = Tm((size_t(1)< constexpr inline bool simd_exists = true; template<> class helper_ { private: using T = float; static constexpr size_t len = 16; public: using Tv = __m512; using Tm = __mmask16; static Tv loadu(const T *ptr) { return _mm512_loadu_ps(ptr); } static void storeu(T *ptr, Tv v) { _mm512_storeu_ps(ptr, v); } static Tv from_scalar(T v) { return _mm512_set1_ps(v); } static Tv abs(Tv v) { return __m512(_mm512_andnot_epi32(__m512i(_mm512_set1_ps(-0.)),__m512i(v))); } static Tv max(Tv v1, Tv v2) { return _mm512_max_ps(v1, v2); } static Tv min(Tv v1, Tv v2) { return _mm512_min_ps(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_ps(m, v2, v1); } static Tv sqrt(Tv v) { return _mm512_sqrt_ps(v); } static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GT_OQ); } static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GE_OQ); } static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_LT_OQ); } static Tm le (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_LE_OQ); } static Tm eq (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_EQ_OQ); } static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_NEQ_OQ); } static Tm mask_and (Tm v1, Tm v2) { return v1&v2; } static Tm mask_or (Tm v1, Tm v2) { return v1|v2; } static bool mask_none(Tm v) { return v==0; } static bool mask_any(Tm v) { return v!=0; } static bool mask_all(Tm v) { static constexpr auto fullmask = Tm((size_t(1)< constexpr inline bool simd_exists = true; template<> class helper_ { private: using T 
/* This physical line holds the helper_ for T = double, len = 4,
 * Tv = Tm = __m256d (masks are vectors here; maskbits() condenses one with
 * _mm256_movemask_pd and mask_none/mask_any test the result against 0), and
 * begins the helper_ for T = float, len = 8, Tv = Tm = __m256. */
= double; static constexpr size_t len = 4; public: using Tv = __m256d; using Tm = __m256d; static Tv loadu(const T *ptr) { return _mm256_loadu_pd(ptr); } static void storeu(T *ptr, Tv v) { _mm256_storeu_pd(ptr, v); } static Tv from_scalar(T v) { return _mm256_set1_pd(v); } static Tv abs(Tv v) { return _mm256_andnot_pd(_mm256_set1_pd(-0.),v); } static Tv max(Tv v1, Tv v2) { return _mm256_max_pd(v1, v2); } static Tv min(Tv v1, Tv v2) { return _mm256_min_pd(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_pd(v2, v1, m); } static Tv sqrt(Tv v) { return _mm256_sqrt_pd(v); } static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GT_OQ); } static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GE_OQ); } static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_LT_OQ); } static Tm le (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_LE_OQ); } static Tm eq (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_EQ_OQ); } static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_NEQ_OQ); } static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_pd(v1,v2); } static Tm mask_or (Tm v1, Tm v2) { return _mm256_or_pd(v1,v2); } static size_t maskbits(Tm v) { return size_t(_mm256_movemask_pd(v)); } static bool mask_none(Tm v) { return maskbits(v)==0; } static bool mask_any(Tm v) { return maskbits(v)!=0; } static bool mask_all(Tm v) { static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; template<> class helper_ { private: using T = float; static constexpr size_t len = 8; public: using Tv = __m256; using Tm = __m256; static Tv loadu(const T *ptr) { return _mm256_loadu_ps(ptr); } static void storeu(T *ptr, Tv v) { _mm256_storeu_ps(ptr, v); } static Tv from_scalar(T v) { return _mm256_set1_ps(v); } static Tv abs(Tv v) { return _mm256_andnot_ps(_mm256_set1_ps(-0.),v); } static Tv max(Tv v1, Tv v2) { return _mm256_max_ps(v1, v2); } static Tv min(Tv v1, Tv v2) { return _mm256_min_ps(v1, v2); } static Tv blend(Tm m, Tv v1, 
/* This physical line finishes the __m256 helper (maskbits() uses
 * _mm256_movemask_ps) and begins the helper_ for T = double, len = 2,
 * Tv = Tm = __m128d, whose blend uses _mm_blendv_pd when __SSE4_1__ is
 * defined and an and/andnot/or combination otherwise. */
Tv v2) { return _mm256_blendv_ps(v2, v1, m); } static Tv sqrt(Tv v) { return _mm256_sqrt_ps(v); } static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GT_OQ); } static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GE_OQ); } static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_LT_OQ); } static Tm le (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_LE_OQ); } static Tm eq (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_EQ_OQ); } static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_NEQ_OQ); } static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_ps(v1,v2); } static Tm mask_or (Tm v1, Tm v2) { return _mm256_or_ps(v1,v2); } static size_t maskbits(Tm v) { return size_t(_mm256_movemask_ps(v)); } static bool mask_none(Tm v) { return maskbits(v)==0; } static bool mask_any(Tm v) { return maskbits(v)!=0; } static bool mask_all(Tm v) { static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; template<> class helper_ { private: using T = double; static constexpr size_t len = 2; public: using Tv = __m128d; using Tm = __m128d; static Tv loadu(const T *ptr) { return _mm_loadu_pd(ptr); } static void storeu(T *ptr, Tv v) { _mm_storeu_pd(ptr, v); } static Tv from_scalar(T v) { return _mm_set1_pd(v); } static Tv abs(Tv v) { return _mm_andnot_pd(_mm_set1_pd(-0.),v); } static Tv max(Tv v1, Tv v2) { return _mm_max_pd(v1, v2); } static Tv min(Tv v1, Tv v2) { return _mm_min_pd(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { #if defined(__SSE4_1__) return _mm_blendv_pd(v2,v1,m); #else return _mm_or_pd(_mm_and_pd(m,v1),_mm_andnot_pd(m,v2)); #endif } static Tv sqrt(Tv v) { return _mm_sqrt_pd(v); } static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_pd(v1,v2); } static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_pd(v1,v2); } static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_pd(v1,v2); } static Tm le (Tv v1, Tv v2) { return _mm_cmple_pd(v1,v2); } static Tm eq (Tv v1, Tv v2) { return _mm_cmpeq_pd(v1,v2); } static Tm ne (Tv v1, Tv v2) { 
/* This physical line finishes the __m128d helper, gives the helper_ for
 * T = float, len = 4, Tv = Tm = __m128 (same blend fallback, with the _ps
 * intrinsics), and begins gnuvec_helper, whose Tv is declared with the
 * vector_size attribute (len*sizeof(T) bytes). */
return _mm_cmpneq_pd(v1,v2); } static Tm mask_and (Tm v1, Tm v2) { return _mm_and_pd(v1,v2); } static Tm mask_or (Tm v1, Tm v2) { return _mm_or_pd(v1,v2); } static size_t maskbits(Tm v) { return size_t(_mm_movemask_pd(v)); } static bool mask_none(Tm v) { return maskbits(v)==0; } static bool mask_any(Tm v) { return maskbits(v)!=0; } static bool mask_all(Tm v) { static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; template<> class helper_ { private: using T = float; static constexpr size_t len = 4; public: using Tv = __m128; using Tm = __m128; static Tv loadu(const T *ptr) { return _mm_loadu_ps(ptr); } static void storeu(T *ptr, Tv v) { _mm_storeu_ps(ptr, v); } static Tv from_scalar(T v) { return _mm_set1_ps(v); } static Tv abs(Tv v) { return _mm_andnot_ps(_mm_set1_ps(-0.),v); } static Tv max(Tv v1, Tv v2) { return _mm_max_ps(v1, v2); } static Tv min(Tv v1, Tv v2) { return _mm_min_ps(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { #if defined(__SSE4_1__) return _mm_blendv_ps(v2,v1,m); #else return _mm_or_ps(_mm_and_ps(m,v1),_mm_andnot_ps(m,v2)); #endif } static Tv sqrt(Tv v) { return _mm_sqrt_ps(v); } static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_ps(v1,v2); } static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_ps(v1,v2); } static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_ps(v1,v2); } static Tm le (Tv v1, Tv v2) { return _mm_cmple_ps(v1,v2); } static Tm eq (Tv v1, Tv v2) { return _mm_cmpeq_ps(v1,v2); } static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_ps(v1,v2); } static Tm mask_and (Tm v1, Tm v2) { return _mm_and_ps(v1,v2); } static Tm mask_or (Tm v1, Tm v2) { return _mm_or_ps(v1,v2); } static size_t maskbits(Tm v) { return size_t(_mm_movemask_ps(v)); } static bool mask_none(Tm v) { return maskbits(v)==0; } static bool mask_any(Tm v) { return maskbits(v)!=0; } static bool mask_all(Tm v) { static constexpr auto fullmask = (size_t(1)< class gnuvec_helper { public: using Tv __attribute__ ((vector_size (len*sizeof(T)))) = T; using Tm = 
/* This physical line continues gnuvec_helper (most of its body is lost in
 * this copy), derives two helper_ specialisations from it and ends that
 * section with #endif (NOTE(review): presumably the DUCC0_USE_SVE path -
 * the opening guard is not visible; confirm). Under DUCC0_USE_NEON it then
 * gives the helper_ for T = double, len = 2, Tv = float64x2_t,
 * Tm = uint64x2_t (ne inverts the vceqq_f64 result; maskbits() packs bit 63
 * of lane 0 and lane 1 into bits 0 and 1), and begins the helper_ for
 * T = float, len = 4, Tv = float32x4_t. */
decltype(Tv()v2; } static Tm ge (Tv v1, Tv v2) { return v1>=v2; } static Tm lt (Tv v1, Tv v2) { return v1 constexpr inline bool simd_exists = true; template<> class helper_: public gnuvec_helper {}; template<> constexpr inline bool simd_exists = true; template<> class helper_: public gnuvec_helper {}; #endif #if defined(DUCC0_USE_NEON) template<> constexpr inline bool simd_exists = true; template<> class helper_ { private: using T = double; static constexpr size_t len = 2; public: using Tv = float64x2_t; using Tm = uint64x2_t; static Tv loadu(const T *ptr) { return vld1q_f64(ptr); } static void storeu(T *ptr, Tv v) { vst1q_f64(ptr, v); } static Tv from_scalar(T v) { return vdupq_n_f64(v); } static Tv abs(Tv v) { return vabsq_f64(v); } static Tv max(Tv v1, Tv v2) { return vmaxq_f64(v1, v2); } static Tv min(Tv v1, Tv v2) { return vminq_f64(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return vbslq_f64(m, v1, v2); } static Tv sqrt(Tv v) { return vsqrtq_f64(v); } static Tm gt (Tv v1, Tv v2) { return vcgtq_f64(v1,v2); } static Tm ge (Tv v1, Tv v2) { return vcgeq_f64(v1,v2); } static Tm lt (Tv v1, Tv v2) { return vcltq_f64(v1,v2); } static Tm le (Tv v1, Tv v2) { return vcleq_f64(v1,v2); } static Tm eq (Tv v1, Tv v2) { return vceqq_f64(v1,v2); } static Tm ne (Tv v1, Tv v2) { return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(v1,v2)))); } static Tm mask_and (Tm v1, Tm v2) { return vandq_u64(v1,v2); } static Tm mask_or (Tm v1, Tm v2) { return vorrq_u64(v1,v2); } static size_t maskbits(Tm v) { auto high_bits = vshrq_n_u64(v, 63); return vgetq_lane_u64(high_bits, 0) | ((vgetq_lane_u64(high_bits, 1)<<1)); } static bool mask_none(Tm v) { return maskbits(v)==0; } static bool mask_any(Tm v) { return maskbits(v)!=0; } static bool mask_all(Tm v) { static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; template<> class helper_ { private: using T = float; static constexpr size_t len = 4; public: using Tv = float32x4_t; using Tm = 
/* This physical line finishes the NEON float helper (Tm = uint32x4_t; ne
 * inverts the vceqq_f32 result; maskbits() shifts bit 31 of lane k to bit k
 * and sums the lanes with vaddvq_u32), then selects native_simd: one alias
 * per branch for a first condition that is lost here, __AVX__, __SSE2__,
 * DUCC0_USE_SVE, DUCC0_USE_NEON, and a final #else; when DUCC0_NO_SIMD is
 * defined a single fallback alias is used instead. */
uint32x4_t; static Tv loadu(const T *ptr) { return vld1q_f32(ptr); } static void storeu(T *ptr, Tv v) { vst1q_f32(ptr, v); } static Tv from_scalar(T v) { return vdupq_n_f32(v); } static Tv abs(Tv v) { return vabsq_f32(v); } static Tv max(Tv v1, Tv v2) { return vmaxq_f32(v1, v2); } static Tv min(Tv v1, Tv v2) { return vminq_f32(v1, v2); } static Tv blend(Tm m, Tv v1, Tv v2) { return vbslq_f32(m, v1, v2); } static Tv sqrt(Tv v) { return vsqrtq_f32(v); } static Tm gt (Tv v1, Tv v2) { return vcgtq_f32(v1,v2); } static Tm ge (Tv v1, Tv v2) { return vcgeq_f32(v1,v2); } static Tm lt (Tv v1, Tv v2) { return vcltq_f32(v1,v2); } static Tm le (Tv v1, Tv v2) { return vcleq_f32(v1,v2); } static Tm eq (Tv v1, Tv v2) { return vceqq_f32(v1,v2); } static Tm ne (Tv v1, Tv v2) { return vmvnq_u32(vceqq_f32(v1,v2)); } static Tm mask_and (Tm v1, Tm v2) { return vandq_u32(v1,v2); } static Tm mask_or (Tm v1, Tm v2) { return vorrq_u32(v1,v2); } static size_t maskbits(Tm v) { static constexpr int32x4_t shift = {0, 1, 2, 3}; auto tmp = vshrq_n_u32(v, 31); return vaddvq_u32(vshlq_u32(tmp, shift)); } static bool mask_none(Tm v) { return maskbits(v)==0; } static bool mask_any(Tm v) { return maskbits(v)!=0; } static bool mask_all(Tm v) { static constexpr auto fullmask = (size_t(1)< using native_simd = vtp>; #elif defined(__AVX__) template using native_simd = vtp>; #elif defined(__SSE2__) template using native_simd = vtp>; #elif defined(DUCC0_USE_SVE) template using native_simd = vtp>; #elif defined(DUCC0_USE_NEON) template using native_simd = vtp>; #else template using native_simd = vtp; #endif #else // DUCC0_NO_SIMD is defined /// The SIMD type for \a T with the largest vector length on this platform. template using native_simd = vtp; #endif /// Provides a SIMD type for \a T with vector length \a len, if it exists. 
/* This physical line closes the header:
 *  - simd_select exposes a vtp instantiation as its member alias `type`;
 *  - sin() and cos() call apply() with a lambda that evaluates std::sin /
 *    std::cos on a scalar T;
 *  - namespace detail_simd is closed and element_aligned_tag, native_simd,
 *    simd_select, simd_exists and vectorizable are re-exported into ducc0
 *    through using-declarations;
 *  - the outer namespace and the two remaining preprocessor conditionals end.
 * NOTE(review): the template parameter and argument lists are missing in this
 * copy (e.g. `template struct simd_select`, bare `vtp`); restore them from an
 * intact copy of the file rather than by hand. */
template struct simd_select { using type = vtp; }; template inline vtp sin(vtp in) { return apply(in,[](T v){return std::sin(v);}); } template inline vtp cos(vtp in) { return apply(in,[](T v){return std::cos(v);}); } } using detail_simd::element_aligned_tag; using detail_simd::native_simd; using detail_simd::simd_select; using detail_simd::simd_exists; using detail_simd::vectorizable; } #endif #endif