8989#define _sse2neon_likely (x ) __builtin_expect(!!(x), 1)
9090#define _sse2neon_unlikely (x ) __builtin_expect(!!(x), 0)
9191#elif defined(_MSC_VER )
92- #if _MSVC_TRADITIONAL
93- #error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
94- #endif
9592#ifndef FORCE_INLINE
9693#define FORCE_INLINE static inline
9794#endif
184181 } while (0)
185182#endif
186183
184+ #ifdef _M_ARM
185+ #define vst1q_lane_s64 (a , b , c )
186+ #endif
187+
187188/* Memory barriers
188189 * __atomic_thread_fence does not include a compiler barrier; instead,
189190 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
@@ -202,8 +203,12 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
202203#elif defined(__GNUC__ ) || defined(__clang__ )
203204 __atomic_thread_fence (__ATOMIC_SEQ_CST );
204205#else /* MSVC */
206+ #ifdef _M_ARM
207+ __dmb (_ARM_BARRIER_ISH );
208+ #else
205209 __dmb (_ARM64_BARRIER_ISH );
206210#endif
211+ #endif
207212}
208213
209214/* Architecture-specific build options */
@@ -268,7 +273,7 @@ FORCE_INLINE void _sse2neon_smp_mb(void)
268273 * we have to perform syscall instead.
269274 */
270275#if (!defined(__aarch64__ ) && !defined(_M_ARM64 ))
271- #include <sys/ time.h>
276+ #include <time.h>
272277#endif
273278
274279/* "__has_builtin" can be used to query support for built-in functions
@@ -574,10 +579,10 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);
574579/* Backwards compatibility for compilers with lack of specific type support */
575580
576581// Older gcc does not define vld1q_u8_x4 type
577- #if defined(__GNUC__ ) && !defined(__clang__ ) && \
582+ #if defined(_M_ARM ) || (defined( __GNUC__ ) && !defined(__clang__ ) && \
578583 ((__GNUC__ <= 12 && defined(__arm__ )) || \
579584 (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__ )) || \
580- (__GNUC__ <= 9 && defined(__aarch64__ )))
585+ (__GNUC__ <= 9 && defined(__aarch64__ ))))
581586FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4 (const uint8_t * p )
582587{
583588 uint8x16x4_t ret ;
@@ -610,6 +615,9 @@ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
610615}
611616#endif
612617
618+ #if defined(_M_ARM )
619+ #pragma message("TODO: Windows ARM32: Port many SSE2NEON functions")
620+ #else
613621#if !defined(__aarch64__ ) && !defined(_M_ARM64 )
614622/* emulate vaddvq u8 variant */
615623FORCE_INLINE uint8_t _sse2neon_vaddvq_u8 (uint8x16_t a )
@@ -645,6 +653,7 @@ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
645653 return vaddvq_u16 (a );
646654}
647655#endif
656+ #endif
648657
649658/* Function Naming Conventions
650659 * The naming convention of SSE intrinsics is straightforward. A generic SSE
@@ -1765,6 +1774,7 @@ FORCE_INLINE void _mm_free(void *addr)
17651774}
17661775#endif
17671776
1777+ #ifndef _M_ARM
17681778FORCE_INLINE uint64_t _sse2neon_get_fpcr ()
17691779{
17701780 uint64_t value ;
@@ -1808,6 +1818,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode()
18081818
18091819 return r .field .bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF ;
18101820}
1821+ #endif
18111822
18121823// Macro: Get the rounding mode bits from the MXCSR control and status register.
18131824// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
@@ -1826,6 +1837,8 @@ FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE()
18261837
18271838#if defined(__aarch64__ ) || defined(_M_ARM64 )
18281839 r .value = _sse2neon_get_fpcr ();
1840+ #elif defined(_M_ARM )
1841+ r .value = _MoveFromCoprocessor (10 ,7 , 1 ,0 ,0 );
18291842#else
18301843 __asm__ __volatile__("vmrs %0, FPSCR" : "=r" (r .value )); /* read */
18311844#endif
@@ -2247,7 +2260,7 @@ FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
22472260FORCE_INLINE void _mm_prefetch (char const * p , int i )
22482261{
22492262 (void ) i ;
2250- #if defined( _MSC_VER )
2263+ #ifdef _M_ARM64
22512264 switch (i ) {
22522265 case _MM_HINT_NTA :
22532266 __prefetch2 (p , 1 );
@@ -2262,6 +2275,8 @@ FORCE_INLINE void _mm_prefetch(char const *p, int i)
22622275 __prefetch2 (p , 4 );
22632276 break ;
22642277 }
2278+ #elif defined(_M_ARM )
2279+ // TODO
22652280#else
22662281 switch (i ) {
22672282 case _MM_HINT_NTA :
@@ -2348,6 +2363,7 @@ FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
23482363 vset_lane_u16 ((int ) vget_lane_u64 (t , 0 ), vdup_n_u16 (0 ), 0 ));
23492364}
23502365
2366+ #ifndef _M_ARM
23512367// Macro: Set the flush zero bits of the MXCSR control and status register to
23522368// the value in unsigned 32-bit integer a. The flush zero may contain any of the
23532369// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
@@ -2379,6 +2395,7 @@ FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
23792395 __asm__ __volatile__("vmsr FPSCR, %0" ::"r" (r )); /* write */
23802396#endif
23812397}
2398+ #endif
23822399
23832400// Set packed single-precision (32-bit) floating-point elements in dst with the
23842401// supplied values.
@@ -2404,6 +2421,7 @@ FORCE_INLINE __m128 _mm_set_ps1(float _w)
24042421// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
24052422FORCE_INLINE void _MM_SET_ROUNDING_MODE (int rounding )
24062423{
2424+ #ifndef _M_ARM
24072425 union {
24082426 fpcr_bitfield field ;
24092427#if defined(__aarch64__ ) || defined(_M_ARM64 )
@@ -2442,6 +2460,7 @@ FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
24422460#else
24432461 __asm__ __volatile__("vmsr FPSCR, %0" ::"r" (r )); /* write */
24442462#endif
2463+ #endif
24452464}
24462465
24472466// Copy single-precision (32-bit) floating-point element a to the lower element
@@ -3206,6 +3225,7 @@ FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
32063225 return _mm_move_sd (a , _mm_cmpeq_pd (a , b ));
32073226}
32083227
3228+ #ifndef _M_ARM
32093229// Compare packed double-precision (64-bit) floating-point elements in a and b
32103230// for greater-than-or-equal, and store the results in dst.
32113231// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
@@ -3247,6 +3267,7 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
32473267 return vreinterpretq_m128d_u64 (vld1q_u64 (d ));
32483268#endif
32493269}
3270+ #endif
32503271
32513272// Compare packed signed 16-bit integers in a and b for greater-than, and store
32523273// the results in dst.
@@ -3275,6 +3296,7 @@ FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
32753296 vcgtq_s8 (vreinterpretq_s8_m128i (a ), vreinterpretq_s8_m128i (b )));
32763297}
32773298
3299+ #ifndef _M_ARM
32783300// Compare packed double-precision (64-bit) floating-point elements in a and b
32793301// for greater-than, and store the results in dst.
32803302// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
@@ -3358,6 +3380,7 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
33583380 return vreinterpretq_m128d_u64 (vld1q_u64 (d ));
33593381#endif
33603382}
3383+ #endif
33613384
33623385// Compare packed signed 16-bit integers in a and b for less-than, and store the
33633386// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
@@ -3389,6 +3412,7 @@ FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
33893412 vcltq_s8 (vreinterpretq_s8_m128i (a ), vreinterpretq_s8_m128i (b )));
33903413}
33913414
3415+ #ifndef _M_ARM
33923416// Compare packed double-precision (64-bit) floating-point elements in a and b
33933417// for less-than, and store the results in dst.
33943418// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
@@ -3429,6 +3453,7 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
34293453 return vreinterpretq_m128d_u64 (vld1q_u64 (d ));
34303454#endif
34313455}
3456+ #endif
34323457
34333458// Compare packed double-precision (64-bit) floating-point elements in a and b
34343459// for not-equal, and store the results in dst.
@@ -3456,6 +3481,7 @@ FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
34563481 return _mm_move_sd (a , _mm_cmpneq_pd (a , b ));
34573482}
34583483
3484+ #ifndef _M_ARM
34593485// Compare packed double-precision (64-bit) floating-point elements in a and b
34603486// for not-greater-than-or-equal, and store the results in dst.
34613487// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
@@ -3756,6 +3782,7 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
37563782 return (* (double * ) & a0 < * (double * ) & b0 );
37573783#endif
37583784}
3785+ #endif
37593786
37603787// Compare the lower double-precision (64-bit) floating-point element in a and b
37613788// for equality, and return the boolean result (0 or 1).
@@ -4401,6 +4428,7 @@ FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
44014428 vmaxq_u8 (vreinterpretq_u8_m128i (a ), vreinterpretq_u8_m128i (b )));
44024429}
44034430
4431+ #ifndef _M_ARM
44044432// Compare packed double-precision (64-bit) floating-point elements in a and b,
44054433// and store packed maximum values in dst.
44064434// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
@@ -4487,6 +4515,7 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
44874515 return vreinterpretq_m128d_u64 (vld1q_u64 (d ));
44884516#endif
44894517}
4518+ #endif
44904519
44914520// Compare the lower double-precision (64-bit) floating-point elements in a and
44924521// b, store the minimum value in the lower element of dst, and copy the upper
@@ -4793,7 +4822,11 @@ FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
47934822FORCE_INLINE void _mm_pause ()
47944823{
47954824#if defined(_MSC_VER )
4825+ #ifdef _M_ARM
4826+ __isb (_ARM_BARRIER_SY );
4827+ #else
47964828 __isb (_ARM64_BARRIER_SY );
4829+ #endif
47974830#else
47984831 __asm__ __volatile__("isb\n" );
47994832#endif
@@ -7622,6 +7655,7 @@ FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
76227655}
76237656
76247657/* SSE4.2 */
7658+ #ifndef _M_ARM
76257659
76267660const static uint16_t ALIGN_STRUCT (16 ) _sse2neon_cmpestr_mask16b [8 ] = {
76277661 0x01 , 0x02 , 0x04 , 0x08 , 0x10 , 0x20 , 0x40 , 0x80 ,
@@ -8463,9 +8497,11 @@ FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
84638497 return crc ;
84648498}
84658499
8500+ #endif
8501+
84668502/* AES */
84678503
8468- #if !defined(__ARM_FEATURE_CRYPTO ) && !defined(_M_ARM64 )
8504+ #if !defined(__ARM_FEATURE_CRYPTO ) && !defined(_M_ARM64 ) && !defined( _M_ARM )
84698505/* clang-format off */
84708506#define SSE2NEON_AES_SBOX (w ) \
84718507 { \
@@ -8913,6 +8949,7 @@ FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
89138949#undef SSE2NEON_MULTIPLY
89148950#endif
89158951
8952+ #elif defined(_M_ARM )
89168953#else /* __ARM_FEATURE_CRYPTO */
89178954// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
89188955// AESMC and then manually applying the real key as an xor operation. This
@@ -9034,6 +9071,7 @@ FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
90349071 }
90359072}
90369073
9074+ #ifndef _M_ARM
90379075FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode ()
90389076{
90399077 union {
@@ -9053,6 +9091,7 @@ FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode()
90539091
90549092 return r .field .bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF ;
90559093}
9094+ #endif
90569095
90579096// Count the number of bits set to 1 in unsigned 32-bit integer a, and
90589097// return that count in dst.
@@ -9113,6 +9152,7 @@ FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
91139152#endif
91149153}
91159154
9155+ #ifndef _M_ARM
91169156FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode (unsigned int flag )
91179157{
91189158 // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
@@ -9140,6 +9180,7 @@ FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
91409180 __asm__ __volatile__("vmsr FPSCR, %0" ::"r" (r )); /* write */
91419181#endif
91429182}
9183+ #endif
91439184
91449185// Return the current 64-bit value of the processor's time-stamp counter.
91459186// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
@@ -9161,6 +9202,9 @@ FORCE_INLINE uint64_t _rdtsc(void)
91619202#endif
91629203
91639204 return val ;
9205+ #elif defined(_M_ARM )
9206+ uint32_t val = _MoveFromCoprocessor (15 ,0 , 9 ,13 ,0 );
9207+ return ((uint64_t )val ) << 6 ;
91649208#else
91659209 uint32_t pmccntr , pmuseren , pmcntenset ;
91669210 // Read the user mode Performance Monitoring Unit (PMU)
0 commit comments