diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h --- a/clang/lib/Headers/avx512fp16intrin.h +++ b/clang/lib/Headers/avx512fp16intrin.h @@ -2934,8 +2934,8 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__C, (__v4sf)__A, - (__v4sf)__B, (__mmask8)-1, + return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } @@ -2943,15 +2943,15 @@ _mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { return (__m128h)__builtin_ia32_selectps_128( __U, - __builtin_ia32_vfcmaddcsh_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B, + __builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION), (__v4sf)__A); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__C, (__v4sf)__A, - (__v4sf)__B, (__mmask8)__U, + return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } @@ -2959,38 +2959,38 @@ _mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { return (__m128h)_mm_move_ss((__m128)__C, (__m128)__builtin_ia32_vfcmaddcsh_mask( - (__v4sf)__C, (__v4sf)__A, (__v4sf)__B, __U, + (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION)); } #define _mm_fcmadd_round_sch(A, B, C, R) \ ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \ - (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ (__mmask8)-1, (int)(R))) #define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \ ((__m128h)__builtin_ia32_selectps_128( \ (__mmask8)(U & 1), \ __builtin_ia32_vfcmaddcsh_mask( \ - (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ (__mmask8)(U), (int)(R)), \ (__v4sf)(__m128h)(A))) #define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \ ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \ - (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ (__mmask8)(U), (int)(R))) #define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \ ((__m128h)_mm_move_ss((__m128)(C), \ (__m128)__builtin_ia32_vfcmaddcsh_mask( \ - (__v4sf)(C), (__v4sf)(A), (__v4sf)(B), (U), (R)))) + (__v4sf)(A), (__v4sf)(B), (__v4sf)(C), (U), (R)))) static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__C, (__v4sf)__A, - (__v4sf)__B, (__mmask8)-1, + return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); } @@ -2998,34 +2998,34 @@ _mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { return (__m128h)__builtin_ia32_selectps_128( __U, - __builtin_ia32_vfmaddcsh_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B, + __builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION), (__v4sf)__A); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__C, (__v4sf)__A, - (__v4sf)__B, 
(__mmask8)__U, + return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); } #define _mm_fmadd_round_sch(A, B, C, R) \ ((__m128h)__builtin_ia32_vfmaddcsh_mask( \ - (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ (__mmask8)-1, (int)(R))) #define _mm_mask_fmadd_round_sch(A, U, B, C, R) \ ((__m128h)__builtin_ia32_selectps_128( \ (__mmask8)(U & 1), \ __builtin_ia32_vfmaddcsh_mask( \ - (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ (__mmask8)(U), (int)(R)), \ (__v4sf)(__m128h)(A))) #define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \ ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \ - (__v4sf)(__m128h)(C), (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ (__mmask8)(U), (int)(R))) static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A, @@ -3177,8 +3177,8 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfcmaddcph512_mask((__v16sf)__C, (__v16sf)__A, - (__v16sf)__B, (__mmask16)-1, + return (__m512h)__builtin_ia32_vfcmaddcph512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); } @@ -3186,8 +3186,8 @@ _mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { return (__m512h)__builtin_ia32_selectps_512( __U, - __builtin_ia32_vfcmaddcph512_mask((__v16sf)__C, (__v16sf)__A, - (__v16sf)__B, (__mmask16)__U, + __builtin_ia32_vfcmaddcph512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION), (__v16sf)__A); } @@ -3195,45 +3195,45 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { return (__m512h)__builtin_ia32_vfcmaddcph512_mask( - (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U, + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { return (__m512h)__builtin_ia32_vfcmaddcph512_maskz( - (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U, + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } #define _mm512_fcmadd_round_pch(A, B, C, R) \ ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \ - (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ (__mmask16)-1, (int)(R))) #define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \ ((__m512h)__builtin_ia32_selectps_512( \ (__mmask16)(U), \ __builtin_ia32_vfcmaddcph512_mask( \ - (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ (__mmask16)(U), (int)(R)), \ (__v16sf)(__m512h)(A))) #define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \ ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \ - (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ (__mmask16)(U), (int)(R))) #define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \ ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \ - (__v16sf)(__m512h)(C), 
(__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ (__mmask16)(U), (int)(R))) static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A, - (__v16sf)__B, (__mmask16)-1, + return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)-1, _MM_FROUND_CUR_DIRECTION); } @@ -3241,7 +3241,7 @@ _mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { return (__m512h)__builtin_ia32_selectps_512( __U, - __builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A, (__v16sf)__B, + __builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION), (__v16sf)__A); @@ -3249,39 +3249,39 @@ static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { - return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__C, (__v16sf)__A, - (__v16sf)__B, (__mmask16)__U, + return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { return (__m512h)__builtin_ia32_vfmaddcph512_maskz( - (__v16sf)__C, (__v16sf)__A, (__v16sf)__B, (__mmask16)__U, + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); } #define _mm512_fmadd_round_pch(A, B, C, R) \ ((__m512h)__builtin_ia32_vfmaddcph512_mask( \ - (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ (__mmask16)-1, (int)(R))) #define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \ ((__m512h)__builtin_ia32_selectps_512( \ (__mmask16)(U), \ __builtin_ia32_vfmaddcph512_mask( \ - (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ (__mmask16)(U), (int)(R)), \ (__v16sf)(__m512h)(A))) #define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \ ((__m512h)__builtin_ia32_vfmaddcph512_mask( \ - (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ (__mmask16)(U), (int)(R))) #define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \ ((__m512h)__builtin_ia32_vfmaddcph512_maskz( \ - (__v16sf)(__m512h)(C), (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ (__mmask16)(U), (int)(R))) static __inline__ _Float16 __DEFAULT_FN_ATTRS512 diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h --- a/clang/lib/Headers/avx512vlfp16intrin.h +++ b/clang/lib/Headers/avx512vlfp16intrin.h @@ -1825,57 +1825,57 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)__A, - (__v4sf)__B, (__mmask8)-1); + return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)-1); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { return (__m128h)__builtin_ia32_selectps_128( __U, - __builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, 
(__v4sf)(__m128h)__A, - (__v4sf)__B, (__mmask8)__U), + __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B, + (__v4sf)__C, (__mmask8)__U), (__v4sf)__A); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__C, (__v4sf)__A, - (__v4sf)__B, (__mmask8)__U); + return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)__U); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { return (__m128h)__builtin_ia32_vfcmaddcph128_maskz( - (__v4sf)__C, (__v4sf)__A, (__v4sf)__B, (__mmask8)__U); + (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A, - (__v8sf)__B, (__mmask8)-1); + return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C, (__mmask8)-1); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_selectps_256( __U, - __builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A, (__v8sf)__B, + __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U), (__v8sf)__A); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { - return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__C, (__v8sf)__A, - (__v8sf)__B, (__mmask8)__U); + return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C, (__mmask8)__U); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_vfcmaddcph256_maskz( - (__v8sf)__C, (__v8sf)__A, (__v8sf)__B, (__mmask8)__U); + (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A, @@ -1919,57 +1919,57 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A, - (__v4sf)__B, (__mmask8)-1); + return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)-1); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { return (__m128h)__builtin_ia32_selectps_128( __U, - __builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A, (__v4sf)__B, + __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U), (__v4sf)__A); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__C, (__v4sf)__A, - (__v4sf)__B, (__mmask8)__U); + return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)__U); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__C, (__v4sf)__A, - (__v4sf)__B, (__mmask8)__U); + return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B, + 
(__v4sf)__C, (__mmask8)__U); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A, - (__v8sf)__B, (__mmask8)-1); + return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C, (__mmask8)-1); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { return (__m256h)__builtin_ia32_selectps_256( __U, - __builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A, (__v8sf)__B, + __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U), (__v8sf)__A); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { - return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__C, (__v8sf)__A, - (__v8sf)__B, (__mmask8)__U); + return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C, (__mmask8)__U); } static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__C, (__v8sf)__A, - (__v8sf)__B, (__mmask8)__U); + return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C, (__mmask8)__U); } static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -26067,16 +26067,16 @@ // Swap Src1 and Src2 in the node creation return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1); } - case FMA_OP_MASKZ: - case FMA_OP_MASK: { + case CFMA_OP_MASKZ: + case CFMA_OP_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue Mask = Op.getOperand(4); MVT VT = Op.getSimpleValueType(); - SDValue PassThru = Src1; - if (IntrData->Type == FMA_OP_MASKZ) + SDValue PassThru = Src3; + if (IntrData->Type == CFMA_OP_MASKZ) PassThru = getZeroVector(VT, Subtarget, DAG, dl); // We add rounding mode to the Node when diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td --- a/llvm/lib/Target/X86/X86InstrAVX512.td +++ b/llvm/lib/Target/X86/X86InstrAVX512.td @@ -13501,130 +13501,130 @@ } let Constraints = "@earlyclobber $dst, $src1 = $dst" in { - multiclass avx512_cfmop_rm opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + multiclass avx512_cfmaop_rm opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, bit IsCommutable> { defm r: AVX512_maskable_3src, EVEX_4V; + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), IsCommutable>, EVEX_4V; defm m: AVX512_maskable_3src, EVEX_4V; + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1))>, EVEX_4V; defm mb: AVX512_maskable_3src, EVEX_B, EVEX_4V; + (_.VT (OpNode _.RC:$src2, (_.VT (_.BroadcastLdFrag addr:$src3)), _.RC:$src1))>, EVEX_B, EVEX_4V; } } // Constraints = "@earlyclobber $dst, $src1 = $dst" -multiclass avx512_cfmop_round opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_cfmaop_round opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { let Constraints = "@earlyclobber $dst, $src1 = $dst" in defm rb: AVX512_maskable_3src, + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc)))>, EVEX_4V, EVEX_B, EVEX_RC; } -multiclass avx512_cfmaop_common opc, string 
OpcodeStr, SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cfmaop_common opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, bit IsCommutable> { let Predicates = [HasFP16] in { - defm Z : avx512_cfmop_rm, - avx512_cfmop_round, + defm Z : avx512_cfmaop_rm, + avx512_cfmaop_round, EVEX_V512, Sched<[WriteFMAZ]>; } let Predicates = [HasVLX, HasFP16] in { - defm Z256 : avx512_cfmop_rm, EVEX_V256, Sched<[WriteFMAY]>; - defm Z128 : avx512_cfmop_rm, EVEX_V128, Sched<[WriteFMAX]>; + defm Z256 : avx512_cfmaop_rm, EVEX_V256, Sched<[WriteFMAY]>; + defm Z128 : avx512_cfmaop_rm, EVEX_V128, Sched<[WriteFMAX]>; } } multiclass avx512_cfmulop_common opc, string OpcodeStr, SDNode OpNode, - SDNode MaskOpNode, SDNode OpNodeRnd, - X86SchedWriteWidths sched = SchedWriteFMA> { + SDNode MaskOpNode, SDNode OpNodeRnd, bit IsCommutable> { let Predicates = [HasFP16] in { defm Z : avx512_fp_packed, - avx512_fp_round_packed, + avx512_fp_round_packed, EVEX_V512; } let Predicates = [HasVLX, HasFP16] in { defm Z256 : avx512_fp_packed, EVEX_V256; + WriteFMAY, IsCommutable, IsCommutable, "", "@earlyclobber $dst", 0>, EVEX_V256; defm Z128 : avx512_fp_packed, EVEX_V128; + WriteFMAX, IsCommutable, IsCommutable, "", "@earlyclobber $dst", 0>, EVEX_V128; } } let Uses = [MXCSR] in { - defm VFMADDCPH : avx512_cfmaop_common<0x56, "vfmaddcph", x86vfmaddc, x86vfmaddcRnd>, + defm VFMADDCPH : avx512_cfmaop_common<0x56, "vfmaddcph", x86vfmaddc, x86vfmaddcRnd, 1>, T_MAP6XS, EVEX_CD8<32, CD8VF>; - defm VFCMADDCPH : avx512_cfmaop_common<0x56, "vfcmaddcph", x86vfcmaddc, x86vfcmaddcRnd>, + defm VFCMADDCPH : avx512_cfmaop_common<0x56, "vfcmaddcph", x86vfcmaddc, x86vfcmaddcRnd, 0>, T_MAP6XD, EVEX_CD8<32, CD8VF>; defm VFMULCPH : avx512_cfmulop_common<0xD6, "vfmulcph", x86vfmulc, x86vfmulc, - x86vfmulcRnd>, T_MAP6XS, EVEX_CD8<32, CD8VF>; + x86vfmulcRnd, 1>, T_MAP6XS, EVEX_CD8<32, CD8VF>; defm VFCMULCPH : avx512_cfmulop_common<0xD6, "vfcmulcph", x86vfcmulc, - x86vfcmulc, x86vfcmulcRnd>, T_MAP6XD, EVEX_CD8<32, CD8VF>; + x86vfcmulc, x86vfcmulcRnd, 0>, T_MAP6XD, EVEX_CD8<32, CD8VF>; } -multiclass avx512_cfmop_sh_common opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, - X86SchedWriteWidths sched = SchedWriteFMA> { +multiclass avx512_cfmaop_sh_common opc, string OpcodeStr, SDNode OpNode, SDNode OpNodeRnd, + bit IsCommutable> { let Predicates = [HasFP16], Constraints = "@earlyclobber $dst, $src1 = $dst" in { defm r : AVX512_maskable_3src, - Sched<[sched.XMM]>; + (v4f32 (OpNode VR128X:$src2, VR128X:$src3, VR128X:$src1)), IsCommutable>, + Sched<[WriteFMAX]>; defm m : AVX512_maskable_3src, - Sched<[sched.XMM.Folded, sched.XMM.ReadAfterFold]>; + (v4f32 (OpNode VR128X:$src2, (sse_load_f32 addr:$src3), VR128X:$src1))>, + Sched<[WriteFMAX.Folded, WriteFMAX.ReadAfterFold]>; defm rb : AVX512_maskable_3src, - EVEX_B, EVEX_RC, Sched<[sched.XMM]>; + (v4f32 (OpNodeRnd VR128X:$src2, VR128X:$src3, VR128X:$src1, (i32 timm:$rc)))>, + EVEX_B, EVEX_RC, Sched<[WriteFMAX]>; } } multiclass avx512_cfmbinop_sh_common opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd, X86SchedWriteWidths sched = SchedWriteFMA> { + SDNode OpNodeRnd, bit IsCommutable> { let Predicates = [HasFP16] in { defm rr : AVX512_maskable, Sched<[sched.XMM]>; + IsCommutable, IsCommutable, IsCommutable, + X86selects, "@earlyclobber $dst">, Sched<[WriteFMAX]>; defm rm : AVX512_maskable, - Sched<[sched.XMM.Folded, sched.XMM.ReadAfterFold]>; + Sched<[WriteFMAX.Folded, WriteFMAX.ReadAfterFold]>; defm rrb : AVX512_maskable, - EVEX_B, EVEX_RC, Sched<[sched.XMM]>; + EVEX_B, EVEX_RC, 
Sched<[WriteFMAX]>; } } let Uses = [MXCSR] in { - defm VFMADDCSHZ : avx512_cfmop_sh_common<0x57, "vfmaddcsh", x86vfmaddcSh, x86vfmaddcShRnd>, + defm VFMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfmaddcsh", x86vfmaddcSh, x86vfmaddcShRnd, 1>, T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V; - defm VFCMADDCSHZ : avx512_cfmop_sh_common<0x57, "vfcmaddcsh", x86vfcmaddcSh, x86vfcmaddcShRnd>, + defm VFCMADDCSHZ : avx512_cfmaop_sh_common<0x57, "vfcmaddcsh", x86vfcmaddcSh, x86vfcmaddcShRnd, 0>, T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, EVEX_4V; - defm VFMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfmulcsh", x86vfmulcSh, x86vfmulcShRnd>, + defm VFMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfmulcsh", x86vfmulcSh, x86vfmulcShRnd, 1>, T_MAP6XS, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V; - defm VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd>, + defm VFCMULCSHZ : avx512_cfmbinop_sh_common<0xD7, "vfcmulcsh", x86vfcmulcSh, x86vfcmulcShRnd, 0>, T_MAP6XD, EVEX_CD8<32, CD8VT1>, EVEX_V128, VEX_LIG, EVEX_4V; } diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td --- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -572,22 +572,22 @@ def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>; def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>; -def x86vfmaddc : SDNode<"X86ISD::VFMADDC", SDTFPTernaryOp>; -def x86vfmaddcRnd : SDNode<"X86ISD::VFMADDC_RND", SDTFmaRound>; +def x86vfmaddc : SDNode<"X86ISD::VFMADDC", SDTFPTernaryOp, [SDNPCommutative]>; +def x86vfmaddcRnd : SDNode<"X86ISD::VFMADDC_RND", SDTFmaRound, [SDNPCommutative]>; def x86vfcmaddc : SDNode<"X86ISD::VFCMADDC", SDTFPTernaryOp>; def x86vfcmaddcRnd : SDNode<"X86ISD::VFCMADDC_RND", SDTFmaRound>; -def x86vfmulc : SDNode<"X86ISD::VFMULC", SDTFPBinOp>; -def x86vfmulcRnd : SDNode<"X86ISD::VFMULC_RND", SDTFPBinOpRound>; +def x86vfmulc : SDNode<"X86ISD::VFMULC", SDTFPBinOp, [SDNPCommutative]>; +def x86vfmulcRnd : SDNode<"X86ISD::VFMULC_RND", SDTFPBinOpRound, [SDNPCommutative]>; def x86vfcmulc : SDNode<"X86ISD::VFCMULC", SDTFPBinOp>; def x86vfcmulcRnd : SDNode<"X86ISD::VFCMULC_RND", SDTFPBinOpRound>; -def x86vfmaddcSh : SDNode<"X86ISD::VFMADDCSH", SDTFPTernaryOp>; +def x86vfmaddcSh : SDNode<"X86ISD::VFMADDCSH", SDTFPTernaryOp, [SDNPCommutative]>; def x86vfcmaddcSh : SDNode<"X86ISD::VFCMADDCSH", SDTFPTernaryOp>; -def x86vfmulcSh : SDNode<"X86ISD::VFMULCSH", SDTFPBinOp>; +def x86vfmulcSh : SDNode<"X86ISD::VFMULCSH", SDTFPBinOp, [SDNPCommutative]>; def x86vfcmulcSh : SDNode<"X86ISD::VFCMULCSH", SDTFPBinOp>; -def x86vfmaddcShRnd : SDNode<"X86ISD::VFMADDCSH_RND", SDTFmaRound>; +def x86vfmaddcShRnd : SDNode<"X86ISD::VFMADDCSH_RND", SDTFmaRound, [SDNPCommutative]>; def x86vfcmaddcShRnd : SDNode<"X86ISD::VFCMADDCSH_RND",SDTFmaRound>; -def x86vfmulcShRnd : SDNode<"X86ISD::VFMULCSH_RND", SDTFPBinOpRound>; +def x86vfmulcShRnd : SDNode<"X86ISD::VFMULCSH_RND", SDTFPBinOpRound, [SDNPCommutative]>; def x86vfcmulcShRnd : SDNode<"X86ISD::VFCMULCSH_RND", SDTFPBinOpRound>; def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2620,7 +2620,19 @@ case X86::VPMADD52LUQZ256rkz: case X86::VPMADD52LUQZr: case X86::VPMADD52LUQZrk: - case X86::VPMADD52LUQZrkz: { + case X86::VPMADD52LUQZrkz: + case X86::VFMADDCPHZr: + 
case X86::VFMADDCPHZrk:
+  case X86::VFMADDCPHZrkz:
+  case X86::VFMADDCPHZ128r:
+  case X86::VFMADDCPHZ128rk:
+  case X86::VFMADDCPHZ128rkz:
+  case X86::VFMADDCPHZ256r:
+  case X86::VFMADDCPHZ256rk:
+  case X86::VFMADDCPHZ256rkz:
+  case X86::VFMADDCSHZr:
+  case X86::VFMADDCSHZrk:
+  case X86::VFMADDCSHZrkz: {
     unsigned CommutableOpIdx1 = 2;
     unsigned CommutableOpIdx2 = 3;
     if (X86II::isKMasked(Desc.TSFlags)) {
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -24,7 +24,7 @@
   GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, XGETBV, ADX,
   FPCLASSS,
   INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP, INTR_TYPE_4OP_IMM8, INTR_TYPE_3OP_IMM8,
-  FMA_OP_MASK, FMA_OP_MASKZ,
+  CFMA_OP_MASK, CFMA_OP_MASKZ,
   CMP_MASK_CC,CMP_MASK_SCALAR_CC, VSHIFT, COMI, COMI_RM, BLENDV, BEXTRI,
   CVTPD2PS_MASK,
   INTR_TYPE_1OP_SAE, INTR_TYPE_2OP_SAE,
@@ -1161,30 +1161,30 @@
                      X86ISD::CVTUI2P, X86ISD::MCVTUI2P),
   X86_INTRINSIC_DATA(avx512fp16_mask_vcvtuqq2ph_256, TRUNCATE_TO_REG,
                      X86ISD::CVTUI2P, X86ISD::MCVTUI2P),
-  X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_128, FMA_OP_MASK, X86ISD::VFCMADDC, 0),
-  X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_256, FMA_OP_MASK, X86ISD::VFCMADDC, 0),
-  X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_512, FMA_OP_MASK, X86ISD::VFCMADDC, X86ISD::VFCMADDC_RND),
-  X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_csh, FMA_OP_MASK, X86ISD::VFCMADDCSH, X86ISD::VFCMADDCSH_RND),
+  X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_128, CFMA_OP_MASK, X86ISD::VFCMADDC, 0),
+  X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_256, CFMA_OP_MASK, X86ISD::VFCMADDC, 0),
+  X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_cph_512, CFMA_OP_MASK, X86ISD::VFCMADDC, X86ISD::VFCMADDC_RND),
+  X86_INTRINSIC_DATA(avx512fp16_mask_vfcmadd_csh, CFMA_OP_MASK, X86ISD::VFCMADDCSH, X86ISD::VFCMADDCSH_RND),
   X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_cph_128, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, 0),
   X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_cph_256, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, 0),
   X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_cph_512, INTR_TYPE_2OP_MASK, X86ISD::VFCMULC, X86ISD::VFCMULC_RND),
   X86_INTRINSIC_DATA(avx512fp16_mask_vfcmul_csh, INTR_TYPE_SCALAR_MASK, X86ISD::VFCMULCSH, X86ISD::VFCMULCSH_RND),
-  X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_128, FMA_OP_MASK, X86ISD::VFMADDC, 0),
-  X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_256, FMA_OP_MASK, X86ISD::VFMADDC, 0),
-  X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_512, FMA_OP_MASK, X86ISD::VFMADDC, X86ISD::VFMADDC_RND),
-  X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_csh, FMA_OP_MASK, X86ISD::VFMADDCSH, X86ISD::VFMADDCSH_RND),
+  X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_128, CFMA_OP_MASK, X86ISD::VFMADDC, 0),
+  X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_256, CFMA_OP_MASK, X86ISD::VFMADDC, 0),
+  X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_cph_512, CFMA_OP_MASK, X86ISD::VFMADDC, X86ISD::VFMADDC_RND),
+  X86_INTRINSIC_DATA(avx512fp16_mask_vfmadd_csh, CFMA_OP_MASK, X86ISD::VFMADDCSH, X86ISD::VFMADDCSH_RND),
   X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_cph_128, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, 0),
   X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_cph_256, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, 0),
   X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_cph_512, INTR_TYPE_2OP_MASK, X86ISD::VFMULC, X86ISD::VFMULC_RND),
   X86_INTRINSIC_DATA(avx512fp16_mask_vfmul_csh, INTR_TYPE_SCALAR_MASK, X86ISD::VFMULCSH, X86ISD::VFMULCSH_RND),
-  X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_128,
FMA_OP_MASKZ, X86ISD::VFCMADDC, 0), - X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_256, FMA_OP_MASKZ, X86ISD::VFCMADDC, 0), - X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_512, FMA_OP_MASKZ, X86ISD::VFCMADDC, X86ISD::VFCMADDC_RND), - X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_csh, FMA_OP_MASKZ, X86ISD::VFCMADDCSH, X86ISD::VFCMADDCSH_RND), - X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_128, FMA_OP_MASKZ, X86ISD::VFMADDC, 0), - X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_256, FMA_OP_MASKZ, X86ISD::VFMADDC, 0), - X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_512, FMA_OP_MASKZ, X86ISD::VFMADDC, X86ISD::VFMADDC_RND), - X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_csh, FMA_OP_MASKZ, X86ISD::VFMADDCSH, X86ISD::VFMADDCSH_RND), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_128, CFMA_OP_MASKZ, X86ISD::VFCMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_256, CFMA_OP_MASKZ, X86ISD::VFCMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_cph_512, CFMA_OP_MASKZ, X86ISD::VFCMADDC, X86ISD::VFCMADDC_RND), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfcmadd_csh, CFMA_OP_MASKZ, X86ISD::VFCMADDCSH, X86ISD::VFCMADDCSH_RND), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_128, CFMA_OP_MASKZ, X86ISD::VFMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_256, CFMA_OP_MASKZ, X86ISD::VFMADDC, 0), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_cph_512, CFMA_OP_MASKZ, X86ISD::VFMADDC, X86ISD::VFMADDC_RND), + X86_INTRINSIC_DATA(avx512fp16_maskz_vfmadd_csh, CFMA_OP_MASKZ, X86ISD::VFMADDCSH, X86ISD::VFMADDCSH_RND), X86_INTRINSIC_DATA(avx512fp16_max_ph_128, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512fp16_max_ph_256, INTR_TYPE_2OP, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512fp16_max_ph_512, INTR_TYPE_2OP_SAE, X86ISD::FMAX, X86ISD::FMAX_SAE), diff --git a/llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512cfma-intrinsics.ll @@ -4,13 +4,25 @@ declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8) declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8) -define <4 x float> @test_int_x86_avx512fp8_mask_cfmadd_ph_bst(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +define <4 x float> @test_int_x86_avx512fp8_mask_cfmadd_ph_bst(<4 x float> %x0, <4 x float> %x1, i8 %x3){ ; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfmadd_ph_bst: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vfmaddcph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 ; CHECK-NEXT: retq - %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> , i8 %x3) + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %x0, <4 x float> , <4 x float> %x1, i8 %x3) + ret <4 x float> %res +} + +define <4 x float> @test_int_x86_avx512fp8_mask_cfmadd_ph_bst2(<4 x float> %x0, <4 x float> %x1, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfmadd_ph_bst2: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmaddcph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> , <4 x float> %x0, <4 x float> %x1, i8 %x3) ret <4 x float> 
%res } @@ -18,7 +30,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfmadd_ph_128: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vfmaddcph %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ret <4 x float> %res @@ -28,7 +41,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp8_maskz_cfmadd_ph_128: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vfmaddcph %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ret <4 x float> %res @@ -37,8 +51,7 @@ define <4 x float> @test_int_x86_avx512fp8_cfmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2){ ; CHECK-LABEL: test_int_x86_avx512fp8_cfmadd_ph_128: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmaddcph %xmm0, %xmm1, %xmm2 -; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: vfmaddcph %xmm1, %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %x2, <4 x float> %x1, <4 x float> %x0, i8 -1) ret <4 x float> %res @@ -52,7 +65,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmadd_ph_256: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0 {%k1} +; CHECK-NEXT: vfmaddcph %ymm1, %ymm0, %ymm2 {%k1} +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) ret <8 x float> %res @@ -62,7 +76,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmadd_ph_256: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0 {%k1} {z} +; CHECK-NEXT: vfmaddcph %ymm1, %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: vmovaps %ymm2, %ymm0 ; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) ret <8 x float> %res @@ -71,8 +86,7 @@ define <8 x float> @test_int_x86_avx512fp16_cfmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2){ ; CHECK-LABEL: test_int_x86_avx512fp16_cfmadd_ph_256: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmaddcph %ymm0, %ymm1, %ymm2 -; CHECK-NEXT: vmovaps %ymm2, %ymm0 +; CHECK-NEXT: vfmaddcph %ymm1, %ymm2, %ymm0 ; CHECK-NEXT: retq %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %x2, <8 x float> %x1, <8 x float> %x0, i8 -1) ret <8 x float> %res @@ -86,7 +100,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmadd_ph_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vfmaddcph %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4) ret <16 x float> %res @@ -96,7 +111,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmadd_ph_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vfmaddcph %zmm1, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float> %x0, <16 x float> %x1, 
<16 x float> %x2, i16 %x3, i32 4) ret <16 x float> %res @@ -105,8 +121,7 @@ define <16 x float> @test_int_x86_avx512fp16_cfmadd_ph_512_rn(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ ; CHECK-LABEL: test_int_x86_avx512fp16_cfmadd_ph_512_rn: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmaddcph {rz-sae}, %zmm0, %zmm1, %zmm2 -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vfmaddcph {rz-sae}, %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 11) ret <16 x float> %res @@ -115,8 +130,7 @@ define <16 x float> @test_int_x86_avx512fp16_cfmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ ; CHECK-LABEL: test_int_x86_avx512fp16_cfmadd_ph_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmaddcph %zmm0, %zmm1, %zmm2 -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vfmaddcph %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 4) ret <16 x float> %res @@ -125,11 +139,36 @@ declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8) declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8) +define <4 x float> @test_int_x86_avx512fp8_mask_cfcmadd_ph_bst(<4 x float> %x0, <4 x float> %x1, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfcmadd_ph_bst: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmaddcph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm1 {%k1} +; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %x0, <4 x float> , <4 x float> %x1, i8 %x3) + ret <4 x float> %res +} + +; Check conjugate complex FMA is not commutable. 
+define <4 x float> @test_int_x86_avx512fp8_mask_cfcmadd_ph_bst2(<4 x float> %x0, <4 x float> %x1, i8 %x3){
+; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfcmadd_ph_bst2:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; CHECK-NEXT: vfcmaddcph %xmm0, %xmm2, %xmm1 {%k1}
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> , <4 x float> %x0, <4 x float> %x1, i8 %x3)
+  ret <4 x float> %res
+}
+
 define <4 x float> @test_int_x86_avx512fp8_mask_cfcmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){
 ; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfcmadd_ph_128:
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: vfcmaddcph %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
   ret <4 x float> %res
@@ -139,7 +178,8 @@
 ; CHECK-LABEL: test_int_x86_avx512fp8_maskz_cfcmadd_ph_128:
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vfcmaddcph %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: vfcmaddcph %xmm1, %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
   ret <4 x float> %res
@@ -148,8 +188,7 @@
 define <4 x float> @test_int_x86_avx512fp8_cfcmadd_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2){
 ; CHECK-LABEL: test_int_x86_avx512fp8_cfcmadd_ph_128:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: vfcmaddcph %xmm0, %xmm1, %xmm2
-; CHECK-NEXT: vmovaps %xmm2, %xmm0
+; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0
 ; CHECK-NEXT: retq
   %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %x2, <4 x float> %x1, <4 x float> %x0, i8 -1)
   ret <4 x float> %res
@@ -163,7 +202,8 @@
 ; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmadd_ph_256:
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vfcmaddcph %ymm1, %ymm0, %ymm2 {%k1}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
 ; CHECK-NEXT: retq
   %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
   ret <8 x float> %res
@@ -173,7 +213,8 @@
 ; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmadd_ph_256:
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT: vfcmaddcph %ymm2, %ymm1, %ymm0 {%k1} {z}
+; CHECK-NEXT: vfcmaddcph %ymm1, %ymm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
 ; CHECK-NEXT: retq
   %res = call <8 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
   ret <8 x float> %res
@@ -182,8 +223,7 @@
 define <8 x float> @test_int_x86_avx512fp16_cfcmadd_ph_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2){
 ; CHECK-LABEL: test_int_x86_avx512fp16_cfcmadd_ph_256:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: vfcmaddcph %ymm0, %ymm1, %ymm2
-; CHECK-NEXT: vmovaps %ymm2, %ymm0
+; CHECK-NEXT: vfcmaddcph %ymm1, %ymm2, %ymm0
 ; CHECK-NEXT: retq
   %res = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %x2, <8 x float> %x1, <8 x float> %x0, i8 -1)
   ret <8 x float> %res
@@ -197,7 +237,8 @@
 ; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmadd_ph_512:
 ; CHECK: ## %bb.0:
 ; CHECK-NEXT: kmovd %edi, %k1
-; CHECK-NEXT:
vfcmaddcph %zmm2, %zmm1, %zmm0 {%k1} +; CHECK-NEXT: vfcmaddcph %zmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4) ret <16 x float> %res @@ -207,7 +248,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmadd_ph_512: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfcmaddcph %zmm2, %zmm1, %zmm0 {%k1} {z} +; CHECK-NEXT: vfcmaddcph %zmm1, %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3, i32 4) ret <16 x float> %res @@ -216,8 +258,7 @@ define <16 x float> @test_int_x86_avx512fp16_cfcmadd_ph_512_rn(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ ; CHECK-LABEL: test_int_x86_avx512fp16_cfcmadd_ph_512_rn: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfcmaddcph {rz-sae}, %zmm0, %zmm1, %zmm2 -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vfcmaddcph {rz-sae}, %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 11) ret <16 x float> %res @@ -226,8 +267,7 @@ define <16 x float> @test_int_x86_avx512fp16_cfcmadd_ph_512(<16 x float> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3){ ; CHECK-LABEL: test_int_x86_avx512fp16_cfcmadd_ph_512: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfcmaddcph %zmm0, %zmm1, %zmm2 -; CHECK-NEXT: vmovaps %zmm2, %zmm0 +; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %x2, <16 x float> %x1, <16 x float> %x0, i16 -1, i32 4) ret <16 x float> %res diff --git a/llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll b/llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll --- a/llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512cfmul-intrinsics.ll @@ -14,6 +14,17 @@ ret <4 x float> %res } +define <4 x float> @test_int_x86_avx512fp8_mask_cfmul_ph_bst2(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfmul_ph_bst2: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfmulcph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.128(<4 x float> , <4 x float> %x0, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + define <4 x float> @test_int_x86_avx512fp8_mask_cfmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ ; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfmul_ph_128: ; CHECK: ## %bb.0: @@ -122,6 +133,30 @@ declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float>, <4 x float>, <4 x float>, i8) +define <4 x float> @test_int_x86_avx512fp8_mask_cfcmul_ph_bst(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfcmul_ph_bst: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vfcmulcph {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> %x0, <4 x float> , <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + +; Check conjugate complex FMUL is not commutable. 
+define <4 x float> @test_int_x86_avx512fp8_mask_cfcmul_ph_bst2(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ +; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfcmul_ph_bst2: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 +; CHECK-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] +; CHECK-NEXT: vfcmulcph %xmm0, %xmm1, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.128(<4 x float> , <4 x float> %x0, <4 x float> %x2, i8 %x3) + ret <4 x float> %res +} + define <4 x float> @test_int_x86_avx512fp8_mask_cfcmul_ph_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3){ ; CHECK-LABEL: test_int_x86_avx512fp8_mask_cfcmul_ph_128: ; CHECK: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll --- a/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512cfmulsh-instrinsics.ll @@ -35,7 +35,8 @@ define <4 x float> @test_nm_nr_int_x86_avx512fp16_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { ; CHECK-LABEL: test_nm_nr_int_x86_avx512fp16_cfmadd_sh: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmaddcsh %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfmaddcsh %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) ret <4 x float> %res @@ -44,7 +45,8 @@ define <4 x float> @test_nm_nr_int_x86_avx512fp16_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { ; CHECK-LABEL: test_nm_nr_int_x86_avx512fp16_cfcmadd_sh: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfcmaddcsh %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfcmaddcsh %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4) ret <4 x float> %res @@ -75,7 +77,8 @@ define <4 x float> @test_nm_r_int_x86_avx512fp16_mask_cfmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { ; CHECK-LABEL: test_nm_r_int_x86_avx512fp16_mask_cfmadd_sh: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 9) ret <4 x float> %res @@ -84,7 +87,8 @@ define <4 x float> @test_nm_r_int_x86_avx512fp16_mask_cfcmadd_sh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2) { ; CHECK-LABEL: test_nm_r_int_x86_avx512fp16_mask_cfcmadd_sh: ; CHECK: ## %bb.0: -; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 +; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm1, %xmm0, %xmm2 +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 9) ret <4 x float> %res @@ -118,7 +122,8 @@ ; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_mask_cfmadd_sh: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcsh %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vfmaddcsh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) ret <4 x float> %res @@ -128,7 +133,8 @@ ; CHECK-LABEL: 
test_m_nr_int_x86_avx512fp16_mask_cfcmadd_sh: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfcmaddcsh %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vfcmaddcsh %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) ret <4 x float> %res @@ -162,7 +168,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfmadd_sh: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9) ret <4 x float> %res @@ -172,7 +179,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp16_mask_cfcmadd_sh: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1} +; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9) ret <4 x float> %res @@ -206,7 +214,8 @@ ; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_maskz_cfmadd_sh: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcsh %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vfmaddcsh %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) ret <4 x float> %res @@ -216,7 +225,8 @@ ; CHECK-LABEL: test_m_nr_int_x86_avx512fp16_maskz_cfcmadd_sh: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfcmaddcsh %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vfcmaddcsh %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4) ret <4 x float> %res @@ -250,7 +260,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfmadd_sh: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vfmaddcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9) ret <4 x float> %res @@ -260,7 +271,8 @@ ; CHECK-LABEL: test_int_x86_avx512fp16_maskz_cfcmadd_sh: ; CHECK: ## %bb.0: ; CHECK-NEXT: kmovd %edi, %k1 -; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z} +; CHECK-NEXT: vfcmaddcsh {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: vmovaps %xmm2, %xmm0 ; CHECK-NEXT: retq %res = call <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 9) ret <4 x float> %res diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll --- a/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-combine-vfmulc-fadd.ll @@ -4,7 +4,8 @@ define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce, <32 x half> %rhs.coerce) { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry -; 
CHECK-NEXT: vfmaddcph %zmm2, %zmm1, %zmm0
+; CHECK-NEXT: vfmaddcph %zmm1, %zmm0, %zmm2
+; CHECK-NEXT: vmovaps %zmm2, %zmm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <32 x half> %lhs.coerce to <16 x float>
@@ -18,7 +19,8 @@
define dso_local <16 x half> @test2(<16 x half> %acc.coerce, <16 x half> %lhs.coerce, <16 x half> %rhs.coerce) {
; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfmaddcph %ymm2, %ymm1, %ymm0
+; CHECK-NEXT: vfmaddcph %ymm1, %ymm0, %ymm2
+; CHECK-NEXT: vmovaps %ymm2, %ymm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <16 x half> %lhs.coerce to <8 x float>
@@ -32,7 +34,8 @@
define dso_local <8 x half> @test3(<8 x half> %acc.coerce, <8 x half> %lhs.coerce, <8 x half> %rhs.coerce) {
; CHECK-LABEL: test3:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vfmaddcph %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <8 x half> %lhs.coerce to <4 x float>
@@ -47,7 +50,8 @@
define dso_local <8 x half> @test4(<8 x half> %acc.coerce, <8 x half> %lhs.coerce, <8 x half> %rhs.coerce) {
; CHECK-LABEL: test4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vfmaddcph %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <8 x half> %lhs.coerce to <4 x float>
diff --git a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll
--- a/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll
+++ b/llvm/test/CodeGen/X86/avx512fp16-combine-xor-vfmulc-fadd.ll
@@ -4,7 +4,8 @@
define dso_local <32 x half> @test1(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: vfcmaddcph %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <32 x half> %lhs.coerce.conj to <16 x i32>
@@ -20,7 +21,8 @@
define dso_local <32 x half> @test2(<32 x half> %acc.coerce, <32 x half> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: vfcmaddcph %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <32 x half> %lhs.coerce.conj to <16 x i32>
@@ -36,7 +38,8 @@
define dso_local <16 x half> @test3(<16 x half> %acc.coerce, <16 x half> %lhs.coerce.conj, <16 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test3:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfcmaddcph %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: vfcmaddcph %ymm2, %ymm0, %ymm1
+; CHECK-NEXT: vmovaps %ymm1, %ymm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <16 x half> %lhs.coerce.conj to <8 x i32>
@@ -52,7 +55,8 @@
define dso_local <8 x half> @test4(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test4:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: vfcmaddcph %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
@@ -68,7 +72,8 @@
define dso_local <8 x half> @test5(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test5:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: vfcmaddcph %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
@@ -85,7 +90,8 @@
; CHECK-LABEL: test6:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; CHECK-NEXT: vfmaddcph %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: vfmaddcph %xmm1, %xmm0, %xmm2
+; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
@@ -101,7 +107,8 @@
define dso_local <8 x half> @test7(<8 x half> %acc.coerce, <8 x half> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test7:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: vfcmaddcph %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <8 x half> %lhs.coerce.conj to <4 x i32>
@@ -117,7 +124,8 @@
define dso_local <8 x half> @test8(<8 x half> %acc.coerce, <4 x float> %lhs.coerce.conj, <8 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test8:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfcmaddcph %xmm1, %xmm2, %xmm0
+; CHECK-NEXT: vfcmaddcph %xmm2, %xmm0, %xmm1
+; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: retq
entry:
%0 = bitcast <4 x float> %lhs.coerce.conj to <4 x i32>
@@ -133,7 +141,8 @@
define dso_local <32 x half> @test9(<32 x half> %acc.coerce, <8 x i64> %lhs.coerce.conj, <32 x half> %rhs.coerce) local_unnamed_addr #0 {
; CHECK-LABEL: test9:
; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vfcmaddcph %zmm1, %zmm2, %zmm0
+; CHECK-NEXT: vfcmaddcph %zmm2, %zmm0, %zmm1
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
; CHECK-NEXT: retq
entry:
%xor1.i = xor <8 x i64> %lhs.coerce.conj,
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16.ll
@@ -996,6 +996,14 @@
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
ret <16 x float> %2
}
+
+define <16 x float> @stack_fold_fmulcph_commute(<16 x float> %a0, <16 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fmulcph_commute:
+ ;CHECK: vfmulcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float> %a1, <16 x float> %a0, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
declare <16 x float> @llvm.x86.avx512fp16.mask.vfmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float> @stack_fold_fmulcph_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %passthru, i16 %mask) {
@@ -1023,6 +1031,15 @@
%2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> undef, i16 -1, i32 4)
ret <16 x float> %2
}
+
+define <16 x float> @stack_fold_fcmulcph_commute(<16 x float> %a0, <16 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fcmulcph_commute:
+ ;CHECK: vmovups {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Reload
+ ;CHECK: vfcmulcph {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}}
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float> %a1, <16 x float> %a0, <16 x float> undef, i16 -1, i32 4)
+ ret <16 x float> %2
+}
declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmul.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
define <16 x float> @stack_fold_fcmulcph_mask(<16 x float> %a0, <16 x float> %a1, <16 x float>* %passthru, i16 %mask) {
@@ -1047,7 +1064,15 @@
;CHECK-LABEL: stack_fold_fmaddcph:
;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4)
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @stack_fold_fmaddcph_commute(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddcph_commute:
+ ;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a2, <16 x float> %a1, <16 x float> %a0, i16 -1, i32 4)
ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
@@ -1057,7 +1082,7 @@
;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <16 x float>, <16 x float>* %p
- %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4)
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 %mask, i32 4)
ret <16 x float> %2
}
@@ -1066,7 +1091,7 @@
;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i16, i16* %mask
- %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> zeroinitializer, <16 x float> %a1, <16 x float> %a2, i16 %2, i32 4)
+ %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> zeroinitializer, i16 %2, i32 4)
ret <16 x float> %3
}
declare <16 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
@@ -1075,7 +1100,16 @@
;CHECK-LABEL: stack_fold_fcmaddcph:
;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 -1, i32 4)
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 -1, i32 4)
+ ret <16 x float> %2
+}
+
+define <16 x float> @stack_fold_fcmaddcph_commute(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fcmaddcph_commute:
+ ;CHECK: vmovups {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Reload
+ ;CHECK: vfcmaddcph {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}}
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a2, <16 x float> %a1, <16 x float> %a0, i16 -1, i32 4)
ret <16 x float> %2
}
declare <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
@@ -1085,7 +1119,7 @@
;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <16 x float>, <16 x float>* %p
- %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2, i16 %mask, i32 4)
+ %2 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> %a0, i16 %mask, i32 4)
ret <16 x float> %2
}
@@ -1094,7 +1128,7 @@
;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i16, i16* %mask
- %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> zeroinitializer, <16 x float> %a1, <16 x float> %a2, i16 %2, i32 4)
+ %3 = call <16 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.512(<16 x float> %a1, <16 x float> %a2, <16 x float> zeroinitializer, i16 %2, i32 4)
ret <16 x float> %3
}
declare <16 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
@@ -1106,6 +1140,14 @@
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
ret <4 x float> %2
}
+
+define <4 x float> @stack_fold_fmulcsh_commute(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fmulcsh_commute:
+ ;CHECK: vfmulcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float> %a1, <4 x float> %a0, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %2
+}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
define <4 x float> @stack_fold_fmulcsh_mask(<4 x float> %a0, <4 x float> %a1, <4 x float>* %passthru, i8 %mask) {
@@ -1133,6 +1175,15 @@
%2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> undef, i8 -1, i32 4)
ret <4 x float> %2
}
+
+define <4 x float> @stack_fold_fcmulcsh_commute(<4 x float> %a0, <4 x float> %a1) {
+ ;CHECK-LABEL: stack_fold_fcmulcsh_commute:
+ ;CHECK: vmovaps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Reload
+ ;CHECK: vfcmulcsh {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float> %a1, <4 x float> %a0, <4 x float> undef, i8 -1, i32 4)
+ ret <4 x float> %2
+}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmul.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
define <4 x float> @stack_fold_fcmulcsh_mask(<4 x float> %a0, <4 x float> %a1, <4 x float>* %passthru, i8 %mask) {
@@ -1157,7 +1208,15 @@
;CHECK-LABEL: stack_fold_fmaddcsh:
;CHECK: vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1, i32 4)
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1, i32 4)
+ ret <4 x float> %2
+}
+
+define <4 x float> @stack_fold_fmaddcsh_commute(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fmaddcsh_commute:
+ ;CHECK: vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a2, <4 x float> %a1, <4 x float> %a0, i8 -1, i32 4)
ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
@@ -1167,7 +1226,7 @@
;CHECK: vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <4 x float>, <4 x float>* %p
- %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask, i32 4)
ret <4 x float> %2
}
@@ -1176,7 +1235,7 @@
;CHECK: vfmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, i8* %mask
- %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> zeroinitializer, <4 x float> %a1, <4 x float> %a2, i8 %2, i32 4)
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2, i32 4)
ret <4 x float> %3
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
@@ -1185,7 +1244,16 @@
;CHECK-LABEL: stack_fold_fcmaddcsh:
;CHECK: vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1, i32 4)
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1, i32 4)
+ ret <4 x float> %2
+}
+
+define <4 x float> @stack_fold_fcmaddcsh_commute(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+ ;CHECK-LABEL: stack_fold_fcmaddcsh_commute:
+ ;CHECK: vmovaps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Reload
+ ;CHECK: vfcmaddcsh {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}
+ %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a2, <4 x float> %a1, <4 x float> %a0, i8 -1, i32 4)
ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
@@ -1195,7 +1263,7 @@
;CHECK: vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <4 x float>, <4 x float>* %p
- %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask, i32 4)
ret <4 x float> %2
}
@@ -1204,7 +1272,7 @@
;CHECK: vfcmaddcsh {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, i8* %mask
- %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> zeroinitializer, <4 x float> %a1, <4 x float> %a2, i8 %2, i32 4)
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.csh(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2, i32 4)
ret <4 x float> %3
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.csh(<4 x float>, <4 x float>, <4 x float>, i8, i32)
diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
--- a/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx512fp16vl.ll
@@ -646,7 +646,7 @@
;CHECK-LABEL: stack_fold_fmaddc:
;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1)
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1)
ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
@@ -656,7 +656,7 @@
;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <4 x float>, <4 x float>* %p
- %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask)
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask)
ret <4 x float> %2
}
@@ -665,7 +665,7 @@
;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, i8* %mask
- %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> zeroinitializer, <4 x float> %a1, <4 x float> %a2, i8 %2)
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2)
ret <4 x float> %3
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
@@ -674,7 +674,7 @@
;CHECK-LABEL: stack_fold_fcmaddc:
;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1)
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 -1)
ret <4 x float> %2
}
declare <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
@@ -684,7 +684,7 @@
;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <4 x float>, <4 x float>* %p
- %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask)
+ %2 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> %a0, i8 %mask)
ret <4 x float> %2
}
@@ -693,7 +693,7 @@
;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, i8* %mask
- %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> zeroinitializer, <4 x float> %a1, <4 x float> %a2, i8 %2)
+ %3 = call <4 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.128(<4 x float> %a1, <4 x float> %a2, <4 x float> zeroinitializer, i8 %2)
ret <4 x float> %3
}
declare <4 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.128(<4 x float>, <4 x float>, <4 x float>, i8)
@@ -756,7 +756,7 @@
;CHECK-LABEL: stack_fold_fmaddc_ymm:
;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 -1)
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 -1)
ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
@@ -766,7 +766,7 @@
;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <8 x float>, <8 x float>* %p
- %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask)
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 %mask)
ret <8 x float> %2
}
@@ -775,7 +775,7 @@
;CHECK: vfmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, i8* %mask
- %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> zeroinitializer, <8 x float> %a1, <8 x float> %a2, i8 %2)
+ %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> zeroinitializer, i8 %2)
ret <8 x float> %3
}
declare <8 x float> @llvm.x86.avx512fp16.maskz.vfmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
@@ -784,7 +784,7 @@
;CHECK-LABEL: stack_fold_fcmaddc_ymm:
;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
- %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 -1)
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 -1)
ret <8 x float> %2
}
declare <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)
@@ -794,7 +794,7 @@
;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%a0 = load <8 x float>, <8 x float>* %p
- %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask)
+ %2 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> %a0, i8 %mask)
ret <8 x float> %2
}
@@ -803,7 +803,7 @@
;CHECK: vfcmaddcph {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = load i8, i8* %mask
- %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> zeroinitializer, <8 x float> %a1, <8 x float> %a2, i8 %2)
+ %3 = call <8 x float> @llvm.x86.avx512fp16.mask.vfcmadd.cph.256(<8 x float> %a1, <8 x float> %a2, <8 x float> zeroinitializer, i8 %2)
ret <8 x float> %3
}
declare <8 x float> @llvm.x86.avx512fp16.maskz.vfcmadd.cph.256(<8 x float>, <8 x float>, <8 x float>, i8)