diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 98327ade17e8..974ba35b3233 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -1,126 +1,127 @@
 //===--- BuiltinsX86_64.def - X86-64 Builtin function database --*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
 // This file defines the X86-64-specific builtin function database. Users of
 // this file must define the BUILTIN macro to make use of this information.
 //
 //===----------------------------------------------------------------------===//

 // The format of this database matches clang/Basic/Builtins.def.

 #if defined(BUILTIN) && !defined(TARGET_BUILTIN)
 #  define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
 #endif

 #if defined(BUILTIN) && !defined(TARGET_HEADER_BUILTIN)
 #  define TARGET_HEADER_BUILTIN(ID, TYPE, ATTRS, HEADER, LANG, FEATURE) BUILTIN(ID, TYPE, ATTRS)
 #endif

 [... roughly 90 unchanged TARGET_HEADER_BUILTIN/TARGET_BUILTIN entries elided (MS <intrin.h> helpers,
  EFLAGS access, scalar SSE/SSE2/SSE4/AVX/AVX-512 conversions and vector element insert/extract, CRC32,
  FSGSBASE, FXSR, XSAVE*, SHSTK, add-with-carry, RDRND/RDSEED, LZCNT/BMI/BMI2/TBM, LWP, MOVDIRI, UINTR);
  none of them are touched by this patch ...]

 // AMX internal builtin
 TARGET_BUILTIN(__builtin_ia32_tileloadd64_internal, "V256iUsUsvC*z", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tdpbssd_internal, "V256iUsUsUsV256iV256iV256i", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tilestored64_internal, "vUsUsv*zV256i", "n", "amx-tile")
+TARGET_BUILTIN(__builtin_ia32_tilezero_internal, "V256iUsUs", "n", "amx-tile")
 // AMX
 TARGET_BUILTIN(__builtin_ia32_tile_loadconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tile_storeconfig, "vvC*", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tilerelease, "v", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tilezero, "vUc", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tilestored64, "vIUcv*z", "n", "amx-tile")
 TARGET_BUILTIN(__builtin_ia32_tdpbssd, "vIUcIUcIUc", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbsud, "vIUcIUcIUc", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbusd, "vIUcIUcIUc", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbuud, "vIUcIUcIUc", "n", "amx-int8")
 TARGET_BUILTIN(__builtin_ia32_tdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16")

 TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite")

 #undef BUILTIN
 #undef TARGET_BUILTIN
 #undef TARGET_HEADER_BUILTIN
"amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloadd64, "vIUcvC*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tileloaddt164, "vIUcvC*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tilestored64, "vIUcv*z", "n", "amx-tile") TARGET_BUILTIN(__builtin_ia32_tdpbssd, "vIUcIUcIUc", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbsud, "vIUcIUcIUc", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbusd, "vIUcIUcIUc", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbuud, "vIUcIUcIUc", "n", "amx-int8") TARGET_BUILTIN(__builtin_ia32_tdpbf16ps, "vIUcIUcIUc", "n", "amx-bf16") TARGET_BUILTIN(__builtin_ia32_ptwrite64, "vUOi", "n", "ptwrite") #undef BUILTIN #undef TARGET_BUILTIN #undef TARGET_HEADER_BUILTIN diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h index 03a468ef15b1..901488a17e8c 100644 --- a/clang/lib/Headers/amxintrin.h +++ b/clang/lib/Headers/amxintrin.h @@ -1,273 +1,278 @@ /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------=== * * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. * See https://llvm.org/LICENSE.txt for license information. * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception * *===------------------------------------------------------------------------=== */ #ifndef __IMMINTRIN_H #error "Never use directly; include instead." #endif /* __IMMINTRIN_H */ #ifndef __AMXINTRIN_H #define __AMXINTRIN_H #ifdef __x86_64__ #define __DEFAULT_FN_ATTRS_TILE \ __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) /// Load tile configuration from a 64-byte memory location specified by /// "mem_addr". The tile configuration includes the tile type palette, the /// number of bytes per row, and the number of rows. If the specified /// palette_id is zero, that signifies the init state for both the tile /// config and the tile data, and the tiles are zeroed. Any invalid /// configurations will result in #GP fault. /// /// \headerfile /// /// This intrinsic corresponds to the LDTILECFG instruction. /// /// \param __config /// A pointer to 512-bits configuration static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_loadconfig(const void *__config) { __builtin_ia32_tile_loadconfig(__config); } /// Stores the current tile configuration to a 64-byte memory location /// specified by "mem_addr". The tile configuration includes the tile type /// palette, the number of bytes per row, and the number of rows. If tiles /// are not configured, all zeroes will be stored to memory. /// /// \headerfile /// /// This intrinsic corresponds to the STTILECFG instruction. /// /// \param __config /// A pointer to 512-bits configuration static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_storeconfig(void *__config) { __builtin_ia32_tile_storeconfig(__config); } /// Release the tile configuration to return to the init state, which /// releases all storage it currently holds. /// /// \headerfile /// /// This intrinsic corresponds to the TILERELEASE instruction. static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { __builtin_ia32_tilerelease(); } /// Load tile rows from memory specifieid by "base" address and "stride" into /// destination tile "dst" using the tile configuration previously configured /// via "_tile_loadconfig". /// /// \headerfile /// /// This intrinsic corresponds to the TILELOADD instruction. /// /// \param dst /// A destination tile. Max size is 1024 Bytes. /// \param base /// A pointer to base address. 
/// \param stride /// The stride between the rows' data to be loaded in memory. #define _tile_loadd(dst, base, stride) \ __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \ (__SIZE_TYPE__)(stride)) /// Load tile rows from memory specifieid by "base" address and "stride" into /// destination tile "dst" using the tile configuration previously configured /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation /// that the data will likely not be reused in the near future and the data /// caching can be optimized accordingly. /// /// \headerfile /// /// This intrinsic corresponds to the TILELOADDT1 instruction. /// /// \param dst /// A destination tile. Max size is 1024 Bytes. /// \param base /// A pointer to base address. /// \param stride /// The stride between the rows' data to be loaded in memory. #define _tile_stream_loadd(dst, base, stride) \ __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \ (__SIZE_TYPE__)(stride)) /// Store the tile specified by "src" to memory specifieid by "base" address and /// "stride" using the tile configuration previously configured via /// "_tile_loadconfig". /// /// \headerfile /// /// This intrinsic corresponds to the TILESTORED instruction. /// /// \param dst /// A destination tile. Max size is 1024 Bytes. /// \param base /// A pointer to base address. /// \param stride /// The stride between the rows' data to be stored in memory. #define _tile_stored(dst, base, stride) \ __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) /// Zero the tile specified by "tdest". /// /// \headerfile /// /// This intrinsic corresponds to the TILEZERO instruction. /// /// \param tile /// The destination tile to be zero. Max size is 1024 Bytes. #define _tile_zero(tile) __builtin_ia32_tilezero((tile)) /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit /// results. Sum these 4 results with the corresponding 32-bit integer in "dst", /// and store the 32-bit result back to tile "dst". /// /// \headerfile /// /// This intrinsic corresponds to the TDPBSSD instruction. /// /// \param dst /// The destination tile. Max size is 1024 Bytes. /// \param src0 /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dpbssd(dst, src0, src1) \ __builtin_ia32_tdpbssd((dst), (src0), (src1)) /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer /// in "dst", and store the 32-bit result back to tile "dst". /// /// \headerfile /// /// This intrinsic corresponds to the TDPBSUD instruction. /// /// \param dst /// The destination tile. Max size is 1024 Bytes. /// \param src0 /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dpbsud(dst, src0, src1) \ __builtin_ia32_tdpbsud((dst), (src0), (src1)) /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit /// results. 
Sum these 4 results with the corresponding 32-bit integer in "dst", /// and store the 32-bit result back to tile "dst". /// /// \headerfile /// /// This intrinsic corresponds to the TDPBUSD instruction. /// /// \param dst /// The destination tile. Max size is 1024 Bytes. /// \param src0 /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dpbusd(dst, src0, src1) \ __builtin_ia32_tdpbusd((dst), (src0), (src1)) /// Compute dot-product of bytes in tiles with a source/destination accumulator. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in /// "dst", and store the 32-bit result back to tile "dst". /// /// \headerfile /// /// This intrinsic corresponds to the TDPBUUD instruction. /// /// \param dst /// The destination tile. Max size is 1024 Bytes. /// \param src0 /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dpbuud(dst, src0, src1) \ __builtin_ia32_tdpbuud((dst), (src0), (src1)) /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and /// src1, accumulating the intermediate single-precision (32-bit) floating-point /// elements with elements in "dst", and store the 32-bit result back to tile /// "dst". /// /// \headerfile /// /// This intrinsic corresponds to the TDPBF16PS instruction. /// /// \param dst /// The destination tile. Max size is 1024 Bytes. /// \param src0 /// The 1st source tile. Max size is 1024 Bytes. /// \param src1 /// The 2nd source tile. Max size is 1024 Bytes. #define _tile_dpbf16ps(dst, src0, src1) \ __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) #define __DEFAULT_FN_ATTRS_INT8 \ __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 _tile_loadd_internal(unsigned short m, unsigned short n, const void *base, __SIZE_TYPE__ stride) { return __builtin_ia32_tileloadd64_internal(m, n, base, (__SIZE_TYPE__)(stride)); } static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, _tile1024i dst, _tile1024i src1, _tile1024i src2) { return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); } static __inline__ void __DEFAULT_FN_ATTRS_INT8 _tile_stored_internal(unsigned short m, unsigned short n, void *base, __SIZE_TYPE__ stride, _tile1024i tile) { return __builtin_ia32_tilestored64_internal(m, n, base, (__SIZE_TYPE__)(stride), tile); } typedef struct __tile1024i_str { const unsigned short row; const unsigned short col; _tile1024i tile; } __tile1024i; -__DEFAULT_FN_ATTRS_INT8 +__DEFAULT_FN_ATTRS_TILE static void __tile_loadd(__tile1024i *dst, const void *base, __SIZE_TYPE__ stride) { dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); } __DEFAULT_FN_ATTRS_INT8 static void __tile_dpbsud(__tile1024i *dst, __tile1024i src1, __tile1024i src2) { dst->tile = _tile_dpbssd_internal(src1.row, src2.col, src1.col, dst->tile, src1.tile, src2.tile); } -__DEFAULT_FN_ATTRS_INT8 +__DEFAULT_FN_ATTRS_TILE static void __tile_stored(void *base, __SIZE_TYPE__ stride, __tile1024i src) { _tile_stored_internal(src.row, src.col, base, stride, src.tile); } +__DEFAULT_FN_ATTRS_TILE +static void 
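To make the intended use of the new wrapper concrete, here is a short usage sketch of the __tile1024i API in the spirit of the test below. It is not part of the patch; it assumes a valid tile configuration has already been loaded with _tile_loadconfig and that the file is compiled with the amx-tile and amx-int8 features enabled (for example -mamx-tile -mamx-int8). The function, parameter, and macro names are made up.

#include <immintrin.h>

#define STRIDE 64 /* bytes between consecutive tile rows in memory */

/* Zero the accumulator, run one int8 dot-product step, store the result.
   rows/cols must match the tile configuration loaded earlier. */
void dot_step(void *out, const void *in, short rows, short cols) {
  __tile1024i a = {rows, 8};
  __tile1024i b = {8, cols};
  __tile1024i acc = {rows, cols};

  __tile_zero(&acc);               /* new in this patch */
  __tile_loadd(&a, in, STRIDE);
  __tile_loadd(&b, in, STRIDE);
  __tile_dpbsud(&acc, a, b);       /* forwards to _tile_dpbssd_internal today */
  __tile_stored(out, STRIDE, acc);
}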
diff --git a/clang/test/CodeGen/X86/amx_api.c b/clang/test/CodeGen/X86/amx_api.c
index 52eb9542228d..55290f3fa6fb 100644
--- a/clang/test/CodeGen/X86/amx_api.c
+++ b/clang/test/CodeGen/X86/amx_api.c
@@ -1,54 +1,61 @@
 // RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +avx512f -target-feature +amx-int8 \
 // RUN: -target-feature +amx-bf16 -emit-llvm -o - -Werror -pedantic | FileCheck %s --check-prefixes=CHECK

 #include <immintrin.h>

 char buf[1024];
 #define STRIDE 32

 char buf2[1024];

 // This is example code and an integration test.
 void test_api(int cond, short row, short col) {
   //CHECK-LABEL: @test_api
   //CHECK: call x86_amx @llvm.x86.tileloadd64.internal
   //CHECK: call x86_amx @llvm.x86.tdpbssd.internal
   //CHECK: call void @llvm.x86.tilestored64.internal
   __tile1024i a = {row, 8};
   __tile1024i b = {8, col};
   __tile1024i c = {row, col};

   if (cond) {
     __tile_loadd(&a, buf, STRIDE);
     __tile_loadd(&b, buf, STRIDE);
     __tile_loadd(&c, buf, STRIDE);
   } else {
     __tile_loadd(&a, buf2, STRIDE);
     __tile_loadd(&b, buf2, STRIDE);
     __tile_loadd(&c, buf2, STRIDE);
   }
   __tile_dpbsud(&c, a, b);
   __tile_stored(buf, STRIDE, c);
 }

 void test_tile_loadd(short row, short col) {
   //CHECK-LABEL: @test_tile_loadd
   //CHECK: call x86_amx @llvm.x86.tileloadd64.internal
   //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
   __tile1024i a = {row, col};
   __tile_loadd(&a, buf, STRIDE);
 }

 void test_tile_dpbsud(__tile1024i a, __tile1024i b, __tile1024i c) {
   //CHECK-LABEL: @test_tile_dpbsud
   //CHECK: call x86_amx @llvm.x86.tdpbssd.internal
   //CHECK-NEXT: {{%.*}} = bitcast x86_amx {{%.*}} to <256 x i32>
   __tile_dpbsud(&c, a, b);
 }

 void test_tile_stored(__tile1024i c) {
   //CHECK-LABEL: @test_tile_stored
   //CHECK: {{%.*}} = bitcast <256 x i32> {{%.*}} to x86_amx
   //CHECK-NEXT: call void @llvm.x86.tilestored64.internal
   __tile_stored(buf, STRIDE, c);
 }
+
+void test_tile_zero(__tile1024i c) {
+  //CHECK-LABEL: @test_tile_zero
+  //CHECK: call x86_amx @llvm.x86.tilezero.internal
+  //CHECK-NEXT: bitcast x86_amx {{%.*}} to <256 x i32>
+  __tile_zero(&c);
+}
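For contrast with the test above, which exercises the __tile1024i wrappers, the older macro forms from amxintrin.h operate directly on physical tile register numbers (immediates 0 through 7). A hedged sketch, again assuming a configuration has been loaded and -mamx-tile is in effect; the function name is made up.

#include <immintrin.h>

void reset_and_copy(void *dst, const void *src) {
  _tile_zero(0);             /* TILEZERO tmm0: clear the accumulator register */
  _tile_loadd(2, src, 64);   /* TILELOADD: rows into tmm2, 64-byte stride */
  _tile_stored(2, dst, 64);  /* TILESTORED: tmm2 back to memory */
}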
def int_x86_seh_ehregnode : Intrinsic<[], [llvm_ptr_ty], []>; // Marks the EH guard slot node created in LLVM IR prior to code generation. def int_x86_seh_ehguard : Intrinsic<[], [llvm_ptr_ty], []>; } //===----------------------------------------------------------------------===// // FLAGS. let TargetPrefix = "x86" in { def int_x86_flags_read_u32 : GCCBuiltin<"__builtin_ia32_readeflags_u32">, Intrinsic<[llvm_i32_ty], [], []>; def int_x86_flags_read_u64 : GCCBuiltin<"__builtin_ia32_readeflags_u64">, Intrinsic<[llvm_i64_ty], [], []>; def int_x86_flags_write_u32 : GCCBuiltin<"__builtin_ia32_writeeflags_u32">, Intrinsic<[], [llvm_i32_ty], []>; def int_x86_flags_write_u64 : GCCBuiltin<"__builtin_ia32_writeeflags_u64">, Intrinsic<[], [llvm_i64_ty], []>; } //===----------------------------------------------------------------------===// // Read Time Stamp Counter. let TargetPrefix = "x86" in { def int_x86_rdtsc : GCCBuiltin<"__builtin_ia32_rdtsc">, Intrinsic<[llvm_i64_ty], [], []>; def int_x86_rdtscp : Intrinsic<[llvm_i64_ty, llvm_i32_ty], [], []>; } // Read Performance-Monitoring Counter. let TargetPrefix = "x86" in { def int_x86_rdpmc : GCCBuiltin<"__builtin_ia32_rdpmc">, Intrinsic<[llvm_i64_ty], [llvm_i32_ty], []>; } // Read processor ID. let TargetPrefix = "x86" in { def int_x86_rdpid : GCCBuiltin<"__builtin_ia32_rdpid">, Intrinsic<[llvm_i32_ty], [], []>; } //===----------------------------------------------------------------------===// // CET SS let TargetPrefix = "x86" in { def int_x86_incsspd : GCCBuiltin<"__builtin_ia32_incsspd">, Intrinsic<[], [llvm_i32_ty], []>; def int_x86_incsspq : GCCBuiltin<"__builtin_ia32_incsspq">, Intrinsic<[], [llvm_i64_ty], []>; def int_x86_rdsspd : GCCBuiltin<"__builtin_ia32_rdsspd">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty], []>; def int_x86_rdsspq : GCCBuiltin<"__builtin_ia32_rdsspq">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; def int_x86_saveprevssp : GCCBuiltin<"__builtin_ia32_saveprevssp">, Intrinsic<[], [], []>; def int_x86_rstorssp : GCCBuiltin<"__builtin_ia32_rstorssp">, Intrinsic<[], [llvm_ptr_ty], []>; def int_x86_wrssd : GCCBuiltin<"__builtin_ia32_wrssd">, Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], []>; def int_x86_wrssq : GCCBuiltin<"__builtin_ia32_wrssq">, Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty], []>; def int_x86_wrussd : GCCBuiltin<"__builtin_ia32_wrussd">, Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], []>; def int_x86_wrussq : GCCBuiltin<"__builtin_ia32_wrussq">, Intrinsic<[], [llvm_i64_ty, llvm_ptr_ty], []>; def int_x86_setssbsy : GCCBuiltin<"__builtin_ia32_setssbsy">, Intrinsic<[], [], []>; def int_x86_clrssbsy : GCCBuiltin<"__builtin_ia32_clrssbsy">, Intrinsic<[], [llvm_ptr_ty], []>; } //===----------------------------------------------------------------------===// // 3DNow! 
let TargetPrefix = "x86" in { def int_x86_3dnow_pavgusb : GCCBuiltin<"__builtin_ia32_pavgusb">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pf2id : GCCBuiltin<"__builtin_ia32_pf2id">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfacc : GCCBuiltin<"__builtin_ia32_pfacc">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfadd : GCCBuiltin<"__builtin_ia32_pfadd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfcmpeq : GCCBuiltin<"__builtin_ia32_pfcmpeq">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfcmpge : GCCBuiltin<"__builtin_ia32_pfcmpge">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfcmpgt : GCCBuiltin<"__builtin_ia32_pfcmpgt">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfmax : GCCBuiltin<"__builtin_ia32_pfmax">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfmin : GCCBuiltin<"__builtin_ia32_pfmin">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfmul : GCCBuiltin<"__builtin_ia32_pfmul">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfrcp : GCCBuiltin<"__builtin_ia32_pfrcp">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfrcpit1 : GCCBuiltin<"__builtin_ia32_pfrcpit1">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfrcpit2 : GCCBuiltin<"__builtin_ia32_pfrcpit2">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfrsqrt : GCCBuiltin<"__builtin_ia32_pfrsqrt">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfrsqit1 : GCCBuiltin<"__builtin_ia32_pfrsqit1">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfsub : GCCBuiltin<"__builtin_ia32_pfsub">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pfsubr : GCCBuiltin<"__builtin_ia32_pfsubr">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pi2fd : GCCBuiltin<"__builtin_ia32_pi2fd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnow_pmulhrw : GCCBuiltin<"__builtin_ia32_pmulhrw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // 3DNow! 
extensions let TargetPrefix = "x86" in { def int_x86_3dnowa_pf2iw : GCCBuiltin<"__builtin_ia32_pf2iw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnowa_pfnacc : GCCBuiltin<"__builtin_ia32_pfnacc">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnowa_pfpnacc : GCCBuiltin<"__builtin_ia32_pfpnacc">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnowa_pi2fw : GCCBuiltin<"__builtin_ia32_pi2fw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_3dnowa_pswapd : Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // SSE1 // Arithmetic ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse_rcp_ss : GCCBuiltin<"__builtin_ia32_rcpss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_rcp_ps : GCCBuiltin<"__builtin_ia32_rcpps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_rsqrt_ss : GCCBuiltin<"__builtin_ia32_rsqrtss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_rsqrt_ps : GCCBuiltin<"__builtin_ia32_rsqrtps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_min_ss : GCCBuiltin<"__builtin_ia32_minss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_min_ps : GCCBuiltin<"__builtin_ia32_minps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_max_ss : GCCBuiltin<"__builtin_ia32_maxss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_max_ps : GCCBuiltin<"__builtin_ia32_maxps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; } // Comparison ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse_cmp_ss : GCCBuiltin<"__builtin_ia32_cmpss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; // NOTE: This comparison intrinsic is not used by clang as long as the // distinction in signaling behaviour is not implemented. 
def int_x86_sse_cmp_ps : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse_comieq_ss : GCCBuiltin<"__builtin_ia32_comieq">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_comilt_ss : GCCBuiltin<"__builtin_ia32_comilt">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_comile_ss : GCCBuiltin<"__builtin_ia32_comile">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_comigt_ss : GCCBuiltin<"__builtin_ia32_comigt">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_comige_ss : GCCBuiltin<"__builtin_ia32_comige">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_comineq_ss : GCCBuiltin<"__builtin_ia32_comineq">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_ucomieq_ss : GCCBuiltin<"__builtin_ia32_ucomieq">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_ucomilt_ss : GCCBuiltin<"__builtin_ia32_ucomilt">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_ucomile_ss : GCCBuiltin<"__builtin_ia32_ucomile">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_ucomigt_ss : GCCBuiltin<"__builtin_ia32_ucomigt">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_ucomige_ss : GCCBuiltin<"__builtin_ia32_ucomige">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_ucomineq_ss : GCCBuiltin<"__builtin_ia32_ucomineq">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; } // Conversion ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse_cvtss2si : GCCBuiltin<"__builtin_ia32_cvtss2si">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_cvtss2si64 : GCCBuiltin<"__builtin_ia32_cvtss2si64">, Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_cvttss2si : GCCBuiltin<"__builtin_ia32_cvttss2si">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_cvttss2si64 : GCCBuiltin<"__builtin_ia32_cvttss2si64">, Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_cvtps2pi : GCCBuiltin<"__builtin_ia32_cvtps2pi">, Intrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_cvttps2pi: GCCBuiltin<"__builtin_ia32_cvttps2pi">, Intrinsic<[llvm_x86mmx_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse_cvtpi2ps : GCCBuiltin<"__builtin_ia32_cvtpi2ps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_x86mmx_ty], [IntrNoMem]>; } // Cacheability support ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse_sfence : GCCBuiltin<"__builtin_ia32_sfence">, Intrinsic<[], [], []>; } // Control register. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse_stmxcsr : Intrinsic<[], [llvm_ptr_ty], [IntrWriteMem, IntrArgMemOnly, // This prevents reordering with ldmxcsr IntrHasSideEffects]>; def int_x86_sse_ldmxcsr : Intrinsic<[], [llvm_ptr_ty], // FIXME: LDMXCSR does not actually write to memory, // but intrinsic properties are generated incorrectly // for IntrReadMem+IntrHasSideEffects. [/*IntrReadMem,*/ IntrArgMemOnly, IntrHasSideEffects]>; } // Misc. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_sse_movmsk_ps : GCCBuiltin<"__builtin_ia32_movmskps">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // SSE2 // FP arithmetic ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_min_sd : GCCBuiltin<"__builtin_ia32_minsd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_min_pd : GCCBuiltin<"__builtin_ia32_minpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_max_sd : GCCBuiltin<"__builtin_ia32_maxsd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_max_pd : GCCBuiltin<"__builtin_ia32_maxpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; } // FP comparison ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_cmp_sd : GCCBuiltin<"__builtin_ia32_cmpsd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; // NOTE: This comparison intrinsic is not used by clang as long as the // distinction in signaling behaviour is not implemented. def int_x86_sse2_cmp_pd : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse2_comieq_sd : GCCBuiltin<"__builtin_ia32_comisdeq">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_comilt_sd : GCCBuiltin<"__builtin_ia32_comisdlt">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_comile_sd : GCCBuiltin<"__builtin_ia32_comisdle">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_comigt_sd : GCCBuiltin<"__builtin_ia32_comisdgt">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_comige_sd : GCCBuiltin<"__builtin_ia32_comisdge">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_comineq_sd : GCCBuiltin<"__builtin_ia32_comisdneq">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_ucomieq_sd : GCCBuiltin<"__builtin_ia32_ucomisdeq">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_ucomilt_sd : GCCBuiltin<"__builtin_ia32_ucomisdlt">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_ucomile_sd : GCCBuiltin<"__builtin_ia32_ucomisdle">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_ucomigt_sd : GCCBuiltin<"__builtin_ia32_ucomisdgt">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_ucomige_sd : GCCBuiltin<"__builtin_ia32_ucomisdge">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_ucomineq_sd : GCCBuiltin<"__builtin_ia32_ucomisdneq">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; } // Integer arithmetic ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_sse2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_sse2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_sse2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, Commutative]>; def int_x86_sse2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_sse2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw128">, Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, Commutative]>; } // Integer shift ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_psll_w : GCCBuiltin<"__builtin_ia32_psllw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_sse2_psll_d : GCCBuiltin<"__builtin_ia32_pslld128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_sse2_psll_q : GCCBuiltin<"__builtin_ia32_psllq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_sse2_psrl_w : GCCBuiltin<"__builtin_ia32_psrlw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_sse2_psrl_d : GCCBuiltin<"__builtin_ia32_psrld128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_sse2_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_sse2_psra_w : GCCBuiltin<"__builtin_ia32_psraw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_sse2_psra_d : GCCBuiltin<"__builtin_ia32_psrad128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; // Oddly these don't require an immediate due to a gcc compatibility issue. def int_x86_sse2_pslli_w : GCCBuiltin<"__builtin_ia32_psllwi128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_sse2_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_sse2_pslli_q : GCCBuiltin<"__builtin_ia32_psllqi128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_sse2_psrli_w : GCCBuiltin<"__builtin_ia32_psrlwi128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_sse2_psrli_d : GCCBuiltin<"__builtin_ia32_psrldi128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_sse2_psrli_q : GCCBuiltin<"__builtin_ia32_psrlqi128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_sse2_psrai_w : GCCBuiltin<"__builtin_ia32_psrawi128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_sse2_psrai_d : GCCBuiltin<"__builtin_ia32_psradi128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem]>; } // Conversion ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_sse2_cvtpd2dq : GCCBuiltin<"__builtin_ia32_cvtpd2dq">, Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvttpd2dq : GCCBuiltin<"__builtin_ia32_cvttpd2dq">, Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvtpd2ps : GCCBuiltin<"__builtin_ia32_cvtpd2ps">, Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvtps2dq : GCCBuiltin<"__builtin_ia32_cvtps2dq">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse2_cvttps2dq : GCCBuiltin<"__builtin_ia32_cvttps2dq">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse2_cvtsd2si : GCCBuiltin<"__builtin_ia32_cvtsd2si">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvtsd2si64 : GCCBuiltin<"__builtin_ia32_cvtsd2si64">, Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvttsd2si : GCCBuiltin<"__builtin_ia32_cvttsd2si">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvttsd2si64 : GCCBuiltin<"__builtin_ia32_cvttsd2si64">, Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_cvtsd2ss : GCCBuiltin<"__builtin_ia32_cvtsd2ss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse_cvtpd2pi : GCCBuiltin<"__builtin_ia32_cvtpd2pi">, Intrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse_cvttpd2pi: GCCBuiltin<"__builtin_ia32_cvttpd2pi">, Intrinsic<[llvm_x86mmx_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse_cvtpi2pd : GCCBuiltin<"__builtin_ia32_cvtpi2pd">, Intrinsic<[llvm_v2f64_ty], [llvm_x86mmx_ty], [IntrNoMem]>; } // Misc. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_sse2_packssdw_128 : GCCBuiltin<"__builtin_ia32_packssdw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_sse2_packuswb_128 : GCCBuiltin<"__builtin_ia32_packuswb128">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_sse2_movmsk_pd : GCCBuiltin<"__builtin_ia32_movmskpd">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse2_pmovmskb_128 : GCCBuiltin<"__builtin_ia32_pmovmskb128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty], [IntrNoMem]>; def int_x86_sse2_maskmov_dqu : GCCBuiltin<"__builtin_ia32_maskmovdqu">, Intrinsic<[], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_ptr_ty], []>; def int_x86_sse2_clflush : GCCBuiltin<"__builtin_ia32_clflush">, Intrinsic<[], [llvm_ptr_ty], []>; def int_x86_sse2_lfence : GCCBuiltin<"__builtin_ia32_lfence">, Intrinsic<[], [], []>; def int_x86_sse2_mfence : GCCBuiltin<"__builtin_ia32_mfence">, Intrinsic<[], [], []>; def int_x86_sse2_pause : GCCBuiltin<"__builtin_ia32_pause">, Intrinsic<[], [], []>; } //===----------------------------------------------------------------------===// // SSE3 // Addition / subtraction ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse3_addsub_ps : GCCBuiltin<"__builtin_ia32_addsubps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse3_addsub_pd : GCCBuiltin<"__builtin_ia32_addsubpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; } // Horizontal ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_sse3_hadd_ps : GCCBuiltin<"__builtin_ia32_haddps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse3_hadd_pd : GCCBuiltin<"__builtin_ia32_haddpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse3_hsub_ps : GCCBuiltin<"__builtin_ia32_hsubps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_sse3_hsub_pd : GCCBuiltin<"__builtin_ia32_hsubpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; } // Specialized unaligned load. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse3_ldu_dq : GCCBuiltin<"__builtin_ia32_lddqu">, Intrinsic<[llvm_v16i8_ty], [llvm_ptr_ty], [IntrReadMem]>; } // Thread synchronization ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse3_monitor : GCCBuiltin<"__builtin_ia32_monitor">, Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_sse3_mwait : GCCBuiltin<"__builtin_ia32_mwait">, Intrinsic<[], [llvm_i32_ty, llvm_i32_ty], []>; } //===----------------------------------------------------------------------===// // SSSE3 // Horizontal arithmetic ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_ssse3_phadd_w : GCCBuiltin<"__builtin_ia32_phaddw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_phadd_w_128 : GCCBuiltin<"__builtin_ia32_phaddw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_ssse3_phadd_d : GCCBuiltin<"__builtin_ia32_phaddd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_phadd_d_128 : GCCBuiltin<"__builtin_ia32_phaddd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_ssse3_phadd_sw : GCCBuiltin<"__builtin_ia32_phaddsw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_phadd_sw_128 : GCCBuiltin<"__builtin_ia32_phaddsw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_ssse3_phsub_w : GCCBuiltin<"__builtin_ia32_phsubw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_phsub_w_128 : GCCBuiltin<"__builtin_ia32_phsubw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_ssse3_phsub_d : GCCBuiltin<"__builtin_ia32_phsubd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_phsub_d_128 : GCCBuiltin<"__builtin_ia32_phsubd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_ssse3_phsub_sw : GCCBuiltin<"__builtin_ia32_phsubsw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_phsub_sw_128 : GCCBuiltin<"__builtin_ia32_phsubsw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_ssse3_pmadd_ub_sw : GCCBuiltin<"__builtin_ia32_pmaddubsw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_pmadd_ub_sw_128 : GCCBuiltin<"__builtin_ia32_pmaddubsw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; } // Packed multiply high with round and scale let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_ssse3_pmul_hr_sw : GCCBuiltin<"__builtin_ia32_pmulhrsw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_ssse3_pmul_hr_sw_128 : GCCBuiltin<"__builtin_ia32_pmulhrsw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; } // Shuffle ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_ssse3_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_pshuf_b_128 : GCCBuiltin<"__builtin_ia32_pshufb128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_sse_pshuf_w : GCCBuiltin<"__builtin_ia32_pshufw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; } // Sign ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_ssse3_psign_b : GCCBuiltin<"__builtin_ia32_psignb">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_psign_b_128 : GCCBuiltin<"__builtin_ia32_psignb128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_ssse3_psign_w : GCCBuiltin<"__builtin_ia32_psignw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_psign_w_128 : GCCBuiltin<"__builtin_ia32_psignw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_ssse3_psign_d : GCCBuiltin<"__builtin_ia32_psignd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_psign_d_128 : GCCBuiltin<"__builtin_ia32_psignd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; } // Absolute value ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_ssse3_pabs_b : GCCBuiltin<"__builtin_ia32_pabsb">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_pabs_w : GCCBuiltin<"__builtin_ia32_pabsw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_ssse3_pabs_d : GCCBuiltin<"__builtin_ia32_pabsd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // SSE4.1 // FP rounding ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse41_round_ss : GCCBuiltin<"__builtin_ia32_roundss">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse41_round_ps : GCCBuiltin<"__builtin_ia32_roundps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse41_round_sd : GCCBuiltin<"__builtin_ia32_roundsd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse41_round_pd : GCCBuiltin<"__builtin_ia32_roundpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } // Vector min element let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse41_phminposuw : GCCBuiltin<"__builtin_ia32_phminposuw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty], [IntrNoMem]>; } // Advanced Encryption Standard (AES) Instructions let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_aesni_aesimc : GCCBuiltin<"__builtin_ia32_aesimc128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; def int_x86_aesni_aesenc : GCCBuiltin<"__builtin_ia32_aesenc128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_aesni_aesenc_256 : GCCBuiltin<"__builtin_ia32_aesenc256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_aesni_aesenc_512 : GCCBuiltin<"__builtin_ia32_aesenc512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_aesni_aesenclast : GCCBuiltin<"__builtin_ia32_aesenclast128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_aesni_aesenclast_256 : GCCBuiltin<"__builtin_ia32_aesenclast256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_aesni_aesenclast_512 : GCCBuiltin<"__builtin_ia32_aesenclast512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_aesni_aesdec : GCCBuiltin<"__builtin_ia32_aesdec128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_aesni_aesdec_256 : GCCBuiltin<"__builtin_ia32_aesdec256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_aesni_aesdec_512 : GCCBuiltin<"__builtin_ia32_aesdec512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_aesni_aesdeclast : GCCBuiltin<"__builtin_ia32_aesdeclast128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_aesni_aesdeclast_256 : GCCBuiltin<"__builtin_ia32_aesdeclast256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_aesni_aesdeclast_512 : GCCBuiltin<"__builtin_ia32_aesdeclast512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_aesni_aeskeygenassist : GCCBuiltin<"__builtin_ia32_aeskeygenassist128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; } // PCLMUL instructions let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_pclmulqdq : GCCBuiltin<"__builtin_ia32_pclmulqdq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_pclmulqdq_256 : GCCBuiltin<"__builtin_ia32_pclmulqdq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_pclmulqdq_512 : GCCBuiltin<"__builtin_ia32_pclmulqdq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; } // Vector pack let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse41_packusdw : GCCBuiltin<"__builtin_ia32_packusdw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; } // Vector insert let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse41_insertps : GCCBuiltin<"__builtin_ia32_insertps128">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; } // Vector blend let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_sse41_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_v16i8_ty], [IntrNoMem]>; def int_x86_sse41_blendvpd : GCCBuiltin<"__builtin_ia32_blendvpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,llvm_v2f64_ty], [IntrNoMem]>; def int_x86_sse41_blendvps : GCCBuiltin<"__builtin_ia32_blendvps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,llvm_v4f32_ty], [IntrNoMem]>; } // Vector dot product let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse41_dppd : GCCBuiltin<"__builtin_ia32_dppd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, Commutative, ImmArg>]>; def int_x86_sse41_dpps : GCCBuiltin<"__builtin_ia32_dpps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, Commutative, ImmArg>]>; } // Vector sum of absolute differences let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse41_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty,llvm_i8_ty], [IntrNoMem, Commutative, ImmArg>]>; } // Test instruction with bitwise comparison. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse41_ptestz : GCCBuiltin<"__builtin_ia32_ptestz128">, Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_sse41_ptestc : GCCBuiltin<"__builtin_ia32_ptestc128">, Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_sse41_ptestnzc : GCCBuiltin<"__builtin_ia32_ptestnzc128">, Intrinsic<[llvm_i32_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // SSE4.2 // Miscellaneous // CRC Instruction let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse42_crc32_32_8 : GCCBuiltin<"__builtin_ia32_crc32qi">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_sse42_crc32_32_16 : GCCBuiltin<"__builtin_ia32_crc32hi">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_sse42_crc32_32_32 : GCCBuiltin<"__builtin_ia32_crc32si">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_sse42_crc32_64_64 : GCCBuiltin<"__builtin_ia32_crc32di">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; } // String/text processing ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_sse42_pcmpistrm128 : GCCBuiltin<"__builtin_ia32_pcmpistrm128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpistri128 : GCCBuiltin<"__builtin_ia32_pcmpistri128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpistria128 : GCCBuiltin<"__builtin_ia32_pcmpistria128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpistric128 : GCCBuiltin<"__builtin_ia32_pcmpistric128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpistrio128 : GCCBuiltin<"__builtin_ia32_pcmpistrio128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpistris128 : GCCBuiltin<"__builtin_ia32_pcmpistris128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpistriz128 : GCCBuiltin<"__builtin_ia32_pcmpistriz128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpestrm128 : GCCBuiltin<"__builtin_ia32_pcmpestrm128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpestri128 : GCCBuiltin<"__builtin_ia32_pcmpestri128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpestria128 : GCCBuiltin<"__builtin_ia32_pcmpestria128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpestric128 : GCCBuiltin<"__builtin_ia32_pcmpestric128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpestrio128 : GCCBuiltin<"__builtin_ia32_pcmpestrio128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpestris128 : GCCBuiltin<"__builtin_ia32_pcmpestris128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_sse42_pcmpestriz128 : GCCBuiltin<"__builtin_ia32_pcmpestriz128">, Intrinsic<[llvm_i32_ty], [llvm_v16i8_ty, llvm_i32_ty, llvm_v16i8_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; } //===----------------------------------------------------------------------===// // SSE4A let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse4a_extrqi : GCCBuiltin<"__builtin_ia32_extrqi">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_sse4a_extrq : GCCBuiltin<"__builtin_ia32_extrq">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_sse4a_insertqi : GCCBuiltin<"__builtin_ia32_insertqi">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i8_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_sse4a_insertq : GCCBuiltin<"__builtin_ia32_insertq">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // AVX // Arithmetic ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_avx_addsub_pd_256 : GCCBuiltin<"__builtin_ia32_addsubpd256">,
    Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_addsub_ps_256 : GCCBuiltin<"__builtin_ia32_addsubps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
def int_x86_avx_max_pd_256 : GCCBuiltin<"__builtin_ia32_maxpd256">,
    Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_max_ps_256 : GCCBuiltin<"__builtin_ia32_maxps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
def int_x86_avx_min_pd_256 : GCCBuiltin<"__builtin_ia32_minpd256">,
    Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_min_ps_256 : GCCBuiltin<"__builtin_ia32_minps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
def int_x86_avx_rsqrt_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrtps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
def int_x86_avx_rcp_ps_256 : GCCBuiltin<"__builtin_ia32_rcpps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
def int_x86_avx_round_pd_256 : GCCBuiltin<"__builtin_ia32_roundpd256">,
    Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty],
              [IntrNoMem, ImmArg<ArgIndex<1>>]>;
def int_x86_avx_round_ps_256 : GCCBuiltin<"__builtin_ia32_roundps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty],
              [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}

// Horizontal ops
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx_hadd_pd_256 : GCCBuiltin<"__builtin_ia32_haddpd256">,
      Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
  def int_x86_avx_hsub_ps_256 : GCCBuiltin<"__builtin_ia32_hsubps256">,
      Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
  def int_x86_avx_hsub_pd_256 : GCCBuiltin<"__builtin_ia32_hsubpd256">,
      Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
  def int_x86_avx_hadd_ps_256 : GCCBuiltin<"__builtin_ia32_haddps256">,
      Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
}

// Vector permutation
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
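  // The vpermilvar intrinsics defined next select elements with a runtime
  // control vector rather than an immediate. A rough C-level sketch,
  // assuming the usual <immintrin.h> wrapper names (not defined here):
  //
  //   #include <immintrin.h>
  //   // Per-128-bit-lane variable permute of 8 floats; lowers to
  //   // llvm.x86.avx.vpermilvar.ps.256.
  //   __m256 rotate_within_lanes(__m256 v) {
  //     const __m256i idx = _mm256_setr_epi32(1, 2, 3, 0, 1, 2, 3, 0);
  //     return _mm256_permutevar_ps(v, idx);
  //   }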
def int_x86_avx_vpermilvar_pd : GCCBuiltin<"__builtin_ia32_vpermilvarpd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx_vpermilvar_ps : GCCBuiltin<"__builtin_ia32_vpermilvarps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx_vpermilvar_pd_256 : GCCBuiltin<"__builtin_ia32_vpermilvarpd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx_vpermilvar_ps_256 : GCCBuiltin<"__builtin_ia32_vpermilvarps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_d_128 : GCCBuiltin<"__builtin_ia32_vpermi2vard128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_d_256 : GCCBuiltin<"__builtin_ia32_vpermi2vard256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_d_512 : GCCBuiltin<"__builtin_ia32_vpermi2vard512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_hi_128 : GCCBuiltin<"__builtin_ia32_vpermi2varhi128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_hi_256 : GCCBuiltin<"__builtin_ia32_vpermi2varhi256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_hi_512 : GCCBuiltin<"__builtin_ia32_vpermi2varhi512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_pd_128 : GCCBuiltin<"__builtin_ia32_vpermi2varpd128">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2i64_ty, llvm_v2f64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_pd_256 : GCCBuiltin<"__builtin_ia32_vpermi2varpd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4i64_ty, llvm_v4f64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_pd_512 : GCCBuiltin<"__builtin_ia32_vpermi2varpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8i64_ty, llvm_v8f64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_ps_128 : GCCBuiltin<"__builtin_ia32_vpermi2varps128">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4i32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_ps_256 : GCCBuiltin<"__builtin_ia32_vpermi2varps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty, llvm_v8f32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_ps_512 : GCCBuiltin<"__builtin_ia32_vpermi2varps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_v16f32_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_q_128 : GCCBuiltin<"__builtin_ia32_vpermi2varq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_q_256 : GCCBuiltin<"__builtin_ia32_vpermi2varq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_q_512 : GCCBuiltin<"__builtin_ia32_vpermi2varq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_qi_128 : GCCBuiltin<"__builtin_ia32_vpermi2varqi128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx512_vpermi2var_qi_256 : GCCBuiltin<"__builtin_ia32_vpermi2varqi256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def 
int_x86_avx512_vpermi2var_qi_512 : GCCBuiltin<"__builtin_ia32_vpermi2varqi512">,
    Intrinsic<[llvm_v64i8_ty],
              [llvm_v64i8_ty, llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>;
def int_x86_avx512_vpermilvar_pd_512 : GCCBuiltin<"__builtin_ia32_vpermilvarpd512">,
    Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8i64_ty], [IntrNoMem]>;
def int_x86_avx512_vpermilvar_ps_512 : GCCBuiltin<"__builtin_ia32_vpermilvarps512">,
    Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16i32_ty], [IntrNoMem]>;
def int_x86_avx512_pshuf_b_512 : GCCBuiltin<"__builtin_ia32_pshufb512">,
    Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>;
}

// GFNI Instructions
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_vgf2p8affineinvqb_128 :
        GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v16qi">,
      Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_vgf2p8affineinvqb_256 :
        GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v32qi">,
      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_vgf2p8affineinvqb_512 :
        GCCBuiltin<"__builtin_ia32_vgf2p8affineinvqb_v64qi">,
      Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_vgf2p8affineqb_128 :
        GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v16qi">,
      Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_vgf2p8affineqb_256 :
        GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v32qi">,
      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_vgf2p8affineqb_512 :
        GCCBuiltin<"__builtin_ia32_vgf2p8affineqb_v64qi">,
      Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_vgf2p8mulb_128 : GCCBuiltin<"__builtin_ia32_vgf2p8mulb_v16qi">,
      Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
  def int_x86_vgf2p8mulb_256 : GCCBuiltin<"__builtin_ia32_vgf2p8mulb_v32qi">,
      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>;
  def int_x86_vgf2p8mulb_512 : GCCBuiltin<"__builtin_ia32_vgf2p8mulb_v64qi">,
      Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>;
}

// Vector blend
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx_blendv_pd_256 : GCCBuiltin<"__builtin_ia32_blendvpd256">,
      Intrinsic<[llvm_v4f64_ty],
                [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
  def int_x86_avx_blendv_ps_256 : GCCBuiltin<"__builtin_ia32_blendvps256">,
      Intrinsic<[llvm_v8f32_ty],
                [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
}

// Vector dot product
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx_dp_ps_256 : GCCBuiltin<"__builtin_ia32_dpps256">,
      Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
                [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
}

// Vector compare
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx_cmp_pd_256 :
      Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_avx_cmp_ps_256 :
      Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
}

// Vector convert
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
def int_x86_avx_cvt_pd2_ps_256 : GCCBuiltin<"__builtin_ia32_cvtpd2ps256">,
    Intrinsic<[llvm_v4f32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256">,
    Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
def int_x86_avx_cvtt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2dq256">,
    Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvt_pd2dq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2dq256">,
    Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_avx_cvtt_ps2dq_256 : GCCBuiltin<"__builtin_ia32_cvttps2dq256">,
    Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
}

// Vector bit test
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx_vtestz_pd : GCCBuiltin<"__builtin_ia32_vtestzpd">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
  def int_x86_avx_vtestc_pd : GCCBuiltin<"__builtin_ia32_vtestcpd">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
  def int_x86_avx_vtestnzc_pd : GCCBuiltin<"__builtin_ia32_vtestnzcpd">,
      Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
  def int_x86_avx_vtestz_ps : GCCBuiltin<"__builtin_ia32_vtestzps">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
  def int_x86_avx_vtestc_ps : GCCBuiltin<"__builtin_ia32_vtestcps">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
  def int_x86_avx_vtestnzc_ps : GCCBuiltin<"__builtin_ia32_vtestnzcps">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
  def int_x86_avx_vtestz_pd_256 : GCCBuiltin<"__builtin_ia32_vtestzpd256">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
  def int_x86_avx_vtestc_pd_256 : GCCBuiltin<"__builtin_ia32_vtestcpd256">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
  def int_x86_avx_vtestnzc_pd_256 : GCCBuiltin<"__builtin_ia32_vtestnzcpd256">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
  def int_x86_avx_vtestz_ps_256 : GCCBuiltin<"__builtin_ia32_vtestzps256">,
      Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
  def int_x86_avx_vtestc_ps_256 : GCCBuiltin<"__builtin_ia32_vtestcps256">,
      Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
  def int_x86_avx_vtestnzc_ps_256 : GCCBuiltin<"__builtin_ia32_vtestnzcps256">,
      Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
  def int_x86_avx_ptestz_256 : GCCBuiltin<"__builtin_ia32_ptestz256">,
      Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>;
  def int_x86_avx_ptestc_256 : GCCBuiltin<"__builtin_ia32_ptestc256">,
      Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>;
  def int_x86_avx_ptestnzc_256 : GCCBuiltin<"__builtin_ia32_ptestnzc256">,
      Intrinsic<[llvm_i32_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>;
  def int_x86_avx512_fpclass_pd_128 :
      Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
  def int_x86_avx512_fpclass_pd_256 :
      Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
  def int_x86_avx512_fpclass_pd_512 :
      Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
  def int_x86_avx512_fpclass_ps_128 :
      Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
  def int_x86_avx512_fpclass_ps_256 :
      Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
  def int_x86_avx512_fpclass_ps_512 :
      Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
  def int_x86_avx512_mask_fpclass_sd : GCCBuiltin<"__builtin_ia32_fpclasssd_mask">,
      Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
  def int_x86_avx512_mask_fpclass_ss : GCCBuiltin<"__builtin_ia32_fpclassss_mask">,
      Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<1>>]>;
}

// Vector extract sign mask
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx_movmsk_pd_256 : GCCBuiltin<"__builtin_ia32_movmskpd256">,
      Intrinsic<[llvm_i32_ty], [llvm_v4f64_ty], [IntrNoMem]>;
  def int_x86_avx_movmsk_ps_256 : GCCBuiltin<"__builtin_ia32_movmskps256">,
      Intrinsic<[llvm_i32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
}

// Vector zero
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx_vzeroall : GCCBuiltin<"__builtin_ia32_vzeroall">,
      Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
  def int_x86_avx_vzeroupper : GCCBuiltin<"__builtin_ia32_vzeroupper">,
      Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>;
}

// SIMD load ops
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx_ldu_dq_256 : GCCBuiltin<"__builtin_ia32_lddqu256">,
      Intrinsic<[llvm_v32i8_ty], [llvm_ptr_ty], [IntrReadMem]>;
}

// Conditional load ops
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx_maskload_pd : GCCBuiltin<"__builtin_ia32_maskloadpd">,
      Intrinsic<[llvm_v2f64_ty], [llvm_ptr_ty, llvm_v2i64_ty],
                [IntrReadMem, IntrArgMemOnly]>;
  def int_x86_avx_maskload_ps : GCCBuiltin<"__builtin_ia32_maskloadps">,
      Intrinsic<[llvm_v4f32_ty], [llvm_ptr_ty, llvm_v4i32_ty],
                [IntrReadMem, IntrArgMemOnly]>;
  def int_x86_avx_maskload_pd_256 : GCCBuiltin<"__builtin_ia32_maskloadpd256">,
      Intrinsic<[llvm_v4f64_ty], [llvm_ptr_ty, llvm_v4i64_ty],
                [IntrReadMem, IntrArgMemOnly]>;
  def int_x86_avx_maskload_ps_256 : GCCBuiltin<"__builtin_ia32_maskloadps256">,
      Intrinsic<[llvm_v8f32_ty], [llvm_ptr_ty, llvm_v8i32_ty],
                [IntrReadMem, IntrArgMemOnly]>;
}

// Conditional store ops
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx_maskstore_pd : GCCBuiltin<"__builtin_ia32_maskstorepd">,
      Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2f64_ty],
                [IntrArgMemOnly]>;
  def int_x86_avx_maskstore_ps : GCCBuiltin<"__builtin_ia32_maskstoreps">,
      Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f32_ty],
                [IntrArgMemOnly]>;
  def int_x86_avx_maskstore_pd_256 : GCCBuiltin<"__builtin_ia32_maskstorepd256">,
      Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f64_ty],
                [IntrArgMemOnly]>;
  def int_x86_avx_maskstore_ps_256 : GCCBuiltin<"__builtin_ia32_maskstoreps256">,
      Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f32_ty],
                [IntrArgMemOnly]>;
}

// BITALG bits shuffle
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx512_vpshufbitqmb_128 :
      Intrinsic<[llvm_v16i1_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>;
  def int_x86_avx512_vpshufbitqmb_256 :
      Intrinsic<[llvm_v32i1_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>;
  def int_x86_avx512_vpshufbitqmb_512 :
      Intrinsic<[llvm_v64i1_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>;
}

//===----------------------------------------------------------------------===//
// AVX2

// Integer arithmetic ops.
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
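  // Several of the AVX2 arithmetic intrinsics below (pmadd.wd, psad.bw, the
  // averages) are typically reached from C through the usual <immintrin.h>
  // wrappers rather than plain operators. A rough sketch of one such use,
  // assuming those wrapper names (not defined in this file):
  //
  //   #include <immintrin.h>
  //   // Multiplies 16-bit pairs and adds adjacent products into 32-bit
  //   // sums; lowers to llvm.x86.avx2.pmadd.wd.
  //   __m256i dot_pairs(__m256i a, __m256i b) {
  //     return _mm256_madd_epi16(a, b);
  //   }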
def int_x86_avx2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">, Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem, Commutative]>; } // Integer shift ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_psll_w : GCCBuiltin<"__builtin_ia32_psllw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_psll_d : GCCBuiltin<"__builtin_ia32_pslld256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx2_psll_q : GCCBuiltin<"__builtin_ia32_psllq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx2_psrl_w : GCCBuiltin<"__builtin_ia32_psrlw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_psrl_d : GCCBuiltin<"__builtin_ia32_psrld256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx2_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx2_psra_w : GCCBuiltin<"__builtin_ia32_psraw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx2_psra_d : GCCBuiltin<"__builtin_ia32_psrad256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v4i32_ty], [IntrNoMem]>; // Oddly these don't require an immediate due to a gcc compatibility issue. 
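// Concretely: because the builtin signature carries no immediate constraint,
// a call like the rough sketch below is accepted even though the underlying
// instruction has a shift-by-immediate form; the backend is then expected to
// pick the register form or fold a constant count as appropriate.
//
//   #include <immintrin.h>
//   // `count` need not be a compile-time constant here.
//   __m256i shl16(__m256i v, int count) {
//     return _mm256_slli_epi16(v, count);
//   }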
def int_x86_avx2_pslli_w : GCCBuiltin<"__builtin_ia32_psllwi256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx2_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx2_pslli_q : GCCBuiltin<"__builtin_ia32_psllqi256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx2_psrli_w : GCCBuiltin<"__builtin_ia32_psrlwi256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx2_psrli_d : GCCBuiltin<"__builtin_ia32_psrldi256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx2_psrli_q : GCCBuiltin<"__builtin_ia32_psrlqi256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx2_psrai_w : GCCBuiltin<"__builtin_ia32_psrawi256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx2_psrai_d : GCCBuiltin<"__builtin_ia32_psradi256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_psra_q_128 : GCCBuiltin<"__builtin_ia32_psraq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_psra_q_256 : GCCBuiltin<"__builtin_ia32_psraq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; // Oddly these don't require an immediate due to a gcc compatibility issue. def int_x86_avx512_psrai_q_128 : GCCBuiltin<"__builtin_ia32_psraqi128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_psrai_q_256 : GCCBuiltin<"__builtin_ia32_psraqi256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_psll_w_512 : GCCBuiltin<"__builtin_ia32_psllw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_psll_d_512 : GCCBuiltin<"__builtin_ia32_pslld512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_psll_q_512 : GCCBuiltin<"__builtin_ia32_psllq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_psrl_w_512 : GCCBuiltin<"__builtin_ia32_psrlw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_psrl_d_512 : GCCBuiltin<"__builtin_ia32_psrld512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_psrl_q_512 : GCCBuiltin<"__builtin_ia32_psrlq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_psra_w_512 : GCCBuiltin<"__builtin_ia32_psraw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_psra_d_512 : GCCBuiltin<"__builtin_ia32_psrad512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_psra_q_512 : GCCBuiltin<"__builtin_ia32_psraq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v2i64_ty], [IntrNoMem]>; // Oddly these don't require an immediate due to a gcc compatibility issue. 
def int_x86_avx512_pslli_w_512 : GCCBuiltin<"__builtin_ia32_psllwi512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_pslli_d_512 : GCCBuiltin<"__builtin_ia32_pslldi512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_pslli_q_512 : GCCBuiltin<"__builtin_ia32_psllqi512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_psrli_w_512 : GCCBuiltin<"__builtin_ia32_psrlwi512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_psrli_d_512 : GCCBuiltin<"__builtin_ia32_psrldi512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_psrli_q_512 : GCCBuiltin<"__builtin_ia32_psrlqi512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_psrai_w_512 : GCCBuiltin<"__builtin_ia32_psrawi512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_psrai_d_512 : GCCBuiltin<"__builtin_ia32_psradi512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_psrai_q_512 : GCCBuiltin<"__builtin_ia32_psraqi512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_pmultishift_qb_128: GCCBuiltin<"__builtin_ia32_vpmultishiftqb128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx512_pmultishift_qb_256: GCCBuiltin<"__builtin_ia32_vpmultishiftqb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx512_pmultishift_qb_512: GCCBuiltin<"__builtin_ia32_vpmultishiftqb512">, Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; } // Pack ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_packsswb : GCCBuiltin<"__builtin_ia32_packsswb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_packssdw : GCCBuiltin<"__builtin_ia32_packssdw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx2_packuswb : GCCBuiltin<"__builtin_ia32_packuswb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_packusdw : GCCBuiltin<"__builtin_ia32_packusdw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; } // Horizontal arithmetic ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_avx2_phadd_w : GCCBuiltin<"__builtin_ia32_phaddw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_phadd_d : GCCBuiltin<"__builtin_ia32_phaddd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx2_phadd_sw : GCCBuiltin<"__builtin_ia32_phaddsw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_phsub_w : GCCBuiltin<"__builtin_ia32_phsubw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_phsub_d : GCCBuiltin<"__builtin_ia32_phsubd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx2_phsub_sw : GCCBuiltin<"__builtin_ia32_phsubsw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_pmadd_ub_sw : GCCBuiltin<"__builtin_ia32_pmaddubsw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; } // Sign ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_psign_b : GCCBuiltin<"__builtin_ia32_psignb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx2_psign_w : GCCBuiltin<"__builtin_ia32_psignw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx2_psign_d : GCCBuiltin<"__builtin_ia32_psignd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; } // Packed multiply high with round and scale let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_pmul_hr_sw : GCCBuiltin<"__builtin_ia32_pmulhrsw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem, Commutative]>; def int_x86_avx512_pmul_hr_sw_512 : GCCBuiltin<"__builtin_ia32_pmulhrsw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem, Commutative]>; } // Vector blend let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_pblendvb : GCCBuiltin<"__builtin_ia32_pblendvb256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; } // Vector permutation let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_permd : GCCBuiltin<"__builtin_ia32_permvarsi256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx2_permps : GCCBuiltin<"__builtin_ia32_permvarsf256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty], [IntrNoMem]>; } // Conditional load ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_maskload_d : GCCBuiltin<"__builtin_ia32_maskloadd">, Intrinsic<[llvm_v4i32_ty], [llvm_ptr_ty, llvm_v4i32_ty], [IntrReadMem, IntrArgMemOnly]>; def int_x86_avx2_maskload_q : GCCBuiltin<"__builtin_ia32_maskloadq">, Intrinsic<[llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty], [IntrReadMem, IntrArgMemOnly]>; def int_x86_avx2_maskload_d_256 : GCCBuiltin<"__builtin_ia32_maskloadd256">, Intrinsic<[llvm_v8i32_ty], [llvm_ptr_ty, llvm_v8i32_ty], [IntrReadMem, IntrArgMemOnly]>; def int_x86_avx2_maskload_q_256 : GCCBuiltin<"__builtin_ia32_maskloadq256">, Intrinsic<[llvm_v4i64_ty], [llvm_ptr_ty, llvm_v4i64_ty], [IntrReadMem, IntrArgMemOnly]>; } // Conditional store ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
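  // The maskstore intrinsics that follow write only the elements whose mask
  // element has its sign bit set, leaving other memory untouched. A rough
  // C-level sketch, assuming the usual <immintrin.h> wrapper names:
  //
  //   #include <immintrin.h>
  //   // Stores v[i] to dst[i] only where mask[i] < 0; lowers to
  //   // llvm.x86.avx2.maskstore.d.256.
  //   void store_selected(int *dst, __m256i mask, __m256i v) {
  //     _mm256_maskstore_epi32(dst, mask, v);
  //   }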
def int_x86_avx2_maskstore_d : GCCBuiltin<"__builtin_ia32_maskstored">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrArgMemOnly]>; def int_x86_avx2_maskstore_q : GCCBuiltin<"__builtin_ia32_maskstoreq">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrArgMemOnly]>; def int_x86_avx2_maskstore_d_256 : GCCBuiltin<"__builtin_ia32_maskstored256">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrArgMemOnly]>; def int_x86_avx2_maskstore_q_256 : GCCBuiltin<"__builtin_ia32_maskstoreq256">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrArgMemOnly]>; } // Variable bit shift ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx2_psllv_d : GCCBuiltin<"__builtin_ia32_psllv4si">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx2_psllv_d_256 : GCCBuiltin<"__builtin_ia32_psllv8si">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx2_psllv_q : GCCBuiltin<"__builtin_ia32_psllv2di">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx2_psllv_q_256 : GCCBuiltin<"__builtin_ia32_psllv4di">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_psllv_d_512 : GCCBuiltin<"__builtin_ia32_psllv16si">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_psllv_q_512 : GCCBuiltin<"__builtin_ia32_psllv8di">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx2_psrlv_d : GCCBuiltin<"__builtin_ia32_psrlv4si">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx2_psrlv_d_256 : GCCBuiltin<"__builtin_ia32_psrlv8si">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx2_psrlv_q : GCCBuiltin<"__builtin_ia32_psrlv2di">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx2_psrlv_q_256 : GCCBuiltin<"__builtin_ia32_psrlv4di">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_psrlv_d_512 : GCCBuiltin<"__builtin_ia32_psrlv16si">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_psrlv_q_512 : GCCBuiltin<"__builtin_ia32_psrlv8di">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx2_psrav_d : GCCBuiltin<"__builtin_ia32_psrav4si">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx2_psrav_d_256 : GCCBuiltin<"__builtin_ia32_psrav8si">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_psrav_d_512 : GCCBuiltin<"__builtin_ia32_psrav16si">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_psrav_q_128 : GCCBuiltin<"__builtin_ia32_psravq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_psrav_q_256 : GCCBuiltin<"__builtin_ia32_psravq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_psrav_q_512 : GCCBuiltin<"__builtin_ia32_psrav8di">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx512_psllv_w_128 : GCCBuiltin<"__builtin_ia32_psllv8hi">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_psllv_w_256 : GCCBuiltin<"__builtin_ia32_psllv16hi">, Intrinsic<[llvm_v16i16_ty], 
    [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>;
def int_x86_avx512_psllv_w_512 : GCCBuiltin<"__builtin_ia32_psllv32hi">,
    Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>;
def int_x86_avx512_psrlv_w_128 : GCCBuiltin<"__builtin_ia32_psrlv8hi">,
    Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_avx512_psrlv_w_256 : GCCBuiltin<"__builtin_ia32_psrlv16hi">,
    Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>;
def int_x86_avx512_psrlv_w_512 : GCCBuiltin<"__builtin_ia32_psrlv32hi">,
    Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>;
def int_x86_avx512_psrav_w_128 : GCCBuiltin<"__builtin_ia32_psrav8hi">,
    Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_avx512_psrav_w_256 : GCCBuiltin<"__builtin_ia32_psrav16hi">,
    Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>;
def int_x86_avx512_psrav_w_512 : GCCBuiltin<"__builtin_ia32_psrav32hi">,
    Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>;
}

// Gather ops
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  // NOTE: These can't be ArgMemOnly because you can put the address completely
  // in the index register.
  def int_x86_avx2_gather_d_pd : GCCBuiltin<"__builtin_ia32_gatherd_pd">,
      Intrinsic<[llvm_v2f64_ty],
                [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2f64_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_d_pd_256 : GCCBuiltin<"__builtin_ia32_gatherd_pd256">,
      Intrinsic<[llvm_v4f64_ty],
                [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f64_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_q_pd : GCCBuiltin<"__builtin_ia32_gatherq_pd">,
      Intrinsic<[llvm_v2f64_ty],
                [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2f64_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_q_pd_256 : GCCBuiltin<"__builtin_ia32_gatherq_pd256">,
      Intrinsic<[llvm_v4f64_ty],
                [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f64_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_d_ps : GCCBuiltin<"__builtin_ia32_gatherd_ps">,
      Intrinsic<[llvm_v4f32_ty],
                [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4f32_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_d_ps_256 : GCCBuiltin<"__builtin_ia32_gatherd_ps256">,
      Intrinsic<[llvm_v8f32_ty],
                [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8f32_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_q_ps : GCCBuiltin<"__builtin_ia32_gatherq_ps">,
      Intrinsic<[llvm_v4f32_ty],
                [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4f32_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_q_ps_256 : GCCBuiltin<"__builtin_ia32_gatherq_ps256">,
      Intrinsic<[llvm_v4f32_ty],
                [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4f32_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_d_q : GCCBuiltin<"__builtin_ia32_gatherd_q">,
      Intrinsic<[llvm_v2i64_ty],
                [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i64_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_d_q_256 : GCCBuiltin<"__builtin_ia32_gatherd_q256">,
      Intrinsic<[llvm_v4i64_ty],
                [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i64_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_q_q : GCCBuiltin<"__builtin_ia32_gatherq_q">,
      Intrinsic<[llvm_v2i64_ty],
                [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_q_q_256 : GCCBuiltin<"__builtin_ia32_gatherq_q256">,
      Intrinsic<[llvm_v4i64_ty],
                [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i64_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_d_d : GCCBuiltin<"__builtin_ia32_gatherd_d">,
      Intrinsic<[llvm_v4i32_ty],
                [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i32_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_d_d_256 : GCCBuiltin<"__builtin_ia32_gatherd_d256">,
      Intrinsic<[llvm_v8i32_ty],
                [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i32_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_q_d : GCCBuiltin<"__builtin_ia32_gatherq_d">,
      Intrinsic<[llvm_v4i32_ty],
                [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v4i32_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx2_gather_q_d_256 : GCCBuiltin<"__builtin_ia32_gatherq_d256">,
      Intrinsic<[llvm_v4i32_ty],
                [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i32_ty,
                 llvm_i8_ty],
                [IntrReadMem, ImmArg<ArgIndex<4>>]>;
}

// Misc.
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_avx2_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb256">,
      Intrinsic<[llvm_i32_ty], [llvm_v32i8_ty], [IntrNoMem]>;
  def int_x86_avx2_pshuf_b : GCCBuiltin<"__builtin_ia32_pshufb256">,
      Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>;
  def int_x86_avx2_mpsadbw : GCCBuiltin<"__builtin_ia32_mpsadbw256">,
      Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i8_ty],
                [IntrNoMem, Commutative, ImmArg<ArgIndex<2>>]>;
}

//===----------------------------------------------------------------------===//
// FMA3 and FMA4

let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_fma_vfmaddsub_ps : GCCBuiltin<"__builtin_ia32_vfmaddsubps">,
      Intrinsic<[llvm_v4f32_ty],
                [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>;
  def int_x86_fma_vfmaddsub_pd : GCCBuiltin<"__builtin_ia32_vfmaddsubpd">,
      Intrinsic<[llvm_v2f64_ty],
                [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>;
  def int_x86_fma_vfmaddsub_ps_256 : GCCBuiltin<"__builtin_ia32_vfmaddsubps256">,
      Intrinsic<[llvm_v8f32_ty],
                [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>;
  def int_x86_fma_vfmaddsub_pd_256 : GCCBuiltin<"__builtin_ia32_vfmaddsubpd256">,
      Intrinsic<[llvm_v4f64_ty],
                [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty], [IntrNoMem]>;
  def int_x86_avx512_vfmadd_pd_512 :
      Intrinsic<[llvm_v8f64_ty],
                [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
  def int_x86_avx512_vfmadd_ps_512 :
      Intrinsic<[llvm_v16f32_ty],
                [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
  def int_x86_avx512_vfmaddsub_pd_512 :
      Intrinsic<[llvm_v8f64_ty],
                [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
  def int_x86_avx512_vfmaddsub_ps_512 :
      Intrinsic<[llvm_v16f32_ty],
                [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
  def int_x86_avx512_vfmadd_f64 :
      Intrinsic<[llvm_double_ty],
                [llvm_double_ty, llvm_double_ty, llvm_double_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
  def int_x86_avx512_vfmadd_f32 :
      Intrinsic<[llvm_float_ty],
                [llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i32_ty],
                [IntrNoMem, ImmArg<ArgIndex<3>>]>;
  def int_x86_avx512_vpmadd52h_uq_128 : GCCBuiltin<"__builtin_ia32_vpmadd52huq128">,
      Intrinsic<[llvm_v2i64_ty],
                [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
  def int_x86_avx512_vpmadd52l_uq_128 : GCCBuiltin<"__builtin_ia32_vpmadd52luq128">,
      Intrinsic<[llvm_v2i64_ty],
                [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>;
  def int_x86_avx512_vpmadd52h_uq_256 :
GCCBuiltin<"__builtin_ia32_vpmadd52huq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_vpmadd52l_uq_256 : GCCBuiltin<"__builtin_ia32_vpmadd52luq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_vpmadd52h_uq_512 : GCCBuiltin<"__builtin_ia32_vpmadd52huq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx512_vpmadd52l_uq_512 : GCCBuiltin<"__builtin_ia32_vpmadd52luq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; } // VNNI let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_vpdpbusd_128 : GCCBuiltin<"__builtin_ia32_vpdpbusd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusd_256 : GCCBuiltin<"__builtin_ia32_vpdpbusd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusd_512 : GCCBuiltin<"__builtin_ia32_vpdpbusd512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusds_128 : GCCBuiltin<"__builtin_ia32_vpdpbusds128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusds_256 : GCCBuiltin<"__builtin_ia32_vpdpbusds256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpbusds_512 : GCCBuiltin<"__builtin_ia32_vpdpbusds512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_128 : GCCBuiltin<"__builtin_ia32_vpdpwssd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_256 : GCCBuiltin<"__builtin_ia32_vpdpwssd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssd_512 : GCCBuiltin<"__builtin_ia32_vpdpwssd512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_128 : GCCBuiltin<"__builtin_ia32_vpdpwssds128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_256 : GCCBuiltin<"__builtin_ia32_vpdpwssds256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_vpdpwssds_512 : GCCBuiltin<"__builtin_ia32_vpdpwssds512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // XOP let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_xop_vpermil2pd : GCCBuiltin<"__builtin_ia32_vpermil2pd">,
    Intrinsic<[llvm_v2f64_ty],
              [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg<ArgIndex<3>>]>;
def int_x86_xop_vpermil2pd_256 : GCCBuiltin<"__builtin_ia32_vpermil2pd256">,
    Intrinsic<[llvm_v4f64_ty],
              [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg<ArgIndex<3>>]>;
def int_x86_xop_vpermil2ps : GCCBuiltin<"__builtin_ia32_vpermil2ps">,
    Intrinsic<[llvm_v4f32_ty],
              [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg<ArgIndex<3>>]>;
def int_x86_xop_vpermil2ps_256 : GCCBuiltin<"__builtin_ia32_vpermil2ps256">,
    Intrinsic<[llvm_v8f32_ty],
              [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty],
              [IntrNoMem, ImmArg<ArgIndex<3>>]>;
def int_x86_xop_vfrcz_pd : GCCBuiltin<"__builtin_ia32_vfrczpd">,
    Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_xop_vfrcz_ps : GCCBuiltin<"__builtin_ia32_vfrczps">,
    Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_xop_vfrcz_sd : GCCBuiltin<"__builtin_ia32_vfrczsd">,
    Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty], [IntrNoMem]>;
def int_x86_xop_vfrcz_ss : GCCBuiltin<"__builtin_ia32_vfrczss">,
    Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
def int_x86_xop_vfrcz_pd_256 : GCCBuiltin<"__builtin_ia32_vfrczpd256">,
    Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>;
def int_x86_xop_vfrcz_ps_256 : GCCBuiltin<"__builtin_ia32_vfrczps256">,
    Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
def int_x86_xop_vphaddbd : GCCBuiltin<"__builtin_ia32_vphaddbd">,
    Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_xop_vphaddbq : GCCBuiltin<"__builtin_ia32_vphaddbq">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_xop_vphaddbw : GCCBuiltin<"__builtin_ia32_vphaddbw">,
    Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_xop_vphadddq : GCCBuiltin<"__builtin_ia32_vphadddq">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
def int_x86_xop_vphaddubd : GCCBuiltin<"__builtin_ia32_vphaddubd">,
    Intrinsic<[llvm_v4i32_ty], [llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_xop_vphaddubq : GCCBuiltin<"__builtin_ia32_vphaddubq">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_xop_vphaddubw : GCCBuiltin<"__builtin_ia32_vphaddubw">,
    Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_xop_vphaddudq : GCCBuiltin<"__builtin_ia32_vphaddudq">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
def int_x86_xop_vphadduwd : GCCBuiltin<"__builtin_ia32_vphadduwd">,
    Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_xop_vphadduwq : GCCBuiltin<"__builtin_ia32_vphadduwq">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_xop_vphaddwd : GCCBuiltin<"__builtin_ia32_vphaddwd">,
    Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_xop_vphaddwq : GCCBuiltin<"__builtin_ia32_vphaddwq">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_xop_vphsubbw : GCCBuiltin<"__builtin_ia32_vphsubbw">,
    Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty], [IntrNoMem]>;
def int_x86_xop_vphsubdq : GCCBuiltin<"__builtin_ia32_vphsubdq">,
    Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty], [IntrNoMem]>;
def int_x86_xop_vphsubwd : GCCBuiltin<"__builtin_ia32_vphsubwd">,
    Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty], [IntrNoMem]>;
def int_x86_xop_vpmacsdd : GCCBuiltin<"__builtin_ia32_vpmacsdd">,
    Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty],
              [IntrNoMem, Commutative]>;
def int_x86_xop_vpmacsdqh :
GCCBuiltin<"__builtin_ia32_vpmacsdqh">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmacsdql : GCCBuiltin<"__builtin_ia32_vpmacsdql">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmacssdd : GCCBuiltin<"__builtin_ia32_vpmacssdd">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmacssdqh : GCCBuiltin<"__builtin_ia32_vpmacssdqh">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmacssdql : GCCBuiltin<"__builtin_ia32_vpmacssdql">, Intrinsic<[llvm_v2i64_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v2i64_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmacsswd : GCCBuiltin<"__builtin_ia32_vpmacsswd">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmacssww : GCCBuiltin<"__builtin_ia32_vpmacssww">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmacswd : GCCBuiltin<"__builtin_ia32_vpmacswd">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmacsww : GCCBuiltin<"__builtin_ia32_vpmacsww">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmadcsswd : GCCBuiltin<"__builtin_ia32_vpmadcsswd">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpmadcswd : GCCBuiltin<"__builtin_ia32_vpmadcswd">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty, llvm_v4i32_ty], [IntrNoMem, Commutative]>; def int_x86_xop_vpperm : GCCBuiltin<"__builtin_ia32_vpperm">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_xop_vpshab : GCCBuiltin<"__builtin_ia32_vpshab">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_xop_vpshad : GCCBuiltin<"__builtin_ia32_vpshad">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_xop_vpshaq : GCCBuiltin<"__builtin_ia32_vpshaq">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_xop_vpshaw : GCCBuiltin<"__builtin_ia32_vpshaw">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_xop_vpshlb : GCCBuiltin<"__builtin_ia32_vpshlb">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_xop_vpshld : GCCBuiltin<"__builtin_ia32_vpshld">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_xop_vpshlq : GCCBuiltin<"__builtin_ia32_vpshlq">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_xop_vpshlw : GCCBuiltin<"__builtin_ia32_vpshlw">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // LWP let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_llwpcb : GCCBuiltin<"__builtin_ia32_llwpcb">,
    Intrinsic<[], [llvm_ptr_ty], []>;
def int_x86_slwpcb : GCCBuiltin<"__builtin_ia32_slwpcb">,
    Intrinsic<[llvm_ptr_ty], [], []>;
def int_x86_lwpins32 : GCCBuiltin<"__builtin_ia32_lwpins32">,
    Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
              [ImmArg<ArgIndex<2>>]>;
def int_x86_lwpins64 : GCCBuiltin<"__builtin_ia32_lwpins64">,
    Intrinsic<[llvm_i8_ty], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty],
              [ImmArg<ArgIndex<2>>]>;
def int_x86_lwpval32 : GCCBuiltin<"__builtin_ia32_lwpval32">,
    Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
              [ImmArg<ArgIndex<2>>]>;
def int_x86_lwpval64 : GCCBuiltin<"__builtin_ia32_lwpval64">,
    Intrinsic<[], [llvm_i64_ty, llvm_i32_ty, llvm_i32_ty],
              [ImmArg<ArgIndex<2>>]>;
}

//===----------------------------------------------------------------------===//
// MMX

// Empty MMX state op.
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  def int_x86_mmx_emms : GCCBuiltin<"__builtin_ia32_emms">,
      Intrinsic<[], [], []>;
  def int_x86_mmx_femms : GCCBuiltin<"__builtin_ia32_femms">,
      Intrinsic<[], [], []>;
}

// Integer arithmetic ops.
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
  // Addition
  def int_x86_mmx_padd_b : GCCBuiltin<"__builtin_ia32_paddb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                [IntrNoMem, Commutative]>;
  def int_x86_mmx_padd_w : GCCBuiltin<"__builtin_ia32_paddw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                [IntrNoMem, Commutative]>;
  def int_x86_mmx_padd_d : GCCBuiltin<"__builtin_ia32_paddd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                [IntrNoMem, Commutative]>;
  def int_x86_mmx_padd_q : GCCBuiltin<"__builtin_ia32_paddq">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                [IntrNoMem, Commutative]>;
  def int_x86_mmx_padds_b : GCCBuiltin<"__builtin_ia32_paddsb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                [IntrNoMem, Commutative]>;
  def int_x86_mmx_padds_w : GCCBuiltin<"__builtin_ia32_paddsw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                [IntrNoMem, Commutative]>;
  def int_x86_mmx_paddus_b : GCCBuiltin<"__builtin_ia32_paddusb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                [IntrNoMem, Commutative]>;
  def int_x86_mmx_paddus_w : GCCBuiltin<"__builtin_ia32_paddusw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty],
                [IntrNoMem, Commutative]>;

  // Subtraction
  def int_x86_mmx_psub_b : GCCBuiltin<"__builtin_ia32_psubb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
  def int_x86_mmx_psub_w : GCCBuiltin<"__builtin_ia32_psubw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
  def int_x86_mmx_psub_d : GCCBuiltin<"__builtin_ia32_psubd">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
  def int_x86_mmx_psub_q : GCCBuiltin<"__builtin_ia32_psubq">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
  def int_x86_mmx_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
  def int_x86_mmx_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
  def int_x86_mmx_psubus_b : GCCBuiltin<"__builtin_ia32_psubusb">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;
  def int_x86_mmx_psubus_w : GCCBuiltin<"__builtin_ia32_psubusw">,
      Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>;

  // Multiplication
  def
int_x86_mmx_pmulh_w : GCCBuiltin<"__builtin_ia32_pmulhw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pmull_w : GCCBuiltin<"__builtin_ia32_pmullw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pmulu_dq : GCCBuiltin<"__builtin_ia32_pmuludq">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; // Bitwise operations def int_x86_mmx_pand : GCCBuiltin<"__builtin_ia32_pand">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pandn : GCCBuiltin<"__builtin_ia32_pandn">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_por : GCCBuiltin<"__builtin_ia32_por">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pxor : GCCBuiltin<"__builtin_ia32_pxor">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; // Averages def int_x86_mmx_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; // Maximum def int_x86_mmx_pmaxu_b : GCCBuiltin<"__builtin_ia32_pmaxub">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pmaxs_w : GCCBuiltin<"__builtin_ia32_pmaxsw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; // Minimum def int_x86_mmx_pminu_b : GCCBuiltin<"__builtin_ia32_pminub">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pmins_w : GCCBuiltin<"__builtin_ia32_pminsw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; // Packed sum of absolute differences def int_x86_mmx_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; } // Integer shift ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
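// As with the AVX2 and AVX-512 forms above, the MMX shift-by-immediate
// intrinsics below take a plain i32 count for gcc compatibility. A rough
// C-level sketch, assuming the usual <mmintrin.h> wrapper names:
//
//   #include <mmintrin.h>
//   // Shifts each 16-bit element left; lowers to llvm.x86.mmx.pslli.w.
//   __m64 shl16_mmx(__m64 v, int count) {
//     return _mm_slli_pi16(v, count);
//   }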
// Shift left logical def int_x86_mmx_psll_w : GCCBuiltin<"__builtin_ia32_psllw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_psll_d : GCCBuiltin<"__builtin_ia32_pslld">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_psll_q : GCCBuiltin<"__builtin_ia32_psllq">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_psrl_w : GCCBuiltin<"__builtin_ia32_psrlw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_psrl_d : GCCBuiltin<"__builtin_ia32_psrld">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_psrl_q : GCCBuiltin<"__builtin_ia32_psrlq">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_psra_w : GCCBuiltin<"__builtin_ia32_psraw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_psra_d : GCCBuiltin<"__builtin_ia32_psrad">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; // Oddly these don't require an immediate due to a gcc compatibility issue. def int_x86_mmx_pslli_w : GCCBuiltin<"__builtin_ia32_psllwi">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_mmx_pslli_d : GCCBuiltin<"__builtin_ia32_pslldi">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_mmx_pslli_q : GCCBuiltin<"__builtin_ia32_psllqi">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_mmx_psrli_w : GCCBuiltin<"__builtin_ia32_psrlwi">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_mmx_psrli_d : GCCBuiltin<"__builtin_ia32_psrldi">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_mmx_psrli_q : GCCBuiltin<"__builtin_ia32_psrlqi">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_mmx_psrai_w : GCCBuiltin<"__builtin_ia32_psrawi">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_mmx_psrai_d : GCCBuiltin<"__builtin_ia32_psradi">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem]>; } // Permute let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
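// The permvar intrinsics below permute whole elements across the full vector
// using a runtime index vector. A rough C-level sketch, assuming the usual
// <immintrin.h> wrapper names and argument order:
//
//   #include <immintrin.h>
//   // Reverses the 16 floats of a zmm register; expected to lower to
//   // llvm.x86.avx512.permvar.sf.512.
//   __m512 reverse_floats(__m512 v) {
//     const __m512i idx = _mm512_setr_epi32(15, 14, 13, 12, 11, 10, 9, 8,
//                                           7, 6, 5, 4, 3, 2, 1, 0);
//     return _mm512_permutexvar_ps(idx, v);
//   }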
def int_x86_avx512_permvar_df_256 : GCCBuiltin<"__builtin_ia32_permvardf256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_permvar_df_512 : GCCBuiltin<"__builtin_ia32_permvardf512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx512_permvar_di_256 : GCCBuiltin<"__builtin_ia32_permvardi256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_permvar_di_512 : GCCBuiltin<"__builtin_ia32_permvardi512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx512_permvar_hi_128 : GCCBuiltin<"__builtin_ia32_permvarhi128">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem]>; def int_x86_avx512_permvar_hi_256 : GCCBuiltin<"__builtin_ia32_permvarhi256">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty, llvm_v16i16_ty], [IntrNoMem]>; def int_x86_avx512_permvar_hi_512 : GCCBuiltin<"__builtin_ia32_permvarhi512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx512_permvar_qi_128 : GCCBuiltin<"__builtin_ia32_permvarqi128">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem]>; def int_x86_avx512_permvar_qi_256 : GCCBuiltin<"__builtin_ia32_permvarqi256">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty, llvm_v32i8_ty], [IntrNoMem]>; def int_x86_avx512_permvar_qi_512 : GCCBuiltin<"__builtin_ia32_permvarqi512">, Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx512_permvar_sf_512 : GCCBuiltin<"__builtin_ia32_permvarsf512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_permvar_si_512 : GCCBuiltin<"__builtin_ia32_permvarsi512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; } // Pack ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_mmx_packsswb : GCCBuiltin<"__builtin_ia32_packsswb">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_packssdw : GCCBuiltin<"__builtin_ia32_packssdw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_packuswb : GCCBuiltin<"__builtin_ia32_packuswb">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; } // Unpacking ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_mmx_punpckhbw : GCCBuiltin<"__builtin_ia32_punpckhbw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_punpckhwd : GCCBuiltin<"__builtin_ia32_punpckhwd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_punpckhdq : GCCBuiltin<"__builtin_ia32_punpckhdq">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_punpcklbw : GCCBuiltin<"__builtin_ia32_punpcklbw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_punpcklwd : GCCBuiltin<"__builtin_ia32_punpcklwd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_punpckldq : GCCBuiltin<"__builtin_ia32_punpckldq">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; } // Integer comparison ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
def int_x86_mmx_pcmpeq_b : GCCBuiltin<"__builtin_ia32_pcmpeqb">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pcmpeq_w : GCCBuiltin<"__builtin_ia32_pcmpeqw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pcmpeq_d : GCCBuiltin<"__builtin_ia32_pcmpeqd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem, Commutative]>; def int_x86_mmx_pcmpgt_b : GCCBuiltin<"__builtin_ia32_pcmpgtb">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_pcmpgt_w : GCCBuiltin<"__builtin_ia32_pcmpgtw">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_pcmpgt_d : GCCBuiltin<"__builtin_ia32_pcmpgtd">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty], [IntrNoMem]>; } // Misc. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_mmx_maskmovq : GCCBuiltin<"__builtin_ia32_maskmovq">, Intrinsic<[], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_ptr_ty], []>; def int_x86_mmx_pmovmskb : GCCBuiltin<"__builtin_ia32_pmovmskb">, Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty], [IntrNoMem]>; def int_x86_mmx_movnt_dq : GCCBuiltin<"__builtin_ia32_movntq">, Intrinsic<[], [llvm_ptrx86mmx_ty, llvm_x86mmx_ty], []>; def int_x86_mmx_palignr_b : GCCBuiltin<"__builtin_ia32_palignr">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_x86mmx_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_mmx_pextr_w : GCCBuiltin<"__builtin_ia32_vec_ext_v4hi">, Intrinsic<[llvm_i32_ty], [llvm_x86mmx_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_mmx_pinsr_w : GCCBuiltin<"__builtin_ia32_vec_set_v4hi">, Intrinsic<[llvm_x86mmx_ty], [llvm_x86mmx_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } //===----------------------------------------------------------------------===// // BMI let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_bmi_bextr_32 : GCCBuiltin<"__builtin_ia32_bextr_u32">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_bmi_bextr_64 : GCCBuiltin<"__builtin_ia32_bextr_u64">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_x86_bmi_bzhi_32 : GCCBuiltin<"__builtin_ia32_bzhi_si">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_bmi_bzhi_64 : GCCBuiltin<"__builtin_ia32_bzhi_di">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_x86_bmi_pdep_32 : GCCBuiltin<"__builtin_ia32_pdep_si">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_bmi_pdep_64 : GCCBuiltin<"__builtin_ia32_pdep_di">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_x86_bmi_pext_32 : GCCBuiltin<"__builtin_ia32_pext_si">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_bmi_pext_64 : GCCBuiltin<"__builtin_ia32_pext_di">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // FS/GS Base let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
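// Illustrative sketch for the BMI2 pdep/pext intrinsics defined above (hypothetical
// helper, not part of this patch): they surface through the usual immintrin.h
// wrappers when compiling with -mbmi2, e.g.
//
//   #include <immintrin.h>
//   unsigned extract_even_bits(unsigned x) {
//     return _pext_u32(x, 0x55555555u);  // lowers to llvm.x86.bmi.pext.32
//   }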
def int_x86_rdfsbase_32 : GCCBuiltin<"__builtin_ia32_rdfsbase32">, Intrinsic<[llvm_i32_ty], []>; def int_x86_rdgsbase_32 : GCCBuiltin<"__builtin_ia32_rdgsbase32">, Intrinsic<[llvm_i32_ty], []>; def int_x86_rdfsbase_64 : GCCBuiltin<"__builtin_ia32_rdfsbase64">, Intrinsic<[llvm_i64_ty], []>; def int_x86_rdgsbase_64 : GCCBuiltin<"__builtin_ia32_rdgsbase64">, Intrinsic<[llvm_i64_ty], []>; def int_x86_wrfsbase_32 : GCCBuiltin<"__builtin_ia32_wrfsbase32">, Intrinsic<[], [llvm_i32_ty]>; def int_x86_wrgsbase_32 : GCCBuiltin<"__builtin_ia32_wrgsbase32">, Intrinsic<[], [llvm_i32_ty]>; def int_x86_wrfsbase_64 : GCCBuiltin<"__builtin_ia32_wrfsbase64">, Intrinsic<[], [llvm_i64_ty]>; def int_x86_wrgsbase_64 : GCCBuiltin<"__builtin_ia32_wrgsbase64">, Intrinsic<[], [llvm_i64_ty]>; } //===----------------------------------------------------------------------===// // FXSR let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_fxrstor : GCCBuiltin<"__builtin_ia32_fxrstor">, Intrinsic<[], [llvm_ptr_ty], []>; def int_x86_fxrstor64 : GCCBuiltin<"__builtin_ia32_fxrstor64">, Intrinsic<[], [llvm_ptr_ty], []>; def int_x86_fxsave : GCCBuiltin<"__builtin_ia32_fxsave">, Intrinsic<[], [llvm_ptr_ty], []>; def int_x86_fxsave64 : GCCBuiltin<"__builtin_ia32_fxsave64">, Intrinsic<[], [llvm_ptr_ty], []>; } //===----------------------------------------------------------------------===// // XSAVE let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_xsave : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xsave64 : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xrstor : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xrstor64 : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xsaveopt : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xsaveopt64 : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xrstors : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xrstors64 : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xsavec : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xsavec64 : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xsaves : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xsaves64 : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_xgetbv : Intrinsic<[llvm_i64_ty], [llvm_i32_ty], []>; def int_x86_xsetbv : Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; } //===----------------------------------------------------------------------===// // CLFLUSHOPT and CLWB let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_clflushopt : GCCBuiltin<"__builtin_ia32_clflushopt">, Intrinsic<[], [llvm_ptr_ty], []>; def int_x86_clwb : GCCBuiltin<"__builtin_ia32_clwb">, Intrinsic<[], [llvm_ptr_ty], []>; } //===----------------------------------------------------------------------===// // Support protection key let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_rdpkru : GCCBuiltin <"__builtin_ia32_rdpkru">, Intrinsic<[llvm_i32_ty], [], []>; def int_x86_wrpkru : GCCBuiltin<"__builtin_ia32_wrpkru">, Intrinsic<[], [llvm_i32_ty], []>; } //===----------------------------------------------------------------------===// // Half float conversion let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
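// Note on the XSAVE family defined above (illustrative sketch with a hypothetical
// helper, not part of this patch): the intrinsics carry the 64-bit state-component
// mask as two i32 operands, matching the EDX:EAX convention of the XSAVE
// instructions; the xsaveintrin.h wrapper takes one 64-bit mask, which is assumed
// to be split into the two halves on the way to this intrinsic, e.g.
//
//   #include <immintrin.h>            // compile with -mxsave
//   void save_state(void *area) {     // 'area': a 64-byte-aligned XSAVE area
//     _xsave(area, 0x7ULL);           // x87 | SSE | AVX state components
//   }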
def int_x86_vcvtps2ph_128 : GCCBuiltin<"__builtin_ia32_vcvtps2ph">, Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256">, Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_vcvtph2ps_512 : Intrinsic<[llvm_v16f32_ty], [llvm_v16i16_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_vcvtps2ph_512 : GCCBuiltin<"__builtin_ia32_vcvtps2ph512_mask">, Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_vcvtps2ph_256 : GCCBuiltin<"__builtin_ia32_vcvtps2ph256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_vcvtps2ph_128 : GCCBuiltin<"__builtin_ia32_vcvtps2ph_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; } //===----------------------------------------------------------------------===// // TBM let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_tbm_bextri_u32 : GCCBuiltin<"__builtin_ia32_bextri_u32">, Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_tbm_bextri_u64 : GCCBuiltin<"__builtin_ia32_bextri_u64">, Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, ImmArg>]>; } //===----------------------------------------------------------------------===// // RDRAND intrinsics - Return a random value and whether it is valid. // RDSEED intrinsics - Return a NIST SP800-90B & C compliant random value and // whether it is valid. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". // These are declared side-effecting so they don't get eliminated by CSE or // LICM. def int_x86_rdrand_16 : Intrinsic<[llvm_i16_ty, llvm_i32_ty], [], []>; def int_x86_rdrand_32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [], []>; def int_x86_rdrand_64 : Intrinsic<[llvm_i64_ty, llvm_i32_ty], [], []>; def int_x86_rdseed_16 : Intrinsic<[llvm_i16_ty, llvm_i32_ty], [], []>; def int_x86_rdseed_32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [], []>; def int_x86_rdseed_64 : Intrinsic<[llvm_i64_ty, llvm_i32_ty], [], []>; } //===----------------------------------------------------------------------===// // ADX let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_addcarry_32: Intrinsic<[llvm_i8_ty, llvm_i32_ty], [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_addcarry_64: Intrinsic<[llvm_i8_ty, llvm_i64_ty], [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; def int_x86_subborrow_32: Intrinsic<[llvm_i8_ty, llvm_i32_ty], [llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_subborrow_64: Intrinsic<[llvm_i8_ty, llvm_i64_ty], [llvm_i8_ty, llvm_i64_ty, llvm_i64_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // RTM intrinsics. Transactional Memory support. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
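// Illustrative sketch for the RDRAND/RDSEED intrinsics defined above (hypothetical
// helper, not part of this patch): because the intrinsics are side-effecting, a
// retry loop like the one below is not hoisted or folded away, e.g.
//
//   #include <immintrin.h>            // compile with -mrdrnd
//   unsigned int hw_rand32(void) {
//     unsigned int v;
//     while (!_rdrand32_step(&v))     // returns 0 when the value is not valid
//       ;                             // retry until the hardware delivers entropy
//     return v;
//   }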
  def int_x86_xbegin : GCCBuiltin<"__builtin_ia32_xbegin">,
      Intrinsic<[llvm_i32_ty], [], []>;
  def int_x86_xend : GCCBuiltin<"__builtin_ia32_xend">,
      Intrinsic<[], [], []>;
  def int_x86_xabort : GCCBuiltin<"__builtin_ia32_xabort">,
      Intrinsic<[], [llvm_i8_ty], [ImmArg<ArgIndex<0>>]>;
  def int_x86_xtest : GCCBuiltin<"__builtin_ia32_xtest">,
      Intrinsic<[llvm_i32_ty], [], []>;
}

//===----------------------------------------------------------------------===//
// AVX512

// Mask ops
let TargetPrefix = "x86" in {
  def int_x86_avx512_kadd_b :
      Intrinsic<[llvm_v8i1_ty], [llvm_v8i1_ty, llvm_v8i1_ty], [IntrNoMem]>;
  def int_x86_avx512_kadd_w :
      Intrinsic<[llvm_v16i1_ty], [llvm_v16i1_ty, llvm_v16i1_ty], [IntrNoMem]>;
  def int_x86_avx512_kadd_d :
      Intrinsic<[llvm_v32i1_ty], [llvm_v32i1_ty, llvm_v32i1_ty], [IntrNoMem]>;
  def int_x86_avx512_kadd_q :
      Intrinsic<[llvm_v64i1_ty], [llvm_v64i1_ty, llvm_v64i1_ty], [IntrNoMem]>;
  def int_x86_avx512_ktestc_b :
      Intrinsic<[llvm_i32_ty], [llvm_v8i1_ty, llvm_v8i1_ty], [IntrNoMem]>;
  def int_x86_avx512_ktestc_w :
      Intrinsic<[llvm_i32_ty], [llvm_v16i1_ty, llvm_v16i1_ty], [IntrNoMem]>;
  def int_x86_avx512_ktestc_d :
      Intrinsic<[llvm_i32_ty], [llvm_v32i1_ty, llvm_v32i1_ty], [IntrNoMem]>;
  def int_x86_avx512_ktestc_q :
      Intrinsic<[llvm_i32_ty], [llvm_v64i1_ty, llvm_v64i1_ty], [IntrNoMem]>;
  def int_x86_avx512_ktestz_b :
      Intrinsic<[llvm_i32_ty], [llvm_v8i1_ty, llvm_v8i1_ty], [IntrNoMem]>;
  def int_x86_avx512_ktestz_w :
      Intrinsic<[llvm_i32_ty], [llvm_v16i1_ty, llvm_v16i1_ty], [IntrNoMem]>;
  def int_x86_avx512_ktestz_d :
      Intrinsic<[llvm_i32_ty], [llvm_v32i1_ty, llvm_v32i1_ty], [IntrNoMem]>;
  def int_x86_avx512_ktestz_q :
      Intrinsic<[llvm_i32_ty], [llvm_v64i1_ty, llvm_v64i1_ty], [IntrNoMem]>;
}

// Conversion ops
let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
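// Illustrative sketch for the RTM intrinsics defined above (hypothetical helper,
// not part of this patch):
//
//   #include <immintrin.h>                 // compile with -mrtm
//   int try_transactional_add(int *p, int delta) {
//     if (_xbegin() == _XBEGIN_STARTED) {  // llvm.x86.xbegin
//       *p += delta;
//       _xend();                           // commit: llvm.x86.xend
//       return 1;
//     }
//     return 0;  // transaction aborted; caller should fall back to a lock
//   }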
def int_x86_avx512_cvttss2si : GCCBuiltin<"__builtin_ia32_vcvttss2si32">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvttss2si64 : GCCBuiltin<"__builtin_ia32_vcvttss2si64">, Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvttss2usi : GCCBuiltin<"__builtin_ia32_vcvttss2usi32">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvttss2usi64 : GCCBuiltin<"__builtin_ia32_vcvttss2usi64">, Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvtusi2ss : GCCBuiltin<"__builtin_ia32_cvtusi2ss32">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvtusi642ss : GCCBuiltin<"__builtin_ia32_cvtusi2ss64">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvttsd2si : GCCBuiltin<"__builtin_ia32_vcvttsd2si32">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvttsd2si64 : GCCBuiltin<"__builtin_ia32_vcvttsd2si64">, Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvttsd2usi : GCCBuiltin<"__builtin_ia32_vcvttsd2usi32">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvttsd2usi64 : GCCBuiltin<"__builtin_ia32_vcvttsd2usi64">, Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvtusi642sd : GCCBuiltin<"__builtin_ia32_cvtusi2sd64">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_vcvtss2usi32 : GCCBuiltin<"__builtin_ia32_vcvtss2usi32">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_vcvtss2usi64 : GCCBuiltin<"__builtin_ia32_vcvtss2usi64">, Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_vcvtss2si32 : GCCBuiltin<"__builtin_ia32_vcvtss2si32">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_vcvtss2si64 : GCCBuiltin<"__builtin_ia32_vcvtss2si64">, Intrinsic<[llvm_i64_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_vcvtsd2usi32 : GCCBuiltin<"__builtin_ia32_vcvtsd2usi32">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_vcvtsd2usi64 : GCCBuiltin<"__builtin_ia32_vcvtsd2usi64">, Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_vcvtsd2si32 : GCCBuiltin<"__builtin_ia32_vcvtsd2si32">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_vcvtsd2si64 : GCCBuiltin<"__builtin_ia32_vcvtsd2si64">, Intrinsic<[llvm_i64_ty], [llvm_v2f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvtsi2ss32 : GCCBuiltin<"__builtin_ia32_cvtsi2ss32">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvtsi2ss64 : GCCBuiltin<"__builtin_ia32_cvtsi2ss64">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_cvtsi2sd64 : GCCBuiltin<"__builtin_ia32_cvtsi2sd64">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } // Pack ops. let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
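// Note on the trailing i32 ImmArg operand of the scalar conversions above
// (illustrative sketch with a hypothetical helper, not part of this patch): it is
// the rounding / suppress-all-exceptions control, which user code passes through
// the *_round_* intrinsics, e.g.
//
//   #include <immintrin.h>               // AVX-512F
//   int to_int_truncated(__m128 x) {
//     return _mm_cvt_roundss_si32(x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
//   }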
def int_x86_avx512_packsswb_512 : GCCBuiltin<"__builtin_ia32_packsswb512">, Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx512_packssdw_512 : GCCBuiltin<"__builtin_ia32_packssdw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_packuswb_512 : GCCBuiltin<"__builtin_ia32_packuswb512">, Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx512_packusdw_512 : GCCBuiltin<"__builtin_ia32_packusdw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; } // Vector convert let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_sitofp_round : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_uitofp_round : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtpd2dq_128 : GCCBuiltin<"__builtin_ia32_cvtpd2dq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2dq_512 : GCCBuiltin<"__builtin_ia32_cvtpd2dq512_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtpd2ps_512 : GCCBuiltin<"__builtin_ia32_cvtpd2ps512_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f64_ty, llvm_v8f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtsd2ss_round : GCCBuiltin<"__builtin_ia32_cvtsd2ss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v2f64_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtss2sd_round : GCCBuiltin<"__builtin_ia32_cvtss2sd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v4f32_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtpd2ps : GCCBuiltin<"__builtin_ia32_cvtpd2ps_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v2f64_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2qq_128 : GCCBuiltin<"__builtin_ia32_cvtpd2qq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2qq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2qq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2qq_512 : GCCBuiltin<"__builtin_ia32_cvtpd2qq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtpd2udq_128 : GCCBuiltin<"__builtin_ia32_cvtpd2udq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2udq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2udq256_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2udq_512 : GCCBuiltin<"__builtin_ia32_cvtpd2udq512_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtpd2uqq_128 : GCCBuiltin<"__builtin_ia32_cvtpd2uqq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2uqq_256 : GCCBuiltin<"__builtin_ia32_cvtpd2uqq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtpd2uqq_512 : 
GCCBuiltin<"__builtin_ia32_cvtpd2uqq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtps2dq_128 : GCCBuiltin<"__builtin_ia32_cvtps2dq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2dq_256 : GCCBuiltin<"__builtin_ia32_cvtps2dq256_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2dq_512 : GCCBuiltin<"__builtin_ia32_cvtps2dq512_mask">, Intrinsic<[llvm_v16i32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtps2pd_512 : GCCBuiltin<"__builtin_ia32_cvtps2pd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtps2qq_128 : GCCBuiltin<"__builtin_ia32_cvtps2qq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2qq_256 : GCCBuiltin<"__builtin_ia32_cvtps2qq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2qq_512 : GCCBuiltin<"__builtin_ia32_cvtps2qq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtps2udq_128 : GCCBuiltin<"__builtin_ia32_cvtps2udq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2udq_256 : GCCBuiltin<"__builtin_ia32_cvtps2udq256_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2udq_512 : GCCBuiltin<"__builtin_ia32_cvtps2udq512_mask">, Intrinsic<[llvm_v16i32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtps2uqq_128 : GCCBuiltin<"__builtin_ia32_cvtps2uqq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2uqq_256 : GCCBuiltin<"__builtin_ia32_cvtps2uqq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvtps2uqq_512 : GCCBuiltin<"__builtin_ia32_cvtps2uqq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtqq2ps_128 : GCCBuiltin<"__builtin_ia32_cvtqq2ps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2dq_128 : GCCBuiltin<"__builtin_ia32_cvttpd2dq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2dq_512 : GCCBuiltin<"__builtin_ia32_cvttpd2dq512_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttpd2qq_128 : GCCBuiltin<"__builtin_ia32_cvttpd2qq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2qq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2qq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2qq_512 : GCCBuiltin<"__builtin_ia32_cvttpd2qq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, 
llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttpd2udq_128 : GCCBuiltin<"__builtin_ia32_cvttpd2udq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2udq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2udq256_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2udq_512 : GCCBuiltin<"__builtin_ia32_cvttpd2udq512_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f64_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttpd2uqq_128 : GCCBuiltin<"__builtin_ia32_cvttpd2uqq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v2f64_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2uqq_256 : GCCBuiltin<"__builtin_ia32_cvttpd2uqq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f64_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttpd2uqq_512 : GCCBuiltin<"__builtin_ia32_cvttpd2uqq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f64_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttps2dq_512 : GCCBuiltin<"__builtin_ia32_cvttps2dq512_mask">, Intrinsic<[llvm_v16i32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttps2qq_128 : GCCBuiltin<"__builtin_ia32_cvttps2qq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2qq_256 : GCCBuiltin<"__builtin_ia32_cvttps2qq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2qq_512 : GCCBuiltin<"__builtin_ia32_cvttps2qq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttps2udq_128 : GCCBuiltin<"__builtin_ia32_cvttps2udq128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4f32_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2udq_256 : GCCBuiltin<"__builtin_ia32_cvttps2udq256_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8f32_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2udq_512 : GCCBuiltin<"__builtin_ia32_cvttps2udq512_mask">, Intrinsic<[llvm_v16i32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvttps2uqq_128 : GCCBuiltin<"__builtin_ia32_cvttps2uqq128_mask">, Intrinsic<[llvm_v2i64_ty], [llvm_v4f32_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2uqq_256 : GCCBuiltin<"__builtin_ia32_cvttps2uqq256_mask">, Intrinsic<[llvm_v4i64_ty], [llvm_v4f32_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_cvttps2uqq_512 : GCCBuiltin<"__builtin_ia32_cvttps2uqq512_mask">, Intrinsic<[llvm_v8i64_ty], [llvm_v8f32_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_cvtuqq2ps_128 : GCCBuiltin<"__builtin_ia32_cvtuqq2ps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v2i64_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_rndscale_pd_128 : GCCBuiltin<"__builtin_ia32_rndscalepd_128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_rndscale_pd_256 : GCCBuiltin<"__builtin_ia32_rndscalepd_256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def 
int_x86_avx512_mask_rndscale_pd_512 : GCCBuiltin<"__builtin_ia32_rndscalepd_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_rndscale_ps_128 : GCCBuiltin<"__builtin_ia32_rndscaleps_128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_rndscale_ps_256 : GCCBuiltin<"__builtin_ia32_rndscaleps_256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_rndscale_ps_512 : GCCBuiltin<"__builtin_ia32_rndscaleps_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_reduce_pd_128 : GCCBuiltin<"__builtin_ia32_reducepd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_reduce_pd_256 : GCCBuiltin<"__builtin_ia32_reducepd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_reduce_pd_512 : GCCBuiltin<"__builtin_ia32_reducepd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_reduce_ps_128 : GCCBuiltin<"__builtin_ia32_reduceps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_reduce_ps_256 : GCCBuiltin<"__builtin_ia32_reduceps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_reduce_ps_512 : GCCBuiltin<"__builtin_ia32_reduceps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_range_pd_128 : GCCBuiltin<"__builtin_ia32_rangepd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_range_pd_256 : GCCBuiltin<"__builtin_ia32_rangepd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_range_pd_512 : GCCBuiltin<"__builtin_ia32_rangepd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_range_ps_128 : GCCBuiltin<"__builtin_ia32_rangeps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_range_ps_256 : GCCBuiltin<"__builtin_ia32_rangeps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_range_ps_512 : GCCBuiltin<"__builtin_ia32_rangeps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; } // Vector load with broadcast let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
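// Note on the *_mask intrinsics above (illustrative sketch with a hypothetical
// helper, not part of this patch): the extra vector operand is the pass-through
// source and the i8/i16 operand is the k-register write mask; lanes whose mask bit
// is clear take the pass-through value, e.g.
//
//   #include <immintrin.h>               // AVX-512F
//   __m512d round_selected(__m512d src, __mmask8 k, __m512d a) {
//     return _mm512_mask_roundscale_pd(src, k, a, 0);  // imm 0: round to nearest
//   }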
def int_x86_avx512_broadcastmw_512 : GCCBuiltin<"__builtin_ia32_broadcastmw512">, Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_broadcastmw_256 : GCCBuiltin<"__builtin_ia32_broadcastmw256">, Intrinsic<[llvm_v8i32_ty], [llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_broadcastmw_128 : GCCBuiltin<"__builtin_ia32_broadcastmw128">, Intrinsic<[llvm_v4i32_ty], [llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_broadcastmb_512 : GCCBuiltin<"__builtin_ia32_broadcastmb512">, Intrinsic<[llvm_v8i64_ty], [llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_broadcastmb_256 : GCCBuiltin<"__builtin_ia32_broadcastmb256">, Intrinsic<[llvm_v4i64_ty], [llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_broadcastmb_128 : GCCBuiltin<"__builtin_ia32_broadcastmb128">, Intrinsic<[llvm_v2i64_ty], [llvm_i8_ty], [IntrNoMem]>; } // Arithmetic ops let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_avx512_add_ps_512 : GCCBuiltin<"__builtin_ia32_addps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_add_pd_512 : GCCBuiltin<"__builtin_ia32_addpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_sub_ps_512 : GCCBuiltin<"__builtin_ia32_subps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_sub_pd_512 : GCCBuiltin<"__builtin_ia32_subpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mul_ps_512 : GCCBuiltin<"__builtin_ia32_mulps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mul_pd_512 : GCCBuiltin<"__builtin_ia32_mulpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_div_ps_512 : GCCBuiltin<"__builtin_ia32_divps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_add_ss_round : GCCBuiltin<"__builtin_ia32_addss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_div_ss_round : GCCBuiltin<"__builtin_ia32_divss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_mul_ss_round : GCCBuiltin<"__builtin_ia32_mulss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; 
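// Illustrative sketch (hypothetical helper, not part of this patch): the trailing
// ImmArg on the 512-bit arithmetic intrinsics above is the embedded rounding
// control, exposed to users as the *_round_* forms, e.g.
//
//   #include <immintrin.h>               // AVX-512F
//   __m512 add_rne(__m512 a, __m512 b) {
//     return _mm512_add_round_ps(a, b,
//                                _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   }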
def int_x86_avx512_mask_sub_ss_round : GCCBuiltin<"__builtin_ia32_subss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_max_ss_round : GCCBuiltin<"__builtin_ia32_maxss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_min_ss_round : GCCBuiltin<"__builtin_ia32_minss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_add_sd_round : GCCBuiltin<"__builtin_ia32_addsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_div_sd_round : GCCBuiltin<"__builtin_ia32_divsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_mul_sd_round : GCCBuiltin<"__builtin_ia32_mulsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_sub_sd_round : GCCBuiltin<"__builtin_ia32_subsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_max_sd_round : GCCBuiltin<"__builtin_ia32_maxsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_min_sd_round : GCCBuiltin<"__builtin_ia32_minsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_rndscale_ss : GCCBuiltin<"__builtin_ia32_rndscaless_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_rndscale_sd : GCCBuiltin<"__builtin_ia32_rndscalesd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_range_ss : GCCBuiltin<"__builtin_ia32_rangess128_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_range_sd : GCCBuiltin<"__builtin_ia32_rangesd128_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_reduce_ss : GCCBuiltin<"__builtin_ia32_reducess_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_reduce_sd : GCCBuiltin<"__builtin_ia32_reducesd_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_scalef_sd : GCCBuiltin<"__builtin_ia32_scalefsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_scalef_ss : GCCBuiltin<"__builtin_ia32_scalefss_round_mask">, 
Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_scalef_pd_128 : GCCBuiltin<"__builtin_ia32_scalefpd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_scalef_pd_256 : GCCBuiltin<"__builtin_ia32_scalefpd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty],[IntrNoMem]>; def int_x86_avx512_mask_scalef_pd_512 : GCCBuiltin<"__builtin_ia32_scalefpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_scalef_ps_128 : GCCBuiltin<"__builtin_ia32_scalefps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_scalef_ps_256 : GCCBuiltin<"__builtin_ia32_scalefps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_scalef_ps_512 : GCCBuiltin<"__builtin_ia32_scalefps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_sqrt_ss : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_sqrt_sd : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_sqrt_pd_512 : Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_sqrt_ps_512 : Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fixupimm_pd_128 : GCCBuiltin<"__builtin_ia32_fixupimmpd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_pd_128 : GCCBuiltin<"__builtin_ia32_fixupimmpd128_maskz">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fixupimm_pd_256 : GCCBuiltin<"__builtin_ia32_fixupimmpd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_pd_256 : GCCBuiltin<"__builtin_ia32_fixupimmpd256_maskz">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_v4i64_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fixupimm_pd_512 : GCCBuiltin<"__builtin_ia32_fixupimmpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_pd_512 : GCCBuiltin<"__builtin_ia32_fixupimmpd512_maskz">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_v8i64_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_fixupimm_ps_128 : GCCBuiltin<"__builtin_ia32_fixupimmps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_ps_128 : GCCBuiltin<"__builtin_ia32_fixupimmps128_maskz">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, 
llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fixupimm_ps_256 : GCCBuiltin<"__builtin_ia32_fixupimmps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_ps_256 : GCCBuiltin<"__builtin_ia32_fixupimmps256_maskz">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_v8i32_ty, llvm_i32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_fixupimm_ps_512 : GCCBuiltin<"__builtin_ia32_fixupimmps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_ps_512 : GCCBuiltin<"__builtin_ia32_fixupimmps512_maskz">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_v16i32_ty, llvm_i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_fixupimm_sd : GCCBuiltin<"__builtin_ia32_fixupimmsd_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_sd : GCCBuiltin<"__builtin_ia32_fixupimmsd_maskz">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2i64_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_fixupimm_ss : GCCBuiltin<"__builtin_ia32_fixupimmss_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_maskz_fixupimm_ss : GCCBuiltin<"__builtin_ia32_fixupimmss_maskz">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4i32_ty, llvm_i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_getexp_pd_128 : GCCBuiltin<"__builtin_ia32_getexppd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_getexp_pd_256 : GCCBuiltin<"__builtin_ia32_getexppd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_getexp_pd_512 : GCCBuiltin<"__builtin_ia32_getexppd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getexp_ps_128 : GCCBuiltin<"__builtin_ia32_getexpps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_getexp_ps_256 : GCCBuiltin<"__builtin_ia32_getexpps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_getexp_ps_512 : GCCBuiltin<"__builtin_ia32_getexpps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getexp_ss : GCCBuiltin<"__builtin_ia32_getexpss128_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getexp_sd : GCCBuiltin<"__builtin_ia32_getexpsd128_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getmant_pd_128 : GCCBuiltin<"__builtin_ia32_getmantpd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty,llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem, 
ImmArg>]>; def int_x86_avx512_mask_getmant_pd_256 : GCCBuiltin<"__builtin_ia32_getmantpd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty,llvm_i32_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getmant_pd_512 : GCCBuiltin<"__builtin_ia32_getmantpd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty,llvm_i32_ty, llvm_v8f64_ty, llvm_i8_ty,llvm_i32_ty ], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_getmant_ps_128 : GCCBuiltin<"__builtin_ia32_getmantps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getmant_ps_256 : GCCBuiltin<"__builtin_ia32_getmantps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_i32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_mask_getmant_ps_512 : GCCBuiltin<"__builtin_ia32_getmantps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty,llvm_i32_ty, llvm_v16f32_ty,llvm_i16_ty,llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_getmant_ss : GCCBuiltin<"__builtin_ia32_getmantss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_mask_getmant_sd : GCCBuiltin<"__builtin_ia32_getmantsd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_rsqrt14_ss : GCCBuiltin<"__builtin_ia32_rsqrt14ss_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rsqrt14_sd : GCCBuiltin<"__builtin_ia32_rsqrt14sd_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rsqrt14_pd_128 : GCCBuiltin<"__builtin_ia32_rsqrt14pd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rsqrt14_pd_256 : GCCBuiltin<"__builtin_ia32_rsqrt14pd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rsqrt14_pd_512 : GCCBuiltin<"__builtin_ia32_rsqrt14pd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rsqrt14_ps_128 : GCCBuiltin<"__builtin_ia32_rsqrt14ps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rsqrt14_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrt14ps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rsqrt14_ps_512 : GCCBuiltin<"__builtin_ia32_rsqrt14ps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_rcp14_ss : GCCBuiltin<"__builtin_ia32_rcp14ss_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rcp14_sd : GCCBuiltin<"__builtin_ia32_rcp14sd_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rcp14_pd_128 : GCCBuiltin<"__builtin_ia32_rcp14pd128_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rcp14_pd_256 : GCCBuiltin<"__builtin_ia32_rcp14pd256_mask">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>; def 
int_x86_avx512_rcp14_pd_512 : GCCBuiltin<"__builtin_ia32_rcp14pd512_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rcp14_ps_128 : GCCBuiltin<"__builtin_ia32_rcp14ps128_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rcp14_ps_256 : GCCBuiltin<"__builtin_ia32_rcp14ps256_mask">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_rcp14_ps_512 : GCCBuiltin<"__builtin_ia32_rcp14ps512_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_rcp28_ps : GCCBuiltin<"__builtin_ia32_rcp28ps_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_rcp28_pd : GCCBuiltin<"__builtin_ia32_rcp28pd_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_exp2_ps : GCCBuiltin<"__builtin_ia32_exp2ps_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_exp2_pd : GCCBuiltin<"__builtin_ia32_exp2pd_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_rcp28_ss : GCCBuiltin<"__builtin_ia32_rcp28ss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_rcp28_sd : GCCBuiltin<"__builtin_ia32_rcp28sd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_rsqrt28_ps : GCCBuiltin<"__builtin_ia32_rsqrt28ps_mask">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_rsqrt28_pd : GCCBuiltin<"__builtin_ia32_rsqrt28pd_mask">, Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_rsqrt28_ss : GCCBuiltin<"__builtin_ia32_rsqrt28ss_round_mask">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_rsqrt28_sd : GCCBuiltin<"__builtin_ia32_rsqrt28sd_round_mask">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_psad_bw_512 : GCCBuiltin<"__builtin_ia32_psadbw512">, Intrinsic<[llvm_v8i64_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem, Commutative]>; } // Integer arithmetic ops let TargetPrefix = "x86" in { def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem, Commutative]>; def int_x86_avx512_pmulh_w_512 : GCCBuiltin<"__builtin_ia32_pmulhw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem, Commutative]>; def int_x86_avx512_pavg_b_512 : GCCBuiltin<"__builtin_ia32_pavgb512">, Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx512_pavg_w_512 : GCCBuiltin<"__builtin_ia32_pavgw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem]>; def int_x86_avx512_pmaddw_d_512 : GCCBuiltin<"__builtin_ia32_pmaddwd512">, Intrinsic<[llvm_v16i32_ty], [llvm_v32i16_ty, llvm_v32i16_ty], [IntrNoMem, 
Commutative]>; def int_x86_avx512_pmaddubs_w_512 : GCCBuiltin<"__builtin_ia32_pmaddubsw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v64i8_ty, llvm_v64i8_ty], [IntrNoMem]>; def int_x86_avx512_dbpsadbw_128 : GCCBuiltin<"__builtin_ia32_dbpsadbw128">, Intrinsic<[llvm_v8i16_ty], [llvm_v16i8_ty, llvm_v16i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_dbpsadbw_256 : GCCBuiltin<"__builtin_ia32_dbpsadbw256">, Intrinsic<[llvm_v16i16_ty], [llvm_v32i8_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_dbpsadbw_512 : GCCBuiltin<"__builtin_ia32_dbpsadbw512">, Intrinsic<[llvm_v32i16_ty], [llvm_v64i8_ty, llvm_v64i8_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } // Gather and Scatter ops let TargetPrefix = "x86" in { // NOTE: These are deprecated in favor of the versions that take a vXi1 mask. // NOTE: These can't be ArgMemOnly because you can put the address completely // in the index register. def int_x86_avx512_gather_dpd_512 : Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather_dps_512 : Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather_qpd_512 : Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather_qps_512 : Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather_dpq_512 : Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather_dpi_512 : Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather_qpq_512 : Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather_qpi_512 : Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3div2_df : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3div2_di : Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3div4_df : Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3div4_di : Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3div4_sf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3div4_si : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3div8_sf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3div8_si : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3siv2_df : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_ptr_ty, 
llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3siv2_di : Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3siv4_df : Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3siv4_di : Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3siv4_sf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3siv4_si : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3siv8_sf : Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_gather3siv8_si : Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; // scatter // NOTE: These are deprecated in favor of the versions that take a vXi1 mask. // NOTE: These can't be ArgMemOnly because you can put the address completely // in the index register. def int_x86_avx512_scatter_dpd_512 : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatter_dps_512 : Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty, llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatter_qpd_512 : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatter_qps_512 : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatter_dpq_512 : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatter_dpi_512 : Intrinsic<[], [llvm_ptr_ty, llvm_i16_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatter_qpq_512 : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty,llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatter_qpi_512 : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i64_ty, llvm_v8i32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatterdiv2_df : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatterdiv2_di : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatterdiv4_df : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatterdiv4_di : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatterdiv4_sf : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatterdiv4_si : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatterdiv8_sf : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scatterdiv8_si : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scattersiv2_df : Intrinsic<[], [llvm_ptr_ty, 
llvm_i8_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scattersiv2_di : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scattersiv4_df : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scattersiv4_di : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scattersiv4_sf : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scattersiv4_si : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scattersiv8_sf : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_scattersiv8_si : Intrinsic<[], [llvm_ptr_ty, llvm_i8_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [ImmArg>]>; // gather prefetch // NOTE: These can't be ArgMemOnly because you can put the address completely // in the index register. def int_x86_avx512_gatherpf_dpd_512 : GCCBuiltin<"__builtin_ia32_gatherpfdpd">, Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; def int_x86_avx512_gatherpf_dps_512 : GCCBuiltin<"__builtin_ia32_gatherpfdps">, Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; def int_x86_avx512_gatherpf_qpd_512 : GCCBuiltin<"__builtin_ia32_gatherpfqpd">, Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; def int_x86_avx512_gatherpf_qps_512 : GCCBuiltin<"__builtin_ia32_gatherpfqps">, Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; // scatter prefetch // NOTE: These can't be ArgMemOnly because you can put the address completely // in the index register. def int_x86_avx512_scatterpf_dpd_512 : GCCBuiltin<"__builtin_ia32_scatterpfdpd">, Intrinsic<[], [llvm_i8_ty, llvm_v8i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; def int_x86_avx512_scatterpf_dps_512 : GCCBuiltin<"__builtin_ia32_scatterpfdps">, Intrinsic<[], [llvm_i16_ty, llvm_v16i32_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; def int_x86_avx512_scatterpf_qpd_512 : GCCBuiltin<"__builtin_ia32_scatterpfqpd">, Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; def int_x86_avx512_scatterpf_qps_512 : GCCBuiltin<"__builtin_ia32_scatterpfqps">, Intrinsic<[], [llvm_i8_ty, llvm_v8i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [ImmArg>, ImmArg>]>; } // AVX512 gather/scatter intrinsics that use vXi1 masks. let TargetPrefix = "x86" in { // NOTE: These can't be ArgMemOnly because you can put the address completely // in the index register. 
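// The intrinsics in this block take the mask as a <N x i1> vector operand
// instead of a scalar integer, so no bitcast of the mask is needed in IR.
// As an illustrative sketch only (not the exact lowering), the element-wise
// behaviour of a masked gather with passthru, base pointer, index vector and
// immediate scale is roughly:
//   for (i = 0; i < N; ++i)
//     dst[i] = mask[i] ? *(base + index[i] * scale) : passthru[i];
// The trailing i32 scale operand must be a constant, which is what the
// ImmArg attribute on each definition below enforces.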
def int_x86_avx512_mask_gather_dpd_512 : Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather_dps_512 : Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_ptr_ty, llvm_v16i32_ty, llvm_v16i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather_qpd_512 : Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather_qps_512 : Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather_dpq_512 : Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather_dpi_512 : Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_ptr_ty, llvm_v16i32_ty, llvm_v16i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather_qpq_512 : Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather_qpi_512 : Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i64_ty, llvm_v8i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3div2_df : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3div2_di : Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3div4_df : Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3div4_di : Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3div4_sf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3div4_si : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3div8_sf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3div8_si : Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i64_ty, llvm_v4i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3siv2_df : Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3siv2_di : Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v2i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3siv4_df : Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3siv4_di : Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3siv4_sf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3siv4_si : Intrinsic<[llvm_v4i32_ty], 
[llvm_v4i32_ty, llvm_ptr_ty, llvm_v4i32_ty, llvm_v4i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3siv8_sf : Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_gather3siv8_si : Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_ptr_ty, llvm_v8i32_ty, llvm_v8i1_ty, llvm_i32_ty], [IntrReadMem, ImmArg>]>; def int_x86_avx512_mask_scatter_dpd_512 : Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatter_dps_512 : Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty, llvm_v16i32_ty, llvm_v16f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatter_qpd_512 : Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatter_qps_512 : Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8f32_ty, llvm_i32_ty], [ImmArg>]>; // NOTE: These can't be ArgMemOnly because you can put the address completely // in the index register. def int_x86_avx512_mask_scatter_dpq_512 : Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatter_dpi_512 : Intrinsic<[], [llvm_ptr_ty, llvm_v16i1_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatter_qpq_512 : Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty,llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatter_qpi_512 : Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i64_ty, llvm_v8i32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatterdiv2_df : Intrinsic<[], [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatterdiv2_di : Intrinsic<[], [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatterdiv4_df : Intrinsic<[], [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatterdiv4_di : Intrinsic<[], [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatterdiv4_sf : Intrinsic<[], [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatterdiv4_si : Intrinsic<[], [llvm_ptr_ty, llvm_v2i1_ty, llvm_v2i64_ty, llvm_v4i32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatterdiv8_sf : Intrinsic<[], [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scatterdiv8_si : Intrinsic<[], [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i64_ty, llvm_v4i32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scattersiv2_df : Intrinsic<[], [llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scattersiv2_di : Intrinsic<[], [llvm_ptr_ty, llvm_v2i1_ty, llvm_v4i32_ty, llvm_v2i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scattersiv4_df : Intrinsic<[], [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scattersiv4_di : Intrinsic<[], [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4i64_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scattersiv4_sf : Intrinsic<[], [llvm_ptr_ty, llvm_v4i1_ty, llvm_v4i32_ty, llvm_v4f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scattersiv4_si : Intrinsic<[], [llvm_ptr_ty, llvm_v4i1_ty, 
llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scattersiv8_sf : Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8f32_ty, llvm_i32_ty], [ImmArg>]>; def int_x86_avx512_mask_scattersiv8_si : Intrinsic<[], [llvm_ptr_ty, llvm_v8i1_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [ImmArg>]>; } // AVX-512 conflict detection instruction // Instructions that count the number of leading zero bits let TargetPrefix = "x86" in { def int_x86_avx512_conflict_d_128 : GCCBuiltin<"__builtin_ia32_vpconflictsi_128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512_conflict_d_256 : GCCBuiltin<"__builtin_ia32_vpconflictsi_256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512_conflict_d_512 : GCCBuiltin<"__builtin_ia32_vpconflictsi_512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty], [IntrNoMem]>; def int_x86_avx512_conflict_q_128 : GCCBuiltin<"__builtin_ia32_vpconflictdi_128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_conflict_q_256 : GCCBuiltin<"__builtin_ia32_vpconflictdi_256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_conflict_q_512 : GCCBuiltin<"__builtin_ia32_vpconflictdi_512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty], [IntrNoMem]>; } // Compares let TargetPrefix = "x86" in { // 512-bit def int_x86_avx512_vcomi_sd : GCCBuiltin<"__builtin_ia32_vcomisd">, Intrinsic<[llvm_i32_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; def int_x86_avx512_vcomi_ss : GCCBuiltin<"__builtin_ia32_vcomiss">, Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>, ImmArg>]>; } // Compress, Expand let TargetPrefix = "x86" in { def int_x86_avx512_mask_compress : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [IntrNoMem]>; def int_x86_avx512_mask_expand : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], [IntrNoMem]>; } // truncate let TargetPrefix = "x86" in { def int_x86_avx512_mask_pmov_qb_128 : GCCBuiltin<"__builtin_ia32_pmovqb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qb_mem_128 : GCCBuiltin<"__builtin_ia32_pmovqb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qb_128 : GCCBuiltin<"__builtin_ia32_pmovsqb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qb_mem_128 : GCCBuiltin<"__builtin_ia32_pmovsqb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qb_128 : GCCBuiltin<"__builtin_ia32_pmovusqb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v2i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qb_mem_128 : GCCBuiltin<"__builtin_ia32_pmovusqb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qb_256 : GCCBuiltin<"__builtin_ia32_pmovqb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qb_mem_256 : GCCBuiltin<"__builtin_ia32_pmovqb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qb_256 : 
GCCBuiltin<"__builtin_ia32_pmovsqb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qb_mem_256 : GCCBuiltin<"__builtin_ia32_pmovsqb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qb_256 : GCCBuiltin<"__builtin_ia32_pmovusqb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qb_mem_256 : GCCBuiltin<"__builtin_ia32_pmovusqb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qb_512 : GCCBuiltin<"__builtin_ia32_pmovqb512_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qb_mem_512 : GCCBuiltin<"__builtin_ia32_pmovqb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qb_512 : GCCBuiltin<"__builtin_ia32_pmovsqb512_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qb_mem_512 : GCCBuiltin<"__builtin_ia32_pmovsqb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qb_512 : GCCBuiltin<"__builtin_ia32_pmovusqb512_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i64_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qb_mem_512 : GCCBuiltin<"__builtin_ia32_pmovusqb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qw_128 : GCCBuiltin<"__builtin_ia32_pmovqw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qw_mem_128 : GCCBuiltin<"__builtin_ia32_pmovqw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qw_128 : GCCBuiltin<"__builtin_ia32_pmovsqw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qw_mem_128 : GCCBuiltin<"__builtin_ia32_pmovsqw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qw_128 : GCCBuiltin<"__builtin_ia32_pmovusqw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v2i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qw_mem_128 : GCCBuiltin<"__builtin_ia32_pmovusqw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qw_256 : GCCBuiltin<"__builtin_ia32_pmovqw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qw_mem_256 : GCCBuiltin<"__builtin_ia32_pmovqw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qw_256 : GCCBuiltin<"__builtin_ia32_pmovsqw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qw_mem_256 : GCCBuiltin<"__builtin_ia32_pmovsqw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qw_256 : GCCBuiltin<"__builtin_ia32_pmovusqw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qw_mem_256 : 
GCCBuiltin<"__builtin_ia32_pmovusqw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qw_512 : Intrinsic<[llvm_v8i16_ty], [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qw_mem_512 : GCCBuiltin<"__builtin_ia32_pmovqw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qw_512 : GCCBuiltin<"__builtin_ia32_pmovsqw512_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qw_mem_512 : GCCBuiltin<"__builtin_ia32_pmovsqw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qw_512 : GCCBuiltin<"__builtin_ia32_pmovusqw512_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i64_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qw_mem_512 : GCCBuiltin<"__builtin_ia32_pmovusqw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qd_128 : GCCBuiltin<"__builtin_ia32_pmovqd128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_qd_mem_128 : GCCBuiltin<"__builtin_ia32_pmovqd128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qd_128 : GCCBuiltin<"__builtin_ia32_pmovsqd128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qd_mem_128 : GCCBuiltin<"__builtin_ia32_pmovsqd128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qd_128 : GCCBuiltin<"__builtin_ia32_pmovusqd128_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v2i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qd_mem_128 : GCCBuiltin<"__builtin_ia32_pmovusqd128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v2i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qd_mem_256 : GCCBuiltin<"__builtin_ia32_pmovqd256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qd_256 : GCCBuiltin<"__builtin_ia32_pmovsqd256_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qd_mem_256 : GCCBuiltin<"__builtin_ia32_pmovsqd256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qd_256 : GCCBuiltin<"__builtin_ia32_pmovusqd256_mask">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i64_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qd_mem_256 : GCCBuiltin<"__builtin_ia32_pmovusqd256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_qd_mem_512 : GCCBuiltin<"__builtin_ia32_pmovqd512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_qd_512 : GCCBuiltin<"__builtin_ia32_pmovsqd512_mask">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_qd_mem_512 : GCCBuiltin<"__builtin_ia32_pmovsqd512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_qd_512 : GCCBuiltin<"__builtin_ia32_pmovusqd512_mask">, 
Intrinsic<[llvm_v8i32_ty], [llvm_v8i64_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_qd_mem_512 : GCCBuiltin<"__builtin_ia32_pmovusqd512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i64_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_db_128 : GCCBuiltin<"__builtin_ia32_pmovdb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_db_mem_128 : GCCBuiltin<"__builtin_ia32_pmovdb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_db_128 : GCCBuiltin<"__builtin_ia32_pmovsdb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_db_mem_128 : GCCBuiltin<"__builtin_ia32_pmovsdb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_db_128 : GCCBuiltin<"__builtin_ia32_pmovusdb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v4i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_db_mem_128 : GCCBuiltin<"__builtin_ia32_pmovusdb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_db_256 : GCCBuiltin<"__builtin_ia32_pmovdb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_db_mem_256 : GCCBuiltin<"__builtin_ia32_pmovdb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_db_256 : GCCBuiltin<"__builtin_ia32_pmovsdb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_db_mem_256 : GCCBuiltin<"__builtin_ia32_pmovsdb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_db_256 : GCCBuiltin<"__builtin_ia32_pmovusdb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i32_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_db_mem_256 : GCCBuiltin<"__builtin_ia32_pmovusdb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_db_512 : Intrinsic<[llvm_v16i8_ty], [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_db_mem_512 : GCCBuiltin<"__builtin_ia32_pmovdb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_db_512 : GCCBuiltin<"__builtin_ia32_pmovsdb512_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_db_mem_512 : GCCBuiltin<"__builtin_ia32_pmovsdb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_db_512 : GCCBuiltin<"__builtin_ia32_pmovusdb512_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i32_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_db_mem_512 : GCCBuiltin<"__builtin_ia32_pmovusdb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_dw_128 : GCCBuiltin<"__builtin_ia32_pmovdw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_dw_mem_128 : GCCBuiltin<"__builtin_ia32_pmovdw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, 
llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_dw_128 : GCCBuiltin<"__builtin_ia32_pmovsdw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_dw_mem_128 : GCCBuiltin<"__builtin_ia32_pmovsdw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_dw_128 : GCCBuiltin<"__builtin_ia32_pmovusdw128_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_dw_mem_128 : GCCBuiltin<"__builtin_ia32_pmovusdw128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v4i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_dw_256 : GCCBuiltin<"__builtin_ia32_pmovdw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_dw_mem_256 : GCCBuiltin<"__builtin_ia32_pmovdw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_dw_256 : GCCBuiltin<"__builtin_ia32_pmovsdw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_dw_mem_256 : GCCBuiltin<"__builtin_ia32_pmovsdw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_dw_256 : GCCBuiltin<"__builtin_ia32_pmovusdw256_mask">, Intrinsic<[llvm_v8i16_ty], [llvm_v8i32_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_dw_mem_256 : GCCBuiltin<"__builtin_ia32_pmovusdw256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i32_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_dw_512 : Intrinsic<[llvm_v16i16_ty], [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_dw_mem_512 : GCCBuiltin<"__builtin_ia32_pmovdw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_dw_512 : GCCBuiltin<"__builtin_ia32_pmovsdw512_mask">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_dw_mem_512 : GCCBuiltin<"__builtin_ia32_pmovsdw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_dw_512 : GCCBuiltin<"__builtin_ia32_pmovusdw512_mask">, Intrinsic<[llvm_v16i16_ty], [llvm_v16i32_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_dw_mem_512 : GCCBuiltin<"__builtin_ia32_pmovusdw512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i32_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_wb_128 : GCCBuiltin<"__builtin_ia32_pmovwb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmov_wb_mem_128 : GCCBuiltin<"__builtin_ia32_pmovwb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_wb_128 : GCCBuiltin<"__builtin_ia32_pmovswb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_wb_mem_128 : GCCBuiltin<"__builtin_ia32_pmovswb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_wb_128 : GCCBuiltin<"__builtin_ia32_pmovuswb128_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v16i8_ty, llvm_i8_ty], [IntrNoMem]>; def 
int_x86_avx512_mask_pmovus_wb_mem_128 : GCCBuiltin<"__builtin_ia32_pmovuswb128mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v8i16_ty, llvm_i8_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_wb_mem_256 : GCCBuiltin<"__builtin_ia32_pmovwb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_wb_256 : GCCBuiltin<"__builtin_ia32_pmovswb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_wb_mem_256 : GCCBuiltin<"__builtin_ia32_pmovswb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_wb_256 : GCCBuiltin<"__builtin_ia32_pmovuswb256_mask">, Intrinsic<[llvm_v16i8_ty], [llvm_v16i16_ty, llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_wb_mem_256 : GCCBuiltin<"__builtin_ia32_pmovuswb256mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v16i16_ty, llvm_i16_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmov_wb_mem_512 : GCCBuiltin<"__builtin_ia32_pmovwb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovs_wb_512 : GCCBuiltin<"__builtin_ia32_pmovswb512_mask">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovs_wb_mem_512 : GCCBuiltin<"__builtin_ia32_pmovswb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrArgMemOnly]>; def int_x86_avx512_mask_pmovus_wb_512 : GCCBuiltin<"__builtin_ia32_pmovuswb512_mask">, Intrinsic<[llvm_v32i8_ty], [llvm_v32i16_ty, llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>; def int_x86_avx512_mask_pmovus_wb_mem_512 : GCCBuiltin<"__builtin_ia32_pmovuswb512mem_mask">, Intrinsic<[], [llvm_ptr_ty, llvm_v32i16_ty, llvm_i32_ty], [IntrArgMemOnly]>; } // Bitwise ternary logic let TargetPrefix = "x86" in { def int_x86_avx512_pternlog_d_128 : GCCBuiltin<"__builtin_ia32_pternlogd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_pternlog_d_256 : GCCBuiltin<"__builtin_ia32_pternlogd256">, Intrinsic<[llvm_v8i32_ty], [llvm_v8i32_ty, llvm_v8i32_ty, llvm_v8i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_pternlog_d_512 : GCCBuiltin<"__builtin_ia32_pternlogd512">, Intrinsic<[llvm_v16i32_ty], [llvm_v16i32_ty, llvm_v16i32_ty, llvm_v16i32_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_pternlog_q_128 : GCCBuiltin<"__builtin_ia32_pternlogq128">, Intrinsic<[llvm_v2i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_pternlog_q_256 : GCCBuiltin<"__builtin_ia32_pternlogq256">, Intrinsic<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; def int_x86_avx512_pternlog_q_512 : GCCBuiltin<"__builtin_ia32_pternlogq512">, Intrinsic<[llvm_v8i64_ty], [llvm_v8i64_ty, llvm_v8i64_ty, llvm_v8i64_ty, llvm_i32_ty], [IntrNoMem, ImmArg>]>; } // vp2intersect let TargetPrefix = "x86" in { def int_x86_avx512_vp2intersect_q_512 : Intrinsic<[llvm_v8i1_ty, llvm_v8i1_ty], [llvm_v8i64_ty, llvm_v8i64_ty], [IntrNoMem]>; def int_x86_avx512_vp2intersect_q_256 : Intrinsic<[llvm_v4i1_ty, llvm_v4i1_ty], [llvm_v4i64_ty, llvm_v4i64_ty], [IntrNoMem]>; def int_x86_avx512_vp2intersect_q_128 : Intrinsic<[llvm_v2i1_ty, llvm_v2i1_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; def int_x86_avx512_vp2intersect_d_512 : 
              Intrinsic<[llvm_v16i1_ty, llvm_v16i1_ty],
                        [llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>;
  def int_x86_avx512_vp2intersect_d_256 :
              Intrinsic<[llvm_v8i1_ty, llvm_v8i1_ty],
                        [llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>;
  def int_x86_avx512_vp2intersect_d_128 :
              Intrinsic<[llvm_v4i1_ty, llvm_v4i1_ty],
                        [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
}

// Misc.
let TargetPrefix = "x86" in {
  // NOTE: These comparison intrinsics are not used by clang as long as the
  //       distinction in signaling behaviour is not implemented.
  def int_x86_avx512_mask_cmp_ps_512 :
        Intrinsic<[llvm_v16i1_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                   llvm_i32_ty, llvm_v16i1_ty, llvm_i32_ty],
                  [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx512_mask_cmp_pd_512 :
        Intrinsic<[llvm_v8i1_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                   llvm_i32_ty, llvm_v8i1_ty, llvm_i32_ty],
                  [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx512_mask_cmp_ps_256 :
        Intrinsic<[llvm_v8i1_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
                   llvm_i32_ty, llvm_v8i1_ty],
                  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_avx512_mask_cmp_pd_256 :
        Intrinsic<[llvm_v4i1_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
                   llvm_i32_ty, llvm_v4i1_ty],
                  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_avx512_mask_cmp_ps_128 :
        Intrinsic<[llvm_v4i1_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                   llvm_i32_ty, llvm_v4i1_ty],
                  [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_avx512_mask_cmp_pd_128 :
        Intrinsic<[llvm_v2i1_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                   llvm_i32_ty, llvm_v2i1_ty],
                  [IntrNoMem, ImmArg<ArgIndex<2>>]>;

  def int_x86_avx512_mask_cmp_ss : GCCBuiltin<"__builtin_ia32_cmpss_mask">,
        Intrinsic<[llvm_i8_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
                   llvm_i32_ty, llvm_i8_ty, llvm_i32_ty],
                  [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>]>;
  def int_x86_avx512_mask_cmp_sd : GCCBuiltin<"__builtin_ia32_cmpsd_mask">,
        Intrinsic<[llvm_i8_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
                   llvm_i32_ty, llvm_i8_ty, llvm_i32_ty],
                  [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>]>;
}

//===----------------------------------------------------------------------===//
// SHA intrinsics
let TargetPrefix = "x86" in {
  def int_x86_sha1rnds4 : GCCBuiltin<"__builtin_ia32_sha1rnds4">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
                [IntrNoMem, ImmArg<ArgIndex<2>>]>;
  def int_x86_sha1nexte : GCCBuiltin<"__builtin_ia32_sha1nexte">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
  def int_x86_sha1msg1 : GCCBuiltin<"__builtin_ia32_sha1msg1">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
  def int_x86_sha1msg2 : GCCBuiltin<"__builtin_ia32_sha1msg2">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
  def int_x86_sha256rnds2 : GCCBuiltin<"__builtin_ia32_sha256rnds2">,
      Intrinsic<[llvm_v4i32_ty],
                [llvm_v4i32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
  def int_x86_sha256msg1 : GCCBuiltin<"__builtin_ia32_sha256msg1">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
  def int_x86_sha256msg2 : GCCBuiltin<"__builtin_ia32_sha256msg2">,
      Intrinsic<[llvm_v4i32_ty], [llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>;
}

//===----------------------------------------------------------------------===//
// Thread synchronization ops with timer.
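// MONITORX/MWAITX extend MONITOR/MWAIT with an optional timeout. A typical
// use through the clang wrappers (an illustrative sketch; assumes the
// _mm_monitorx/_mm_mwaitx wrappers from mwaitxintrin.h) looks like:
//   _mm_monitorx((void *)&flag, 0, 0);   // arm the monitor on &flag
//   if (!flag)
//     _mm_mwaitx(0, 0, timeout);         // wait for a store to &flag or,
//                                        // when enabled, for the timer
// int_x86_monitorx takes (address, extensions, hints) and int_x86_mwaitx
// takes (extensions, hints, clock), matching the operand lists below.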
let TargetPrefix = "x86" in { def int_x86_monitorx : GCCBuiltin<"__builtin_ia32_monitorx">, Intrinsic<[], [ llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty ], []>; def int_x86_mwaitx : GCCBuiltin<"__builtin_ia32_mwaitx">, Intrinsic<[], [ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty ], []>; } //===----------------------------------------------------------------------===// // Cache-line zero let TargetPrefix = "x86" in { def int_x86_clzero : GCCBuiltin<"__builtin_ia32_clzero">, Intrinsic<[], [llvm_ptr_ty], []>; } //===----------------------------------------------------------------------===// // Cache write back intrinsics let TargetPrefix = "x86" in { // Write back and invalidate def int_x86_wbinvd : GCCBuiltin<"__builtin_ia32_wbinvd">, Intrinsic<[], [], []>; // Write back no-invalidate def int_x86_wbnoinvd : GCCBuiltin<"__builtin_ia32_wbnoinvd">, Intrinsic<[], [], []>; } //===----------------------------------------------------------------------===// // Cache-line demote let TargetPrefix = "x86" in { def int_x86_cldemote : GCCBuiltin<"__builtin_ia32_cldemote">, Intrinsic<[], [llvm_ptr_ty], []>; } //===----------------------------------------------------------------------===// // Wait and pause enhancements let TargetPrefix = "x86" in { def int_x86_umonitor : GCCBuiltin<"__builtin_ia32_umonitor">, Intrinsic<[], [llvm_ptr_ty], []>; def int_x86_umwait : GCCBuiltin<"__builtin_ia32_umwait">, Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; def int_x86_tpause : GCCBuiltin<"__builtin_ia32_tpause">, Intrinsic<[llvm_i8_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>; } //===----------------------------------------------------------------------===// // Direct Move Instructions let TargetPrefix = "x86" in { def int_x86_directstore32 : GCCBuiltin<"__builtin_ia32_directstore_u32">, Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], []>; def int_x86_directstore64 : GCCBuiltin<"__builtin_ia32_directstore_u64">, Intrinsic<[], [llvm_ptr_ty, llvm_i64_ty], []>; def int_x86_movdir64b : GCCBuiltin<"__builtin_ia32_movdir64b">, Intrinsic<[], [llvm_ptr_ty, llvm_ptr_ty], []>; } //===----------------------------------------------------------------------===// // PTWrite - Write data to processor trace pocket let TargetPrefix = "x86" in { def int_x86_ptwrite32 : GCCBuiltin<"__builtin_ia32_ptwrite32">, Intrinsic<[], [llvm_i32_ty], []>; def int_x86_ptwrite64 : GCCBuiltin<"__builtin_ia32_ptwrite64">, Intrinsic<[], [llvm_i64_ty], []>; } //===----------------------------------------------------------------------===// // INVPCID - Invalidate Process-Context Identifier let TargetPrefix = "x86" in { def int_x86_invpcid : GCCBuiltin<"__builtin_ia32_invpcid">, Intrinsic<[], [llvm_i32_ty, llvm_ptr_ty], []>; } let TargetPrefix = "x86" in { def int_x86_avx512bf16_cvtne2ps2bf16_128: GCCBuiltin<"__builtin_ia32_cvtne2ps2bf16_128">, Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtne2ps2bf16_256: GCCBuiltin<"__builtin_ia32_cvtne2ps2bf16_256">, Intrinsic<[llvm_v16i16_ty], [llvm_v8f32_ty, llvm_v8f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtne2ps2bf16_512: GCCBuiltin<"__builtin_ia32_cvtne2ps2bf16_512">, Intrinsic<[llvm_v32i16_ty], [llvm_v16f32_ty, llvm_v16f32_ty], [IntrNoMem]>; // Intrinsic must be masked due to it producing less than 128 bits of results. 
def int_x86_avx512bf16_mask_cvtneps2bf16_128: Intrinsic<[llvm_v8i16_ty], [llvm_v4f32_ty, llvm_v8i16_ty, llvm_v4i1_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtneps2bf16_256: GCCBuiltin<"__builtin_ia32_cvtneps2bf16_256">, Intrinsic<[llvm_v8i16_ty], [llvm_v8f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_cvtneps2bf16_512: GCCBuiltin<"__builtin_ia32_cvtneps2bf16_512">, Intrinsic<[llvm_v16i16_ty], [llvm_v16f32_ty], [IntrNoMem]>; def int_x86_avx512bf16_dpbf16ps_128: GCCBuiltin<"__builtin_ia32_dpbf16ps_128">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4i32_ty, llvm_v4i32_ty], [IntrNoMem]>; def int_x86_avx512bf16_dpbf16ps_256: GCCBuiltin<"__builtin_ia32_dpbf16ps_256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8i32_ty, llvm_v8i32_ty], [IntrNoMem]>; def int_x86_avx512bf16_dpbf16ps_512: GCCBuiltin<"__builtin_ia32_dpbf16ps_512">, Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16i32_ty, llvm_v16i32_ty], [IntrNoMem]>; } //===----------------------------------------------------------------------===// // ENQCMD - Enqueue Stores Instructions let TargetPrefix = "x86" in { def int_x86_enqcmd : GCCBuiltin<"__builtin_ia32_enqcmd">, Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_ptr_ty], []>; def int_x86_enqcmds : GCCBuiltin<"__builtin_ia32_enqcmds">, Intrinsic<[llvm_i8_ty], [llvm_ptr_ty, llvm_ptr_ty], []>; } //===----------------------------------------------------------------------===// // SERIALIZE - Serialize instruction fetch and execution let TargetPrefix = "x86" in { def int_x86_serialize : GCCBuiltin<"__builtin_ia32_serialize">, Intrinsic<[], [], []>; } //===----------------------------------------------------------------------===// // TSXLDTRK - TSX Suspend Load Address Tracking let TargetPrefix = "x86" in { def int_x86_xsusldtrk : GCCBuiltin<"__builtin_ia32_xsusldtrk">, Intrinsic<[], [], []>; def int_x86_xresldtrk : GCCBuiltin<"__builtin_ia32_xresldtrk">, Intrinsic<[], [], []>; } //===----------------------------------------------------------------------===// // Key Locker let TargetPrefix = "x86" in { def int_x86_loadiwkey : GCCBuiltin<"__builtin_ia32_loadiwkey">, Intrinsic<[], [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_i32_ty], []>; def int_x86_encodekey128 : Intrinsic<[llvm_i32_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [llvm_i32_ty, llvm_v2i64_ty], []>; def int_x86_encodekey256 : Intrinsic<[llvm_i32_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [llvm_i32_ty, llvm_v2i64_ty, llvm_v2i64_ty], []>; def int_x86_aesenc128kl : Intrinsic<[llvm_i8_ty, llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty], []>; def int_x86_aesdec128kl : Intrinsic<[llvm_i8_ty, llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty], []>; def int_x86_aesenc256kl : Intrinsic<[llvm_i8_ty, llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty], []>; def int_x86_aesdec256kl : Intrinsic<[llvm_i8_ty, llvm_v2i64_ty], [llvm_v2i64_ty, llvm_ptr_ty], []>; def int_x86_aesencwide128kl : Intrinsic<[llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], []>; def int_x86_aesdecwide128kl : Intrinsic<[llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, 
                 llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty], []>;
  def int_x86_aesencwide256kl :
      Intrinsic<[llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty,
                 llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty,
                 llvm_v2i64_ty],
                [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty,
                 llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty,
                 llvm_v2i64_ty], []>;
  def int_x86_aesdecwide256kl :
      Intrinsic<[llvm_i8_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty,
                 llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty,
                 llvm_v2i64_ty],
                [llvm_ptr_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty,
                 llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty,
                 llvm_v2i64_ty], []>;
}

//===----------------------------------------------------------------------===//
// AMX - Intel AMX extensions
let TargetPrefix = "x86" in {
  def int_x86_ldtilecfg : GCCBuiltin<"__builtin_ia32_tile_loadconfig">,
              Intrinsic<[], [llvm_ptr_ty], []>;
  def int_x86_sttilecfg : GCCBuiltin<"__builtin_ia32_tile_storeconfig">,
              Intrinsic<[], [llvm_ptr_ty], []>;
  def int_x86_tilerelease : GCCBuiltin<"__builtin_ia32_tilerelease">,
              Intrinsic<[], [], []>;
  def int_x86_tilezero : GCCBuiltin<"__builtin_ia32_tilezero">,
              Intrinsic<[], [llvm_i8_ty], [ImmArg<ArgIndex<0>>]>;
  def int_x86_tileloadd64 : GCCBuiltin<"__builtin_ia32_tileloadd64">,
              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
                        [ImmArg<ArgIndex<0>>]>;
  def int_x86_tileloaddt164 : GCCBuiltin<"__builtin_ia32_tileloaddt164">,
              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
                        [ImmArg<ArgIndex<0>>]>;
  def int_x86_tilestored64 : GCCBuiltin<"__builtin_ia32_tilestored64">,
              Intrinsic<[], [llvm_i8_ty, llvm_ptr_ty, llvm_i64_ty],
                        [ImmArg<ArgIndex<0>>]>;
  def int_x86_tdpbssd : GCCBuiltin<"__builtin_ia32_tdpbssd">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
                         ImmArg<ArgIndex<2>>]>;
  def int_x86_tdpbsud : GCCBuiltin<"__builtin_ia32_tdpbsud">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
                         ImmArg<ArgIndex<2>>]>;
  def int_x86_tdpbusd : GCCBuiltin<"__builtin_ia32_tdpbusd">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
                         ImmArg<ArgIndex<2>>]>;
  def int_x86_tdpbuud : GCCBuiltin<"__builtin_ia32_tdpbuud">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
                         ImmArg<ArgIndex<2>>]>;
  def int_x86_tdpbf16ps : GCCBuiltin<"__builtin_ia32_tdpbf16ps">,
              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
                        [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<1>>,
                         ImmArg<ArgIndex<2>>]>;
  // AMX - internal intrinsics
  def int_x86_tileloadd64_internal :
              GCCBuiltin<"__builtin_ia32_tileloadd64_internal">,
              Intrinsic<[llvm_x86amx_ty],
                        [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty, llvm_i64_ty],
                        []>;
  def int_x86_tdpbssd_internal :
              GCCBuiltin<"__builtin_ia32_tdpbssd_internal">,
              Intrinsic<[llvm_x86amx_ty],
                        [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
                         llvm_x86amx_ty, llvm_x86amx_ty,
                         llvm_x86amx_ty], []>;
  def int_x86_tilestored64_internal :
              GCCBuiltin<"__builtin_ia32_tilestored64_internal">,
              Intrinsic<[], [llvm_i16_ty, llvm_i16_ty, llvm_ptr_ty,
                             llvm_i64_ty, llvm_x86amx_ty], []>;
+  def int_x86_tilezero_internal :
+              GCCBuiltin<"__builtin_ia32_tilezero_internal">,
+              Intrinsic<[llvm_x86amx_ty], [llvm_i16_ty, llvm_i16_ty],
+              []>;
}

//===----------------------------------------------------------------------===//
// UINTR - User Level Interrupt
let TargetPrefix = "x86" in {
  def int_x86_clui : GCCBuiltin<"__builtin_ia32_clui">,
              Intrinsic<[], [], []>;
  def int_x86_stui : GCCBuiltin<"__builtin_ia32_stui">,
              Intrinsic<[], [], []>;
  def int_x86_testui : GCCBuiltin<"__builtin_ia32_testui">,
              Intrinsic<[llvm_i8_ty], [], []>;
  def int_x86_senduipi : GCCBuiltin<"__builtin_ia32_senduipi">,
              Intrinsic<[], [llvm_i64_ty],
[]>; } diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index a2fe09aecc49..15af0fb2e888 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -1,533 +1,539 @@ //===------- X86ExpandPseudo.cpp - Expand pseudo instructions -------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file contains a pass that expands pseudo instructions into target // instructions to allow proper scheduling, if-conversion, other late // optimizations, or simply the encoding of the instructions. // //===----------------------------------------------------------------------===// #include "X86.h" #include "X86FrameLowering.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "llvm/Analysis/EHPersonalities.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/Passes.h" // For IDs of passes that are preserved. #include "llvm/IR/GlobalValue.h" using namespace llvm; #define DEBUG_TYPE "x86-pseudo" #define X86_EXPAND_PSEUDO_NAME "X86 pseudo instruction expansion pass" namespace { class X86ExpandPseudo : public MachineFunctionPass { public: static char ID; X86ExpandPseudo() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); AU.addPreservedID(MachineLoopInfoID); AU.addPreservedID(MachineDominatorsID); MachineFunctionPass::getAnalysisUsage(AU); } const X86Subtarget *STI = nullptr; const X86InstrInfo *TII = nullptr; const X86RegisterInfo *TRI = nullptr; const X86MachineFunctionInfo *X86FI = nullptr; const X86FrameLowering *X86FL = nullptr; bool runOnMachineFunction(MachineFunction &Fn) override; MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); } StringRef getPassName() const override { return "X86 pseudo instruction expansion pass"; } private: void ExpandICallBranchFunnel(MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI); bool ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI); bool ExpandMBB(MachineBasicBlock &MBB); }; char X86ExpandPseudo::ID = 0; } // End anonymous namespace. 
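// The expansion performed by this pass rewrites late pseudo instructions in
// place into real machine instructions. As an illustrative example (operand
// names are schematic, not exact MIR), the AMX tile-zero pseudo handled below
// is reduced by dropping the shape/config operands that only existed to keep
// them alive through register allocation:
//   PTILEZEROV %tmm0, %row, %col, %tmmcfg
//     =>
//   TILEZERO  %tmm0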
INITIALIZE_PASS(X86ExpandPseudo, DEBUG_TYPE, X86_EXPAND_PSEUDO_NAME, false, false) void X86ExpandPseudo::ExpandICallBranchFunnel( MachineBasicBlock *MBB, MachineBasicBlock::iterator MBBI) { MachineBasicBlock *JTMBB = MBB; MachineInstr *JTInst = &*MBBI; MachineFunction *MF = MBB->getParent(); const BasicBlock *BB = MBB->getBasicBlock(); auto InsPt = MachineFunction::iterator(MBB); ++InsPt; std::vector> TargetMBBs; DebugLoc DL = JTInst->getDebugLoc(); MachineOperand Selector = JTInst->getOperand(0); const GlobalValue *CombinedGlobal = JTInst->getOperand(1).getGlobal(); auto CmpTarget = [&](unsigned Target) { if (Selector.isReg()) MBB->addLiveIn(Selector.getReg()); BuildMI(*MBB, MBBI, DL, TII->get(X86::LEA64r), X86::R11) .addReg(X86::RIP) .addImm(1) .addReg(0) .addGlobalAddress(CombinedGlobal, JTInst->getOperand(2 + 2 * Target).getImm()) .addReg(0); BuildMI(*MBB, MBBI, DL, TII->get(X86::CMP64rr)) .add(Selector) .addReg(X86::R11); }; auto CreateMBB = [&]() { auto *NewMBB = MF->CreateMachineBasicBlock(BB); MBB->addSuccessor(NewMBB); if (!MBB->isLiveIn(X86::EFLAGS)) MBB->addLiveIn(X86::EFLAGS); return NewMBB; }; auto EmitCondJump = [&](unsigned CC, MachineBasicBlock *ThenMBB) { BuildMI(*MBB, MBBI, DL, TII->get(X86::JCC_1)).addMBB(ThenMBB).addImm(CC); auto *ElseMBB = CreateMBB(); MF->insert(InsPt, ElseMBB); MBB = ElseMBB; MBBI = MBB->end(); }; auto EmitCondJumpTarget = [&](unsigned CC, unsigned Target) { auto *ThenMBB = CreateMBB(); TargetMBBs.push_back({ThenMBB, Target}); EmitCondJump(CC, ThenMBB); }; auto EmitTailCall = [&](unsigned Target) { BuildMI(*MBB, MBBI, DL, TII->get(X86::TAILJMPd64)) .add(JTInst->getOperand(3 + 2 * Target)); }; std::function EmitBranchFunnel = [&](unsigned FirstTarget, unsigned NumTargets) { if (NumTargets == 1) { EmitTailCall(FirstTarget); return; } if (NumTargets == 2) { CmpTarget(FirstTarget + 1); EmitCondJumpTarget(X86::COND_B, FirstTarget); EmitTailCall(FirstTarget + 1); return; } if (NumTargets < 6) { CmpTarget(FirstTarget + 1); EmitCondJumpTarget(X86::COND_B, FirstTarget); EmitCondJumpTarget(X86::COND_E, FirstTarget + 1); EmitBranchFunnel(FirstTarget + 2, NumTargets - 2); return; } auto *ThenMBB = CreateMBB(); CmpTarget(FirstTarget + (NumTargets / 2)); EmitCondJump(X86::COND_B, ThenMBB); EmitCondJumpTarget(X86::COND_E, FirstTarget + (NumTargets / 2)); EmitBranchFunnel(FirstTarget + (NumTargets / 2) + 1, NumTargets - (NumTargets / 2) - 1); MF->insert(InsPt, ThenMBB); MBB = ThenMBB; MBBI = MBB->end(); EmitBranchFunnel(FirstTarget, NumTargets / 2); }; EmitBranchFunnel(0, (JTInst->getNumOperands() - 2) / 2); for (auto P : TargetMBBs) { MF->insert(InsPt, P.first); BuildMI(P.first, DL, TII->get(X86::TAILJMPd64)) .add(JTInst->getOperand(3 + 2 * P.second)); } JTMBB->erase(JTInst); } /// If \p MBBI is a pseudo instruction, this method expands /// it to the corresponding (sequence of) actual instruction(s). /// \returns true if \p MBBI has been expanded. bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { MachineInstr &MI = *MBBI; unsigned Opcode = MI.getOpcode(); DebugLoc DL = MBBI->getDebugLoc(); switch (Opcode) { default: return false; case X86::TCRETURNdi: case X86::TCRETURNdicc: case X86::TCRETURNri: case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNdi64cc: case X86::TCRETURNri64: case X86::TCRETURNmi64: { bool isMem = Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64; MachineOperand &JumpTarget = MBBI->getOperand(0); MachineOperand &StackAdjust = MBBI->getOperand(isMem ? 
X86::AddrNumOperands : 1); assert(StackAdjust.isImm() && "Expecting immediate value."); // Adjust stack pointer. int StackAdj = StackAdjust.getImm(); int MaxTCDelta = X86FI->getTCReturnAddrDelta(); int Offset = 0; assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive"); // Incoporate the retaddr area. Offset = StackAdj - MaxTCDelta; assert(Offset >= 0 && "Offset should never be negative"); if (Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64cc) { assert(Offset == 0 && "Conditional tail call cannot adjust the stack."); } if (Offset) { // Check for possible merge with preceding ADD instruction. Offset += X86FL->mergeSPUpdates(MBB, MBBI, true); X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true); } // Jump to label or value in register. bool IsWin64 = STI->isTargetWin64(); if (Opcode == X86::TCRETURNdi || Opcode == X86::TCRETURNdicc || Opcode == X86::TCRETURNdi64 || Opcode == X86::TCRETURNdi64cc) { unsigned Op; switch (Opcode) { case X86::TCRETURNdi: Op = X86::TAILJMPd; break; case X86::TCRETURNdicc: Op = X86::TAILJMPd_CC; break; case X86::TCRETURNdi64cc: assert(!MBB.getParent()->hasWinCFI() && "Conditional tail calls confuse " "the Win64 unwinder."); Op = X86::TAILJMPd64_CC; break; default: // Note: Win64 uses REX prefixes indirect jumps out of functions, but // not direct ones. Op = X86::TAILJMPd64; break; } MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); if (JumpTarget.isGlobal()) { MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), JumpTarget.getTargetFlags()); } else { assert(JumpTarget.isSymbol()); MIB.addExternalSymbol(JumpTarget.getSymbolName(), JumpTarget.getTargetFlags()); } if (Op == X86::TAILJMPd_CC || Op == X86::TAILJMPd64_CC) { MIB.addImm(MBBI->getOperand(2).getImm()); } } else if (Opcode == X86::TCRETURNmi || Opcode == X86::TCRETURNmi64) { unsigned Op = (Opcode == X86::TCRETURNmi) ? X86::TAILJMPm : (IsWin64 ? X86::TAILJMPm64_REX : X86::TAILJMPm64); MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(Op)); for (unsigned i = 0; i != X86::AddrNumOperands; ++i) MIB.add(MBBI->getOperand(i)); } else if (Opcode == X86::TCRETURNri64) { JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(IsWin64 ? X86::TAILJMPr64_REX : X86::TAILJMPr64)) .add(JumpTarget); } else { JumpTarget.setIsKill(); BuildMI(MBB, MBBI, DL, TII->get(X86::TAILJMPr)) .add(JumpTarget); } MachineInstr &NewMI = *std::prev(MBBI); NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI); // Update the call site info. if (MBBI->isCandidateForCallSiteEntry()) MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI); // Delete the pseudo instruction TCRETURN. MBB.erase(MBBI); return true; } case X86::EH_RETURN: case X86::EH_RETURN64: { MachineOperand &DestAddr = MBBI->getOperand(0); assert(DestAddr.isReg() && "Offset should be in register!"); const bool Uses64BitFramePtr = STI->isTarget64BitLP64() || STI->isTargetNaCl64(); Register StackPtr = TRI->getStackRegister(); BuildMI(MBB, MBBI, DL, TII->get(Uses64BitFramePtr ? X86::MOV64rr : X86::MOV32rr), StackPtr) .addReg(DestAddr.getReg()); // The EH_RETURN pseudo is really removed during the MC Lowering. return true; } case X86::IRET: { // Adjust stack to erase error code int64_t StackAdj = MBBI->getOperand(0).getImm(); X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, true); // Replace pseudo with machine iret BuildMI(MBB, MBBI, DL, TII->get(STI->is64Bit() ? 
X86::IRET64 : X86::IRET32)); MBB.erase(MBBI); return true; } case X86::RET: { // Adjust stack to erase error code int64_t StackAdj = MBBI->getOperand(0).getImm(); MachineInstrBuilder MIB; if (StackAdj == 0) { MIB = BuildMI(MBB, MBBI, DL, TII->get(STI->is64Bit() ? X86::RETQ : X86::RETL)); } else if (isUInt<16>(StackAdj)) { MIB = BuildMI(MBB, MBBI, DL, TII->get(STI->is64Bit() ? X86::RETIQ : X86::RETIL)) .addImm(StackAdj); } else { assert(!STI->is64Bit() && "shouldn't need to do this for x86_64 targets!"); // A ret can only handle immediates as big as 2**16-1. If we need to pop // off bytes before the return address, we must do it manually. BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r)).addReg(X86::ECX, RegState::Define); X86FL->emitSPUpdate(MBB, MBBI, DL, StackAdj, /*InEpilogue=*/true); BuildMI(MBB, MBBI, DL, TII->get(X86::PUSH32r)).addReg(X86::ECX); MIB = BuildMI(MBB, MBBI, DL, TII->get(X86::RETL)); } for (unsigned I = 1, E = MBBI->getNumOperands(); I != E; ++I) MIB.add(MBBI->getOperand(I)); MBB.erase(MBBI); return true; } case X86::LCMPXCHG16B_SAVE_RBX: { // Perform the following transformation. // SaveRbx = pseudocmpxchg Addr, <4 opds for the address>, InArg, SaveRbx // => // RBX = InArg // actualcmpxchg Addr // RBX = SaveRbx const MachineOperand &InArg = MBBI->getOperand(6); Register SaveRbx = MBBI->getOperand(7).getReg(); // Copy the input argument of the pseudo into the argument of the // actual instruction. // NOTE: We don't copy the kill flag since the input might be the same reg // as one of the other operands of LCMPXCHG16B. TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, InArg.getReg(), false); // Create the actual instruction. MachineInstr *NewInstr = BuildMI(MBB, MBBI, DL, TII->get(X86::LCMPXCHG16B)); // Copy the operands related to the address. for (unsigned Idx = 1; Idx < 6; ++Idx) NewInstr->addOperand(MBBI->getOperand(Idx)); // Finally, restore the value of RBX. TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true); // Delete the pseudo. MBBI->eraseFromParent(); return true; } // Loading/storing mask pairs requires two kmov operations. The second one of // these needs a 2 byte displacement relative to the specified address (with // 32 bit spill size). The pairs of 1bit masks up to 16 bit masks all use the // same spill size, they all are stored using MASKPAIR16STORE, loaded using // MASKPAIR16LOAD. // // The displacement value might wrap around in theory, thus the asserts in // both cases. case X86::MASKPAIR16LOAD: { int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm(); assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); Register Reg = MBBI->getOperand(0).getReg(); bool DstIsDead = MBBI->getOperand(0).isDead(); Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0); Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1); auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm)) .addReg(Reg0, RegState::Define | getDeadRegState(DstIsDead)); auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm)) .addReg(Reg1, RegState::Define | getDeadRegState(DstIsDead)); for (int i = 0; i < X86::AddrNumOperands; ++i) { MIBLo.add(MBBI->getOperand(1 + i)); if (i == X86::AddrDisp) MIBHi.addImm(Disp + 2); else MIBHi.add(MBBI->getOperand(1 + i)); } // Split the memory operand, adjusting the offset and size for the halves. 
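// For example (schematic operands, not exact MIR), a mask pair reloaded from
// a spill slot at [rsp + 8] becomes two 16-bit mask loads:
//   k0 = KMOVWkm [rsp + 8]    ; low half of the pair
//   k1 = KMOVWkm [rsp + 10]   ; high half, at the +2 displacement added above
// and the single memory operand is split below into two 2-byte operands so
// each access describes exactly the bytes it touches.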
MachineMemOperand *OldMMO = MBBI->memoperands().front(); MachineFunction *MF = MBB.getParent(); MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2); MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2); MIBLo.setMemRefs(MMOLo); MIBHi.setMemRefs(MMOHi); // Delete the pseudo. MBB.erase(MBBI); return true; } case X86::MASKPAIR16STORE: { int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm(); assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement"); Register Reg = MBBI->getOperand(X86::AddrNumOperands).getReg(); bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill(); Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0); Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1); auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk)); auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk)); for (int i = 0; i < X86::AddrNumOperands; ++i) { MIBLo.add(MBBI->getOperand(i)); if (i == X86::AddrDisp) MIBHi.addImm(Disp + 2); else MIBHi.add(MBBI->getOperand(i)); } MIBLo.addReg(Reg0, getKillRegState(SrcIsKill)); MIBHi.addReg(Reg1, getKillRegState(SrcIsKill)); // Split the memory operand, adjusting the offset and size for the halves. MachineMemOperand *OldMMO = MBBI->memoperands().front(); MachineFunction *MF = MBB.getParent(); MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2); MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2); MIBLo.setMemRefs(MMOLo); MIBHi.setMemRefs(MMOHi); // Delete the pseudo. MBB.erase(MBBI); return true; } case X86::MWAITX_SAVE_RBX: { // Perform the following transformation. // SaveRbx = pseudomwaitx InArg, SaveRbx // => // [E|R]BX = InArg // actualmwaitx // [E|R]BX = SaveRbx const MachineOperand &InArg = MBBI->getOperand(1); // Copy the input argument of the pseudo into the argument of the // actual instruction. TII->copyPhysReg(MBB, MBBI, DL, X86::EBX, InArg.getReg(), InArg.isKill()); // Create the actual instruction. BuildMI(MBB, MBBI, DL, TII->get(X86::MWAITXrrr)); // Finally, restore the value of RBX. Register SaveRbx = MBBI->getOperand(2).getReg(); TII->copyPhysReg(MBB, MBBI, DL, X86::RBX, SaveRbx, /*SrcIsKill*/ true); // Delete the pseudo. MBBI->eraseFromParent(); return true; } case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; case X86::PLDTILECFG: { MI.RemoveOperand(0); MI.setDesc(TII->get(X86::LDTILECFG)); return true; } case X86::PSTTILECFG: { MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg MI.setDesc(TII->get(X86::STTILECFG)); return true; } case X86::PTILELOADDV: { MI.RemoveOperand(8); // Remove $tmmcfg for (unsigned i = 2; i > 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILELOADD)); return true; } case X86::PTDPBSSDV: { MI.RemoveOperand(7); // Remove $tmmcfg MI.untieRegOperand(4); for (unsigned i = 3; i > 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TDPBSSD)); MI.tieOperands(0, 1); return true; } case X86::PTILESTOREDV: { MI.RemoveOperand(8); // Remove $tmmcfg for (int i = 1; i >= 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILESTORED)); return true; } + case X86::PTILEZEROV: { + for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg + MI.RemoveOperand(i); + MI.setDesc(TII->get(X86::TILEZERO)); + return true; + } } llvm_unreachable("Previous switch has a fallthrough?"); } /// Expand all pseudo instructions contained in \p MBB. /// \returns true if any expansion occurred for \p MBB. 
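/// Note that ExpandMI erases the pseudo it expands, so the loop below
/// captures std::next(MBBI) before each call instead of relying on the
/// current iterator staying valid.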
bool X86ExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) { bool Modified = false; // MBBI may be invalidated by the expansion. MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); while (MBBI != E) { MachineBasicBlock::iterator NMBBI = std::next(MBBI); Modified |= ExpandMI(MBB, MBBI); MBBI = NMBBI; } return Modified; } bool X86ExpandPseudo::runOnMachineFunction(MachineFunction &MF) { STI = &static_cast(MF.getSubtarget()); TII = STI->getInstrInfo(); TRI = STI->getRegisterInfo(); X86FI = MF.getInfo(); X86FL = STI->getFrameLowering(); bool Modified = false; for (MachineBasicBlock &MBB : MF) Modified |= ExpandMBB(MBB); return Modified; } /// Returns an instance of the pseudo instruction expansion pass. FunctionPass *llvm::createX86ExpandPseudoPass() { return new X86ExpandPseudo(); } diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp index 883b6bfc145d..a96f73df855d 100644 --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -1,6007 +1,6019 @@ //===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file defines a DAG pattern matching instruction selector for X86, // converting from a legalized dag to a X86 dag. // //===----------------------------------------------------------------------===// #include "X86.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsX86.h" #include "llvm/IR/Type.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); static cl::opt AndImmShrink("x86-and-imm-shrink", cl::init(true), cl::desc("Enable setting constant bits to reduce size of mask immediates"), cl::Hidden); static cl::opt EnablePromoteAnyextLoad( "x86-promote-anyext-load", cl::init(true), cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden); extern cl::opt IndirectBranchTracking; //===----------------------------------------------------------------------===// // Pattern Matcher Implementation //===----------------------------------------------------------------------===// namespace { /// This corresponds to X86AddressMode, but uses SDValue's instead of register /// numbers for the leaves of the matched tree. struct X86ISelAddressMode { enum { RegBase, FrameIndexBase } BaseType; // This is really a union, discriminated by BaseType! SDValue Base_Reg; int Base_FrameIndex; unsigned Scale; SDValue IndexReg; int32_t Disp; SDValue Segment; const GlobalValue *GV; const Constant *CP; const BlockAddress *BlockAddr; const char *ES; MCSymbol *MCSym; int JT; Align Alignment; // CP alignment. 
unsigned char SymbolFlags; // X86II::MO_* bool NegateIndex = false; X86ISelAddressMode() : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0), Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr), MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {} bool hasSymbolicDisplacement() const { return GV != nullptr || CP != nullptr || ES != nullptr || MCSym != nullptr || JT != -1 || BlockAddr != nullptr; } bool hasBaseOrIndexReg() const { return BaseType == FrameIndexBase || IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr; } /// Return true if this addressing mode is already RIP-relative. bool isRIPRelative() const { if (BaseType != RegBase) return false; if (RegisterSDNode *RegNode = dyn_cast_or_null(Base_Reg.getNode())) return RegNode->getReg() == X86::RIP; return false; } void setBaseReg(SDValue Reg) { BaseType = RegBase; Base_Reg = Reg; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump(SelectionDAG *DAG = nullptr) { dbgs() << "X86ISelAddressMode " << this << '\n'; dbgs() << "Base_Reg "; if (Base_Reg.getNode()) Base_Reg.getNode()->dump(DAG); else dbgs() << "nul\n"; if (BaseType == FrameIndexBase) dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'; dbgs() << " Scale " << Scale << '\n' << "IndexReg "; if (NegateIndex) dbgs() << "negate "; if (IndexReg.getNode()) IndexReg.getNode()->dump(DAG); else dbgs() << "nul\n"; dbgs() << " Disp " << Disp << '\n' << "GV "; if (GV) GV->dump(); else dbgs() << "nul"; dbgs() << " CP "; if (CP) CP->dump(); else dbgs() << "nul"; dbgs() << '\n' << "ES "; if (ES) dbgs() << ES; else dbgs() << "nul"; dbgs() << " MCSym "; if (MCSym) dbgs() << MCSym; else dbgs() << "nul"; dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n'; } #endif }; } namespace { //===--------------------------------------------------------------------===// /// ISel - X86-specific code to select X86 machine instructions for /// SelectionDAG operations. /// class X86DAGToDAGISel final : public SelectionDAGISel { /// Keep a pointer to the X86Subtarget around so that we can /// make the right decision when generating code for different targets. const X86Subtarget *Subtarget; /// If true, selector should try to optimize for minimum code size. bool OptForMinSize; /// Disable direct TLS access through segment registers. bool IndirectTlsSegRefs; public: explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel) : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForMinSize(false), IndirectTlsSegRefs(false) {} StringRef getPassName() const override { return "X86 DAG->DAG Instruction Selection"; } bool runOnMachineFunction(MachineFunction &MF) override { // Reset the subtarget each time through. Subtarget = &MF.getSubtarget(); IndirectTlsSegRefs = MF.getFunction().hasFnAttribute( "indirect-tls-seg-refs"); // OptFor[Min]Size are used in pattern predicates that isel is matching. OptForMinSize = MF.getFunction().hasMinSize(); assert((!OptForMinSize || MF.getFunction().hasOptSize()) && "OptForMinSize implies OptForSize"); SelectionDAGISel::runOnMachineFunction(MF); return true; } void emitFunctionEntryCode() override; bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override; void PreprocessISelDAG() override; void PostprocessISelDAG() override; // Include the pieces autogenerated from the target description. 
#include "X86GenDAGISel.inc" private: void Select(SDNode *N) override; bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM); bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, bool AllowSegmentRegForX32 = false); bool matchWrapper(SDValue N, X86ISelAddressMode &AM); bool matchAddress(SDValue N, X86ISelAddressMode &AM); bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); bool matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressBase(SDValue N, X86ISelAddressMode &AM); bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, SDValue ScaleOp, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectMOV64Imm32(SDValue N, SDValue &Imm); bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool selectRelocImm(SDValue N, SDValue &Op); bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); // Convenience method where P is also root. bool tryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); } bool tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); bool isProfitableToFormMaskedOp(SDNode *N) const; /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) override; void emitSpecialCodeForMain(); inline void getAddressOperands(X86ISelAddressMode &AM, const SDLoc &DL, MVT VT, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) Base = CurDAG->getTargetFrameIndex( AM.Base_FrameIndex, TLI->getPointerTy(CurDAG->getDataLayout())); else if (AM.Base_Reg.getNode()) Base = AM.Base_Reg; else Base = CurDAG->getRegister(0, VT); Scale = getI8Imm(AM.Scale, DL); // Negate the index if needed. if (AM.NegateIndex) { unsigned NegOpc = VT == MVT::i64 ? X86::NEG64r : X86::NEG32r; SDValue Neg = SDValue(CurDAG->getMachineNode(NegOpc, DL, VT, MVT::i32, AM.IndexReg), 0); AM.IndexReg = Neg; } if (AM.IndexReg.getNode()) Index = AM.IndexReg; else Index = CurDAG->getRegister(0, VT); // These are 32-bit even in 64-bit mode since RIP-relative offset // is 32-bit. 
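// (For example, "lea foo(%rip), %rax" encodes foo as a signed 32-bit
// displacement, which is why every symbolic operand below is built with
// MVT::i32.)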
if (AM.GV) Disp = CurDAG->getTargetGlobalAddress(AM.GV, SDLoc(), MVT::i32, AM.Disp, AM.SymbolFlags); else if (AM.CP) Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment, AM.Disp, AM.SymbolFlags); else if (AM.ES) { assert(!AM.Disp && "Non-zero displacement is ignored with ES."); Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags); } else if (AM.MCSym) { assert(!AM.Disp && "Non-zero displacement is ignored with MCSym."); assert(AM.SymbolFlags == 0 && "oo"); Disp = CurDAG->getMCSymbol(AM.MCSym, MVT::i32); } else if (AM.JT != -1) { assert(!AM.Disp && "Non-zero displacement is ignored with JT."); Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags); } else if (AM.BlockAddr) Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp, AM.SymbolFlags); else Disp = CurDAG->getTargetConstant(AM.Disp, DL, MVT::i32); if (AM.Segment.getNode()) Segment = AM.Segment; else Segment = CurDAG->getRegister(0, MVT::i16); } // Utility function to determine whether we should avoid selecting // immediate forms of instructions for better code size or not. // At a high level, we'd like to avoid such instructions when // we have similar constants used within the same basic block // that can be kept in a register. // bool shouldAvoidImmediateInstFormsForSize(SDNode *N) const { uint32_t UseCount = 0; // Do not want to hoist if we're not optimizing for size. // TODO: We'd like to remove this restriction. // See the comment in X86InstrInfo.td for more info. if (!CurDAG->shouldOptForSize()) return false; // Walk all the users of the immediate. for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); (UI != UE) && (UseCount < 2); ++UI) { SDNode *User = *UI; // This user is already selected. Count it as a legitimate use and // move on. if (User->isMachineOpcode()) { UseCount++; continue; } // We want to count stores of immediates as real uses. if (User->getOpcode() == ISD::STORE && User->getOperand(1).getNode() == N) { UseCount++; continue; } // We don't currently match users that have > 2 operands (except // for stores, which are handled above) // Those instruction won't match in ISEL, for now, and would // be counted incorrectly. // This may change in the future as we add additional instruction // types. if (User->getNumOperands() != 2) continue; // If this is a sign-extended 8-bit integer immediate used in an ALU // instruction, there is probably an opcode encoding to save space. auto *C = dyn_cast(N); if (C && isInt<8>(C->getSExtValue())) continue; // Immediates that are used for offsets as part of stack // manipulation should be left alone. These are typically // used to indicate SP offsets for argument passing and // will get pulled into stores/pushes (implicitly). if (User->getOpcode() == X86ISD::ADD || User->getOpcode() == ISD::ADD || User->getOpcode() == X86ISD::SUB || User->getOpcode() == ISD::SUB) { // Find the other operand of the add/sub. SDValue OtherOp = User->getOperand(0); if (OtherOp.getNode() == N) OtherOp = User->getOperand(1); // Don't count if the other operand is SP. RegisterSDNode *RegNode; if (OtherOp->getOpcode() == ISD::CopyFromReg && (RegNode = dyn_cast_or_null( OtherOp->getOperand(1).getNode()))) if ((RegNode->getReg() == X86::ESP) || (RegNode->getReg() == X86::RSP)) continue; } // ... otherwise, count this and move on. UseCount++; } // If we have more than 1 use, then recommend for hoisting. return (UseCount > 1); } /// Return a target constant with the specified value of type i8. 
inline SDValue getI8Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i8); } /// Return a target constant with the specified value, of type i32. inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } /// Return a target constant with the specified value, of type i64. inline SDValue getI64Imm(uint64_t Imm, const SDLoc &DL) { return CurDAG->getTargetConstant(Imm, DL, MVT::i64); } SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth, const SDLoc &DL) { assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); uint64_t Index = N->getConstantOperandVal(1); MVT VecVT = N->getOperand(0).getSimpleValueType(); return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); } SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, const SDLoc &DL) { assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); uint64_t Index = N->getConstantOperandVal(2); MVT VecVT = N->getSimpleValueType(0); return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); } // Helper to detect unneeded and instructions on shift amounts. Called // from PatFrags in tablegen. bool isUnneededShiftMask(SDNode *N, unsigned Width) const { assert(N->getOpcode() == ISD::AND && "Unexpected opcode"); const APInt &Val = cast(N->getOperand(1))->getAPIntValue(); if (Val.countTrailingOnes() >= Width) return true; APInt Mask = Val | CurDAG->computeKnownBits(N->getOperand(0)).Zero; return Mask.countTrailingOnes() >= Width; } /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. SDNode *getGlobalBaseReg(); /// Return a reference to the TargetMachine, casted to the target-specific /// type. const X86TargetMachine &getTargetMachine() const { return static_cast(TM); } /// Return a reference to the TargetInstrInfo, casted to the target-specific /// type. const X86InstrInfo *getInstrInfo() const { return Subtarget->getInstrInfo(); } /// Address-mode matching performs shift-of-and to and-of-shift /// reassociation in order to expose more scaled addressing /// opportunities. bool ComplexPatternFuncMutatesDAG() const override { return true; } bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const; // Indicates we should prefer to use a non-temporal load for this load. 
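// (In practice this asks whether a non-temporal MOVNTDQA load of the access
// size is available: 16 bytes needs SSE4.1, 32 bytes AVX2 and 64 bytes
// AVX-512, as checked below.)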
bool useNonTemporalLoad(LoadSDNode *N) const { if (!N->isNonTemporal()) return false; unsigned StoreSize = N->getMemoryVT().getStoreSize(); if (N->getAlignment() < StoreSize) return false; switch (StoreSize) { default: llvm_unreachable("Unsupported store size"); case 4: case 8: return false; case 16: return Subtarget->hasSSE41(); case 32: return Subtarget->hasAVX2(); case 64: return Subtarget->hasAVX512(); } } bool foldLoadStoreIntoMemOperand(SDNode *Node); MachineSDNode *matchBEXTRFromAndImm(SDNode *Node); bool matchBitExtract(SDNode *Node); bool shrinkAndImmediate(SDNode *N); bool isMaskZeroExtended(SDNode *N) const; bool tryShiftAmountMod(SDNode *N); bool tryShrinkShlLogicImm(SDNode *N); bool tryVPTERNLOG(SDNode *N); bool matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC, SDValue A, SDValue B, SDValue C, uint8_t Imm); bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask); bool tryMatchBitSelect(SDNode *N); MachineSDNode *emitPCMPISTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node); MachineSDNode *emitPCMPESTR(unsigned ROpc, unsigned MOpc, bool MayFoldLoad, const SDLoc &dl, MVT VT, SDNode *Node, SDValue &InFlag); bool tryOptimizeRem8Extend(SDNode *N); bool onlyUsesZeroFlag(SDValue Flags) const; bool hasNoSignFlagUses(SDValue Flags) const; bool hasNoCarryFlagUses(SDValue Flags) const; }; } // Returns true if this masked compare can be implemented legally with this // type. static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { unsigned Opcode = N->getOpcode(); if (Opcode == X86ISD::CMPM || Opcode == X86ISD::CMPMM || Opcode == X86ISD::STRICT_CMPM || Opcode == ISD::SETCC || Opcode == X86ISD::CMPMM_SAE || Opcode == X86ISD::VFPCLASS) { // We can get 256-bit 8 element types here without VLX being enabled. When // this happens we will use 512-bit operations and the mask will not be // zero extended. EVT OpVT = N->getOperand(0).getValueType(); // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the // second operand. if (Opcode == X86ISD::STRICT_CMPM) OpVT = N->getOperand(1).getValueType(); if (OpVT.is256BitVector() || OpVT.is128BitVector()) return Subtarget->hasVLX(); return true; } // Scalar opcodes use 128 bit registers, but aren't subject to the VLX check. if (Opcode == X86ISD::VFPCLASSS || Opcode == X86ISD::FSETCCM || Opcode == X86ISD::FSETCCM_SAE) return true; return false; } // Returns true if we can assume the writer of the mask has zero extended it // for us. bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { // If this is an AND, check if we have a compare on either side. As long as // one side guarantees the mask is zero extended, the AND will preserve those // zeros. if (N->getOpcode() == ISD::AND) return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) || isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget); return isLegalMaskCompare(N, Subtarget); } bool X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { if (OptLevel == CodeGenOpt::None) return false; if (!N.hasOneUse()) return false; if (N.getOpcode() != ISD::LOAD) return true; // Don't fold non-temporal loads if we have an instruction for them. if (useNonTemporalLoad(cast(N))) return false; // If N is a load, do additional profitability checks. 
if (U == Root) { switch (U->getOpcode()) { default: break; case X86ISD::ADD: case X86ISD::ADC: case X86ISD::SUB: case X86ISD::SBB: case X86ISD::AND: case X86ISD::XOR: case X86ISD::OR: case ISD::ADD: case ISD::ADDCARRY: case ISD::AND: case ISD::OR: case ISD::XOR: { SDValue Op1 = U->getOperand(1); // If the other operand is a 8-bit immediate we should fold the immediate // instead. This reduces code size. // e.g. // movl 4(%esp), %eax // addl $4, %eax // vs. // movl $4, %eax // addl 4(%esp), %eax // The former is 2 bytes shorter. In case where the increment is 1, then // the saving can be 4 bytes (by using incl %eax). if (ConstantSDNode *Imm = dyn_cast(Op1)) { if (Imm->getAPIntValue().isSignedIntN(8)) return false; // If this is a 64-bit AND with an immediate that fits in 32-bits, // prefer using the smaller and over folding the load. This is needed to // make sure immediates created by shrinkAndImmediate are always folded. // Ideally we would narrow the load during DAG combine and get the // best of both worlds. if (U->getOpcode() == ISD::AND && Imm->getAPIntValue().getBitWidth() == 64 && Imm->getAPIntValue().isSignedIntN(32)) return false; // If this really a zext_inreg that can be represented with a movzx // instruction, prefer that. // TODO: We could shrink the load and fold if it is non-volatile. if (U->getOpcode() == ISD::AND && (Imm->getAPIntValue() == UINT8_MAX || Imm->getAPIntValue() == UINT16_MAX || Imm->getAPIntValue() == UINT32_MAX)) return false; // ADD/SUB with can negate the immediate and use the opposite operation // to fit 128 into a sign extended 8 bit immediate. if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) && (-Imm->getAPIntValue()).isSignedIntN(8)) return false; if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) && (-Imm->getAPIntValue()).isSignedIntN(8) && hasNoCarryFlagUses(SDValue(U, 1))) return false; } // If the other operand is a TLS address, we should fold it instead. // This produces // movl %gs:0, %eax // leal i@NTPOFF(%eax), %eax // instead of // movl $i@NTPOFF, %eax // addl %gs:0, %eax // if the block also has an access to a second TLS address this will save // a load. // FIXME: This is probably also true for non-TLS addresses. if (Op1.getOpcode() == X86ISD::Wrapper) { SDValue Val = Op1.getOperand(0); if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) return false; } // Don't fold load if this matches the BTS/BTR/BTC patterns. // BTS: (or X, (shl 1, n)) // BTR: (and X, (rotl -2, n)) // BTC: (xor X, (shl 1, n)) if (U->getOpcode() == ISD::OR || U->getOpcode() == ISD::XOR) { if (U->getOperand(0).getOpcode() == ISD::SHL && isOneConstant(U->getOperand(0).getOperand(0))) return false; if (U->getOperand(1).getOpcode() == ISD::SHL && isOneConstant(U->getOperand(1).getOperand(0))) return false; } if (U->getOpcode() == ISD::AND) { SDValue U0 = U->getOperand(0); SDValue U1 = U->getOperand(1); if (U0.getOpcode() == ISD::ROTL) { auto *C = dyn_cast(U0.getOperand(0)); if (C && C->getSExtValue() == -2) return false; } if (U1.getOpcode() == ISD::ROTL) { auto *C = dyn_cast(U1.getOperand(0)); if (C && C->getSExtValue() == -2) return false; } } break; } case ISD::SHL: case ISD::SRA: case ISD::SRL: // Don't fold a load into a shift by immediate. The BMI2 instructions // support folding a load, but not an immediate. The legacy instructions // support folding an immediate, but can't fold a load. Folding an // immediate is preferable to folding a load. 
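// Illustrative example: BMI2 "shlx %ecx, (%rdi), %eax" folds the load but
// takes its shift count in a register, while legacy "shll $5, %eax" takes an
// immediate count but cannot also fold a load into a register result.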
if (isa(U->getOperand(1))) return false; break; } } // Prevent folding a load if this can implemented with an insert_subreg or // a move that implicitly zeroes. if (Root->getOpcode() == ISD::INSERT_SUBVECTOR && isNullConstant(Root->getOperand(2)) && (Root->getOperand(0).isUndef() || ISD::isBuildVectorAllZeros(Root->getOperand(0).getNode()))) return false; return true; } // Indicates it is profitable to form an AVX512 masked operation. Returning // false will favor a masked register-register masked move or vblendm and the // operation will be selected separately. bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const { assert( (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) && "Unexpected opcode!"); // If the operation has additional users, the operation will be duplicated. // Check the use count to prevent that. // FIXME: Are there cheap opcodes we might want to duplicate? return N->getOperand(1).hasOneUse(); } /// Replace the original chain operand of the call with /// load's chain operand and move load below the call's chain operand. static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load, SDValue Call, SDValue OrigChain) { SmallVector Ops; SDValue Chain = OrigChain.getOperand(0); if (Chain.getNode() == Load.getNode()) Ops.push_back(Load.getOperand(0)); else { assert(Chain.getOpcode() == ISD::TokenFactor && "Unexpected chain operand"); for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) if (Chain.getOperand(i).getNode() == Load.getNode()) Ops.push_back(Load.getOperand(0)); else Ops.push_back(Chain.getOperand(i)); SDValue NewChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops); Ops.clear(); Ops.push_back(NewChain); } Ops.append(OrigChain->op_begin() + 1, OrigChain->op_end()); CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops); CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0), Load.getOperand(1), Load.getOperand(2)); Ops.clear(); Ops.push_back(SDValue(Load.getNode(), 1)); Ops.append(Call->op_begin() + 1, Call->op_end()); CurDAG->UpdateNodeOperands(Call.getNode(), Ops); } /// Return true if call address is a load and it can be /// moved below CALLSEQ_START and the chains leading up to the call. /// Return the CALLSEQ_START by reference as a second output. /// In the case of a tail call, there isn't a callseq node between the call /// chain and the load. static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { // The transformation is somewhat dangerous if the call's chain was glued to // the call. After MoveBelowOrigChain the load is moved between the call and // the chain, this can create a cycle if the load is not folded. So it is // *really* important that we are sure the load will be folded. if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse()) return false; LoadSDNode *LD = dyn_cast(Callee.getNode()); if (!LD || !LD->isSimple() || LD->getAddressingMode() != ISD::UNINDEXED || LD->getExtensionType() != ISD::NON_EXTLOAD) return false; // Now let's find the callseq_start. while (HasCallSeq && Chain.getOpcode() != ISD::CALLSEQ_START) { if (!Chain.hasOneUse()) return false; Chain = Chain.getOperand(0); } if (!Chain.getNumOperands()) return false; // Since we are not checking for AA here, conservatively abort if the chain // writes to memory. It's not safe to move the callee (a load) across a store. 
if (isa(Chain.getNode()) && cast(Chain.getNode())->writeMem()) return false; if (Chain.getOperand(0).getNode() == Callee.getNode()) return true; if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor && Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) && Callee.getValue(1).hasOneUse()) return true; return false; } static bool isEndbrImm64(uint64_t Imm) { // There may be some other prefix bytes between 0xF3 and 0x0F1EFA. // i.g: 0xF3660F1EFA, 0xF3670F1EFA if ((Imm & 0x00FFFFFF) != 0x0F1EFA) return false; uint8_t OptionalPrefixBytes [] = {0x26, 0x2e, 0x36, 0x3e, 0x64, 0x65, 0x66, 0x67, 0xf0, 0xf2}; int i = 24; // 24bit 0x0F1EFA has matched while (i < 64) { uint8_t Byte = (Imm >> i) & 0xFF; if (Byte == 0xF3) return true; if (!llvm::is_contained(OptionalPrefixBytes, Byte)) return false; i += 8; } return false; } void X86DAGToDAGISel::PreprocessISelDAG() { bool MadeChange = false; for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), E = CurDAG->allnodes_end(); I != E; ) { SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues. // This is for CET enhancement. // // ENDBR32 and ENDBR64 have specific opcodes: // ENDBR32: F3 0F 1E FB // ENDBR64: F3 0F 1E FA // And we want that attackers won’t find unintended ENDBR32/64 // opcode matches in the binary // Here’s an example: // If the compiler had to generate asm for the following code: // a = 0xF30F1EFA // it could, for example, generate: // mov 0xF30F1EFA, dword ptr[a] // In such a case, the binary would include a gadget that starts // with a fake ENDBR64 opcode. Therefore, we split such generation // into multiple operations, let it not shows in the binary if (N->getOpcode() == ISD::Constant) { MVT VT = N->getSimpleValueType(0); int64_t Imm = cast(N)->getSExtValue(); int32_t EndbrImm = Subtarget->is64Bit() ? 0xF30F1EFA : 0xF30F1EFB; if (Imm == EndbrImm || isEndbrImm64(Imm)) { // Check that the cf-protection-branch is enabled. Metadata *CFProtectionBranch = MF->getMMI().getModule()->getModuleFlag("cf-protection-branch"); if (CFProtectionBranch || IndirectBranchTracking) { SDLoc dl(N); SDValue Complement = CurDAG->getConstant(~Imm, dl, VT, false, true); Complement = CurDAG->getNOT(dl, Complement, VT); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Complement); ++I; MadeChange = true; continue; } } } // If this is a target specific AND node with no flag usages, turn it back // into ISD::AND to enable test instruction matching. if (N->getOpcode() == X86ISD::AND && !N->hasAnyUseOfValue(1)) { SDValue Res = CurDAG->getNode(ISD::AND, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; MadeChange = true; continue; } /// Convert vector increment or decrement to sub/add with an all-ones /// constant: /// add X, <1, 1...> --> sub X, <-1, -1...> /// sub X, <1, 1...> --> add X, <-1, -1...> /// The all-ones vector constant can be materialized using a pcmpeq /// instruction that is commonly recognized as an idiom (has no register /// dependency), so that's better/smaller than loading a splat 1 constant. 
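/// Illustrative lowering of "add X, <1,1,1,1>" after this rewrite:
///   pcmpeqd %xmm1, %xmm1    # materialize all-ones without a memory load
///   psubd   %xmm1, %xmm0    # X - (-1) == X + 1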
if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) && N->getSimpleValueType(0).isVector()) { APInt SplatVal; if (X86::isConstantSplat(N->getOperand(1), SplatVal) && SplatVal.isOneValue()) { SDLoc DL(N); MVT VT = N->getSimpleValueType(0); unsigned NumElts = VT.getSizeInBits() / 32; SDValue AllOnes = CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts)); AllOnes = CurDAG->getBitcast(VT, AllOnes); unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD; SDValue Res = CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes); --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; MadeChange = true; continue; } } switch (N->getOpcode()) { case X86ISD::VBROADCAST: { MVT VT = N->getSimpleValueType(0); // Emulate v32i16/v64i8 broadcast without BWI. if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; SDLoc dl(N); SDValue NarrowBCast = CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0)); SDValue Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); unsigned Index = VT == MVT::v32i16 ? 16 : 32; Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, CurDAG->getIntPtrConstant(Index, dl)); --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; MadeChange = true; continue; } break; } case X86ISD::VBROADCAST_LOAD: { MVT VT = N->getSimpleValueType(0); // Emulate v32i16/v64i8 broadcast without BWI. if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) { MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8; auto *MemNode = cast(N); SDLoc dl(N); SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other); SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()}; SDValue NarrowBCast = CurDAG->getMemIntrinsicNode( X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(), MemNode->getMemOperand()); SDValue Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT), NarrowBCast, CurDAG->getIntPtrConstant(0, dl)); unsigned Index = VT == MVT::v32i16 ? 16 : 32; Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast, CurDAG->getIntPtrConstant(Index, dl)); --I; SDValue To[] = {Res, NarrowBCast.getValue(1)}; CurDAG->ReplaceAllUsesWith(N, To); ++I; MadeChange = true; continue; } break; } case ISD::VSELECT: { // Replace VSELECT with non-mask conditions with with BLENDV. if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1) break; assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!"); SDValue Blendv = CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1), N->getOperand(2)); --I; CurDAG->ReplaceAllUsesWith(N, Blendv.getNode()); ++I; MadeChange = true; continue; } case ISD::FP_ROUND: case ISD::STRICT_FP_ROUND: case ISD::FP_TO_SINT: case ISD::FP_TO_UINT: case ISD::STRICT_FP_TO_SINT: case ISD::STRICT_FP_TO_UINT: { // Replace vector fp_to_s/uint with their X86 specific equivalent so we // don't need 2 sets of patterns. 
if (!N->getSimpleValueType(0).isVector()) break; unsigned NewOpc; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break; case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break; case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break; case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break; case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break; case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break; } SDValue Res; if (N->isStrictFPOpcode()) Res = CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other}, {N->getOperand(0), N->getOperand(1)}); else Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), N->getOperand(0)); --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; MadeChange = true; continue; } case ISD::SHL: case ISD::SRA: case ISD::SRL: { // Replace vector shifts with their X86 specific equivalent so we don't // need 2 sets of patterns. if (!N->getValueType(0).isVector()) break; unsigned NewOpc; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case ISD::SHL: NewOpc = X86ISD::VSHLV; break; case ISD::SRA: NewOpc = X86ISD::VSRAV; break; case ISD::SRL: NewOpc = X86ISD::VSRLV; break; } SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), N->getOperand(0), N->getOperand(1)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; MadeChange = true; continue; } case ISD::ANY_EXTEND: case ISD::ANY_EXTEND_VECTOR_INREG: { // Replace vector any extend with the zero extend equivalents so we don't // need 2 sets of patterns. Ignore vXi1 extensions. if (!N->getValueType(0).isVector()) break; unsigned NewOpc; if (N->getOperand(0).getScalarValueSizeInBits() == 1) { assert(N->getOpcode() == ISD::ANY_EXTEND && "Unexpected opcode for mask vector!"); NewOpc = ISD::SIGN_EXTEND; } else { NewOpc = N->getOpcode() == ISD::ANY_EXTEND ? ISD::ZERO_EXTEND : ISD::ZERO_EXTEND_VECTOR_INREG; } SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0), N->getOperand(0)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; MadeChange = true; continue; } case ISD::FCEIL: case ISD::STRICT_FCEIL: case ISD::FFLOOR: case ISD::STRICT_FFLOOR: case ISD::FTRUNC: case ISD::STRICT_FTRUNC: case ISD::FROUNDEVEN: case ISD::STRICT_FROUNDEVEN: case ISD::FNEARBYINT: case ISD::STRICT_FNEARBYINT: case ISD::FRINT: case ISD::STRICT_FRINT: { // Replace fp rounding with their X86 specific equivalent so we don't // need 2 sets of patterns. 
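// The immediate chosen below is the SSE4.1/AVX-512 rounding-control field:
// bits 1:0 select the mode (00 nearest-even, 01 toward -inf, 10 toward +inf,
// 11 toward zero), bit 2 (0x4) means "use the current MXCSR rounding mode",
// and bit 3 (0x8) suppresses precision (inexact) exceptions. That gives 0xA
// for ceil, 0x9 for floor, 0xB for trunc, 0x8 for roundeven, 0xC for
// nearbyint and 0x4 for rint.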
unsigned Imm; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case ISD::STRICT_FCEIL: case ISD::FCEIL: Imm = 0xA; break; case ISD::STRICT_FFLOOR: case ISD::FFLOOR: Imm = 0x9; break; case ISD::STRICT_FTRUNC: case ISD::FTRUNC: Imm = 0xB; break; case ISD::STRICT_FROUNDEVEN: case ISD::FROUNDEVEN: Imm = 0x8; break; case ISD::STRICT_FNEARBYINT: case ISD::FNEARBYINT: Imm = 0xC; break; case ISD::STRICT_FRINT: case ISD::FRINT: Imm = 0x4; break; } SDLoc dl(N); bool IsStrict = N->isStrictFPOpcode(); SDValue Res; if (IsStrict) Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl, {N->getValueType(0), MVT::Other}, {N->getOperand(0), N->getOperand(1), CurDAG->getTargetConstant(Imm, dl, MVT::i32)}); else Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0), CurDAG->getTargetConstant(Imm, dl, MVT::i32)); --I; CurDAG->ReplaceAllUsesWith(N, Res.getNode()); ++I; MadeChange = true; continue; } case X86ISD::FANDN: case X86ISD::FAND: case X86ISD::FOR: case X86ISD::FXOR: { // Widen scalar fp logic ops to vector to reduce isel patterns. // FIXME: Can we do this during lowering/combine. MVT VT = N->getSimpleValueType(0); if (VT.isVector() || VT == MVT::f128) break; MVT VecVT = VT == MVT::f64 ? MVT::v2f64 : MVT::v4f32; SDLoc dl(N); SDValue Op0 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N->getOperand(0)); SDValue Op1 = CurDAG->getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N->getOperand(1)); SDValue Res; if (Subtarget->hasSSE2()) { EVT IntVT = EVT(VecVT).changeVectorElementTypeToInteger(); Op0 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op0); Op1 = CurDAG->getNode(ISD::BITCAST, dl, IntVT, Op1); unsigned Opc; switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case X86ISD::FANDN: Opc = X86ISD::ANDNP; break; case X86ISD::FAND: Opc = ISD::AND; break; case X86ISD::FOR: Opc = ISD::OR; break; case X86ISD::FXOR: Opc = ISD::XOR; break; } Res = CurDAG->getNode(Opc, dl, IntVT, Op0, Op1); Res = CurDAG->getNode(ISD::BITCAST, dl, VecVT, Res); } else { Res = CurDAG->getNode(N->getOpcode(), dl, VecVT, Op0, Op1); } Res = CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, CurDAG->getIntPtrConstant(0, dl)); --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res); ++I; MadeChange = true; continue; } } if (OptLevel != CodeGenOpt::None && // Only do this when the target can fold the load into the call or // jmp. !Subtarget->useIndirectThunkCalls() && ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || (N->getOpcode() == X86ISD::TC_RETURN && (Subtarget->is64Bit() || !getTargetMachine().isPositionIndependent())))) { /// Also try moving call address load from outside callseq_start to just /// before the call to allow it to be folded. /// /// [Load chain] /// ^ /// | /// [Load] /// ^ ^ /// | | /// / \-- /// / | ///[CALLSEQ_START] | /// ^ | /// | | /// [LOAD/C2Reg] | /// | | /// \ / /// \ / /// [CALL] bool HasCallSeq = N->getOpcode() == X86ISD::CALL; SDValue Chain = N->getOperand(0); SDValue Load = N->getOperand(1); if (!isCalleeLoad(Load, Chain, HasCallSeq)) continue; moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain); ++NumLoadMoved; MadeChange = true; continue; } // Lower fpround and fpextend nodes that target the FP stack to be store and // load to the stack. This is a gross hack. We would like to simply mark // these as being illegal, but when we do that, legalize produces these when // it expands calls, then expands these in the same legalize pass. 
We would // like dag combine to be able to hack on these between the call expansion // and the node legalization. As such this pass basically does "really // late" legalization of these inline with the X86 isel pass. // FIXME: This should only happen when not compiled with -O0. switch (N->getOpcode()) { default: continue; case ISD::FP_ROUND: case ISD::FP_EXTEND: { MVT SrcVT = N->getOperand(0).getSimpleValueType(); MVT DstVT = N->getSimpleValueType(0); // If any of the sources are vectors, no fp stack involved. if (SrcVT.isVector() || DstVT.isVector()) continue; // If the source and destination are SSE registers, then this is a legal // conversion that should not be lowered. const X86TargetLowering *X86Lowering = static_cast(TLI); bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) continue; if (!SrcIsSSE && !DstIsSSE) { // If this is an FPStack extension, it is a noop. if (N->getOpcode() == ISD::FP_EXTEND) continue; // If this is a value-preserving FPStack truncation, it is a noop. if (N->getConstantOperandVal(1)) continue; } // Here we could have an FP stack truncation or an FPStack <-> SSE convert. // FPStack has extload and truncstore. SSE can fold direct loads into other // operations. Based on this, decide what we want to do. MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); int SPFI = cast(MemTmp)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); SDLoc dl(N); // FIXME: optimize the case where the src/dest is a load or store? SDValue Store = CurDAG->getTruncStore( CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT); SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp, MPI, MemVT); // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the // extload we created. This will cause general havok on the dag because // anything below the conversion could be folded into other existing nodes. // To avoid invalidating 'I', back it up to the convert node. --I; CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result); break; } //The sequence of events for lowering STRICT_FP versions of these nodes requires //dealing with the chain differently, as there is already a preexisting chain. case ISD::STRICT_FP_ROUND: case ISD::STRICT_FP_EXTEND: { MVT SrcVT = N->getOperand(1).getSimpleValueType(); MVT DstVT = N->getSimpleValueType(0); // If any of the sources are vectors, no fp stack involved. if (SrcVT.isVector() || DstVT.isVector()) continue; // If the source and destination are SSE registers, then this is a legal // conversion that should not be lowered. const X86TargetLowering *X86Lowering = static_cast(TLI); bool SrcIsSSE = X86Lowering->isScalarFPTypeInSSEReg(SrcVT); bool DstIsSSE = X86Lowering->isScalarFPTypeInSSEReg(DstVT); if (SrcIsSSE && DstIsSSE) continue; if (!SrcIsSSE && !DstIsSSE) { // If this is an FPStack extension, it is a noop. if (N->getOpcode() == ISD::STRICT_FP_EXTEND) continue; // If this is a value-preserving FPStack truncation, it is a noop. if (N->getConstantOperandVal(2)) continue; } // Here we could have an FP stack truncation or an FPStack <-> SSE convert. // FPStack has extload and truncstore. SSE can fold direct loads into other // operations. Based on this, decide what we want to do. MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? 
DstVT : SrcVT; SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT); int SPFI = cast(MemTmp)->getIndex(); MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI); SDLoc dl(N); // FIXME: optimize the case where the src/dest is a load or store? //Since the operation is StrictFP, use the preexisting chain. SDValue Store, Result; if (!SrcIsSSE) { SDVTList VTs = CurDAG->getVTList(MVT::Other); SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp}; Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT, MPI, /*Align*/ None, MachineMemOperand::MOStore); if (N->getFlags().hasNoFPExcept()) { SDNodeFlags Flags = Store->getFlags(); Flags.setNoFPExcept(true); Store->setFlags(Flags); } } else { assert(SrcVT == MemVT && "Unexpected VT!"); Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp, MPI); } if (!DstIsSSE) { SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other); SDValue Ops[] = {Store, MemTmp}; Result = CurDAG->getMemIntrinsicNode( X86ISD::FLD, dl, VTs, Ops, MemVT, MPI, /*Align*/ None, MachineMemOperand::MOLoad); if (N->getFlags().hasNoFPExcept()) { SDNodeFlags Flags = Result->getFlags(); Flags.setNoFPExcept(true); Result->setFlags(Flags); } } else { assert(DstVT == MemVT && "Unexpected VT!"); Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI); } // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the // extload we created. This will cause general havok on the dag because // anything below the conversion could be folded into other existing nodes. // To avoid invalidating 'I', back it up to the convert node. --I; CurDAG->ReplaceAllUsesWith(N, Result.getNode()); break; } } // Now that we did that, the node is dead. Increment the iterator to the // next node to process, then delete N. ++I; MadeChange = true; } // Remove any dead nodes that may have been left behind. if (MadeChange) CurDAG->RemoveDeadNodes(); } // Look for a redundant movzx/movsx that can occur after an 8-bit divrem. bool X86DAGToDAGISel::tryOptimizeRem8Extend(SDNode *N) { unsigned Opc = N->getMachineOpcode(); if (Opc != X86::MOVZX32rr8 && Opc != X86::MOVSX32rr8 && Opc != X86::MOVSX64rr8) return false; SDValue N0 = N->getOperand(0); // We need to be extracting the lower bit of an extend. if (!N0.isMachineOpcode() || N0.getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG || N0.getConstantOperandVal(1) != X86::sub_8bit) return false; // We're looking for either a movsx or movzx to match the original opcode. unsigned ExpectedOpc = Opc == X86::MOVZX32rr8 ? X86::MOVZX32rr8_NOREX : X86::MOVSX32rr8_NOREX; SDValue N00 = N0.getOperand(0); if (!N00.isMachineOpcode() || N00.getMachineOpcode() != ExpectedOpc) return false; if (Opc == X86::MOVSX64rr8) { // If we had a sign extend from 8 to 64 bits. We still need to go from 32 // to 64. MachineSDNode *Extend = CurDAG->getMachineNode(X86::MOVSX64rr32, SDLoc(N), MVT::i64, N00); ReplaceUses(N, Extend); } else { // Ok we can drop this extend and just use the original extend. ReplaceUses(N, N00.getNode()); } return true; } void X86DAGToDAGISel::PostprocessISelDAG() { // Skip peepholes at -O0. if (TM.getOptLevel() == CodeGenOpt::None) return; SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end(); bool MadeChange = false; while (Position != CurDAG->allnodes_begin()) { SDNode *N = &*--Position; // Skip dead nodes and any non-machine opcodes. 
if (N->use_empty() || !N->isMachineOpcode()) continue; if (tryOptimizeRem8Extend(N)) { MadeChange = true; continue; } // Look for a TESTrr+ANDrr pattern where both operands of the test are // the same. Rewrite to remove the AND. unsigned Opc = N->getMachineOpcode(); if ((Opc == X86::TEST8rr || Opc == X86::TEST16rr || Opc == X86::TEST32rr || Opc == X86::TEST64rr) && N->getOperand(0) == N->getOperand(1) && N->isOnlyUserOf(N->getOperand(0).getNode()) && N->getOperand(0).isMachineOpcode()) { SDValue And = N->getOperand(0); unsigned N0Opc = And.getMachineOpcode(); if (N0Opc == X86::AND8rr || N0Opc == X86::AND16rr || N0Opc == X86::AND32rr || N0Opc == X86::AND64rr) { MachineSDNode *Test = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i32, And.getOperand(0), And.getOperand(1)); ReplaceUses(N, Test); MadeChange = true; continue; } if (N0Opc == X86::AND8rm || N0Opc == X86::AND16rm || N0Opc == X86::AND32rm || N0Opc == X86::AND64rm) { unsigned NewOpc; switch (N0Opc) { case X86::AND8rm: NewOpc = X86::TEST8mr; break; case X86::AND16rm: NewOpc = X86::TEST16mr; break; case X86::AND32rm: NewOpc = X86::TEST32mr; break; case X86::AND64rm: NewOpc = X86::TEST64mr; break; } // Need to swap the memory and register operand. SDValue Ops[] = { And.getOperand(1), And.getOperand(2), And.getOperand(3), And.getOperand(4), And.getOperand(5), And.getOperand(0), And.getOperand(6) /* Chain */ }; MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N), MVT::i32, MVT::Other, Ops); CurDAG->setNodeMemRefs( Test, cast(And.getNode())->memoperands()); ReplaceUses(N, Test); MadeChange = true; continue; } } // Look for a KAND+KORTEST and turn it into KTEST if only the zero flag is // used. We're doing this late so we can prefer to fold the AND into masked // comparisons. Doing that can be better for the live range of the mask // register. if ((Opc == X86::KORTESTBrr || Opc == X86::KORTESTWrr || Opc == X86::KORTESTDrr || Opc == X86::KORTESTQrr) && N->getOperand(0) == N->getOperand(1) && N->isOnlyUserOf(N->getOperand(0).getNode()) && N->getOperand(0).isMachineOpcode() && onlyUsesZeroFlag(SDValue(N, 0))) { SDValue And = N->getOperand(0); unsigned N0Opc = And.getMachineOpcode(); // KANDW is legal with AVX512F, but KTESTW requires AVX512DQ. The other // KAND instructions and KTEST use the same ISA feature. if (N0Opc == X86::KANDBrr || (N0Opc == X86::KANDWrr && Subtarget->hasDQI()) || N0Opc == X86::KANDDrr || N0Opc == X86::KANDQrr) { unsigned NewOpc; switch (Opc) { default: llvm_unreachable("Unexpected opcode!"); case X86::KORTESTBrr: NewOpc = X86::KTESTBrr; break; case X86::KORTESTWrr: NewOpc = X86::KTESTWrr; break; case X86::KORTESTDrr: NewOpc = X86::KTESTDrr; break; case X86::KORTESTQrr: NewOpc = X86::KTESTQrr; break; } MachineSDNode *KTest = CurDAG->getMachineNode(NewOpc, SDLoc(N), MVT::i32, And.getOperand(0), And.getOperand(1)); ReplaceUses(N, KTest); MadeChange = true; continue; } } // Attempt to remove vectors moves that were inserted to zero upper bits. if (Opc != TargetOpcode::SUBREG_TO_REG) continue; unsigned SubRegIdx = N->getConstantOperandVal(2); if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm) continue; SDValue Move = N->getOperand(1); if (!Move.isMachineOpcode()) continue; // Make sure its one of the move opcodes we recognize. 
switch (Move.getMachineOpcode()) { default: continue; case X86::VMOVAPDrr: case X86::VMOVUPDrr: case X86::VMOVAPSrr: case X86::VMOVUPSrr: case X86::VMOVDQArr: case X86::VMOVDQUrr: case X86::VMOVAPDYrr: case X86::VMOVUPDYrr: case X86::VMOVAPSYrr: case X86::VMOVUPSYrr: case X86::VMOVDQAYrr: case X86::VMOVDQUYrr: case X86::VMOVAPDZ128rr: case X86::VMOVUPDZ128rr: case X86::VMOVAPSZ128rr: case X86::VMOVUPSZ128rr: case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr: case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr: case X86::VMOVAPDZ256rr: case X86::VMOVUPDZ256rr: case X86::VMOVAPSZ256rr: case X86::VMOVUPSZ256rr: case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr: case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr: break; } SDValue In = Move.getOperand(0); if (!In.isMachineOpcode() || In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END) continue; // Make sure the instruction has a VEX, XOP, or EVEX prefix. This covers // the SHA instructions which use a legacy encoding. uint64_t TSFlags = getInstrInfo()->get(In.getMachineOpcode()).TSFlags; if ((TSFlags & X86II::EncodingMask) != X86II::VEX && (TSFlags & X86II::EncodingMask) != X86II::EVEX && (TSFlags & X86II::EncodingMask) != X86II::XOP) continue; // Producing instruction is another vector instruction. We can drop the // move. CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2)); MadeChange = true; } if (MadeChange) CurDAG->RemoveDeadNodes(); } /// Emit any code that needs to be executed only in the main function. void X86DAGToDAGISel::emitSpecialCodeForMain() { if (Subtarget->isTargetCygMing()) { TargetLowering::ArgListTy Args; auto &DL = CurDAG->getDataLayout(); TargetLowering::CallLoweringInfo CLI(*CurDAG); CLI.setChain(CurDAG->getRoot()) .setCallee(CallingConv::C, Type::getVoidTy(*CurDAG->getContext()), CurDAG->getExternalSymbol("__main", TLI->getPointerTy(DL)), std::move(Args)); const TargetLowering &TLI = CurDAG->getTargetLoweringInfo(); std::pair Result = TLI.LowerCallTo(CLI); CurDAG->setRoot(Result.second); } } void X86DAGToDAGISel::emitFunctionEntryCode() { // If this is main, emit special code for main. const Function &F = MF->getFunction(); if (F.hasExternalLinkage() && F.getName() == "main") emitSpecialCodeForMain(); } static bool isDispSafeForFrameIndex(int64_t Val) { // On 64-bit platforms, we can run into an issue where a frame index // includes a displacement that, when added to the explicit displacement, // will overflow the displacement field. Assuming that the frame index // displacement fits into a 31-bit integer (which is only slightly more // aggressive than the current fundamental assumption that it fits into // a 32-bit integer), a 31-bit disp should always be safe. return isInt<31>(Val); } bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM) { // We may have already matched a displacement and the caller just added the // symbolic displacement. So we still need to do the checks even if Offset // is zero. int64_t Val = AM.Disp + Offset; // Cannot combine ExternalSymbol displacements with integer offsets. if (Val != 0 && (AM.ES || AM.MCSym)) return true; CodeModel::Model M = TM.getCodeModel(); if (Subtarget->is64Bit()) { if (Val != 0 && !X86::isOffsetSuitableForCodeModel(Val, M, AM.hasSymbolicDisplacement())) return true; // In addition to the checks required for a register base, check that // we do not try to use an unsafe Disp with a frame index. 
if (AM.BaseType == X86ISelAddressMode::FrameIndexBase && !isDispSafeForFrameIndex(Val)) return true; } AM.Disp = Val; return false; } bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM, bool AllowSegmentRegForX32) { SDValue Address = N->getOperand(1); // load gs:0 -> GS segment register. // load fs:0 -> FS segment register. // // This optimization is generally valid because the GNU TLS model defines that // gs:0 (or fs:0 on X86-64) contains its own address. However, for X86-64 mode // with 32-bit registers, as we get in ILP32 mode, those registers are first // zero-extended to 64 bits and then added it to the base address, which gives // unwanted results when the register holds a negative value. // For more information see http://people.redhat.com/drepper/tls.pdf if (ConstantSDNode *C = dyn_cast(Address)) { if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr && !IndirectTlsSegRefs && (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())) { if (Subtarget->isTarget64BitILP32() && !AllowSegmentRegForX32) return true; switch (N->getPointerInfo().getAddrSpace()) { case X86AS::GS: AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); return false; case X86AS::FS: AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); return false; // Address space X86AS::SS is not handled here, because it is not used to // address TLS areas. } } } return true; } /// Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes into an addressing /// mode. These wrap things that will resolve down into a symbol reference. /// If no match is possible, this returns true, otherwise it returns false. bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) { // If the addressing mode already has a symbol as the displacement, we can // never match another symbol. if (AM.hasSymbolicDisplacement()) return true; bool IsRIPRelTLS = false; bool IsRIPRel = N.getOpcode() == X86ISD::WrapperRIP; if (IsRIPRel) { SDValue Val = N.getOperand(0); if (Val.getOpcode() == ISD::TargetGlobalTLSAddress) IsRIPRelTLS = true; } // We can't use an addressing mode in the 64-bit large code model. // Global TLS addressing is an exception. In the medium code model, // we use can use a mode when RIP wrappers are present. // That signifies access to globals that are known to be "near", // such as the GOT itself. CodeModel::Model M = TM.getCodeModel(); if (Subtarget->is64Bit() && ((M == CodeModel::Large && !IsRIPRelTLS) || (M == CodeModel::Medium && !IsRIPRel))) return true; // Base and index reg must be 0 in order to use %rip as base. if (IsRIPRel && AM.hasBaseOrIndexReg()) return true; // Make a local copy in case we can't do this fold. 
X86ISelAddressMode Backup = AM; int64_t Offset = 0; SDValue N0 = N.getOperand(0); if (GlobalAddressSDNode *G = dyn_cast(N0)) { AM.GV = G->getGlobal(); AM.SymbolFlags = G->getTargetFlags(); Offset = G->getOffset(); } else if (ConstantPoolSDNode *CP = dyn_cast(N0)) { AM.CP = CP->getConstVal(); AM.Alignment = CP->getAlign(); AM.SymbolFlags = CP->getTargetFlags(); Offset = CP->getOffset(); } else if (ExternalSymbolSDNode *S = dyn_cast(N0)) { AM.ES = S->getSymbol(); AM.SymbolFlags = S->getTargetFlags(); } else if (auto *S = dyn_cast(N0)) { AM.MCSym = S->getMCSymbol(); } else if (JumpTableSDNode *J = dyn_cast(N0)) { AM.JT = J->getIndex(); AM.SymbolFlags = J->getTargetFlags(); } else if (BlockAddressSDNode *BA = dyn_cast(N0)) { AM.BlockAddr = BA->getBlockAddress(); AM.SymbolFlags = BA->getTargetFlags(); Offset = BA->getOffset(); } else llvm_unreachable("Unhandled symbol reference node."); if (foldOffsetIntoAddress(Offset, AM)) { AM = Backup; return true; } if (IsRIPRel) AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64)); // Commit the changes now that we know this fold is safe. return false; } /// Add the specified node to the specified addressing mode, returning true if /// it cannot be done. This just pattern matches for the addressing mode. bool X86DAGToDAGISel::matchAddress(SDValue N, X86ISelAddressMode &AM) { if (matchAddressRecursively(N, AM, 0)) return true; // Post-processing: Make a second attempt to fold a load, if we now know // that there will not be any other register. This is only performed for // 64-bit ILP32 mode since 32-bit mode and 64-bit LP64 mode will have folded // any foldable load the first time. if (Subtarget->isTarget64BitILP32() && AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() != nullptr && AM.IndexReg.getNode() == nullptr) { SDValue Save_Base_Reg = AM.Base_Reg; if (auto *LoadN = dyn_cast(Save_Base_Reg)) { AM.Base_Reg = SDValue(); if (matchLoadInAddress(LoadN, AM, /*AllowSegmentRegForX32=*/true)) AM.Base_Reg = Save_Base_Reg; } } // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has // a smaller encoding and avoids a scaled-index. if (AM.Scale == 2 && AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr) { AM.Base_Reg = AM.IndexReg; AM.Scale = 1; } // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, // because it has a smaller encoding. // TODO: Which other code models can use this? switch (TM.getCodeModel()) { default: break; case CodeModel::Small: case CodeModel::Kernel: if (Subtarget->is64Bit() && AM.Scale == 1 && AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr && AM.SymbolFlags == X86II::MO_NO_FLAG && AM.hasSymbolicDisplacement()) AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64); break; } return false; } bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM, unsigned Depth) { // Add an artificial use to this node so that we can keep track of // it if it gets CSE'd with a different node. HandleSDNode Handle(N); X86ISelAddressMode Backup = AM; if (!matchAddressRecursively(N.getOperand(0), AM, Depth+1) && !matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1)) return false; AM = Backup; // Try again after commutating the operands. 
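  // Matching is greedy, so folding operand 0 first may claim the base/index
  // fields in a way that leaves no room for operand 1 even though the
  // opposite order would have succeeded; hence both orders are tried.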
if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth + 1) && !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1)) return false; AM = Backup; // If we couldn't fold both operands into the address at the same time, // see if we can just put each operand into a register and fold at least // the add. if (AM.BaseType == X86ISelAddressMode::RegBase && !AM.Base_Reg.getNode() && !AM.IndexReg.getNode()) { N = Handle.getValue(); AM.Base_Reg = N.getOperand(0); AM.IndexReg = N.getOperand(1); AM.Scale = 1; return false; } N = Handle.getValue(); return true; } // Insert a node into the DAG at least before the Pos node's position. This // will reposition the node as needed, and will assign it a node ID that is <= // the Pos node's ID. Note that this does *not* preserve the uniqueness of node // IDs! The selection DAG must no longer depend on their uniqueness when this // is used. static void insertDAGNode(SelectionDAG &DAG, SDValue Pos, SDValue N) { if (N->getNodeId() == -1 || (SelectionDAGISel::getUninvalidatedNodeId(N.getNode()) > SelectionDAGISel::getUninvalidatedNodeId(Pos.getNode()))) { DAG.RepositionNode(Pos->getIterator(), N.getNode()); // Mark Node as invalid for pruning as after this it may be a successor to a // selected node but otherwise be in the same position of Pos. // Conservatively mark it with the same -abs(Id) to assure node id // invariant is preserved. N->setNodeId(Pos->getNodeId()); SelectionDAGISel::InvalidateNodeId(N.getNode()); } } // Transform "(X >> (8-C1)) & (0xff << C1)" to "((X >> 8) & 0xff) << C1" if // safe. This allows us to convert the shift and and into an h-register // extract and a scaled index. Returns false if the simplification is // performed. static bool foldMaskAndShiftToExtract(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { if (Shift.getOpcode() != ISD::SRL || !isa(Shift.getOperand(1)) || !Shift.hasOneUse()) return true; int ScaleLog = 8 - Shift.getConstantOperandVal(1); if (ScaleLog <= 0 || ScaleLog >= 4 || Mask != (0xffu << ScaleLog)) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); SDValue Eight = DAG.getConstant(8, DL, MVT::i8); SDValue NewMask = DAG.getConstant(0xff, DL, VT); SDValue Srl = DAG.getNode(ISD::SRL, DL, VT, X, Eight); SDValue And = DAG.getNode(ISD::AND, DL, VT, Srl, NewMask); SDValue ShlCount = DAG.getConstant(ScaleLog, DL, MVT::i8); SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And, ShlCount); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, Eight); insertDAGNode(DAG, N, Srl); insertDAGNode(DAG, N, NewMask); insertDAGNode(DAG, N, And); insertDAGNode(DAG, N, ShlCount); insertDAGNode(DAG, N, Shl); DAG.ReplaceAllUsesWith(N, Shl); DAG.RemoveDeadNode(N.getNode()); AM.IndexReg = And; AM.Scale = (1 << ScaleLog); return false; } // Transforms "(X << C1) & C2" to "(X & (C2>>C1)) << C1" if safe and if this // allows us to fold the shift into this addressing mode. Returns false if the // transform succeeded. static bool foldMaskedShiftToScaledMask(SelectionDAG &DAG, SDValue N, X86ISelAddressMode &AM) { SDValue Shift = N.getOperand(0); // Use a signed mask so that shifting right will insert sign bits. 
These // bits will be removed when we shift the result left so it doesn't matter // what we use. This might allow a smaller immediate encoding. int64_t Mask = cast(N->getOperand(1))->getSExtValue(); // If we have an any_extend feeding the AND, look through it to see if there // is a shift behind it. But only if the AND doesn't use the extended bits. // FIXME: Generalize this to other ANY_EXTEND than i32 to i64? bool FoundAnyExtend = false; if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() && Shift.getOperand(0).getSimpleValueType() == MVT::i32 && isUInt<32>(Mask)) { FoundAnyExtend = true; Shift = Shift.getOperand(0); } if (Shift.getOpcode() != ISD::SHL || !isa(Shift.getOperand(1))) return true; SDValue X = Shift.getOperand(0); // Not likely to be profitable if either the AND or SHIFT node has more // than one use (unless all uses are for address computation). Besides, // isel mechanism requires their node ids to be reused. if (!N.hasOneUse() || !Shift.hasOneUse()) return true; // Verify that the shift amount is something we can fold. unsigned ShiftAmt = Shift.getConstantOperandVal(1); if (ShiftAmt != 1 && ShiftAmt != 2 && ShiftAmt != 3) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); if (FoundAnyExtend) { SDValue NewX = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); insertDAGNode(DAG, N, NewX); X = NewX; } SDValue NewMask = DAG.getConstant(Mask >> ShiftAmt, DL, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, NewMask); SDValue NewShift = DAG.getNode(ISD::SHL, DL, VT, NewAnd, Shift.getOperand(1)); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, NewMask); insertDAGNode(DAG, N, NewAnd); insertDAGNode(DAG, N, NewShift); DAG.ReplaceAllUsesWith(N, NewShift); DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << ShiftAmt; AM.IndexReg = NewAnd; return false; } // Implement some heroics to detect shifts of masked values where the mask can // be replaced by extending the shift and undoing that in the addressing mode // scale. Patterns such as (shl (srl x, c1), c2) are canonicalized into (and // (srl x, SHIFT), MASK) by DAGCombines that don't know the shl can be done in // the addressing mode. This results in code such as: // // int f(short *y, int *lookup_table) { // ... // return *y + lookup_table[*y >> 11]; // } // // Turning into: // movzwl (%rdi), %eax // movl %eax, %ecx // shrl $11, %ecx // addl (%rsi,%rcx,4), %eax // // Instead of: // movzwl (%rdi), %eax // movl %eax, %ecx // shrl $9, %ecx // andl $124, %rcx // addl (%rsi,%rcx), %eax // // Note that this function assumes the mask is provided as a mask *after* the // value is shifted. The input chain may or may not match that, but computing // such a mask is trivial. static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM) { if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse() || !isa(Shift.getOperand(1))) return true; unsigned ShiftAmt = Shift.getConstantOperandVal(1); unsigned MaskLZ = countLeadingZeros(Mask); unsigned MaskTZ = countTrailingZeros(Mask); // The amount of shift we're trying to fit into the addressing mode is taken // from the trailing zeros of the mask. 
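  // Illustrative example (constants chosen for exposition): for
  // (and (srl x, 5), 0x3fc) the mask has two trailing zeros, so the shift is
  // widened to (srl x, 7), the addressing mode absorbs the remaining shl as
  // Scale = 4, and the AND disappears entirely, provided the bits the mask
  // would have cleared are already known to be zero in x.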
unsigned AMShiftAmt = MaskTZ; // There is nothing we can do here unless the mask is removing some bits. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; // We also need to ensure that mask is a continuous run of bits. if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true; // Scale the leading zero count down based on the actual size of the value. // Also scale it down based on the size of the shift. unsigned ScaleDown = (64 - X.getSimpleValueType().getSizeInBits()) + ShiftAmt; if (MaskLZ < ScaleDown) return true; MaskLZ -= ScaleDown; // The final check is to ensure that any masked out high bits of X are // already known to be zero. Otherwise, the mask has a semantic impact // other than masking out a couple of low bits. Unfortunately, because of // the mask, zero extensions will be removed from operands in some cases. // This code works extra hard to look through extensions because we can // replace them with zero extensions cheaply if necessary. bool ReplacingAnyExtend = false; if (X.getOpcode() == ISD::ANY_EXTEND) { unsigned ExtendBits = X.getSimpleValueType().getSizeInBits() - X.getOperand(0).getSimpleValueType().getSizeInBits(); // Assume that we'll replace the any-extend with a zero-extend, and // narrow the search to the extended value. X = X.getOperand(0); MaskLZ = ExtendBits > MaskLZ ? 0 : MaskLZ - ExtendBits; ReplacingAnyExtend = true; } APInt MaskedHighBits = APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ); KnownBits Known = DAG.computeKnownBits(X); if (MaskedHighBits != Known.Zero) return true; // We've identified a pattern that can be transformed into a single shift // and an addressing mode. Make it so. MVT VT = N.getSimpleValueType(); if (ReplacingAnyExtend) { assert(X.getValueType() != VT); // We looked through an ANY_EXTEND node, insert a ZERO_EXTEND. SDValue NewX = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(X), VT, X); insertDAGNode(DAG, N, NewX); X = NewX; } SDLoc DL(N); SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewSRL, NewSHLAmt); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, NewSRLAmt); insertDAGNode(DAG, N, NewSRL); insertDAGNode(DAG, N, NewSHLAmt); insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << AMShiftAmt; AM.IndexReg = NewSRL; return false; } // Transform "(X >> SHIFT) & (MASK << C1)" to // "((X >> (SHIFT + C1)) & (MASK)) << C1". Everything before the SHL will be // matched to a BEXTR later. Returns false if the simplification is performed. static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N, uint64_t Mask, SDValue Shift, SDValue X, X86ISelAddressMode &AM, const X86Subtarget &Subtarget) { if (Shift.getOpcode() != ISD::SRL || !isa(Shift.getOperand(1)) || !Shift.hasOneUse() || !N.hasOneUse()) return true; // Only do this if BEXTR will be matched by matchBEXTRFromAndImm. 
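  // This mirrors the PreferBEXTR condition in matchBEXTRFromAndImm below:
  // TBM, or BMI together with a fast BEXTR implementation. As a rough
  // example, (and (srl x, 5), (0xff << 2)) becomes
  // (shl (and (srl x, 7), 0xff), 2); the shl is absorbed as Scale = 4 and
  // the srl+and pair is what is expected to be selected as BEXTR later.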
if (!Subtarget.hasTBM() && !(Subtarget.hasBMI() && Subtarget.hasFastBEXTR())) return true; // We need to ensure that mask is a continuous run of bits. if (!isShiftedMask_64(Mask)) return true; unsigned ShiftAmt = Shift.getConstantOperandVal(1); // The amount of shift we're trying to fit into the addressing mode is taken // from the trailing zeros of the mask. unsigned AMShiftAmt = countTrailingZeros(Mask); // There is nothing we can do here unless the mask is removing some bits. // Also, the addressing mode can only represent shifts of 1, 2, or 3 bits. if (AMShiftAmt == 0 || AMShiftAmt > 3) return true; MVT VT = N.getSimpleValueType(); SDLoc DL(N); SDValue NewSRLAmt = DAG.getConstant(ShiftAmt + AMShiftAmt, DL, MVT::i8); SDValue NewSRL = DAG.getNode(ISD::SRL, DL, VT, X, NewSRLAmt); SDValue NewMask = DAG.getConstant(Mask >> AMShiftAmt, DL, VT); SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, NewSRL, NewMask); SDValue NewSHLAmt = DAG.getConstant(AMShiftAmt, DL, MVT::i8); SDValue NewSHL = DAG.getNode(ISD::SHL, DL, VT, NewAnd, NewSHLAmt); // Insert the new nodes into the topological ordering. We must do this in // a valid topological ordering as nothing is going to go back and re-sort // these nodes. We continually insert before 'N' in sequence as this is // essentially a pre-flattened and pre-sorted sequence of nodes. There is no // hierarchy left to express. insertDAGNode(DAG, N, NewSRLAmt); insertDAGNode(DAG, N, NewSRL); insertDAGNode(DAG, N, NewMask); insertDAGNode(DAG, N, NewAnd); insertDAGNode(DAG, N, NewSHLAmt); insertDAGNode(DAG, N, NewSHL); DAG.ReplaceAllUsesWith(N, NewSHL); DAG.RemoveDeadNode(N.getNode()); AM.Scale = 1 << AMShiftAmt; AM.IndexReg = NewAnd; return false; } bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth) { SDLoc dl(N); LLVM_DEBUG({ dbgs() << "MatchAddress: "; AM.dump(CurDAG); }); // Limit recursion. if (Depth > 5) return matchAddressBase(N, AM); // If this is already a %rip relative address, we can only merge immediates // into it. Instead of handling this in every case, we handle it here. // RIP relative addressing: %rip + 32-bit displacement! if (AM.isRIPRelative()) { // FIXME: JumpTable and ExternalSymbol address currently don't like // displacements. It isn't very important, but this should be fixed for // consistency. if (!(AM.ES || AM.MCSym) && AM.JT != -1) return true; if (ConstantSDNode *Cst = dyn_cast(N)) if (!foldOffsetIntoAddress(Cst->getSExtValue(), AM)) return false; return true; } switch (N.getOpcode()) { default: break; case ISD::LOCAL_RECOVER: { if (!AM.hasSymbolicDisplacement() && AM.Disp == 0) if (const auto *ESNode = dyn_cast(N.getOperand(0))) { // Use the symbol and don't prefix it. 
AM.MCSym = ESNode->getMCSymbol(); return false; } break; } case ISD::Constant: { uint64_t Val = cast(N)->getSExtValue(); if (!foldOffsetIntoAddress(Val, AM)) return false; break; } case X86ISD::Wrapper: case X86ISD::WrapperRIP: if (!matchWrapper(N, AM)) return false; break; case ISD::LOAD: if (!matchLoadInAddress(cast(N), AM)) return false; break; case ISD::FrameIndex: if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) { AM.BaseType = X86ISelAddressMode::FrameIndexBase; AM.Base_FrameIndex = cast(N)->getIndex(); return false; } break; case ISD::SHL: if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; if (ConstantSDNode *CN = dyn_cast(N.getOperand(1))) { unsigned Val = CN->getZExtValue(); // Note that we handle x<<1 as (,x,2) rather than (x,x) here so // that the base operand remains free for further matching. If // the base doesn't end up getting used, a post-processing step // in MatchAddress turns (,x,2) into (x,x), which is cheaper. if (Val == 1 || Val == 2 || Val == 3) { AM.Scale = 1 << Val; SDValue ShVal = N.getOperand(0); // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. if (CurDAG->isBaseWithConstantOffset(ShVal)) { AM.IndexReg = ShVal.getOperand(0); ConstantSDNode *AddVal = cast(ShVal.getOperand(1)); uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val; if (!foldOffsetIntoAddress(Disp, AM)) return false; } AM.IndexReg = ShVal; return false; } } break; case ISD::SRL: { // Scale must not be used already. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; // We only handle up to 64-bit values here as those are what matter for // addressing mode optimizations. assert(N.getSimpleValueType().getSizeInBits() <= 64 && "Unexpected value size!"); SDValue And = N.getOperand(0); if (And.getOpcode() != ISD::AND) break; SDValue X = And.getOperand(0); // The mask used for the transform is expected to be post-shift, but we // found the shift first so just apply the shift to the mask before passing // it down. if (!isa(N.getOperand(1)) || !isa(And.getOperand(1))) break; uint64_t Mask = And.getConstantOperandVal(1) >> N.getConstantOperandVal(1); // Try to fold the mask and shift into the scale, and return false if we // succeed. if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, N, X, AM)) return false; break; } case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: // A mul_lohi where we need the low part can be folded as a plain multiply. if (N.getResNo() != 0) break; LLVM_FALLTHROUGH; case ISD::MUL: case X86ISD::MUL_IMM: // X*[3,5,9] -> X+X*[2,4,8] if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == nullptr && AM.IndexReg.getNode() == nullptr) { if (ConstantSDNode *CN = dyn_cast(N.getOperand(1))) if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 || CN->getZExtValue() == 9) { AM.Scale = unsigned(CN->getZExtValue())-1; SDValue MulVal = N.getOperand(0); SDValue Reg; // Okay, we know that we have a scale by now. However, if the scaled // value is an add of something and a constant, we can fold the // constant into the disp field here. 
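  // For example (illustrative values): (x + 4) * 5 can be matched as
  // Base = x, Index = x, Scale = 4, Disp = 20, i.e. roughly
  // lea 20(%rdi,%rdi,4), %rax after register allocation.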
if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() && isa(MulVal.getOperand(1))) { Reg = MulVal.getOperand(0); ConstantSDNode *AddVal = cast(MulVal.getOperand(1)); uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue(); if (foldOffsetIntoAddress(Disp, AM)) Reg = N.getOperand(0); } else { Reg = N.getOperand(0); } AM.IndexReg = AM.Base_Reg = Reg; return false; } } break; case ISD::SUB: { // Given A-B, if A can be completely folded into the address and // the index field with the index field unused, use -B as the index. // This is a win if a has multiple parts that can be folded into // the address. Also, this saves a mov if the base register has // other uses, since it avoids a two-address sub instruction, however // it costs an additional mov if the index register has other uses. // Add an artificial use to this node so that we can keep track of // it if it gets CSE'd with a different node. HandleSDNode Handle(N); // Test if the LHS of the sub can be folded. X86ISelAddressMode Backup = AM; if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) { N = Handle.getValue(); AM = Backup; break; } N = Handle.getValue(); // Test if the index field is free for use. if (AM.IndexReg.getNode() || AM.isRIPRelative()) { AM = Backup; break; } int Cost = 0; SDValue RHS = N.getOperand(1); // If the RHS involves a register with multiple uses, this // transformation incurs an extra mov, due to the neg instruction // clobbering its operand. if (!RHS.getNode()->hasOneUse() || RHS.getNode()->getOpcode() == ISD::CopyFromReg || RHS.getNode()->getOpcode() == ISD::TRUNCATE || RHS.getNode()->getOpcode() == ISD::ANY_EXTEND || (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND && RHS.getOperand(0).getValueType() == MVT::i32)) ++Cost; // If the base is a register with multiple uses, this // transformation may save a mov. if ((AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() && !AM.Base_Reg.getNode()->hasOneUse()) || AM.BaseType == X86ISelAddressMode::FrameIndexBase) --Cost; // If the folded LHS was interesting, this transformation saves // address arithmetic. if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) + ((AM.Disp != 0) && (Backup.Disp == 0)) + (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2) --Cost; // If it doesn't look like it may be an overall win, don't do it. if (Cost >= 0) { AM = Backup; break; } // Ok, the transformation is legal and appears profitable. Go for it. // Negation will be emitted later to avoid creating dangling nodes if this // was an unprofitable LEA. AM.IndexReg = RHS; AM.NegateIndex = true; AM.Scale = 1; return false; } case ISD::ADD: if (!matchAdd(N, AM, Depth)) return false; break; case ISD::OR: // We want to look through a transform in InstCombine and DAGCombiner that // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'. // Example: (or (and x, 1), (shl y, 3)) --> (add (and x, 1), (shl y, 3)) // An 'lea' can then be used to match the shift (multiply) and add: // and $1, %esi // lea (%rsi, %rdi, 8), %rax if (CurDAG->haveNoCommonBitsSet(N.getOperand(0), N.getOperand(1)) && !matchAdd(N, AM, Depth)) return false; break; case ISD::AND: { // Perform some heroic transforms on an and of a constant-count shift // with a constant to enable use of the scaled offset field. // Scale must not be used already. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; // We only handle up to 64-bit values here as those are what matter for // addressing mode optimizations. 
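  // When the AND's operand is a constant right shift, three rewrites are
  // attempted below in turn: an h-register style extract, folding the shift
  // into the scale, and reshaping the pattern for a later BEXTR match;
  // independently, swapping the mask and shift is tried as well.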
assert(N.getSimpleValueType().getSizeInBits() <= 64 && "Unexpected value size!"); if (!isa(N.getOperand(1))) break; if (N.getOperand(0).getOpcode() == ISD::SRL) { SDValue Shift = N.getOperand(0); SDValue X = Shift.getOperand(0); uint64_t Mask = N.getConstantOperandVal(1); // Try to fold the mask and shift into an extract and scale. if (!foldMaskAndShiftToExtract(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to fold the mask and shift directly into the scale. if (!foldMaskAndShiftToScale(*CurDAG, N, Mask, Shift, X, AM)) return false; // Try to fold the mask and shift into BEXTR and scale. if (!foldMaskedShiftToBEXTR(*CurDAG, N, Mask, Shift, X, AM, *Subtarget)) return false; } // Try to swap the mask and shift to place shifts which can be done as // a scale on the outside of the mask. if (!foldMaskedShiftToScaledMask(*CurDAG, N, AM)) return false; break; } case ISD::ZERO_EXTEND: { // Try to widen a zexted shift left to the same size as its use, so we can // match the shift as a scale factor. if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break; if (N.getOperand(0).getOpcode() != ISD::SHL || !N.getOperand(0).hasOneUse()) break; // Give up if the shift is not a valid scale factor [1,2,3]. SDValue Shl = N.getOperand(0); auto *ShAmtC = dyn_cast(Shl.getOperand(1)); if (!ShAmtC || ShAmtC->getZExtValue() > 3) break; // The narrow shift must only shift out zero bits (it must be 'nuw'). // That makes it safe to widen to the destination type. APInt HighZeros = APInt::getHighBitsSet(Shl.getValueSizeInBits(), ShAmtC->getZExtValue()); if (!CurDAG->MaskedValueIsZero(Shl.getOperand(0), HighZeros)) break; // zext (shl nuw i8 %x, C) to i32 --> shl (zext i8 %x to i32), (zext C) MVT VT = N.getSimpleValueType(); SDLoc DL(N); SDValue Zext = CurDAG->getNode(ISD::ZERO_EXTEND, DL, VT, Shl.getOperand(0)); SDValue NewShl = CurDAG->getNode(ISD::SHL, DL, VT, Zext, Shl.getOperand(1)); // Convert the shift to scale factor. AM.Scale = 1 << ShAmtC->getZExtValue(); AM.IndexReg = Zext; insertDAGNode(*CurDAG, N, Zext); insertDAGNode(*CurDAG, N, NewShl); CurDAG->ReplaceAllUsesWith(N, NewShl); CurDAG->RemoveDeadNode(N.getNode()); return false; } } return matchAddressBase(N, AM); } /// Helper for MatchAddress. Add the specified node to the /// specified addressing mode without any further recursion. bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { // Is the base register already occupied? if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { // If so, check to see if the scale index register is set. if (!AM.IndexReg.getNode()) { AM.IndexReg = N; AM.Scale = 1; return false; } // Otherwise, we cannot select it. return true; } // Default, generate it as a register. AM.BaseType = X86ISelAddressMode::RegBase; AM.Base_Reg = N; return false; } /// Helper for selectVectorAddr. Handles things that can be folded into a /// gather scatter address. The index register and scale should have already /// been handled. bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { // TODO: Support other operations. 
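  // As of now only two shapes are folded below: plain constant offsets and
  // X86ISD::Wrapper-wrapped symbols; everything else falls through to
  // matchAddressBase and simply ends up in the base register.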
switch (N.getOpcode()) { case ISD::Constant: { uint64_t Val = cast(N)->getSExtValue(); if (!foldOffsetIntoAddress(Val, AM)) return false; break; } case X86ISD::Wrapper: if (!matchWrapper(N, AM)) return false; break; } return matchAddressBase(N, AM); } bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp, SDValue ScaleOp, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; AM.IndexReg = IndexOp; AM.Scale = cast(ScaleOp)->getZExtValue(); unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace(); if (AddrSpace == X86AS::GS) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); if (AddrSpace == X86AS::FS) AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); if (AddrSpace == X86AS::SS) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); SDLoc DL(BasePtr); MVT VT = BasePtr.getSimpleValueType(); // Try to match into the base and displacement fields. if (matchVectorAddress(BasePtr, AM)) return false; getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } /// Returns true if it is able to pattern match an addressing mode. /// It returns the operands which make up the maximal addressing mode it can /// match by reference. /// /// Parent is the parent node of the addr operand that is being matched. It /// is always a load, store, atomic node, or null. It is only null when /// checking memory operands for inline asm nodes. bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; if (Parent && // This list of opcodes are all the nodes that have an "addr:$ptr" operand // that are not a MemSDNode, and thus don't have proper addrspace info. Parent->getOpcode() != ISD::INTRINSIC_W_CHAIN && // unaligned loads, fixme Parent->getOpcode() != ISD::INTRINSIC_VOID && // nontemporal stores Parent->getOpcode() != X86ISD::TLSCALL && // Fixme Parent->getOpcode() != X86ISD::ENQCMD && // Fixme Parent->getOpcode() != X86ISD::ENQCMDS && // Fixme Parent->getOpcode() != X86ISD::EH_SJLJ_SETJMP && // setjmp Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp unsigned AddrSpace = cast(Parent)->getPointerInfo().getAddrSpace(); if (AddrSpace == X86AS::GS) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); if (AddrSpace == X86AS::FS) AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16); if (AddrSpace == X86AS::SS) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); } // Save the DL and VT before calling matchAddress, it can invalidate N. SDLoc DL(N); MVT VT = N.getSimpleValueType(); if (matchAddress(N, AM)) return false; getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { // In static codegen with small code model, we can get the address of a label // into a register with 'movl' if (N->getOpcode() != X86ISD::Wrapper) return false; N = N.getOperand(0); // At least GNU as does not accept 'movl' for TPOFF relocations. // FIXME: We could use 'movl' when we know we are targeting MC. 
if (N->getOpcode() == ISD::TargetGlobalTLSAddress) return false; Imm = N; if (N->getOpcode() != ISD::TargetGlobalAddress) return TM.getCodeModel() == CodeModel::Small; Optional CR = cast(N)->getGlobal()->getAbsoluteSymbolRange(); if (!CR) return TM.getCodeModel() == CodeModel::Small; return CR->getUnsignedMax().ult(1ull << 32); } bool X86DAGToDAGISel::selectLEA64_32Addr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { // Save the debug loc before calling selectLEAAddr, in case it invalidates N. SDLoc DL(N); if (!selectLEAAddr(N, Base, Scale, Index, Disp, Segment)) return false; RegisterSDNode *RN = dyn_cast(Base); if (RN && RN->getReg() == 0) Base = CurDAG->getRegister(0, MVT::i64); else if (Base.getValueType() == MVT::i32 && !isa(Base)) { // Base could already be %rip, particularly in the x32 ABI. SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, MVT::i64), 0); Base = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, Base); } RN = dyn_cast(Index); if (RN && RN->getReg() == 0) Index = CurDAG->getRegister(0, MVT::i64); else { assert(Index.getValueType() == MVT::i32 && "Expect to be extending 32-bit registers for use in LEA"); SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, DL, MVT::i64), 0); Index = CurDAG->getTargetInsertSubreg(X86::sub_32bit, DL, MVT::i64, ImplDef, Index); } return true; } /// Calls SelectAddr and determines if the maximal addressing /// mode it matches can be cost effectively emitted as an LEA instruction. bool X86DAGToDAGISel::selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; // Save the DL and VT before calling matchAddress, it can invalidate N. SDLoc DL(N); MVT VT = N.getSimpleValueType(); // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support // segments. SDValue Copy = AM.Segment; SDValue T = CurDAG->getRegister(0, MVT::i32); AM.Segment = T; if (matchAddress(N, AM)) return false; assert (T == AM.Segment); AM.Segment = Copy; unsigned Complexity = 0; if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode()) Complexity = 1; else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase) Complexity = 4; if (AM.IndexReg.getNode()) Complexity++; // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with // a simple shift. if (AM.Scale > 1) Complexity++; // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA // to a LEA. This is determined with some experimentation but is by no means // optimal (especially for code size consideration). LEA is nice because of // its three-address nature. Tweak the cost function again when we can run // convertToThreeAddress() at register allocation time. if (AM.hasSymbolicDisplacement()) { // For X86-64, always use LEA to materialize RIP-relative addresses. if (Subtarget->is64Bit()) Complexity = 4; else Complexity += 2; } // Heuristic: try harder to form an LEA from ADD if the operands set flags. // Unlike ADD, LEA does not affect flags, so we will be less likely to require // duplicating flag-producing instructions later in the pipeline. if (N.getOpcode() == ISD::ADD) { auto isMathWithFlags = [](SDValue V) { switch (V.getOpcode()) { case X86ISD::ADD: case X86ISD::SUB: case X86ISD::ADC: case X86ISD::SBB: /* TODO: These opcodes can be added safely, but we may want to justify their inclusion for different reasons (better for reg-alloc). 
case X86ISD::SMUL: case X86ISD::UMUL: case X86ISD::OR: case X86ISD::XOR: case X86ISD::AND: */ // Value 1 is the flag output of the node - verify it's not dead. return !SDValue(V.getNode(), 1).use_empty(); default: return false; } }; // TODO: This could be an 'or' rather than 'and' to make the transform more // likely to happen. We might want to factor in whether there's a // load folding opportunity for the math op that disappears with LEA. if (isMathWithFlags(N.getOperand(0)) && isMathWithFlags(N.getOperand(1))) Complexity++; } if (AM.Disp) Complexity++; // If it isn't worth using an LEA, reject it. if (Complexity <= 2) return false; getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment); return true; } /// This is only run on TargetGlobalTLSAddress nodes. bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(N.getOpcode() == ISD::TargetGlobalTLSAddress); const GlobalAddressSDNode *GA = cast(N); X86ISelAddressMode AM; AM.GV = GA->getGlobal(); AM.Disp += GA->getOffset(); AM.SymbolFlags = GA->getTargetFlags(); if (Subtarget->is32Bit()) { AM.Scale = 1; AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); } MVT VT = N.getSimpleValueType(); getAddressOperands(AM, SDLoc(N), VT, Base, Scale, Index, Disp, Segment); return true; } bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { // Keep track of the original value type and whether this value was // truncated. If we see a truncation from pointer type to VT that truncates // bits that are known to be zero, we can use a narrow reference. EVT VT = N.getValueType(); bool WasTruncated = false; if (N.getOpcode() == ISD::TRUNCATE) { WasTruncated = true; N = N.getOperand(0); } if (N.getOpcode() != X86ISD::Wrapper) return false; // We can only use non-GlobalValues as immediates if they were not truncated, // as we do not have any range information. If we have a GlobalValue and the // address was not truncated, we can select it as an operand directly. unsigned Opc = N.getOperand(0)->getOpcode(); if (Opc != ISD::TargetGlobalAddress || !WasTruncated) { Op = N.getOperand(0); // We can only select the operand directly if we didn't have to look past a // truncate. return !WasTruncated; } // Check that the global's range fits into VT. auto *GA = cast(N.getOperand(0)); Optional CR = GA->getGlobal()->getAbsoluteSymbolRange(); if (!CR || CR->getUnsignedMax().uge(1ull << VT.getSizeInBits())) return false; // Okay, we can use a narrow reference. 
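  // E.g. a global whose absolute symbol range is known to be [0, 0x10000)
  // can still be referenced through an i16 immediate even though the
  // pointer-sized address was truncated (illustrative numbers).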
Op = CurDAG->getTargetGlobalAddress(GA->getGlobal(), SDLoc(N), VT, GA->getOffset(), GA->getTargetFlags()); return true; } bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(Root && P && "Unknown root/parent nodes"); if (!ISD::isNON_EXTLoad(N.getNode()) || !IsProfitableToFold(N, P, Root) || !IsLegalToFold(N, P, Root, OptLevel)) return false; return selectAddr(N.getNode(), N.getOperand(1), Base, Scale, Index, Disp, Segment); } bool X86DAGToDAGISel::tryFoldBroadcast(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { assert(Root && P && "Unknown root/parent nodes"); if (N->getOpcode() != X86ISD::VBROADCAST_LOAD || !IsProfitableToFold(N, P, Root) || !IsLegalToFold(N, P, Root, OptLevel)) return false; return selectAddr(N.getNode(), N.getOperand(1), Base, Scale, Index, Disp, Segment); } /// Return an SDNode that returns the value of the global base register. /// Output instructions required to initialize the global base register, /// if necessary. SDNode *X86DAGToDAGISel::getGlobalBaseReg() { unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF); auto &DL = MF->getDataLayout(); return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode(); } bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const { if (N->getOpcode() == ISD::TRUNCATE) N = N->getOperand(0).getNode(); if (N->getOpcode() != X86ISD::Wrapper) return false; auto *GA = dyn_cast(N->getOperand(0)); if (!GA) return false; Optional CR = GA->getGlobal()->getAbsoluteSymbolRange(); if (!CR) return Width == 32 && TM.getCodeModel() == CodeModel::Small; return CR->getSignedMin().sge(-1ull << Width) && CR->getSignedMax().slt(1ull << Width); } static X86::CondCode getCondFromNode(SDNode *N) { assert(N->isMachineOpcode() && "Unexpected node"); X86::CondCode CC = X86::COND_INVALID; unsigned Opc = N->getMachineOpcode(); if (Opc == X86::JCC_1) CC = static_cast(N->getConstantOperandVal(1)); else if (Opc == X86::SETCCr) CC = static_cast(N->getConstantOperandVal(0)); else if (Opc == X86::SETCCm) CC = static_cast(N->getConstantOperandVal(5)); else if (Opc == X86::CMOV16rr || Opc == X86::CMOV32rr || Opc == X86::CMOV64rr) CC = static_cast(N->getConstantOperandVal(2)); else if (Opc == X86::CMOV16rm || Opc == X86::CMOV32rm || Opc == X86::CMOV64rm) CC = static_cast(N->getConstantOperandVal(6)); return CC; } /// Test whether the given X86ISD::CMP node has any users that use a flag /// other than ZF. bool X86DAGToDAGISel::onlyUsesZeroFlag(SDValue Flags) const { // Examine each user of the node. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { // Only check things that use the flags. if (UI.getUse().getResNo() != Flags.getResNo()) continue; // Only examine CopyToReg uses that copy to EFLAGS. if (UI->getOpcode() != ISD::CopyToReg || cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { // Only examine the Flag result. if (FlagUI.getUse().getResNo() != 1) continue; // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. X86::CondCode CC = getCondFromNode(*FlagUI); switch (CC) { // Comparisons which only use the zero flag. 
case X86::COND_E: case X86::COND_NE: continue; // Anything else: assume conservatively. default: return false; } } } return true; } /// Test whether the given X86ISD::CMP node has any uses which require the SF /// flag to be accurate. bool X86DAGToDAGISel::hasNoSignFlagUses(SDValue Flags) const { // Examine each user of the node. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { // Only check things that use the flags. if (UI.getUse().getResNo() != Flags.getResNo()) continue; // Only examine CopyToReg uses that copy to EFLAGS. if (UI->getOpcode() != ISD::CopyToReg || cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { // Only examine the Flag result. if (FlagUI.getUse().getResNo() != 1) continue; // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. X86::CondCode CC = getCondFromNode(*FlagUI); switch (CC) { // Comparisons which don't examine the SF flag. case X86::COND_A: case X86::COND_AE: case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_NE: case X86::COND_O: case X86::COND_NO: case X86::COND_P: case X86::COND_NP: continue; // Anything else: assume conservatively. default: return false; } } } return true; } static bool mayUseCarryFlag(X86::CondCode CC) { switch (CC) { // Comparisons which don't examine the CF flag. case X86::COND_O: case X86::COND_NO: case X86::COND_E: case X86::COND_NE: case X86::COND_S: case X86::COND_NS: case X86::COND_P: case X86::COND_NP: case X86::COND_L: case X86::COND_GE: case X86::COND_G: case X86::COND_LE: return false; // Anything else: assume conservatively. default: return true; } } /// Test whether the given node which sets flags has any uses which require the /// CF flag to be accurate. bool X86DAGToDAGISel::hasNoCarryFlagUses(SDValue Flags) const { // Examine each user of the node. for (SDNode::use_iterator UI = Flags->use_begin(), UE = Flags->use_end(); UI != UE; ++UI) { // Only check things that use the flags. if (UI.getUse().getResNo() != Flags.getResNo()) continue; unsigned UIOpc = UI->getOpcode(); if (UIOpc == ISD::CopyToReg) { // Only examine CopyToReg uses that copy to EFLAGS. if (cast(UI->getOperand(1))->getReg() != X86::EFLAGS) return false; // Examine each user of the CopyToReg use. for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) { // Only examine the Flag result. if (FlagUI.getUse().getResNo() != 1) continue; // Anything unusual: assume conservatively. if (!FlagUI->isMachineOpcode()) return false; // Examine the condition code of the user. X86::CondCode CC = getCondFromNode(*FlagUI); if (mayUseCarryFlag(CC)) return false; } // This CopyToReg is ok. Move on to the next user. continue; } // This might be an unselected node. So look for the pre-isel opcodes that // use flags. unsigned CCOpNo; switch (UIOpc) { default: // Something unusual. Be conservative. return false; case X86ISD::SETCC: CCOpNo = 0; break; case X86ISD::SETCC_CARRY: CCOpNo = 0; break; case X86ISD::CMOV: CCOpNo = 2; break; case X86ISD::BRCOND: CCOpNo = 2; break; } X86::CondCode CC = (X86::CondCode)UI->getConstantOperandVal(CCOpNo); if (mayUseCarryFlag(CC)) return false; } return true; } /// Check whether or not the chain ending in StoreNode is suitable for doing /// the {load; op; store} to modify transformation. 
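/// For illustration (registers here are arbitrary), this is the analysis that
/// allows a sequence like
///   movl (%rdi), %eax
///   addl %esi, %eax
///   movl %eax, (%rdi)
/// to later be folded into a single addl %esi, (%rdi), provided the loaded
/// value and the arithmetic result have no other users and merging the two
/// chains cannot introduce a cycle.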
static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, SDValue StoredVal, SelectionDAG *CurDAG, unsigned LoadOpNo, LoadSDNode *&LoadNode, SDValue &InputChain) { // Is the stored value result 0 of the operation? if (StoredVal.getResNo() != 0) return false; // Are there other uses of the operation other than the store? if (!StoredVal.getNode()->hasNUsesOfValue(1, 0)) return false; // Is the store non-extending and non-indexed? if (!ISD::isNormalStore(StoreNode) || StoreNode->isNonTemporal()) return false; SDValue Load = StoredVal->getOperand(LoadOpNo); // Is the stored value a non-extending and non-indexed load? if (!ISD::isNormalLoad(Load.getNode())) return false; // Return LoadNode by reference. LoadNode = cast(Load); // Is store the only read of the loaded value? if (!Load.hasOneUse()) return false; // Is the address of the store the same as the load? if (LoadNode->getBasePtr() != StoreNode->getBasePtr() || LoadNode->getOffset() != StoreNode->getOffset()) return false; bool FoundLoad = false; SmallVector ChainOps; SmallVector LoopWorklist; SmallPtrSet Visited; const unsigned int Max = 1024; // Visualization of Load-Op-Store fusion: // ------------------------- // Legend: // *-lines = Chain operand dependencies. // |-lines = Normal operand dependencies. // Dependencies flow down and right. n-suffix references multiple nodes. // // C Xn C // * * * // * * * // Xn A-LD Yn TF Yn // * * \ | * | // * * \ | * | // * * \ | => A--LD_OP_ST // * * \| \ // TF OP \ // * | \ Zn // * | \ // A-ST Zn // // This merge induced dependences from: #1: Xn -> LD, OP, Zn // #2: Yn -> LD // #3: ST -> Zn // Ensure the transform is safe by checking for the dual // dependencies to make sure we do not induce a loop. // As LD is a predecessor to both OP and ST we can do this by checking: // a). if LD is a predecessor to a member of Xn or Yn. // b). if a Zn is a predecessor to ST. // However, (b) can only occur through being a chain predecessor to // ST, which is the same as Zn being a member or predecessor of Xn, // which is a subset of LD being a predecessor of Xn. So it's // subsumed by check (a). SDValue Chain = StoreNode->getChain(); // Gather X elements in ChainOps. if (Chain == Load.getValue(1)) { FoundLoad = true; ChainOps.push_back(Load.getOperand(0)); } else if (Chain.getOpcode() == ISD::TokenFactor) { for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i) { SDValue Op = Chain.getOperand(i); if (Op == Load.getValue(1)) { FoundLoad = true; // Drop Load, but keep its chain. No cycle check necessary. ChainOps.push_back(Load.getOperand(0)); continue; } LoopWorklist.push_back(Op.getNode()); ChainOps.push_back(Op); } } if (!FoundLoad) return false; // Worklist is currently Xn. Add Yn to worklist. for (SDValue Op : StoredVal->ops()) if (Op.getNode() != LoadNode) LoopWorklist.push_back(Op.getNode()); // Check (a) if Load is a predecessor to Xn + Yn if (SDNode::hasPredecessorHelper(Load.getNode(), Visited, LoopWorklist, Max, true)) return false; InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ChainOps); return true; } // Change a chain of {load; op; store} of the same value into a simple op // through memory of that value, if the uses of the modified value and its // address are suitable. // // The tablegen pattern memory operand pattern is currently not able to match // the case where the EFLAGS on the original operation are used. 
// // To move this to tablegen, we'll need to improve tablegen to allow flags to // be transferred from a node in the pattern to the result node, probably with // a new keyword. For example, we have this // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", // [(store (add (loadi64 addr:$dst), -1), addr:$dst), // (implicit EFLAGS)]>; // but maybe need something like this // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", // [(store (add (loadi64 addr:$dst), -1), addr:$dst), // (transferrable EFLAGS)]>; // // Until then, we manually fold these and instruction select the operation // here. bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { StoreSDNode *StoreNode = cast(Node); SDValue StoredVal = StoreNode->getOperand(1); unsigned Opc = StoredVal->getOpcode(); // Before we try to select anything, make sure this is memory operand size // and opcode we can handle. Note that this must match the code below that // actually lowers the opcodes. EVT MemVT = StoreNode->getMemoryVT(); if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && MemVT != MVT::i8) return false; bool IsCommutable = false; bool IsNegate = false; switch (Opc) { default: return false; case X86ISD::SUB: IsNegate = isNullConstant(StoredVal.getOperand(0)); break; case X86ISD::SBB: break; case X86ISD::ADD: case X86ISD::ADC: case X86ISD::AND: case X86ISD::OR: case X86ISD::XOR: IsCommutable = true; break; } unsigned LoadOpNo = IsNegate ? 1 : 0; LoadSDNode *LoadNode = nullptr; SDValue InputChain; if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, LoadNode, InputChain)) { if (!IsCommutable) return false; // This operation is commutable, try the other operand. LoadOpNo = 1; if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadOpNo, LoadNode, InputChain)) return false; } SDValue Base, Scale, Index, Disp, Segment; if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, Segment)) return false; auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, unsigned Opc8) { switch (MemVT.getSimpleVT().SimpleTy) { case MVT::i64: return Opc64; case MVT::i32: return Opc32; case MVT::i16: return Opc16; case MVT::i8: return Opc8; default: llvm_unreachable("Invalid size!"); } }; MachineSDNode *Result; switch (Opc) { case X86ISD::SUB: // Handle negate. if (IsNegate) { unsigned NewOpc = SelectOpcode(X86::NEG64m, X86::NEG32m, X86::NEG16m, X86::NEG8m); const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); break; } LLVM_FALLTHROUGH; case X86ISD::ADD: // Try to match inc/dec. if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) { bool IsOne = isOneConstant(StoredVal.getOperand(1)); bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1)); // ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec. if ((IsOne || IsNegOne) && hasNoCarryFlagUses(StoredVal.getValue(1))) { unsigned NewOpc = ((Opc == X86ISD::ADD) == IsOne) ? 
SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); break; } } LLVM_FALLTHROUGH; case X86ISD::ADC: case X86ISD::SBB: case X86ISD::AND: case X86ISD::OR: case X86ISD::XOR: { auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { switch (Opc) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, X86::ADD8mr); case X86ISD::ADC: return SelectOpcode(X86::ADC64mr, X86::ADC32mr, X86::ADC16mr, X86::ADC8mr); case X86ISD::SUB: return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, X86::SUB8mr); case X86ISD::SBB: return SelectOpcode(X86::SBB64mr, X86::SBB32mr, X86::SBB16mr, X86::SBB8mr); case X86ISD::AND: return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, X86::AND8mr); case X86ISD::OR: return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); case X86ISD::XOR: return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, X86::XOR8mr); default: llvm_unreachable("Invalid opcode!"); } }; auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { switch (Opc) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); case X86ISD::ADC: return SelectOpcode(X86::ADC64mi8, X86::ADC32mi8, X86::ADC16mi8, 0); case X86ISD::SUB: return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); case X86ISD::SBB: return SelectOpcode(X86::SBB64mi8, X86::SBB32mi8, X86::SBB16mi8, 0); case X86ISD::AND: return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); case X86ISD::OR: return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0); case X86ISD::XOR: return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); default: llvm_unreachable("Invalid opcode!"); } }; auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { switch (Opc) { case X86ISD::ADD: return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, X86::ADD8mi); case X86ISD::ADC: return SelectOpcode(X86::ADC64mi32, X86::ADC32mi, X86::ADC16mi, X86::ADC8mi); case X86ISD::SUB: return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, X86::SUB8mi); case X86ISD::SBB: return SelectOpcode(X86::SBB64mi32, X86::SBB32mi, X86::SBB16mi, X86::SBB8mi); case X86ISD::AND: return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, X86::AND8mi); case X86ISD::OR: return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, X86::OR8mi); case X86ISD::XOR: return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, X86::XOR8mi); default: llvm_unreachable("Invalid opcode!"); } }; unsigned NewOpc = SelectRegOpcode(Opc); SDValue Operand = StoredVal->getOperand(1-LoadOpNo); // See if the operand is a constant that we can fold into an immediate // operand. if (auto *OperandC = dyn_cast(Operand)) { int64_t OperandV = OperandC->getSExtValue(); // Check if we can shrink the operand enough to fit in an immediate (or // fit into a smaller immediate) by negating it and switching the // operation. if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && ((MemVT != MVT::i8 && !isInt<8>(OperandV) && isInt<8>(-OperandV)) || (MemVT == MVT::i64 && !isInt<32>(OperandV) && isInt<32>(-OperandV))) && hasNoCarryFlagUses(StoredVal.getValue(1))) { OperandV = -OperandV; Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; } // First try to fit this into an Imm8 operand. If it doesn't fit, then try // the larger immediate operand. 
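      // The *mi8 forms encode the immediate in a single sign-extended byte
      // instead of 2 or 4 bytes; note that i8 memory ops have no separate
      // imm8 form (SelectImm8Opcode passes 0 for that slot), which is why
      // MVT::i8 is excluded below.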
if (MemVT != MVT::i8 && isInt<8>(OperandV)) { Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImm8Opcode(Opc); } else if (MemVT != MVT::i64 || isInt<32>(OperandV)) { Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); NewOpc = SelectImmOpcode(Opc); } } if (Opc == X86ISD::ADC || Opc == X86ISD::SBB) { SDValue CopyTo = CurDAG->getCopyToReg(InputChain, SDLoc(Node), X86::EFLAGS, StoredVal.getOperand(2), SDValue()); const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Operand, CopyTo, CopyTo.getValue(1)}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); } else { const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Operand, InputChain}; Result = CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); } break; } default: llvm_unreachable("Invalid opcode!"); } MachineMemOperand *MemOps[] = {StoreNode->getMemOperand(), LoadNode->getMemOperand()}; CurDAG->setNodeMemRefs(Result, MemOps); // Update Load Chain uses as well. ReplaceUses(SDValue(LoadNode, 1), SDValue(Result, 1)); ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); CurDAG->RemoveDeadNode(Node); return true; } // See if this is an X & Mask that we can match to BEXTR/BZHI. // Where Mask is one of the following patterns: // a) x & (1 << nbits) - 1 // b) x & ~(-1 << nbits) // c) x & (-1 >> (32 - y)) // d) x << (32 - y) >> (32 - y) bool X86DAGToDAGISel::matchBitExtract(SDNode *Node) { assert( (Node->getOpcode() == ISD::AND || Node->getOpcode() == ISD::SRL) && "Should be either an and-mask, or right-shift after clearing high bits."); // BEXTR is BMI instruction, BZHI is BMI2 instruction. We need at least one. if (!Subtarget->hasBMI() && !Subtarget->hasBMI2()) return false; MVT NVT = Node->getSimpleValueType(0); // Only supported for 32 and 64 bits. if (NVT != MVT::i32 && NVT != MVT::i64) return false; SDValue NBits; // If we have BMI2's BZHI, we are ok with muti-use patterns. // Else, if we only have BMI1's BEXTR, we require one-use. const bool CanHaveExtraUses = Subtarget->hasBMI2(); auto checkUses = [CanHaveExtraUses](SDValue Op, unsigned NUses) { return CanHaveExtraUses || Op.getNode()->hasNUsesOfValue(NUses, Op.getResNo()); }; auto checkOneUse = [checkUses](SDValue Op) { return checkUses(Op, 1); }; auto checkTwoUse = [checkUses](SDValue Op) { return checkUses(Op, 2); }; auto peekThroughOneUseTruncation = [checkOneUse](SDValue V) { if (V->getOpcode() == ISD::TRUNCATE && checkOneUse(V)) { assert(V.getSimpleValueType() == MVT::i32 && V.getOperand(0).getSimpleValueType() == MVT::i64 && "Expected i64 -> i32 truncation"); V = V.getOperand(0); } return V; }; // a) x & ((1 << nbits) + (-1)) auto matchPatternA = [checkOneUse, peekThroughOneUseTruncation, &NBits](SDValue Mask) -> bool { // Match `add`. Must only have one use! if (Mask->getOpcode() != ISD::ADD || !checkOneUse(Mask)) return false; // We should be adding all-ones constant (i.e. subtracting one.) if (!isAllOnesConstant(Mask->getOperand(1))) return false; // Match `1 << nbits`. Might be truncated. Must only have one use! 
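    // For example, `x & ((1 << n) - 1)` typically reaches us canonicalized as
    // (add (shl 1, n), -1): the all-ones addend was matched above, and the
    // `1 << n` (possibly behind a truncate) is matched here, capturing `n`
    // as NBits for the eventual BZHI/BEXTR.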
SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; if (!isOneConstant(M0->getOperand(0))) return false; NBits = M0->getOperand(1); return true; }; auto isAllOnes = [this, peekThroughOneUseTruncation, NVT](SDValue V) { V = peekThroughOneUseTruncation(V); return CurDAG->MaskedValueIsAllOnes( V, APInt::getLowBitsSet(V.getSimpleValueType().getSizeInBits(), NVT.getSizeInBits())); }; // b) x & ~(-1 << nbits) auto matchPatternB = [checkOneUse, isAllOnes, peekThroughOneUseTruncation, &NBits](SDValue Mask) -> bool { // Match `~()`. Must only have one use! if (Mask.getOpcode() != ISD::XOR || !checkOneUse(Mask)) return false; // The -1 only has to be all-ones for the final Node's NVT. if (!isAllOnes(Mask->getOperand(1))) return false; // Match `-1 << nbits`. Might be truncated. Must only have one use! SDValue M0 = peekThroughOneUseTruncation(Mask->getOperand(0)); if (M0->getOpcode() != ISD::SHL || !checkOneUse(M0)) return false; // The -1 only has to be all-ones for the final Node's NVT. if (!isAllOnes(M0->getOperand(0))) return false; NBits = M0->getOperand(1); return true; }; // Match potentially-truncated (bitwidth - y) auto matchShiftAmt = [checkOneUse, &NBits](SDValue ShiftAmt, unsigned Bitwidth) { // Skip over a truncate of the shift amount. if (ShiftAmt.getOpcode() == ISD::TRUNCATE) { ShiftAmt = ShiftAmt.getOperand(0); // The trunc should have been the only user of the real shift amount. if (!checkOneUse(ShiftAmt)) return false; } // Match the shift amount as: (bitwidth - y). It should go away, too. if (ShiftAmt.getOpcode() != ISD::SUB) return false; auto *V0 = dyn_cast(ShiftAmt.getOperand(0)); if (!V0 || V0->getZExtValue() != Bitwidth) return false; NBits = ShiftAmt.getOperand(1); return true; }; // c) x & (-1 >> (32 - y)) auto matchPatternC = [checkOneUse, peekThroughOneUseTruncation, matchShiftAmt](SDValue Mask) -> bool { // The mask itself may be truncated. Mask = peekThroughOneUseTruncation(Mask); unsigned Bitwidth = Mask.getSimpleValueType().getSizeInBits(); // Match `l>>`. Must only have one use! if (Mask.getOpcode() != ISD::SRL || !checkOneUse(Mask)) return false; // We should be shifting truly all-ones constant. if (!isAllOnesConstant(Mask.getOperand(0))) return false; SDValue M1 = Mask.getOperand(1); // The shift amount should not be used externally. if (!checkOneUse(M1)) return false; return matchShiftAmt(M1, Bitwidth); }; SDValue X; // d) x << (32 - y) >> (32 - y) auto matchPatternD = [checkOneUse, checkTwoUse, matchShiftAmt, &X](SDNode *Node) -> bool { if (Node->getOpcode() != ISD::SRL) return false; SDValue N0 = Node->getOperand(0); if (N0->getOpcode() != ISD::SHL || !checkOneUse(N0)) return false; unsigned Bitwidth = N0.getSimpleValueType().getSizeInBits(); SDValue N1 = Node->getOperand(1); SDValue N01 = N0->getOperand(1); // Both of the shifts must be by the exact same value. // There should not be any uses of the shift amount outside of the pattern. if (N1 != N01 || !checkTwoUse(N1)) return false; if (!matchShiftAmt(N1, Bitwidth)) return false; X = N0->getOperand(0); return true; }; auto matchLowBitMask = [matchPatternA, matchPatternB, matchPatternC](SDValue Mask) -> bool { return matchPatternA(Mask) || matchPatternB(Mask) || matchPatternC(Mask); }; if (Node->getOpcode() == ISD::AND) { X = Node->getOperand(0); SDValue Mask = Node->getOperand(1); if (matchLowBitMask(Mask)) { // Great. 
    } else {
      std::swap(X, Mask);
      if (!matchLowBitMask(Mask))
        return false;
    }
  } else if (!matchPatternD(Node))
    return false;

  SDLoc DL(Node);

  // Truncate the shift amount.
  NBits = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NBits);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  // Insert 8-bit NBits into lowest 8 bits of 32-bit register.
  // All the other bits are undefined, we do not care about them.
  SDValue ImplDef = SDValue(
      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, MVT::i32), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), ImplDef);

  SDValue SRIdxVal = CurDAG->getTargetConstant(X86::sub_8bit, DL, MVT::i32);
  insertDAGNode(*CurDAG, SDValue(Node, 0), SRIdxVal);
  NBits = SDValue(
      CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, MVT::i32,
                             ImplDef, NBits, SRIdxVal), 0);
  insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);

  if (Subtarget->hasBMI2()) {
    // Great, just emit the BZHI.
    if (NVT != MVT::i32) {
      // But have to place the bit count into the wide-enough register first.
      NBits = CurDAG->getNode(ISD::ANY_EXTEND, DL, NVT, NBits);
      insertDAGNode(*CurDAG, SDValue(Node, 0), NBits);
    }

    SDValue Extract = CurDAG->getNode(X86ISD::BZHI, DL, NVT, X, NBits);
    ReplaceNode(Node, Extract.getNode());
    SelectCode(Extract.getNode());
    return true;
  }

  // Else, if we do *NOT* have BMI2, let's find out if the 'X' is
  // *logically* shifted (potentially with a one-use trunc in between),
  // and the truncation was the only use of the shift,
  // and if so look past one-use truncation.
  {
    SDValue RealX = peekThroughOneUseTruncation(X);
    // FIXME: only if the shift is one-use?
    if (RealX != X && RealX.getOpcode() == ISD::SRL)
      X = RealX;
  }

  MVT XVT = X.getSimpleValueType();

  // Else, emitting BEXTR requires one more step.
  // The 'control' of BEXTR has the pattern of:
  // [15...8 bit][ 7...0 bit] location
  // [ bit count][     shift] name
  // I.e. 0b00000011'00000001 means  (x >> 0b1) & 0b111
  // Shift NBits left by 8 bits, thus producing 'control'.
  // This leaves the low 8 bits of 'control' zero.
  SDValue C8 = CurDAG->getConstant(8, DL, MVT::i8);
  insertDAGNode(*CurDAG, SDValue(Node, 0), C8);
  SDValue Control = CurDAG->getNode(ISD::SHL, DL, MVT::i32, NBits, C8);
  insertDAGNode(*CurDAG, SDValue(Node, 0), Control);

  // If the 'X' is *logically* shifted, we can fold that shift into 'control'.
  // FIXME: only if the shift is one-use?
  if (X.getOpcode() == ISD::SRL) {
    SDValue ShiftAmt = X.getOperand(1);
    X = X.getOperand(0);

    assert(ShiftAmt.getValueType() == MVT::i8 &&
           "Expected shift amount to be i8");

    // Now, *zero*-extend the shift amount. The bits 8...15 *must* be zero!
    // We could zext to i16 in some form, but we intentionally don't do that.
    SDValue OrigShiftAmt = ShiftAmt;
    ShiftAmt = CurDAG->getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShiftAmt);
    insertDAGNode(*CurDAG, OrigShiftAmt, ShiftAmt);

    // And now 'or' these low 8 bits of shift amount into the 'control'.
    Control = CurDAG->getNode(ISD::OR, DL, MVT::i32, Control, ShiftAmt);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // But have to place the 'control' into the wide-enough register first.
  if (XVT != MVT::i32) {
    Control = CurDAG->getNode(ISD::ANY_EXTEND, DL, XVT, Control);
    insertDAGNode(*CurDAG, SDValue(Node, 0), Control);
  }

  // And finally, form the BEXTR itself.
  SDValue Extract = CurDAG->getNode(X86ISD::BEXTR, DL, XVT, X, Control);

  // The 'X' was originally truncated; now truncate the BEXTR result to match.
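
  // Illustrative aside (not part of this patch): with the control layout
  // above, extracting 8 bits starting at bit 4 uses control (8 << 8) | 4 =
  // 0x0804. A hypothetical source-level helper that produces this form:
  //
  //   unsigned ext8at4(unsigned X) {
  //     return __builtin_ia32_bextr_u32(X, 0x0804); // (X >> 4) & 0xff
  //   }
  //
  // The code above materializes such a control value in a register, since
  // plain BMI BEXTR has no immediate form (only TBM's BEXTRI does).
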
if (XVT != NVT) { insertDAGNode(*CurDAG, SDValue(Node, 0), Extract); Extract = CurDAG->getNode(ISD::TRUNCATE, DL, NVT, Extract); } ReplaceNode(Node, Extract.getNode()); SelectCode(Extract.getNode()); return true; } // See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. MachineSDNode *X86DAGToDAGISel::matchBEXTRFromAndImm(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); SDLoc dl(Node); SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); // If we have TBM we can use an immediate for the control. If we have BMI // we should only do this if the BEXTR instruction is implemented well. // Otherwise moving the control into a register makes this more costly. // TODO: Maybe load folding, greater than 32-bit masks, or a guarantee of LICM // hoisting the move immediate would make it worthwhile with a less optimal // BEXTR? bool PreferBEXTR = Subtarget->hasTBM() || (Subtarget->hasBMI() && Subtarget->hasFastBEXTR()); if (!PreferBEXTR && !Subtarget->hasBMI2()) return nullptr; // Must have a shift right. if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) return nullptr; // Shift can't have additional users. if (!N0->hasOneUse()) return nullptr; // Only supported for 32 and 64 bits. if (NVT != MVT::i32 && NVT != MVT::i64) return nullptr; // Shift amount and RHS of and must be constant. ConstantSDNode *MaskCst = dyn_cast(N1); ConstantSDNode *ShiftCst = dyn_cast(N0->getOperand(1)); if (!MaskCst || !ShiftCst) return nullptr; // And RHS must be a mask. uint64_t Mask = MaskCst->getZExtValue(); if (!isMask_64(Mask)) return nullptr; uint64_t Shift = ShiftCst->getZExtValue(); uint64_t MaskSize = countPopulation(Mask); // Don't interfere with something that can be handled by extracting AH. // TODO: If we are able to fold a load, BEXTR might still be better than AH. if (Shift == 8 && MaskSize == 8) return nullptr; // Make sure we are only using bits that were in the original value, not // shifted in. if (Shift + MaskSize > NVT.getSizeInBits()) return nullptr; // BZHI, if available, is always fast, unlike BEXTR. But even if we decide // that we can't use BEXTR, it is only worthwhile using BZHI if the mask // does not fit into 32 bits. Load folding is not a sufficient reason. if (!PreferBEXTR && MaskSize <= 32) return nullptr; SDValue Control; unsigned ROpc, MOpc; if (!PreferBEXTR) { assert(Subtarget->hasBMI2() && "We must have BMI2's BZHI then."); // If we can't make use of BEXTR then we can't fuse shift+mask stages. // Let's perform the mask first, and apply shift later. Note that we need to // widen the mask to account for the fact that we'll apply shift afterwards! Control = CurDAG->getTargetConstant(Shift + MaskSize, dl, NVT); ROpc = NVT == MVT::i64 ? X86::BZHI64rr : X86::BZHI32rr; MOpc = NVT == MVT::i64 ? X86::BZHI64rm : X86::BZHI32rm; unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri; Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0); } else { // The 'control' of BEXTR has the pattern of: // [15...8 bit][ 7...0 bit] location // [ bit count][ shift] name // I.e. 0b000000011'00000001 means (x >> 0b1) & 0b11 Control = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); if (Subtarget->hasTBM()) { ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; } else { assert(Subtarget->hasBMI() && "We must have BMI1's BEXTR then."); // BMI requires the immediate to placed in a register. ROpc = NVT == MVT::i64 ? 
                 X86::BEXTR64rr : X86::BEXTR32rr;
      MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
      unsigned NewOpc = NVT == MVT::i64 ? X86::MOV32ri64 : X86::MOV32ri;
      Control = SDValue(CurDAG->getMachineNode(NewOpc, dl, NVT, Control), 0);
    }
  }

  MachineSDNode *NewNode;
  SDValue Input = N0->getOperand(0);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Control,
                      Input.getOperand(0) };
    SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
    NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(Input.getValue(1), SDValue(NewNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(NewNode, {cast<MemSDNode>(Input)->getMemOperand()});
  } else {
    NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, MVT::i32, Input, Control);
  }

  if (!PreferBEXTR) {
    // We still need to apply the shift.
    SDValue ShAmt = CurDAG->getTargetConstant(Shift, dl, NVT);
    unsigned NewOpc = NVT == MVT::i64 ? X86::SHR64ri : X86::SHR32ri;
    NewNode =
        CurDAG->getMachineNode(NewOpc, dl, NVT, SDValue(NewNode, 0), ShAmt);
  }

  return NewNode;
}

// Emit a PCMPISTR(I/M) instruction.
MachineSDNode *X86DAGToDAGISel::emitPCMPISTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node) {
  SDValue N0 = Node->getOperand(0);
  SDValue N1 = Node->getOperand(1);
  SDValue Imm = Node->getOperand(2);
  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N1.getOperand(0) };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    // Update the chain.
    ReplaceUses(N1.getValue(1), SDValue(CNode, 2));
    // Record the mem-refs
    CurDAG->setNodeMemRefs(CNode, {cast<MemSDNode>(N1)->getMemOperand()});
    return CNode;
  }

  SDValue Ops[] = { N0, N1, Imm };
  SDVTList VTs = CurDAG->getVTList(VT, MVT::i32);
  MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops);
  return CNode;
}

// Emit a PCMPESTR(I/M) instruction. Also return the Glue result in case we
// need to emit a second instruction after this one. This is needed since we
// have two copyToReg nodes glued before this and we need to continue that
// glue through.
MachineSDNode *X86DAGToDAGISel::emitPCMPESTR(unsigned ROpc, unsigned MOpc,
                                             bool MayFoldLoad, const SDLoc &dl,
                                             MVT VT, SDNode *Node,
                                             SDValue &InFlag) {
  SDValue N0 = Node->getOperand(0);
  SDValue N2 = Node->getOperand(2);
  SDValue Imm = Node->getOperand(4);
  const ConstantInt *Val = cast<ConstantSDNode>(Imm)->getConstantIntValue();
  Imm = CurDAG->getTargetConstant(*Val, SDLoc(Node), Imm.getValueType());

  // Try to fold a load. No need to check alignment.
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (MayFoldLoad && tryFoldLoad(Node, N2, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
    SDValue Ops[] = { N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                      N2.getOperand(0), InFlag };
    SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Other, MVT::Glue);
    MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
    InFlag = SDValue(CNode, 3);
    // Update the chain.
ReplaceUses(N2.getValue(1), SDValue(CNode, 2)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N2)->getMemOperand()}); return CNode; } SDValue Ops[] = { N0, N2, Imm, InFlag }; SDVTList VTs = CurDAG->getVTList(VT, MVT::i32, MVT::Glue); MachineSDNode *CNode = CurDAG->getMachineNode(ROpc, dl, VTs, Ops); InFlag = SDValue(CNode, 2); return CNode; } bool X86DAGToDAGISel::tryShiftAmountMod(SDNode *N) { EVT VT = N->getValueType(0); // Only handle scalar shifts. if (VT.isVector()) return false; // Narrower shifts only mask to 5 bits in hardware. unsigned Size = VT == MVT::i64 ? 64 : 32; SDValue OrigShiftAmt = N->getOperand(1); SDValue ShiftAmt = OrigShiftAmt; SDLoc DL(N); // Skip over a truncate of the shift amount. if (ShiftAmt->getOpcode() == ISD::TRUNCATE) ShiftAmt = ShiftAmt->getOperand(0); // This function is called after X86DAGToDAGISel::matchBitExtract(), // so we are not afraid that we might mess up BZHI/BEXTR pattern. SDValue NewShiftAmt; if (ShiftAmt->getOpcode() == ISD::ADD || ShiftAmt->getOpcode() == ISD::SUB) { SDValue Add0 = ShiftAmt->getOperand(0); SDValue Add1 = ShiftAmt->getOperand(1); // If we are shifting by X+/-N where N == 0 mod Size, then just shift by X // to avoid the ADD/SUB. if (isa(Add1) && cast(Add1)->getZExtValue() % Size == 0) { NewShiftAmt = Add0; // If we are shifting by N-X where N == 0 mod Size, then just shift by -X to // generate a NEG instead of a SUB of a constant. } else if (ShiftAmt->getOpcode() == ISD::SUB && isa(Add0) && cast(Add0)->getZExtValue() != 0 && cast(Add0)->getZExtValue() % Size == 0) { // Insert a negate op. // TODO: This isn't guaranteed to replace the sub if there is a logic cone // that uses it that's not a shift. EVT SubVT = ShiftAmt.getValueType(); SDValue Zero = CurDAG->getConstant(0, DL, SubVT); SDValue Neg = CurDAG->getNode(ISD::SUB, DL, SubVT, Zero, Add1); NewShiftAmt = Neg; // Insert these operands into a valid topological order so they can // get selected independently. insertDAGNode(*CurDAG, OrigShiftAmt, Zero); insertDAGNode(*CurDAG, OrigShiftAmt, Neg); } else return false; } else return false; if (NewShiftAmt.getValueType() != MVT::i8) { // Need to truncate the shift amount. NewShiftAmt = CurDAG->getNode(ISD::TRUNCATE, DL, MVT::i8, NewShiftAmt); // Add to a correct topological ordering. insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); } // Insert a new mask to keep the shift amount legal. This should be removed // by isel patterns. NewShiftAmt = CurDAG->getNode(ISD::AND, DL, MVT::i8, NewShiftAmt, CurDAG->getConstant(Size - 1, DL, MVT::i8)); // Place in a correct topological ordering. insertDAGNode(*CurDAG, OrigShiftAmt, NewShiftAmt); SDNode *UpdatedNode = CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewShiftAmt); if (UpdatedNode != N) { // If we found an existing node, we should replace ourselves with that node // and wait for it to be selected after its other users. ReplaceNode(N, UpdatedNode); return true; } // If the original shift amount is now dead, delete it so that we don't run // it through isel. if (OrigShiftAmt.getNode()->use_empty()) CurDAG->RemoveDeadNode(OrigShiftAmt.getNode()); // Now that we've optimized the shift amount, defer to normal isel to get // load folding and legacy vs BMI2 selection without repeating it here. 
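
  // Illustrative aside (not part of this patch): because the hardware masks
  // the shift amount (mod 32 or mod 64), a hypothetical source form such as
  //
  //   uint64_t hi_n(uint64_t X, unsigned N) { return X >> (64 - N); }
  //
  // can drop the "64 -" entirely and shift by ((0 - N) & 63) instead; the
  // rewrite above selects that as a NEG feeding the shift rather than a SUB
  // from a constant.
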
  SelectCode(N);
  return true;
}

bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
  MVT NVT = N->getSimpleValueType(0);
  unsigned Opcode = N->getOpcode();
  SDLoc dl(N);

  // For operations of the form (x << C1) op C2, check if we can use a smaller
  // encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
  SDValue Shift = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
  if (!Cst)
    return false;

  int64_t Val = Cst->getSExtValue();

  // If we have an any_extend feeding the AND, look through it to see if there
  // is a shift behind it. But only if the AND doesn't use the extended bits.
  // FIXME: Generalize this to other ANY_EXTEND than i32 to i64?
  bool FoundAnyExtend = false;
  if (Shift.getOpcode() == ISD::ANY_EXTEND && Shift.hasOneUse() &&
      Shift.getOperand(0).getSimpleValueType() == MVT::i32 &&
      isUInt<32>(Val)) {
    FoundAnyExtend = true;
    Shift = Shift.getOperand(0);
  }

  if (Shift.getOpcode() != ISD::SHL || !Shift.hasOneUse())
    return false;

  // i8 is unshrinkable, i16 should be promoted to i32.
  if (NVT != MVT::i32 && NVT != MVT::i64)
    return false;

  ConstantSDNode *ShlCst = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
  if (!ShlCst)
    return false;

  uint64_t ShAmt = ShlCst->getZExtValue();

  // Make sure that we don't change the operation by removing bits.
  // This only matters for OR and XOR, AND is unaffected.
  uint64_t RemovedBitsMask = (1ULL << ShAmt) - 1;
  if (Opcode != ISD::AND && (Val & RemovedBitsMask) != 0)
    return false;

  // Check the minimum bitwidth for the new constant.
  // TODO: Using 16 and 8 bit operations is also possible for or32 & xor32.
  auto CanShrinkImmediate = [&](int64_t &ShiftedVal) {
    if (Opcode == ISD::AND) {
      // AND32ri is the same as AND64ri32 with zext imm.
      // Try this before sign extended immediates below.
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;

      // Also swap order when the AND can become MOVZX.
      if (ShiftedVal == UINT8_MAX || ShiftedVal == UINT16_MAX)
        return true;
    }
    ShiftedVal = Val >> ShAmt;
    if ((!isInt<8>(Val) && isInt<8>(ShiftedVal)) ||
        (!isInt<32>(Val) && isInt<32>(ShiftedVal)))
      return true;
    if (Opcode != ISD::AND) {
      // MOV32ri+OR64r/XOR64r is cheaper than MOV64ri64+OR64rr/XOR64rr
      ShiftedVal = (uint64_t)Val >> ShAmt;
      if (NVT == MVT::i64 && !isUInt<32>(Val) && isUInt<32>(ShiftedVal))
        return true;
    }
    return false;
  };

  int64_t ShiftedVal;
  if (!CanShrinkImmediate(ShiftedVal))
    return false;

  // Ok, we can reorder to get a smaller immediate.

  // But it's possible the original immediate allowed an AND to become MOVZX.
  // Do this check late so that the MaskedValueIsZero call happens as late as
  // possible.
  if (Opcode == ISD::AND) {
    // Find the smallest zext this could possibly be.
    unsigned ZExtWidth = Cst->getAPIntValue().getActiveBits();
    ZExtWidth = PowerOf2Ceil(std::max(ZExtWidth, 8U));

    // Figure out which bits need to be zero to achieve that mask.
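
    // Illustrative aside (not part of this patch): the reassociation above
    // turns, for example,
    //
    //   (x << 8) | 0x1F00        // needs a 4-byte immediate
    //
    // into
    //
    //   (x | 0x1F) << 8          // 1-byte immediate, same result
    //
    // which is only legal for OR/XOR when the shifted-out low bits of the
    // original constant are zero (checked via RemovedBitsMask above).
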
APInt NeededMask = APInt::getLowBitsSet(NVT.getSizeInBits(), ZExtWidth); NeededMask &= ~Cst->getAPIntValue(); if (CurDAG->MaskedValueIsZero(N->getOperand(0), NeededMask)) return false; } SDValue X = Shift.getOperand(0); if (FoundAnyExtend) { SDValue NewX = CurDAG->getNode(ISD::ANY_EXTEND, dl, NVT, X); insertDAGNode(*CurDAG, SDValue(N, 0), NewX); X = NewX; } SDValue NewCst = CurDAG->getConstant(ShiftedVal, dl, NVT); insertDAGNode(*CurDAG, SDValue(N, 0), NewCst); SDValue NewBinOp = CurDAG->getNode(Opcode, dl, NVT, X, NewCst); insertDAGNode(*CurDAG, SDValue(N, 0), NewBinOp); SDValue NewSHL = CurDAG->getNode(ISD::SHL, dl, NVT, NewBinOp, Shift.getOperand(1)); ReplaceNode(N, NewSHL.getNode()); SelectCode(NewSHL.getNode()); return true; } bool X86DAGToDAGISel::matchVPTERNLOG(SDNode *Root, SDNode *ParentA, SDNode *ParentBC, SDValue A, SDValue B, SDValue C, uint8_t Imm) { assert(A.isOperandOf(ParentA)); assert(B.isOperandOf(ParentBC)); assert(C.isOperandOf(ParentBC)); auto tryFoldLoadOrBCast = [this](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) return true; // Not a load, check for broadcast which may be behind a bitcast. if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { P = L.getNode(); L = L.getOperand(0); } if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) return false; // Only 32 and 64 bit broadcasts are supported. auto *MemIntr = cast(L); unsigned Size = MemIntr->getMemoryVT().getSizeInBits(); if (Size != 32 && Size != 64) return false; return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); }; bool FoldedLoad = false; SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoadOrBCast(Root, ParentBC, C, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { FoldedLoad = true; } else if (tryFoldLoadOrBCast(Root, ParentA, A, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { FoldedLoad = true; std::swap(A, C); // Swap bits 1/4 and 3/6. uint8_t OldImm = Imm; Imm = OldImm & 0xa5; if (OldImm & 0x02) Imm |= 0x10; if (OldImm & 0x10) Imm |= 0x02; if (OldImm & 0x08) Imm |= 0x40; if (OldImm & 0x40) Imm |= 0x08; } else if (tryFoldLoadOrBCast(Root, ParentBC, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { FoldedLoad = true; std::swap(B, C); // Swap bits 1/2 and 5/6. uint8_t OldImm = Imm; Imm = OldImm & 0x99; if (OldImm & 0x02) Imm |= 0x04; if (OldImm & 0x04) Imm |= 0x02; if (OldImm & 0x20) Imm |= 0x40; if (OldImm & 0x40) Imm |= 0x20; } SDLoc DL(Root); SDValue TImm = CurDAG->getTargetConstant(Imm, DL, MVT::i8); MVT NVT = Root->getSimpleValueType(0); MachineSDNode *MNode; if (FoldedLoad) { SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); unsigned Opc; if (C.getOpcode() == X86ISD::VBROADCAST_LOAD) { auto *MemIntr = cast(C); unsigned EltSize = MemIntr->getMemoryVT().getSizeInBits(); assert((EltSize == 32 || EltSize == 64) && "Unexpected broadcast size!"); bool UseD = EltSize == 32; if (NVT.is128BitVector()) Opc = UseD ? X86::VPTERNLOGDZ128rmbi : X86::VPTERNLOGQZ128rmbi; else if (NVT.is256BitVector()) Opc = UseD ? X86::VPTERNLOGDZ256rmbi : X86::VPTERNLOGQZ256rmbi; else if (NVT.is512BitVector()) Opc = UseD ? X86::VPTERNLOGDZrmbi : X86::VPTERNLOGQZrmbi; else llvm_unreachable("Unexpected vector size!"); } else { bool UseD = NVT.getVectorElementType() == MVT::i32; if (NVT.is128BitVector()) Opc = UseD ? X86::VPTERNLOGDZ128rmi : X86::VPTERNLOGQZ128rmi; else if (NVT.is256BitVector()) Opc = UseD ? X86::VPTERNLOGDZ256rmi : X86::VPTERNLOGQZ256rmi; else if (NVT.is512BitVector()) Opc = UseD ? 
X86::VPTERNLOGDZrmi : X86::VPTERNLOGQZrmi; else llvm_unreachable("Unexpected vector size!"); } SDValue Ops[] = {A, B, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, TImm, C.getOperand(0)}; MNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops); // Update the chain. ReplaceUses(C.getValue(1), SDValue(MNode, 1)); // Record the mem-refs CurDAG->setNodeMemRefs(MNode, {cast(C)->getMemOperand()}); } else { bool UseD = NVT.getVectorElementType() == MVT::i32; unsigned Opc; if (NVT.is128BitVector()) Opc = UseD ? X86::VPTERNLOGDZ128rri : X86::VPTERNLOGQZ128rri; else if (NVT.is256BitVector()) Opc = UseD ? X86::VPTERNLOGDZ256rri : X86::VPTERNLOGQZ256rri; else if (NVT.is512BitVector()) Opc = UseD ? X86::VPTERNLOGDZrri : X86::VPTERNLOGQZrri; else llvm_unreachable("Unexpected vector size!"); MNode = CurDAG->getMachineNode(Opc, DL, NVT, {A, B, C, TImm}); } ReplaceUses(SDValue(Root, 0), SDValue(MNode, 0)); CurDAG->RemoveDeadNode(Root); return true; } // Try to match two logic ops to a VPTERNLOG. // FIXME: Handle inverted inputs? // FIXME: Handle more complex patterns that use an operand more than once? bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) { MVT NVT = N->getSimpleValueType(0); // Make sure we support VPTERNLOG. if (!NVT.isVector() || !Subtarget->hasAVX512() || NVT.getVectorElementType() == MVT::i1) return false; // We need VLX for 128/256-bit. if (!(Subtarget->hasVLX() || NVT.is512BitVector())) return false; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); auto getFoldableLogicOp = [](SDValue Op) { // Peek through single use bitcast. if (Op.getOpcode() == ISD::BITCAST && Op.hasOneUse()) Op = Op.getOperand(0); if (!Op.hasOneUse()) return SDValue(); unsigned Opc = Op.getOpcode(); if (Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR || Opc == X86ISD::ANDNP) return Op; return SDValue(); }; SDValue A, FoldableOp; if ((FoldableOp = getFoldableLogicOp(N1))) { A = N0; } else if ((FoldableOp = getFoldableLogicOp(N0))) { A = N1; } else return false; SDValue B = FoldableOp.getOperand(0); SDValue C = FoldableOp.getOperand(1); // We can build the appropriate control immediate by performing the logic // operation we're matching using these constants for A, B, and C. const uint8_t TernlogMagicA = 0xf0; const uint8_t TernlogMagicB = 0xcc; const uint8_t TernlogMagicC = 0xaa; uint8_t Imm; switch (FoldableOp.getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case ISD::AND: Imm = TernlogMagicB & TernlogMagicC; break; case ISD::OR: Imm = TernlogMagicB | TernlogMagicC; break; case ISD::XOR: Imm = TernlogMagicB ^ TernlogMagicC; break; case X86ISD::ANDNP: Imm = ~(TernlogMagicB) & TernlogMagicC; break; } switch (N->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case X86ISD::ANDNP: if (A == N0) Imm &= ~TernlogMagicA; else Imm = ~(Imm) & TernlogMagicA; break; case ISD::AND: Imm &= TernlogMagicA; break; case ISD::OR: Imm |= TernlogMagicA; break; case ISD::XOR: Imm ^= TernlogMagicA; break; } return matchVPTERNLOG(N, N, FoldableOp.getNode(), A, B, C, Imm); } /// If the high bits of an 'and' operand are known zero, try setting the /// high bits of an 'and' constant operand to produce a smaller encoding by /// creating a small, sign-extended negative immediate rather than a large /// positive one. This reverses a transform in SimplifyDemandedBits that /// shrinks mask constants by clearing bits. There is also a possibility that /// the 'and' mask can be made -1, so the 'and' itself is unnecessary. In that /// case, just replace the 'and'. Return 'true' if the node is replaced. 
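
// Illustrative aside (not part of this patch): if EAX is known to be at most
// 0xFF (say, it was zero-extended from a byte), then
//
//   and eax, 240              ; 0x000000F0, needs a 4-byte immediate
//
// can be re-encoded as
//
//   and eax, -16              ; 0xFFFFFFF0, sign-extended 1-byte immediate
//
// because the extra mask bits only cover positions already known to be zero.
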
bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) { // i8 is unshrinkable, i16 should be promoted to i32, and vector ops don't // have immediate operands. MVT VT = And->getSimpleValueType(0); if (VT != MVT::i32 && VT != MVT::i64) return false; auto *And1C = dyn_cast(And->getOperand(1)); if (!And1C) return false; // Bail out if the mask constant is already negative. It's can't shrink more. // If the upper 32 bits of a 64 bit mask are all zeros, we have special isel // patterns to use a 32-bit and instead of a 64-bit and by relying on the // implicit zeroing of 32 bit ops. So we should check if the lower 32 bits // are negative too. APInt MaskVal = And1C->getAPIntValue(); unsigned MaskLZ = MaskVal.countLeadingZeros(); if (!MaskLZ || (VT == MVT::i64 && MaskLZ == 32)) return false; // Don't extend into the upper 32 bits of a 64 bit mask. if (VT == MVT::i64 && MaskLZ >= 32) { MaskLZ -= 32; MaskVal = MaskVal.trunc(32); } SDValue And0 = And->getOperand(0); APInt HighZeros = APInt::getHighBitsSet(MaskVal.getBitWidth(), MaskLZ); APInt NegMaskVal = MaskVal | HighZeros; // If a negative constant would not allow a smaller encoding, there's no need // to continue. Only change the constant when we know it's a win. unsigned MinWidth = NegMaskVal.getMinSignedBits(); if (MinWidth > 32 || (MinWidth > 8 && MaskVal.getMinSignedBits() <= 32)) return false; // Extend masks if we truncated above. if (VT == MVT::i64 && MaskVal.getBitWidth() < 64) { NegMaskVal = NegMaskVal.zext(64); HighZeros = HighZeros.zext(64); } // The variable operand must be all zeros in the top bits to allow using the // new, negative constant as the mask. if (!CurDAG->MaskedValueIsZero(And0, HighZeros)) return false; // Check if the mask is -1. In that case, this is an unnecessary instruction // that escaped earlier analysis. if (NegMaskVal.isAllOnesValue()) { ReplaceNode(And, And0.getNode()); return true; } // A negative mask allows a smaller encoding. Create a new 'and' node. SDValue NewMask = CurDAG->getConstant(NegMaskVal, SDLoc(And), VT); SDValue NewAnd = CurDAG->getNode(ISD::AND, SDLoc(And), VT, And0, NewMask); ReplaceNode(And, NewAnd.getNode()); SelectCode(NewAnd.getNode()); return true; } static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad, bool FoldedBCast, bool Masked) { #define VPTESTM_CASE(VT, SUFFIX) \ case MVT::VT: \ if (Masked) \ return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \ return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX; #define VPTESTM_BROADCAST_CASES(SUFFIX) \ default: llvm_unreachable("Unexpected VT!"); \ VPTESTM_CASE(v4i32, DZ128##SUFFIX) \ VPTESTM_CASE(v2i64, QZ128##SUFFIX) \ VPTESTM_CASE(v8i32, DZ256##SUFFIX) \ VPTESTM_CASE(v4i64, QZ256##SUFFIX) \ VPTESTM_CASE(v16i32, DZ##SUFFIX) \ VPTESTM_CASE(v8i64, QZ##SUFFIX) #define VPTESTM_FULL_CASES(SUFFIX) \ VPTESTM_BROADCAST_CASES(SUFFIX) \ VPTESTM_CASE(v16i8, BZ128##SUFFIX) \ VPTESTM_CASE(v8i16, WZ128##SUFFIX) \ VPTESTM_CASE(v32i8, BZ256##SUFFIX) \ VPTESTM_CASE(v16i16, WZ256##SUFFIX) \ VPTESTM_CASE(v64i8, BZ##SUFFIX) \ VPTESTM_CASE(v32i16, WZ##SUFFIX) if (FoldedBCast) { switch (TestVT.SimpleTy) { VPTESTM_BROADCAST_CASES(rmb) } } if (FoldedLoad) { switch (TestVT.SimpleTy) { VPTESTM_FULL_CASES(rm) } } switch (TestVT.SimpleTy) { VPTESTM_FULL_CASES(rr) } #undef VPTESTM_FULL_CASES #undef VPTESTM_BROADCAST_CASES #undef VPTESTM_CASE } // Try to create VPTESTM instruction. If InMask is not null, it will be used // to form a masked operation. 
bool X86DAGToDAGISel::tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue InMask) { assert(Subtarget->hasAVX512() && "Expected AVX512!"); assert(Setcc.getSimpleValueType().getVectorElementType() == MVT::i1 && "Unexpected VT!"); // Look for equal and not equal compares. ISD::CondCode CC = cast(Setcc.getOperand(2))->get(); if (CC != ISD::SETEQ && CC != ISD::SETNE) return false; SDValue SetccOp0 = Setcc.getOperand(0); SDValue SetccOp1 = Setcc.getOperand(1); // Canonicalize the all zero vector to the RHS. if (ISD::isBuildVectorAllZeros(SetccOp0.getNode())) std::swap(SetccOp0, SetccOp1); // See if we're comparing against zero. if (!ISD::isBuildVectorAllZeros(SetccOp1.getNode())) return false; SDValue N0 = SetccOp0; MVT CmpVT = N0.getSimpleValueType(); MVT CmpSVT = CmpVT.getVectorElementType(); // Start with both operands the same. We'll try to refine this. SDValue Src0 = N0; SDValue Src1 = N0; { // Look through single use bitcasts. SDValue N0Temp = N0; if (N0Temp.getOpcode() == ISD::BITCAST && N0Temp.hasOneUse()) N0Temp = N0.getOperand(0); // Look for single use AND. if (N0Temp.getOpcode() == ISD::AND && N0Temp.hasOneUse()) { Src0 = N0Temp.getOperand(0); Src1 = N0Temp.getOperand(1); } } // Without VLX we need to widen the operation. bool Widen = !Subtarget->hasVLX() && !CmpVT.is512BitVector(); auto tryFoldLoadOrBCast = [&](SDNode *Root, SDNode *P, SDValue &L, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { // If we need to widen, we can't fold the load. if (!Widen) if (tryFoldLoad(Root, P, L, Base, Scale, Index, Disp, Segment)) return true; // If we didn't fold a load, try to match broadcast. No widening limitation // for this. But only 32 and 64 bit types are supported. if (CmpSVT != MVT::i32 && CmpSVT != MVT::i64) return false; // Look through single use bitcasts. if (L.getOpcode() == ISD::BITCAST && L.hasOneUse()) { P = L.getNode(); L = L.getOperand(0); } if (L.getOpcode() != X86ISD::VBROADCAST_LOAD) return false; auto *MemIntr = cast(L); if (MemIntr->getMemoryVT().getSizeInBits() != CmpSVT.getSizeInBits()) return false; return tryFoldBroadcast(Root, P, L, Base, Scale, Index, Disp, Segment); }; // We can only fold loads if the sources are unique. bool CanFoldLoads = Src0 != Src1; bool FoldedLoad = false; SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (CanFoldLoads) { FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (!FoldedLoad) { // And is commutative. FoldedLoad = tryFoldLoadOrBCast(Root, N0.getNode(), Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (FoldedLoad) std::swap(Src0, Src1); } } bool FoldedBCast = FoldedLoad && Src1.getOpcode() == X86ISD::VBROADCAST_LOAD; bool IsMasked = InMask.getNode() != nullptr; SDLoc dl(Root); MVT ResVT = Setcc.getSimpleValueType(); MVT MaskVT = ResVT; if (Widen) { // Widen the inputs using insert_subreg or copy_to_regclass. unsigned Scale = CmpVT.is128BitVector() ? 4 : 2; unsigned SubReg = CmpVT.is128BitVector() ? X86::sub_xmm : X86::sub_ymm; unsigned NumElts = CmpVT.getVectorNumElements() * Scale; CmpVT = MVT::getVectorVT(CmpSVT, NumElts); MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue ImplDef = SDValue(CurDAG->getMachineNode(X86::IMPLICIT_DEF, dl, CmpVT), 0); Src0 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src0); if (!FoldedBCast) Src1 = CurDAG->getTargetInsertSubreg(SubReg, dl, CmpVT, ImplDef, Src1); if (IsMasked) { // Widen the mask. 
unsigned RegClass = TLI->getRegClassFor(MaskVT)->getID(); SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); InMask = SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, MaskVT, InMask, RC), 0); } } bool IsTestN = CC == ISD::SETEQ; unsigned Opc = getVPTESTMOpc(CmpVT, IsTestN, FoldedLoad, FoldedBCast, IsMasked); MachineSDNode *CNode; if (FoldedLoad) { SDVTList VTs = CurDAG->getVTList(MaskVT, MVT::Other); if (IsMasked) { SDValue Ops[] = { InMask, Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Src1.getOperand(0) }; CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); } else { SDValue Ops[] = { Src0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Src1.getOperand(0) }; CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); } // Update the chain. ReplaceUses(Src1.getValue(1), SDValue(CNode, 1)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(Src1)->getMemOperand()}); } else { if (IsMasked) CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, InMask, Src0, Src1); else CNode = CurDAG->getMachineNode(Opc, dl, MaskVT, Src0, Src1); } // If we widened, we need to shrink the mask VT. if (Widen) { unsigned RegClass = TLI->getRegClassFor(ResVT)->getID(); SDValue RC = CurDAG->getTargetConstant(RegClass, dl, MVT::i32); CNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, dl, ResVT, SDValue(CNode, 0), RC); } ReplaceUses(SDValue(Root, 0), SDValue(CNode, 0)); CurDAG->RemoveDeadNode(Root); return true; } // Try to match the bitselect pattern (or (and A, B), (andn A, C)). Turn it // into vpternlog. bool X86DAGToDAGISel::tryMatchBitSelect(SDNode *N) { assert(N->getOpcode() == ISD::OR && "Unexpected opcode!"); MVT NVT = N->getSimpleValueType(0); // Make sure we support VPTERNLOG. if (!NVT.isVector() || !Subtarget->hasAVX512()) return false; // We need VLX for 128/256-bit. if (!(Subtarget->hasVLX() || NVT.is512BitVector())) return false; SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // Canonicalize AND to LHS. if (N1.getOpcode() == ISD::AND) std::swap(N0, N1); if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP || !N0.hasOneUse() || !N1.hasOneUse()) return false; // ANDN is not commutable, use it to pick down A and C. SDValue A = N1.getOperand(0); SDValue C = N1.getOperand(1); // AND is commutable, if one operand matches A, the other operand is B. // Otherwise this isn't a match. SDValue B; if (N0.getOperand(0) == A) B = N0.getOperand(1); else if (N0.getOperand(1) == A) B = N0.getOperand(0); else return false; SDLoc dl(N); SDValue Imm = CurDAG->getTargetConstant(0xCA, dl, MVT::i8); SDValue Ternlog = CurDAG->getNode(X86ISD::VPTERNLOG, dl, NVT, A, B, C, Imm); ReplaceNode(N, Ternlog.getNode()); return matchVPTERNLOG(Ternlog.getNode(), Ternlog.getNode(), Ternlog.getNode(), A, B, C, 0xCA); } void X86DAGToDAGISel::Select(SDNode *Node) { MVT NVT = Node->getSimpleValueType(0); unsigned Opcode = Node->getOpcode(); SDLoc dl(Node); if (Node->isMachineOpcode()) { LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n'); Node->setNodeId(-1); return; // Already selected. 
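
    // Illustrative aside (not part of this patch): the ternlog immediates used
    // above come from evaluating the matched expression on the three "magic"
    // constants A = 0xF0, B = 0xCC, C = 0xAA (the truth tables of the inputs).
    // For the bitselect (or (and A, B), (andn A, C)) this gives
    //
    //   (0xF0 & 0xCC) | (~0xF0 & 0xAA) = 0xC0 | 0x0A = 0xCA
    //
    // which is exactly the 0xCA immediate passed to matchVPTERNLOG above.
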
} switch (Opcode) { default: break; case ISD::INTRINSIC_W_CHAIN: { unsigned IntNo = Node->getConstantOperandVal(1); switch (IntNo) { default: break; case Intrinsic::x86_encodekey128: case Intrinsic::x86_encodekey256: { if (!Subtarget->hasKL()) break; unsigned Opcode; switch (IntNo) { default: llvm_unreachable("Impossible intrinsic"); case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break; case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break; } SDValue Chain = Node->getOperand(0); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3), SDValue()); if (Opcode == X86::ENCODEKEY256) Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4), Chain.getValue(1)); MachineSDNode *Res = CurDAG->getMachineNode( Opcode, dl, Node->getVTList(), {Node->getOperand(2), Chain, Chain.getValue(1)}); ReplaceNode(Node, Res); return; } case Intrinsic::x86_tileloadd64_internal: { if (!Subtarget->hasAMXTILE()) break; unsigned Opc = X86::PTILELOADDV; // _tile_loadd_internal(row, col, buf, STRIDE) SDValue Base = Node->getOperand(4); SDValue Scale = getI8Imm(1, dl); SDValue Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Base, Scale, Index, Disp, Segment, CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); return; } case Intrinsic::x86_tdpbssd_internal: { if (!Subtarget->hasAMXTILE()) break; SDValue Chain = Node->getOperand(0); unsigned Opc = X86::PTDPBSSDV; SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Node->getOperand(4), Node->getOperand(5), Node->getOperand(6), Node->getOperand(7), CFG, Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); return; } + case Intrinsic::x86_tilezero_internal: { + if (!Subtarget->hasAMXTILE()) + break; + unsigned Opc = X86::PTILEZEROV; + SDValue Chain = Node->getOperand(0); + SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); + SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain}; + MachineSDNode *CNode = + CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); + ReplaceNode(Node, CNode); + return; + } } break; } case ISD::INTRINSIC_VOID: { unsigned IntNo = Node->getConstantOperandVal(1); switch (IntNo) { default: break; case Intrinsic::x86_sse3_monitor: case Intrinsic::x86_monitorx: case Intrinsic::x86_clzero: { bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64; unsigned Opc = 0; switch (IntNo) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_sse3_monitor: if (!Subtarget->hasSSE3()) break; Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr; break; case Intrinsic::x86_monitorx: if (!Subtarget->hasMWAITX()) break; Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr; break; case Intrinsic::x86_clzero: if (!Subtarget->hasCLZERO()) break; Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r; break; } if (Opc) { unsigned PtrReg = Use64BitPtr ? 
X86::RAX : X86::EAX; SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg, Node->getOperand(2), SDValue()); SDValue InFlag = Chain.getValue(1); if (IntNo == Intrinsic::x86_sse3_monitor || IntNo == Intrinsic::x86_monitorx) { // Copy the other two operands to ECX and EDX. Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3), InFlag); InFlag = Chain.getValue(1); Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4), InFlag); InFlag = Chain.getValue(1); } MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, { Chain, InFlag}); ReplaceNode(Node, CNode); return; } break; } case Intrinsic::x86_tilestored64_internal: { unsigned Opc = X86::PTILESTOREDV; // _tile_stored_internal(row, col, buf, STRIDE, c) SDValue Base = Node->getOperand(4); SDValue Scale = getI8Imm(1, dl); SDValue Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Base, Scale, Index, Disp, Segment, Node->getOperand(6), CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); ReplaceNode(Node, CNode); return; } case Intrinsic::x86_tileloadd64: case Intrinsic::x86_tileloaddt164: case Intrinsic::x86_tilestored64: { if (!Subtarget->hasAMXTILE()) break; unsigned Opc; switch (IntNo) { default: llvm_unreachable("Unexpected intrinsic!"); case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break; case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break; case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break; } // FIXME: Match displacement and scale. unsigned TIndex = Node->getConstantOperandVal(2); SDValue TReg = getI8Imm(TIndex, dl); SDValue Base = Node->getOperand(3); SDValue Scale = getI8Imm(1, dl); SDValue Index = Node->getOperand(4); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; if (Opc == X86::PTILESTORED) { SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain }; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); } else { SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain }; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); } ReplaceNode(Node, CNode); return; } } break; } case ISD::BRIND: { if (Subtarget->isTargetNaCl()) // NaCl has its own pass where jmp %r32 are converted to jmp %r64. We // leave the instruction alone. break; if (Subtarget->isTarget64BitILP32()) { // Converts a 32-bit register to a 64-bit, zero-extended version of // it. This is needed because x86-64 can do many things, but jmp %r32 // ain't one of them. SDValue Target = Node->getOperand(1); assert(Target.getValueType() == MVT::i32 && "Unexpected VT!"); SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64); SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other, Node->getOperand(0), ZextTarget); ReplaceNode(Node, Brind.getNode()); SelectCode(ZextTarget.getNode()); SelectCode(Brind.getNode()); return; } break; } case X86ISD::GlobalBaseReg: ReplaceNode(Node, getGlobalBaseReg()); return; case ISD::BITCAST: // Just drop all 128/256/512-bit bitcasts. 
    if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
        NVT == MVT::f128) {
      ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;

  case ISD::SRL:
    if (matchBitExtract(Node))
      return;
    LLVM_FALLTHROUGH;
  case ISD::SRA:
  case ISD::SHL:
    if (tryShiftAmountMod(Node))
      return;
    break;

  case X86ISD::VPTERNLOG: {
    uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue();
    if (matchVPTERNLOG(Node, Node, Node, Node->getOperand(0),
                       Node->getOperand(1), Node->getOperand(2), Imm))
      return;
    break;
  }

  case X86ISD::ANDNP:
    if (tryVPTERNLOG(Node))
      return;
    break;

  case ISD::AND:
    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
      // Try to form a masked VPTESTM. Operands can be in either order.
      SDValue N0 = Node->getOperand(0);
      SDValue N1 = Node->getOperand(1);
      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
          tryVPTESTM(Node, N0, N1))
        return;
      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
          tryVPTESTM(Node, N1, N0))
        return;
    }

    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    if (matchBitExtract(Node))
      return;
    if (AndImmShrink && shrinkAndImmediate(Node))
      return;
    LLVM_FALLTHROUGH;
  case ISD::OR:
  case ISD::XOR:
    if (tryShrinkShlLogicImm(Node))
      return;
    if (Opcode == ISD::OR && tryMatchBitSelect(Node))
      return;
    if (tryVPTERNLOG(Node))
      return;
    LLVM_FALLTHROUGH;
  case ISD::ADD:
  case ISD::SUB: {
    // Try to avoid folding immediates with multiple uses for optsize.
    // This code tries to select to register form directly to avoid going
    // through the isel table which might fold the immediate. We can't change
    // the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
    // check the immediate use count without making the patterns unavailable
    // to the fast-isel table.
    if (!CurDAG->shouldOptForSize())
      break;

    // Only handle i8/i16/i32/i64.
    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 &&
        NVT != MVT::i64)
      break;

    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
    if (!Cst)
      break;

    int64_t Val = Cst->getSExtValue();

    // Make sure it's an immediate that is considered foldable.
    // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
    if (!isInt<8>(Val) && !isInt<32>(Val))
      break;

    // If this can match to INC/DEC, let it go.
    if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
      break;

    // Check if we should avoid folding this immediate.
    if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
      break;

    // We should not fold the immediate. So we need a register form instead.
unsigned ROpc, MOpc; switch (NVT.SimpleTy) { default: llvm_unreachable("Unexpected VT!"); case MVT::i8: switch (Opcode) { default: llvm_unreachable("Unexpected opcode!"); case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break; case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break; case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break; case ISD::OR: ROpc = X86::OR8rr; MOpc = X86::OR8rm; break; case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break; } break; case MVT::i16: switch (Opcode) { default: llvm_unreachable("Unexpected opcode!"); case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break; case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break; case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break; case ISD::OR: ROpc = X86::OR16rr; MOpc = X86::OR16rm; break; case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break; } break; case MVT::i32: switch (Opcode) { default: llvm_unreachable("Unexpected opcode!"); case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break; case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break; case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break; case ISD::OR: ROpc = X86::OR32rr; MOpc = X86::OR32rm; break; case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break; } break; case MVT::i64: switch (Opcode) { default: llvm_unreachable("Unexpected opcode!"); case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break; case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break; case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break; case ISD::OR: ROpc = X86::OR64rr; MOpc = X86::OR64rm; break; case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break; } break; } // Ok this is a AND/OR/XOR/ADD/SUB with constant. // If this is a not a subtract, we can still try to fold a load. if (Opcode != ISD::SUB) { SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. ReplaceUses(N0.getValue(1), SDValue(CNode, 2)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N0)->getMemOperand()}); ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); CurDAG->RemoveDeadNode(Node); return; } } CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1); return; } case X86ISD::SMUL: // i16/i32/i64 are handled with isel patterns. if (NVT != MVT::i8) break; LLVM_FALLTHROUGH; case X86ISD::UMUL: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); unsigned LoReg, ROpc, MOpc; switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r; MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m; break; case MVT::i16: LoReg = X86::AX; ROpc = X86::MUL16r; MOpc = X86::MUL16m; break; case MVT::i32: LoReg = X86::EAX; ROpc = X86::MUL32r; MOpc = X86::MUL32m; break; case MVT::i64: LoReg = X86::RAX; ROpc = X86::MUL64r; MOpc = X86::MUL64m; break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); // Multiply is commutative. 
if (!FoldedLoad) { FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (FoldedLoad) std::swap(N0, N1); } SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0, SDValue()).getValue(1); MachineSDNode *CNode; if (FoldedLoad) { // i16/i32/i64 use an instruction that produces a low and high result even // though only the low result is used. SDVTList VTs; if (NVT == MVT::i8) VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other); else VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other); SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { // i16/i32/i64 use an instruction that produces a low and high result even // though only the low result is used. SDVTList VTs; if (NVT == MVT::i8) VTs = CurDAG->getVTList(NVT, MVT::i32); else VTs = CurDAG->getVTList(NVT, NVT, MVT::i32); CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag}); } ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2)); CurDAG->RemoveDeadNode(Node); return; } case ISD::SMUL_LOHI: case ISD::UMUL_LOHI: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); unsigned Opc, MOpc; unsigned LoReg, HiReg; bool IsSigned = Opcode == ISD::SMUL_LOHI; bool UseMULX = !IsSigned && Subtarget->hasBMI2(); bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty(); switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i32: Opc = UseMULXHi ? X86::MULX32Hrr : UseMULX ? X86::MULX32rr : IsSigned ? X86::IMUL32r : X86::MUL32r; MOpc = UseMULXHi ? X86::MULX32Hrm : UseMULX ? X86::MULX32rm : IsSigned ? X86::IMUL32m : X86::MUL32m; LoReg = UseMULX ? X86::EDX : X86::EAX; HiReg = X86::EDX; break; case MVT::i64: Opc = UseMULXHi ? X86::MULX64Hrr : UseMULX ? X86::MULX64rr : IsSigned ? X86::IMUL64r : X86::MUL64r; MOpc = UseMULXHi ? X86::MULX64Hrm : UseMULX ? X86::MULX64rm : IsSigned ? X86::IMUL64m : X86::MUL64m; LoReg = UseMULX ? X86::RDX : X86::RAX; HiReg = X86::RDX; break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); // Multiply is commmutative. if (!foldedLoad) { foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); if (foldedLoad) std::swap(N0, N1); } SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0, SDValue()).getValue(1); SDValue ResHi, ResLo; if (foldedLoad) { SDValue Chain; MachineSDNode *CNode = nullptr; SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; if (UseMULXHi) { SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); Chain = SDValue(CNode, 1); } else if (UseMULX) { SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other); CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); ResLo = SDValue(CNode, 1); Chain = SDValue(CNode, 2); } else { SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue); CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); Chain = SDValue(CNode, 0); InFlag = SDValue(CNode, 1); } // Update the chain. 
ReplaceUses(N1.getValue(1), Chain); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { SDValue Ops[] = { N1, InFlag }; if (UseMULXHi) { SDVTList VTs = CurDAG->getVTList(NVT); SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); } else if (UseMULX) { SDVTList VTs = CurDAG->getVTList(NVT, NVT); SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); ResHi = SDValue(CNode, 0); ResLo = SDValue(CNode, 1); } else { SDVTList VTs = CurDAG->getVTList(MVT::Glue); SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops); InFlag = SDValue(CNode, 0); } } // Copy the low half of the result, if it is needed. if (!SDValue(Node, 0).use_empty()) { if (!ResLo) { assert(LoReg && "Register for low half is not defined!"); ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, InFlag); InFlag = ResLo.getValue(2); } ReplaceUses(SDValue(Node, 0), ResLo); LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the high half of the result, if it is needed. if (!SDValue(Node, 1).use_empty()) { if (!ResHi) { assert(HiReg && "Register for high half is not defined!"); ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, InFlag); InFlag = ResHi.getValue(2); } ReplaceUses(SDValue(Node, 1), ResHi); LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); } CurDAG->RemoveDeadNode(Node); return; } case ISD::SDIVREM: case ISD::UDIVREM: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); unsigned ROpc, MOpc; bool isSigned = Opcode == ISD::SDIVREM; if (!isSigned) { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break; case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break; case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break; case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break; } } else { switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break; case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break; case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break; case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break; } } unsigned LoReg, HiReg, ClrReg; unsigned SExtOpcode; switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); case MVT::i8: LoReg = X86::AL; ClrReg = HiReg = X86::AH; SExtOpcode = 0; // Not used. break; case MVT::i16: LoReg = X86::AX; HiReg = X86::DX; ClrReg = X86::DX; SExtOpcode = X86::CWD; break; case MVT::i32: LoReg = X86::EAX; ClrReg = HiReg = X86::EDX; SExtOpcode = X86::CDQ; break; case MVT::i64: LoReg = X86::RAX; ClrReg = HiReg = X86::RDX; SExtOpcode = X86::CQO; break; } SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4); bool signBitIsZero = CurDAG->SignBitIsZero(N0); SDValue InFlag; if (NVT == MVT::i8) { // Special case for div8, just use a move with zero extension to AX to // clear the upper 8 bits (AH). SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain; MachineSDNode *Move; if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) }; unsigned Opc = (isSigned && !signBitIsZero) ? 
X86::MOVSX16rm8 : X86::MOVZX16rm8; Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops); Chain = SDValue(Move, 1); ReplaceUses(N0.getValue(1), Chain); // Record the mem-refs CurDAG->setNodeMemRefs(Move, {cast(N0)->getMemOperand()}); } else { unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8 : X86::MOVZX16rr8; Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0); Chain = CurDAG->getEntryNode(); } Chain = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0), SDValue()); InFlag = Chain.getValue(1); } else { InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg, N0, SDValue()).getValue(1); if (isSigned && !signBitIsZero) { // Sign extend the low part into the high part. InFlag = SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0); } else { // Zero out the high part, effectively zero extending the input. SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); switch (NVT.SimpleTy) { case MVT::i16: ClrNode = SDValue(CurDAG->getMachineNode( TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode, CurDAG->getTargetConstant(X86::sub_16bit, dl, MVT::i32)), 0); break; case MVT::i32: break; case MVT::i64: ClrNode = SDValue(CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode, CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), 0); break; default: llvm_unreachable("Unexpected division source"); } InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg, ClrNode, InFlag).getValue(1); } } if (foldedLoad) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); InFlag = SDValue(CNode, 1); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); // Record the mem-refs CurDAG->setNodeMemRefs(CNode, {cast(N1)->getMemOperand()}); } else { InFlag = SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0); } // Prevent use of AH in a REX instruction by explicitly copying it to // an ABCD_L register. // // The current assumption of the register allocator is that isel // won't generate explicit references to the GR8_ABCD_H registers. If // the allocator and/or the backend get enhanced to be more robust in // that regard, this can be, and should be, removed. if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) { SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8); unsigned AHExtOpcode = isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX; SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32, MVT::Glue, AHCopy, InFlag); SDValue Result(RNode, 0); InFlag = SDValue(RNode, 1); Result = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); ReplaceUses(SDValue(Node, 1), Result); LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the division (low) result, if it is needed. if (!SDValue(Node, 0).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 0), Result); LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } // Copy the remainder (high) result, if it is needed. 
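
    // Illustrative aside (not part of this patch): at the assembly level the
    // setup above is the difference between, for a 32-bit division by ECX,
    //
    //   cdq               ; signed: sign-extend EAX into EDX (CDQ above)
    //   idiv ecx
    // vs.
    //   xor edx, edx      ; unsigned: clear EDX (the MOV32r0 pseudo above)
    //   div ecx
    //
    // with the quotient left in EAX and the remainder in EDX. When the sign
    // bit of the dividend is known zero, the signed path also uses the
    // zeroing form, as checked via signBitIsZero above.
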
if (!SDValue(Node, 1).use_empty()) { SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT, InFlag); InFlag = Result.getValue(2); ReplaceUses(SDValue(Node, 1), Result); LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } CurDAG->RemoveDeadNode(Node); return; } case X86ISD::FCMP: case X86ISD::STRICT_FCMP: case X86ISD::STRICT_FCMPS: { bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP || Node->getOpcode() == X86ISD::STRICT_FCMPS; SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0); SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1); // Save the original VT of the compare. MVT CmpVT = N0.getSimpleValueType(); // Floating point needs special handling if we don't have FCOMI. if (Subtarget->hasCMov()) break; bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS; unsigned Opc; switch (CmpVT.SimpleTy) { default: llvm_unreachable("Unexpected type!"); case MVT::f32: Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32; break; case MVT::f64: Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64; break; case MVT::f80: Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80; break; } SDValue Cmp; SDValue Chain = IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode(); if (IsStrictCmp) { SDVTList VTs = CurDAG->getVTList(MVT::i16, MVT::Other); Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0); Chain = Cmp.getValue(1); } else { Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i16, N0, N1), 0); } // Move FPSW to AX. SDValue FPSW = CurDAG->getCopyToReg(Chain, dl, X86::FPSW, Cmp, SDValue()); Chain = FPSW; SDValue FNSTSW = SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, FPSW, FPSW.getValue(1)), 0); // Extract upper 8-bits of AX. SDValue Extract = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW); // Move AH into flags. // Some 64-bit targets lack SAHF support, but they do support FCOMI. assert(Subtarget->hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?"); SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue()); Chain = AH; SDValue SAHF = SDValue( CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0); if (IsStrictCmp) ReplaceUses(SDValue(Node, 1), Chain); ReplaceUses(SDValue(Node, 0), SAHF); CurDAG->RemoveDeadNode(Node); return; } case X86ISD::CMP: { SDValue N0 = Node->getOperand(0); SDValue N1 = Node->getOperand(1); // Optimizations for TEST compares. if (!isNullConstant(N1)) break; // Save the original VT of the compare. MVT CmpVT = N0.getSimpleValueType(); // If we are comparing (and (shr X, C, Mask) with 0, emit a BEXTR followed // by a test instruction. The test should be removed later by // analyzeCompare if we are using only the zero flag. // TODO: Should we check the users and use the BEXTR flags directly? if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) { if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) { unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr : X86::TEST32rr; SDValue BEXTR = SDValue(NewNode, 0); NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR); ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); CurDAG->RemoveDeadNode(Node); return; } } // We can peek through truncates, but we need to be careful below. if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) N0 = N0.getOperand(0); // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to // use a smaller encoding. // Look past the truncate if CMP is the only use of it. 
if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8) { ConstantSDNode *C = dyn_cast(N0.getOperand(1)); if (!C) break; uint64_t Mask = C->getZExtValue(); // Check if we can replace AND+IMM64 with a shift. This is possible for // masks/ like 0xFF000000 or 0x00FFFFFF and if we care only about the zero // flag. if (CmpVT == MVT::i64 && !isInt<32>(Mask) && onlyUsesZeroFlag(SDValue(Node, 0))) { if (isMask_64(~Mask)) { unsigned TrailingZeros = countTrailingZeros(Mask); SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64); SDValue Shift = SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32, N0.getOperand(0), Imm), 0); MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, MVT::i32, Shift, Shift); ReplaceNode(Node, Test); return; } if (isMask_64(Mask)) { unsigned LeadingZeros = countLeadingZeros(Mask); SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64); SDValue Shift = SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32, N0.getOperand(0), Imm), 0); MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl, MVT::i32, Shift, Shift); ReplaceNode(Node, Test); return; } } MVT VT; int SubRegOp; unsigned ROpc, MOpc; // For each of these checks we need to be careful if the sign flag is // being used. It is only safe to use the sign flag in two conditions, // either the sign bit in the shrunken mask is zero or the final test // size is equal to the original compare size. if (isUInt<8>(Mask) && (!(Mask & 0x80) || CmpVT == MVT::i8 || hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, convert "testl %eax, $8" to "testb %al, $8" VT = MVT::i8; SubRegOp = X86::sub_8bit; ROpc = X86::TEST8ri; MOpc = X86::TEST8mi; } else if (OptForMinSize && isUInt<16>(Mask) && (!(Mask & 0x8000) || CmpVT == MVT::i16 || hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, "testl %eax, $32776" to "testw %ax, $32776". // NOTE: We only want to form TESTW instructions if optimizing for // min size. Otherwise we only save one byte and possibly get a length // changing prefix penalty in the decoders. VT = MVT::i16; SubRegOp = X86::sub_16bit; ROpc = X86::TEST16ri; MOpc = X86::TEST16mi; } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 && ((!(Mask & 0x80000000) && // Without minsize 16-bit Cmps can get here so we need to // be sure we calculate the correct sign flag if needed. (CmpVT != MVT::i16 || !(Mask & 0x8000))) || CmpVT == MVT::i32 || hasNoSignFlagUses(SDValue(Node, 0)))) { // For example, "testq %rax, $268468232" to "testl %eax, $268468232". // NOTE: We only want to run that transform if N0 is 32 or 64 bits. // Otherwize, we find ourselves in a position where we have to do // promotion. If previous passes did not promote the and, we assume // they had a good reason not to and do not promote here. VT = MVT::i32; SubRegOp = X86::sub_32bit; ROpc = X86::TEST32ri; MOpc = X86::TEST32mi; } else { // No eligible transformation was found. break; } SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT); SDValue Reg = N0.getOperand(0); // Emit a testl or testw. 
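// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// A simplified, standalone restatement of the immediate-narrowing policy
// above. The in-tree code additionally checks the operand type and folds
// loads; only the core rule is modeled here: shrink the TEST immediate when
// it fits the narrower width and either its new sign bit is clear, the
// narrowed width equals the original compare width, or no user of the flags
// reads SF. All names below are illustrative.
#include <cstdint>

enum class TestWidth { W8, W16, W32, NoChange };

constexpr bool signSafe(uint64_t Mask, uint64_t SignBit, unsigned CmpBits,
                        unsigned NewBits, bool SignFlagUnused) {
  return !(Mask & SignBit) || CmpBits == NewBits || SignFlagUnused;
}

constexpr TestWidth pickTestWidth(uint64_t Mask, unsigned CmpBits,
                                  bool OptForMinSize, bool SignFlagUnused) {
  if (Mask <= 0xFFu && signSafe(Mask, 0x80u, CmpBits, 8, SignFlagUnused))
    return TestWidth::W8;   // e.g. "testl $8, %eax" -> "testb $8, %al"
  if (OptForMinSize && Mask <= 0xFFFFu &&
      signSafe(Mask, 0x8000u, CmpBits, 16, SignFlagUnused))
    return TestWidth::W16;  // TESTW only at minsize (operand-size prefix)
  if (Mask <= 0xFFFFFFFFu &&
      signSafe(Mask, 0x80000000u, CmpBits, 32, SignFlagUnused))
    return TestWidth::W32;  // e.g. 64-bit TEST with a 32-bit mask -> testl
  return TestWidth::NoChange;
}

static_assert(pickTestWidth(0x08, 32, false, false) == TestWidth::W8,
              "small mask with a clear sign bit narrows to testb");
static_assert(pickTestWidth(0x80, 32, false, false) == TestWidth::W32,
              "bit 7 blocks testb when SF matters, but testl is still fine");
static_assert(pickTestWidth(0x80, 64, false, true) == TestWidth::W8,
              "with SF unused the same mask can narrow all the way to testb");
// ---------------------------------------------------------------------------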
MachineSDNode *NewNode; SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { if (auto *LoadN = dyn_cast(N0.getOperand(0).getNode())) { if (!LoadN->isSimple()) { unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits(); if (MOpc == X86::TEST8mi && NumVolBits != 8) break; else if (MOpc == X86::TEST16mi && NumVolBits != 16) break; else if (MOpc == X86::TEST32mi && NumVolBits != 32) break; } } SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm, Reg.getOperand(0) }; NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops); // Update the chain. ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1)); // Record the mem-refs CurDAG->setNodeMemRefs(NewNode, {cast(Reg)->getMemOperand()}); } else { // Extract the subregister if necessary. if (N0.getValueType() != VT) Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg); NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm); } // Replace CMP with TEST. ReplaceNode(Node, NewNode); return; } break; } case X86ISD::PCMPISTR: { if (!Subtarget->hasSSE42()) break; bool NeedIndex = !SDValue(Node, 0).use_empty(); bool NeedMask = !SDValue(Node, 1).use_empty(); // We can't fold a load if we are going to make two instructions. bool MayFoldLoad = !NeedIndex || !NeedMask; MachineSDNode *CNode; if (NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm; CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node); ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); } if (NeedIndex || !NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm; CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node); ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); } // Connect the flag usage to the last instruction created. ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); CurDAG->RemoveDeadNode(Node); return; } case X86ISD::PCMPESTR: { if (!Subtarget->hasSSE42()) break; // Copy the two implicit register inputs. SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX, Node->getOperand(1), SDValue()).getValue(1); InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX, Node->getOperand(3), InFlag).getValue(1); bool NeedIndex = !SDValue(Node, 0).use_empty(); bool NeedMask = !SDValue(Node, 1).use_empty(); // We can't fold a load if we are going to make two instructions. bool MayFoldLoad = !NeedIndex || !NeedMask; MachineSDNode *CNode; if (NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm; CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node, InFlag); ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0)); } if (NeedIndex || !NeedMask) { unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr; unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm; CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag); ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0)); } // Connect the flag usage to the last instruction created. 
ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1)); CurDAG->RemoveDeadNode(Node); return; } case ISD::SETCC: { if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue())) return; break; } case ISD::STORE: if (foldLoadStoreIntoMemOperand(Node)) return; break; case X86ISD::SETCC_CARRY: { // We have to do this manually because tblgen will put the eflags copy in // the wrong place if we use an extract_subreg in the pattern. MVT VT = Node->getSimpleValueType(0); // Copy flags to the EFLAGS register and glue it to next node. SDValue EFLAGS = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, Node->getOperand(1), SDValue()); // Create a 64-bit instruction if the result is 64-bits otherwise use the // 32-bit version. unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; SDValue Result = SDValue( CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0); // For less than 32-bits we need to extract from the 32-bit node. if (VT == MVT::i8 || VT == MVT::i16) { int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); } ReplaceUses(SDValue(Node, 0), Result); CurDAG->RemoveDeadNode(Node); return; } case X86ISD::SBB: { if (isNullConstant(Node->getOperand(0)) && isNullConstant(Node->getOperand(1))) { MVT VT = Node->getSimpleValueType(0); // Create zero. SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32); SDValue Zero = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0); if (VT == MVT::i64) { Zero = SDValue( CurDAG->getMachineNode( TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, CurDAG->getTargetConstant(0, dl, MVT::i64), Zero, CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), 0); } // Copy flags to the EFLAGS register and glue it to next node. SDValue EFLAGS = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, Node->getOperand(2), SDValue()); // Create a 64-bit instruction if the result is 64-bits otherwise use the // 32-bit version. unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr; MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; VTs = CurDAG->getVTList(SBBVT, MVT::i32); SDValue Result = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS, EFLAGS.getValue(1)}), 0); // Replace the flag use. ReplaceUses(SDValue(Node, 1), Result.getValue(1)); // Replace the result use. if (!SDValue(Node, 0).use_empty()) { // For less than 32-bits we need to extract from the 32-bit node. if (VT == MVT::i8 || VT == MVT::i16) { int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit; Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result); } ReplaceUses(SDValue(Node, 0), Result); } CurDAG->RemoveDeadNode(Node); return; } break; } case X86ISD::MGATHER: { auto *Mgt = cast(Node); SDValue IndexOp = Mgt->getIndex(); SDValue Mask = Mgt->getMask(); MVT IndexVT = IndexOp.getSimpleValueType(); MVT ValueVT = Node->getSimpleValueType(0); MVT MaskVT = Mask.getSimpleValueType(); // This is just to prevent crashes if the nodes are malformed somehow. We're // otherwise only doing loose type checking in here based on type what // a type constraint would say just like table based isel. 
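// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// Both the SETCC_CARRY and the zero/zero SBB cases above lean on the classic
// carry-materialization idiom: subtracting a register from itself with borrow
// cancels the register and leaves 0 - CF, i.e. 0 when the carry is clear and
// all-ones when it is set. A standalone model of that identity:
#include <cstdint>

constexpr uint32_t sbbSameReg(uint32_t Reg, bool CarryFlag) {
  // dst = dst - (src + CF); with dst == src this collapses to 0 - CF.
  return Reg - (Reg + static_cast<uint32_t>(CarryFlag));
}
static_assert(sbbSameReg(0x12345678u, false) == 0x00000000u, "CF clear -> 0");
static_assert(sbbSameReg(0x12345678u, true) == 0xFFFFFFFFu, "CF set -> -1");
// ---------------------------------------------------------------------------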
if (!ValueVT.isVector() || !MaskVT.isVector()) break; unsigned NumElts = ValueVT.getVectorNumElements(); MVT ValueSVT = ValueVT.getVectorElementType(); bool IsFP = ValueSVT.isFloatingPoint(); unsigned EltSize = ValueSVT.getSizeInBits(); unsigned Opc = 0; bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1; if (AVX512Gather) { if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm; else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm; else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm; else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm; else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm; else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm; else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm; else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm; else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm; else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm; } else { assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() && "Unexpected mask VT!"); if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm; else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm; else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm; else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm; else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm; else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm; } if (!Opc) break; SDValue Base, Scale, Index, Disp, Segment; if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(), Base, Scale, Index, Disp, Segment)) break; SDValue PassThru = Mgt->getPassThru(); SDValue Chain = Mgt->getChain(); // Gather instructions have a mask output not in the ISD node. 
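// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// The if-chain above is effectively a table lookup keyed on the index element
// width, the data element width, FP vs. integer data, and the vector register
// width. A toy name-builder for the EVEX ("Z") forms makes that key-to-name
// encoding visible; the VEX forms chosen in the else-branch follow the same
// pattern with a Y/empty suffix instead of Z. The strings are only meant to
// illustrate the scheme, not to be authoritative opcode names.
#include <string>

std::string avx512GatherName(bool IsFP, unsigned IdxEltBits,
                             unsigned DataEltBits, unsigned VecBits) {
  std::string Name = IsFP ? "VGATHER" : "VPGATHER";
  Name += (IdxEltBits == 32) ? "D" : "Q";           // dword or qword indices
  if (IsFP)
    Name += (DataEltBits == 32) ? "PS" : "PD";      // packed single/double
  else
    Name += (DataEltBits == 32) ? "D" : "Q";        // dword or qword ints
  Name += "Z";                                      // AVX-512 (EVEX) form
  if (VecBits != 512)
    Name += std::to_string(VecBits);                // 128/256 sub-vector
  return Name + "rm";
}
// avx512GatherName(true, 32, 32, 256)  -> "VGATHERDPSZ256rm"
// avx512GatherName(false, 64, 32, 128) -> "VPGATHERQDZ128rm"
// ---------------------------------------------------------------------------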
SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other); MachineSDNode *NewNode; if (AVX512Gather) { SDValue Ops[] = {PassThru, Mask, Base, Scale, Index, Disp, Segment, Chain}; NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); } else { SDValue Ops[] = {PassThru, Base, Scale, Index, Disp, Segment, Mask, Chain}; NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); } CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()}); ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2)); CurDAG->RemoveDeadNode(Node); return; } case X86ISD::MSCATTER: { auto *Sc = cast(Node); SDValue Value = Sc->getValue(); SDValue IndexOp = Sc->getIndex(); MVT IndexVT = IndexOp.getSimpleValueType(); MVT ValueVT = Value.getSimpleValueType(); // This is just to prevent crashes if the nodes are malformed somehow. We're // otherwise only doing loose type checking in here based on type what // a type constraint would say just like table based isel. if (!ValueVT.isVector()) break; unsigned NumElts = ValueVT.getVectorNumElements(); MVT ValueSVT = ValueVT.getVectorElementType(); bool IsFP = ValueSVT.isFloatingPoint(); unsigned EltSize = ValueSVT.getSizeInBits(); unsigned Opc; if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr; else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32) Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr; else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32) Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr; else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr; else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr; else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64) Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr; else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32) Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr; else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32) Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr; else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64) Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr; else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64) Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr; else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64) Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr; else break; SDValue Base, Scale, Index, Disp, Segment; if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(), Base, Scale, Index, Disp, Segment)) break; SDValue Mask = Sc->getMask(); SDValue Chain = Sc->getChain(); // Scatter instructions have a mask output not in the ISD node. 
SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other); SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain}; MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops); CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()}); ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1)); CurDAG->RemoveDeadNode(Node); return; } case ISD::PREALLOCATED_SETUP: { auto *MFI = CurDAG->getMachineFunction().getInfo(); auto CallId = MFI->getPreallocatedIdForCallSite( cast(Node->getOperand(1))->getValue()); SDValue Chain = Node->getOperand(0); SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); MachineSDNode *New = CurDAG->getMachineNode( TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain); ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain CurDAG->RemoveDeadNode(Node); return; } case ISD::PREALLOCATED_ARG: { auto *MFI = CurDAG->getMachineFunction().getInfo(); auto CallId = MFI->getPreallocatedIdForCallSite( cast(Node->getOperand(1))->getValue()); SDValue Chain = Node->getOperand(0); SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32); SDValue ArgIndex = Node->getOperand(2); SDValue Ops[3]; Ops[0] = CallIdValue; Ops[1] = ArgIndex; Ops[2] = Chain; MachineSDNode *New = CurDAG->getMachineNode( TargetOpcode::PREALLOCATED_ARG, dl, CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()), MVT::Other), Ops); ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain CurDAG->RemoveDeadNode(Node); return; } case X86ISD::AESENCWIDE128KL: case X86ISD::AESDECWIDE128KL: case X86ISD::AESENCWIDE256KL: case X86ISD::AESDECWIDE256KL: { if (!Subtarget->hasWIDEKL()) break; unsigned Opcode; switch (Node->getOpcode()) { default: llvm_unreachable("Unexpected opcode!"); case X86ISD::AESENCWIDE128KL: Opcode = X86::AESENCWIDE128KL; break; case X86ISD::AESDECWIDE128KL: Opcode = X86::AESDECWIDE128KL; break; case X86ISD::AESENCWIDE256KL: Opcode = X86::AESENCWIDE256KL; break; case X86ISD::AESDECWIDE256KL: Opcode = X86::AESDECWIDE256KL; break; } SDValue Chain = Node->getOperand(0); SDValue Addr = Node->getOperand(1); SDValue Base, Scale, Index, Disp, Segment; if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment)) break; Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2), SDValue()); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8), Chain.getValue(1)); Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9), Chain.getValue(1)); MachineSDNode *Res = CurDAG->getMachineNode( Opcode, dl, Node->getVTList(), {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)}); CurDAG->setNodeMemRefs(Res, cast(Node)->getMemOperand()); ReplaceNode(Node, Res); return; } } SelectCode(Node); } bool X86DAGToDAGISel:: SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, std::vector &OutOps) { SDValue Op0, Op1, Op2, Op3, Op4; switch (ConstraintID) { default: llvm_unreachable("Unexpected asm memory constraint"); case InlineAsm::Constraint_o: // offsetable ?? 
case InlineAsm::Constraint_v: // not offsetable ?? case InlineAsm::Constraint_m: // memory case InlineAsm::Constraint_X: if (!selectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4)) return true; break; } OutOps.push_back(Op0); OutOps.push_back(Op1); OutOps.push_back(Op2); OutOps.push_back(Op3); OutOps.push_back(Op4); return false; } /// This pass converts a legalized DAG into a X86-specific DAG, /// ready for instruction scheduling. FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM, CodeGenOpt::Level OptLevel) { return new X86DAGToDAGISel(TM, OptLevel); } diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td index e7346261b40c..e4f3290cab9f 100644 --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -1,146 +1,149 @@ //===---- X86InstrAMX.td - AMX Instruction Set Extension --*- tablegen -*--===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file describes the instructions that make up the Intel AMX instruction // set. // //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // AMX instructions let Predicates = [HasAMXTILE, In64BitMode] in { let SchedRW = [WriteSystem] in { let hasSideEffects = 1, Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), "ldtilecfg\t$src", [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS; let hasSideEffects = 1 in def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src), "sttilecfg\t$src", [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD; let mayLoad = 1 in def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src), "tileloadd\t{$src, $dst|$dst, $src}", []>, VEX, T8XD; let mayLoad = 1 in def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst), (ins sibmem:$src), "tileloaddt1\t{$src, $dst|$dst, $src}", []>, VEX, T8PD; let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def TILERELEASE : I<0x49, MRM_C0, (outs), (ins), "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS; let mayStore = 1 in def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs), (ins sibmem:$dst, TILE:$src), "tilestored\t{$src, $dst|$dst, $src}", []>, VEX, T8XS; def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins), "tilezero\t$dst", []>, VEX, T8XD; // Pseduo instruction for RA. let hasSideEffects = 1, mayLoad = 1, Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>; let hasSideEffects = 1, mayStore = 1 in def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>; def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, TILECFG:$cfg), []>; def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, TILE:$src4, TILECFG:$cfg), []>; + def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, + GR16:$src2, + TILECFG:$cfg), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. 
// To be translated to the actual instructions in X86ISelLowering.cpp def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>; def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>; def PTILEZERO : PseudoI<(outs), (ins u8imm:$src), [(int_x86_tilezero timm:$src)]>; } } // SchedRW } // HasAMXTILE let Predicates = [HasAMXINT8, In64BitMode] in { let SchedRW = [WriteSystem] in { let Constraints = "$src1 = $dst" in { def TDPBSSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbssd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8XD; def TDPBSUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbsud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8XS; def TDPBUSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbusd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8PD; def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8PS; } // Pseduo instruction for RA. let Constraints = "$src4 = $dst" in def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, TILE:$src5, TILE:$src6, TILECFG:$cfg), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. // To be translated to the actual instructions in X86ISelLowering.cpp def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), [(int_x86_tdpbssd timm:$src1, timm:$src2, timm:$src3)]>; def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), [(int_x86_tdpbsud timm:$src1, timm:$src2, timm:$src3)]>; def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), [(int_x86_tdpbusd timm:$src1, timm:$src2, timm:$src3)]>; def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), [(int_x86_tdpbuud timm:$src1, timm:$src2, timm:$src3)]>; } } } // HasAMXTILE let Predicates = [HasAMXBF16, In64BitMode] in { let SchedRW = [WriteSystem] in { let Constraints = "$src1 = $dst" in def TDPBF16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst), (ins TILE:$src1, TILE:$src2, TILE:$src3), "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>, VEX_4V, T8XS; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. // To be translated to the actual instructions in X86ISelLowering.cpp def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1, u8imm:$src2, u8imm:$src3), [(int_x86_tdpbf16ps timm:$src1, timm:$src2, timm:$src3)]>; } } } // HasAMXTILE, HasAMXBF16 diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp index 50719744f238..05ee6c6c8384 100644 --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -1,263 +1,265 @@ //===-- X86PreTileConfig.cpp - Tile Register Configure---------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file Pass to pre-config the shape of AMX register /// AMX register need to be configured before use. 
The shape of AMX register /// is encoded in the 1st and 2nd machine operand of AMX pseudo instructions. /// The pldtilecfg is to config tile registers. It should dominator all AMX /// instructions. The pldtilecfg produce a virtual cfg register and the cfg /// register is used by all AMX instructions. /// This pass is to find the common dominator of all AMX instructions and /// insert the pldtilecfg instruction. Besides the cfg register that pldtilecfg /// produces is inserted as the last operand of each AMX instruction. We use /// this scheme to model the def-use relationship between AMX config instruction /// and other AMX instructions. Below is an example. /// /// ----B1---- /// / \ /// / \ /// B2 B3 /// %1:tile = PTILELOADDV %2:tile = PTILELOADDV /// /// is transformed to /// /// B1 /// %25:tilecfg = PLDTILECFG /// / \ /// / \ /// %1:tile = PTILELOADDV %25 %2:tile = PTILELOADDV %25 // //===----------------------------------------------------------------------===// #include "X86.h" #include "X86InstrBuilder.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TileShapeInfo.h" #include "llvm/InitializePasses.h" using namespace llvm; #define DEBUG_TYPE "tile-pre-config" namespace { class X86PreTileConfig : public MachineFunctionPass { // context MachineFunction *MF = nullptr; const X86Subtarget *ST = nullptr; const TargetRegisterInfo *TRI; const TargetInstrInfo *TII; MachineDominatorTree *DomTree = nullptr; MachineRegisterInfo *MRI = nullptr; MachineInstr *getTileConfigPoint(); public: X86PreTileConfig() : MachineFunctionPass(ID) {} /// Return the pass name. StringRef getPassName() const override { return "Tile Register Pre-configure"; } /// X86PreTileConfig analysis usage. void getAnalysisUsage(AnalysisUsage &AU) const override; /// Perform register allocation. bool runOnMachineFunction(MachineFunction &mf) override; static char ID; }; } // end anonymous namespace char X86PreTileConfig::ID = 0; INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig", "Tile Register Configure", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", "Tile Register Configure", false, false) void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx, const TargetInstrInfo *TII, MachineRegisterInfo *MRI, const X86Subtarget *ST) { auto *MBB = MI->getParent(); // FIXME: AMX should assume AVX512 enabled. if (ST->hasAVX512()) { // Zero stack slot. 
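// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// The placement problem described in the file comment above -- a single
// PLDTILECFG that dominates every AMX def -- reduces to nearest-common-
// dominator queries over the dominator tree. The pass uses
// MachineDominatorTree for this; the standalone model below (immediate-
// dominator links plus depths) only illustrates the query itself.
#include <cassert>

struct Block {
  const Block *IDom = nullptr; // immediate dominator, null for the entry
  unsigned Depth = 0;          // depth in the dominator tree
};

const Block *nearestCommonDominator(const Block *A, const Block *B) {
  while (A != B) {
    if (A->Depth > B->Depth)
      A = A->IDom;             // walk the deeper node up first
    else if (B->Depth > A->Depth)
      B = B->IDom;
    else {
      A = A->IDom;             // same depth, different blocks: move both up
      B = B->IDom;
    }
  }
  return A;
}

int main() {
  // The diamond from the comment above: B1 dominates both B2 and B3.
  Block B1, B2, B3;
  B2.IDom = &B1; B2.Depth = 1;
  B3.IDom = &B1; B3.Depth = 1;
  assert(nearestCommonDominator(&B2, &B3) == &B1); // ldtilecfg belongs in B1
  return 0;
}
// ---------------------------------------------------------------------------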
Register Zmm = MRI->createVirtualRegister(&X86::VR512RegClass); BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm) .addReg(Zmm, RegState::Undef) .addReg(Zmm, RegState::Undef); addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)), FrameIdx) .addReg(Zmm); } // build psuedo ldtilecfg Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass); addFrameReference( BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx); return VReg; } static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) { unsigned Opcode = MI.getOpcode(); switch (Opcode) { default: llvm_unreachable("Unexpected machine instruction on tile"); case X86::PTILELOADDV: case X86::PTDPBSSDV: + case X86::PTILEZEROV: MachineOperand &MO1 = const_cast(MI.getOperand(1)); MachineOperand &MO2 = const_cast(MI.getOperand(2)); ShapeT Shape(&MO1, &MO2, MRI); return Shape; } } MachineInstr *X86PreTileConfig::getTileConfigPoint() { DenseMap PhysShapeInfo; MachineBasicBlock *MBB = nullptr; DenseSet MIs; for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { Register VirtReg = Register::index2VirtReg(i); if (MRI->reg_nodbg_empty(VirtReg)) continue; const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); if (RC.getID() != X86::TILERegClassID) continue; // Find the common dominator for all MI that define tile register. for (const MachineOperand &MO : MRI->def_operands(VirtReg)) { if (MO.isUndef()) continue; const auto *MI = MO.getParent(); // PHI or IMPLICIT_DEF instructiion. // There must be a input tile before PHI instruction. if (MI->isTransient()) continue; if (!MBB) MBB = const_cast(MI->getParent()); MBB = DomTree->findNearestCommonDominator( MBB, const_cast(MI->getParent())); // Collect the instructions that define shape. ShapeT Shape = getShape(*MI, MRI); std::array ShapeMOs = {Shape.getRow(), Shape.getCol()}; for (auto *ShapeMO : ShapeMOs) { Register ShapeReg = ShapeMO->getReg(); for (const MachineOperand &MO : MRI->def_operands(ShapeReg)) { const auto *ShapeMI = MO.getParent(); MIs.insert(ShapeMI); } } } } if (!MBB) return nullptr; // This pass is before the pass of eliminating PHI node, so it // is in SSA form. assert(MRI->isSSA() && "Not SSA form in pre-tile config"); // Shape def should dominate tile config MBB. // def s s1 s2 // / \ \ / // / \ \ / // conf s3=phi(s1,s2) // | // c // for (const auto *MI : MIs) { const MachineBasicBlock *ShapeMBB = MI->getParent(); if (DomTree->dominates(ShapeMBB, MBB)) continue; if (MI->isMoveImmediate()) continue; report_fatal_error(MF->getName() + ": Failed to config tile register, " "please define the shape earlier"); } // ldtilecfg should be inserted after the MI that define the shape. MachineBasicBlock::reverse_instr_iterator I, E; for (I = MBB->instr_rbegin(), E = MBB->instr_rend(); I != E; ++I) { auto *MI = &*I; if (MIs.count(MI) && (!MI->isMoveImmediate())) break; } MachineBasicBlock::iterator MII; if (I == E) MII = MBB->getFirstNonPHI(); else { MII = MachineBasicBlock::iterator(&*I); MII++; } return &*MII; } static void addTileCFGUse(MachineFunction &MF, Register CFG) { for (MachineBasicBlock &MBB : MF) { // Traverse the basic block. 
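// --- Illustrative aside (editor's sketch, not part of this patch) ----------
// The 64-byte stack slot zeroed in buildConfigMI above is the operand that
// LDTILECFG eventually reads. Per the Intel SDM its layout is: byte 0 palette
// id, byte 1 start_row, bytes 2-15 reserved, bytes 16-47 sixteen 16-bit
// bytes-per-row (colsb) entries, bytes 48-63 sixteen 8-bit row-count entries.
// The movb $1 / movw $8 / movb $8 stores in the CHECK lines of the new
// amx-tile-basic.ll test are filling exactly these fields before the
// ldtilecfg. A struct sketch of that layout:
#include <cstdint>

struct TileConfig {
  uint8_t  PaletteId;    // byte 0: 1 selects the standard tile palette
  uint8_t  StartRow;     // byte 1: used when restarting a faulted operation
  uint8_t  Reserved[14]; // bytes 2-15: must be zero
  uint16_t ColsB[16];    // bytes 16-47: bytes per row for tmm0..tmm15
  uint8_t  Rows[16];     // bytes 48-63: rows for tmm0..tmm15
};
static_assert(sizeof(TileConfig) == 64, "ldtilecfg reads exactly 64 bytes");
// ---------------------------------------------------------------------------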
for (MachineInstr &MI : MBB) { unsigned Opcode = MI.getOpcode(); switch (Opcode) { default: break; case X86::PTILELOADDV: case X86::PTILESTOREDV: case X86::PTDPBSSDV: + case X86::PTILEZEROV: unsigned NumOperands = MI.getNumOperands(); MI.RemoveOperand(NumOperands - 1); MI.addOperand(MF, MachineOperand::CreateReg(CFG, false)); break; } } } } bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) { MF = &mf; MRI = &mf.getRegInfo(); ST = &mf.getSubtarget(); TRI = ST->getRegisterInfo(); TII = mf.getSubtarget().getInstrInfo(); DomTree = &getAnalysis(); MachineInstr *MI = getTileConfigPoint(); if (!MI) return false; unsigned Size = ST->getTileConfigSize(); Align Alignment = ST->getTileConfigAlignment(); int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false); Register CFG = buildConfigMI(MI, SS, TII, MRI, ST); addTileCFGUse(mf, CFG); return true; } FunctionPass *llvm::createX86PreTileConfigPass() { return new X86PreTileConfig(); } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp index 81571decae2d..d90b4e7bdc7e 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.cpp +++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp @@ -1,934 +1,935 @@ //===-- X86RegisterInfo.cpp - X86 Register Information --------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file contains the X86 implementation of the TargetRegisterInfo class. // This file is responsible for the frame pointer elimination optimization // on X86. // //===----------------------------------------------------------------------===// #include "X86RegisterInfo.h" #include "X86FrameLowering.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/LiveRegMatrix.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" using namespace llvm; #define GET_REGINFO_TARGET_DESC #include "X86GenRegisterInfo.inc" static cl::opt EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); X86RegisterInfo::X86RegisterInfo(const Triple &TT) : X86GenRegisterInfo((TT.isArch64Bit() ? X86::RIP : X86::EIP), X86_MC::getDwarfRegFlavour(TT, false), X86_MC::getDwarfRegFlavour(TT, true), (TT.isArch64Bit() ? X86::RIP : X86::EIP)) { X86_MC::initLLVMToSEHAndCVRegMapping(this); // Cache some information. Is64Bit = TT.isArch64Bit(); IsWin64 = Is64Bit && TT.isOSWindows(); // Use a callee-saved register as the base pointer. These registers must // not conflict with any ABI requirements. For example, in 32-bit mode PIC // requires GOT in the EBX register before function calls via PLT GOT pointer. if (Is64Bit) { SlotSize = 8; // This matches the simplified 32-bit pointer code in the data layout // computation. 
// FIXME: Should use the data layout? bool Use64BitReg = TT.getEnvironment() != Triple::GNUX32; StackPtr = Use64BitReg ? X86::RSP : X86::ESP; FramePtr = Use64BitReg ? X86::RBP : X86::EBP; BasePtr = Use64BitReg ? X86::RBX : X86::EBX; } else { SlotSize = 4; StackPtr = X86::ESP; FramePtr = X86::EBP; BasePtr = X86::ESI; } } int X86RegisterInfo::getSEHRegNum(unsigned i) const { return getEncodingValue(i); } const TargetRegisterClass * X86RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC, unsigned Idx) const { // The sub_8bit sub-register index is more constrained in 32-bit mode. // It behaves just like the sub_8bit_hi index. if (!Is64Bit && Idx == X86::sub_8bit) Idx = X86::sub_8bit_hi; // Forward to TableGen's default version. return X86GenRegisterInfo::getSubClassWithSubReg(RC, Idx); } const TargetRegisterClass * X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A, const TargetRegisterClass *B, unsigned SubIdx) const { // The sub_8bit sub-register index is more constrained in 32-bit mode. if (!Is64Bit && SubIdx == X86::sub_8bit) { A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi); if (!A) return nullptr; } return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx); } const TargetRegisterClass * X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, const MachineFunction &MF) const { // Don't allow super-classes of GR8_NOREX. This class is only used after // extracting sub_8bit_hi sub-registers. The H sub-registers cannot be copied // to the full GR8 register class in 64-bit mode, so we cannot allow the // reigster class inflation. // // The GR8_NOREX class is always used in a way that won't be constrained to a // sub-class, so sub-classes like GR8_ABCD_L are allowed to expand to the // full GR8 class. if (RC == &X86::GR8_NOREXRegClass) return RC; const X86Subtarget &Subtarget = MF.getSubtarget(); const TargetRegisterClass *Super = RC; TargetRegisterClass::sc_iterator I = RC->getSuperClasses(); do { switch (Super->getID()) { case X86::FR32RegClassID: case X86::FR64RegClassID: // If AVX-512 isn't supported we should only inflate to these classes. if (!Subtarget.hasAVX512() && getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::VR128RegClassID: case X86::VR256RegClassID: // If VLX isn't supported we should only inflate to these classes. if (!Subtarget.hasVLX() && getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::VR128XRegClassID: case X86::VR256XRegClassID: // If VLX isn't support we shouldn't inflate to these classes. if (Subtarget.hasVLX() && getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::FR32XRegClassID: case X86::FR64XRegClassID: // If AVX-512 isn't support we shouldn't inflate to these classes. if (Subtarget.hasAVX512() && getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; break; case X86::GR8RegClassID: case X86::GR16RegClassID: case X86::GR32RegClassID: case X86::GR64RegClassID: case X86::RFP32RegClassID: case X86::RFP64RegClassID: case X86::RFP80RegClassID: case X86::VR512_0_15RegClassID: case X86::VR512RegClassID: // Don't return a super-class that would shrink the spill size. // That can happen with the vector and float classes. 
if (getRegSizeInBits(*Super) == getRegSizeInBits(*RC)) return Super; } Super = *I++; } while (Super); return RC; } const TargetRegisterClass * X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { const X86Subtarget &Subtarget = MF.getSubtarget(); switch (Kind) { default: llvm_unreachable("Unexpected Kind in getPointerRegClass!"); case 0: // Normal GPRs. if (Subtarget.isTarget64BitLP64()) return &X86::GR64RegClass; // If the target is 64bit but we have been told to use 32bit addresses, // we can still use 64-bit register as long as we know the high bits // are zeros. // Reflect that in the returned register class. if (Is64Bit) { // When the target also allows 64-bit frame pointer and we do have a // frame, this is fine to use it for the address accesses as well. const X86FrameLowering *TFI = getFrameLowering(MF); return TFI->hasFP(MF) && TFI->Uses64BitFramePtr ? &X86::LOW32_ADDR_ACCESS_RBPRegClass : &X86::LOW32_ADDR_ACCESSRegClass; } return &X86::GR32RegClass; case 1: // Normal GPRs except the stack pointer (for encoding reasons). if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOSPRegClass; // NOSP does not contain RIP, so no special case here. return &X86::GR32_NOSPRegClass; case 2: // NOREX GPRs. if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOREXRegClass; return &X86::GR32_NOREXRegClass; case 3: // NOREX GPRs except the stack pointer (for encoding reasons). if (Subtarget.isTarget64BitLP64()) return &X86::GR64_NOREX_NOSPRegClass; // NOSP does not contain RIP, so no special case here. return &X86::GR32_NOREX_NOSPRegClass; case 4: // Available for tailcall (not callee-saved GPRs). return getGPRsForTailCall(MF); } } bool X86RegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC, unsigned DefSubReg, const TargetRegisterClass *SrcRC, unsigned SrcSubReg) const { // Prevent rewriting a copy where the destination size is larger than the // input size. See PR41619. // FIXME: Should this be factored into the base implementation somehow. if (DefRC->hasSuperClassEq(&X86::GR64RegClass) && DefSubReg == 0 && SrcRC->hasSuperClassEq(&X86::GR64RegClass) && SrcSubReg == X86::sub_32bit) return false; return TargetRegisterInfo::shouldRewriteCopySrc(DefRC, DefSubReg, SrcRC, SrcSubReg); } const TargetRegisterClass * X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { const Function &F = MF.getFunction(); if (IsWin64 || (F.getCallingConv() == CallingConv::Win64)) return &X86::GR64_TCW64RegClass; else if (Is64Bit) return &X86::GR64_TCRegClass; bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE); if (hasHipeCC) return &X86::GR32RegClass; return &X86::GR32_TCRegClass; } const TargetRegisterClass * X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const { if (RC == &X86::CCRRegClass) { if (Is64Bit) return &X86::GR64RegClass; else return &X86::GR32RegClass; } return RC; } unsigned X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const X86FrameLowering *TFI = getFrameLowering(MF); unsigned FPDiff = TFI->hasFP(MF) ? 1 : 0; switch (RC->getID()) { default: return 0; case X86::GR32RegClassID: return 4 - FPDiff; case X86::GR64RegClassID: return 12 - FPDiff; case X86::VR128RegClassID: return Is64Bit ? 
10 : 4; case X86::VR64RegClassID: return 4; } } const MCPhysReg * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "MachineFunction required"); const X86Subtarget &Subtarget = MF->getSubtarget(); const Function &F = MF->getFunction(); bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); bool CallsEHReturn = MF->callsEHReturn(); CallingConv::ID CC = F.getCallingConv(); // If attribute NoCallerSavedRegisters exists then we set X86_INTR calling // convention because it has the CSR list. if (MF->getFunction().hasFnAttribute("no_caller_saved_registers")) CC = CallingConv::X86_INTR; switch (CC) { case CallingConv::GHC: case CallingConv::HiPE: return CSR_NoRegs_SaveList; case CallingConv::AnyReg: if (HasAVX) return CSR_64_AllRegs_AVX_SaveList; return CSR_64_AllRegs_SaveList; case CallingConv::PreserveMost: return CSR_64_RT_MostRegs_SaveList; case CallingConv::PreserveAll: if (HasAVX) return CSR_64_RT_AllRegs_AVX_SaveList; return CSR_64_RT_AllRegs_SaveList; case CallingConv::CXX_FAST_TLS: if (Is64Bit) return MF->getInfo()->isSplitCSR() ? CSR_64_CXX_TLS_Darwin_PE_SaveList : CSR_64_TLS_Darwin_SaveList; break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_SaveList; if (HasAVX512 && Is64Bit) return CSR_64_Intel_OCL_BI_AVX512_SaveList; if (HasAVX && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX_SaveList; if (HasAVX && Is64Bit) return CSR_64_Intel_OCL_BI_AVX_SaveList; if (!HasAVX && !IsWin64 && Is64Bit) return CSR_64_Intel_OCL_BI_SaveList; break; } case CallingConv::HHVM: return CSR_64_HHVM_SaveList; case CallingConv::X86_RegCall: if (Is64Bit) { if (IsWin64) { return (HasSSE ? CSR_Win64_RegCall_SaveList : CSR_Win64_RegCall_NoSSE_SaveList); } else { return (HasSSE ? CSR_SysV64_RegCall_SaveList : CSR_SysV64_RegCall_NoSSE_SaveList); } } else { return (HasSSE ? CSR_32_RegCall_SaveList : CSR_32_RegCall_NoSSE_SaveList); } case CallingConv::CFGuard_Check: assert(!Is64Bit && "CFGuard check mechanism only used on 32-bit X86"); return (HasSSE ? CSR_Win32_CFGuard_Check_SaveList : CSR_Win32_CFGuard_Check_NoSSE_SaveList); case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_SaveList; break; case CallingConv::Win64: if (!HasSSE) return CSR_Win64_NoSSE_SaveList; return CSR_Win64_SaveList; case CallingConv::X86_64_SysV: if (CallsEHReturn) return CSR_64EHRet_SaveList; return CSR_64_SaveList; case CallingConv::X86_INTR: if (Is64Bit) { if (HasAVX512) return CSR_64_AllRegs_AVX512_SaveList; if (HasAVX) return CSR_64_AllRegs_AVX_SaveList; if (HasSSE) return CSR_64_AllRegs_SaveList; return CSR_64_AllRegs_NoSSE_SaveList; } else { if (HasAVX512) return CSR_32_AllRegs_AVX512_SaveList; if (HasAVX) return CSR_32_AllRegs_AVX_SaveList; if (HasSSE) return CSR_32_AllRegs_SSE_SaveList; return CSR_32_AllRegs_SaveList; } default: break; } if (Is64Bit) { bool IsSwiftCC = Subtarget.getTargetLowering()->supportSwiftError() && F.getAttributes().hasAttrSomewhere(Attribute::SwiftError); if (IsSwiftCC) return IsWin64 ? CSR_Win64_SwiftError_SaveList : CSR_64_SwiftError_SaveList; if (IsWin64) return HasSSE ? CSR_Win64_SaveList : CSR_Win64_NoSSE_SaveList; if (CallsEHReturn) return CSR_64EHRet_SaveList; return CSR_64_SaveList; } return CallsEHReturn ? 
CSR_32EHRet_SaveList : CSR_32_SaveList; } const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy( const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getInfo()->isSplitCSR()) return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList; return nullptr; } const uint32_t * X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const X86Subtarget &Subtarget = MF.getSubtarget(); bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); switch (CC) { case CallingConv::GHC: case CallingConv::HiPE: return CSR_NoRegs_RegMask; case CallingConv::AnyReg: if (HasAVX) return CSR_64_AllRegs_AVX_RegMask; return CSR_64_AllRegs_RegMask; case CallingConv::PreserveMost: return CSR_64_RT_MostRegs_RegMask; case CallingConv::PreserveAll: if (HasAVX) return CSR_64_RT_AllRegs_AVX_RegMask; return CSR_64_RT_AllRegs_RegMask; case CallingConv::CXX_FAST_TLS: if (Is64Bit) return CSR_64_TLS_Darwin_RegMask; break; case CallingConv::Intel_OCL_BI: { if (HasAVX512 && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX512_RegMask; if (HasAVX512 && Is64Bit) return CSR_64_Intel_OCL_BI_AVX512_RegMask; if (HasAVX && IsWin64) return CSR_Win64_Intel_OCL_BI_AVX_RegMask; if (HasAVX && Is64Bit) return CSR_64_Intel_OCL_BI_AVX_RegMask; if (!HasAVX && !IsWin64 && Is64Bit) return CSR_64_Intel_OCL_BI_RegMask; break; } case CallingConv::HHVM: return CSR_64_HHVM_RegMask; case CallingConv::X86_RegCall: if (Is64Bit) { if (IsWin64) { return (HasSSE ? CSR_Win64_RegCall_RegMask : CSR_Win64_RegCall_NoSSE_RegMask); } else { return (HasSSE ? CSR_SysV64_RegCall_RegMask : CSR_SysV64_RegCall_NoSSE_RegMask); } } else { return (HasSSE ? CSR_32_RegCall_RegMask : CSR_32_RegCall_NoSSE_RegMask); } case CallingConv::CFGuard_Check: assert(!Is64Bit && "CFGuard check mechanism only used on 32-bit X86"); return (HasSSE ? CSR_Win32_CFGuard_Check_RegMask : CSR_Win32_CFGuard_Check_NoSSE_RegMask); case CallingConv::Cold: if (Is64Bit) return CSR_64_MostRegs_RegMask; break; case CallingConv::Win64: return CSR_Win64_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; case CallingConv::X86_INTR: if (Is64Bit) { if (HasAVX512) return CSR_64_AllRegs_AVX512_RegMask; if (HasAVX) return CSR_64_AllRegs_AVX_RegMask; if (HasSSE) return CSR_64_AllRegs_RegMask; return CSR_64_AllRegs_NoSSE_RegMask; } else { if (HasAVX512) return CSR_32_AllRegs_AVX512_RegMask; if (HasAVX) return CSR_32_AllRegs_AVX_RegMask; if (HasSSE) return CSR_32_AllRegs_SSE_RegMask; return CSR_32_AllRegs_RegMask; } default: break; } // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check // callsEHReturn(). if (Is64Bit) { const Function &F = MF.getFunction(); bool IsSwiftCC = Subtarget.getTargetLowering()->supportSwiftError() && F.getAttributes().hasAttrSomewhere(Attribute::SwiftError); if (IsSwiftCC) return IsWin64 ? CSR_Win64_SwiftError_RegMask : CSR_64_SwiftError_RegMask; return IsWin64 ? CSR_Win64_RegMask : CSR_64_RegMask; } return CSR_32_RegMask; } const uint32_t* X86RegisterInfo::getNoPreservedMask() const { return CSR_NoRegs_RegMask; } const uint32_t *X86RegisterInfo::getDarwinTLSCallPreservedMask() const { return CSR_64_TLS_Darwin_RegMask; } BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const X86FrameLowering *TFI = getFrameLowering(MF); // Set the floating point control register as reserved. 
Reserved.set(X86::FPCW); // Set the floating point status register as reserved. Reserved.set(X86::FPSW); // Set the SIMD floating point control register as reserved. Reserved.set(X86::MXCSR); // Set the stack-pointer register and its aliases as reserved. for (const MCPhysReg &SubReg : subregs_inclusive(X86::RSP)) Reserved.set(SubReg); // Set the Shadow Stack Pointer as reserved. Reserved.set(X86::SSP); // Set the instruction pointer register and its aliases as reserved. for (const MCPhysReg &SubReg : subregs_inclusive(X86::RIP)) Reserved.set(SubReg); // Set the frame-pointer register and its aliases as reserved if needed. if (TFI->hasFP(MF)) { for (const MCPhysReg &SubReg : subregs_inclusive(X86::RBP)) Reserved.set(SubReg); } // Set the base-pointer register and its aliases as reserved if needed. if (hasBasePointer(MF)) { CallingConv::ID CC = MF.getFunction().getCallingConv(); const uint32_t *RegMask = getCallPreservedMask(MF, CC); if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister())) report_fatal_error( "Stack realignment in presence of dynamic allocas is not supported with" "this calling convention."); Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64); for (const MCPhysReg &SubReg : subregs_inclusive(BasePtr)) Reserved.set(SubReg); } // Mark the segment registers as reserved. Reserved.set(X86::CS); Reserved.set(X86::SS); Reserved.set(X86::DS); Reserved.set(X86::ES); Reserved.set(X86::FS); Reserved.set(X86::GS); // Mark the floating point stack registers as reserved. for (unsigned n = 0; n != 8; ++n) Reserved.set(X86::ST0 + n); // Reserve the registers that only exist in 64-bit mode. if (!Is64Bit) { // These 8-bit registers are part of the x86-64 extension even though their // super-registers are old 32-bits. Reserved.set(X86::SIL); Reserved.set(X86::DIL); Reserved.set(X86::BPL); Reserved.set(X86::SPL); Reserved.set(X86::SIH); Reserved.set(X86::DIH); Reserved.set(X86::BPH); Reserved.set(X86::SPH); for (unsigned n = 0; n != 8; ++n) { // R8, R9, ... for (MCRegAliasIterator AI(X86::R8 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); // XMM8, XMM9, ... for (MCRegAliasIterator AI(X86::XMM8 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); } } if (!Is64Bit || !MF.getSubtarget().hasAVX512()) { for (unsigned n = 16; n != 32; ++n) { for (MCRegAliasIterator AI(X86::XMM0 + n, this, true); AI.isValid(); ++AI) Reserved.set(*AI); } } assert(checkAllSuperRegsMarked(Reserved, {X86::SIL, X86::DIL, X86::BPL, X86::SPL, X86::SIH, X86::DIH, X86::BPH, X86::SPH})); return Reserved; } void X86RegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const { // Check if the EFLAGS register is marked as live-out. This shouldn't happen, // because the calling convention defines the EFLAGS register as NOT // preserved. // // Unfortunatelly the EFLAGS show up as live-out after branch folding. Adding // an assert to track this and clear the register afterwards to avoid // unnecessary crashes during release builds. assert(!(Mask[X86::EFLAGS / 32] & (1U << (X86::EFLAGS % 32))) && "EFLAGS are not live-out from a patchpoint."); // Also clean other registers that don't need preserving (IP). 
for (auto Reg : {X86::EFLAGS, X86::RIP, X86::EIP, X86::IP}) Mask[Reg / 32] &= ~(1U << (Reg % 32)); } //===----------------------------------------------------------------------===// // Stack Frame Processing methods //===----------------------------------------------------------------------===// static bool CantUseSP(const MachineFrameInfo &MFI) { return MFI.hasVarSizedObjects() || MFI.hasOpaqueSPAdjustment(); } bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const { const X86MachineFunctionInfo *X86FI = MF.getInfo(); if (X86FI->hasPreallocatedCall()) return true; const MachineFrameInfo &MFI = MF.getFrameInfo(); if (!EnableBasePointer) return false; // When we need stack realignment, we can't address the stack from the frame // pointer. When we have dynamic allocas or stack-adjusting inline asm, we // can't address variables from the stack pointer. MS inline asm can // reference locals while also adjusting the stack pointer. When we can't // use both the SP and the FP, we need a separate base pointer register. bool CantUseFP = needsStackRealignment(MF); return CantUseFP && CantUseSP(MFI); } bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const { if (!TargetRegisterInfo::canRealignStack(MF)) return false; const MachineFrameInfo &MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); // Stack realignment requires a frame pointer. If we already started // register allocation with frame pointer elimination, it is too late now. if (!MRI->canReserveReg(FramePtr)) return false; // If a base pointer is necessary. Check that it isn't too late to reserve // it. if (CantUseSP(MFI)) return MRI->canReserveReg(BasePtr); return true; } // tryOptimizeLEAtoMOV - helper function that tries to replace a LEA instruction // of the form 'lea (%esp), %ebx' --> 'mov %esp, %ebx'. // TODO: In this case we should be really trying first to entirely eliminate // this instruction which is a plain copy. static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) { MachineInstr &MI = *II; unsigned Opc = II->getOpcode(); // Check if this is a LEA of the form 'lea (%esp), %ebx' if ((Opc != X86::LEA32r && Opc != X86::LEA64r && Opc != X86::LEA64_32r) || MI.getOperand(2).getImm() != 1 || MI.getOperand(3).getReg() != X86::NoRegister || MI.getOperand(4).getImm() != 0 || MI.getOperand(5).getReg() != X86::NoRegister) return false; Register BasePtr = MI.getOperand(1).getReg(); // In X32 mode, ensure the base-pointer is a 32-bit operand, so the LEA will // be replaced with a 32-bit operand MOV which will zero extend the upper // 32-bits of the super register. if (Opc == X86::LEA64_32r) BasePtr = getX86SubSuperRegister(BasePtr, 32); Register NewDestReg = MI.getOperand(0).getReg(); const X86InstrInfo *TII = MI.getParent()->getParent()->getSubtarget().getInstrInfo(); TII->copyPhysReg(*MI.getParent(), II, MI.getDebugLoc(), NewDestReg, BasePtr, MI.getOperand(1).isKill()); MI.eraseFromParent(); return true; } static bool isFuncletReturnInstr(MachineInstr &MI) { switch (MI.getOpcode()) { case X86::CATCHRET: case X86::CLEANUPRET: return true; default: return false; } llvm_unreachable("impossible"); } void X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { MachineInstr &MI = *II; MachineBasicBlock &MBB = *MI.getParent(); MachineFunction &MF = *MBB.getParent(); MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); bool IsEHFuncletEpilogue = MBBI == MBB.end() ? 
false : isFuncletReturnInstr(*MBBI); const X86FrameLowering *TFI = getFrameLowering(MF); int FrameIndex = MI.getOperand(FIOperandNum).getIndex(); // Determine base register and offset. int FIOffset; Register BasePtr; if (MI.isReturn()) { assert((!needsStackRealignment(MF) || MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) && "Return instruction can only reference SP relative frame objects"); FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0).getFixed(); } else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) { FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr); } else { FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr).getFixed(); } // LOCAL_ESCAPE uses a single offset, with no register. It only works in the // simple FP case, and doesn't work with stack realignment. On 32-bit, the // offset is from the traditional base pointer location. On 64-bit, the // offset is from the SP at the end of the prologue, not the FP location. This // matches the behavior of llvm.frameaddress. unsigned Opc = MI.getOpcode(); if (Opc == TargetOpcode::LOCAL_ESCAPE) { MachineOperand &FI = MI.getOperand(FIOperandNum); FI.ChangeToImmediate(FIOffset); return; } // For LEA64_32r when BasePtr is 32-bits (X32) we can use full-size 64-bit // register as source operand, semantic is the same and destination is // 32-bits. It saves one byte per lea in code since 0x67 prefix is avoided. // Don't change BasePtr since it is used later for stack adjustment. Register MachineBasePtr = BasePtr; if (Opc == X86::LEA64_32r && X86::GR32RegClass.contains(BasePtr)) MachineBasePtr = getX86SubSuperRegister(BasePtr, 64); // This must be part of a four operand memory reference. Replace the // FrameIndex with base register. Add an offset to the offset. MI.getOperand(FIOperandNum).ChangeToRegister(MachineBasePtr, false); if (BasePtr == StackPtr) FIOffset += SPAdj; // The frame index format for stackmaps and patchpoints is different from the // X86 format. It only has a FI and an offset. if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) { assert(BasePtr == FramePtr && "Expected the FP as base register"); int64_t Offset = MI.getOperand(FIOperandNum + 1).getImm() + FIOffset; MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset); return; } if (MI.getOperand(FIOperandNum+3).isImm()) { // Offset is a 32-bit integer. int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm()); int Offset = FIOffset + Imm; assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) && "Requesting 64-bit offset in 32-bit immediate!"); if (Offset != 0 || !tryOptimizeLEAtoMOV(II)) MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset); } else { // Offset is symbolic. This is extremely rare. 
    uint64_t Offset =
        FIOffset + (uint64_t)MI.getOperand(FIOperandNum+3).getOffset();
    MI.getOperand(FIOperandNum + 3).setOffset(Offset);
  }
}

unsigned X86RegisterInfo::findDeadCallerSavedReg(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI) const {
  const MachineFunction *MF = MBB.getParent();
  if (MF->callsEHReturn())
    return 0;

  const TargetRegisterClass &AvailableRegs = *getGPRsForTailCall(*MF);

  if (MBBI == MBB.end())
    return 0;

  switch (MBBI->getOpcode()) {
  default:
    return 0;
  case TargetOpcode::PATCHABLE_RET:
  case X86::RET:
  case X86::RETL:
  case X86::RETQ:
  case X86::RETIL:
  case X86::RETIQ:
  case X86::TCRETURNdi:
  case X86::TCRETURNri:
  case X86::TCRETURNmi:
  case X86::TCRETURNdi64:
  case X86::TCRETURNri64:
  case X86::TCRETURNmi64:
  case X86::EH_RETURN:
  case X86::EH_RETURN64: {
    SmallSet<uint16_t, 8> Uses;
    for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
      MachineOperand &MO = MBBI->getOperand(I);
      if (!MO.isReg() || MO.isDef())
        continue;
      Register Reg = MO.getReg();
      if (!Reg)
        continue;
      for (MCRegAliasIterator AI(Reg, this, true); AI.isValid(); ++AI)
        Uses.insert(*AI);
    }

    for (auto CS : AvailableRegs)
      if (!Uses.count(CS) && CS != X86::RIP && CS != X86::RSP &&
          CS != X86::ESP)
        return CS;
  }
  }

  return 0;
}

Register X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const X86FrameLowering *TFI = getFrameLowering(MF);
  return TFI->hasFP(MF) ? FramePtr : StackPtr;
}

unsigned
X86RegisterInfo::getPtrSizedFrameRegister(const MachineFunction &MF) const {
  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
  Register FrameReg = getFrameRegister(MF);
  if (Subtarget.isTarget64BitILP32())
    FrameReg = getX86SubSuperRegister(FrameReg, 32);
  return FrameReg;
}

unsigned
X86RegisterInfo::getPtrSizedStackRegister(const MachineFunction &MF) const {
  const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
  Register StackReg = getStackRegister();
  if (Subtarget.isTarget64BitILP32())
    StackReg = getX86SubSuperRegister(StackReg, 32);
  return StackReg;
}

static ShapeT getTileShape(Register VirtReg, VirtRegMap *VRM,
                           const MachineRegisterInfo *MRI) {
  if (VRM->hasShape(VirtReg))
    return VRM->getShape(VirtReg);

  const MachineOperand &Def = *MRI->def_begin(VirtReg);
  MachineInstr *MI = const_cast<MachineInstr *>(Def.getParent());
  unsigned OpCode = MI->getOpcode();
  switch (OpCode) {
  default:
    llvm_unreachable("Unexpected machine instruction on tile register!");
    break;
  // We only collect the tile shape that is defined.
  case X86::PTILELOADDV:
  case X86::PTDPBSSDV:
+  case X86::PTILEZEROV:
    MachineOperand &MO1 = MI->getOperand(1);
    MachineOperand &MO2 = MI->getOperand(2);
    ShapeT Shape(&MO1, &MO2, MRI);
    VRM->assignVirt2Shape(VirtReg, Shape);
    return Shape;
  }
}

bool X86RegisterInfo::getRegAllocationHints(Register VirtReg,
                                            ArrayRef<MCPhysReg> Order,
                                            SmallVectorImpl<MCPhysReg> &Hints,
                                            const MachineFunction &MF,
                                            const VirtRegMap *VRM,
                                            const LiveRegMatrix *Matrix) const {
  const MachineRegisterInfo *MRI = &MF.getRegInfo();
  const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
  bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
      VirtReg, Order, Hints, MF, VRM, Matrix);

  if (RC.getID() != X86::TILERegClassID)
    return BaseImplRetVal;

  ShapeT VirtShape = getTileShape(VirtReg, const_cast<VirtRegMap *>(VRM), MRI);
  auto AddHint = [&](MCPhysReg PhysReg) {
    Register VReg = Matrix->getOneVReg(PhysReg);
    if (VReg == MCRegister::NoRegister) { // Not allocated yet
      Hints.push_back(PhysReg);
      return;
    }
    ShapeT PhysShape = getTileShape(VReg, const_cast<VirtRegMap *>(VRM), MRI);
    if (PhysShape == VirtShape)
      Hints.push_back(PhysReg);
  };

  SmallSet<MCPhysReg, 4> CopyHints;
  CopyHints.insert(Hints.begin(), Hints.end());
  Hints.clear();
  for (auto Hint : CopyHints) {
    if (RC.contains(Hint) && !MRI->isReserved(Hint))
      AddHint(Hint);
  }
  for (MCPhysReg PhysReg : Order) {
    if (!CopyHints.count(PhysReg) && RC.contains(PhysReg) &&
        !MRI->isReserved(PhysReg))
      AddHint(PhysReg);
  }

#define DEBUG_TYPE "tile-hint"
  LLVM_DEBUG({
    dbgs() << "Hints for virtual register " << format_hex(VirtReg, 8) << "\n";
    for (auto Hint : Hints) {
      dbgs() << "tmm" << Hint << ",";
    }
    dbgs() << "\n";
  });
#undef DEBUG_TYPE

  return true;
}
diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
new file mode 100644
index 000000000000..501bde029dc1
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-tile-basic.ll
@@ -0,0 +1,38 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+
+define void @test_amx(i8* %pointer, i8* %base, i64 %stride) {
+; CHECK-LABEL: test_amx:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    tilezero %tmm0
+; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm1
+; CHECK-NEXT:    tileloadd (%rsi,%rdx), %tmm2
+; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:    tilestored %tmm0, (%rdi,%rdx)
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %c = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
+  %a = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
+  %b = call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, i8* %base, i64 %stride)
+  %d = call x86_amx @llvm.x86.tdpbssd.internal(i16 8, i16 8, i16 8, x86_amx %c, x86_amx %a, x86_amx %b)
+  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %pointer, i64 %stride, x86_amx %d)
+
+  ret void
+}
+
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
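
For reference, a minimal IR sketch that exercises only the new tilezero lowering in isolation, built from the same intrinsic declarations used by the test above; the function name tilezero_only and the 8x8 shape are illustrative, and the ldtilecfg/tile-register choices are left entirely to llc.

; Minimal sketch (not part of the patch): zero an 8x8 tile via
; llvm.x86.tilezero.internal and store it through
; llvm.x86.tilestored64.internal.
define void @tilezero_only(i8* %dst, i64 %stride) {
  %z = call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8)
  call void @llvm.x86.tilestored64.internal(i16 8, i16 8, i8* %dst, i64 %stride, x86_amx %z)
  ret void
}

declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)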