Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td @@ -1579,6 +1579,42 @@ Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i32_ty, llvm_v16f32_ty, llvm_i16_ty], [IntrNoMem]>; + + def int_x86_avx512_mask_movshdup_128 : + GCCBuiltin<"__builtin_ia32_movshdup128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], + [IntrNoMem]>; + + def int_x86_avx512_mask_movshdup_256 : + GCCBuiltin<"__builtin_ia32_movshdup256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], + [IntrNoMem]>; + + def int_x86_avx512_mask_movshdup_512 : + GCCBuiltin<"__builtin_ia32_movshdup512_mask">, + Intrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], + [IntrNoMem]>; + + def int_x86_avx512_mask_movsldup_128 : + GCCBuiltin<"__builtin_ia32_movsldup128_mask">, + Intrinsic<[llvm_v4f32_ty], + [llvm_v4f32_ty, llvm_v4f32_ty, llvm_i8_ty], + [IntrNoMem]>; + + def int_x86_avx512_mask_movsldup_256 : + GCCBuiltin<"__builtin_ia32_movsldup256_mask">, + Intrinsic<[llvm_v8f32_ty], + [llvm_v8f32_ty, llvm_v8f32_ty, llvm_i8_ty], + [IntrNoMem]>; + + def int_x86_avx512_mask_movsldup_512 : + GCCBuiltin<"__builtin_ia32_movsldup512_mask">, + Intrinsic<[llvm_v16f32_ty], + [llvm_v16f32_ty, llvm_v16f32_ty, llvm_i16_ty], + [IntrNoMem]>; } // Vector blend Index: llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp =================================================================== --- llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ llvm/trunk/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -16,11 +16,31 @@ #include "MCTargetDesc/X86MCTargetDesc.h" #include "Utils/X86ShuffleDecode.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCRegisterInfo.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; +static unsigned getVectorRegSize(unsigned RegNo) { + if (X86MCRegisterClasses[X86::VR512RegClassID].contains(RegNo)) + return 512; + else if (X86MCRegisterClasses[X86::VR256XRegClassID].contains(RegNo)) + return 256; + else if (X86MCRegisterClasses[X86::VR128XRegClassID].contains(RegNo)) + return 128; + + llvm_unreachable("Unknown vector reg!"); + return 0; +} + +static MVT getRegOperandVectorVT(const MCInst *MI, const MVT &ScalarVT, + unsigned OperandIndex) { + unsigned OpReg = MI->getOperand(OperandIndex).getReg(); + return MVT::getVectorVT(ScalarVT, + getVectorRegSize(OpReg)/ScalarVT.getSizeInBits()); +} + /// \brief Extracts the src/dst types for a given zero extension instruction. /// \note While the number of elements in DstVT type correct, the /// number in the SrcVT type is expanded to fill the src xmm register and the @@ -107,19 +127,30 @@ } } -#define CASE_VSHUF_COMMON(Inst, Suffix, src2) \ - case X86::VSHUFF##Inst##Suffix##r##src2##i: \ - case X86::VSHUFF##Inst##Suffix##r##src2##ik: \ - case X86::VSHUFF##Inst##Suffix##r##src2##ikz: \ - case X86::VSHUFI##Inst##Suffix##r##src2##i: \ - case X86::VSHUFI##Inst##Suffix##r##src2##ik: \ - case X86::VSHUFI##Inst##Suffix##r##src2##ikz: - -#define CASE_VSHUF(Inst) \ - CASE_VSHUF_COMMON(Inst, Z, r) \ - CASE_VSHUF_COMMON(Inst, Z, m) \ - CASE_VSHUF_COMMON(Inst, Z256, r) \ - CASE_VSHUF_COMMON(Inst, Z256, m) \ +#define CASE_MASK_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: \ + case X86::V##Inst##Suffix##src##k: \ + case X86::V##Inst##Suffix##src##kz: + +#define CASE_SSE_INS_COMMON(Inst, src) \ + case X86::Inst##src: + +#define CASE_AVX_INS_COMMON(Inst, Suffix, src) \ + case X86::V##Inst##Suffix##src: + +#define CASE_MOVDUP(Inst, src) \ + CASE_MASK_INS_COMMON(Inst, Z, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z256, r##src) \ + CASE_MASK_INS_COMMON(Inst, Z128, r##src) \ + CASE_AVX_INS_COMMON(Inst, , r##src) \ + CASE_AVX_INS_COMMON(Inst, Y, r##src) \ + CASE_SSE_INS_COMMON(Inst, r##src) \ + +#define CASE_VSHUF(Inst, src) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFF##Inst, Z256, r##src##i) \ + CASE_MASK_INS_COMMON(SHUFI##Inst, Z256, r##src##i) \ /// \brief Extracts the types and if it has memory operand for a given /// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2) instruction. @@ -129,25 +160,15 @@ default: llvm_unreachable("Unknown VSHUF64x2 family instructions."); break; - CASE_VSHUF_COMMON(64X2, Z, m) - HasMemOp = true; // FALL THROUGH. - CASE_VSHUF_COMMON(64X2, Z, r) - VT = MVT::v8i64; - break; - CASE_VSHUF_COMMON(64X2, Z256, m) + CASE_VSHUF(64X2, m) HasMemOp = true; // FALL THROUGH. - CASE_VSHUF_COMMON(64X2, Z256, r) - VT = MVT::v4i64; + CASE_VSHUF(64X2, r) + VT = getRegOperandVectorVT(MI, MVT::i64, 0); break; - CASE_VSHUF_COMMON(32X4, Z, m) + CASE_VSHUF(32X4, m) HasMemOp = true; // FALL THROUGH. - CASE_VSHUF_COMMON(32X4, Z, r) - VT = MVT::v16i32; - break; - CASE_VSHUF_COMMON(32X4, Z256, m) - HasMemOp = true; // FALL THROUGH. - CASE_VSHUF_COMMON(32X4, Z256, r) - VT = MVT::v8i32; + CASE_VSHUF(32X4, r) + VT = getRegOperandVectorVT(MI, MVT::i32, 0); break; } } @@ -297,43 +318,24 @@ DestName = getRegName(MI->getOperand(0).getReg()); DecodeMOVHLPSMask(2, ShuffleMask); break; - - case X86::MOVSLDUPrr: - case X86::VMOVSLDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::MOVSLDUPrm: - case X86::VMOVSLDUPrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v4f32, ShuffleMask); - break; - - case X86::VMOVSHDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + CASE_MOVDUP(MOVSLDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::VMOVSHDUPYrm: + CASE_MOVDUP(MOVSLDUP, m) { + MVT VT = getRegOperandVectorVT(MI, MVT::f32, 0); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v8f32, ShuffleMask); + DecodeMOVSLDUPMask(VT, ShuffleMask); break; - - case X86::VMOVSLDUPYrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); - // FALL THROUGH. - case X86::VMOVSLDUPYrm: - DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSLDUPMask(MVT::v8f32, ShuffleMask); - break; - - case X86::MOVSHDUPrr: - case X86::VMOVSHDUPrr: - Src1Name = getRegName(MI->getOperand(1).getReg()); + } + CASE_MOVDUP(MOVSHDUP, r) + Src1Name = getRegName(MI->getOperand(MI->getNumOperands() - 1).getReg()); // FALL THROUGH. - case X86::MOVSHDUPrm: - case X86::VMOVSHDUPrm: + CASE_MOVDUP(MOVSHDUP, m) { + MVT VT = getRegOperandVectorVT(MI, MVT::f32, 0); DestName = getRegName(MI->getOperand(0).getReg()); - DecodeMOVSHDUPMask(MVT::v4f32, ShuffleMask); + DecodeMOVSHDUPMask(VT, ShuffleMask); break; - + } case X86::VMOVDDUPYrr: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. @@ -771,8 +773,10 @@ Src1Name = getRegName(MI->getOperand(1).getReg()); DestName = getRegName(MI->getOperand(0).getReg()); break; - CASE_VSHUF(64X2) - CASE_VSHUF(32X4) { + CASE_VSHUF(64X2, r) + CASE_VSHUF(64X2, m) + CASE_VSHUF(32X4, r) + CASE_VSHUF(32X4, m) { MVT VT; bool HasMemOp; unsigned NumOp = MI->getNumOperands(); Index: llvm/trunk/lib/Target/X86/X86InstrAVX512.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrAVX512.td +++ llvm/trunk/lib/Target/X86/X86InstrAVX512.td @@ -4254,35 +4254,6 @@ def : Pat<(X86Movddup (v8f64 (scalar_to_vector (loadf64 addr:$src)))), (VMOVDDUPZrm addr:$src)>; -//===---------------------------------------------------------------------===// -// Replicate Single FP - MOVSHDUP and MOVSLDUP -//===---------------------------------------------------------------------===// -multiclass avx512_replicate_sfp op, SDNode OpNode, string OpcodeStr, - ValueType vt, RegisterClass RC, PatFrag mem_frag, - X86MemOperand x86memop> { - def rr : AVX512XSI, EVEX; - let mayLoad = 1 in - def rm : AVX512XSI, EVEX; -} - -defm VMOVSHDUPZ : avx512_replicate_sfp<0x16, X86Movshdup, "vmovshdup", - v16f32, VR512, loadv16f32, f512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; -defm VMOVSLDUPZ : avx512_replicate_sfp<0x12, X86Movsldup, "vmovsldup", - v16f32, VR512, loadv16f32, f512mem>, EVEX_V512, - EVEX_CD8<32, CD8VF>; - -def : Pat<(v16i32 (X86Movshdup VR512:$src)), (VMOVSHDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movshdup (loadv16i32 addr:$src))), - (VMOVSHDUPZrm addr:$src)>; -def : Pat<(v16i32 (X86Movsldup VR512:$src)), (VMOVSLDUPZrr VR512:$src)>; -def : Pat<(v16i32 (X86Movsldup (loadv16i32 addr:$src))), - (VMOVSLDUPZrm addr:$src)>; - //===----------------------------------------------------------------------===// // Move Low to High and High to Low packed FP Instructions //===----------------------------------------------------------------------===// @@ -7056,13 +7027,13 @@ multiclass avx512_unary_rm opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { defm rr : AVX512_maskable, EVEX, AVX5128IBase; let mayLoad = 1 in defm rm : AVX512_maskable, EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>; @@ -7073,7 +7044,7 @@ avx512_unary_rm { let mayLoad = 1 in defm rmb : AVX512_maskable opc_d, bits<8> opc_q, string OpcodeStr, SDNode OpNode, Predicate prd> { - defm Q : avx512_unary_rmb_vl, VEX_W; - defm D : avx512_unary_rmb_vl; + defm D : avx512_unary_rmb_vl; } multiclass avx512_unary_rm_vl_bw opc_b, bits<8> opc_w, string OpcodeStr, SDNode OpNode, Predicate prd> { - defm W : avx512_unary_rm_vl; - defm B : avx512_unary_rm_vl; + defm W : avx512_unary_rm_vl; + defm B : avx512_unary_rm_vl; } multiclass avx512_unary_rm_vl_all opc_b, bits<8> opc_w, @@ -7152,6 +7124,19 @@ defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>; defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>; +//===---------------------------------------------------------------------===// +// Replicate Single FP - MOVSHDUP and MOVSLDUP +//===---------------------------------------------------------------------===// +multiclass avx512_replicate opc, string OpcodeStr, SDNode OpNode>{ + defm NAME: avx512_unary_rm_vl, XS; + let isCodeGenOnly = 1 in + defm NAME#_I: avx512_unary_rm_vl, XS; +} + +defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>; +defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>; //===----------------------------------------------------------------------===// // AVX-512 - Unpack Instructions //===----------------------------------------------------------------------===// Index: llvm/trunk/lib/Target/X86/X86InstrSSE.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrSSE.td +++ llvm/trunk/lib/Target/X86/X86InstrSSE.td @@ -5135,7 +5135,7 @@ IIC_SSE_MOV_LH>, Sched<[WriteLoad]>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { defm VMOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup", v4f32, VR128, loadv4f32, f128mem>, VEX; defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup", @@ -5150,7 +5150,7 @@ defm MOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "movsldup", v4f32, VR128, memopv4f32, f128mem>; -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4i32 (X86Movshdup VR128:$src)), (VMOVSHDUPrr VR128:$src)>; def : Pat<(v4i32 (X86Movshdup (bc_v4i32 (loadv2i64 addr:$src)))), Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -776,10 +776,10 @@ X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0), X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, - X86ISD::FMAX_RND), - X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMAX, - X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), + X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMAX, X86ISD::FMAX_RND), X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, @@ -788,10 +788,22 @@ X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0), X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN, X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, - X86ISD::FMIN_RND), - X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMIN, - X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM, + X86ISD::FMIN, X86ISD::FMIN_RND), + X86_INTRINSIC_DATA(avx512_mask_movshdup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movshdup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSHDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_128, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_256, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), + X86_INTRINSIC_DATA(avx512_mask_movsldup_512, INTR_TYPE_1OP_MASK, + X86ISD::MOVSLDUP, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_128, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_256, INTR_TYPE_2OP_MASK, ISD::FMUL, 0), X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL, Index: llvm/trunk/test/CodeGen/X86/avx-isa-check.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx-isa-check.ll +++ llvm/trunk/test/CodeGen/X86/avx-isa-check.ll @@ -344,6 +344,30 @@ ret <16 x i16> %shuffle } +define <8 x float> @shuffle_v8f32_11335577(<8 x float> %a, <8 x float> %b) { +; vmovshdup 256 test + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_1133(<4 x float> %a, <4 x float> %b) { +; vmovshdup 128 test + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle +} + +define <8 x float> @shuffle_v8f32_00224466(<8 x float> %a, <8 x float> %b) { +; vmovsldup 256 test + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> + ret <8 x float> %shuffle +} + +define <4 x float> @shuffle_v4f32_0022(<4 x float> %a, <4 x float> %b) { +; vmovsldup 128 test + %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> + ret <4 x float> %shuffle +} + define <2 x double> @insert_mem_lo_v2f64(double* %ptr, <2 x double> %b) { %a = load double, double* %ptr %v = insertelement <2 x double> undef, double %a, i32 0 Index: llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512-intrinsics.ll @@ -4676,3 +4676,49 @@ ret <8 x i64> %res2 } +declare <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovsldup %zmm0, %zmm1 {%k1} +; CHECK-NEXT: ## zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vmovsldup %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: ## zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vmovsldup %zmm0, %zmm0 +; CHECK-NEXT: ## zmm0 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) + %res1 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 -1) + %res2 = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res2, %res3 + ret <16 x float> %res4 +} + +declare <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float>, <16 x float>, i16) + +define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 x float> %x1, i16 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_512: +; CHECK: ## BB#0: +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmovshdup %zmm0, %zmm1 {%k1} +; CHECK-NEXT: ## zmm1 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; CHECK-NEXT: vmovshdup %zmm0, %zmm2 {%k1} {z} +; CHECK-NEXT: ## zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; CHECK-NEXT: vmovshdup %zmm0, %zmm0 +; CHECK-NEXT: ## zmm0 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] +; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm0, %zmm2, %zmm0 +; CHECK-NEXT: retq + %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) + %res1 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 -1) + %res2 = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> zeroinitializer, i16 %x2) + %res3 = fadd <16 x float> %res, %res1 + %res4 = fadd <16 x float> %res2, %res3 + ret <16 x float> %res4 +} + Index: llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll +++ llvm/trunk/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -5334,3 +5334,100 @@ } declare <8 x i16> @llvm.x86.avx512.mask.vcvtps2ph.256(<8 x float>, i32, <8 x i16>, i8) nounwind readonly + +declare <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} +; CHECK-NEXT: ## xmm1 = xmm0[0,0,2,2] +; CHECK-NEXT: vmovsldup %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: ## xmm2 = xmm0[0,0,2,2] +; CHECK-NEXT: vmovsldup %xmm0, %xmm0 +; CHECK-NEXT: ## xmm0 = xmm0[0,0,2,2] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) + %res1 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 -1) + %res2 = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} + +declare <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movsldup_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} +; CHECK-NEXT: ## ymm1 = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vmovsldup %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: ## ymm2 = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vmovsldup %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) + %res1 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 -1) + %res2 = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} + +declare <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float>, <4 x float>, i8) + +define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_128: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} +; CHECK-NEXT: ## xmm1 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup %xmm0, %xmm2 {%k1} {z} +; CHECK-NEXT: ## xmm2 = xmm0[1,1,3,3] +; CHECK-NEXT: vmovshdup %xmm0, %xmm0 +; CHECK-NEXT: ## xmm0 = xmm0[1,1,3,3] +; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) + %res1 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 -1) + %res2 = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> zeroinitializer, i8 %x2) + %res3 = fadd <4 x float> %res, %res1 + %res4 = fadd <4 x float> %res2, %res3 + ret <4 x float> %res4 +} + +declare <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float>, <8 x float>, i8) + +define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x float> %x1, i8 %x2) { +; CHECK-LABEL: test_int_x86_avx512_mask_movshdup_256: +; CHECK: ## BB#0: +; CHECK-NEXT: movzbl %dil, %eax +; CHECK-NEXT: kmovw %eax, %k1 +; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} +; CHECK-NEXT: ## ymm1 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vmovshdup %ymm0, %ymm2 {%k1} {z} +; CHECK-NEXT: ## ymm2 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vmovshdup %ymm0, %ymm0 +; CHECK-NEXT: ## ymm0 = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 +; CHECK-NEXT: vaddps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) + %res1 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 -1) + %res2 = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> zeroinitializer, i8 %x2) + %res3 = fadd <8 x float> %res, %res1 + %res4 = fadd <8 x float> %res2, %res3 + ret <8 x float> %res4 +} + Index: llvm/trunk/test/MC/X86/avx512-encodings.s =================================================================== --- llvm/trunk/test/MC/X86/avx512-encodings.s +++ llvm/trunk/test/MC/X86/avx512-encodings.s @@ -18297,6 +18297,78 @@ // CHECK: encoding: [0xc5,0xf9,0x7e,0xaa,0xfc,0xfd,0xff,0xff] vmovd %xmm5, -516(%rdx) +// CHECK: vmovshdup %zmm27, %zmm16 +// CHECK: encoding: [0x62,0x81,0x7e,0x48,0x16,0xc3] + vmovshdup %zmm27, %zmm16 + +// CHECK: vmovshdup %zmm27, %zmm16 {%k4} +// CHECK: encoding: [0x62,0x81,0x7e,0x4c,0x16,0xc3] + vmovshdup %zmm27, %zmm16 {%k4} + +// CHECK: vmovshdup %zmm27, %zmm16 {%k4} {z} +// CHECK: encoding: [0x62,0x81,0x7e,0xcc,0x16,0xc3] + vmovshdup %zmm27, %zmm16 {%k4} {z} + +// CHECK: vmovshdup (%rcx), %zmm16 +// CHECK: encoding: [0x62,0xe1,0x7e,0x48,0x16,0x01] + vmovshdup (%rcx), %zmm16 + +// CHECK: vmovshdup 291(%rax,%r14,8), %zmm16 +// CHECK: encoding: [0x62,0xa1,0x7e,0x48,0x16,0x84,0xf0,0x23,0x01,0x00,0x00] + vmovshdup 291(%rax,%r14,8), %zmm16 + +// CHECK: vmovshdup 8128(%rdx), %zmm16 +// CHECK: encoding: [0x62,0xe1,0x7e,0x48,0x16,0x42,0x7f] + vmovshdup 8128(%rdx), %zmm16 + +// CHECK: vmovshdup 8192(%rdx), %zmm16 +// CHECK: encoding: [0x62,0xe1,0x7e,0x48,0x16,0x82,0x00,0x20,0x00,0x00] + vmovshdup 8192(%rdx), %zmm16 + +// CHECK: vmovshdup -8192(%rdx), %zmm16 +// CHECK: encoding: [0x62,0xe1,0x7e,0x48,0x16,0x42,0x80] + vmovshdup -8192(%rdx), %zmm16 + +// CHECK: vmovshdup -8256(%rdx), %zmm16 +// CHECK: encoding: [0x62,0xe1,0x7e,0x48,0x16,0x82,0xc0,0xdf,0xff,0xff] + vmovshdup -8256(%rdx), %zmm16 + +// CHECK: vmovsldup %zmm14, %zmm13 +// CHECK: encoding: [0x62,0x51,0x7e,0x48,0x12,0xee] + vmovsldup %zmm14, %zmm13 + +// CHECK: vmovsldup %zmm14, %zmm13 {%k6} +// CHECK: encoding: [0x62,0x51,0x7e,0x4e,0x12,0xee] + vmovsldup %zmm14, %zmm13 {%k6} + +// CHECK: vmovsldup %zmm14, %zmm13 {%k6} {z} +// CHECK: encoding: [0x62,0x51,0x7e,0xce,0x12,0xee] + vmovsldup %zmm14, %zmm13 {%k6} {z} + +// CHECK: vmovsldup (%rcx), %zmm13 +// CHECK: encoding: [0x62,0x71,0x7e,0x48,0x12,0x29] + vmovsldup (%rcx), %zmm13 + +// CHECK: vmovsldup 291(%rax,%r14,8), %zmm13 +// CHECK: encoding: [0x62,0x31,0x7e,0x48,0x12,0xac,0xf0,0x23,0x01,0x00,0x00] + vmovsldup 291(%rax,%r14,8), %zmm13 + +// CHECK: vmovsldup 8128(%rdx), %zmm13 +// CHECK: encoding: [0x62,0x71,0x7e,0x48,0x12,0x6a,0x7f] + vmovsldup 8128(%rdx), %zmm13 + +// CHECK: vmovsldup 8192(%rdx), %zmm13 +// CHECK: encoding: [0x62,0x71,0x7e,0x48,0x12,0xaa,0x00,0x20,0x00,0x00] + vmovsldup 8192(%rdx), %zmm13 + +// CHECK: vmovsldup -8192(%rdx), %zmm13 +// CHECK: encoding: [0x62,0x71,0x7e,0x48,0x12,0x6a,0x80] + vmovsldup -8192(%rdx), %zmm13 + +// CHECK: vmovsldup -8256(%rdx), %zmm13 +// CHECK: encoding: [0x62,0x71,0x7e,0x48,0x12,0xaa,0xc0,0xdf,0xff,0xff] + vmovsldup -8256(%rdx), %zmm13 + // CHECK: vmovlps (%rcx), %xmm20, %xmm7 // CHECK: encoding: [0x62,0xf1,0x5c,0x00,0x12,0x39] vmovlps (%rcx), %xmm20, %xmm7 Index: llvm/trunk/test/MC/X86/x86-64-avx512f_vl.s =================================================================== --- llvm/trunk/test/MC/X86/x86-64-avx512f_vl.s +++ llvm/trunk/test/MC/X86/x86-64-avx512f_vl.s @@ -21978,3 +21978,148 @@ // CHECK: vcvtps2ph $123, %ymm30, -2064(%rdx) // CHECK: encoding: [0x62,0x63,0x7d,0x28,0x1d,0xb2,0xf0,0xf7,0xff,0xff,0x7b] vcvtps2ph $0x7b, %ymm30, -2064(%rdx) + +// CHECK: vmovshdup %xmm18, %xmm23 +// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x16,0xfa] + vmovshdup %xmm18, %xmm23 + +// CHECK: vmovshdup %xmm18, %xmm23 {%k2} +// CHECK: encoding: [0x62,0xa1,0x7e,0x0a,0x16,0xfa] + vmovshdup %xmm18, %xmm23 {%k2} + +// CHECK: vmovshdup %xmm18, %xmm23 {%k2} {z} +// CHECK: encoding: [0x62,0xa1,0x7e,0x8a,0x16,0xfa] + vmovshdup %xmm18, %xmm23 {%k2} {z} + +// CHECK: vmovshdup (%rcx), %xmm23 +// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x16,0x39] + vmovshdup (%rcx), %xmm23 + +// CHECK: vmovshdup 291(%rax,%r14,8), %xmm23 +// CHECK: encoding: [0x62,0xa1,0x7e,0x08,0x16,0xbc,0xf0,0x23,0x01,0x00,0x00] + vmovshdup 291(%rax,%r14,8), %xmm23 + +// CHECK: vmovshdup 2032(%rdx), %xmm23 +// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x16,0x7a,0x7f] + vmovshdup 2032(%rdx), %xmm23 + +// CHECK: vmovshdup 2048(%rdx), %xmm23 +// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x16,0xba,0x00,0x08,0x00,0x00] + vmovshdup 2048(%rdx), %xmm23 + +// CHECK: vmovshdup -2048(%rdx), %xmm23 +// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x16,0x7a,0x80] + vmovshdup -2048(%rdx), %xmm23 + +// CHECK: vmovshdup -2064(%rdx), %xmm23 +// CHECK: encoding: [0x62,0xe1,0x7e,0x08,0x16,0xba,0xf0,0xf7,0xff,0xff] + vmovshdup -2064(%rdx), %xmm23 + +// CHECK: vmovshdup %ymm24, %ymm18 +// CHECK: encoding: [0x62,0x81,0x7e,0x28,0x16,0xd0] + vmovshdup %ymm24, %ymm18 + +// CHECK: vmovshdup %ymm24, %ymm18 {%k3} +// CHECK: encoding: [0x62,0x81,0x7e,0x2b,0x16,0xd0] + vmovshdup %ymm24, %ymm18 {%k3} + +// CHECK: vmovshdup %ymm24, %ymm18 {%k3} {z} +// CHECK: encoding: [0x62,0x81,0x7e,0xab,0x16,0xd0] + vmovshdup %ymm24, %ymm18 {%k3} {z} + +// CHECK: vmovshdup (%rcx), %ymm18 +// CHECK: encoding: [0x62,0xe1,0x7e,0x28,0x16,0x11] + vmovshdup (%rcx), %ymm18 + +// CHECK: vmovshdup 291(%rax,%r14,8), %ymm18 +// CHECK: encoding: [0x62,0xa1,0x7e,0x28,0x16,0x94,0xf0,0x23,0x01,0x00,0x00] + vmovshdup 291(%rax,%r14,8), %ymm18 + +// CHECK: vmovshdup 4064(%rdx), %ymm18 +// CHECK: encoding: [0x62,0xe1,0x7e,0x28,0x16,0x52,0x7f] + vmovshdup 4064(%rdx), %ymm18 + +// CHECK: vmovshdup 4096(%rdx), %ymm18 +// CHECK: encoding: [0x62,0xe1,0x7e,0x28,0x16,0x92,0x00,0x10,0x00,0x00] + vmovshdup 4096(%rdx), %ymm18 + +// CHECK: vmovshdup -4096(%rdx), %ymm18 +// CHECK: encoding: [0x62,0xe1,0x7e,0x28,0x16,0x52,0x80] + vmovshdup -4096(%rdx), %ymm18 + +// CHECK: vmovshdup -4128(%rdx), %ymm18 +// CHECK: encoding: [0x62,0xe1,0x7e,0x28,0x16,0x92,0xe0,0xef,0xff,0xff] + vmovshdup -4128(%rdx), %ymm18 + +// CHECK: vmovsldup %xmm21, %xmm25 +// CHECK: encoding: [0x62,0x21,0x7e,0x08,0x12,0xcd] + vmovsldup %xmm21, %xmm25 + +// CHECK: vmovsldup %xmm21, %xmm25 {%k5} +// CHECK: encoding: [0x62,0x21,0x7e,0x0d,0x12,0xcd] + vmovsldup %xmm21, %xmm25 {%k5} + +// CHECK: vmovsldup %xmm21, %xmm25 {%k5} {z} +// CHECK: encoding: [0x62,0x21,0x7e,0x8d,0x12,0xcd] + vmovsldup %xmm21, %xmm25 {%k5} {z} + +// CHECK: vmovsldup (%rcx), %xmm25 +// CHECK: encoding: [0x62,0x61,0x7e,0x08,0x12,0x09] + vmovsldup (%rcx), %xmm25 + +// CHECK: vmovsldup 291(%rax,%r14,8), %xmm25 +// CHECK: encoding: [0x62,0x21,0x7e,0x08,0x12,0x8c,0xf0,0x23,0x01,0x00,0x00] + vmovsldup 291(%rax,%r14,8), %xmm25 + +// CHECK: vmovsldup 2032(%rdx), %xmm25 +// CHECK: encoding: [0x62,0x61,0x7e,0x08,0x12,0x4a,0x7f] + vmovsldup 2032(%rdx), %xmm25 + +// CHECK: vmovsldup 2048(%rdx), %xmm25 +// CHECK: encoding: [0x62,0x61,0x7e,0x08,0x12,0x8a,0x00,0x08,0x00,0x00] + vmovsldup 2048(%rdx), %xmm25 + +// CHECK: vmovsldup -2048(%rdx), %xmm25 +// CHECK: encoding: [0x62,0x61,0x7e,0x08,0x12,0x4a,0x80] + vmovsldup -2048(%rdx), %xmm25 + +// CHECK: vmovsldup -2064(%rdx), %xmm25 +// CHECK: encoding: [0x62,0x61,0x7e,0x08,0x12,0x8a,0xf0,0xf7,0xff,0xff] + vmovsldup -2064(%rdx), %xmm25 + +// CHECK: vmovsldup %ymm29, %ymm24 +// CHECK: encoding: [0x62,0x01,0x7e,0x28,0x12,0xc5] + vmovsldup %ymm29, %ymm24 + +// CHECK: vmovsldup %ymm29, %ymm24 {%k5} +// CHECK: encoding: [0x62,0x01,0x7e,0x2d,0x12,0xc5] + vmovsldup %ymm29, %ymm24 {%k5} + +// CHECK: vmovsldup %ymm29, %ymm24 {%k5} {z} +// CHECK: encoding: [0x62,0x01,0x7e,0xad,0x12,0xc5] + vmovsldup %ymm29, %ymm24 {%k5} {z} + +// CHECK: vmovsldup (%rcx), %ymm24 +// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x12,0x01] + vmovsldup (%rcx), %ymm24 + +// CHECK: vmovsldup 291(%rax,%r14,8), %ymm24 +// CHECK: encoding: [0x62,0x21,0x7e,0x28,0x12,0x84,0xf0,0x23,0x01,0x00,0x00] + vmovsldup 291(%rax,%r14,8), %ymm24 + +// CHECK: vmovsldup 4064(%rdx), %ymm24 +// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x12,0x42,0x7f] + vmovsldup 4064(%rdx), %ymm24 + +// CHECK: vmovsldup 4096(%rdx), %ymm24 +// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x12,0x82,0x00,0x10,0x00,0x00] + vmovsldup 4096(%rdx), %ymm24 + +// CHECK: vmovsldup -4096(%rdx), %ymm24 +// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x12,0x42,0x80] + vmovsldup -4096(%rdx), %ymm24 + +// CHECK: vmovsldup -4128(%rdx), %ymm24 +// CHECK: encoding: [0x62,0x61,0x7e,0x28,0x12,0x82,0xe0,0xef,0xff,0xff] + vmovsldup -4128(%rdx), %ymm24 +