Index: llvm/trunk/include/llvm/IR/IntrinsicsX86.td =================================================================== --- llvm/trunk/include/llvm/IR/IntrinsicsX86.td +++ llvm/trunk/include/llvm/IR/IntrinsicsX86.td @@ -3873,23 +3873,23 @@ def int_x86_xop_vpermil2pd : GCCBuiltin<"__builtin_ia32_vpermil2pd">, Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, - llvm_v2f64_ty, llvm_i8_ty], + llvm_v2i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_xop_vpermil2pd_256 : GCCBuiltin<"__builtin_ia32_vpermil2pd256">, Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty, - llvm_v4f64_ty, llvm_i8_ty], + llvm_v4i64_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_xop_vpermil2ps : GCCBuiltin<"__builtin_ia32_vpermil2ps">, Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, - llvm_v4f32_ty, llvm_i8_ty], + llvm_v4i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_xop_vpermil2ps_256 : GCCBuiltin<"__builtin_ia32_vpermil2ps256">, Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty, - llvm_v8f32_ty, llvm_i8_ty], + llvm_v8i32_ty, llvm_i8_ty], [IntrNoMem]>; def int_x86_xop_vfrcz_pd : GCCBuiltin<"__builtin_ia32_vfrczpd">, Index: llvm/trunk/lib/IR/AutoUpgrade.cpp =================================================================== --- llvm/trunk/lib/IR/AutoUpgrade.cpp +++ llvm/trunk/lib/IR/AutoUpgrade.cpp @@ -282,6 +282,27 @@ NewFn = F; return true; } + // Upgrade any XOP PERMIL2 index operand still using a float/double vector. + if (Name.startswith("x86.xop.vpermil2")) { + auto Params = F->getFunctionType()->params(); + auto Idx = Params[2]; + if (Idx->getScalarType()->isFloatingPointTy()) { + F->setName(Name + ".old"); + unsigned IdxSize = Idx->getPrimitiveSizeInBits(); + unsigned EltSize = Idx->getScalarSizeInBits(); + Intrinsic::ID Permil2ID; + if (EltSize == 64 && IdxSize == 128) + Permil2ID = Intrinsic::x86_xop_vpermil2pd; + else if (EltSize == 32 && IdxSize == 128) + Permil2ID = Intrinsic::x86_xop_vpermil2ps; + else if (EltSize == 64 && IdxSize == 256) + Permil2ID = Intrinsic::x86_xop_vpermil2pd_256; + else + Permil2ID = Intrinsic::x86_xop_vpermil2ps_256; + NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID); + return true; + } + } break; } } @@ -911,6 +932,20 @@ CI->eraseFromParent(); return; + case Intrinsic::x86_xop_vpermil2pd: + case Intrinsic::x86_xop_vpermil2ps: + case Intrinsic::x86_xop_vpermil2pd_256: + case Intrinsic::x86_xop_vpermil2ps_256: { + SmallVector Args(CI->arg_operands().begin(), + CI->arg_operands().end()); + VectorType *FltIdxTy = cast(Args[2]->getType()); + VectorType *IntIdxTy = VectorType::getInteger(FltIdxTy); + Args[2] = Builder.CreateBitCast(Args[2], IntIdxTy); + CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args, Name)); + CI->eraseFromParent(); + return; + } + case Intrinsic::x86_sse41_ptestc: case Intrinsic::x86_sse41_ptestz: case Intrinsic::x86_sse41_ptestnzc: { Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h @@ -451,6 +451,8 @@ VPCOM, VPCOMU, // XOP packed permute bytes. VPPERM, + // XOP two source permutation. + VPERMIL2, // Vector multiply packed unsigned doubleword integers. PMULUDQ, Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -21947,6 +21947,7 @@ case X86ISD::VPSHL: return "X86ISD::VPSHL"; case X86ISD::VPCOM: return "X86ISD::VPCOM"; case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; + case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; Index: llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td +++ llvm/trunk/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -245,7 +245,12 @@ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>>; - +def X86vpermil2 : SDNode<"X86ISD::VPERMIL2", + SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameSizeAs<0,3>, + SDTCisSameNumEltsAs<0, 3>, + SDTCisVT<4, i8>]>>; def X86vpperm : SDNode<"X86ISD::VPPERM", SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; Index: llvm/trunk/lib/Target/X86/X86InstrXOP.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrXOP.td +++ llvm/trunk/lib/Target/X86/X86InstrXOP.td @@ -342,27 +342,34 @@ (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>; } -multiclass xop5op opc, string OpcodeStr, Intrinsic Int128, - Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { +multiclass xop5op opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, ValueType vt256, + ValueType id128, ValueType id256, + PatFrag ld_128, PatFrag ld_256> { def rr : IXOP5; + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + (id128 VR128:$src3), (i8 imm:$src4))))]>; def rm : IXOP5, + (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), + (id128 (bitconvert (loadv2i64 addr:$src3))), + (i8 imm:$src4))))]>, VEX_W, MemOp4; def mr : IXOP5; + (vt128 (OpNode (vt128 VR128:$src1), + (vt128 (bitconvert (ld_128 addr:$src2))), + (id128 VR128:$src3), (i8 imm:$src4))))]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : IXOP5, VEX_L; + (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2), + (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L; def rmY : IXOP5, - VEX_W, MemOp4, VEX_L; + (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2), + (id256 (bitconvert (loadv4i64 addr:$src3))), + (i8 imm:$src4))))]>, VEX_W, MemOp4, VEX_L; def mrY : IXOP5, - VEX_L; + (vt256 (OpNode (vt256 VR256:$src1), + (vt256 (bitconvert (ld_256 addr:$src2))), + (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rrY_REV : IXOP5; + defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64, + v2i64, v4i64, loadv2f64, loadv4f64>; let ExeDomain = SSEPackedSingle in - defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, - int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>; + defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32, + v4i32, v8i32, loadv4f32, loadv8f32>; Index: llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h =================================================================== --- llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h +++ llvm/trunk/lib/Target/X86/X86IntrinsicsInfo.h @@ -2234,6 +2234,10 @@ X86_INTRINSIC_DATA(xop_vpcomuq, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), X86_INTRINSIC_DATA(xop_vpcomuw, INTR_TYPE_3OP, X86ISD::VPCOMU, 0), X86_INTRINSIC_DATA(xop_vpcomw, INTR_TYPE_3OP, X86ISD::VPCOM, 0), + X86_INTRINSIC_DATA(xop_vpermil2pd, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), + X86_INTRINSIC_DATA(xop_vpermil2pd_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), + X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), + X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0), X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0), X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), Index: llvm/trunk/test/CodeGen/X86/stack-folding-xop.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/stack-folding-xop.ll +++ llvm/trunk/test/CodeGen/X86/stack-folding-xop.ll @@ -166,69 +166,69 @@ } declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone -define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { +define <2 x double> @stack_fold_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) { ;CHECK-LABEL: stack_fold_vpermil2pd_rm ;CHECK: vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 0) + %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 0) ret <2 x double> %2 } -define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { +define <2 x double> @stack_fold_vpermil2pd_mr(<2 x double> %a0, <2 x i64> %a1, <2 x double> %a2) { ;CHECK-LABEL: stack_fold_vpermil2pd_mr ;CHECK: vpermil2pd $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x double> %a1, i8 0) + %2 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a2, <2 x i64> %a1, i8 0) ret <2 x double> %2 } -declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone -define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { +define <4 x double> @stack_fold_vpermil2pd_rm_ymm(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) { ;CHECK-LABEL: stack_fold_vpermil2pd_rm ;CHECK: vpermil2pd $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 0) + %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 0) ret <4 x double> %2 } -define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { +define <4 x double> @stack_fold_vpermil2pd_mr_ymm(<4 x double> %a0, <4 x i64> %a1, <4 x double> %a2) { ;CHECK-LABEL: stack_fold_vpermil2pd_mr ;CHECK: vpermil2pd $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x double> %a1, i8 0) + %2 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a2, <4 x i64> %a1, i8 0) ret <4 x double> %2 } -declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone +declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone -define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { +define <4 x float> @stack_fold_vpermil2ps_rm(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) { ;CHECK-LABEL: stack_fold_vpermil2ps_rm ;CHECK: vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 0) + %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 0) ret <4 x float> %2 } -define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { +define <4 x float> @stack_fold_vpermil2ps_mr(<4 x float> %a0, <4 x i32> %a1, <4 x float> %a2) { ;CHECK-LABEL: stack_fold_vpermil2ps_mr ;CHECK: vpermil2ps $0, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x float> %a1, i8 0) + %2 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a2, <4 x i32> %a1, i8 0) ret <4 x float> %2 } -declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone -define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { +define <8 x float> @stack_fold_vpermil2ps_rm_ymm(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) { ;CHECK-LABEL: stack_fold_vpermil2ps_rm ;CHECK: vpermil2ps $0, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 0) + %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 0) ret <8 x float> %2 } -define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { +define <8 x float> @stack_fold_vpermil2ps_mr_ymm(<8 x float> %a0, <8 x i32> %a1, <8 x float> %a2) { ;CHECK-LABEL: stack_fold_vpermil2ps_mr ;CHECK: vpermil2ps $0, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x float> %a1, i8 0) + %2 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a2, <8 x i32> %a1, i8 0) ret <8 x float> %2 } -declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone +declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone define <4 x i32> @stack_fold_vphaddbd(<16 x i8> %a0) { ;CHECK-LABEL: stack_fold_vphaddbd Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-xop.ll @@ -2,63 +2,60 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+xop | FileCheck %s -declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone -declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone +declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone -declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone -declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone +declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone declare <16 x i8> @llvm.x86.xop.vpperm(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone define <2 x double> @combine_vpermil2pd_identity(<2 x double> %a0, <2 x double> %a1) { ; CHECK-LABEL: combine_vpermil2pd_identity: ; CHECK: # BB#0: -; CHECK-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movl $2, %eax +; CHECK-NEXT: vmovq %rax, %xmm2 ; CHECK-NEXT: vpermil2pd $0, %xmm2, %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vpermil2pd $0, %xmm2, %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq - %mask = bitcast <2 x i64> to <2 x double> - %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x double> %mask, i8 0) - %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x double> %mask, i8 0) + %res0 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a1, <2 x double> %a0, <2 x i64> , i8 0) + %res1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %res0, <2 x double> undef, <2 x i64> , i8 0) ret <2 x double> %res1 } define <4 x double> @combine_vpermil2pd256_identity(<4 x double> %a0, <4 x double> %a1) { ; CHECK-LABEL: combine_vpermil2pd256_identity: ; CHECK: # BB#0: -; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [9.881313e-324,0.000000e+00,9.881313e-324,0.000000e+00] +; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [2,0,2,0] ; CHECK-NEXT: vpermil2pd $0, %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: vpermil2pd $0, %ymm2, %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: retq - %mask = bitcast <4 x i64> to <4 x double> - %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x double> %mask, i8 0) - %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x double> %mask, i8 0) + %res0 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a1, <4 x double> %a0, <4 x i64> , i8 0) + %res1 = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %res0, <4 x double> undef, <4 x i64> , i8 0) ret <4 x double> %res1 } define <4 x float> @combine_vpermil2ps_identity(<4 x float> %a0, <4 x float> %a1) { ; CHECK-LABEL: combine_vpermil2ps_identity: ; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [4.203895e-45,2.802597e-45,1.401298e-45,0.000000e+00] +; CHECK-NEXT: vmovaps {{.*#+}} xmm2 = [3,2,1,0] ; CHECK-NEXT: vpermil2ps $0, %xmm2, %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vpermil2ps $0, %xmm2, %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: retq - %mask = bitcast <4 x i32> to <4 x float> - %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x float> %mask, i8 0) - %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x float> %mask, i8 0) + %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a1, <4 x float> %a0, <4 x i32> , i8 0) + %res1 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %res0, <4 x float> undef, <4 x i32> , i8 0) ret <4 x float> %res1 } define <8 x float> @combine_vpermil2ps256_identity(<8 x float> %a0, <8 x float> %a1) { ; CHECK-LABEL: combine_vpermil2ps256_identity: ; CHECK: # BB#0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [4.203895e-45,2.802597e-45,1.401298e-45,0.000000e+00,1.401298e-45,0.000000e+00,4.203895e-45,2.802597e-45] +; CHECK-NEXT: vmovaps {{.*#+}} ymm2 = [3,2,1,0,1,0,3,2] ; CHECK-NEXT: vpermil2ps $0, %ymm2, %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: vpermil2ps $0, %ymm2, %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: retq - %mask = bitcast <8 x i32> to <8 x float> - %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x float> %mask, i8 0) - %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x float> %mask, i8 0) + %res0 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a1, <8 x float> %a0, <8 x i32> , i8 0) + %res1 = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %res0, <8 x float> undef, <8 x i32> , i8 0) ret <8 x float> %res1 } @@ -67,8 +64,7 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vpermil2ps $2, {{.*}}(%rip), %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq - %mask = bitcast <4 x i32> to <4 x float> - %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %mask, i8 2) + %res0 = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> , i8 2) ret <4 x float> %res0 } Index: llvm/trunk/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll +++ llvm/trunk/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll @@ -1,6 +1,82 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s +define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { +; CHECK-LABEL: test_int_x86_xop_vpermil2pd: +; CHECK: # BB#0: +; CHECK-NEXT: vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ; [#uses=1] + ret <2 x double> %res +} +define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) { +; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr: +; CHECK: # BB#0: +; CHECK-NEXT: vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: retq + %vec = load <2 x double>, <2 x double>* %a1 + %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x double> %a2, i8 1) ; [#uses=1] + ret <2 x double> %res +} +define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) { +; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm: +; CHECK: # BB#0: +; CHECK-NEXT: vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %vec = load <2 x double>, <2 x double>* %a2 + %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %vec, i8 1) ; [#uses=1] + ret <2 x double> %res +} +declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone + +define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { +; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ; + ret <4 x double> %res +} +define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) { +; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr: +; CHECK: # BB#0: +; CHECK-NEXT: vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0 +; CHECK-NEXT: retq + %vec = load <4 x double>, <4 x double>* %a1 + %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x double> %a2, i8 2) ; + ret <4 x double> %res +} +define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) { +; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm: +; CHECK: # BB#0: +; CHECK-NEXT: vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %vec = load <4 x double>, <4 x double>* %a2 + %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %vec, i8 2) ; + ret <4 x double> %res +} +declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone + +define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { +; CHECK-LABEL: test_int_x86_xop_vpermil2ps: +; CHECK: # BB#0: +; CHECK-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0 +; CHECK-NEXT: retq + %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ; + ret <4 x float> %res +} +declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone + +define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { +; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256: +; CHECK: # BB#0: +; CHECK-NEXT: vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: retq + %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ; + ret <8 x float> %res +} +declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone + define <16 x i8> @test_int_x86_xop_vpcomeqb(<16 x i8> %a0, <16 x i8> %a1) { ; CHECK-LABEL: test_int_x86_xop_vpcomeqb: ; CHECK: # BB#0: Index: llvm/trunk/test/CodeGen/X86/xop-intrinsics-x86_64.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/xop-intrinsics-x86_64.ll +++ llvm/trunk/test/CodeGen/X86/xop-intrinsics-x86_64.ll @@ -1,81 +1,81 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+xop | FileCheck %s -define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) { +define <2 x double> @test_int_x86_xop_vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2) { ; CHECK-LABEL: test_int_x86_xop_vpermil2pd: ; CHECK: # BB#0: ; CHECK-NEXT: vpermil2pd $1, %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq - %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 1) ; [#uses=1] + %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %a2, i8 1) ; [#uses=1] ret <2 x double> %res } -define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x double> %a2) { +define <2 x double> @test_int_x86_xop_vpermil2pd_mr(<2 x double> %a0, <2 x double>* %a1, <2 x i64> %a2) { ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_mr: ; CHECK: # BB#0: ; CHECK-NEXT: vpermil2pd $1, %xmm1, (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: retq %vec = load <2 x double>, <2 x double>* %a1 - %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x double> %a2, i8 1) ; [#uses=1] + %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %vec, <2 x i64> %a2, i8 1) ; [#uses=1] ret <2 x double> %res } -define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x double>* %a2) { +define <2 x double> @test_int_x86_xop_vpermil2pd_rm(<2 x double> %a0, <2 x double> %a1, <2 x i64>* %a2) { ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_rm: ; CHECK: # BB#0: ; CHECK-NEXT: vpermil2pd $1, (%rdi), %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq - %vec = load <2 x double>, <2 x double>* %a2 - %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %vec, i8 1) ; [#uses=1] + %vec = load <2 x i64>, <2 x i64>* %a2 + %res = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> %vec, i8 1) ; [#uses=1] ret <2 x double> %res } -declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone +declare <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double>, <2 x double>, <2 x i64>, i8) nounwind readnone -define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) { +define <4 x double> @test_int_x86_xop_vpermil2pd_256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2) { ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256: ; CHECK: # BB#0: ; CHECK-NEXT: vpermil2pd $2, %ymm2, %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq - %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 2) ; + %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %a2, i8 2) ; ret <4 x double> %res } -define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x double> %a2) { +define <4 x double> @test_int_x86_xop_vpermil2pd_256_mr(<4 x double> %a0, <4 x double>* %a1, <4 x i64> %a2) { ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_mr: ; CHECK: # BB#0: ; CHECK-NEXT: vpermil2pd $2, %ymm1, (%rdi), %ymm0, %ymm0 ; CHECK-NEXT: retq %vec = load <4 x double>, <4 x double>* %a1 - %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x double> %a2, i8 2) ; + %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %vec, <4 x i64> %a2, i8 2) ; ret <4 x double> %res } -define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x double>* %a2) { +define <4 x double> @test_int_x86_xop_vpermil2pd_256_rm(<4 x double> %a0, <4 x double> %a1, <4 x i64>* %a2) { ; CHECK-LABEL: test_int_x86_xop_vpermil2pd_256_rm: ; CHECK: # BB#0: ; CHECK-NEXT: vpermil2pd $2, (%rdi), %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq - %vec = load <4 x double>, <4 x double>* %a2 - %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %vec, i8 2) ; + %vec = load <4 x i64>, <4 x i64>* %a2 + %res = call <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double> %a0, <4 x double> %a1, <4 x i64> %vec, i8 2) ; ret <4 x double> %res } -declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone +declare <4 x double> @llvm.x86.xop.vpermil2pd.256(<4 x double>, <4 x double>, <4 x i64>, i8) nounwind readnone -define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) { +define <4 x float> @test_int_x86_xop_vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) { ; CHECK-LABEL: test_int_x86_xop_vpermil2ps: ; CHECK: # BB#0: ; CHECK-NEXT: vpermil2ps $3, %xmm2, %xmm1, %xmm0, %xmm0 ; CHECK-NEXT: retq - %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 3) ; + %res = call <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2, i8 3) ; ret <4 x float> %res } -declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone +declare <4 x float> @llvm.x86.xop.vpermil2ps(<4 x float>, <4 x float>, <4 x i32>, i8) nounwind readnone -define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) { +define <8 x float> @test_int_x86_xop_vpermil2ps_256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2) { ; CHECK-LABEL: test_int_x86_xop_vpermil2ps_256: ; CHECK: # BB#0: ; CHECK-NEXT: vpermil2ps $4, %ymm2, %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: retq - %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 4) ; + %res = call <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float> %a0, <8 x float> %a1, <8 x i32> %a2, i8 4) ; ret <8 x float> %res } -declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone +declare <8 x float> @llvm.x86.xop.vpermil2ps.256(<8 x float>, <8 x float>, <8 x i32>, i8) nounwind readnone define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) { ; CHECK-LABEL: test_int_x86_xop_vpcmov: