Index: llvm/trunk/lib/Target/X86/X86InstrInfo.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrInfo.cpp +++ llvm/trunk/lib/Target/X86/X86InstrInfo.cpp @@ -2906,6 +2906,32 @@ return nullptr; } } + case X86::VPCOMBri: case X86::VPCOMUBri: + case X86::VPCOMDri: case X86::VPCOMUDri: + case X86::VPCOMQri: case X86::VPCOMUQri: + case X86::VPCOMWri: case X86::VPCOMUWri: { + // Flip comparison mode immediate (if necessary): swapping the two compared sources mirrors the ordered predicates (LT<->GT, LE<->GE), while EQ, NE, FALSE and TRUE keep their encoding. + unsigned Imm = MI->getOperand(3).getImm() & 0x7; + switch (Imm) { + case 0x00: Imm = 0x02; break; // LT -> GT + case 0x01: Imm = 0x03; break; // LE -> GE + case 0x02: Imm = 0x00; break; // GT -> LT + case 0x03: Imm = 0x01; break; // GE -> LE + case 0x04: // EQ + case 0x05: // NE + case 0x06: // FALSE + case 0x07: // TRUE + default: + break; + } + if (NewMI) { /* a new instruction was requested: clone first so the immediate is rewritten on the copy, then clear NewMI before delegating to the generic commute */ + MachineFunction &MF = *MI->getParent()->getParent(); + MI = MF.CloneMachineInstr(MI); + NewMI = false; + } + MI->getOperand(3).setImm(Imm); + return TargetInstrInfo::commuteInstruction(MI, NewMI); + } case X86::CMOVB16rr: case X86::CMOVB32rr: case X86::CMOVB64rr: case X86::CMOVAE16rr: case X86::CMOVAE32rr: case X86::CMOVAE64rr: case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: Index: llvm/trunk/lib/Target/X86/X86InstrXOP.td =================================================================== --- llvm/trunk/lib/Target/X86/X86InstrXOP.td +++ llvm/trunk/lib/Target/X86/X86InstrXOP.td @@ -20,21 +20,23 @@ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; } -defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>; -defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>; -defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>; -defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>; -defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>; -defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>; -defm 
VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>; -defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>; -defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>; -defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>; -defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>; -defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>; -defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>; -defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>; -defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>; +let ExeDomain = SSEPackedInt in { /* VPHSUBx and VPHADDx defs, now tagged with the packed-integer domain */ + defm VPHSUBWD : xop2op<0xE2, "vphsubwd", int_x86_xop_vphsubwd, loadv2i64>; + defm VPHSUBDQ : xop2op<0xE3, "vphsubdq", int_x86_xop_vphsubdq, loadv2i64>; + defm VPHSUBBW : xop2op<0xE1, "vphsubbw", int_x86_xop_vphsubbw, loadv2i64>; + defm VPHADDWQ : xop2op<0xC7, "vphaddwq", int_x86_xop_vphaddwq, loadv2i64>; + defm VPHADDWD : xop2op<0xC6, "vphaddwd", int_x86_xop_vphaddwd, loadv2i64>; + defm VPHADDUWQ : xop2op<0xD7, "vphadduwq", int_x86_xop_vphadduwq, loadv2i64>; + defm VPHADDUWD : xop2op<0xD6, "vphadduwd", int_x86_xop_vphadduwd, loadv2i64>; + defm VPHADDUDQ : xop2op<0xDB, "vphaddudq", int_x86_xop_vphaddudq, loadv2i64>; + defm VPHADDUBW : xop2op<0xD1, "vphaddubw", int_x86_xop_vphaddubw, loadv2i64>; + defm VPHADDUBQ : xop2op<0xD3, "vphaddubq", int_x86_xop_vphaddubq, loadv2i64>; + defm VPHADDUBD : xop2op<0xD2, "vphaddubd", int_x86_xop_vphaddubd, loadv2i64>; + defm VPHADDDQ : xop2op<0xCB, "vphadddq", int_x86_xop_vphadddq, loadv2i64>; + defm VPHADDBW : xop2op<0xC1, "vphaddbw", int_x86_xop_vphaddbw, loadv2i64>; + defm VPHADDBQ : xop2op<0xC3, "vphaddbq", int_x86_xop_vphaddbq, loadv2i64>; + defm VPHADDBD : xop2op<0xC2, "vphaddbd", int_x86_xop_vphaddbd, loadv2i64>; +} // Scalar load 2 addr operand instructions multiclass xop2opsld opc, string OpcodeStr, 
Intrinsic Int, @@ -47,11 +49,6 @@ [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP; } -defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, - ssmem, sse_load_f32>; -defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, - sdmem, sse_load_f64>; - multiclass xop2op128 opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rr : IXOP, XOP; } -defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>; -defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>; - multiclass xop2op256 opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rrY : IXOP, XOP, VEX_L; } -defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>; -defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; +let ExeDomain = SSEPackedSingle in { /* vfrczss plus the xop2op128 and xop2op256 vfrczps defs */ + defm VFRCZSS : xop2opsld<0x82, "vfrczss", int_x86_xop_vfrcz_ss, + ssmem, sse_load_f32>; + defm VFRCZPS : xop2op128<0x80, "vfrczps", int_x86_xop_vfrcz_ps, loadv4f32>; + defm VFRCZPS : xop2op256<0x80, "vfrczps", int_x86_xop_vfrcz_ps_256, loadv8f32>; +} + +let ExeDomain = SSEPackedDouble in { /* vfrczsd plus the xop2op128 and xop2op256 vfrczpd defs */ + defm VFRCZSD : xop2opsld<0x83, "vfrczsd", int_x86_xop_vfrcz_sd, + sdmem, sse_load_f64>; + defm VFRCZPD : xop2op128<0x81, "vfrczpd", int_x86_xop_vfrcz_pd, loadv2f64>; + defm VFRCZPD : xop2op256<0x81, "vfrczpd", int_x86_xop_vfrcz_pd_256, loadv4f64>; +} multiclass xop3op opc, string OpcodeStr, Intrinsic Int> { def rr : IXOP; -defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; -defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; -defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; -defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; -defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; -defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; -defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; -defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; -defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; -defm VPROTD 
: xop3op<0x92, "vprotd", int_x86_xop_vprotd>; -defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; +let ExeDomain = SSEPackedInt in { /* VPSHLx, VPSHAx and VPROTx defs built from xop3op */ + defm VPSHLW : xop3op<0x95, "vpshlw", int_x86_xop_vpshlw>; + defm VPSHLQ : xop3op<0x97, "vpshlq", int_x86_xop_vpshlq>; + defm VPSHLD : xop3op<0x96, "vpshld", int_x86_xop_vpshld>; + defm VPSHLB : xop3op<0x94, "vpshlb", int_x86_xop_vpshlb>; + defm VPSHAW : xop3op<0x99, "vpshaw", int_x86_xop_vpshaw>; + defm VPSHAQ : xop3op<0x9B, "vpshaq", int_x86_xop_vpshaq>; + defm VPSHAD : xop3op<0x9A, "vpshad", int_x86_xop_vpshad>; + defm VPSHAB : xop3op<0x98, "vpshab", int_x86_xop_vpshab>; + defm VPROTW : xop3op<0x91, "vprotw", int_x86_xop_vprotw>; + defm VPROTQ : xop3op<0x93, "vprotq", int_x86_xop_vprotq>; + defm VPROTD : xop3op<0x92, "vprotd", int_x86_xop_vprotd>; + defm VPROTB : xop3op<0x90, "vprotb", int_x86_xop_vprotb>; +} multiclass xop3opimm opc, string OpcodeStr, Intrinsic Int> { def ri : IXOPi8, XOP; } -defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; -defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; -defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; -defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; +let ExeDomain = SSEPackedInt in { /* VPROTx defs built from xop3opimm (the ri forms) */ + defm VPROTW : xop3opimm<0xC1, "vprotw", int_x86_xop_vprotwi>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", int_x86_xop_vprotqi>; + defm VPROTD : xop3opimm<0xC2, "vprotd", int_x86_xop_vprotdi>; + defm VPROTB : xop3opimm<0xC0, "vprotb", int_x86_xop_vprotbi>; +} // Instruction where second source can be memory, but third must be register multiclass xop4opm2 opc, string OpcodeStr, Intrinsic Int> { + let isCommutable = 1 in /* marks the rr form commutable so a load feeding the first source can still be folded (exercised by commute-xop.ll) */ def rr : IXOPi8, XOP_4V, VEX_I8IMM; } -defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; -defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; -defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; -defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; -defm VPMACSSWW : 
xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; -defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; -defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; -defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; -defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; -defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; -defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; -defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +let ExeDomain = SSEPackedInt in { /* VPMADCSx and VPMACSx defs built from xop4opm2 */ + defm VPMADCSWD : xop4opm2<0xB6, "vpmadcswd", int_x86_xop_vpmadcswd>; + defm VPMADCSSWD : xop4opm2<0xA6, "vpmadcsswd", int_x86_xop_vpmadcsswd>; + defm VPMACSWW : xop4opm2<0x95, "vpmacsww", int_x86_xop_vpmacsww>; + defm VPMACSWD : xop4opm2<0x96, "vpmacswd", int_x86_xop_vpmacswd>; + defm VPMACSSWW : xop4opm2<0x85, "vpmacssww", int_x86_xop_vpmacssww>; + defm VPMACSSWD : xop4opm2<0x86, "vpmacsswd", int_x86_xop_vpmacsswd>; + defm VPMACSSDQL : xop4opm2<0x87, "vpmacssdql", int_x86_xop_vpmacssdql>; + defm VPMACSSDQH : xop4opm2<0x8F, "vpmacssdqh", int_x86_xop_vpmacssdqh>; + defm VPMACSSDD : xop4opm2<0x8E, "vpmacssdd", int_x86_xop_vpmacssdd>; + defm VPMACSDQL : xop4opm2<0x97, "vpmacsdql", int_x86_xop_vpmacsdql>; + defm VPMACSDQH : xop4opm2<0x9F, "vpmacsdqh", int_x86_xop_vpmacsdqh>; + defm VPMACSDD : xop4opm2<0x9E, "vpmacsdd", int_x86_xop_vpmacsdd>; +} // Instruction where second source can be memory, third must be imm8 multiclass xopvpcom opc, string Suffix, Intrinsic Int> { + let isCommutable = 1 in /* swapping the sources also needs the LT/LE/GT/GE immediate mirrored; handled by the VPCOMxri cases added to X86InstrInfo.cpp */ def ri : IXOPi8; -defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>; -defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>; -defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>; -defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>; -defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>; -defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>; -defm VPCOMUQ : xopvpcom<0xEF, "uq", 
int_x86_xop_vpcomuq>; +let ExeDomain = SSEPackedInt in { // SSE integer instructions + defm VPCOMB : xopvpcom<0xCC, "b", int_x86_xop_vpcomb>; + defm VPCOMW : xopvpcom<0xCD, "w", int_x86_xop_vpcomw>; + defm VPCOMD : xopvpcom<0xCE, "d", int_x86_xop_vpcomd>; + defm VPCOMQ : xopvpcom<0xCF, "q", int_x86_xop_vpcomq>; + defm VPCOMUB : xopvpcom<0xEC, "ub", int_x86_xop_vpcomub>; + defm VPCOMUW : xopvpcom<0xED, "uw", int_x86_xop_vpcomuw>; + defm VPCOMUD : xopvpcom<0xEE, "ud", int_x86_xop_vpcomud>; + defm VPCOMUQ : xopvpcom<0xEF, "uq", int_x86_xop_vpcomuq>; +} // Instruction where either second or third source can be memory multiclass xop4op opc, string OpcodeStr, Intrinsic Int> { @@ -222,8 +237,10 @@ XOP_4V, VEX_I8IMM; } -defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; -defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +let ExeDomain = SSEPackedInt in { /* VPPERM and VPCMOV defs built from xop4op */ + defm VPPERM : xop4op<0xA3, "vpperm", int_x86_xop_vpperm>; + defm VPCMOV : xop4op<0xA2, "vpcmov", int_x86_xop_vpcmov>; +} multiclass xop4op256 opc, string OpcodeStr, Intrinsic Int> { def rrY : IXOPi8; +let ExeDomain = SSEPackedInt in /* VPCMOV again, this time through xop4op256 with the _256 intrinsic */ + defm VPCMOV : xop4op256<0xA2, "vpcmov", int_x86_xop_vpcmov_256>; multiclass xop5op opc, string OpcodeStr, Intrinsic Int128, Intrinsic Int256, PatFrag ld_128, PatFrag ld_256> { @@ -295,8 +313,11 @@ VEX_L; } -defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, - int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>; -defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, - int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>; +let ExeDomain = SSEPackedDouble in + defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd, + int_x86_xop_vpermil2pd_256, loadv2f64, loadv4f64>; + +let ExeDomain = SSEPackedSingle in + defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", int_x86_xop_vpermil2ps, + int_x86_xop_vpermil2ps_256, loadv4f32, loadv8f32>; Index: llvm/trunk/test/CodeGen/X86/commute-xop.ll 
=================================================================== --- llvm/trunk/test/CodeGen/X86/commute-xop.ll +++ llvm/trunk/test/CodeGen/X86/commute-xop.ll @@ -0,0 +1,184 @@ +; RUN: llc -O3 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+xop < %s | FileCheck %s + +define <16 x i8> @commute_fold_vpcomb(<16 x i8>* %a0, <16 x i8> %a1) { + ;CHECK-LABEL: commute_fold_vpcomb + ;CHECK: vpcomgtb (%rdi), %xmm0, %xmm0 + %1 = load <16 x i8>* %a0 + %2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %1, <16 x i8> %a1, i8 0) ; vpcomltb (imm 0 = LT), expected to come out as GT after commuting + ret <16 x i8> %2 +} +declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone + +define <4 x i32> @commute_fold_vpcomd(<4 x i32>* %a0, <4 x i32> %a1) { + ;CHECK-LABEL: commute_fold_vpcomd + ;CHECK: vpcomged (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %1, <4 x i32> %a1, i8 1) ; vpcomled (imm 1 = LE), expected to come out as GE after commuting + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone + +define <2 x i64> @commute_fold_vpcomq(<2 x i64>* %a0, <2 x i64> %a1) { + ;CHECK-LABEL: commute_fold_vpcomq + ;CHECK: vpcomltq (%rdi), %xmm0, %xmm0 + %1 = load <2 x i64>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %1, <2 x i64> %a1, i8 2) ; vpcomgtq (imm 2 = GT), expected to come out as LT after commuting + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone + +define <16 x i8> @commute_fold_vpcomub(<16 x i8>* %a0, <16 x i8> %a1) { + ;CHECK-LABEL: commute_fold_vpcomub + ;CHECK: vpcomleub (%rdi), %xmm0, %xmm0 + %1 = load <16 x i8>* %a0 + %2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %1, <16 x i8> %a1, i8 3) ; vpcomgeub (imm 3 = GE), expected to come out as LE after commuting + ret <16 x i8> %2 +} +declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone + +define <4 x i32> @commute_fold_vpcomud(<4 x i32>* %a0, <4 x i32> %a1) { + ;CHECK-LABEL: commute_fold_vpcomud + ;CHECK: vpcomequd (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 
x i32> %1, <4 x i32> %a1, i8 4) ; vpcomequd (imm 4 = EQ), unchanged by commuting + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone + +define <2 x i64> @commute_fold_vpcomuq(<2 x i64>* %a0, <2 x i64> %a1) { + ;CHECK-LABEL: commute_fold_vpcomuq + ;CHECK: vpcomnequq (%rdi), %xmm0, %xmm0 + %1 = load <2 x i64>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %1, <2 x i64> %a1, i8 5) ; vpcomnequq (imm 5 = NE), unchanged by commuting + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone + +define <8 x i16> @commute_fold_vpcomuw(<8 x i16>* %a0, <8 x i16> %a1) { + ;CHECK-LABEL: commute_fold_vpcomuw + ;CHECK: vpcomfalseuw (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %1, <8 x i16> %a1, i8 6) ; vpcomfalseuw (imm 6 = FALSE), unchanged by commuting + ret <8 x i16> %2 +} +declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone + +define <8 x i16> @commute_fold_vpcomw(<8 x i16>* %a0, <8 x i16> %a1) { + ;CHECK-LABEL: commute_fold_vpcomw + ;CHECK: vpcomtruew (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %1, <8 x i16> %a1, i8 7) ; vpcomtruew (imm 7 = TRUE), unchanged by commuting + ret <8 x i16> %2 +} +declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone + +define <4 x i32> @commute_fold_vpmacsdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmacsdd + ;CHECK: vpmacsdd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone + +define <2 x i64> @commute_fold_vpmacsdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) { + ;CHECK-LABEL: commute_fold_vpmacsdqh + ;CHECK: vpmacsdqh %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2) + ret 
<2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone + +define <2 x i64> @commute_fold_vpmacsdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) { + ;CHECK-LABEL: commute_fold_vpmacsdql + ;CHECK: vpmacsdql %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2) + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone + +define <4 x i32> @commute_fold_vpmacssdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmacssdd + ;CHECK: vpmacssdd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone + +define <2 x i64> @commute_fold_vpmacssdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) { + ;CHECK-LABEL: commute_fold_vpmacssdqh + ;CHECK: vpmacssdqh %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2) + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone + +define <2 x i64> @commute_fold_vpmacssdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) { + ;CHECK-LABEL: commute_fold_vpmacssdql + ;CHECK: vpmacssdql %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <4 x i32>* %a0 + %2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2) + ret <2 x i64> %2 +} +declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone + +define <4 x i32> @commute_fold_vpmacsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmacsswd + ;CHECK: vpmacsswd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <4 x i32> 
@llvm.x86.xop.vpmacsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone + +define <8 x i16> @commute_fold_vpmacssww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) { + ;CHECK-LABEL: commute_fold_vpmacssww + ;CHECK: vpmacssww %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2) + ret <8 x i16> %2 +} +declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone + +define <4 x i32> @commute_fold_vpmacswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmacswd + ;CHECK: vpmacswd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone + +define <8 x i16> @commute_fold_vpmacsww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) { + ;CHECK-LABEL: commute_fold_vpmacsww + ;CHECK: vpmacsww %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2) + ret <8 x i16> %2 +} +declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone + +define <4 x i32> @commute_fold_vpmadcsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmadcsswd + ;CHECK: vpmadcsswd %xmm1, (%rdi), %xmm0, %xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone + +define <4 x i32> @commute_fold_vpmadcswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) { + ;CHECK-LABEL: commute_fold_vpmadcswd + ;CHECK: vpmadcswd %xmm1, (%rdi), %xmm0, 
%xmm0 + %1 = load <8 x i16>* %a0 + %2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2) + ret <4 x i32> %2 +} +declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone + + + 