diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h --- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h +++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h @@ -175,6 +175,9 @@ FMADD_XA, FMSUB, FNMSUB, + + // X86 VNNI + DPWSSD, }; } // end namespace llvm diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -1223,6 +1223,13 @@ SmallVectorImpl &DelInstrs, DenseMap &InstIdxForVirtReg) const; + /// When calculating the latency of the root instruction, accumulate the + /// latency of the sequence to the root latency. + /// \param Root - Instruction that could be combined with one of its operands + virtual bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const { + return true; + } + /// Attempt to reassociate \P Root and \P Prev according to \P Pattern to /// reduce critical path length. void reassociateOps(MachineInstr &Root, MachineInstr &Prev, diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -91,7 +91,8 @@ private: bool combineInstructions(MachineBasicBlock *); - MachineInstr *getOperandDef(const MachineOperand &MO); + MachineInstr *getOperandDef(const MachineOperand &MO, + SmallVectorImpl &InsInstrs); bool isTransientMI(const MachineInstr *MI); unsigned getDepth(SmallVectorImpl &InsInstrs, DenseMap &InstrIdxForVirtReg, @@ -149,11 +150,29 @@ MachineFunctionPass::getAnalysisUsage(AU); } -MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) { +MachineInstr * +MachineCombiner::getOperandDef(const MachineOperand &MO, + SmallVectorImpl &InsInstrs) { MachineInstr *DefInstr = nullptr; // We need a virtual register definition. 
if (MO.isReg() && MO.getReg().isVirtual()) DefInstr = MRI->getUniqueVRegDef(MO.getReg()); + // The new instructions are not yet inserted into the machine function, so + // MRI has no def-use information for them. Only for a virtual register + // operand, fall back to scanning the defs of the pending new instructions. + if (!DefInstr && MO.isReg() && MO.getReg().isVirtual()) { + for (auto *MI : InsInstrs) { + for (const MachineOperand &DefMO : MI->operands()) { + if (!(DefMO.isReg() && DefMO.getReg().isVirtual())) + continue; + if (!DefMO.isDef()) + continue; + if (DefMO.getReg() != MO.getReg()) + continue; + DefInstr = MI; + } + } + } // PHI's have no depth etc. if (DefInstr && DefInstr->isPHI()) DefInstr = nullptr; @@ -238,7 +257,7 @@ LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx, InstrPtr, UseIdx); } else { - MachineInstr *DefInstr = getOperandDef(MO); + MachineInstr *DefInstr = getOperandDef(MO, InsInstrs); if (DefInstr && (TII->getMachineCombinerTraceStrategy() != MachineTraceStrategy::TS_Local || DefInstr->getParent() == &MBB)) { @@ -404,8 +423,13 @@ // Account for the latency of the inserted and deleted instructions by unsigned NewRootLatency, RootLatency; - std::tie(NewRootLatency, RootLatency) = - getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace); + if (TII->accumulateInstrSeqToRootLatency(*Root)) + std::tie(NewRootLatency, RootLatency) = + getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace); + else { + NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back()); + RootLatency = TSchedModel.computeInstrLatency(Root); + } unsigned RootSlack = BlockTrace.getInstrSlack(*Root); unsigned NewCycleCount = NewRootDepth + NewRootLatency; diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -602,6 +602,26 @@ std::optional isCopyInstrImpl(const MachineInstr &MI) const override; + bool + getMachineCombinerPatterns(MachineInstr &Root, + SmallVectorImpl 
&Patterns, + bool DoRegPressureReduce) const override; + + void genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const override; + + /// When calculating the latency of the root instruction, accumulate the + /// latency of the sequence to the root latency. + /// \param Root - Instruction that could be combined with one of its operands + /// For the X86 split vpdpwssd -> (vpmaddwd + vpaddd), the vpmaddwd is not + /// in the critical path, so the root latency only includes the vpaddd. + bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const override { + return false; + } + private: /// This is a helper for convertToThreeAddress for 8 and 16-bit instructions. /// We use 32-bit LEA to form 3-address code by promoting to a 32-bit diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -22,6 +22,7 @@ #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -9750,5 +9751,140 @@ return It; } +bool X86InstrInfo::getMachineCombinerPatterns( + MachineInstr &Root, SmallVectorImpl &Patterns, + bool DoRegPressureReduce) const { + unsigned Opc = Root.getOpcode(); + switch (Opc) { + default: + return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns, + DoRegPressureReduce); + case X86::VPDPWSSDrr: + case X86::VPDPWSSDrm: + case X86::VPDPWSSDYrr: + case X86::VPDPWSSDYrm: { + Patterns.push_back(MachineCombinerPattern::DPWSSD); + return true; + } + case X86::VPDPWSSDZ128r: + case X86::VPDPWSSDZ128m: + case X86::VPDPWSSDZ256r: + case X86::VPDPWSSDZ256m: + case X86::VPDPWSSDZr: + case 
X86::VPDPWSSDZm: { + if (Subtarget.hasBWI()) + Patterns.push_back(MachineCombinerPattern::DPWSSD); + return true; + } + } +} + +static void +genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) { + MachineFunction *MF = Root.getMF(); + MachineRegisterInfo &RegInfo = MF->getRegInfo(); + + unsigned Opc = Root.getOpcode(); + unsigned VpAddOpc = 0; + unsigned VpMaddOpc = 0; + switch (Opc) { + default: + assert(false && "It should not reach here"); + break; + // vpdpwssd xmm2,xmm3,XMMWORD PTR [r8+0x20] + // --> + // vpmaddwd xmm3,xmm3,XMMWORD PTR [r8+0x20] + // vpaddd xmm2,xmm2,xmm3 + case X86::VPDPWSSDrr: + VpMaddOpc = X86::VPMADDWDrr; + VpAddOpc = X86::VPADDDrr; + break; + case X86::VPDPWSSDrm: + VpMaddOpc = X86::VPMADDWDrm; + VpAddOpc = X86::VPADDDrr; + break; + case X86::VPDPWSSDZ128r: + VpMaddOpc = X86::VPMADDWDZ128rr; + VpAddOpc = X86::VPADDDZ128rr; + break; + case X86::VPDPWSSDZ128m: + VpMaddOpc = X86::VPMADDWDZ128rm; + VpAddOpc = X86::VPADDDZ128rr; + break; + // vpdpwssd ymm2,ymm3,YMMWORD PTR [r8+0x20] + // --> + // vpmaddwd ymm3,ymm3,YMMWORD PTR [r8+0x20] + // vpaddd ymm2,ymm2,ymm3 + case X86::VPDPWSSDYrr: + VpMaddOpc = X86::VPMADDWDYrr; + VpAddOpc = X86::VPADDDYrr; + break; + case X86::VPDPWSSDYrm: + VpMaddOpc = X86::VPMADDWDYrm; + VpAddOpc = X86::VPADDDYrr; + break; + case X86::VPDPWSSDZ256r: + VpMaddOpc = X86::VPMADDWDZ256rr; + VpAddOpc = X86::VPADDDZ256rr; + break; + case X86::VPDPWSSDZ256m: + VpMaddOpc = X86::VPMADDWDZ256rm; + VpAddOpc = X86::VPADDDZ256rr; + break; + // vpdpwssd zmm2,zmm3,ZMMWORD PTR [r8+0x20] + // --> + // vpmaddwd zmm3,zmm3,ZMMWORD PTR [r8+0x20] + // vpaddd zmm2,zmm2,zmm3 + case X86::VPDPWSSDZr: + VpMaddOpc = X86::VPMADDWDZrr; + VpAddOpc = X86::VPADDDZrr; + break; + case X86::VPDPWSSDZm: + VpMaddOpc = X86::VPMADDWDZrm; + VpAddOpc = X86::VPADDDZrr; + break; + } + // Create vpmaddwd. 
+ auto *RC = RegInfo.getRegClass(Root.getOperand(0).getReg()); + Register NewReg = RegInfo.createVirtualRegister(RC); + MachineInstr *VpMadd = Root.getMF()->CloneMachineInstr(&Root); + VpMadd->setDesc(TII.get(VpMaddOpc)); + VpMadd->untieRegOperand(1); + VpMadd->removeOperand(1); + VpMadd->getOperand(0).setReg(NewReg); + // Create vpaddd. + Register DstReg = Root.getOperand(0).getReg(); + bool IsKill = Root.getOperand(1).isKill(); + MachineInstr *VpAdd = + BuildMI(*MF, MIMetadata(Root), TII.get(VpAddOpc), DstReg) + .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill)) + .addReg(VpMadd->getOperand(0).getReg(), getKillRegState(true)); + InstrIdxForVirtReg.insert(std::make_pair(DstReg, 0)); + InsInstrs.push_back(VpMadd); + InsInstrs.push_back(VpAdd); + DelInstrs.push_back(&Root); +} + +void X86InstrInfo::genAlternativeCodeSequence( + MachineInstr &Root, MachineCombinerPattern Pattern, + SmallVectorImpl &InsInstrs, + SmallVectorImpl &DelInstrs, + DenseMap &InstrIdxForVirtReg) const { + switch (Pattern) { + default: + // Reassociate instructions. 
+ TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs, + DelInstrs, InstrIdxForVirtReg); + return; + case MachineCombinerPattern::DPWSSD: + genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs, + InstrIdxForVirtReg); + return; + } +} + #define GET_INSTRINFO_HELPERS #include "X86GenInstrInfo.inc" diff --git a/llvm/test/CodeGen/X86/avx512vnni-combine.ll b/llvm/test/CodeGen/X86/avx512vnni-combine.ll --- a/llvm/test/CodeGen/X86/avx512vnni-combine.ll +++ b/llvm/test/CodeGen/X86/avx512vnni-combine.ll @@ -5,9 +5,12 @@ ; CHECK-LABEL: foo_reg_512: ; CHECK: # %bb.0: ; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0 -; CHECK-NEXT: vpdpwssd %zmm3, %zmm1, %zmm0 -; CHECK-NEXT: vpdpwssd %zmm4, %zmm1, %zmm0 -; CHECK-NEXT: vpdpwssd %zmm5, %zmm1, %zmm0 +; CHECK-NEXT: vpmaddwd %zmm3, %zmm1, %zmm2 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpmaddwd %zmm4, %zmm1, %zmm2 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpmaddwd %zmm5, %zmm1, %zmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %7 = bitcast <8 x i64> %0 to <16 x i32> %8 = bitcast <8 x i64> %1 to <16 x i32> @@ -54,9 +57,12 @@ ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vpdpwssd -192(%rdi), %zmm1, %zmm0 -; CHECK-NEXT: vpdpwssd -128(%rdi), %zmm1, %zmm0 -; CHECK-NEXT: vpdpwssd -64(%rdi), %zmm1, %zmm0 -; CHECK-NEXT: vpdpwssd (%rdi), %zmm1, %zmm0 +; CHECK-NEXT: vpmaddwd -128(%rdi), %zmm1, %zmm2 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpmaddwd -64(%rdi), %zmm1, %zmm2 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; CHECK-NEXT: vpmaddwd (%rdi), %zmm1, %zmm2 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: addq $4, %rcx ; CHECK-NEXT: addq $256, %rdi # imm = 0x100 ; CHECK-NEXT: cmpq %rcx, %rdx @@ -179,8 +185,9 @@ ; CHECK-NEXT: vmovdqa64 (%rsi,%r8), %zmm2 ; CHECK-NEXT: vpdpwssd -64(%rdx,%r8), %zmm0, %zmm1 ; CHECK-NEXT: vmovdqa64 %zmm1, -64(%rsi,%r8) -; CHECK-NEXT: vpdpwssd (%rdx,%r8), %zmm0, %zmm2 
-; CHECK-NEXT: vmovdqa64 %zmm2, (%rsi,%r8) +; CHECK-NEXT: vpmaddwd (%rdx,%r8), %zmm0, %zmm1 +; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vmovdqa64 %zmm1, (%rsi,%r8) ; CHECK-NEXT: addq $2, %rcx ; CHECK-NEXT: subq $-128, %r8 ; CHECK-NEXT: cmpq %rcx, %rdi @@ -190,9 +197,9 @@ ; CHECK-NEXT: je .LBB2_5 ; CHECK-NEXT: # %bb.4: ; CHECK-NEXT: shlq $6, %rcx -; CHECK-NEXT: vmovdqa64 (%rsi,%rcx), %zmm1 -; CHECK-NEXT: vpdpwssd (%rdx,%rcx), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqa64 %zmm1, (%rsi,%rcx) +; CHECK-NEXT: vpmaddwd (%rdx,%rcx), %zmm0, %zmm0 +; CHECK-NEXT: vpaddd (%rsi,%rcx), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi,%rcx) ; CHECK-NEXT: .LBB2_5: ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avxvnni-combine.ll b/llvm/test/CodeGen/X86/avxvnni-combine.ll --- a/llvm/test/CodeGen/X86/avxvnni-combine.ll +++ b/llvm/test/CodeGen/X86/avxvnni-combine.ll @@ -7,17 +7,23 @@ ; AVX-LABEL: foo_reg_128: ; AVX: # %bb.0: ; AVX-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd %xmm3, %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd %xmm4, %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd %xmm5, %xmm1, %xmm0 +; AVX-NEXT: vpmaddwd %xmm3, %xmm1, %xmm2 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmaddwd %xmm4, %xmm1, %xmm2 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: foo_reg_128: ; AVX512: # %bb.0: ; AVX512-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vpdpwssd %xmm3, %xmm1, %xmm0 -; AVX512-NEXT: vpdpwssd %xmm4, %xmm1, %xmm0 -; AVX512-NEXT: vpdpwssd %xmm5, %xmm1, %xmm0 +; AVX512-NEXT: vpmaddwd %xmm3, %xmm1, %xmm2 +; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmaddwd %xmm4, %xmm1, %xmm2 +; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %7 = bitcast <2 x i64> %0 to <4 x i32> %8 = bitcast <2 x i64> %1 to <4 x 
i32> @@ -36,50 +42,6 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) #1 define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { -; AVX-LABEL: foo_128: -; AVX: # %bb.0: -; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB1_6 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %edi, %edx -; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: andl $3, %eax -; AVX-NEXT: cmpl $4, %edi -; AVX-NEXT: jae .LBB1_7 -; AVX-NEXT: # %bb.2: -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB1_3 -; AVX-NEXT: .LBB1_7: -; AVX-NEXT: andl $-4, %edx -; AVX-NEXT: leaq 48(%rsi), %rdi -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd -32(%rdi), %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd -16(%rdi), %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd (%rdi), %xmm1, %xmm0 -; AVX-NEXT: addq $4, %rcx -; AVX-NEXT: addq $64, %rdi -; AVX-NEXT: cmpq %rcx, %rdx -; AVX-NEXT: jne .LBB1_8 -; AVX-NEXT: .LBB1_3: -; AVX-NEXT: testq %rax, %rax -; AVX-NEXT: je .LBB1_6 -; AVX-NEXT: # %bb.4: # %.preheader -; AVX-NEXT: shlq $4, %rcx -; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: shlq $4, %rax -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0 -; AVX-NEXT: addq $16, %rcx -; AVX-NEXT: cmpq %rcx, %rax -; AVX-NEXT: jne .LBB1_5 -; AVX-NEXT: .LBB1_6: -; AVX-NEXT: retq -; ; AVX512-LABEL: foo_128: ; AVX512: # %bb.0: ; AVX512-NEXT: testl %edi, %edi @@ -100,8 +62,10 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vpdpwssd -48(%rdi), %xmm1, %xmm0 -; AVX512-NEXT: vpdpwssd -32(%rdi), %xmm1, %xmm0 -; AVX512-NEXT: vpdpwssd -16(%rdi), %xmm1, %xmm0 +; AVX512-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2 +; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmaddwd -16(%rdi), %xmm1, %xmm2 +; 
AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpdpwssd (%rdi), %xmm1, %xmm0 ; AVX512-NEXT: addq $4, %rcx ; AVX512-NEXT: addq $64, %rdi @@ -197,45 +161,6 @@ } define void @bar_128(i32 %0, ptr %1, <2 x i64> %2, ptr %3) { -; AVX-LABEL: bar_128: -; AVX: # %bb.0: -; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB2_5 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: cmpl $1, %edi -; AVX-NEXT: jne .LBB2_6 -; AVX-NEXT: # %bb.2: -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB2_3 -; AVX-NEXT: .LBB2_6: -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: andl $-2, %edi -; AVX-NEXT: movl $16, %r8d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vmovdqa -16(%rsi,%r8), %xmm1 -; AVX-NEXT: vmovdqa (%rsi,%r8), %xmm2 -; AVX-NEXT: {vex} vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, -16(%rsi,%r8) -; AVX-NEXT: {vex} vpdpwssd (%rdx,%r8), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rsi,%r8) -; AVX-NEXT: addq $2, %rcx -; AVX-NEXT: addq $32, %r8 -; AVX-NEXT: cmpq %rcx, %rdi -; AVX-NEXT: jne .LBB2_7 -; AVX-NEXT: .LBB2_3: -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB2_5 -; AVX-NEXT: # %bb.4: -; AVX-NEXT: shlq $4, %rcx -; AVX-NEXT: vmovdqa (%rsi,%rcx), %xmm1 -; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rsi,%rcx) -; AVX-NEXT: .LBB2_5: -; AVX-NEXT: retq -; ; AVX512-LABEL: bar_128: ; AVX512: # %bb.0: ; AVX512-NEXT: testl %edi, %edi @@ -269,9 +194,9 @@ ; AVX512-NEXT: je .LBB2_5 ; AVX512-NEXT: # %bb.4: ; AVX512-NEXT: shlq $4, %rcx -; AVX512-NEXT: vmovdqa (%rsi,%rcx), %xmm1 -; AVX512-NEXT: vpdpwssd (%rdx,%rcx), %xmm0, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, (%rsi,%rcx) +; AVX512-NEXT: vpmaddwd (%rdx,%rcx), %xmm0, %xmm0 +; AVX512-NEXT: vpaddd (%rsi,%rcx), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi,%rcx) ; AVX512-NEXT: .LBB2_5: ; AVX512-NEXT: retq %5 = icmp sgt i32 %0, 0 @@ -333,17 +258,23 @@ ; AVX-LABEL: foo_reg_256: ; AVX: # %bb.0: ; 
AVX-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd %ymm3, %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd %ymm4, %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd %ymm5, %ymm1, %ymm0 +; AVX-NEXT: vpmaddwd %ymm3, %ymm1, %ymm2 +; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpmaddwd %ymm4, %ymm1, %ymm2 +; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1 +; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: foo_reg_256: ; AVX512: # %bb.0: ; AVX512-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd %ymm3, %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd %ymm4, %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd %ymm5, %ymm1, %ymm0 +; AVX512-NEXT: vpmaddwd %ymm3, %ymm1, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpmaddwd %ymm4, %ymm1, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %7 = bitcast <4 x i64> %0 to <8 x i32> %8 = bitcast <4 x i64> %1 to <8 x i32> @@ -369,50 +300,6 @@ ; } define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { -; AVX-LABEL: foo_256: -; AVX: # %bb.0: -; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB4_6 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %edi, %edx -; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: andl $3, %eax -; AVX-NEXT: cmpl $4, %edi -; AVX-NEXT: jae .LBB4_7 -; AVX-NEXT: # %bb.2: -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB4_3 -; AVX-NEXT: .LBB4_7: -; AVX-NEXT: andl $-4, %edx -; AVX-NEXT: leaq 96(%rsi), %rdi -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd -64(%rdi), %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd -32(%rdi), %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd (%rdi), %ymm1, %ymm0 -; AVX-NEXT: addq $4, %rcx -; AVX-NEXT: subq $-128, %rdi -; AVX-NEXT: cmpq %rcx, %rdx -; AVX-NEXT: jne .LBB4_8 -; AVX-NEXT: 
.LBB4_3: -; AVX-NEXT: testq %rax, %rax -; AVX-NEXT: je .LBB4_6 -; AVX-NEXT: # %bb.4: # %.preheader -; AVX-NEXT: shlq $5, %rcx -; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: shlq $5, %rax -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0 -; AVX-NEXT: addq $32, %rcx -; AVX-NEXT: cmpq %rcx, %rax -; AVX-NEXT: jne .LBB4_5 -; AVX-NEXT: .LBB4_6: -; AVX-NEXT: retq -; ; AVX512-LABEL: foo_256: ; AVX512: # %bb.0: ; AVX512-NEXT: testl %edi, %edi @@ -433,9 +320,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vpdpwssd -96(%rdi), %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd -64(%rdi), %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd -32(%rdi), %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd (%rdi), %ymm1, %ymm0 +; AVX512-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpmaddwd -32(%rdi), %ymm1, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: addq $4, %rcx ; AVX512-NEXT: subq $-128, %rdi ; AVX512-NEXT: cmpq %rcx, %rdx @@ -537,46 +427,6 @@ ; } ; } define void @bar_256(i32 %0, ptr %1, <4 x i64> %2, ptr %3) { -; AVX-LABEL: bar_256: -; AVX: # %bb.0: -; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB5_5 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: cmpl $1, %edi -; AVX-NEXT: jne .LBB5_6 -; AVX-NEXT: # %bb.2: -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB5_3 -; AVX-NEXT: .LBB5_6: -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: andl $-2, %edi -; AVX-NEXT: movl $32, %r8d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vmovdqa -32(%rsi,%r8), %ymm1 -; AVX-NEXT: vmovdqa (%rsi,%r8), %ymm2 -; AVX-NEXT: {vex} vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1 -; AVX-NEXT: vmovdqa %ymm1, -32(%rsi,%r8) -; 
AVX-NEXT: {vex} vpdpwssd (%rdx,%r8), %ymm0, %ymm2 -; AVX-NEXT: vmovdqa %ymm2, (%rsi,%r8) -; AVX-NEXT: addq $2, %rcx -; AVX-NEXT: addq $64, %r8 -; AVX-NEXT: cmpq %rcx, %rdi -; AVX-NEXT: jne .LBB5_7 -; AVX-NEXT: .LBB5_3: -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB5_5 -; AVX-NEXT: # %bb.4: -; AVX-NEXT: shlq $5, %rcx -; AVX-NEXT: vmovdqa (%rsi,%rcx), %ymm1 -; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %ymm0, %ymm1 -; AVX-NEXT: vmovdqa %ymm1, (%rsi,%rcx) -; AVX-NEXT: .LBB5_5: -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq -; ; AVX512-LABEL: bar_256: ; AVX512: # %bb.0: ; AVX512-NEXT: testl %edi, %edi @@ -599,8 +449,9 @@ ; AVX512-NEXT: vmovdqa (%rsi,%r8), %ymm2 ; AVX512-NEXT: vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1 ; AVX512-NEXT: vmovdqa %ymm1, -32(%rsi,%r8) -; AVX512-NEXT: vpdpwssd (%rdx,%r8), %ymm0, %ymm2 -; AVX512-NEXT: vmovdqa %ymm2, (%rsi,%r8) +; AVX512-NEXT: vpmaddwd (%rdx,%r8), %ymm0, %ymm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqa %ymm1, (%rsi,%r8) ; AVX512-NEXT: addq $2, %rcx ; AVX512-NEXT: addq $64, %r8 ; AVX512-NEXT: cmpq %rcx, %rdi @@ -610,9 +461,9 @@ ; AVX512-NEXT: je .LBB5_5 ; AVX512-NEXT: # %bb.4: ; AVX512-NEXT: shlq $5, %rcx -; AVX512-NEXT: vmovdqa (%rsi,%rcx), %ymm1 -; AVX512-NEXT: vpdpwssd (%rdx,%rcx), %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa %ymm1, (%rsi,%rcx) +; AVX512-NEXT: vpmaddwd (%rdx,%rcx), %ymm0, %ymm0 +; AVX512-NEXT: vpaddd (%rsi,%rcx), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi,%rcx) ; AVX512-NEXT: .LBB5_5: ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq