diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
--- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -175,6 +175,9 @@
   FMADD_XA,
   FMSUB,
   FNMSUB,
+
+  // X86 VNNI
+  DPWSSD,
 };
 
 } // end namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
--- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h
@@ -1223,6 +1223,13 @@
       SmallVectorImpl<MachineInstr *> &DelInstrs,
       DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const;
 
+  /// When calculating the latency of the root instruction, whether to
+  /// accumulate the latency of the whole new sequence into the root latency.
+  /// \param Root - Instruction that could be combined with one of its operands
+  virtual bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const {
+    return true;
+  }
+
   /// Attempt to reassociate \P Root and \P Prev according to \P Pattern to
   /// reduce critical path length.
   void reassociateOps(MachineInstr &Root, MachineInstr &Prev,
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -91,7 +91,8 @@
 private:
   bool combineInstructions(MachineBasicBlock *);
-  MachineInstr *getOperandDef(const MachineOperand &MO);
+  MachineInstr *getOperandDef(const MachineOperand &MO,
+                              SmallVectorImpl<MachineInstr *> &InsInstrs);
   bool isTransientMI(const MachineInstr *MI);
   unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
                     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
@@ -149,11 +150,29 @@
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
+MachineInstr *
+MachineCombiner::getOperandDef(const MachineOperand &MO,
+                               SmallVectorImpl<MachineInstr *> &InsInstrs) {
   MachineInstr *DefInstr = nullptr;
   // We need a virtual register definition.
   if (MO.isReg() && MO.getReg().isVirtual())
     DefInstr = MRI->getUniqueVRegDef(MO.getReg());
+  // Since the new instructions are not yet inserted into the machine function,
+  // MRI has no def-use information for them, so the register may instead be
+  // defined by one of the new instructions.
+  if (!DefInstr) {
+    for (auto *MI : InsInstrs) {
+      for (const MachineOperand &DefMO : MI->operands()) {
+        if (!(DefMO.isReg() && DefMO.getReg().isVirtual()))
+          continue;
+        if (!DefMO.isDef())
+          continue;
+        if (DefMO.getReg() != MO.getReg())
+          continue;
+        DefInstr = MI;
+      }
+    }
+  }
   // PHI's have no depth etc.
   if (DefInstr && DefInstr->isPHI())
     DefInstr = nullptr;
@@ -238,7 +257,7 @@
         LatencyOp = TSchedModel.computeOperandLatency(DefInstr, DefIdx,
                                                       InstrPtr, UseIdx);
       } else {
-        MachineInstr *DefInstr = getOperandDef(MO);
+        MachineInstr *DefInstr = getOperandDef(MO, InsInstrs);
         if (DefInstr && (TII->getMachineCombinerTraceStrategy() !=
                              MachineTraceStrategy::TS_Local ||
                          DefInstr->getParent() == &MBB)) {
@@ -404,8 +423,13 @@
   // Account for the latency of the inserted and deleted instructions by
   unsigned NewRootLatency, RootLatency;
-  std::tie(NewRootLatency, RootLatency) =
-      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+  if (TII->accumulateInstrSeqToRootLatency(*Root))
+    std::tie(NewRootLatency, RootLatency) =
+        getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
+  else {
+    NewRootLatency = TSchedModel.computeInstrLatency(InsInstrs.back());
+    RootLatency = TSchedModel.computeInstrLatency(Root);
+  }
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
   unsigned NewCycleCount = NewRootDepth + NewRootLatency;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -602,6 +602,26 @@
   std::optional<DestSourcePair>
   isCopyInstrImpl(const MachineInstr &MI) const override;
 
+  bool
+  getMachineCombinerPatterns(MachineInstr &Root,
+                             SmallVectorImpl<MachineCombinerPattern> &Patterns,
+                             bool DoRegPressureReduce) const override;
+
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern Pattern,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  /// When calculating the latency of the root instruction, whether to
+  /// accumulate the latency of the whole new sequence into the root latency.
+  /// \param Root - Instruction that could be combined with one of its operands
+  /// For the X86 transform vpdpwssd -> (vpmaddwd + vpaddd), the vpmaddwd is
+  /// not on the critical path, so the new root latency only includes vpaddd.
+  bool accumulateInstrSeqToRootLatency(MachineInstr &Root) const override {
+    return false;
+  }
+
 private:
   /// This is a helper for convertToThreeAddress for 8 and 16-bit instructions.
   /// We use 32-bit LEA to form 3-address code by promoting to a 32-bit
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineCombinerPattern.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -9750,5 +9751,140 @@
   return It;
 }
 
+bool X86InstrInfo::getMachineCombinerPatterns(
+    MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns,
+    bool DoRegPressureReduce) const {
+  unsigned Opc = Root.getOpcode();
+  switch (Opc) {
+  default:
+    return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
+                                                       DoRegPressureReduce);
+  case X86::VPDPWSSDrr:
+  case X86::VPDPWSSDrm:
+  case X86::VPDPWSSDYrr:
+  case X86::VPDPWSSDYrm: {
+    Patterns.push_back(MachineCombinerPattern::DPWSSD);
+    return true;
+  }
+  case X86::VPDPWSSDZ128r:
+  case X86::VPDPWSSDZ128m:
+  case X86::VPDPWSSDZ256r:
+  case X86::VPDPWSSDZ256m:
+  case X86::VPDPWSSDZr:
+  case X86::VPDPWSSDZm: {
+    if (Subtarget.hasBWI())
+      Patterns.push_back(MachineCombinerPattern::DPWSSD);
+    return true;
+  }
+  }
+}
+
+static void
+genAlternativeDpCodeSequence(MachineInstr &Root, const TargetInstrInfo &TII,
+                             SmallVectorImpl<MachineInstr *> &InsInstrs,
+                             SmallVectorImpl<MachineInstr *> &DelInstrs,
+                             DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
+  MachineFunction *MF = Root.getMF();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+
+  unsigned Opc = Root.getOpcode();
+  unsigned VpAddOpc;
+  unsigned VpMaddOpc;
+  switch (Opc) {
+  default:
+    llvm_unreachable("It should not reach here");
+    break;
+  // vpdpwssd xmm2,xmm3,XMMWORD PTR [r8+0x20]
+  // -->
+  // vpmaddwd xmm3,xmm3,XMMWORD PTR [r8+0x20]
+  // vpaddd xmm2,xmm2,xmm3
+  case X86::VPDPWSSDrr:
+    VpMaddOpc = X86::VPMADDWDrr;
+    VpAddOpc = X86::VPADDDrr;
+    break;
+  case X86::VPDPWSSDrm:
+    VpMaddOpc = X86::VPMADDWDrm;
+    VpAddOpc = X86::VPADDDrr;
+    break;
+  case X86::VPDPWSSDZ128r:
+    VpMaddOpc = X86::VPMADDWDZ128rr;
+    VpAddOpc = X86::VPADDDZ128rr;
+    break;
+  case X86::VPDPWSSDZ128m:
+    VpMaddOpc = X86::VPMADDWDZ128rm;
+    VpAddOpc = X86::VPADDDZ128rr;
+    break;
+  // vpdpwssd ymm2,ymm3,YMMWORD PTR [r8+0x20]
+  // -->
+  // vpmaddwd ymm3,ymm3,YMMWORD PTR [r8+0x20]
+  // vpaddd ymm2,ymm2,ymm3
+  case X86::VPDPWSSDYrr:
+    VpMaddOpc = X86::VPMADDWDYrr;
+    VpAddOpc = X86::VPADDDYrr;
+    break;
+  case X86::VPDPWSSDYrm:
+    VpMaddOpc = X86::VPMADDWDYrm;
+    VpAddOpc = X86::VPADDDYrr;
+    break;
+  case X86::VPDPWSSDZ256r:
+    VpMaddOpc = X86::VPMADDWDZ256rr;
+    VpAddOpc = X86::VPADDDZ256rr;
+    break;
+  case X86::VPDPWSSDZ256m:
+    VpMaddOpc = X86::VPMADDWDZ256rm;
+    VpAddOpc = X86::VPADDDZ256rr;
+    break;
+  // vpdpwssd zmm2,zmm3,ZMMWORD PTR [r8+0x20]
+  // -->
+  // vpmaddwd zmm3,zmm3,ZMMWORD PTR [r8+0x20]
+  // vpaddd zmm2,zmm2,zmm3
+  case X86::VPDPWSSDZr:
+    VpMaddOpc = X86::VPMADDWDZrr;
+    VpAddOpc = X86::VPADDDZrr;
+    break;
+  case X86::VPDPWSSDZm:
+    VpMaddOpc = X86::VPMADDWDZrm;
+    VpAddOpc = X86::VPADDDZrr;
+    break;
+  }
+  // Create vpmaddwd.
+  auto *RC = RegInfo.getRegClass(Root.getOperand(0).getReg());
+  Register NewReg = RegInfo.createVirtualRegister(RC);
+  MachineInstr *VpMadd = Root.getMF()->CloneMachineInstr(&Root);
+  VpMadd->setDesc(TII.get(VpMaddOpc));
+  VpMadd->untieRegOperand(1);
+  VpMadd->removeOperand(1);
+  VpMadd->getOperand(0).setReg(NewReg);
+  // Create vpaddd.
+  Register DstReg = Root.getOperand(0).getReg();
+  bool IsKill = Root.getOperand(1).isKill();
+  MachineInstr *VpAdd =
+      BuildMI(*MF, MIMetadata(Root), TII.get(VpAddOpc), DstReg)
+          .addReg(Root.getOperand(1).getReg(), getKillRegState(IsKill))
+          .addReg(VpMadd->getOperand(0).getReg(), getKillRegState(true));
+  InstrIdxForVirtReg.insert(std::make_pair(DstReg, 0));
+  InsInstrs.push_back(VpMadd);
+  InsInstrs.push_back(VpAdd);
+  DelInstrs.push_back(&Root);
+}
+
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root, MachineCombinerPattern Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+  switch (Pattern) {
+  default:
+    // Reassociate instructions.
+    TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+                                                DelInstrs, InstrIdxForVirtReg);
+    return;
+  case MachineCombinerPattern::DPWSSD:
+    genAlternativeDpCodeSequence(Root, *this, InsInstrs, DelInstrs,
+                                 InstrIdxForVirtReg);
+    return;
+  }
+}
+
 #define GET_INSTRINFO_HELPERS
 #include "X86GenInstrInfo.inc"
diff --git a/llvm/test/CodeGen/X86/avx512vnni-combine.ll b/llvm/test/CodeGen/X86/avx512vnni-combine.ll
--- a/llvm/test/CodeGen/X86/avx512vnni-combine.ll
+++ b/llvm/test/CodeGen/X86/avx512vnni-combine.ll
@@ -1,13 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids -verify-machineinstrs | FileCheck %s
 
 define <8 x i64> @foo_reg_512(<8 x i64> %0, <8 x i64> %1, <8 x i64> %2, <8 x i64> %3, <8 x i64> %4, <8 x i64> %5) {
 ; CHECK-LABEL: foo_reg_512:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: vpdpwssd %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd %zmm3, %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd %zmm4, %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd %zmm5, %zmm1, %zmm0
+; CHECK-NEXT: vpmaddwd %zmm3, %zmm1, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpmaddwd %zmm4, %zmm1, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpmaddwd %zmm5, %zmm1, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT: retq
   %7 = bitcast <8 x i64> %0 to <16 x i32>
   %8 = bitcast <8 x i64> %1 to <16 x i32>
@@ -54,9 +57,12 @@
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vpdpwssd -192(%rdi), %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd -128(%rdi), %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd -64(%rdi), %zmm1, %zmm0
-; CHECK-NEXT: vpdpwssd (%rdi), %zmm1, %zmm0
+; CHECK-NEXT: vpmaddwd -128(%rdi), %zmm1, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpmaddwd -64(%rdi), %zmm1, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
+; CHECK-NEXT: vpmaddwd (%rdi), %zmm1, %zmm2
+; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0
 ; CHECK-NEXT: addq $4, %rcx
 ; CHECK-NEXT: addq $256, %rdi # imm = 0x100
 ; CHECK-NEXT: cmpq %rcx, %rdx
@@ -179,8 +185,9 @@
 ; CHECK-NEXT: vmovdqa64 (%rsi,%r8), %zmm2
 ; CHECK-NEXT: vpdpwssd -64(%rdx,%r8), %zmm0, %zmm1
 ; CHECK-NEXT: vmovdqa64 %zmm1, -64(%rsi,%r8)
-; CHECK-NEXT: vpdpwssd (%rdx,%r8), %zmm0, %zmm2
-; CHECK-NEXT: vmovdqa64 %zmm2, (%rsi,%r8)
+; CHECK-NEXT: vpmaddwd (%rdx,%r8), %zmm0, %zmm1
+; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1
+; CHECK-NEXT: vmovdqa64 %zmm1, (%rsi,%r8)
 ; CHECK-NEXT: addq $2, %rcx
 ; CHECK-NEXT: subq $-128, %r8
 ; CHECK-NEXT: cmpq %rcx, %rdi
@@ -190,9 +197,9 @@
 ; CHECK-NEXT: je .LBB2_5
 ; CHECK-NEXT: # %bb.4:
 ; CHECK-NEXT: shlq $6, %rcx
-; CHECK-NEXT: vmovdqa64 (%rsi,%rcx), %zmm1
-; CHECK-NEXT:
vpdpwssd (%rdx,%rcx), %zmm0, %zmm1 -; CHECK-NEXT: vmovdqa64 %zmm1, (%rsi,%rcx) +; CHECK-NEXT: vpmaddwd (%rdx,%rcx), %zmm0, %zmm0 +; CHECK-NEXT: vpaddd (%rsi,%rcx), %zmm0, %zmm0 +; CHECK-NEXT: vmovdqa64 %zmm0, (%rsi,%rcx) ; CHECK-NEXT: .LBB2_5: ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avxvnni-combine.ll b/llvm/test/CodeGen/X86/avxvnni-combine.ll --- a/llvm/test/CodeGen/X86/avxvnni-combine.ll +++ b/llvm/test/CodeGen/X86/avxvnni-combine.ll @@ -1,23 +1,29 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake | FileCheck %s --check-prefixes=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids | FileCheck %s --check-prefixes=AVX -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=AVX512 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=alderlake -verify-machineinstrs| FileCheck %s --check-prefixes=AVX,ADL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sapphirerapids -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,SPR +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server -verify-machineinstrs | FileCheck %s --check-prefixes=AVX512 define <2 x i64> @foo_reg_128(<2 x i64> %0, <2 x i64> %1, <2 x i64> %2, <2 x i64> %3, <2 x i64> %4, <2 x i64> %5) { ; AVX-LABEL: foo_reg_128: ; AVX: # %bb.0: ; AVX-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd %xmm3, %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd %xmm4, %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd %xmm5, %xmm1, %xmm0 +; AVX-NEXT: vpmaddwd %xmm3, %xmm1, %xmm2 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmaddwd %xmm4, %xmm1, %xmm2 +; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 +; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: foo_reg_128: ; AVX512: # %bb.0: ; AVX512-NEXT: vpdpwssd %xmm2, %xmm1, %xmm0 -; AVX512-NEXT: vpdpwssd %xmm3, %xmm1, %xmm0 -; AVX512-NEXT: vpdpwssd %xmm4, %xmm1, %xmm0 -; AVX512-NEXT: vpdpwssd %xmm5, %xmm1, %xmm0 +; AVX512-NEXT: vpmaddwd %xmm3, %xmm1, %xmm2 +; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmaddwd %xmm4, %xmm1, %xmm2 +; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmaddwd %xmm5, %xmm1, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq %7 = bitcast <2 x i64> %0 to <4 x i32> %8 = bitcast <2 x i64> %1 to <4 x i32> @@ -36,49 +42,99 @@ declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>) #1 define <2 x i64> @foo_128(i32 %0, <2 x i64> %1, <2 x i64> %2, ptr %3) { -; AVX-LABEL: foo_128: -; AVX: # %bb.0: -; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB1_6 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %edi, %edx -; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: andl $3, %eax -; AVX-NEXT: cmpl $4, %edi -; AVX-NEXT: jae .LBB1_7 -; AVX-NEXT: # %bb.2: -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB1_3 -; AVX-NEXT: .LBB1_7: -; AVX-NEXT: andl $-4, %edx -; AVX-NEXT: leaq 48(%rsi), %rdi -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd -32(%rdi), %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd -16(%rdi), %xmm1, %xmm0 -; AVX-NEXT: {vex} vpdpwssd (%rdi), %xmm1, %xmm0 -; AVX-NEXT: addq $4, %rcx -; AVX-NEXT: addq $64, %rdi -; AVX-NEXT: cmpq %rcx, %rdx -; AVX-NEXT: jne .LBB1_8 -; AVX-NEXT: .LBB1_3: -; 
AVX-NEXT: testq %rax, %rax -; AVX-NEXT: je .LBB1_6 -; AVX-NEXT: # %bb.4: # %.preheader -; AVX-NEXT: shlq $4, %rcx -; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: shlq $4, %rax -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0 -; AVX-NEXT: addq $16, %rcx -; AVX-NEXT: cmpq %rcx, %rax -; AVX-NEXT: jne .LBB1_5 -; AVX-NEXT: .LBB1_6: -; AVX-NEXT: retq +; ADL-LABEL: foo_128: +; ADL: # %bb.0: +; ADL-NEXT: testl %edi, %edi +; ADL-NEXT: jle .LBB1_6 +; ADL-NEXT: # %bb.1: +; ADL-NEXT: movl %edi, %edx +; ADL-NEXT: movl %edx, %eax +; ADL-NEXT: andl $3, %eax +; ADL-NEXT: cmpl $4, %edi +; ADL-NEXT: jae .LBB1_7 +; ADL-NEXT: # %bb.2: +; ADL-NEXT: xorl %ecx, %ecx +; ADL-NEXT: jmp .LBB1_3 +; ADL-NEXT: .LBB1_7: +; ADL-NEXT: andl $-4, %edx +; ADL-NEXT: leaq 48(%rsi), %rdi +; ADL-NEXT: xorl %ecx, %ecx +; ADL-NEXT: .p2align 4, 0x90 +; ADL-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 +; ADL-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0 +; ADL-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2 +; ADL-NEXT: vpmaddwd -16(%rdi), %xmm1, %xmm3 +; ADL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; ADL-NEXT: vpaddd %xmm3, %xmm0, %xmm0 +; ADL-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2 +; ADL-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; ADL-NEXT: addq $4, %rcx +; ADL-NEXT: addq $64, %rdi +; ADL-NEXT: cmpq %rcx, %rdx +; ADL-NEXT: jne .LBB1_8 +; ADL-NEXT: .LBB1_3: +; ADL-NEXT: testq %rax, %rax +; ADL-NEXT: je .LBB1_6 +; ADL-NEXT: # %bb.4: # %.preheader +; ADL-NEXT: shlq $4, %rcx +; ADL-NEXT: addq %rcx, %rsi +; ADL-NEXT: shlq $4, %rax +; ADL-NEXT: xorl %ecx, %ecx +; ADL-NEXT: .p2align 4, 0x90 +; ADL-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 +; ADL-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0 +; ADL-NEXT: addq $16, %rcx +; ADL-NEXT: cmpq %rcx, %rax +; ADL-NEXT: jne .LBB1_5 +; ADL-NEXT: .LBB1_6: +; ADL-NEXT: retq +; +; SPR-LABEL: foo_128: +; SPR: # %bb.0: +; SPR-NEXT: testl %edi, %edi +; SPR-NEXT: jle .LBB1_6 +; SPR-NEXT: # %bb.1: +; SPR-NEXT: movl %edi, %edx +; SPR-NEXT: movl %edx, %eax +; SPR-NEXT: andl $3, %eax +; SPR-NEXT: cmpl $4, %edi +; SPR-NEXT: jae .LBB1_7 +; SPR-NEXT: # %bb.2: +; SPR-NEXT: xorl %ecx, %ecx +; SPR-NEXT: jmp .LBB1_3 +; SPR-NEXT: .LBB1_7: +; SPR-NEXT: andl $-4, %edx +; SPR-NEXT: leaq 48(%rsi), %rdi +; SPR-NEXT: xorl %ecx, %ecx +; SPR-NEXT: .p2align 4, 0x90 +; SPR-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 +; SPR-NEXT: {vex} vpdpwssd -48(%rdi), %xmm1, %xmm0 +; SPR-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2 +; SPR-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; SPR-NEXT: vpmaddwd -16(%rdi), %xmm1, %xmm2 +; SPR-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; SPR-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2 +; SPR-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; SPR-NEXT: addq $4, %rcx +; SPR-NEXT: addq $64, %rdi +; SPR-NEXT: cmpq %rcx, %rdx +; SPR-NEXT: jne .LBB1_8 +; SPR-NEXT: .LBB1_3: +; SPR-NEXT: testq %rax, %rax +; SPR-NEXT: je .LBB1_6 +; SPR-NEXT: # %bb.4: # %.preheader +; SPR-NEXT: shlq $4, %rcx +; SPR-NEXT: addq %rcx, %rsi +; SPR-NEXT: shlq $4, %rax +; SPR-NEXT: xorl %ecx, %ecx +; SPR-NEXT: .p2align 4, 0x90 +; SPR-NEXT: .LBB1_5: # =>This Inner Loop Header: Depth=1 +; SPR-NEXT: {vex} vpdpwssd (%rsi,%rcx), %xmm1, %xmm0 +; SPR-NEXT: addq $16, %rcx +; SPR-NEXT: cmpq %rcx, %rax +; SPR-NEXT: jne .LBB1_5 +; SPR-NEXT: .LBB1_6: +; SPR-NEXT: retq ; ; AVX512-LABEL: foo_128: ; AVX512: # %bb.0: @@ -100,9 +156,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB1_8: # =>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vpdpwssd -48(%rdi), %xmm1, %xmm0 -; 
AVX512-NEXT: vpdpwssd -32(%rdi), %xmm1, %xmm0 -; AVX512-NEXT: vpdpwssd -16(%rdi), %xmm1, %xmm0 -; AVX512-NEXT: vpdpwssd (%rdi), %xmm1, %xmm0 +; AVX512-NEXT: vpmaddwd -32(%rdi), %xmm1, %xmm2 +; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmaddwd -16(%rdi), %xmm1, %xmm2 +; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 +; AVX512-NEXT: vpmaddwd (%rdi), %xmm1, %xmm2 +; AVX512-NEXT: vpaddd %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: addq $4, %rcx ; AVX512-NEXT: addq $64, %rdi ; AVX512-NEXT: cmpq %rcx, %rdx @@ -197,44 +256,84 @@ } define void @bar_128(i32 %0, ptr %1, <2 x i64> %2, ptr %3) { -; AVX-LABEL: bar_128: -; AVX: # %bb.0: -; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB2_5 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: cmpl $1, %edi -; AVX-NEXT: jne .LBB2_6 -; AVX-NEXT: # %bb.2: -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB2_3 -; AVX-NEXT: .LBB2_6: -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: andl $-2, %edi -; AVX-NEXT: movl $16, %r8d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vmovdqa -16(%rsi,%r8), %xmm1 -; AVX-NEXT: vmovdqa (%rsi,%r8), %xmm2 -; AVX-NEXT: {vex} vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, -16(%rsi,%r8) -; AVX-NEXT: {vex} vpdpwssd (%rdx,%r8), %xmm0, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, (%rsi,%r8) -; AVX-NEXT: addq $2, %rcx -; AVX-NEXT: addq $32, %r8 -; AVX-NEXT: cmpq %rcx, %rdi -; AVX-NEXT: jne .LBB2_7 -; AVX-NEXT: .LBB2_3: -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB2_5 -; AVX-NEXT: # %bb.4: -; AVX-NEXT: shlq $4, %rcx -; AVX-NEXT: vmovdqa (%rsi,%rcx), %xmm1 -; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %xmm0, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, (%rsi,%rcx) -; AVX-NEXT: .LBB2_5: -; AVX-NEXT: retq +; ADL-LABEL: bar_128: +; ADL: # %bb.0: +; ADL-NEXT: testl %edi, %edi +; ADL-NEXT: jle .LBB2_5 +; ADL-NEXT: # %bb.1: +; ADL-NEXT: movl %edi, %eax +; ADL-NEXT: cmpl $1, %edi +; ADL-NEXT: jne .LBB2_6 +; ADL-NEXT: # %bb.2: +; ADL-NEXT: xorl %ecx, %ecx +; ADL-NEXT: jmp .LBB2_3 +; ADL-NEXT: .LBB2_6: +; ADL-NEXT: movl %eax, %edi +; ADL-NEXT: andl $-2, %edi +; ADL-NEXT: movl $16, %r8d +; ADL-NEXT: xorl %ecx, %ecx +; ADL-NEXT: .p2align 4, 0x90 +; ADL-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1 +; ADL-NEXT: vmovdqa (%rsi,%r8), %xmm1 +; ADL-NEXT: vpmaddwd -16(%rdx,%r8), %xmm0, %xmm2 +; ADL-NEXT: vpaddd -16(%rsi,%r8), %xmm2, %xmm2 +; ADL-NEXT: vmovdqa %xmm2, -16(%rsi,%r8) +; ADL-NEXT: {vex} vpdpwssd (%rdx,%r8), %xmm0, %xmm1 +; ADL-NEXT: vmovdqa %xmm1, (%rsi,%r8) +; ADL-NEXT: addq $2, %rcx +; ADL-NEXT: addq $32, %r8 +; ADL-NEXT: cmpq %rcx, %rdi +; ADL-NEXT: jne .LBB2_7 +; ADL-NEXT: .LBB2_3: +; ADL-NEXT: testb $1, %al +; ADL-NEXT: je .LBB2_5 +; ADL-NEXT: # %bb.4: +; ADL-NEXT: shlq $4, %rcx +; ADL-NEXT: vmovdqa (%rsi,%rcx), %xmm1 +; ADL-NEXT: {vex} vpdpwssd (%rdx,%rcx), %xmm0, %xmm1 +; ADL-NEXT: vmovdqa %xmm1, (%rsi,%rcx) +; ADL-NEXT: .LBB2_5: +; ADL-NEXT: retq +; +; SPR-LABEL: bar_128: +; SPR: # %bb.0: +; SPR-NEXT: testl %edi, %edi +; SPR-NEXT: jle .LBB2_5 +; SPR-NEXT: # %bb.1: +; SPR-NEXT: movl %edi, %eax +; SPR-NEXT: cmpl $1, %edi +; SPR-NEXT: jne .LBB2_6 +; SPR-NEXT: # %bb.2: +; SPR-NEXT: xorl %ecx, %ecx +; SPR-NEXT: jmp .LBB2_3 +; SPR-NEXT: .LBB2_6: +; SPR-NEXT: movl %eax, %edi +; SPR-NEXT: andl $-2, %edi +; SPR-NEXT: movl $16, %r8d +; SPR-NEXT: xorl %ecx, %ecx +; SPR-NEXT: .p2align 4, 0x90 +; SPR-NEXT: .LBB2_7: # =>This Inner Loop Header: Depth=1 +; SPR-NEXT: vmovdqa -16(%rsi,%r8), %xmm1 +; SPR-NEXT: vmovdqa (%rsi,%r8), %xmm2 +; SPR-NEXT: {vex} vpdpwssd 
-16(%rdx,%r8), %xmm0, %xmm1 +; SPR-NEXT: vmovdqa %xmm1, -16(%rsi,%r8) +; SPR-NEXT: vpmaddwd (%rdx,%r8), %xmm0, %xmm1 +; SPR-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; SPR-NEXT: vmovdqa %xmm1, (%rsi,%r8) +; SPR-NEXT: addq $2, %rcx +; SPR-NEXT: addq $32, %r8 +; SPR-NEXT: cmpq %rcx, %rdi +; SPR-NEXT: jne .LBB2_7 +; SPR-NEXT: .LBB2_3: +; SPR-NEXT: testb $1, %al +; SPR-NEXT: je .LBB2_5 +; SPR-NEXT: # %bb.4: +; SPR-NEXT: shlq $4, %rcx +; SPR-NEXT: vpmaddwd (%rdx,%rcx), %xmm0, %xmm0 +; SPR-NEXT: vpaddd (%rsi,%rcx), %xmm0, %xmm0 +; SPR-NEXT: vmovdqa %xmm0, (%rsi,%rcx) +; SPR-NEXT: .LBB2_5: +; SPR-NEXT: retq ; ; AVX512-LABEL: bar_128: ; AVX512: # %bb.0: @@ -258,8 +357,9 @@ ; AVX512-NEXT: vmovdqa (%rsi,%r8), %xmm2 ; AVX512-NEXT: vpdpwssd -16(%rdx,%r8), %xmm0, %xmm1 ; AVX512-NEXT: vmovdqa %xmm1, -16(%rsi,%r8) -; AVX512-NEXT: vpdpwssd (%rdx,%r8), %xmm0, %xmm2 -; AVX512-NEXT: vmovdqa %xmm2, (%rsi,%r8) +; AVX512-NEXT: vpmaddwd (%rdx,%r8), %xmm0, %xmm1 +; AVX512-NEXT: vpaddd %xmm1, %xmm2, %xmm1 +; AVX512-NEXT: vmovdqa %xmm1, (%rsi,%r8) ; AVX512-NEXT: addq $2, %rcx ; AVX512-NEXT: addq $32, %r8 ; AVX512-NEXT: cmpq %rcx, %rdi @@ -269,9 +369,9 @@ ; AVX512-NEXT: je .LBB2_5 ; AVX512-NEXT: # %bb.4: ; AVX512-NEXT: shlq $4, %rcx -; AVX512-NEXT: vmovdqa (%rsi,%rcx), %xmm1 -; AVX512-NEXT: vpdpwssd (%rdx,%rcx), %xmm0, %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, (%rsi,%rcx) +; AVX512-NEXT: vpmaddwd (%rdx,%rcx), %xmm0, %xmm0 +; AVX512-NEXT: vpaddd (%rsi,%rcx), %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rsi,%rcx) ; AVX512-NEXT: .LBB2_5: ; AVX512-NEXT: retq %5 = icmp sgt i32 %0, 0 @@ -333,17 +433,23 @@ ; AVX-LABEL: foo_reg_256: ; AVX: # %bb.0: ; AVX-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd %ymm3, %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd %ymm4, %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd %ymm5, %ymm1, %ymm0 +; AVX-NEXT: vpmaddwd %ymm3, %ymm1, %ymm2 +; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpmaddwd %ymm4, %ymm1, %ymm2 +; AVX-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1 +; AVX-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: foo_reg_256: ; AVX512: # %bb.0: ; AVX512-NEXT: vpdpwssd %ymm2, %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd %ymm3, %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd %ymm4, %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd %ymm5, %ymm1, %ymm0 +; AVX512-NEXT: vpmaddwd %ymm3, %ymm1, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpmaddwd %ymm4, %ymm1, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpmaddwd %ymm5, %ymm1, %ymm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX512-NEXT: retq %7 = bitcast <4 x i64> %0 to <8 x i32> %8 = bitcast <4 x i64> %1 to <8 x i32> @@ -369,49 +475,99 @@ ; } define <4 x i64> @foo_256(i32 %0, <4 x i64> %1, <4 x i64> %2, ptr %3) { -; AVX-LABEL: foo_256: -; AVX: # %bb.0: -; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB4_6 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %edi, %edx -; AVX-NEXT: movl %edx, %eax -; AVX-NEXT: andl $3, %eax -; AVX-NEXT: cmpl $4, %edi -; AVX-NEXT: jae .LBB4_7 -; AVX-NEXT: # %bb.2: -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB4_3 -; AVX-NEXT: .LBB4_7: -; AVX-NEXT: andl $-4, %edx -; AVX-NEXT: leaq 96(%rsi), %rdi -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd -64(%rdi), %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd -32(%rdi), %ymm1, %ymm0 -; AVX-NEXT: {vex} vpdpwssd (%rdi), %ymm1, %ymm0 -; AVX-NEXT: addq $4, %rcx -; AVX-NEXT: subq 
$-128, %rdi -; AVX-NEXT: cmpq %rcx, %rdx -; AVX-NEXT: jne .LBB4_8 -; AVX-NEXT: .LBB4_3: -; AVX-NEXT: testq %rax, %rax -; AVX-NEXT: je .LBB4_6 -; AVX-NEXT: # %bb.4: # %.preheader -; AVX-NEXT: shlq $5, %rcx -; AVX-NEXT: addq %rcx, %rsi -; AVX-NEXT: shlq $5, %rax -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0 -; AVX-NEXT: addq $32, %rcx -; AVX-NEXT: cmpq %rcx, %rax -; AVX-NEXT: jne .LBB4_5 -; AVX-NEXT: .LBB4_6: -; AVX-NEXT: retq +; ADL-LABEL: foo_256: +; ADL: # %bb.0: +; ADL-NEXT: testl %edi, %edi +; ADL-NEXT: jle .LBB4_6 +; ADL-NEXT: # %bb.1: +; ADL-NEXT: movl %edi, %edx +; ADL-NEXT: movl %edx, %eax +; ADL-NEXT: andl $3, %eax +; ADL-NEXT: cmpl $4, %edi +; ADL-NEXT: jae .LBB4_7 +; ADL-NEXT: # %bb.2: +; ADL-NEXT: xorl %ecx, %ecx +; ADL-NEXT: jmp .LBB4_3 +; ADL-NEXT: .LBB4_7: +; ADL-NEXT: andl $-4, %edx +; ADL-NEXT: leaq 96(%rsi), %rdi +; ADL-NEXT: xorl %ecx, %ecx +; ADL-NEXT: .p2align 4, 0x90 +; ADL-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1 +; ADL-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0 +; ADL-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2 +; ADL-NEXT: vpmaddwd -32(%rdi), %ymm1, %ymm3 +; ADL-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; ADL-NEXT: vpaddd %ymm3, %ymm0, %ymm0 +; ADL-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2 +; ADL-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; ADL-NEXT: addq $4, %rcx +; ADL-NEXT: subq $-128, %rdi +; ADL-NEXT: cmpq %rcx, %rdx +; ADL-NEXT: jne .LBB4_8 +; ADL-NEXT: .LBB4_3: +; ADL-NEXT: testq %rax, %rax +; ADL-NEXT: je .LBB4_6 +; ADL-NEXT: # %bb.4: # %.preheader +; ADL-NEXT: shlq $5, %rcx +; ADL-NEXT: addq %rcx, %rsi +; ADL-NEXT: shlq $5, %rax +; ADL-NEXT: xorl %ecx, %ecx +; ADL-NEXT: .p2align 4, 0x90 +; ADL-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1 +; ADL-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0 +; ADL-NEXT: addq $32, %rcx +; ADL-NEXT: cmpq %rcx, %rax +; ADL-NEXT: jne .LBB4_5 +; ADL-NEXT: .LBB4_6: +; ADL-NEXT: retq +; +; SPR-LABEL: foo_256: +; SPR: # %bb.0: +; SPR-NEXT: testl %edi, %edi +; SPR-NEXT: jle .LBB4_6 +; SPR-NEXT: # %bb.1: +; SPR-NEXT: movl %edi, %edx +; SPR-NEXT: movl %edx, %eax +; SPR-NEXT: andl $3, %eax +; SPR-NEXT: cmpl $4, %edi +; SPR-NEXT: jae .LBB4_7 +; SPR-NEXT: # %bb.2: +; SPR-NEXT: xorl %ecx, %ecx +; SPR-NEXT: jmp .LBB4_3 +; SPR-NEXT: .LBB4_7: +; SPR-NEXT: andl $-4, %edx +; SPR-NEXT: leaq 96(%rsi), %rdi +; SPR-NEXT: xorl %ecx, %ecx +; SPR-NEXT: .p2align 4, 0x90 +; SPR-NEXT: .LBB4_8: # =>This Inner Loop Header: Depth=1 +; SPR-NEXT: {vex} vpdpwssd -96(%rdi), %ymm1, %ymm0 +; SPR-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2 +; SPR-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; SPR-NEXT: vpmaddwd -32(%rdi), %ymm1, %ymm2 +; SPR-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; SPR-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2 +; SPR-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; SPR-NEXT: addq $4, %rcx +; SPR-NEXT: subq $-128, %rdi +; SPR-NEXT: cmpq %rcx, %rdx +; SPR-NEXT: jne .LBB4_8 +; SPR-NEXT: .LBB4_3: +; SPR-NEXT: testq %rax, %rax +; SPR-NEXT: je .LBB4_6 +; SPR-NEXT: # %bb.4: # %.preheader +; SPR-NEXT: shlq $5, %rcx +; SPR-NEXT: addq %rcx, %rsi +; SPR-NEXT: shlq $5, %rax +; SPR-NEXT: xorl %ecx, %ecx +; SPR-NEXT: .p2align 4, 0x90 +; SPR-NEXT: .LBB4_5: # =>This Inner Loop Header: Depth=1 +; SPR-NEXT: {vex} vpdpwssd (%rsi,%rcx), %ymm1, %ymm0 +; SPR-NEXT: addq $32, %rcx +; SPR-NEXT: cmpq %rcx, %rax +; SPR-NEXT: jne .LBB4_5 +; SPR-NEXT: .LBB4_6: +; SPR-NEXT: retq ; ; AVX512-LABEL: foo_256: ; AVX512: # %bb.0: @@ -433,9 +589,12 @@ ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB4_8: # 
=>This Inner Loop Header: Depth=1 ; AVX512-NEXT: vpdpwssd -96(%rdi), %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd -64(%rdi), %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd -32(%rdi), %ymm1, %ymm0 -; AVX512-NEXT: vpdpwssd (%rdi), %ymm1, %ymm0 +; AVX512-NEXT: vpmaddwd -64(%rdi), %ymm1, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpmaddwd -32(%rdi), %ymm1, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpmaddwd (%rdi), %ymm1, %ymm2 +; AVX512-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ; AVX512-NEXT: addq $4, %rcx ; AVX512-NEXT: subq $-128, %rdi ; AVX512-NEXT: cmpq %rcx, %rdx @@ -537,45 +696,86 @@ ; } ; } define void @bar_256(i32 %0, ptr %1, <4 x i64> %2, ptr %3) { -; AVX-LABEL: bar_256: -; AVX: # %bb.0: -; AVX-NEXT: testl %edi, %edi -; AVX-NEXT: jle .LBB5_5 -; AVX-NEXT: # %bb.1: -; AVX-NEXT: movl %edi, %eax -; AVX-NEXT: cmpl $1, %edi -; AVX-NEXT: jne .LBB5_6 -; AVX-NEXT: # %bb.2: -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: jmp .LBB5_3 -; AVX-NEXT: .LBB5_6: -; AVX-NEXT: movl %eax, %edi -; AVX-NEXT: andl $-2, %edi -; AVX-NEXT: movl $32, %r8d -; AVX-NEXT: xorl %ecx, %ecx -; AVX-NEXT: .p2align 4, 0x90 -; AVX-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vmovdqa -32(%rsi,%r8), %ymm1 -; AVX-NEXT: vmovdqa (%rsi,%r8), %ymm2 -; AVX-NEXT: {vex} vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1 -; AVX-NEXT: vmovdqa %ymm1, -32(%rsi,%r8) -; AVX-NEXT: {vex} vpdpwssd (%rdx,%r8), %ymm0, %ymm2 -; AVX-NEXT: vmovdqa %ymm2, (%rsi,%r8) -; AVX-NEXT: addq $2, %rcx -; AVX-NEXT: addq $64, %r8 -; AVX-NEXT: cmpq %rcx, %rdi -; AVX-NEXT: jne .LBB5_7 -; AVX-NEXT: .LBB5_3: -; AVX-NEXT: testb $1, %al -; AVX-NEXT: je .LBB5_5 -; AVX-NEXT: # %bb.4: -; AVX-NEXT: shlq $5, %rcx -; AVX-NEXT: vmovdqa (%rsi,%rcx), %ymm1 -; AVX-NEXT: {vex} vpdpwssd (%rdx,%rcx), %ymm0, %ymm1 -; AVX-NEXT: vmovdqa %ymm1, (%rsi,%rcx) -; AVX-NEXT: .LBB5_5: -; AVX-NEXT: vzeroupper -; AVX-NEXT: retq +; ADL-LABEL: bar_256: +; ADL: # %bb.0: +; ADL-NEXT: testl %edi, %edi +; ADL-NEXT: jle .LBB5_5 +; ADL-NEXT: # %bb.1: +; ADL-NEXT: movl %edi, %eax +; ADL-NEXT: cmpl $1, %edi +; ADL-NEXT: jne .LBB5_6 +; ADL-NEXT: # %bb.2: +; ADL-NEXT: xorl %ecx, %ecx +; ADL-NEXT: jmp .LBB5_3 +; ADL-NEXT: .LBB5_6: +; ADL-NEXT: movl %eax, %edi +; ADL-NEXT: andl $-2, %edi +; ADL-NEXT: movl $32, %r8d +; ADL-NEXT: xorl %ecx, %ecx +; ADL-NEXT: .p2align 4, 0x90 +; ADL-NEXT: .LBB5_7: # =>This Inner Loop Header: Depth=1 +; ADL-NEXT: vmovdqa (%rsi,%r8), %ymm1 +; ADL-NEXT: vpmaddwd -32(%rdx,%r8), %ymm0, %ymm2 +; ADL-NEXT: vpaddd -32(%rsi,%r8), %ymm2, %ymm2 +; ADL-NEXT: vmovdqa %ymm2, -32(%rsi,%r8) +; ADL-NEXT: {vex} vpdpwssd (%rdx,%r8), %ymm0, %ymm1 +; ADL-NEXT: vmovdqa %ymm1, (%rsi,%r8) +; ADL-NEXT: addq $2, %rcx +; ADL-NEXT: addq $64, %r8 +; ADL-NEXT: cmpq %rcx, %rdi +; ADL-NEXT: jne .LBB5_7 +; ADL-NEXT: .LBB5_3: +; ADL-NEXT: testb $1, %al +; ADL-NEXT: je .LBB5_5 +; ADL-NEXT: # %bb.4: +; ADL-NEXT: shlq $5, %rcx +; ADL-NEXT: vmovdqa (%rsi,%rcx), %ymm1 +; ADL-NEXT: {vex} vpdpwssd (%rdx,%rcx), %ymm0, %ymm1 +; ADL-NEXT: vmovdqa %ymm1, (%rsi,%rcx) +; ADL-NEXT: .LBB5_5: +; ADL-NEXT: vzeroupper +; ADL-NEXT: retq +; +; SPR-LABEL: bar_256: +; SPR: # %bb.0: +; SPR-NEXT: testl %edi, %edi +; SPR-NEXT: jle .LBB5_5 +; SPR-NEXT: # %bb.1: +; SPR-NEXT: movl %edi, %eax +; SPR-NEXT: cmpl $1, %edi +; SPR-NEXT: jne .LBB5_6 +; SPR-NEXT: # %bb.2: +; SPR-NEXT: xorl %ecx, %ecx +; SPR-NEXT: jmp .LBB5_3 +; SPR-NEXT: .LBB5_6: +; SPR-NEXT: movl %eax, %edi +; SPR-NEXT: andl $-2, %edi +; SPR-NEXT: movl $32, %r8d +; SPR-NEXT: xorl %ecx, %ecx +; SPR-NEXT: .p2align 4, 0x90 +; SPR-NEXT: .LBB5_7: # =>This 
Inner Loop Header: Depth=1 +; SPR-NEXT: vmovdqa -32(%rsi,%r8), %ymm1 +; SPR-NEXT: vmovdqa (%rsi,%r8), %ymm2 +; SPR-NEXT: {vex} vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1 +; SPR-NEXT: vmovdqa %ymm1, -32(%rsi,%r8) +; SPR-NEXT: vpmaddwd (%rdx,%r8), %ymm0, %ymm1 +; SPR-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; SPR-NEXT: vmovdqa %ymm1, (%rsi,%r8) +; SPR-NEXT: addq $2, %rcx +; SPR-NEXT: addq $64, %r8 +; SPR-NEXT: cmpq %rcx, %rdi +; SPR-NEXT: jne .LBB5_7 +; SPR-NEXT: .LBB5_3: +; SPR-NEXT: testb $1, %al +; SPR-NEXT: je .LBB5_5 +; SPR-NEXT: # %bb.4: +; SPR-NEXT: shlq $5, %rcx +; SPR-NEXT: vpmaddwd (%rdx,%rcx), %ymm0, %ymm0 +; SPR-NEXT: vpaddd (%rsi,%rcx), %ymm0, %ymm0 +; SPR-NEXT: vmovdqa %ymm0, (%rsi,%rcx) +; SPR-NEXT: .LBB5_5: +; SPR-NEXT: vzeroupper +; SPR-NEXT: retq ; ; AVX512-LABEL: bar_256: ; AVX512: # %bb.0: @@ -599,8 +799,9 @@ ; AVX512-NEXT: vmovdqa (%rsi,%r8), %ymm2 ; AVX512-NEXT: vpdpwssd -32(%rdx,%r8), %ymm0, %ymm1 ; AVX512-NEXT: vmovdqa %ymm1, -32(%rsi,%r8) -; AVX512-NEXT: vpdpwssd (%rdx,%r8), %ymm0, %ymm2 -; AVX512-NEXT: vmovdqa %ymm2, (%rsi,%r8) +; AVX512-NEXT: vpmaddwd (%rdx,%r8), %ymm0, %ymm1 +; AVX512-NEXT: vpaddd %ymm1, %ymm2, %ymm1 +; AVX512-NEXT: vmovdqa %ymm1, (%rsi,%r8) ; AVX512-NEXT: addq $2, %rcx ; AVX512-NEXT: addq $64, %r8 ; AVX512-NEXT: cmpq %rcx, %rdi @@ -610,9 +811,9 @@ ; AVX512-NEXT: je .LBB5_5 ; AVX512-NEXT: # %bb.4: ; AVX512-NEXT: shlq $5, %rcx -; AVX512-NEXT: vmovdqa (%rsi,%rcx), %ymm1 -; AVX512-NEXT: vpdpwssd (%rdx,%rcx), %ymm0, %ymm1 -; AVX512-NEXT: vmovdqa %ymm1, (%rsi,%rcx) +; AVX512-NEXT: vpmaddwd (%rdx,%rcx), %ymm0, %ymm0 +; AVX512-NEXT: vpaddd (%rsi,%rcx), %ymm0, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rsi,%rcx) ; AVX512-NEXT: .LBB5_5: ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq
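Illustrative sketch (not part of the patch): the pattern targets accumulator chains like the existing foo_reg_* tests above. The function below is hypothetical, written against the same @llvm.x86.avx512.vpdpwssd.128 intrinsic the tests declare; with this change, every vpdpwssd after the first is expected to be split into a vpmaddwd off the critical path plus a vpaddd, shortening the dependency chain through the accumulator.

; Hypothetical example, modeled on foo_reg_128 (not a test in this patch).
declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)

define <4 x i32> @dpwssd_chain(<4 x i32> %acc, <4 x i32> %a, <4 x i32> %b0, <4 x i32> %b1) {
  ; Each call accumulates into the result of the previous one, so only the
  ; additions are serialized once the multiplies are peeled off.
  %t0 = tail call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %acc, <4 x i32> %a, <4 x i32> %b0)
  %t1 = tail call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %t0, <4 x i32> %a, <4 x i32> %b1)
  ret <4 x i32> %t1
}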