Index: llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
===================================================================
--- llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -38,6 +38,8 @@
 #include "X86Subtarget.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -49,6 +51,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/BranchProbability.h"
 
 using namespace llvm;
 
@@ -64,6 +67,10 @@
              "inspect for store forwarding blocks."),
     cl::init(20), cl::Hidden);
 
+namespace llvm {
+extern cl::opt<unsigned> StaticLikelyProb;
+} // namespace llvm
+
 namespace {
 
 using DisplacementSizeMap = std::map<int64_t, unsigned>;
@@ -71,7 +78,7 @@
 class X86AvoidSFBPass : public MachineFunctionPass {
 public:
   static char ID;
-  X86AvoidSFBPass() : MachineFunctionPass(ID) { }
+  X86AvoidSFBPass() : MachineFunctionPass(ID) {}
 
   StringRef getPassName() const override {
     return "X86 Avoid Store Forwarding Blocks";
@@ -82,10 +89,16 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     MachineFunctionPass::getAnalysisUsage(AU);
     AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<MachineBlockFrequencyInfo>();
+    AU.addPreserved<MachineBlockFrequencyInfo>();
+    AU.addRequired<MachineBranchProbabilityInfo>();
+    AU.addPreserved<MachineBranchProbabilityInfo>();
   }
 
 private:
   MachineRegisterInfo *MRI = nullptr;
+  MachineBlockFrequencyInfo *MBFI = nullptr;
+  MachineBranchProbabilityInfo *MBPI = nullptr;
   const X86InstrInfo *TII = nullptr;
   const X86RegisterInfo *TRI = nullptr;
   SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
@@ -120,11 +133,12 @@
 
 char X86AvoidSFBPass::ID = 0;
 
-INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE, "Machine code sinking",
-                      false, false)
+INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE,
+                      "X86 avoid store forward block", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE, "Machine code sinking", false,
-                    false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE,
+                    "X86 avoid store forward block", false, false)
 
 FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() {
   return new X86AvoidSFBPass();
@@ -315,7 +329,8 @@
   const MachineOperand &Disp = getDispOperand(MI);
   const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
   const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
-  const MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
+  const MachineOperand &Segment =
+      MI->getOperand(AddrOffset + X86::AddrSegmentReg);
 
   if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
     return false;
@@ -361,6 +376,9 @@
     MachineBasicBlock *MBB = LoadInst->getParent();
     int LimitLeft = InspectionLimit - BlockCount;
     for (MachineBasicBlock *PMBB : MBB->predecessors()) {
+      // The address accessed in a self-loop may change on every iteration.
+      if (PMBB == MBB)
+        continue;
       int PredCount = 0;
       for (MachineInstr &PBInst : llvm::reverse(*PMBB)) {
         if (PBInst.isMetaInstruction())
@@ -544,8 +562,8 @@
     if (StoreMI.getParent() == MI.getParent() &&
         isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
         isRelevantAddressingMode(&MI) &&
-        isRelevantAddressingMode(&StoreMI) &&
-        MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
+        isRelevantAddressingMode(&StoreMI) && MI.hasOneMemOperand() &&
+        StoreMI.hasOneMemOperand()) {
       if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
         BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
     }
@@ -555,7 +573,7 @@
 
 unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
   const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
-                                    *LoadInst->getParent()->getParent());
+                                     *LoadInst->getParent()->getParent());
   return TRI->getRegSizeInBits(*TRC) / 8;
 }
 
@@ -671,6 +689,8 @@
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
   TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+  MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";);
   // Look for a load then a store to XMM/YMM which look like a memcpy
   findPotentiallylBlockedCopies(MF);
@@ -680,6 +700,10 @@
     int64_t LdDispImm = getDispOperand(LoadInst).getImm();
     DisplacementSizeMap BlockingStoresDispSizeMap;
 
+    BlockFrequency WeightedFreq;
+    auto *LoadMBB = LoadInst->getParent();
+    const MachineBasicBlock *LastPredMBB = nullptr;
+    SmallVector<std::pair<int64_t, unsigned>, 2> PredStoreInfo;
     SmallVector<MachineInstr *, 2> PotentialBlockers =
         findPotentialBlockers(LoadInst);
     for (auto *PBInst : PotentialBlockers) {
@@ -690,14 +714,39 @@
       int64_t PBstDispImm = getDispOperand(PBInst).getImm();
       unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
       // This check doesn't cover all cases, but it will suffice for now.
-      // TODO: take branch probability into consideration, if the blocking
-      // store is in an unreached block, breaking the memcopy could lose
-      // performance.
       if (hasSameBaseOpValue(LoadInst, PBInst) &&
           isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm,
-                          PBstSize))
-        updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm,
-                                        PBstSize);
+                          PBstSize)) {
+        auto *StoreMBB = PBInst->getParent();
+        if (StoreMBB != LoadMBB) {
+          PredStoreInfo.push_back({PBstDispImm, PBstSize});
+          if (LastPredMBB != StoreMBB) {
+            WeightedFreq += MBFI->getBlockFreq(StoreMBB) *
+                            MBPI->getEdgeProbability(StoreMBB, LoadMBB);
+            LastPredMBB = StoreMBB;
+          }
+        } else {
+          updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap,
+                                          PBstDispImm, PBstSize);
+        }
+      }
+    }
+
+    // Take branch probability into consideration: if the blocking store is
+    // in a rarely reached block, breaking the memcpy could lose
+    // performance.
+    if (WeightedFreq.getFrequency() != 0) {
+      BlockFrequency TotalWeightedFreq;
+      for (auto *PMBB : LoadMBB->predecessors())
+        TotalWeightedFreq +=
+            MBFI->getBlockFreq(PMBB) * MBPI->getEdgeProbability(PMBB, LoadMBB);
+      auto HotProb =
+          BranchProbability::getBranchProbability(StaticLikelyProb, 100);
+      if (WeightedFreq >= TotalWeightedFreq * HotProb) {
+        for (auto [PBstDispImm, PBstSize] : PredStoreInfo)
+          updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap,
+                                          PBstDispImm, PBstSize);
+      }
     }
 
     if (BlockingStoresDispSizeMap.empty())
Index: llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
===================================================================
--- llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
+++ llvm/test/CodeGen/X86/avoid-sfb-g-no-change3.mir
@@ -1,3 +1,4 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 # RUN: llc %s -run-pass x86-avoid-SFB -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s
 #
 # This was generated from:
@@ -18,7 +19,7 @@
 # llc -stop-before=x86-avoid-SFB test.ll -o before.mir
 #
 # The IR has been modified to include a number of debug/meta instructions between the initial store and load.
-# If the code treats debug/meta instructions as normal instructions, the separation between the store and load 
+# If the code treats debug/meta instructions as normal instructions, the separation between the store and load
 # will be too great to enable the optimization.
 
 --- |
@@ -26,7 +27,7 @@
   source_filename = "1.cpp"
   target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
   target triple = "x86_64-unknown-unknown"
-  
+
   ; Function Attrs: norecurse nounwind uwtable
   define dso_local void @debug(<4 x float>* noalias %p1, <4 x float>* noalias nocapture %p2) local_unnamed_addr #0 !dbg !10 {
   entry:
@@ -56,31 +57,31 @@
     call void @llvm.dbg.value(metadata i8* %0, metadata !23, metadata !DIExpression()), !dbg !27
     call void @llvm.dbg.value(metadata i8* %0, metadata !23, metadata !DIExpression()), !dbg !27
     br i1 %tobool, label %if.end, label %if.then, !dbg !31
-  
+
   if.then:                                          ; preds = %entry
     %1 = load <4 x float>, <4 x float>* %p1, align 16, !dbg !32
     call void @llvm.dbg.value(metadata <4 x float> %1, metadata !24, metadata !DIExpression()), !dbg !33
     store <4 x float> %1, <4 x float>* %p2, align 16, !dbg !34
     br label %if.end, !dbg !35
-  
+
   if.end:                                           ; preds = %if.then, %entry
     ret void, !dbg !36
   }
 
   ; Function Attrs: nounwind readnone speculatable
   declare void @llvm.dbg.value(metadata, metadata, metadata) #1
-  
+
   ; Function Attrs: nounwind
   declare void @llvm.stackprotector(i8*, i8**) #2
-  
+
   attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "frame-pointer"="all" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
   attributes #1 = { nounwind readnone speculatable }
   attributes #2 = { nounwind }
-  
+
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!6, !7, !8}
   !llvm.ident = !{!9}
-  
+
   !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 9.0.0 (https://github.com/llvm/llvm-project.git 1a0312ca0b20d16edb859065bbace75f6701c92e)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None)
   !1 = !DIFile(filename: "1.cpp", directory: "/mnt/c/Users/gbdawsoc/Documents/llvm/bg40969")
"1.cpp", directory: "/mnt/c/Users/gbdawsoc/Documents/llvm/bg40969") !2 = !{} @@ -130,14 +131,14 @@ failedISel: false tracksRegLiveness: true hasWinCFI: false -registers: +registers: - { id: 0, class: gr64, preferred-register: '' } - { id: 1, class: gr64, preferred-register: '' } - { id: 2, class: vr128, preferred-register: '' } -liveins: +liveins: - { reg: '$rdi', virtual-reg: '%0' } - { reg: '$rsi', virtual-reg: '%1' } -frameInfo: +frameInfo: isFrameAddressTaken: false isReturnAddressTaken: false hasStackMap: false @@ -161,10 +162,65 @@ constants: [] machineFunctionInfo: {} body: | + ; CHECK-LABEL: name: debug + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.2(0x30000000), %bb.1(0x50000000) + ; CHECK-NEXT: liveins: $rdi, $rsi + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: DBG_VALUE $rdi, $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE $rsi, $noreg, !22, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE $rdi, $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi + ; CHECK-NEXT: DBG_VALUE [[COPY]], $noreg, !22, !DIExpression(), debug-location !27 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY $rdi + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: TEST64rr [[COPY1]], [[COPY1]], implicit-def $eflags, debug-location !28 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: MOV8mi [[COPY1]], 1, $noreg, 0, $noreg, 0, debug-location !30 :: (store (s8) into %ir.0) + ; CHECK-NEXT: CFI_INSTRUCTION offset $r13, -123 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !21, !DIExpression(), debug-location !27 + ; CHECK-NEXT: DBG_VALUE [[COPY1]], $noreg, !23, !DIExpression(), debug-location !27 + ; CHECK-NEXT: JCC_1 %bb.2, 4, implicit $eflags, debug-location !31 + ; CHECK-NEXT: JMP_1 %bb.1, debug-location !31 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.if.then: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[MOV8rm:%[0-9]+]]:gr8 = MOV8rm [[COPY1]], 1, $noreg, 
+  ; CHECK-NEXT:   MOV8mr [[COPY]], 1, $noreg, 0, $noreg, killed [[MOV8rm]], debug-location !32 :: (store (s8) into %ir.p2, align 16)
+  ; CHECK-NEXT:   [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm [[COPY1]], 1, $noreg, 1, $noreg, debug-location !32 :: (load (s64) from %ir.p1 + 1, align 1, basealign 16)
+  ; CHECK-NEXT:   MOV64mr [[COPY]], 1, $noreg, 1, $noreg, killed [[MOV64rm]], debug-location !32 :: (store (s64) into %ir.p2 + 1, align 1, basealign 16)
+  ; CHECK-NEXT:   [[MOV32rm:%[0-9]+]]:gr32 = MOV32rm [[COPY1]], 1, $noreg, 9, $noreg, debug-location !32 :: (load (s32) from %ir.p1 + 9, align 1, basealign 16)
+  ; CHECK-NEXT:   MOV32mr [[COPY]], 1, $noreg, 9, $noreg, killed [[MOV32rm]], debug-location !32 :: (store (s32) into %ir.p2 + 9, align 1, basealign 16)
+  ; CHECK-NEXT:   [[MOV16rm:%[0-9]+]]:gr16 = MOV16rm [[COPY1]], 1, $noreg, 13, $noreg, debug-location !32 :: (load (s16) from %ir.p1 + 13, align 1, basealign 16)
+  ; CHECK-NEXT:   MOV16mr [[COPY]], 1, $noreg, 13, $noreg, killed [[MOV16rm]], debug-location !32 :: (store (s16) into %ir.p2 + 13, align 1, basealign 16)
+  ; CHECK-NEXT:   [[MOV8rm1:%[0-9]+]]:gr8 = MOV8rm [[COPY1]], 1, $noreg, 15, $noreg, debug-location !32 :: (load (s8) from %ir.p1 + 15, basealign 16)
+  ; CHECK-NEXT:   MOV8mr [[COPY]], 1, $noreg, 15, $noreg, killed [[MOV8rm1]], debug-location !32 :: (store (s8) into %ir.p2 + 15, basealign 16)
+  ; CHECK-NEXT:   DBG_VALUE %2:vr128, $noreg, !24, !DIExpression(), debug-location !33
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2.if.end:
+  ; CHECK-NEXT:   RET 0, debug-location !36
   bb.0.entry:
     successors: %bb.2(0x30000000), %bb.1(0x50000000)
     liveins: $rdi, $rsi
-  
+
     DBG_VALUE $rdi, $noreg, !21, !DIExpression(), debug-location !27
     DBG_VALUE $rsi, $noreg, !22, !DIExpression(), debug-location !27
     DBG_VALUE $rdi, $noreg, !23, !DIExpression(), debug-location !27
@@ -197,27 +253,15 @@
     DBG_VALUE %0, $noreg, !23, !DIExpression(), debug-location !27
     JCC_1 %bb.2, 4, implicit $eflags, debug-location !31
     JMP_1 %bb.1, debug-location !31
-  
+
   bb.1.if.then:
     successors: %bb.2(0x80000000)
-  
+
     %2:vr128 = MOVAPSrm %0, 1, $noreg, 0, $noreg, debug-location !32 :: (load (s128) from %ir.p1)
     DBG_VALUE %2, $noreg, !24, !DIExpression(), debug-location !33
     MOVAPSmr %1, 1, $noreg, 0, $noreg, killed %2, debug-location !34 :: (store (s128) into %ir.p2)
-  
+
   bb.2.if.end:
     RET 0, debug-location !36
 
-  ; CHECK-LABEL: name: debug
-  ; CHECK: %3:gr8 = MOV8rm
-  ; CHECK: MOV8mr
-  ; CHECK: %4:gr64 = MOV64rm
-  ; CHECK: MOV64mr
-  ; CHECK: %5:gr32 = MOV32rm
-  ; CHECK: MOV32mr
-  ; CHECK: %6:gr16 = MOV16rm
-  ; CHECK: MOV16mr
-  ; CHECK: %7:gr8 = MOV8rm
-  ; CHECK: MOV8mr
-  ; CHECK: RET 0
 ...
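A note on the heuristic added in the C++ hunks above, before reading the assembly-level tests that follow: a blocking store found in a predecessor block is no longer taken at face value. The pass accumulates freq(Pred) * prob(Pred -> LoadMBB) over the distinct storing predecessors (the LastPredMBB guard counts each block once) and splits the copy only when that sum reaches StaticLikelyProb percent of the total frequency flowing into the load's block. Below is a minimal standalone sketch of that decision in plain integer arithmetic; the struct and function names are hypothetical, and LLVM's real BlockFrequency/BranchProbability types use saturating fixed-point math instead:

#include <cstdint>
#include <vector>

// One incoming CFG edge into the block containing the wide load.
struct PredEdge {
  uint64_t BlockFreq;       // stands in for MBFI->getBlockFreq(Pred)
  uint32_t EdgeProbPercent; // stands in for MBPI->getEdgeProbability(Pred, LoadMBB)
  bool HasBlockingStore;    // predecessor contains a store overlapping the load
};

// Mirrors the patch's "WeightedFreq >= TotalWeightedFreq * HotProb" test:
// treat predecessor stores as blocking only when the storing predecessors
// carry at least StaticLikelyProb percent of the incoming frequency.
bool storingPredsAreHot(const std::vector<PredEdge> &Preds,
                        unsigned StaticLikelyProb) {
  uint64_t Weighted = 0, Total = 0;
  for (const PredEdge &P : Preds) {
    uint64_t F = P.BlockFreq * P.EdgeProbPercent; // weighted edge frequency
    Total += F;
    if (P.HasBlockingStore)
      Weighted += F;
  }
  // Cross-multiply rather than divide to stay in integer arithmetic, the
  // same trick BranchProbability uses internally.
  return Weighted * 100 >= Total * StaticLikelyProb;
}

This is why the avoid-sfb.ll expectations below flip from split scalar copies back to plain movups/vmovups pairs (a conditionally executed if.then predecessor is not statically hot enough), while the single-predecessor MIR test above still splits: its one storing predecessor carries 100% of the incoming frequency.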
Index: llvm/test/CodeGen/X86/avoid-sfb.ll
===================================================================
--- llvm/test/CodeGen/X86/avoid-sfb.ll
+++ llvm/test/CodeGen/X86/avoid-sfb.ll
@@ -20,12 +20,8 @@
 ; CHECK-NEXT:  .LBB0_2: # %if.end
 ; CHECK-NEXT:    movups (%r8), %xmm0
 ; CHECK-NEXT:    movups %xmm0, (%rcx)
-; CHECK-NEXT:    movl (%rdi), %eax
-; CHECK-NEXT:    movl %eax, (%rsi)
-; CHECK-NEXT:    movl 4(%rdi), %eax
-; CHECK-NEXT:    movl %eax, 4(%rsi)
-; CHECK-NEXT:    movq 8(%rdi), %rax
-; CHECK-NEXT:    movq %rax, 8(%rsi)
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    movups %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
 ;
 ; DISABLED-LABEL: test_conditional_block:
@@ -50,12 +46,8 @@
 ; CHECK-AVX2-NEXT:  .LBB0_2: # %if.end
 ; CHECK-AVX2-NEXT:    vmovups (%r8), %xmm0
 ; CHECK-AVX2-NEXT:    vmovups %xmm0, (%rcx)
-; CHECK-AVX2-NEXT:    movl (%rdi), %eax
-; CHECK-AVX2-NEXT:    movl %eax, (%rsi)
-; CHECK-AVX2-NEXT:    movl 4(%rdi), %eax
-; CHECK-AVX2-NEXT:    movl %eax, 4(%rsi)
-; CHECK-AVX2-NEXT:    movq 8(%rdi), %rax
-; CHECK-AVX2-NEXT:    movq %rax, 8(%rsi)
+; CHECK-AVX2-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-AVX2-NEXT:    vmovups %xmm0, (%rsi)
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: test_conditional_block:
@@ -67,12 +59,8 @@
 ; CHECK-AVX512-NEXT:  .LBB0_2: # %if.end
 ; CHECK-AVX512-NEXT:    vmovups (%r8), %xmm0
 ; CHECK-AVX512-NEXT:    vmovups %xmm0, (%rcx)
-; CHECK-AVX512-NEXT:    movl (%rdi), %eax
-; CHECK-AVX512-NEXT:    movl %eax, (%rsi)
-; CHECK-AVX512-NEXT:    movl 4(%rdi), %eax
-; CHECK-AVX512-NEXT:    movl %eax, 4(%rsi)
-; CHECK-AVX512-NEXT:    movq 8(%rdi), %rax
-; CHECK-AVX512-NEXT:    movq %rax, 8(%rsi)
+; CHECK-AVX512-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-AVX512-NEXT:    vmovups %xmm0, (%rsi)
 ; CHECK-AVX512-NEXT:    retq
 entry:
   %cmp = icmp sgt i32 %x, 17
@@ -157,12 +145,8 @@
 ; CHECK-NEXT:  .LBB2_4: # %if.end3
 ; CHECK-NEXT:    movups (%r8), %xmm0
 ; CHECK-NEXT:    movups %xmm0, (%rcx)
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq %rax, (%rsi)
-; CHECK-NEXT:    movl 8(%rdi), %eax
-; CHECK-NEXT:    movl %eax, 8(%rsi)
-; CHECK-NEXT:    movl 12(%rdi), %eax
-; CHECK-NEXT:    movl %eax, 12(%rsi)
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    movups %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
 ;
 ; DISABLED-LABEL: test_nondirect_br:
@@ -197,12 +181,8 @@
 ; CHECK-AVX2-NEXT:  .LBB2_4: # %if.end3
 ; CHECK-AVX2-NEXT:    vmovups (%r8), %xmm0
 ; CHECK-AVX2-NEXT:    vmovups %xmm0, (%rcx)
-; CHECK-AVX2-NEXT:    movq (%rdi), %rax
-; CHECK-AVX2-NEXT:    movq %rax, (%rsi)
-; CHECK-AVX2-NEXT:    movl 8(%rdi), %eax
-; CHECK-AVX2-NEXT:    movl %eax, 8(%rsi)
-; CHECK-AVX2-NEXT:    movl 12(%rdi), %eax
-; CHECK-AVX2-NEXT:    movl %eax, 12(%rsi)
+; CHECK-AVX2-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-AVX2-NEXT:    vmovups %xmm0, (%rsi)
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: test_nondirect_br:
@@ -219,12 +199,8 @@
 ; CHECK-AVX512-NEXT:  .LBB2_4: # %if.end3
 ; CHECK-AVX512-NEXT:    vmovups (%r8), %xmm0
 ; CHECK-AVX512-NEXT:    vmovups %xmm0, (%rcx)
-; CHECK-AVX512-NEXT:    movq (%rdi), %rax
-; CHECK-AVX512-NEXT:    movq %rax, (%rsi)
-; CHECK-AVX512-NEXT:    movl 8(%rdi), %eax
-; CHECK-AVX512-NEXT:    movl %eax, 8(%rsi)
-; CHECK-AVX512-NEXT:    movl 12(%rdi), %eax
-; CHECK-AVX512-NEXT:    movl %eax, 12(%rsi)
+; CHECK-AVX512-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-AVX512-NEXT:    vmovups %xmm0, (%rsi)
 ; CHECK-AVX512-NEXT:    retq
 entry:
   %cmp = icmp sgt i32 %x, 17
@@ -355,10 +331,8 @@
 ; CHECK-NEXT:  .LBB4_2: # %if.end
 ; CHECK-NEXT:    movups (%r8), %xmm0
 ; CHECK-NEXT:    movups %xmm0, (%rcx)
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq %rax, (%rsi)
-; CHECK-NEXT:    movq 8(%rdi), %rax
-; CHECK-NEXT:    movq %rax, 8(%rsi)
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    movups %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
 ;
 ; DISABLED-LABEL: test_type64:
@@ -385,10 +359,8 @@
 ; CHECK-AVX2-NEXT:  .LBB4_2: # %if.end
 ; CHECK-AVX2-NEXT:    vmovups (%r8), %xmm0
 ; CHECK-AVX2-NEXT:    vmovups %xmm0, (%rcx)
-; CHECK-AVX2-NEXT:    movq (%rdi), %rax
-; CHECK-AVX2-NEXT:    movq %rax, (%rsi)
-; CHECK-AVX2-NEXT:    movq 8(%rdi), %rax
-; CHECK-AVX2-NEXT:    movq %rax, 8(%rsi)
+; CHECK-AVX2-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-AVX2-NEXT:    vmovups %xmm0, (%rsi)
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: test_type64:
@@ -401,10 +373,8 @@
 ; CHECK-AVX512-NEXT:  .LBB4_2: # %if.end
 ; CHECK-AVX512-NEXT:    vmovups (%r8), %xmm0
 ; CHECK-AVX512-NEXT:    vmovups %xmm0, (%rcx)
-; CHECK-AVX512-NEXT:    movq (%rdi), %rax
-; CHECK-AVX512-NEXT:    movq %rax, (%rsi)
-; CHECK-AVX512-NEXT:    movq 8(%rdi), %rax
-; CHECK-AVX512-NEXT:    movq %rax, 8(%rsi)
+; CHECK-AVX512-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-AVX512-NEXT:    vmovups %xmm0, (%rsi)
 ; CHECK-AVX512-NEXT:    retq
 entry:
   %cmp = icmp sgt i32 %x, 17
@@ -434,16 +404,8 @@
 ; CHECK-NEXT:    movq %rax, (%rdi)
 ; CHECK-NEXT:    movb %dl, 8(%rdi)
 ; CHECK-NEXT:  .LBB5_2: # %if.end
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq %rax, (%rsi)
-; CHECK-NEXT:    movzbl 8(%rdi), %eax
-; CHECK-NEXT:    movb %al, 8(%rsi)
-; CHECK-NEXT:    movl 9(%rdi), %eax
-; CHECK-NEXT:    movl %eax, 9(%rsi)
-; CHECK-NEXT:    movzwl 13(%rdi), %eax
-; CHECK-NEXT:    movw %ax, 13(%rsi)
-; CHECK-NEXT:    movzbl 15(%rdi), %eax
-; CHECK-NEXT:    movb %al, 15(%rsi)
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    movups %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
 ;
 ; DISABLED-LABEL: test_mixed_type:
@@ -468,16 +430,8 @@
 ; CHECK-AVX2-NEXT:    movq %rax, (%rdi)
 ; CHECK-AVX2-NEXT:    movb %dl, 8(%rdi)
 ; CHECK-AVX2-NEXT:  .LBB5_2: # %if.end
-; CHECK-AVX2-NEXT:    movq (%rdi), %rax
-; CHECK-AVX2-NEXT:    movq %rax, (%rsi)
-; CHECK-AVX2-NEXT:    movzbl 8(%rdi), %eax
-; CHECK-AVX2-NEXT:    movb %al, 8(%rsi)
-; CHECK-AVX2-NEXT:    movl 9(%rdi), %eax
-; CHECK-AVX2-NEXT:    movl %eax, 9(%rsi)
-; CHECK-AVX2-NEXT:    movzwl 13(%rdi), %eax
-; CHECK-AVX2-NEXT:    movw %ax, 13(%rsi)
-; CHECK-AVX2-NEXT:    movzbl 15(%rdi), %eax
-; CHECK-AVX2-NEXT:    movb %al, 15(%rsi)
+; CHECK-AVX2-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-AVX2-NEXT:    vmovups %xmm0, (%rsi)
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: test_mixed_type:
@@ -489,16 +443,8 @@
 ; CHECK-AVX512-NEXT:    movq %rax, (%rdi)
 ; CHECK-AVX512-NEXT:    movb %dl, 8(%rdi)
 ; CHECK-AVX512-NEXT:  .LBB5_2: # %if.end
-; CHECK-AVX512-NEXT:    movq (%rdi), %rax
-; CHECK-AVX512-NEXT:    movq %rax, (%rsi)
-; CHECK-AVX512-NEXT:    movzbl 8(%rdi), %eax
-; CHECK-AVX512-NEXT:    movb %al, 8(%rsi)
-; CHECK-AVX512-NEXT:    movl 9(%rdi), %eax
-; CHECK-AVX512-NEXT:    movl %eax, 9(%rsi)
-; CHECK-AVX512-NEXT:    movzwl 13(%rdi), %eax
-; CHECK-AVX512-NEXT:    movw %ax, 13(%rsi)
-; CHECK-AVX512-NEXT:    movzbl 15(%rdi), %eax
-; CHECK-AVX512-NEXT:    movb %al, 15(%rsi)
+; CHECK-AVX512-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-AVX512-NEXT:    vmovups %xmm0, (%rsi)
 ; CHECK-AVX512-NEXT:    retq
 entry:
   %cmp = icmp sgt i32 %x, 17
@@ -616,14 +562,8 @@
 ; CHECK-NEXT:  .LBB7_2: # %if.end
 ; CHECK-NEXT:    movups (%r8), %xmm0
 ; CHECK-NEXT:    movups %xmm0, (%rcx)
-; CHECK-NEXT:    movzwl (%rdi), %eax
-; CHECK-NEXT:    movw %ax, (%rsi)
-; CHECK-NEXT:    movzwl 2(%rdi), %eax
-; CHECK-NEXT:    movw %ax, 2(%rsi)
-; CHECK-NEXT:    movq 4(%rdi), %rax
-; CHECK-NEXT:    movq %rax, 4(%rsi)
-; CHECK-NEXT:    movl 12(%rdi), %eax
-; CHECK-NEXT:    movl %eax, 12(%rsi)
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    movups %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
 ;
 ; DISABLED-LABEL: test_type16:
@@ -648,14 +588,8 @@
 ; CHECK-AVX2-NEXT:  .LBB7_2: # %if.end
 ; CHECK-AVX2-NEXT:    vmovups (%r8), %xmm0
 ; CHECK-AVX2-NEXT:    vmovups %xmm0, (%rcx)
-; CHECK-AVX2-NEXT:    movzwl (%rdi), %eax
-; CHECK-AVX2-NEXT:    movw %ax, (%rsi)
-; CHECK-AVX2-NEXT:    movzwl 2(%rdi), %eax
-; CHECK-AVX2-NEXT:    movw %ax, 2(%rsi)
-; CHECK-AVX2-NEXT:    movq 4(%rdi), %rax
-; CHECK-AVX2-NEXT:    movq %rax, 4(%rsi)
-; CHECK-AVX2-NEXT:    movl 12(%rdi), %eax
-; CHECK-AVX2-NEXT:    movl %eax, 12(%rsi)
+; CHECK-AVX2-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-AVX2-NEXT:    vmovups %xmm0, (%rsi)
 ; CHECK-AVX2-NEXT:    retq
 ;
 ; CHECK-AVX512-LABEL: test_type16:
@@ -667,14 +601,8 @@
 ; CHECK-AVX512-NEXT:  .LBB7_2: # %if.end
 ; CHECK-AVX512-NEXT:    vmovups (%r8), %xmm0
 ; CHECK-AVX512-NEXT:    vmovups %xmm0, (%rcx)
-; CHECK-AVX512-NEXT:    movzwl (%rdi), %eax
-; CHECK-AVX512-NEXT:    movw %ax, (%rsi)
-; CHECK-AVX512-NEXT:    movzwl 2(%rdi), %eax
-; CHECK-AVX512-NEXT:    movw %ax, 2(%rsi)
-; CHECK-AVX512-NEXT:    movq 4(%rdi), %rax
-; CHECK-AVX512-NEXT:    movq %rax, 4(%rsi)
-; CHECK-AVX512-NEXT:    movl 12(%rdi), %eax
-; CHECK-AVX512-NEXT:    movl %eax, 12(%rsi)
+; CHECK-AVX512-NEXT:    vmovups (%rdi), %xmm0
+; CHECK-AVX512-NEXT:    vmovups %xmm0, (%rsi)
 ; CHECK-AVX512-NEXT:    retq
 entry:
   %cmp = icmp sgt i32 %x, 17
@@ -1010,9 +938,9 @@
 ; CHECK-NEXT:    .cfi_offset %r12, -32
 ; CHECK-NEXT:    .cfi_offset %r14, -24
 ; CHECK-NEXT:    .cfi_offset %r15, -16
-; CHECK-NEXT:    movq %r8, %r12
-; CHECK-NEXT:    movq %rcx, %r15
-; CHECK-NEXT:    movq %rsi, %r14
+; CHECK-NEXT:    movq %r8, %r15
+; CHECK-NEXT:    movq %rcx, %r14
+; CHECK-NEXT:    movq %rsi, %r12
 ; CHECK-NEXT:    movq %rdi, %rbx
 ; CHECK-NEXT:    movl %r9d, 12(%rdi)
 ; CHECK-NEXT:    cmpl $18, %edx
@@ -1022,14 +950,10 @@
 ; CHECK-NEXT:    movq %rbx, %rdi
 ; CHECK-NEXT:    callq bar@PLT
 ; CHECK-NEXT:  .LBB10_2: # %if.end
-; CHECK-NEXT:    movups (%r12), %xmm0
-; CHECK-NEXT:    movups %xmm0, (%r15)
-; CHECK-NEXT:    movq (%rbx), %rax
-; CHECK-NEXT:    movq %rax, (%r14)
-; CHECK-NEXT:    movl 8(%rbx), %eax
-; CHECK-NEXT:    movl %eax, 8(%r14)
-; CHECK-NEXT:    movl 12(%rbx), %eax
-; CHECK-NEXT:    movl %eax, 12(%r14)
+; CHECK-NEXT:    movups (%r15), %xmm0
+; CHECK-NEXT:    movups %xmm0, (%r14)
+; CHECK-NEXT:    movups (%rbx), %xmm0
+; CHECK-NEXT:    movups %xmm0, (%r12)
 ; CHECK-NEXT:    addq $8, %rsp
 ; CHECK-NEXT:    .cfi_def_cfa_offset 40
 ; CHECK-NEXT:    popq %rbx
@@ -1102,9 +1026,9 @@
 ; CHECK-AVX2-NEXT:    .cfi_offset %r12, -32
 ; CHECK-AVX2-NEXT:    .cfi_offset %r14, -24
 ; CHECK-AVX2-NEXT:    .cfi_offset %r15, -16
-; CHECK-AVX2-NEXT:    movq %r8, %r12
-; CHECK-AVX2-NEXT:    movq %rcx, %r15
-; CHECK-AVX2-NEXT:    movq %rsi, %r14
+; CHECK-AVX2-NEXT:    movq %r8, %r15
+; CHECK-AVX2-NEXT:    movq %rcx, %r14
+; CHECK-AVX2-NEXT:    movq %rsi, %r12
 ; CHECK-AVX2-NEXT:    movq %rdi, %rbx
 ; CHECK-AVX2-NEXT:    movl %r9d, 12(%rdi)
 ; CHECK-AVX2-NEXT:    cmpl $18, %edx
@@ -1114,14 +1038,10 @@
 ; CHECK-AVX2-NEXT:    movq %rbx, %rdi
 ; CHECK-AVX2-NEXT:    callq bar@PLT
 ; CHECK-AVX2-NEXT:  .LBB10_2: # %if.end
-; CHECK-AVX2-NEXT:    vmovups (%r12), %xmm0
-; CHECK-AVX2-NEXT:    vmovups %xmm0, (%r15)
-; CHECK-AVX2-NEXT:    movq (%rbx), %rax
-; CHECK-AVX2-NEXT:    movq %rax, (%r14)
-; CHECK-AVX2-NEXT:    movl 8(%rbx), %eax
-; CHECK-AVX2-NEXT:    movl %eax, 8(%r14)
-; CHECK-AVX2-NEXT:    movl 12(%rbx), %eax
-; CHECK-AVX2-NEXT:    movl %eax, 12(%r14)
+; CHECK-AVX2-NEXT:    vmovups (%r15), %xmm0
+; CHECK-AVX2-NEXT:    vmovups %xmm0, (%r14)
+; CHECK-AVX2-NEXT:    vmovups (%rbx), %xmm0
+; CHECK-AVX2-NEXT:    vmovups %xmm0, (%r12)
 ; CHECK-AVX2-NEXT:    addq $8, %rsp
 ; CHECK-AVX2-NEXT:    .cfi_def_cfa_offset 40
 ; CHECK-AVX2-NEXT:    popq %rbx
@@ -1150,9 +1070,9 @@
 ; CHECK-AVX512-NEXT:    .cfi_offset %r12, -32
 ; CHECK-AVX512-NEXT:    .cfi_offset %r14, -24
 ; CHECK-AVX512-NEXT:    .cfi_offset %r15, -16
-; CHECK-AVX512-NEXT:    movq %r8, %r12
-; CHECK-AVX512-NEXT:    movq %rcx, %r15
-; CHECK-AVX512-NEXT:    movq %rsi, %r14
+; CHECK-AVX512-NEXT:    movq %r8, %r15
+; CHECK-AVX512-NEXT:    movq %rcx, %r14
+; CHECK-AVX512-NEXT:    movq %rsi, %r12
 ; CHECK-AVX512-NEXT:    movq %rdi, %rbx
 ; CHECK-AVX512-NEXT:    movl %r9d, 12(%rdi)
 ; CHECK-AVX512-NEXT:    cmpl $18, %edx
@@ -1162,14 +1082,10 @@
 ; CHECK-AVX512-NEXT:    movq %rbx, %rdi
 ; CHECK-AVX512-NEXT:    callq bar@PLT
 ; CHECK-AVX512-NEXT:  .LBB10_2: # %if.end
-; CHECK-AVX512-NEXT:    vmovups (%r12), %xmm0
-; CHECK-AVX512-NEXT:    vmovups %xmm0, (%r15)
-; CHECK-AVX512-NEXT:    movq (%rbx), %rax
-; CHECK-AVX512-NEXT:    movq %rax, (%r14)
-; CHECK-AVX512-NEXT:    movl 8(%rbx), %eax
-; CHECK-AVX512-NEXT:    movl %eax, 8(%r14)
-; CHECK-AVX512-NEXT:    movl 12(%rbx), %eax
-; CHECK-AVX512-NEXT:    movl %eax, 12(%r14)
+; CHECK-AVX512-NEXT:    vmovups (%r15), %xmm0
+; CHECK-AVX512-NEXT:    vmovups %xmm0, (%r14)
+; CHECK-AVX512-NEXT:    vmovups (%rbx), %xmm0
+; CHECK-AVX512-NEXT:    vmovups %xmm0, (%r12)
 ; CHECK-AVX512-NEXT:    addq $8, %rsp
 ; CHECK-AVX512-NEXT:    .cfi_def_cfa_offset 40
 ; CHECK-AVX512-NEXT:    popq %rbx
@@ -1218,14 +1134,10 @@
 ; CHECK-NEXT:    movups 16(%r8), %xmm1
 ; CHECK-NEXT:    movups %xmm1, 16(%rcx)
 ; CHECK-NEXT:    movups %xmm0, (%rcx)
-; CHECK-NEXT:    movl (%rdi), %eax
-; CHECK-NEXT:    movl 4(%rdi), %ecx
-; CHECK-NEXT:    movq 8(%rdi), %rdx
-; CHECK-NEXT:    movups 16(%rdi), %xmm0
-; CHECK-NEXT:    movups %xmm0, 16(%rsi)
-; CHECK-NEXT:    movl %eax, (%rsi)
-; CHECK-NEXT:    movl %ecx, 4(%rsi)
-; CHECK-NEXT:    movq %rdx, 8(%rsi)
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    movups 16(%rdi), %xmm1
+; CHECK-NEXT:    movups %xmm1, 16(%rsi)
+; CHECK-NEXT:    movups %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
 ;
 ; DISABLED-LABEL: test_conditional_block_float:
@@ -1254,14 +1166,8 @@
 ; CHECK-AVX2-NEXT:  .LBB11_2: # %if.end
 ; CHECK-AVX2-NEXT:    vmovups (%r8), %ymm0
 ; CHECK-AVX2-NEXT:    vmovups %ymm0, (%rcx)
-; CHECK-AVX2-NEXT:    movl (%rdi), %eax
-; CHECK-AVX2-NEXT:    movl %eax, (%rsi)
-; CHECK-AVX2-NEXT:    movl 4(%rdi), %eax
-; CHECK-AVX2-NEXT:    movl %eax, 4(%rsi)
-; CHECK-AVX2-NEXT:    vmovups 8(%rdi), %xmm0
-; CHECK-AVX2-NEXT:    vmovups %xmm0, 8(%rsi)
-; CHECK-AVX2-NEXT:    movq 24(%rdi), %rax
-; CHECK-AVX2-NEXT:    movq %rax, 24(%rsi)
+; CHECK-AVX2-NEXT:    vmovups (%rdi), %ymm0
+; CHECK-AVX2-NEXT:    vmovups %ymm0, (%rsi)
 ; CHECK-AVX2-NEXT:    vzeroupper
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -1274,14 +1180,8 @@
 ; CHECK-AVX512-NEXT:  .LBB11_2: # %if.end
 ; CHECK-AVX512-NEXT:    vmovups (%r8), %ymm0
 ; CHECK-AVX512-NEXT:    vmovups %ymm0, (%rcx)
-; CHECK-AVX512-NEXT:    movl (%rdi), %eax
-; CHECK-AVX512-NEXT:    movl %eax, (%rsi)
-; CHECK-AVX512-NEXT:    movl 4(%rdi), %eax
-; CHECK-AVX512-NEXT:    movl %eax, 4(%rsi)
-; CHECK-AVX512-NEXT:    vmovups 8(%rdi), %xmm0
-; CHECK-AVX512-NEXT:    vmovups %xmm0, 8(%rsi)
-; CHECK-AVX512-NEXT:    movq 24(%rdi), %rax
-; CHECK-AVX512-NEXT:    movq %rax, 24(%rsi)
+; CHECK-AVX512-NEXT:    vmovups (%rdi), %ymm0
+; CHECK-AVX512-NEXT:    vmovups %ymm0, (%rsi)
 ; CHECK-AVX512-NEXT:    vzeroupper
 ; CHECK-AVX512-NEXT:    retq
 entry:
@@ -1314,12 +1214,10 @@
 ; CHECK-NEXT:    movups 16(%r8), %xmm1
 ; CHECK-NEXT:    movups %xmm1, 16(%rcx)
 ; CHECK-NEXT:    movups %xmm0, (%rcx)
-; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    movq 8(%rdi), %rcx
-; CHECK-NEXT:    movups 16(%rdi), %xmm0
-; CHECK-NEXT:    movups %xmm0, 16(%rsi)
-; CHECK-NEXT:    movq %rax, (%rsi)
-; CHECK-NEXT:    movq %rcx, 8(%rsi)
+; CHECK-NEXT:    movups (%rdi), %xmm0
+; CHECK-NEXT:    movups 16(%rdi), %xmm1
+; CHECK-NEXT:    movups %xmm1, 16(%rsi)
+; CHECK-NEXT:    movups %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
 ;
 ; DISABLED-LABEL: test_conditional_block_ymm:
@@ -1348,12 +1246,8 @@
 ; CHECK-AVX2-NEXT:  .LBB12_2: # %if.end
 ; CHECK-AVX2-NEXT:    vmovups (%r8), %ymm0
 ; CHECK-AVX2-NEXT:    vmovups %ymm0, (%rcx)
-; CHECK-AVX2-NEXT:    movq (%rdi), %rax
-; CHECK-AVX2-NEXT:    movq %rax, (%rsi)
-; CHECK-AVX2-NEXT:    movq 8(%rdi), %rax
-; CHECK-AVX2-NEXT:    movq %rax, 8(%rsi)
-; CHECK-AVX2-NEXT:    vmovups 16(%rdi), %xmm0
-; CHECK-AVX2-NEXT:    vmovups %xmm0, 16(%rsi)
+; CHECK-AVX2-NEXT:    vmovups (%rdi), %ymm0
+; CHECK-AVX2-NEXT:    vmovups %ymm0, (%rsi)
 ; CHECK-AVX2-NEXT:    vzeroupper
 ; CHECK-AVX2-NEXT:    retq
 ;
@@ -1366,12 +1260,8 @@
 ; CHECK-AVX512-NEXT:  .LBB12_2: # %if.end
 ; CHECK-AVX512-NEXT:    vmovups (%r8), %ymm0
 ; CHECK-AVX512-NEXT:    vmovups %ymm0, (%rcx)
-; CHECK-AVX512-NEXT:    movq (%rdi), %rax
-; CHECK-AVX512-NEXT:    movq %rax, (%rsi)
-; CHECK-AVX512-NEXT:    movq 8(%rdi), %rax
-; CHECK-AVX512-NEXT:    movq %rax, 8(%rsi)
-; CHECK-AVX512-NEXT:    vmovups 16(%rdi), %xmm0
-; CHECK-AVX512-NEXT:    vmovups %xmm0, 16(%rsi)
+; CHECK-AVX512-NEXT:    vmovups (%rdi), %ymm0
+; CHECK-AVX512-NEXT:    vmovups %ymm0, (%rsi)
 ; CHECK-AVX512-NEXT:    vzeroupper
 ; CHECK-AVX512-NEXT:    retq
 entry:
Index: llvm/test/CodeGen/X86/opt-pipeline.ll
===================================================================
--- llvm/test/CodeGen/X86/opt-pipeline.ll
+++ llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -118,6 +118,9 @@
 ; CHECK-NEXT:       Lazy Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 LEA Optimize
 ; CHECK-NEXT:       X86 Optimize Call Frame
+; CHECK-NEXT:       MachineDominator Tree Construction
+; CHECK-NEXT:       Machine Natural Loop Construction
+; CHECK-NEXT:       Machine Block Frequency Analysis
 ; CHECK-NEXT:       X86 Avoid Store Forwarding Block
 ; CHECK-NEXT:       X86 speculative load hardening
 ; CHECK-NEXT:       MachineDominator Tree Construction
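For context on what makes a store "blocking" throughout this patch: store-to-load forwarding fails when a preceding narrow store lands inside the footprint of a wider load from the same base, so the pass copies the data in pieces that line up with the store instead. The following is an illustrative sketch of the containment test in the spirit of the pass's isBlockingStore, not the verbatim source:

#include <cstdint>

// A store of StoreSize bytes at StoreDisp blocks forwarding into a load of
// LoadSize bytes at LoadDisp when the store's bytes fall entirely inside
// the load's range: the CPU can neither forward the store buffer entry nor
// satisfy the load from cache alone, and stalls.
bool blocksStoreForwarding(int64_t LoadDisp, unsigned LoadSize,
                           int64_t StoreDisp, unsigned StoreSize) {
  return StoreDisp >= LoadDisp &&
         StoreDisp + StoreSize <= LoadDisp + LoadSize;
}

In avoid-sfb-g-no-change3.mir above, for example, the one-byte MOV8mi at offset 0 blocks the 16-byte MOVAPSrm, which is why the CHECK lines expect the copy split into s8/s64/s32/s16/s8 pieces at offsets 0, 1, 9, 13, and 15.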