Index: llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp =================================================================== --- llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp +++ llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp @@ -38,6 +38,8 @@ #include "X86Subtarget.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBlockFrequencyInfo.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -49,6 +51,7 @@ #include "llvm/IR/Function.h" #include "llvm/InitializePasses.h" #include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/BranchProbability.h" using namespace llvm; @@ -64,6 +67,10 @@ "inspect for store forwarding blocks."), cl::init(20), cl::Hidden); +namespace llvm { +extern cl::opt<unsigned> StaticLikelyProb; +} // namespace llvm + namespace { using DisplacementSizeMap = std::map<int64_t, unsigned>; @@ -71,7 +78,7 @@ class X86AvoidSFBPass : public MachineFunctionPass { public: static char ID; - X86AvoidSFBPass() : MachineFunctionPass(ID) { } + X86AvoidSFBPass() : MachineFunctionPass(ID) {} StringRef getPassName() const override { return "X86 Avoid Store Forwarding Blocks"; @@ -82,10 +89,16 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { MachineFunctionPass::getAnalysisUsage(AU); AU.addRequired<AAResultsWrapperPass>(); + AU.addRequired<MachineBlockFrequencyInfo>(); + AU.addPreserved<MachineBlockFrequencyInfo>(); + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addPreserved<MachineBranchProbabilityInfo>(); } private: MachineRegisterInfo *MRI = nullptr; + MachineBlockFrequencyInfo *MBFI = nullptr; + MachineBranchProbabilityInfo *MBPI = nullptr; const X86InstrInfo *TII = nullptr; const X86RegisterInfo *TRI = nullptr; SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2> @@ -120,11 +133,12 @@ char X86AvoidSFBPass::ID = 0; -INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE, "Machine code sinking", - false, false) +INITIALIZE_PASS_BEGIN(X86AvoidSFBPass, DEBUG_TYPE, + "X86 avoid store forward block", false, false) 
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE, "Machine code sinking", false, - false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_END(X86AvoidSFBPass, DEBUG_TYPE, + "X86 avoid store forward block", false, false) FunctionPass *llvm::createX86AvoidStoreForwardingBlocks() { return new X86AvoidSFBPass(); @@ -315,7 +329,8 @@ const MachineOperand &Disp = getDispOperand(MI); const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt); const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg); - const MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg); + const MachineOperand &Segment = + MI->getOperand(AddrOffset + X86::AddrSegmentReg); if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI())) return false; @@ -361,6 +376,9 @@ MachineBasicBlock *MBB = LoadInst->getParent(); int LimitLeft = InspectionLimit - BlockCount; for (MachineBasicBlock *PMBB : MBB->predecessors()) { + // Accessed address in a self-loop may change every iteration. 
+ if (PMBB == MBB) + continue; int PredCount = 0; for (MachineInstr &PBInst : llvm::reverse(*PMBB)) { if (PBInst.isMetaInstruction()) continue; @@ -544,8 +562,8 @@ if (StoreMI.getParent() == MI.getParent() && isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) && isRelevantAddressingMode(&MI) && - isRelevantAddressingMode(&StoreMI) && - MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) { + isRelevantAddressingMode(&StoreMI) && MI.hasOneMemOperand() && + StoreMI.hasOneMemOperand()) { if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin())) BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI)); } @@ -555,7 +573,7 @@ unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) { const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI, - *LoadInst->getParent()->getParent()); + *LoadInst->getParent()->getParent()); return TRI->getRegSizeInBits(*TRC) / 8; } @@ -671,6 +689,8 @@ TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo(); AA = &getAnalysis<AAResultsWrapperPass>().getAAResults(); + MBFI = &getAnalysis<MachineBlockFrequencyInfo>(); + MBPI = &getAnalysis<MachineBranchProbabilityInfo>(); LLVM_DEBUG(dbgs() << "Start X86AvoidStoreForwardBlocks\n";); // Look for a load then a store to XMM/YMM which look like a memcpy findPotentiallylBlockedCopies(MF); @@ -680,6 +700,10 @@ int64_t LdDispImm = getDispOperand(LoadInst).getImm(); DisplacementSizeMap BlockingStoresDispSizeMap; + BlockFrequency WeightedFreq; + auto *LoadMBB = LoadInst->getParent(); + const MachineBasicBlock *LastPredMBB = nullptr; + SmallVector<std::pair<int64_t, unsigned>, 2> PredStoreInfo; SmallVector<MachineInstr *, 2> PotentialBlockers = findPotentialBlockers(LoadInst); for (auto *PBInst : PotentialBlockers) { @@ -690,14 +714,39 @@ if (PBInst->isMetaInstruction()) continue; int64_t PBstDispImm = getDispOperand(PBInst).getImm(); unsigned PBstSize = (*PBInst->memoperands_begin())->getSize(); // This check doesn't cover all cases, but it will suffice for now. 
- // TODO: take branch probability into consideration, if the blocking - // store is in an unreached block, breaking the memcopy could lose - // performance. if (hasSameBaseOpValue(LoadInst, PBInst) && isBlockingStore(LdDispImm, getRegSizeInBytes(LoadInst), PBstDispImm, - PBstSize)) - updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, PBstDispImm, - PBstSize); + PBstSize)) { + auto *StoreMBB = PBInst->getParent(); + if (StoreMBB != LoadMBB) { + PredStoreInfo.push_back({PBstDispImm, PBstSize}); + if (LastPredMBB != StoreMBB) { + WeightedFreq += MBFI->getBlockFreq(StoreMBB) * + MBPI->getEdgeProbability(StoreMBB, LoadMBB); + LastPredMBB = StoreMBB; + } + } else { + updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, + PBstDispImm, PBstSize); + } + } + } + + // Take branch probability into consideration, if the blocking + // store is in an unreached block, breaking the memcopy could lose + // performance. + if (WeightedFreq.getFrequency() != 0) { + BlockFrequency TotalWeightedFreq; + for (auto *PMBB : LoadMBB->predecessors()) + TotalWeightedFreq += + MBFI->getBlockFreq(PMBB) * MBPI->getEdgeProbability(PMBB, LoadMBB); + auto HotProb = + BranchProbability::getBranchProbability(StaticLikelyProb, 100); + if (WeightedFreq >= TotalWeightedFreq * HotProb) { + for (auto [PBstDispImm, PBstSize] : PredStoreInfo) + updateBlockingStoresDispSizeMap(BlockingStoresDispSizeMap, + PBstDispImm, PBstSize); + } } if (BlockingStoresDispSizeMap.empty()) Index: llvm/test/CodeGen/X86/avoid-sfb.ll =================================================================== --- llvm/test/CodeGen/X86/avoid-sfb.ll +++ llvm/test/CodeGen/X86/avoid-sfb.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=x86-64 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=x86-64 --x86-disable-avoid-SFB -verify-machineinstrs | FileCheck %s --check-prefix=DISABLED -; 
RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-AVX2 -; RUN: llc < %s -mtriple=x86_64-linux -mcpu=skx -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-AVX512 +; RUN: llc < %s -mtriple=x86_64-linux -mcpu=x86-64 -verify-machineinstrs | FileCheck %s --check-prefixes=SSE,CHECK +; RUN: llc < %s -mtriple=x86_64-linux -mcpu=x86-64 --x86-disable-avoid-SFB -verify-machineinstrs | FileCheck %s --check-prefixes=SSE,DISABLED +; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,CHECK-AVX2 +; RUN: llc < %s -mtriple=x86_64-linux -mcpu=skx -verify-machineinstrs | FileCheck %s --check-prefixes=AVX,CHECK-AVX512 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -11,69 +11,31 @@ ; Function Attrs: nounwind uwtable define void @test_conditional_block(ptr nocapture noalias %s1 , ptr nocapture noalias %s2, i32 %x, ptr nocapture noalias %s3, ptr nocapture noalias readonly %s4) local_unnamed_addr #0 { -; CHECK-LABEL: test_conditional_block: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpl $18, %edx -; CHECK-NEXT: jl .LBB0_2 -; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl %edx, 4(%rdi) -; CHECK-NEXT: .LBB0_2: # %if.end -; CHECK-NEXT: movups (%r8), %xmm0 -; CHECK-NEXT: movups %xmm0, (%rcx) -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: movl %eax, (%rsi) -; CHECK-NEXT: movl 4(%rdi), %eax -; CHECK-NEXT: movl %eax, 4(%rsi) -; CHECK-NEXT: movq 8(%rdi), %rax -; CHECK-NEXT: movq %rax, 8(%rsi) -; CHECK-NEXT: retq -; -; DISABLED-LABEL: test_conditional_block: -; DISABLED: # %bb.0: # %entry -; DISABLED-NEXT: cmpl $18, %edx -; DISABLED-NEXT: jl .LBB0_2 -; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movl %edx, 4(%rdi) -; DISABLED-NEXT: .LBB0_2: # %if.end -; DISABLED-NEXT: movups (%r8), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rcx) -; DISABLED-NEXT: movups (%rdi), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rsi) 
-; DISABLED-NEXT: retq -; -; CHECK-AVX2-LABEL: test_conditional_block: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: cmpl $18, %edx -; CHECK-AVX2-NEXT: jl .LBB0_2 -; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movl %edx, 4(%rdi) -; CHECK-AVX2-NEXT: .LBB0_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx) -; CHECK-AVX2-NEXT: movl (%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, (%rsi) -; CHECK-AVX2-NEXT: movl 4(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX2-NEXT: movq 8(%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, 8(%rsi) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_conditional_block: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: cmpl $18, %edx -; CHECK-AVX512-NEXT: jl .LBB0_2 -; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movl %edx, 4(%rdi) -; CHECK-AVX512-NEXT: .LBB0_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx) -; CHECK-AVX512-NEXT: movl (%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, (%rsi) -; CHECK-AVX512-NEXT: movl 4(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX512-NEXT: movq 8(%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, 8(%rsi) -; CHECK-AVX512-NEXT: retq +; SSE-LABEL: test_conditional_block: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cmpl $18, %edx +; SSE-NEXT: jl .LBB0_2 +; SSE-NEXT: # %bb.1: # %if.then +; SSE-NEXT: movl %edx, 4(%rdi) +; SSE-NEXT: .LBB0_2: # %if.end +; SSE-NEXT: movups (%r8), %xmm0 +; SSE-NEXT: movups %xmm0, (%rcx) +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: movups %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX-LABEL: test_conditional_block: +; AVX: # %bb.0: # %entry +; AVX-NEXT: cmpl $18, %edx +; AVX-NEXT: jl .LBB0_2 +; AVX-NEXT: # %bb.1: # %if.then +; AVX-NEXT: movl %edx, 4(%rdi) +; AVX-NEXT: .LBB0_2: # %if.end +; AVX-NEXT: vmovups (%r8), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rcx) +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rsi) +; AVX-NEXT: 
retq entry: %cmp = icmp sgt i32 %x, 17 br i1 %cmp, label %if.then, label %if.end @@ -111,29 +73,17 @@ ; DISABLED-NEXT: movups %xmm0, (%rsi) ; DISABLED-NEXT: retq ; -; CHECK-AVX2-LABEL: test_imm_store: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: movl $0, (%rdi) -; CHECK-AVX2-NEXT: movl $1, (%rcx) -; CHECK-AVX2-NEXT: movl (%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, (%rsi) -; CHECK-AVX2-NEXT: movq 4(%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, 4(%rsi) -; CHECK-AVX2-NEXT: movl 12(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 12(%rsi) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_imm_store: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: movl $0, (%rdi) -; CHECK-AVX512-NEXT: movl $1, (%rcx) -; CHECK-AVX512-NEXT: movl (%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, (%rsi) -; CHECK-AVX512-NEXT: movq 4(%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, 4(%rsi) -; CHECK-AVX512-NEXT: movl 12(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 12(%rsi) -; CHECK-AVX512-NEXT: retq +; AVX-LABEL: test_imm_store: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movl $0, (%rdi) +; AVX-NEXT: movl $1, (%rcx) +; AVX-NEXT: movl (%rdi), %eax +; AVX-NEXT: movl %eax, (%rsi) +; AVX-NEXT: movq 4(%rdi), %rax +; AVX-NEXT: movq %rax, 4(%rsi) +; AVX-NEXT: movl 12(%rdi), %eax +; AVX-NEXT: movl %eax, 12(%rsi) +; AVX-NEXT: retq entry: store i32 0, ptr %s1, align 4 store i32 1, ptr %s3, align 4 @@ -143,89 +93,41 @@ ; Function Attrs: nounwind uwtable define void @test_nondirect_br(ptr nocapture noalias %s1, ptr nocapture %s2, i32 %x, ptr nocapture %s3, ptr nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { -; CHECK-LABEL: test_nondirect_br: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpl $18, %edx -; CHECK-NEXT: jl .LBB2_2 -; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl %edx, 4(%rdi) -; CHECK-NEXT: .LBB2_2: # %if.end -; CHECK-NEXT: cmpl $14, %r9d -; CHECK-NEXT: jl .LBB2_4 -; CHECK-NEXT: # %bb.3: # %if.then2 -; CHECK-NEXT: movl %r9d, 12(%rdi) -; CHECK-NEXT: .LBB2_4: # %if.end3 -; 
CHECK-NEXT: movups (%r8), %xmm0 -; CHECK-NEXT: movups %xmm0, (%rcx) -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: movq %rax, (%rsi) -; CHECK-NEXT: movl 8(%rdi), %eax -; CHECK-NEXT: movl %eax, 8(%rsi) -; CHECK-NEXT: movl 12(%rdi), %eax -; CHECK-NEXT: movl %eax, 12(%rsi) -; CHECK-NEXT: retq -; -; DISABLED-LABEL: test_nondirect_br: -; DISABLED: # %bb.0: # %entry -; DISABLED-NEXT: cmpl $18, %edx -; DISABLED-NEXT: jl .LBB2_2 -; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movl %edx, 4(%rdi) -; DISABLED-NEXT: .LBB2_2: # %if.end -; DISABLED-NEXT: cmpl $14, %r9d -; DISABLED-NEXT: jl .LBB2_4 -; DISABLED-NEXT: # %bb.3: # %if.then2 -; DISABLED-NEXT: movl %r9d, 12(%rdi) -; DISABLED-NEXT: .LBB2_4: # %if.end3 -; DISABLED-NEXT: movups (%r8), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rcx) -; DISABLED-NEXT: movups (%rdi), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rsi) -; DISABLED-NEXT: retq -; -; CHECK-AVX2-LABEL: test_nondirect_br: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: cmpl $18, %edx -; CHECK-AVX2-NEXT: jl .LBB2_2 -; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movl %edx, 4(%rdi) -; CHECK-AVX2-NEXT: .LBB2_2: # %if.end -; CHECK-AVX2-NEXT: cmpl $14, %r9d -; CHECK-AVX2-NEXT: jl .LBB2_4 -; CHECK-AVX2-NEXT: # %bb.3: # %if.then2 -; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi) -; CHECK-AVX2-NEXT: .LBB2_4: # %if.end3 -; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx) -; CHECK-AVX2-NEXT: movq (%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, (%rsi) -; CHECK-AVX2-NEXT: movl 8(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 8(%rsi) -; CHECK-AVX2-NEXT: movl 12(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 12(%rsi) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_nondirect_br: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: cmpl $18, %edx -; CHECK-AVX512-NEXT: jl .LBB2_2 -; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movl %edx, 4(%rdi) -; CHECK-AVX512-NEXT: .LBB2_2: # %if.end -; CHECK-AVX512-NEXT: cmpl $14, %r9d -; 
CHECK-AVX512-NEXT: jl .LBB2_4 -; CHECK-AVX512-NEXT: # %bb.3: # %if.then2 -; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi) -; CHECK-AVX512-NEXT: .LBB2_4: # %if.end3 -; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx) -; CHECK-AVX512-NEXT: movq (%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, (%rsi) -; CHECK-AVX512-NEXT: movl 8(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 8(%rsi) -; CHECK-AVX512-NEXT: movl 12(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 12(%rsi) -; CHECK-AVX512-NEXT: retq +; SSE-LABEL: test_nondirect_br: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cmpl $18, %edx +; SSE-NEXT: jl .LBB2_2 +; SSE-NEXT: # %bb.1: # %if.then +; SSE-NEXT: movl %edx, 4(%rdi) +; SSE-NEXT: .LBB2_2: # %if.end +; SSE-NEXT: cmpl $14, %r9d +; SSE-NEXT: jl .LBB2_4 +; SSE-NEXT: # %bb.3: # %if.then2 +; SSE-NEXT: movl %r9d, 12(%rdi) +; SSE-NEXT: .LBB2_4: # %if.end3 +; SSE-NEXT: movups (%r8), %xmm0 +; SSE-NEXT: movups %xmm0, (%rcx) +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: movups %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX-LABEL: test_nondirect_br: +; AVX: # %bb.0: # %entry +; AVX-NEXT: cmpl $18, %edx +; AVX-NEXT: jl .LBB2_2 +; AVX-NEXT: # %bb.1: # %if.then +; AVX-NEXT: movl %edx, 4(%rdi) +; AVX-NEXT: .LBB2_2: # %if.end +; AVX-NEXT: cmpl $14, %r9d +; AVX-NEXT: jl .LBB2_4 +; AVX-NEXT: # %bb.3: # %if.then2 +; AVX-NEXT: movl %r9d, 12(%rdi) +; AVX-NEXT: .LBB2_4: # %if.end3 +; AVX-NEXT: vmovups (%r8), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rcx) +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rsi) +; AVX-NEXT: retq entry: %cmp = icmp sgt i32 %x, 17 br i1 %cmp, label %if.then, label %if.end @@ -286,45 +188,25 @@ ; DISABLED-NEXT: movups %xmm0, (%rsi) ; DISABLED-NEXT: retq ; -; CHECK-AVX2-LABEL: test_2preds_block: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi) -; CHECK-AVX2-NEXT: cmpl $18, %edx -; CHECK-AVX2-NEXT: jl .LBB3_2 -; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movl %edx, 4(%rdi) -; CHECK-AVX2-NEXT: .LBB3_2: # 
%if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx) -; CHECK-AVX2-NEXT: movl (%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, (%rsi) -; CHECK-AVX2-NEXT: movl 4(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX2-NEXT: movl 8(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 8(%rsi) -; CHECK-AVX2-NEXT: movl 12(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 12(%rsi) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_2preds_block: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi) -; CHECK-AVX512-NEXT: cmpl $18, %edx -; CHECK-AVX512-NEXT: jl .LBB3_2 -; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movl %edx, 4(%rdi) -; CHECK-AVX512-NEXT: .LBB3_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx) -; CHECK-AVX512-NEXT: movl (%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, (%rsi) -; CHECK-AVX512-NEXT: movl 4(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX512-NEXT: movl 8(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 8(%rsi) -; CHECK-AVX512-NEXT: movl 12(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 12(%rsi) -; CHECK-AVX512-NEXT: retq +; AVX-LABEL: test_2preds_block: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movl %r9d, 12(%rdi) +; AVX-NEXT: cmpl $18, %edx +; AVX-NEXT: jl .LBB3_2 +; AVX-NEXT: # %bb.1: # %if.then +; AVX-NEXT: movl %edx, 4(%rdi) +; AVX-NEXT: .LBB3_2: # %if.end +; AVX-NEXT: vmovups (%r8), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rcx) +; AVX-NEXT: movl (%rdi), %eax +; AVX-NEXT: movl %eax, (%rsi) +; AVX-NEXT: movl 4(%rdi), %eax +; AVX-NEXT: movl %eax, 4(%rsi) +; AVX-NEXT: movl 8(%rdi), %eax +; AVX-NEXT: movl %eax, 8(%rsi) +; AVX-NEXT: movl 12(%rdi), %eax +; AVX-NEXT: movl %eax, 12(%rsi) +; AVX-NEXT: retq entry: %d = getelementptr inbounds %struct.S, ptr %s1, i64 0, i32 3 store i32 %x2, ptr %d, align 4 @@ -345,67 +227,33 @@ ; Function Attrs: nounwind uwtable define void @test_type64(ptr nocapture noalias %s1, ptr nocapture %s2, 
i32 %x, ptr nocapture %s3, ptr nocapture readonly %s4) local_unnamed_addr #0 { -; CHECK-LABEL: test_type64: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpl $18, %edx -; CHECK-NEXT: jl .LBB4_2 -; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movslq %edx, %rax -; CHECK-NEXT: movq %rax, 8(%rdi) -; CHECK-NEXT: .LBB4_2: # %if.end -; CHECK-NEXT: movups (%r8), %xmm0 -; CHECK-NEXT: movups %xmm0, (%rcx) -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: movq %rax, (%rsi) -; CHECK-NEXT: movq 8(%rdi), %rax -; CHECK-NEXT: movq %rax, 8(%rsi) -; CHECK-NEXT: retq -; -; DISABLED-LABEL: test_type64: -; DISABLED: # %bb.0: # %entry -; DISABLED-NEXT: cmpl $18, %edx -; DISABLED-NEXT: jl .LBB4_2 -; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movslq %edx, %rax -; DISABLED-NEXT: movq %rax, 8(%rdi) -; DISABLED-NEXT: .LBB4_2: # %if.end -; DISABLED-NEXT: movups (%r8), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rcx) -; DISABLED-NEXT: movups (%rdi), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rsi) -; DISABLED-NEXT: retq -; -; CHECK-AVX2-LABEL: test_type64: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: cmpl $18, %edx -; CHECK-AVX2-NEXT: jl .LBB4_2 -; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movslq %edx, %rax -; CHECK-AVX2-NEXT: movq %rax, 8(%rdi) -; CHECK-AVX2-NEXT: .LBB4_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx) -; CHECK-AVX2-NEXT: movq (%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, (%rsi) -; CHECK-AVX2-NEXT: movq 8(%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, 8(%rsi) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_type64: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: cmpl $18, %edx -; CHECK-AVX512-NEXT: jl .LBB4_2 -; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movslq %edx, %rax -; CHECK-AVX512-NEXT: movq %rax, 8(%rdi) -; CHECK-AVX512-NEXT: .LBB4_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx) -; CHECK-AVX512-NEXT: movq (%rdi), %rax -; 
CHECK-AVX512-NEXT: movq %rax, (%rsi) -; CHECK-AVX512-NEXT: movq 8(%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, 8(%rsi) -; CHECK-AVX512-NEXT: retq +; SSE-LABEL: test_type64: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cmpl $18, %edx +; SSE-NEXT: jl .LBB4_2 +; SSE-NEXT: # %bb.1: # %if.then +; SSE-NEXT: movslq %edx, %rax +; SSE-NEXT: movq %rax, 8(%rdi) +; SSE-NEXT: .LBB4_2: # %if.end +; SSE-NEXT: movups (%r8), %xmm0 +; SSE-NEXT: movups %xmm0, (%rcx) +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: movups %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX-LABEL: test_type64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: cmpl $18, %edx +; AVX-NEXT: jl .LBB4_2 +; AVX-NEXT: # %bb.1: # %if.then +; AVX-NEXT: movslq %edx, %rax +; AVX-NEXT: movq %rax, 8(%rdi) +; AVX-NEXT: .LBB4_2: # %if.end +; AVX-NEXT: vmovups (%r8), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rcx) +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rsi) +; AVX-NEXT: retq entry: %cmp = icmp sgt i32 %x, 17 br i1 %cmp, label %if.then, label %if.end @@ -425,81 +273,31 @@ ; Function Attrs: noinline nounwind uwtable define void @test_mixed_type(ptr nocapture noalias %s1, ptr nocapture %s2, i32 %x, ptr nocapture readnone %s3, ptr nocapture readnone %s4) local_unnamed_addr #0 { -; CHECK-LABEL: test_mixed_type: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpl $18, %edx -; CHECK-NEXT: jl .LBB5_2 -; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movslq %edx, %rax -; CHECK-NEXT: movq %rax, (%rdi) -; CHECK-NEXT: movb %dl, 8(%rdi) -; CHECK-NEXT: .LBB5_2: # %if.end -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: movq %rax, (%rsi) -; CHECK-NEXT: movzbl 8(%rdi), %eax -; CHECK-NEXT: movb %al, 8(%rsi) -; CHECK-NEXT: movl 9(%rdi), %eax -; CHECK-NEXT: movl %eax, 9(%rsi) -; CHECK-NEXT: movzwl 13(%rdi), %eax -; CHECK-NEXT: movw %ax, 13(%rsi) -; CHECK-NEXT: movzbl 15(%rdi), %eax -; CHECK-NEXT: movb %al, 15(%rsi) -; CHECK-NEXT: retq -; -; DISABLED-LABEL: test_mixed_type: -; DISABLED: # %bb.0: # %entry -; DISABLED-NEXT: cmpl $18, %edx -; 
DISABLED-NEXT: jl .LBB5_2 -; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movslq %edx, %rax -; DISABLED-NEXT: movq %rax, (%rdi) -; DISABLED-NEXT: movb %dl, 8(%rdi) -; DISABLED-NEXT: .LBB5_2: # %if.end -; DISABLED-NEXT: movups (%rdi), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rsi) -; DISABLED-NEXT: retq -; -; CHECK-AVX2-LABEL: test_mixed_type: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: cmpl $18, %edx -; CHECK-AVX2-NEXT: jl .LBB5_2 -; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movslq %edx, %rax -; CHECK-AVX2-NEXT: movq %rax, (%rdi) -; CHECK-AVX2-NEXT: movb %dl, 8(%rdi) -; CHECK-AVX2-NEXT: .LBB5_2: # %if.end -; CHECK-AVX2-NEXT: movq (%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, (%rsi) -; CHECK-AVX2-NEXT: movzbl 8(%rdi), %eax -; CHECK-AVX2-NEXT: movb %al, 8(%rsi) -; CHECK-AVX2-NEXT: movl 9(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 9(%rsi) -; CHECK-AVX2-NEXT: movzwl 13(%rdi), %eax -; CHECK-AVX2-NEXT: movw %ax, 13(%rsi) -; CHECK-AVX2-NEXT: movzbl 15(%rdi), %eax -; CHECK-AVX2-NEXT: movb %al, 15(%rsi) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_mixed_type: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: cmpl $18, %edx -; CHECK-AVX512-NEXT: jl .LBB5_2 -; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movslq %edx, %rax -; CHECK-AVX512-NEXT: movq %rax, (%rdi) -; CHECK-AVX512-NEXT: movb %dl, 8(%rdi) -; CHECK-AVX512-NEXT: .LBB5_2: # %if.end -; CHECK-AVX512-NEXT: movq (%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, (%rsi) -; CHECK-AVX512-NEXT: movzbl 8(%rdi), %eax -; CHECK-AVX512-NEXT: movb %al, 8(%rsi) -; CHECK-AVX512-NEXT: movl 9(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 9(%rsi) -; CHECK-AVX512-NEXT: movzwl 13(%rdi), %eax -; CHECK-AVX512-NEXT: movw %ax, 13(%rsi) -; CHECK-AVX512-NEXT: movzbl 15(%rdi), %eax -; CHECK-AVX512-NEXT: movb %al, 15(%rsi) -; CHECK-AVX512-NEXT: retq +; SSE-LABEL: test_mixed_type: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cmpl $18, %edx +; SSE-NEXT: jl .LBB5_2 +; SSE-NEXT: # %bb.1: # %if.then 
+; SSE-NEXT: movslq %edx, %rax +; SSE-NEXT: movq %rax, (%rdi) +; SSE-NEXT: movb %dl, 8(%rdi) +; SSE-NEXT: .LBB5_2: # %if.end +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: movups %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX-LABEL: test_mixed_type: +; AVX: # %bb.0: # %entry +; AVX-NEXT: cmpl $18, %edx +; AVX-NEXT: jl .LBB5_2 +; AVX-NEXT: # %bb.1: # %if.then +; AVX-NEXT: movslq %edx, %rax +; AVX-NEXT: movq %rax, (%rdi) +; AVX-NEXT: movb %dl, 8(%rdi) +; AVX-NEXT: .LBB5_2: # %if.end +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rsi) +; AVX-NEXT: retq entry: %cmp = icmp sgt i32 %x, 17 br i1 %cmp, label %if.then, label %if.end @@ -552,49 +350,27 @@ ; DISABLED-NEXT: movups %xmm0, (%rsi) ; DISABLED-NEXT: retq ; -; CHECK-AVX2-LABEL: test_multiple_blocks: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: movl $0, 4(%rdi) -; CHECK-AVX2-NEXT: movl $0, 36(%rdi) -; CHECK-AVX2-NEXT: vmovups 16(%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, 16(%rsi) -; CHECK-AVX2-NEXT: movl 32(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 32(%rsi) -; CHECK-AVX2-NEXT: movl 36(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 36(%rsi) -; CHECK-AVX2-NEXT: movq 40(%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, 40(%rsi) -; CHECK-AVX2-NEXT: movl (%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, (%rsi) -; CHECK-AVX2-NEXT: movl 4(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX2-NEXT: vmovups 8(%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, 8(%rsi) -; CHECK-AVX2-NEXT: movq 24(%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, 24(%rsi) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_multiple_blocks: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: movl $0, 4(%rdi) -; CHECK-AVX512-NEXT: movl $0, 36(%rdi) -; CHECK-AVX512-NEXT: vmovups 16(%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%rsi) -; CHECK-AVX512-NEXT: movl 32(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 32(%rsi) -; CHECK-AVX512-NEXT: movl 36(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 36(%rsi) -; CHECK-AVX512-NEXT: 
movq 40(%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, 40(%rsi) -; CHECK-AVX512-NEXT: movl (%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, (%rsi) -; CHECK-AVX512-NEXT: movl 4(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX512-NEXT: vmovups 8(%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%rsi) -; CHECK-AVX512-NEXT: movq 24(%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, 24(%rsi) -; CHECK-AVX512-NEXT: retq +; AVX-LABEL: test_multiple_blocks: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movl $0, 4(%rdi) +; AVX-NEXT: movl $0, 36(%rdi) +; AVX-NEXT: vmovups 16(%rdi), %xmm0 +; AVX-NEXT: vmovups %xmm0, 16(%rsi) +; AVX-NEXT: movl 32(%rdi), %eax +; AVX-NEXT: movl %eax, 32(%rsi) +; AVX-NEXT: movl 36(%rdi), %eax +; AVX-NEXT: movl %eax, 36(%rsi) +; AVX-NEXT: movq 40(%rdi), %rax +; AVX-NEXT: movq %rax, 40(%rsi) +; AVX-NEXT: movl (%rdi), %eax +; AVX-NEXT: movl %eax, (%rsi) +; AVX-NEXT: movl 4(%rdi), %eax +; AVX-NEXT: movl %eax, 4(%rsi) +; AVX-NEXT: vmovups 8(%rdi), %xmm0 +; AVX-NEXT: vmovups %xmm0, 8(%rsi) +; AVX-NEXT: movq 24(%rdi), %rax +; AVX-NEXT: movq %rax, 24(%rsi) +; AVX-NEXT: retq entry: %b = getelementptr inbounds %struct.S4, ptr %s1, i64 0, i32 1 store i32 0, ptr %b, align 4 @@ -607,75 +383,31 @@ ; Function Attrs: nounwind uwtable define void @test_type16(ptr nocapture noalias %s1, ptr nocapture %s2, i32 %x, ptr nocapture %s3, ptr nocapture readonly %s4) local_unnamed_addr #0 { -; CHECK-LABEL: test_type16: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpl $18, %edx -; CHECK-NEXT: jl .LBB7_2 -; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movw %dx, 2(%rdi) -; CHECK-NEXT: .LBB7_2: # %if.end -; CHECK-NEXT: movups (%r8), %xmm0 -; CHECK-NEXT: movups %xmm0, (%rcx) -; CHECK-NEXT: movzwl (%rdi), %eax -; CHECK-NEXT: movw %ax, (%rsi) -; CHECK-NEXT: movzwl 2(%rdi), %eax -; CHECK-NEXT: movw %ax, 2(%rsi) -; CHECK-NEXT: movq 4(%rdi), %rax -; CHECK-NEXT: movq %rax, 4(%rsi) -; CHECK-NEXT: movl 12(%rdi), %eax -; CHECK-NEXT: movl %eax, 12(%rsi) -; CHECK-NEXT: retq -; -; 
DISABLED-LABEL: test_type16: -; DISABLED: # %bb.0: # %entry -; DISABLED-NEXT: cmpl $18, %edx -; DISABLED-NEXT: jl .LBB7_2 -; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movw %dx, 2(%rdi) -; DISABLED-NEXT: .LBB7_2: # %if.end -; DISABLED-NEXT: movups (%r8), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rcx) -; DISABLED-NEXT: movups (%rdi), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%rsi) -; DISABLED-NEXT: retq -; -; CHECK-AVX2-LABEL: test_type16: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: cmpl $18, %edx -; CHECK-AVX2-NEXT: jl .LBB7_2 -; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movw %dx, 2(%rdi) -; CHECK-AVX2-NEXT: .LBB7_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rcx) -; CHECK-AVX2-NEXT: movzwl (%rdi), %eax -; CHECK-AVX2-NEXT: movw %ax, (%rsi) -; CHECK-AVX2-NEXT: movzwl 2(%rdi), %eax -; CHECK-AVX2-NEXT: movw %ax, 2(%rsi) -; CHECK-AVX2-NEXT: movq 4(%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, 4(%rsi) -; CHECK-AVX2-NEXT: movl 12(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 12(%rsi) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_type16: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: cmpl $18, %edx -; CHECK-AVX512-NEXT: jl .LBB7_2 -; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movw %dx, 2(%rdi) -; CHECK-AVX512-NEXT: .LBB7_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rcx) -; CHECK-AVX512-NEXT: movzwl (%rdi), %eax -; CHECK-AVX512-NEXT: movw %ax, (%rsi) -; CHECK-AVX512-NEXT: movzwl 2(%rdi), %eax -; CHECK-AVX512-NEXT: movw %ax, 2(%rsi) -; CHECK-AVX512-NEXT: movq 4(%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, 4(%rsi) -; CHECK-AVX512-NEXT: movl 12(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 12(%rsi) -; CHECK-AVX512-NEXT: retq +; SSE-LABEL: test_type16: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cmpl $18, %edx +; SSE-NEXT: jl .LBB7_2 +; SSE-NEXT: # %bb.1: # %if.then +; SSE-NEXT: movw %dx, 2(%rdi) +; SSE-NEXT: .LBB7_2: # %if.end 
+; SSE-NEXT: movups (%r8), %xmm0 +; SSE-NEXT: movups %xmm0, (%rcx) +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: movups %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX-LABEL: test_type16: +; AVX: # %bb.0: # %entry +; AVX-NEXT: cmpl $18, %edx +; AVX-NEXT: jl .LBB7_2 +; AVX-NEXT: # %bb.1: # %if.then +; AVX-NEXT: movw %dx, 2(%rdi) +; AVX-NEXT: .LBB7_2: # %if.end +; AVX-NEXT: vmovups (%r8), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rcx) +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rsi) +; AVX-NEXT: retq entry: %cmp = icmp sgt i32 %x, 17 br i1 %cmp, label %if.then, label %if.end @@ -732,49 +464,27 @@ ; DISABLED-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; DISABLED-NEXT: retq ; -; CHECK-AVX2-LABEL: test_stack: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: movq %rdi, %rax -; CHECK-AVX2-NEXT: movl %esi, {{[0-9]+}}(%rsp) -; CHECK-AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%rdi) -; CHECK-AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-AVX2-NEXT: movq %rcx, 16(%rdi) -; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-AVX2-NEXT: movl %ecx, 24(%rdi) -; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-AVX2-NEXT: movl %ecx, 28(%rdi) -; CHECK-AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) -; CHECK-AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-AVX2-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-AVX2-NEXT: movl %ecx, {{[0-9]+}}(%rsp) -; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-AVX2-NEXT: movl %ecx, {{[0-9]+}}(%rsp) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_stack: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: movq %rdi, %rax -; CHECK-AVX512-NEXT: movl %esi, {{[0-9]+}}(%rsp) -; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%rdi) -; CHECK-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-AVX512-NEXT: movq %rcx, 16(%rdi) -; CHECK-AVX512-NEXT: movl 
{{[0-9]+}}(%rsp), %ecx -; CHECK-AVX512-NEXT: movl %ecx, 24(%rdi) -; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-AVX512-NEXT: movl %ecx, 28(%rdi) -; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) -; CHECK-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; CHECK-AVX512-NEXT: movq %rcx, {{[0-9]+}}(%rsp) -; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-AVX512-NEXT: movl %ecx, {{[0-9]+}}(%rsp) -; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %ecx -; CHECK-AVX512-NEXT: movl %ecx, {{[0-9]+}}(%rsp) -; CHECK-AVX512-NEXT: retq +; AVX-LABEL: test_stack: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movq %rdi, %rax +; AVX-NEXT: movl %esi, {{[0-9]+}}(%rsp) +; AVX-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movq %rcx, 16(%rdi) +; AVX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: movl %ecx, 24(%rdi) +; AVX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: movl %ecx, 28(%rdi) +; AVX-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 +; AVX-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) +; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX-NEXT: movq %rcx, {{[0-9]+}}(%rsp) +; AVX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; AVX-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; AVX-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; AVX-NEXT: retq entry: %s6.sroa.3.0..sroa_idx4 = getelementptr inbounds %struct.S6, ptr %s2, i64 0, i32 3 store i32 %x, ptr %s6.sroa.3.0..sroa_idx4, align 8 @@ -786,193 +496,99 @@ ; Function Attrs: nounwind uwtable define void @test_limit_all(ptr noalias %s1, ptr nocapture %s2, i32 %x, ptr nocapture %s3, ptr nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { -; CHECK-LABEL: test_limit_all: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %r12 
-; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset %rbx, -48 -; CHECK-NEXT: .cfi_offset %r12, -40 -; CHECK-NEXT: .cfi_offset %r14, -32 -; CHECK-NEXT: .cfi_offset %r15, -24 -; CHECK-NEXT: .cfi_offset %rbp, -16 -; CHECK-NEXT: movq %r8, %r15 -; CHECK-NEXT: movq %rcx, %r14 -; CHECK-NEXT: movl %edx, %ebp -; CHECK-NEXT: movq %rsi, %r12 -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: movl %r9d, 12(%rdi) -; CHECK-NEXT: callq bar@PLT -; CHECK-NEXT: cmpl $18, %ebp -; CHECK-NEXT: jl .LBB9_2 -; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl %ebp, 4(%rbx) -; CHECK-NEXT: movq %rbx, %rdi -; CHECK-NEXT: callq bar@PLT -; CHECK-NEXT: .LBB9_2: # %if.end -; CHECK-NEXT: movups (%r15), %xmm0 -; CHECK-NEXT: movups %xmm0, (%r14) -; CHECK-NEXT: movups (%rbx), %xmm0 -; CHECK-NEXT: movups %xmm0, (%r12) -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %rbp -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq -; -; DISABLED-LABEL: test_limit_all: -; DISABLED: # %bb.0: # %entry -; DISABLED-NEXT: pushq %rbp -; DISABLED-NEXT: .cfi_def_cfa_offset 16 -; DISABLED-NEXT: pushq %r15 -; DISABLED-NEXT: .cfi_def_cfa_offset 24 -; DISABLED-NEXT: pushq %r14 -; DISABLED-NEXT: .cfi_def_cfa_offset 32 -; DISABLED-NEXT: pushq %r12 -; DISABLED-NEXT: .cfi_def_cfa_offset 40 -; DISABLED-NEXT: pushq %rbx -; DISABLED-NEXT: .cfi_def_cfa_offset 48 -; DISABLED-NEXT: .cfi_offset %rbx, -48 -; DISABLED-NEXT: .cfi_offset %r12, -40 -; DISABLED-NEXT: .cfi_offset %r14, -32 -; DISABLED-NEXT: .cfi_offset %r15, -24 -; DISABLED-NEXT: .cfi_offset %rbp, -16 -; DISABLED-NEXT: movq %r8, %r15 -; DISABLED-NEXT: movq %rcx, %r14 -; DISABLED-NEXT: movl %edx, %ebp -; DISABLED-NEXT: movq %rsi, %r12 -; DISABLED-NEXT: movq %rdi, %rbx -; 
DISABLED-NEXT: movl %r9d, 12(%rdi) -; DISABLED-NEXT: callq bar@PLT -; DISABLED-NEXT: cmpl $18, %ebp -; DISABLED-NEXT: jl .LBB9_2 -; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movl %ebp, 4(%rbx) -; DISABLED-NEXT: movq %rbx, %rdi -; DISABLED-NEXT: callq bar@PLT -; DISABLED-NEXT: .LBB9_2: # %if.end -; DISABLED-NEXT: movups (%r15), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%r14) -; DISABLED-NEXT: movups (%rbx), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%r12) -; DISABLED-NEXT: popq %rbx -; DISABLED-NEXT: .cfi_def_cfa_offset 40 -; DISABLED-NEXT: popq %r12 -; DISABLED-NEXT: .cfi_def_cfa_offset 32 -; DISABLED-NEXT: popq %r14 -; DISABLED-NEXT: .cfi_def_cfa_offset 24 -; DISABLED-NEXT: popq %r15 -; DISABLED-NEXT: .cfi_def_cfa_offset 16 -; DISABLED-NEXT: popq %rbp -; DISABLED-NEXT: .cfi_def_cfa_offset 8 -; DISABLED-NEXT: retq -; -; CHECK-AVX2-LABEL: test_limit_all: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: pushq %rbp -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16 -; CHECK-AVX2-NEXT: pushq %r15 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 24 -; CHECK-AVX2-NEXT: pushq %r14 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32 -; CHECK-AVX2-NEXT: pushq %r12 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40 -; CHECK-AVX2-NEXT: pushq %rbx -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48 -; CHECK-AVX2-NEXT: .cfi_offset %rbx, -48 -; CHECK-AVX2-NEXT: .cfi_offset %r12, -40 -; CHECK-AVX2-NEXT: .cfi_offset %r14, -32 -; CHECK-AVX2-NEXT: .cfi_offset %r15, -24 -; CHECK-AVX2-NEXT: .cfi_offset %rbp, -16 -; CHECK-AVX2-NEXT: movq %r8, %r15 -; CHECK-AVX2-NEXT: movq %rcx, %r14 -; CHECK-AVX2-NEXT: movl %edx, %ebp -; CHECK-AVX2-NEXT: movq %rsi, %r12 -; CHECK-AVX2-NEXT: movq %rdi, %rbx -; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi) -; CHECK-AVX2-NEXT: callq bar@PLT -; CHECK-AVX2-NEXT: cmpl $18, %ebp -; CHECK-AVX2-NEXT: jl .LBB9_2 -; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movl %ebp, 4(%rbx) -; CHECK-AVX2-NEXT: movq %rbx, %rdi -; CHECK-AVX2-NEXT: callq bar@PLT -; CHECK-AVX2-NEXT: .LBB9_2: # %if.end -; 
CHECK-AVX2-NEXT: vmovups (%r15), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%r14) -; CHECK-AVX2-NEXT: vmovups (%rbx), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%r12) -; CHECK-AVX2-NEXT: popq %rbx -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40 -; CHECK-AVX2-NEXT: popq %r12 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32 -; CHECK-AVX2-NEXT: popq %r14 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 24 -; CHECK-AVX2-NEXT: popq %r15 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16 -; CHECK-AVX2-NEXT: popq %rbp -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_limit_all: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: pushq %rbp -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16 -; CHECK-AVX512-NEXT: pushq %r15 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 24 -; CHECK-AVX512-NEXT: pushq %r14 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32 -; CHECK-AVX512-NEXT: pushq %r12 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40 -; CHECK-AVX512-NEXT: pushq %rbx -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 48 -; CHECK-AVX512-NEXT: .cfi_offset %rbx, -48 -; CHECK-AVX512-NEXT: .cfi_offset %r12, -40 -; CHECK-AVX512-NEXT: .cfi_offset %r14, -32 -; CHECK-AVX512-NEXT: .cfi_offset %r15, -24 -; CHECK-AVX512-NEXT: .cfi_offset %rbp, -16 -; CHECK-AVX512-NEXT: movq %r8, %r15 -; CHECK-AVX512-NEXT: movq %rcx, %r14 -; CHECK-AVX512-NEXT: movl %edx, %ebp -; CHECK-AVX512-NEXT: movq %rsi, %r12 -; CHECK-AVX512-NEXT: movq %rdi, %rbx -; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi) -; CHECK-AVX512-NEXT: callq bar@PLT -; CHECK-AVX512-NEXT: cmpl $18, %ebp -; CHECK-AVX512-NEXT: jl .LBB9_2 -; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movl %ebp, 4(%rbx) -; CHECK-AVX512-NEXT: movq %rbx, %rdi -; CHECK-AVX512-NEXT: callq bar@PLT -; CHECK-AVX512-NEXT: .LBB9_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r15), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%r14) -; CHECK-AVX512-NEXT: vmovups (%rbx), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%r12) -; CHECK-AVX512-NEXT: popq %rbx -; 
CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40 -; CHECK-AVX512-NEXT: popq %r12 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32 -; CHECK-AVX512-NEXT: popq %r14 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 24 -; CHECK-AVX512-NEXT: popq %r15 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16 -; CHECK-AVX512-NEXT: popq %rbp -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX512-NEXT: retq +; SSE-LABEL: test_limit_all: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rbp +; SSE-NEXT: .cfi_def_cfa_offset 16 +; SSE-NEXT: pushq %r15 +; SSE-NEXT: .cfi_def_cfa_offset 24 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: .cfi_def_cfa_offset 32 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: .cfi_def_cfa_offset 40 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: .cfi_def_cfa_offset 48 +; SSE-NEXT: .cfi_offset %rbx, -48 +; SSE-NEXT: .cfi_offset %r12, -40 +; SSE-NEXT: .cfi_offset %r14, -32 +; SSE-NEXT: .cfi_offset %r15, -24 +; SSE-NEXT: .cfi_offset %rbp, -16 +; SSE-NEXT: movq %r8, %r15 +; SSE-NEXT: movq %rcx, %r14 +; SSE-NEXT: movl %edx, %ebp +; SSE-NEXT: movq %rsi, %r12 +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: movl %r9d, 12(%rdi) +; SSE-NEXT: callq bar@PLT +; SSE-NEXT: cmpl $18, %ebp +; SSE-NEXT: jl .LBB9_2 +; SSE-NEXT: # %bb.1: # %if.then +; SSE-NEXT: movl %ebp, 4(%rbx) +; SSE-NEXT: movq %rbx, %rdi +; SSE-NEXT: callq bar@PLT +; SSE-NEXT: .LBB9_2: # %if.end +; SSE-NEXT: movups (%r15), %xmm0 +; SSE-NEXT: movups %xmm0, (%r14) +; SSE-NEXT: movups (%rbx), %xmm0 +; SSE-NEXT: movups %xmm0, (%r12) +; SSE-NEXT: popq %rbx +; SSE-NEXT: .cfi_def_cfa_offset 40 +; SSE-NEXT: popq %r12 +; SSE-NEXT: .cfi_def_cfa_offset 32 +; SSE-NEXT: popq %r14 +; SSE-NEXT: .cfi_def_cfa_offset 24 +; SSE-NEXT: popq %r15 +; SSE-NEXT: .cfi_def_cfa_offset 16 +; SSE-NEXT: popq %rbp +; SSE-NEXT: .cfi_def_cfa_offset 8 +; SSE-NEXT: retq +; +; AVX-LABEL: test_limit_all: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rbp +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: pushq %r15 +; AVX-NEXT: .cfi_def_cfa_offset 24 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: .cfi_def_cfa_offset 
32 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: .cfi_def_cfa_offset 40 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: .cfi_offset %rbx, -48 +; AVX-NEXT: .cfi_offset %r12, -40 +; AVX-NEXT: .cfi_offset %r14, -32 +; AVX-NEXT: .cfi_offset %r15, -24 +; AVX-NEXT: .cfi_offset %rbp, -16 +; AVX-NEXT: movq %r8, %r15 +; AVX-NEXT: movq %rcx, %r14 +; AVX-NEXT: movl %edx, %ebp +; AVX-NEXT: movq %rsi, %r12 +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: movl %r9d, 12(%rdi) +; AVX-NEXT: callq bar@PLT +; AVX-NEXT: cmpl $18, %ebp +; AVX-NEXT: jl .LBB9_2 +; AVX-NEXT: # %bb.1: # %if.then +; AVX-NEXT: movl %ebp, 4(%rbx) +; AVX-NEXT: movq %rbx, %rdi +; AVX-NEXT: callq bar@PLT +; AVX-NEXT: .LBB9_2: # %if.end +; AVX-NEXT: vmovups (%r15), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%r14) +; AVX-NEXT: vmovups (%rbx), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%r12) +; AVX-NEXT: popq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 40 +; AVX-NEXT: popq %r12 +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: popq %r14 +; AVX-NEXT: .cfi_def_cfa_offset 24 +; AVX-NEXT: popq %r15 +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: popq %rbp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq entry: %d = getelementptr inbounds %struct.S, ptr %s1, i64 0, i32 3 store i32 %x2, ptr %d, align 4 @@ -994,193 +610,93 @@ ; Function Attrs: nounwind uwtable define void @test_limit_one_pred(ptr noalias %s1, ptr nocapture %s2, i32 %x, ptr nocapture %s3, ptr nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { -; CHECK-LABEL: test_limit_one_pred: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: pushq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: pushq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: pushq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: pushq %rax -; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: .cfi_offset %rbx, -40 -; CHECK-NEXT: .cfi_offset %r12, -32 -; CHECK-NEXT: .cfi_offset %r14, -24 -; CHECK-NEXT: .cfi_offset 
%r15, -16 -; CHECK-NEXT: movq %r8, %r12 -; CHECK-NEXT: movq %rcx, %r15 -; CHECK-NEXT: movq %rsi, %r14 -; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: movl %r9d, 12(%rdi) -; CHECK-NEXT: cmpl $18, %edx -; CHECK-NEXT: jl .LBB10_2 -; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl %edx, 4(%rbx) -; CHECK-NEXT: movq %rbx, %rdi -; CHECK-NEXT: callq bar@PLT -; CHECK-NEXT: .LBB10_2: # %if.end -; CHECK-NEXT: movups (%r12), %xmm0 -; CHECK-NEXT: movups %xmm0, (%r15) -; CHECK-NEXT: movq (%rbx), %rax -; CHECK-NEXT: movq %rax, (%r14) -; CHECK-NEXT: movl 8(%rbx), %eax -; CHECK-NEXT: movl %eax, 8(%r14) -; CHECK-NEXT: movl 12(%rbx), %eax -; CHECK-NEXT: movl %eax, 12(%r14) -; CHECK-NEXT: addq $8, %rsp -; CHECK-NEXT: .cfi_def_cfa_offset 40 -; CHECK-NEXT: popq %rbx -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: popq %r12 -; CHECK-NEXT: .cfi_def_cfa_offset 24 -; CHECK-NEXT: popq %r14 -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: popq %r15 -; CHECK-NEXT: .cfi_def_cfa_offset 8 -; CHECK-NEXT: retq -; -; DISABLED-LABEL: test_limit_one_pred: -; DISABLED: # %bb.0: # %entry -; DISABLED-NEXT: pushq %r15 -; DISABLED-NEXT: .cfi_def_cfa_offset 16 -; DISABLED-NEXT: pushq %r14 -; DISABLED-NEXT: .cfi_def_cfa_offset 24 -; DISABLED-NEXT: pushq %r12 -; DISABLED-NEXT: .cfi_def_cfa_offset 32 -; DISABLED-NEXT: pushq %rbx -; DISABLED-NEXT: .cfi_def_cfa_offset 40 -; DISABLED-NEXT: pushq %rax -; DISABLED-NEXT: .cfi_def_cfa_offset 48 -; DISABLED-NEXT: .cfi_offset %rbx, -40 -; DISABLED-NEXT: .cfi_offset %r12, -32 -; DISABLED-NEXT: .cfi_offset %r14, -24 -; DISABLED-NEXT: .cfi_offset %r15, -16 -; DISABLED-NEXT: movq %r8, %r15 -; DISABLED-NEXT: movq %rcx, %r14 -; DISABLED-NEXT: movq %rsi, %r12 -; DISABLED-NEXT: movq %rdi, %rbx -; DISABLED-NEXT: movl %r9d, 12(%rdi) -; DISABLED-NEXT: cmpl $18, %edx -; DISABLED-NEXT: jl .LBB10_2 -; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movl %edx, 4(%rbx) -; DISABLED-NEXT: movq %rbx, %rdi -; DISABLED-NEXT: callq bar@PLT -; DISABLED-NEXT: .LBB10_2: # 
%if.end -; DISABLED-NEXT: movups (%r15), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%r14) -; DISABLED-NEXT: movups (%rbx), %xmm0 -; DISABLED-NEXT: movups %xmm0, (%r12) -; DISABLED-NEXT: addq $8, %rsp -; DISABLED-NEXT: .cfi_def_cfa_offset 40 -; DISABLED-NEXT: popq %rbx -; DISABLED-NEXT: .cfi_def_cfa_offset 32 -; DISABLED-NEXT: popq %r12 -; DISABLED-NEXT: .cfi_def_cfa_offset 24 -; DISABLED-NEXT: popq %r14 -; DISABLED-NEXT: .cfi_def_cfa_offset 16 -; DISABLED-NEXT: popq %r15 -; DISABLED-NEXT: .cfi_def_cfa_offset 8 -; DISABLED-NEXT: retq -; -; CHECK-AVX2-LABEL: test_limit_one_pred: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: pushq %r15 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16 -; CHECK-AVX2-NEXT: pushq %r14 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 24 -; CHECK-AVX2-NEXT: pushq %r12 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32 -; CHECK-AVX2-NEXT: pushq %rbx -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40 -; CHECK-AVX2-NEXT: pushq %rax -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 48 -; CHECK-AVX2-NEXT: .cfi_offset %rbx, -40 -; CHECK-AVX2-NEXT: .cfi_offset %r12, -32 -; CHECK-AVX2-NEXT: .cfi_offset %r14, -24 -; CHECK-AVX2-NEXT: .cfi_offset %r15, -16 -; CHECK-AVX2-NEXT: movq %r8, %r12 -; CHECK-AVX2-NEXT: movq %rcx, %r15 -; CHECK-AVX2-NEXT: movq %rsi, %r14 -; CHECK-AVX2-NEXT: movq %rdi, %rbx -; CHECK-AVX2-NEXT: movl %r9d, 12(%rdi) -; CHECK-AVX2-NEXT: cmpl $18, %edx -; CHECK-AVX2-NEXT: jl .LBB10_2 -; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movl %edx, 4(%rbx) -; CHECK-AVX2-NEXT: movq %rbx, %rdi -; CHECK-AVX2-NEXT: callq bar@PLT -; CHECK-AVX2-NEXT: .LBB10_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r12), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, (%r15) -; CHECK-AVX2-NEXT: movq (%rbx), %rax -; CHECK-AVX2-NEXT: movq %rax, (%r14) -; CHECK-AVX2-NEXT: movl 8(%rbx), %eax -; CHECK-AVX2-NEXT: movl %eax, 8(%r14) -; CHECK-AVX2-NEXT: movl 12(%rbx), %eax -; CHECK-AVX2-NEXT: movl %eax, 12(%r14) -; CHECK-AVX2-NEXT: addq $8, %rsp -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 40 -; 
CHECK-AVX2-NEXT: popq %rbx -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 32 -; CHECK-AVX2-NEXT: popq %r12 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 24 -; CHECK-AVX2-NEXT: popq %r14 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16 -; CHECK-AVX2-NEXT: popq %r15 -; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_limit_one_pred: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: pushq %r15 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16 -; CHECK-AVX512-NEXT: pushq %r14 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 24 -; CHECK-AVX512-NEXT: pushq %r12 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32 -; CHECK-AVX512-NEXT: pushq %rbx -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40 -; CHECK-AVX512-NEXT: pushq %rax -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 48 -; CHECK-AVX512-NEXT: .cfi_offset %rbx, -40 -; CHECK-AVX512-NEXT: .cfi_offset %r12, -32 -; CHECK-AVX512-NEXT: .cfi_offset %r14, -24 -; CHECK-AVX512-NEXT: .cfi_offset %r15, -16 -; CHECK-AVX512-NEXT: movq %r8, %r12 -; CHECK-AVX512-NEXT: movq %rcx, %r15 -; CHECK-AVX512-NEXT: movq %rsi, %r14 -; CHECK-AVX512-NEXT: movq %rdi, %rbx -; CHECK-AVX512-NEXT: movl %r9d, 12(%rdi) -; CHECK-AVX512-NEXT: cmpl $18, %edx -; CHECK-AVX512-NEXT: jl .LBB10_2 -; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movl %edx, 4(%rbx) -; CHECK-AVX512-NEXT: movq %rbx, %rdi -; CHECK-AVX512-NEXT: callq bar@PLT -; CHECK-AVX512-NEXT: .LBB10_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r12), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, (%r15) -; CHECK-AVX512-NEXT: movq (%rbx), %rax -; CHECK-AVX512-NEXT: movq %rax, (%r14) -; CHECK-AVX512-NEXT: movl 8(%rbx), %eax -; CHECK-AVX512-NEXT: movl %eax, 8(%r14) -; CHECK-AVX512-NEXT: movl 12(%rbx), %eax -; CHECK-AVX512-NEXT: movl %eax, 12(%r14) -; CHECK-AVX512-NEXT: addq $8, %rsp -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 40 -; CHECK-AVX512-NEXT: popq %rbx -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 32 -; CHECK-AVX512-NEXT: popq %r12 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 24 -; 
CHECK-AVX512-NEXT: popq %r14 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 16 -; CHECK-AVX512-NEXT: popq %r15 -; CHECK-AVX512-NEXT: .cfi_def_cfa_offset 8 -; CHECK-AVX512-NEXT: retq +; SSE-LABEL: test_limit_one_pred: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %r15 +; SSE-NEXT: .cfi_def_cfa_offset 16 +; SSE-NEXT: pushq %r14 +; SSE-NEXT: .cfi_def_cfa_offset 24 +; SSE-NEXT: pushq %r12 +; SSE-NEXT: .cfi_def_cfa_offset 32 +; SSE-NEXT: pushq %rbx +; SSE-NEXT: .cfi_def_cfa_offset 40 +; SSE-NEXT: pushq %rax +; SSE-NEXT: .cfi_def_cfa_offset 48 +; SSE-NEXT: .cfi_offset %rbx, -40 +; SSE-NEXT: .cfi_offset %r12, -32 +; SSE-NEXT: .cfi_offset %r14, -24 +; SSE-NEXT: .cfi_offset %r15, -16 +; SSE-NEXT: movq %r8, %r15 +; SSE-NEXT: movq %rcx, %r14 +; SSE-NEXT: movq %rsi, %r12 +; SSE-NEXT: movq %rdi, %rbx +; SSE-NEXT: movl %r9d, 12(%rdi) +; SSE-NEXT: cmpl $18, %edx +; SSE-NEXT: jl .LBB10_2 +; SSE-NEXT: # %bb.1: # %if.then +; SSE-NEXT: movl %edx, 4(%rbx) +; SSE-NEXT: movq %rbx, %rdi +; SSE-NEXT: callq bar@PLT +; SSE-NEXT: .LBB10_2: # %if.end +; SSE-NEXT: movups (%r15), %xmm0 +; SSE-NEXT: movups %xmm0, (%r14) +; SSE-NEXT: movups (%rbx), %xmm0 +; SSE-NEXT: movups %xmm0, (%r12) +; SSE-NEXT: addq $8, %rsp +; SSE-NEXT: .cfi_def_cfa_offset 40 +; SSE-NEXT: popq %rbx +; SSE-NEXT: .cfi_def_cfa_offset 32 +; SSE-NEXT: popq %r12 +; SSE-NEXT: .cfi_def_cfa_offset 24 +; SSE-NEXT: popq %r14 +; SSE-NEXT: .cfi_def_cfa_offset 16 +; SSE-NEXT: popq %r15 +; SSE-NEXT: .cfi_def_cfa_offset 8 +; SSE-NEXT: retq +; +; AVX-LABEL: test_limit_one_pred: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %r15 +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: pushq %r14 +; AVX-NEXT: .cfi_def_cfa_offset 24 +; AVX-NEXT: pushq %r12 +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: pushq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 40 +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: .cfi_offset %rbx, -40 +; AVX-NEXT: .cfi_offset %r12, -32 +; AVX-NEXT: .cfi_offset %r14, -24 +; AVX-NEXT: .cfi_offset %r15, -16 +; 
AVX-NEXT: movq %r8, %r15 +; AVX-NEXT: movq %rcx, %r14 +; AVX-NEXT: movq %rsi, %r12 +; AVX-NEXT: movq %rdi, %rbx +; AVX-NEXT: movl %r9d, 12(%rdi) +; AVX-NEXT: cmpl $18, %edx +; AVX-NEXT: jl .LBB10_2 +; AVX-NEXT: # %bb.1: # %if.then +; AVX-NEXT: movl %edx, 4(%rbx) +; AVX-NEXT: movq %rbx, %rdi +; AVX-NEXT: callq bar@PLT +; AVX-NEXT: .LBB10_2: # %if.end +; AVX-NEXT: vmovups (%r15), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%r14) +; AVX-NEXT: vmovups (%rbx), %xmm0 +; AVX-NEXT: vmovups %xmm0, (%r12) +; AVX-NEXT: addq $8, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 40 +; AVX-NEXT: popq %rbx +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: popq %r12 +; AVX-NEXT: .cfi_def_cfa_offset 24 +; AVX-NEXT: popq %r14 +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: popq %r15 +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq entry: %d = getelementptr inbounds %struct.S, ptr %s1, i64 0, i32 3 store i32 %x2, ptr %d, align 4 @@ -1207,83 +723,36 @@ ; Function Attrs: nounwind uwtable define void @test_conditional_block_float(ptr nocapture noalias %s1, ptr nocapture %s2, i32 %x, ptr nocapture %s3, ptr nocapture readonly %s4, float %y) local_unnamed_addr #0 { -; CHECK-LABEL: test_conditional_block_float: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpl $18, %edx -; CHECK-NEXT: jl .LBB11_2 -; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000 -; CHECK-NEXT: .LBB11_2: # %if.end -; CHECK-NEXT: movups (%r8), %xmm0 -; CHECK-NEXT: movups 16(%r8), %xmm1 -; CHECK-NEXT: movups %xmm1, 16(%rcx) -; CHECK-NEXT: movups %xmm0, (%rcx) -; CHECK-NEXT: movl (%rdi), %eax -; CHECK-NEXT: movl 4(%rdi), %ecx -; CHECK-NEXT: movq 8(%rdi), %rdx -; CHECK-NEXT: movups 16(%rdi), %xmm0 -; CHECK-NEXT: movups %xmm0, 16(%rsi) -; CHECK-NEXT: movl %eax, (%rsi) -; CHECK-NEXT: movl %ecx, 4(%rsi) -; CHECK-NEXT: movq %rdx, 8(%rsi) -; CHECK-NEXT: retq -; -; DISABLED-LABEL: test_conditional_block_float: -; DISABLED: # %bb.0: # %entry -; DISABLED-NEXT: cmpl $18, %edx -; DISABLED-NEXT: jl .LBB11_2 -; 
DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000 -; DISABLED-NEXT: .LBB11_2: # %if.end -; DISABLED-NEXT: movups (%r8), %xmm0 -; DISABLED-NEXT: movups 16(%r8), %xmm1 -; DISABLED-NEXT: movups %xmm1, 16(%rcx) -; DISABLED-NEXT: movups %xmm0, (%rcx) -; DISABLED-NEXT: movups (%rdi), %xmm0 -; DISABLED-NEXT: movups 16(%rdi), %xmm1 -; DISABLED-NEXT: movups %xmm1, 16(%rsi) -; DISABLED-NEXT: movups %xmm0, (%rsi) -; DISABLED-NEXT: retq -; -; CHECK-AVX2-LABEL: test_conditional_block_float: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: cmpl $18, %edx -; CHECK-AVX2-NEXT: jl .LBB11_2 -; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000 -; CHECK-AVX2-NEXT: .LBB11_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %ymm0 -; CHECK-AVX2-NEXT: vmovups %ymm0, (%rcx) -; CHECK-AVX2-NEXT: movl (%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, (%rsi) -; CHECK-AVX2-NEXT: movl 4(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX2-NEXT: vmovups 8(%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, 8(%rsi) -; CHECK-AVX2-NEXT: movq 24(%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, 24(%rsi) -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_conditional_block_float: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: cmpl $18, %edx -; CHECK-AVX512-NEXT: jl .LBB11_2 -; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000 -; CHECK-AVX512-NEXT: .LBB11_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %ymm0 -; CHECK-AVX512-NEXT: vmovups %ymm0, (%rcx) -; CHECK-AVX512-NEXT: movl (%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, (%rsi) -; CHECK-AVX512-NEXT: movl 4(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 4(%rsi) -; CHECK-AVX512-NEXT: vmovups 8(%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, 8(%rsi) -; CHECK-AVX512-NEXT: movq 24(%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, 24(%rsi) -; CHECK-AVX512-NEXT: 
vzeroupper -; CHECK-AVX512-NEXT: retq +; SSE-LABEL: test_conditional_block_float: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cmpl $18, %edx +; SSE-NEXT: jl .LBB11_2 +; SSE-NEXT: # %bb.1: # %if.then +; SSE-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000 +; SSE-NEXT: .LBB11_2: # %if.end +; SSE-NEXT: movups (%r8), %xmm0 +; SSE-NEXT: movups 16(%r8), %xmm1 +; SSE-NEXT: movups %xmm1, 16(%rcx) +; SSE-NEXT: movups %xmm0, (%rcx) +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: movups 16(%rdi), %xmm1 +; SSE-NEXT: movups %xmm1, 16(%rsi) +; SSE-NEXT: movups %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX-LABEL: test_conditional_block_float: +; AVX: # %bb.0: # %entry +; AVX-NEXT: cmpl $18, %edx +; AVX-NEXT: jl .LBB11_2 +; AVX-NEXT: # %bb.1: # %if.then +; AVX-NEXT: movl $1065353216, 4(%rdi) # imm = 0x3F800000 +; AVX-NEXT: .LBB11_2: # %if.end +; AVX-NEXT: vmovups (%r8), %ymm0 +; AVX-NEXT: vmovups %ymm0, (%rcx) +; AVX-NEXT: vmovups (%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq entry: %cmp = icmp sgt i32 %x, 17 br i1 %cmp, label %if.then, label %if.end @@ -1303,77 +772,36 @@ ; Function Attrs: nounwind uwtable define void @test_conditional_block_ymm(ptr nocapture noalias %s1, ptr nocapture %s2, i32 %x, ptr nocapture %s3, ptr nocapture readonly %s4) local_unnamed_addr #0 { -; CHECK-LABEL: test_conditional_block_ymm: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: cmpl $18, %edx -; CHECK-NEXT: jl .LBB12_2 -; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: movq $1, 8(%rdi) -; CHECK-NEXT: .LBB12_2: # %if.end -; CHECK-NEXT: movups (%r8), %xmm0 -; CHECK-NEXT: movups 16(%r8), %xmm1 -; CHECK-NEXT: movups %xmm1, 16(%rcx) -; CHECK-NEXT: movups %xmm0, (%rcx) -; CHECK-NEXT: movq (%rdi), %rax -; CHECK-NEXT: movq 8(%rdi), %rcx -; CHECK-NEXT: movups 16(%rdi), %xmm0 -; CHECK-NEXT: movups %xmm0, 16(%rsi) -; CHECK-NEXT: movq %rax, (%rsi) -; CHECK-NEXT: movq %rcx, 8(%rsi) -; CHECK-NEXT: retq -; -; DISABLED-LABEL: test_conditional_block_ymm: -; DISABLED: # %bb.0: # %entry 
-; DISABLED-NEXT: cmpl $18, %edx -; DISABLED-NEXT: jl .LBB12_2 -; DISABLED-NEXT: # %bb.1: # %if.then -; DISABLED-NEXT: movq $1, 8(%rdi) -; DISABLED-NEXT: .LBB12_2: # %if.end -; DISABLED-NEXT: movups (%r8), %xmm0 -; DISABLED-NEXT: movups 16(%r8), %xmm1 -; DISABLED-NEXT: movups %xmm1, 16(%rcx) -; DISABLED-NEXT: movups %xmm0, (%rcx) -; DISABLED-NEXT: movups (%rdi), %xmm0 -; DISABLED-NEXT: movups 16(%rdi), %xmm1 -; DISABLED-NEXT: movups %xmm1, 16(%rsi) -; DISABLED-NEXT: movups %xmm0, (%rsi) -; DISABLED-NEXT: retq -; -; CHECK-AVX2-LABEL: test_conditional_block_ymm: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: cmpl $18, %edx -; CHECK-AVX2-NEXT: jl .LBB12_2 -; CHECK-AVX2-NEXT: # %bb.1: # %if.then -; CHECK-AVX2-NEXT: movq $1, 8(%rdi) -; CHECK-AVX2-NEXT: .LBB12_2: # %if.end -; CHECK-AVX2-NEXT: vmovups (%r8), %ymm0 -; CHECK-AVX2-NEXT: vmovups %ymm0, (%rcx) -; CHECK-AVX2-NEXT: movq (%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, (%rsi) -; CHECK-AVX2-NEXT: movq 8(%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, 8(%rsi) -; CHECK-AVX2-NEXT: vmovups 16(%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, 16(%rsi) -; CHECK-AVX2-NEXT: vzeroupper -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_conditional_block_ymm: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: cmpl $18, %edx -; CHECK-AVX512-NEXT: jl .LBB12_2 -; CHECK-AVX512-NEXT: # %bb.1: # %if.then -; CHECK-AVX512-NEXT: movq $1, 8(%rdi) -; CHECK-AVX512-NEXT: .LBB12_2: # %if.end -; CHECK-AVX512-NEXT: vmovups (%r8), %ymm0 -; CHECK-AVX512-NEXT: vmovups %ymm0, (%rcx) -; CHECK-AVX512-NEXT: movq (%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, (%rsi) -; CHECK-AVX512-NEXT: movq 8(%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, 8(%rsi) -; CHECK-AVX512-NEXT: vmovups 16(%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, 16(%rsi) -; CHECK-AVX512-NEXT: vzeroupper -; CHECK-AVX512-NEXT: retq +; SSE-LABEL: test_conditional_block_ymm: +; SSE: # %bb.0: # %entry +; SSE-NEXT: cmpl $18, %edx +; SSE-NEXT: jl .LBB12_2 +; SSE-NEXT: # %bb.1: # 
%if.then +; SSE-NEXT: movq $1, 8(%rdi) +; SSE-NEXT: .LBB12_2: # %if.end +; SSE-NEXT: movups (%r8), %xmm0 +; SSE-NEXT: movups 16(%r8), %xmm1 +; SSE-NEXT: movups %xmm1, 16(%rcx) +; SSE-NEXT: movups %xmm0, (%rcx) +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: movups 16(%rdi), %xmm1 +; SSE-NEXT: movups %xmm1, 16(%rsi) +; SSE-NEXT: movups %xmm0, (%rsi) +; SSE-NEXT: retq +; +; AVX-LABEL: test_conditional_block_ymm: +; AVX: # %bb.0: # %entry +; AVX-NEXT: cmpl $18, %edx +; AVX-NEXT: jl .LBB12_2 +; AVX-NEXT: # %bb.1: # %if.then +; AVX-NEXT: movq $1, 8(%rdi) +; AVX-NEXT: .LBB12_2: # %if.end +; AVX-NEXT: vmovups (%r8), %ymm0 +; AVX-NEXT: vmovups %ymm0, (%rcx) +; AVX-NEXT: vmovups (%rdi), %ymm0 +; AVX-NEXT: vmovups %ymm0, (%rsi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq entry: %cmp = icmp sgt i32 %x, 17 br i1 %cmp, label %if.then, label %if.end @@ -1390,33 +818,19 @@ } define dso_local void @test_alias(ptr nocapture %A, i32 %x) local_unnamed_addr #0 { -; CHECK-LABEL: test_alias: -; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %esi, (%rdi) -; CHECK-NEXT: movups (%rdi), %xmm0 -; CHECK-NEXT: movups %xmm0, 4(%rdi) -; CHECK-NEXT: retq -; -; DISABLED-LABEL: test_alias: -; DISABLED: # %bb.0: # %entry -; DISABLED-NEXT: movl %esi, (%rdi) -; DISABLED-NEXT: movups (%rdi), %xmm0 -; DISABLED-NEXT: movups %xmm0, 4(%rdi) -; DISABLED-NEXT: retq -; -; CHECK-AVX2-LABEL: test_alias: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: movl %esi, (%rdi) -; CHECK-AVX2-NEXT: vmovups (%rdi), %xmm0 -; CHECK-AVX2-NEXT: vmovups %xmm0, 4(%rdi) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_alias: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: movl %esi, (%rdi) -; CHECK-AVX512-NEXT: vmovups (%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovups %xmm0, 4(%rdi) -; CHECK-AVX512-NEXT: retq +; SSE-LABEL: test_alias: +; SSE: # %bb.0: # %entry +; SSE-NEXT: movl %esi, (%rdi) +; SSE-NEXT: movups (%rdi), %xmm0 +; SSE-NEXT: movups %xmm0, 4(%rdi) +; SSE-NEXT: retq +; +; AVX-LABEL: test_alias: +; AVX: # 
%bb.0: # %entry +; AVX-NEXT: movl %esi, (%rdi) +; AVX-NEXT: vmovups (%rdi), %xmm0 +; AVX-NEXT: vmovups %xmm0, 4(%rdi) +; AVX-NEXT: retq entry: store i32 %x, ptr %A, align 4 %add.ptr = getelementptr inbounds i8, ptr %A, i64 4 @@ -1444,27 +858,16 @@ ; DISABLED-NEXT: movups %xmm0, 20(%rdi) ; DISABLED-NEXT: retq ; -; CHECK-AVX2-LABEL: test_noalias: -; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: movl %esi, (%rdi) -; CHECK-AVX2-NEXT: movl (%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 20(%rdi) -; CHECK-AVX2-NEXT: movq 4(%rdi), %rax -; CHECK-AVX2-NEXT: movq %rax, 24(%rdi) -; CHECK-AVX2-NEXT: movl 12(%rdi), %eax -; CHECK-AVX2-NEXT: movl %eax, 32(%rdi) -; CHECK-AVX2-NEXT: retq -; -; CHECK-AVX512-LABEL: test_noalias: -; CHECK-AVX512: # %bb.0: # %entry -; CHECK-AVX512-NEXT: movl %esi, (%rdi) -; CHECK-AVX512-NEXT: movl (%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 20(%rdi) -; CHECK-AVX512-NEXT: movq 4(%rdi), %rax -; CHECK-AVX512-NEXT: movq %rax, 24(%rdi) -; CHECK-AVX512-NEXT: movl 12(%rdi), %eax -; CHECK-AVX512-NEXT: movl %eax, 32(%rdi) -; CHECK-AVX512-NEXT: retq +; AVX-LABEL: test_noalias: +; AVX: # %bb.0: # %entry +; AVX-NEXT: movl %esi, (%rdi) +; AVX-NEXT: movl (%rdi), %eax +; AVX-NEXT: movl %eax, 20(%rdi) +; AVX-NEXT: movq 4(%rdi), %rax +; AVX-NEXT: movq %rax, 24(%rdi) +; AVX-NEXT: movl 12(%rdi), %eax +; AVX-NEXT: movl %eax, 32(%rdi) +; AVX-NEXT: retq entry: store i32 %x, ptr %A, align 4 %add.ptr = getelementptr inbounds i8, ptr %A, i64 20 @@ -1476,3 +879,6 @@ declare void @llvm.memcpy.p0.p0.i64(ptr nocapture writeonly, ptr nocapture readonly, i64, i32, i1) #1 attributes #0 = { nounwind uwtable } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK-AVX2: {{.*}} +; CHECK-AVX512: {{.*}} Index: llvm/test/CodeGen/X86/opt-pipeline.ll =================================================================== --- llvm/test/CodeGen/X86/opt-pipeline.ll +++ llvm/test/CodeGen/X86/opt-pipeline.ll @@ -118,6 +118,9 @@ ; CHECK-NEXT: Lazy Machine Block Frequency Analysis ; CHECK-NEXT: X86 LEA Optimize ; CHECK-NEXT: X86 Optimize Call Frame +; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: Machine Block Frequency Analysis ; CHECK-NEXT: X86 Avoid Store Forwarding Block ; CHECK-NEXT: X86 speculative load hardening ; CHECK-NEXT: MachineDominator Tree Construction