Index: lib/Target/X86/X86FixupSFB.cpp =================================================================== --- lib/Target/X86/X86FixupSFB.cpp +++ lib/Target/X86/X86FixupSFB.cpp @@ -52,14 +52,21 @@ #define DEBUG_TYPE "x86-fixup-SFB" +namespace llvm { +void initializeX86FixupSFBPassPass(PassRegistry &); +} // end namespace llvm + static cl::opt DisableX86FixupSFB("disable-fixup-SFB", cl::Hidden, cl::desc("X86: Disable SFB fixup."), cl::init(false)); namespace { -class FixupSFBPass : public MachineFunctionPass { +class X86FixupSFBPass : public MachineFunctionPass { public: - FixupSFBPass() : MachineFunctionPass(ID) {} + static char ID; + X86FixupSFBPass() : MachineFunctionPass(ID) { + initializeX86FixupSFBPassPass(*PassRegistry::getPassRegistry()); + } StringRef getPassName() const override { return "X86 Fixup Store Forward Block"; @@ -67,6 +74,11 @@ bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired(); + } + private: MachineRegisterInfo *MRI; const X86InstrInfo *TII; @@ -74,6 +86,7 @@ SmallVector, 2> BlockedLoadsStores; SmallVector ForRemoval; bool Is64Bit; + AliasAnalysis *AA; /// \brief Returns couples of Load then Store to memory which look /// like a memcpy. @@ -94,15 +107,22 @@ int64_t StoreDisp, unsigned Size, int64_t LMMOffset, int64_t SMMOffset); + bool alias(const MachineMemOperand &Op1, const MachineMemOperand &Op2) const; + unsigned getRegSizeInBytes(MachineInstr *Inst); - static char ID; }; } // end anonymous namespace -char FixupSFBPass::ID = 0; +char X86FixupSFBPass::ID = 0; -FunctionPass *llvm::createX86FixupSFB() { return new FixupSFBPass(); } +INITIALIZE_PASS_BEGIN(X86FixupSFBPass, DEBUG_TYPE, "Machine code sinking", false, + false) +INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_END(X86FixupSFBPass, DEBUG_TYPE, "Machine code sinking", false, + false) + +FunctionPass *llvm::createX86FixupSFB() { return new X86FixupSFBPass(); } static bool isXMMLoadOpcode(unsigned Opcode) { return Opcode == X86::MOVUPSrm || Opcode == X86::MOVAPSrm || @@ -315,7 +335,7 @@ return PotentialBlockers; } -void FixupSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, +void X86FixupSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode, int64_t LoadDisp, MachineInstr *StoreInst, unsigned NStoreOpcode, int64_t StoreDisp, unsigned Size, int64_t LMMOffset, @@ -354,7 +374,7 @@ DEBUG(StInst->getPrevNode()->dump()); } -void FixupSFBPass::buildCopies(int Size, MachineInstr *LoadInst, +void X86FixupSFBPass::buildCopies(int Size, MachineInstr *LoadInst, int64_t LdDispImm, MachineInstr *StoreInst, int64_t StDispImm, int64_t LMMOffset, int64_t SMMOffset) { @@ -437,7 +457,26 @@ } } -void FixupSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) { +static int getBaseOperandValue(MachineOperand &BaseOperand) { + return BaseOperand.isReg() ? BaseOperand.getReg() : BaseOperand.getIndex(); +} + +bool X86FixupSFBPass::alias(const MachineMemOperand &Op1, + const MachineMemOperand &Op2) const { + if (!Op1.getValue() || !Op2.getValue()) + return true; + + int64_t MinOffset = std::min(Op1.getOffset(), Op2.getOffset()); + int64_t Overlapa = Op1.getSize() + Op1.getOffset() - MinOffset; + int64_t Overlapb = Op2.getSize() + Op2.getOffset() - MinOffset; + + AliasResult AAResult = + AA->alias(MemoryLocation(Op1.getValue(), Overlapa, Op1.getAAInfo()), + MemoryLocation(Op2.getValue(), Overlapb, Op2.getAAInfo())); + return AAResult != NoAlias; +} + +void X86FixupSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) { for (auto &MBB : MF) for (auto &MI : MBB) if (isPotentialBlockedMemCpyLd(MI.getOpcode())) { @@ -450,20 +489,24 @@ if (isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) && (StoreMI.getParent() == MI.getParent())) + // Skip cases where the memcpy may overlap. if (isRelevantAddressingMode(&MI) && - isRelevantAddressingMode(&StoreMI)) + isRelevantAddressingMode(&StoreMI) && + !alias(**MI.memoperands_begin(), + **StoreMI.memoperands_begin())) { BlockedLoadsStores.push_back( std::pair(&MI, &StoreMI)); + } } } } -unsigned FixupSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) { +unsigned X86FixupSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) { auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI, *LoadInst->getParent()->getParent()); return TRI->getRegSizeInBits(*TRC) / 8; } -void FixupSFBPass::breakBlockedCopies( +void X86FixupSFBPass::breakBlockedCopies( MachineInstr *LoadInst, MachineInstr *StoreInst, const std::map &BlockingStoresDisp) { int64_t LdDispImm = getDispOperand(LoadInst).getImm(); @@ -497,7 +540,7 @@ LMMOffset); } -bool FixupSFBPass::runOnMachineFunction(MachineFunction &MF) { +bool X86FixupSFBPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; if (DisableX86FixupSFB || skipFunction(MF.getFunction())) @@ -508,6 +551,7 @@ TII = MF.getSubtarget().getInstrInfo(); TRI = MF.getSubtarget().getRegisterInfo(); Is64Bit = MF.getSubtarget().is64Bit(); + AA = &getAnalysis().getAAResults(); DEBUG(dbgs() << "Start X86FixupSFB\n";); // Look for a load then a store to XMM/YMM which look like a memcpy findPotentiallylBlockedCopies(MF); @@ -520,7 +564,7 @@ MachineOperand &LoadBase = getBaseOperand(LoadInst); int64_t LdDispImm = getDispOperand(LoadInst).getImm(); std::map BlockingStoresDisp; - int LdBaseReg = LoadBase.isReg() ? LoadBase.getReg() : LoadBase.getIndex(); + int LdBase = getBaseOperandValue(LoadBase); for (auto PBInst : PotentialBlockers) { if (isPotentialBlockingStoreInst(PBInst->getOpcode(), @@ -531,15 +575,14 @@ int64_t PBstDispImm = getDispOperand(PBInst).getImm(); assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand"); unsigned PBstSize = (*PBInst->memoperands_begin())->getSize(); - int PBstBaseReg = - PBstoreBase.isReg() ? PBstoreBase.getReg() : PBstoreBase.getIndex(); + int PBstBase = getBaseOperandValue(PBstoreBase); // This check doesn't cover all cases, but it will suffice for now. // TODO: take branch probability into consideration, if the blocking // store is in an unreached block, breaking the memcopy could lose // performance. if (((LoadBase.isReg() && PBstoreBase.isReg()) || (LoadBase.isFI() && PBstoreBase.isFI())) && - LdBaseReg == PBstBaseReg && + LdBase == PBstBase && ((PBstDispImm >= LdDispImm) && (PBstDispImm <= LdDispImm + (getRegSizeInBytes(LoadInst) - PBstSize)))) { Index: lib/Target/X86/X86TargetMachine.cpp =================================================================== --- lib/Target/X86/X86TargetMachine.cpp +++ lib/Target/X86/X86TargetMachine.cpp @@ -62,6 +62,7 @@ void initializeX86CmovConverterPassPass(PassRegistry &); void initializeX86ExecutionDomainFixPass(PassRegistry &); void initializeX86DomainReassignmentPass(PassRegistry &); +void initializeX86FixupSFBPassPass(PassRegistry &); } // end namespace llvm @@ -80,6 +81,7 @@ initializeX86CmovConverterPassPass(PR); initializeX86ExecutionDomainFixPass(PR); initializeX86DomainReassignmentPass(PR); + initializeX86FixupSFBPassPass(PR); } static std::unique_ptr createTLOF(const Triple &TT) { Index: test/CodeGen/X86/fixup-sfb-32.ll =================================================================== --- test/CodeGen/X86/fixup-sfb-32.ll +++ test/CodeGen/X86/fixup-sfb-32.ll @@ -7,7 +7,7 @@ %struct.S = type { i32, i32, i32, i32 } ; Function Attrs: nounwind uwtable -define void @test_conditional_block(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4) local_unnamed_addr #0 { +define void @test_conditional_block(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4) local_unnamed_addr #0 { ; CHECK-LABEL: test_conditional_block: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %edi @@ -149,7 +149,7 @@ } ; Function Attrs: nounwind uwtable -define void @test_imm_store(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3) local_unnamed_addr #0 { +define void @test_imm_store(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3) local_unnamed_addr #0 { ; CHECK-LABEL: test_imm_store: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -223,7 +223,7 @@ } ; Function Attrs: nounwind uwtable -define void @test_nondirect_br(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { +define void @test_nondirect_br(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { ; CHECK-LABEL: test_nondirect_br: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %edi @@ -398,7 +398,7 @@ } ; Function Attrs: nounwind uwtable -define void @test_2preds_block(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { +define void @test_2preds_block(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { ; CHECK-LABEL: test_2preds_block: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %ebx @@ -567,7 +567,7 @@ %struct.S2 = type { i64, i64 } ; Function Attrs: nounwind uwtable -define void @test_type64(%struct.S2* nocapture %s1, %struct.S2* nocapture %s2, i32 %x, %struct.S2* nocapture %s3, %struct.S2* nocapture readonly %s4) local_unnamed_addr #0 { +define void @test_type64(%struct.S2* nocapture noalias %s1, %struct.S2* nocapture %s2, i32 %x, %struct.S2* nocapture %s3, %struct.S2* nocapture readonly %s4) local_unnamed_addr #0 { ; CHECK-LABEL: test_type64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %edi @@ -719,7 +719,7 @@ %struct.S3 = type { i64, i8, i8, i16, i32 } ; Function Attrs: noinline nounwind uwtable -define void @test_mixed_type(%struct.S3* nocapture %s1, %struct.S3* nocapture %s2, i32 %x, %struct.S3* nocapture readnone %s3, %struct.S3* nocapture readnone %s4) local_unnamed_addr #0 { +define void @test_mixed_type(%struct.S3* nocapture noalias %s1, %struct.S3* nocapture %s2, i32 %x, %struct.S3* nocapture readnone %s3, %struct.S3* nocapture readnone %s4) local_unnamed_addr #0 { ; CHECK-LABEL: test_mixed_type: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %esi @@ -859,7 +859,7 @@ %struct.S4 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } ; Function Attrs: nounwind uwtable -define void @test_multiple_blocks(%struct.S4* nocapture %s1, %struct.S4* nocapture %s2) local_unnamed_addr #0 { +define void @test_multiple_blocks(%struct.S4* nocapture noalias %s1, %struct.S4* nocapture %s2) local_unnamed_addr #0 { ; CHECK-LABEL: test_multiple_blocks: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -966,7 +966,7 @@ %struct.S5 = type { i16, i16, i16, i16, i16, i16, i16, i16 } ; Function Attrs: nounwind uwtable -define void @test_type16(%struct.S5* nocapture %s1, %struct.S5* nocapture %s2, i32 %x, %struct.S5* nocapture %s3, %struct.S5* nocapture readonly %s4) local_unnamed_addr #0 { +define void @test_type16(%struct.S5* nocapture noalias %s1, %struct.S5* nocapture %s2, i32 %x, %struct.S5* nocapture %s3, %struct.S5* nocapture readonly %s4) local_unnamed_addr #0 { ; CHECK-LABEL: test_type16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %edi @@ -1120,8 +1120,14 @@ define void @test_stack(%struct.S6* noalias nocapture sret %agg.result, %struct.S6* byval nocapture readnone align 8 %s1, %struct.S6* byval nocapture align 8 %s2, i32 %x) local_unnamed_addr #0 { ; CHECK-LABEL: test_stack: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: pushl %eax +; CHECK-NEXT: pushl %edi ; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: pushl %esi +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: pushl %eax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset %esi, -12 +; CHECK-NEXT: .cfi_offset %edi, -8 ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1135,7 +1141,19 @@ ; CHECK-NEXT: movl %ecx, 24(%eax) ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-NEXT: movl %ecx, 28(%eax) -; CHECK-NEXT: popl %ecx +; CHECK-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-NEXT: movups %xmm0, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-NEXT: addl $4, %esp +; CHECK-NEXT: popl %esi +; CHECK-NEXT: popl %edi ; CHECK-NEXT: retl $4 ; ; DISABLED-LABEL: test_stack: @@ -1143,19 +1161,29 @@ ; DISABLED-NEXT: pushl %eax ; DISABLED-NEXT: .cfi_def_cfa_offset 8 ; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax -; DISABLED-NEXT: movl %eax, {{[0-9]+}}(%esp) -; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax +; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DISABLED-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; DISABLED-NEXT: movups {{[0-9]+}}(%esp), %xmm0 ; DISABLED-NEXT: movups %xmm0, (%eax) ; DISABLED-NEXT: movups {{[0-9]+}}(%esp), %xmm0 ; DISABLED-NEXT: movups %xmm0, 16(%eax) +; DISABLED-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; DISABLED-NEXT: movups {{[0-9]+}}(%esp), %xmm1 +; DISABLED-NEXT: movups %xmm0, {{[0-9]+}}(%esp) +; DISABLED-NEXT: movups %xmm1, {{[0-9]+}}(%esp) ; DISABLED-NEXT: popl %ecx ; DISABLED-NEXT: retl $4 ; ; CHECK-AVX2-LABEL: test_stack: ; CHECK-AVX2: # %bb.0: # %entry -; CHECK-AVX2-NEXT: pushl %eax +; CHECK-AVX2-NEXT: pushl %edi ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 8 +; CHECK-AVX2-NEXT: pushl %esi +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 12 +; CHECK-AVX2-NEXT: pushl %eax +; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 16 +; CHECK-AVX2-NEXT: .cfi_offset %esi, -12 +; CHECK-AVX2-NEXT: .cfi_offset %edi, -8 ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-AVX2-NEXT: movl %eax, {{[0-9]+}}(%esp) ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1169,7 +1197,19 @@ ; CHECK-AVX2-NEXT: movl %ecx, 24(%eax) ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-AVX2-NEXT: movl %ecx, 28(%eax) -; CHECK-AVX2-NEXT: popl %ecx +; CHECK-AVX2-NEXT: movups {{[0-9]+}}(%esp), %xmm0 +; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edx +; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %esi +; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %edi +; CHECK-AVX2-NEXT: movups %xmm0, {{[0-9]+}}(%esp) +; CHECK-AVX2-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-AVX2-NEXT: movl %edx, {{[0-9]+}}(%esp) +; CHECK-AVX2-NEXT: movl %esi, {{[0-9]+}}(%esp) +; CHECK-AVX2-NEXT: movl %edi, {{[0-9]+}}(%esp) +; CHECK-AVX2-NEXT: addl $4, %esp +; CHECK-AVX2-NEXT: popl %esi +; CHECK-AVX2-NEXT: popl %edi ; CHECK-AVX2-NEXT: retl $4 ; ; CHECK-AVX512-LABEL: test_stack: @@ -1183,12 +1223,22 @@ ; CHECK-AVX512-NEXT: movl %ecx, 16(%eax) ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-AVX512-NEXT: movl %ecx, 20(%eax) -; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-AVX512-NEXT: movl %ecx, 24(%eax) -; CHECK-AVX512-NEXT: vmovups %xmm0, (%eax) ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-AVX512-NEXT: movl %ecx, 28(%eax) +; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-AVX512-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-AVX512-NEXT: vmovups %xmm0, (%eax) +; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-AVX512-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-AVX512-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; CHECK-AVX512-NEXT: vmovups %xmm0, {{[0-9]+}}(%esp) +; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-AVX512-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; CHECK-AVX512-NEXT: popl %ecx ; CHECK-AVX512-NEXT: retl $4 entry: @@ -1196,12 +1246,15 @@ %s6.sroa.3.0..sroa_idx4 = getelementptr inbounds %struct.S6, %struct.S6* %s2, i64 0, i32 3 store i32 %x, i32* %s6.sroa.3.0..sroa_idx4, align 8 %0 = bitcast %struct.S6* %agg.result to i8* + %s6.sroa.0.0..sroa_cast2 = bitcast %struct.S6* %s1 to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* nonnull %s6.sroa.0.0..sroa_cast1, i64 32, i32 4, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %s6.sroa.0.0..sroa_cast2, i8* nonnull %s6.sroa.0.0..sroa_cast1, i64 32, i32 4, i1 false) + ret void } ; Function Attrs: nounwind uwtable -define void @test_limit_all(%struct.S* %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { +define void @test_limit_all(%struct.S* noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { ; CHECK-LABEL: test_limit_all: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %ebp @@ -1397,7 +1450,7 @@ } ; Function Attrs: nounwind uwtable -define void @test_limit_one_pred(%struct.S* %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { +define void @test_limit_one_pred(%struct.S* noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { ; CHECK-LABEL: test_limit_one_pred: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %ebp @@ -1613,7 +1666,7 @@ %struct.S7 = type { float, float, float , float, float, float, float, float } ; Function Attrs: nounwind uwtable -define void @test_conditional_block_float(%struct.S7* nocapture %s1, %struct.S7* nocapture %s2, i32 %x, %struct.S7* nocapture %s3, %struct.S7* nocapture readonly %s4, float %y) local_unnamed_addr #0 { +define void @test_conditional_block_float(%struct.S7* nocapture noalias %s1, %struct.S7* nocapture %s2, i32 %x, %struct.S7* nocapture %s3, %struct.S7* nocapture readonly %s4, float %y) local_unnamed_addr #0 { ; CHECK-LABEL: test_conditional_block_float: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %ebx @@ -1768,7 +1821,7 @@ %struct.S8 = type { i64, i64, i64, i64, i64, i64 } ; Function Attrs: nounwind uwtable -define void @test_conditional_block_ymm(%struct.S8* nocapture %s1, %struct.S8* nocapture %s2, i32 %x, %struct.S8* nocapture %s3, %struct.S8* nocapture readonly %s4) local_unnamed_addr #0 { +define void @test_conditional_block_ymm(%struct.S8* nocapture noalias %s1, %struct.S8* nocapture %s2, i32 %x, %struct.S8* nocapture %s3, %struct.S8* nocapture readonly %s4) local_unnamed_addr #0 { ; CHECK-LABEL: test_conditional_block_ymm: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushl %ebx @@ -1923,4 +1976,111 @@ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 32, i32 4, i1 false) ret void } +define dso_local void @test_alias(i8* nocapture %A, i32 %x) local_unnamed_addr #0 { +; CHECK-LABEL: test_alias: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %eax, (%ecx) +; CHECK-NEXT: movups (%ecx), %xmm0 +; CHECK-NEXT: movups %xmm0, 4(%ecx) +; CHECK-NEXT: retl +; +; DISABLED-LABEL: test_alias: +; DISABLED: # %bb.0: # %entry +; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax +; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DISABLED-NEXT: movl %eax, (%ecx) +; DISABLED-NEXT: movups (%ecx), %xmm0 +; DISABLED-NEXT: movups %xmm0, 4(%ecx) +; DISABLED-NEXT: retl +; +; CHECK-AVX2-LABEL: test_alias: +; CHECK-AVX2: # %bb.0: # %entry +; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-AVX2-NEXT: movl %eax, (%ecx) +; CHECK-AVX2-NEXT: movups (%ecx), %xmm0 +; CHECK-AVX2-NEXT: movups %xmm0, 4(%ecx) +; CHECK-AVX2-NEXT: retl +; +; CHECK-AVX512-LABEL: test_alias: +; CHECK-AVX512: # %bb.0: # %entry +; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-AVX512-NEXT: movl %eax, (%ecx) +; CHECK-AVX512-NEXT: vmovups (%ecx), %xmm0 +; CHECK-AVX512-NEXT: vmovups %xmm0, 4(%ecx) +; CHECK-AVX512-NEXT: retl +entry: + %a = bitcast i8* %A to i32* + store i32 %x, i32* %a, align 4 + %add.ptr = getelementptr inbounds i8, i8* %A, i64 4 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr, i8* align 4 %A, i64 16, i32 4, i1 false) + ret void +} + +; Function Attrs: nounwind uwtable +define dso_local void @test_noalias(i8* nocapture %A, i32 %x) local_unnamed_addr #0 { +; CHECK-LABEL: test_noalias: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: movl %eax, (%ecx) +; CHECK-NEXT: movl (%ecx), %eax +; CHECK-NEXT: movl %eax, 20(%ecx) +; CHECK-NEXT: movl 4(%ecx), %eax +; CHECK-NEXT: movl %eax, 24(%ecx) +; CHECK-NEXT: movl 8(%ecx), %eax +; CHECK-NEXT: movl %eax, 28(%ecx) +; CHECK-NEXT: movl 12(%ecx), %eax +; CHECK-NEXT: movl %eax, 32(%ecx) +; CHECK-NEXT: retl +; +; DISABLED-LABEL: test_noalias: +; DISABLED: # %bb.0: # %entry +; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %eax +; DISABLED-NEXT: movl {{[0-9]+}}(%esp), %ecx +; DISABLED-NEXT: movl %eax, (%ecx) +; DISABLED-NEXT: movups (%ecx), %xmm0 +; DISABLED-NEXT: movups %xmm0, 20(%ecx) +; DISABLED-NEXT: retl +; +; CHECK-AVX2-LABEL: test_noalias: +; CHECK-AVX2: # %bb.0: # %entry +; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-AVX2-NEXT: movl %eax, (%ecx) +; CHECK-AVX2-NEXT: movl (%ecx), %eax +; CHECK-AVX2-NEXT: movl %eax, 20(%ecx) +; CHECK-AVX2-NEXT: movl 4(%ecx), %eax +; CHECK-AVX2-NEXT: movl %eax, 24(%ecx) +; CHECK-AVX2-NEXT: movl 8(%ecx), %eax +; CHECK-AVX2-NEXT: movl %eax, 28(%ecx) +; CHECK-AVX2-NEXT: movl 12(%ecx), %eax +; CHECK-AVX2-NEXT: movl %eax, 32(%ecx) +; CHECK-AVX2-NEXT: retl +; +; CHECK-AVX512-LABEL: test_noalias: +; CHECK-AVX512: # %bb.0: # %entry +; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx +; CHECK-AVX512-NEXT: movl %eax, (%ecx) +; CHECK-AVX512-NEXT: movl (%ecx), %eax +; CHECK-AVX512-NEXT: movl %eax, 20(%ecx) +; CHECK-AVX512-NEXT: movl 4(%ecx), %eax +; CHECK-AVX512-NEXT: movl %eax, 24(%ecx) +; CHECK-AVX512-NEXT: movl 8(%ecx), %eax +; CHECK-AVX512-NEXT: movl %eax, 28(%ecx) +; CHECK-AVX512-NEXT: movl 12(%ecx), %eax +; CHECK-AVX512-NEXT: movl %eax, 32(%ecx) +; CHECK-AVX512-NEXT: retl +entry: + %a = bitcast i8* %A to i32* + store i32 %x, i32* %a, align 4 + %add.ptr = getelementptr inbounds i8, i8* %A, i64 20 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr, i8* align 4 %A, i64 16, i32 4, i1 false) + ret void +} + Index: test/CodeGen/X86/fixup-sfb.ll =================================================================== --- test/CodeGen/X86/fixup-sfb.ll +++ test/CodeGen/X86/fixup-sfb.ll @@ -4,18 +4,13 @@ ; RUN: llc < %s -mtriple=x86_64-linux -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2 ; RUN: llc < %s -mtriple=x86_64-linux -mcpu=skx | FileCheck %s -check-prefix=CHECK-AVX512 -; RUN: llc < %s -mtriple=i686-linux -; RUN: llc < %s -mtriple=i686-linux --disable-fixup-SFB -; RUN: llc < %s -mtriple=i686-linux -mattr sse4 -; RUN: llc < %s -mtriple=i686-linux -mattr avx512 - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" %struct.S = type { i32, i32, i32, i32 } ; Function Attrs: nounwind uwtable -define void @test_conditional_block(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4) local_unnamed_addr #0 { +define void @test_conditional_block(%struct.S* nocapture noalias %s1 , %struct.S* nocapture noalias %s2, i32 %x, %struct.S* nocapture noalias %s3, %struct.S* nocapture noalias readonly %s4) local_unnamed_addr #0 { ; CHECK-LABEL: test_conditional_block: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpl $18, %edx @@ -99,7 +94,7 @@ } ; Function Attrs: nounwind uwtable -define void @test_imm_store(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3) local_unnamed_addr #0 { +define void @test_imm_store(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3) local_unnamed_addr #0 { ; CHECK-LABEL: test_imm_store: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $0, (%rdi) @@ -155,7 +150,7 @@ } ; Function Attrs: nounwind uwtable -define void @test_nondirect_br(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { +define void @test_nondirect_br(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { ; CHECK-LABEL: test_nondirect_br: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpl $18, %edx @@ -268,7 +263,7 @@ } ; Function Attrs: nounwind uwtable -define void @test_2preds_block(%struct.S* nocapture %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { +define void @test_2preds_block(%struct.S* nocapture noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { ; CHECK-LABEL: test_2preds_block: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %r9d, 12(%rdi) @@ -365,7 +360,7 @@ %struct.S2 = type { i64, i64 } ; Function Attrs: nounwind uwtable -define void @test_type64(%struct.S2* nocapture %s1, %struct.S2* nocapture %s2, i32 %x, %struct.S2* nocapture %s3, %struct.S2* nocapture readonly %s4) local_unnamed_addr #0 { +define void @test_type64(%struct.S2* nocapture noalias %s1, %struct.S2* nocapture %s2, i32 %x, %struct.S2* nocapture %s3, %struct.S2* nocapture readonly %s4) local_unnamed_addr #0 { ; CHECK-LABEL: test_type64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpl $18, %edx @@ -449,7 +444,7 @@ %struct.S3 = type { i64, i8, i8, i16, i32 } ; Function Attrs: noinline nounwind uwtable -define void @test_mixed_type(%struct.S3* nocapture %s1, %struct.S3* nocapture %s2, i32 %x, %struct.S3* nocapture readnone %s3, %struct.S3* nocapture readnone %s4) local_unnamed_addr #0 { +define void @test_mixed_type(%struct.S3* nocapture noalias %s1, %struct.S3* nocapture %s2, i32 %x, %struct.S3* nocapture readnone %s3, %struct.S3* nocapture readnone %s4) local_unnamed_addr #0 { ; CHECK-LABEL: test_mixed_type: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpl $18, %edx @@ -547,7 +542,7 @@ %struct.S4 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } ; Function Attrs: nounwind uwtable -define void @test_multiple_blocks(%struct.S4* nocapture %s1, %struct.S4* nocapture %s2) local_unnamed_addr #0 { +define void @test_multiple_blocks(%struct.S4* nocapture noalias %s1, %struct.S4* nocapture %s2) local_unnamed_addr #0 { ; CHECK-LABEL: test_multiple_blocks: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $0, 4(%rdi) @@ -636,7 +631,7 @@ %struct.S5 = type { i16, i16, i16, i16, i16, i16, i16, i16 } ; Function Attrs: nounwind uwtable -define void @test_type16(%struct.S5* nocapture %s1, %struct.S5* nocapture %s2, i32 %x, %struct.S5* nocapture %s3, %struct.S5* nocapture readonly %s4) local_unnamed_addr #0 { +define void @test_type16(%struct.S5* nocapture noalias %s1, %struct.S5* nocapture %s2, i32 %x, %struct.S5* nocapture %s3, %struct.S5* nocapture readonly %s4) local_unnamed_addr #0 { ; CHECK-LABEL: test_type16: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpl $18, %edx @@ -741,6 +736,14 @@ ; CHECK-NEXT: movl %eax, 24(%rdi) ; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-NEXT: movl %eax, 28(%rdi) +; CHECK-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx +; CHECK-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %ecx, {{[0-9]+}}(%rsp) +; CHECK-NEXT: movl %edx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: retq ; @@ -751,6 +754,10 @@ ; DISABLED-NEXT: movups %xmm0, (%rdi) ; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 ; DISABLED-NEXT: movups %xmm0, 16(%rdi) +; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm0 +; DISABLED-NEXT: movaps {{[0-9]+}}(%rsp), %xmm1 +; DISABLED-NEXT: movaps %xmm0, {{[0-9]+}}(%rsp) +; DISABLED-NEXT: movaps %xmm1, {{[0-9]+}}(%rsp) ; DISABLED-NEXT: movq %rdi, %rax ; DISABLED-NEXT: retq ; @@ -765,6 +772,14 @@ ; CHECK-AVX2-NEXT: movl %eax, 24(%rdi) ; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-AVX2-NEXT: movl %eax, 28(%rdi) +; CHECK-AVX2-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX2-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-AVX2-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-AVX2-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; CHECK-AVX2-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-AVX2-NEXT: movl %eax, {{[0-9]+}}(%rsp) ; CHECK-AVX2-NEXT: movq %rdi, %rax ; CHECK-AVX2-NEXT: retq ; @@ -779,6 +794,14 @@ ; CHECK-AVX512-NEXT: movl %eax, 24(%rdi) ; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax ; CHECK-AVX512-NEXT: movl %eax, 28(%rdi) +; CHECK-AVX512-NEXT: vmovups {{[0-9]+}}(%rsp), %xmm0 +; CHECK-AVX512-NEXT: vmovups %xmm0, {{[0-9]+}}(%rsp) +; CHECK-AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax +; CHECK-AVX512-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-AVX512-NEXT: movl %eax, {{[0-9]+}}(%rsp) +; CHECK-AVX512-NEXT: movl {{[0-9]+}}(%rsp), %eax +; CHECK-AVX512-NEXT: movl %eax, {{[0-9]+}}(%rsp) ; CHECK-AVX512-NEXT: movq %rdi, %rax ; CHECK-AVX512-NEXT: retq entry: @@ -786,12 +809,15 @@ %s6.sroa.3.0..sroa_idx4 = getelementptr inbounds %struct.S6, %struct.S6* %s2, i64 0, i32 3 store i32 %x, i32* %s6.sroa.3.0..sroa_idx4, align 8 %0 = bitcast %struct.S6* %agg.result to i8* + %s6.sroa.0.0..sroa_cast2 = bitcast %struct.S6* %s1 to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* nonnull %s6.sroa.0.0..sroa_cast1, i64 32, i32 4, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %s6.sroa.0.0..sroa_cast2, i8* nonnull %s6.sroa.0.0..sroa_cast1, i64 32, i32 4, i1 false) + ret void } ; Function Attrs: nounwind uwtable -define void @test_limit_all(%struct.S* %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { +define void @test_limit_all(%struct.S* noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { ; CHECK-LABEL: test_limit_all: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rbp @@ -983,7 +1009,7 @@ } ; Function Attrs: nounwind uwtable -define void @test_limit_one_pred(%struct.S* %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { +define void @test_limit_one_pred(%struct.S* noalias %s1, %struct.S* nocapture %s2, i32 %x, %struct.S* nocapture %s3, %struct.S* nocapture readonly %s4, i32 %x2) local_unnamed_addr #0 { ; CHECK-LABEL: test_limit_one_pred: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %r15 @@ -1185,7 +1211,7 @@ %struct.S7 = type { float, float, float , float, float, float, float, float } ; Function Attrs: nounwind uwtable -define void @test_conditional_block_float(%struct.S7* nocapture %s1, %struct.S7* nocapture %s2, i32 %x, %struct.S7* nocapture %s3, %struct.S7* nocapture readonly %s4, float %y) local_unnamed_addr #0 { +define void @test_conditional_block_float(%struct.S7* nocapture noalias %s1, %struct.S7* nocapture %s2, i32 %x, %struct.S7* nocapture %s3, %struct.S7* nocapture readonly %s4, float %y) local_unnamed_addr #0 { ; CHECK-LABEL: test_conditional_block_float: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpl $18, %edx @@ -1285,7 +1311,7 @@ %struct.S8 = type { i64, i64, i64, i64, i64, i64 } ; Function Attrs: nounwind uwtable -define void @test_conditional_block_ymm(%struct.S8* nocapture %s1, %struct.S8* nocapture %s2, i32 %x, %struct.S8* nocapture %s3, %struct.S8* nocapture readonly %s4) local_unnamed_addr #0 { +define void @test_conditional_block_ymm(%struct.S8* nocapture noalias %s1, %struct.S8* nocapture %s2, i32 %x, %struct.S8* nocapture %s3, %struct.S8* nocapture readonly %s4) local_unnamed_addr #0 { ; CHECK-LABEL: test_conditional_block_ymm: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: cmpl $18, %edx @@ -1376,3 +1402,90 @@ ret void } +define dso_local void @test_alias(i8* nocapture %A, i32 %x) local_unnamed_addr #0 { +; CHECK-LABEL: test_alias: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, (%rdi) +; CHECK-NEXT: movups (%rdi), %xmm0 +; CHECK-NEXT: movups %xmm0, 4(%rdi) +; CHECK-NEXT: retq +; +; DISABLED-LABEL: test_alias: +; DISABLED: # %bb.0: # %entry +; DISABLED-NEXT: movl %esi, (%rdi) +; DISABLED-NEXT: movups (%rdi), %xmm0 +; DISABLED-NEXT: movups %xmm0, 4(%rdi) +; DISABLED-NEXT: retq +; +; CHECK-AVX2-LABEL: test_alias: +; CHECK-AVX2: # %bb.0: # %entry +; CHECK-AVX2-NEXT: movl %esi, (%rdi) +; CHECK-AVX2-NEXT: vmovups (%rdi), %xmm0 +; CHECK-AVX2-NEXT: vmovups %xmm0, 4(%rdi) +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512-LABEL: test_alias: +; CHECK-AVX512: # %bb.0: # %entry +; CHECK-AVX512-NEXT: movl %esi, (%rdi) +; CHECK-AVX512-NEXT: vmovups (%rdi), %xmm0 +; CHECK-AVX512-NEXT: vmovups %xmm0, 4(%rdi) +; CHECK-AVX512-NEXT: retq +entry: + %a = bitcast i8* %A to i32* + store i32 %x, i32* %a, align 4 + %add.ptr = getelementptr inbounds i8, i8* %A, i64 4 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr, i8* align 4 %A, i64 16, i32 4, i1 false) + ret void +} + +; Function Attrs: nounwind uwtable +define dso_local void @test_noalias(i8* nocapture %A, i32 %x) local_unnamed_addr #0 { +; CHECK-LABEL: test_noalias: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, (%rdi) +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: movl %eax, 20(%rdi) +; CHECK-NEXT: movq 4(%rdi), %rax +; CHECK-NEXT: movq %rax, 24(%rdi) +; CHECK-NEXT: movl 12(%rdi), %eax +; CHECK-NEXT: movl %eax, 32(%rdi) +; CHECK-NEXT: retq +; +; DISABLED-LABEL: test_noalias: +; DISABLED: # %bb.0: # %entry +; DISABLED-NEXT: movl %esi, (%rdi) +; DISABLED-NEXT: movups (%rdi), %xmm0 +; DISABLED-NEXT: movups %xmm0, 20(%rdi) +; DISABLED-NEXT: retq +; +; CHECK-AVX2-LABEL: test_noalias: +; CHECK-AVX2: # %bb.0: # %entry +; CHECK-AVX2-NEXT: movl %esi, (%rdi) +; CHECK-AVX2-NEXT: movl (%rdi), %eax +; CHECK-AVX2-NEXT: movl %eax, 20(%rdi) +; CHECK-AVX2-NEXT: movq 4(%rdi), %rax +; CHECK-AVX2-NEXT: movq %rax, 24(%rdi) +; CHECK-AVX2-NEXT: movl 12(%rdi), %eax +; CHECK-AVX2-NEXT: movl %eax, 32(%rdi) +; CHECK-AVX2-NEXT: retq +; +; CHECK-AVX512-LABEL: test_noalias: +; CHECK-AVX512: # %bb.0: # %entry +; CHECK-AVX512-NEXT: movl %esi, (%rdi) +; CHECK-AVX512-NEXT: movl (%rdi), %eax +; CHECK-AVX512-NEXT: movl %eax, 20(%rdi) +; CHECK-AVX512-NEXT: movq 4(%rdi), %rax +; CHECK-AVX512-NEXT: movq %rax, 24(%rdi) +; CHECK-AVX512-NEXT: movl 12(%rdi), %eax +; CHECK-AVX512-NEXT: movl %eax, 32(%rdi) +; CHECK-AVX512-NEXT: retq +entry: + %a = bitcast i8* %A to i32* + store i32 %x, i32* %a, align 4 + %add.ptr = getelementptr inbounds i8, i8* %A, i64 20 + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 %add.ptr, i8* align 4 %A, i64 16, i32 4, i1 false) + ret void +} + + +