Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -471,6 +471,11 @@ bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2) const; + /// Return true if the target can fuse a compare and branch. + /// Loop-strength-reduction (LSR) uses that knowledge to adjust its cost + /// calculation for the instructions in a loop. + bool canMacroFuseCmp() const; + /// \brief Return true if the target supports masked load/store /// AVX2 and AVX-512 targets allow masks for consecutive load and store bool isLegalMaskedStore(Type *DataType) const; @@ -974,6 +979,7 @@ Instruction *I) = 0; virtual bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2) = 0; + virtual bool canMacroFuseCmp() = 0; virtual bool isLegalMaskedStore(Type *DataType) = 0; virtual bool isLegalMaskedLoad(Type *DataType) = 0; virtual bool isLegalMaskedScatter(Type *DataType) = 0; @@ -1192,6 +1198,9 @@ TargetTransformInfo::LSRCost &C2) override { return Impl.isLSRCostLess(C1, C2); } + bool canMacroFuseCmp() override { + return Impl.canMacroFuseCmp(); + } bool isLegalMaskedStore(Type *DataType) override { return Impl.isLegalMaskedStore(DataType); } Index: include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- include/llvm/Analysis/TargetTransformInfoImpl.h +++ include/llvm/Analysis/TargetTransformInfoImpl.h @@ -246,6 +246,8 @@ C2.ScaleCost, C2.ImmCost, C2.SetupCost); } + bool canMacroFuseCmp() { return false; } + bool isLegalMaskedStore(Type *DataType) { return false; } bool isLegalMaskedLoad(Type *DataType) { return false; } Index: lib/Analysis/TargetTransformInfo.cpp =================================================================== --- lib/Analysis/TargetTransformInfo.cpp +++ lib/Analysis/TargetTransformInfo.cpp @@ -155,6 
+155,10 @@ return TTIImpl->isLSRCostLess(C1, C2); } +bool TargetTransformInfo::canMacroFuseCmp() const { + return TTIImpl->canMacroFuseCmp(); +} + bool TargetTransformInfo::isLegalMaskedStore(Type *DataType) const { return TTIImpl->isLegalMaskedStore(DataType); } Index: lib/Target/X86/X86TargetTransformInfo.h =================================================================== --- lib/Target/X86/X86TargetTransformInfo.h +++ lib/Target/X86/X86TargetTransformInfo.h @@ -120,6 +120,7 @@ Type *Ty); bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, TargetTransformInfo::LSRCost &C2); + bool canMacroFuseCmp(); bool isLegalMaskedLoad(Type *DataType); bool isLegalMaskedStore(Type *DataType); bool isLegalMaskedGather(Type *DataType); Index: lib/Target/X86/X86TargetTransformInfo.cpp =================================================================== --- lib/Target/X86/X86TargetTransformInfo.cpp +++ lib/Target/X86/X86TargetTransformInfo.cpp @@ -2482,6 +2482,10 @@ C2.ScaleCost, C2.ImmCost, C2.SetupCost); } +bool X86TTIImpl::canMacroFuseCmp() { + return ST->hasMacroFusion(); +} + bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { // The backend can't handle a single element vector. if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1) Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp =================================================================== --- lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -1343,14 +1343,15 @@ // If ICmpZero formula ends with not 0, it could not be replaced by // just add or sub. We'll need to compare final result of AddRec. - // That means we'll need an additional instruction. + // That means we'll need an additional instruction. But if the target can + // macro-fuse a compare with a branch, don't count this extra instruction. 
// For -10 + {0, +, 1}: // i = i + 1; // cmp i, 10 // // For {-10, +, 1}: // i = i + 1; - if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd()) + if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() && !TTI.canMacroFuseCmp()) C.Insns++; // Each new AddRec adds 1 instruction to calculation. C.Insns += (C.AddRecCost - PrevAddRecCost); Index: test/CodeGen/X86/rdrand.ll =================================================================== --- test/CodeGen/X86/rdrand.ll +++ test/CodeGen/X86/rdrand.ll @@ -82,35 +82,41 @@ define void @loop(i32* %p, i32 %n) nounwind { ; X86-LABEL: loop: ; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: testl %eax, %eax ; X86-NEXT: je .LBB3_3 ; X86-NEXT: # %bb.1: # %while.body.preheader ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorl %edx, %edx ; X86-NEXT: .p2align 4, 0x90 ; X86-NEXT: .LBB3_2: # %while.body ; X86-NEXT: # =>This Inner Loop Header: Depth=1 -; X86-NEXT: rdrandl %edx -; X86-NEXT: movl %edx, (%ecx) -; X86-NEXT: leal 4(%ecx), %ecx -; X86-NEXT: addl $-1, %eax +; X86-NEXT: rdrandl %esi +; X86-NEXT: movl %esi, (%ecx,%edx,4) +; X86-NEXT: addl $1, %edx +; X86-NEXT: cmpl %edx, %eax ; X86-NEXT: jne .LBB3_2 ; X86-NEXT: .LBB3_3: # %while.end +; X86-NEXT: popl %esi ; X86-NEXT: retl ; ; X64-LABEL: loop: ; X64: # %bb.0: # %entry ; X64-NEXT: testl %esi, %esi -; X64-NEXT: je .LBB3_2 +; X64-NEXT: je .LBB3_3 +; X64-NEXT: # %bb.1: # %while.body.preheader +; X64-NEXT: movl %esi, %eax +; X64-NEXT: xorl %ecx, %ecx ; X64-NEXT: .p2align 4, 0x90 -; X64-NEXT: .LBB3_1: # %while.body +; X64-NEXT: .LBB3_2: # %while.body ; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: rdrandl %eax -; X64-NEXT: movl %eax, (%rdi) -; X64-NEXT: leaq 4(%rdi), %rdi -; X64-NEXT: addl $-1, %esi -; X64-NEXT: jne .LBB3_1 -; X64-NEXT: .LBB3_2: # %while.end +; X64-NEXT: rdrandl %edx +; X64-NEXT: movl %edx, (%rdi,%rcx,4) +; X64-NEXT: addq $1, %rcx +; X64-NEXT: cmpl %ecx, %eax +; X64-NEXT: jne .LBB3_2 +; X64-NEXT: 
.LBB3_3: # %while.end ; X64-NEXT: retq entry: %tobool1 = icmp eq i32 %n, 0 Index: test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll =================================================================== --- test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll +++ test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll @@ -347,30 +347,31 @@ ; X32-NEXT: pushl %ebx ; X32-NEXT: pushl %edi ; X32-NEXT: pushl %esi -; X32-NEXT: movl $-400, %eax # imm = 0xFE70 +; X32-NEXT: movl $3, %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %edx ; X32-NEXT: movl {{[0-9]+}}(%esp), %esi ; X32-NEXT: .p2align 4, 0x90 ; X32-NEXT: .LBB3_1: # %for.body ; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: movzbl 400(%esi,%eax), %edi -; X32-NEXT: movzbl 400(%edx,%eax), %ebx +; X32-NEXT: movzbl -3(%esi,%eax), %edi +; X32-NEXT: movzbl -3(%edx,%eax), %ebx ; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movb %bl, 400(%ecx,%eax) -; X32-NEXT: movzbl 401(%esi,%eax), %edi -; X32-NEXT: movzbl 401(%edx,%eax), %ebx +; X32-NEXT: movb %bl, -3(%ecx,%eax) +; X32-NEXT: movzbl -2(%esi,%eax), %edi +; X32-NEXT: movzbl -2(%edx,%eax), %ebx ; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movb %bl, 401(%ecx,%eax) -; X32-NEXT: movzbl 402(%esi,%eax), %edi -; X32-NEXT: movzbl 402(%edx,%eax), %ebx +; X32-NEXT: movb %bl, -2(%ecx,%eax) +; X32-NEXT: movzbl -1(%esi,%eax), %edi +; X32-NEXT: movzbl -1(%edx,%eax), %ebx ; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movb %bl, 402(%ecx,%eax) -; X32-NEXT: movzbl 403(%esi,%eax), %edi -; X32-NEXT: movzbl 403(%edx,%eax), %ebx +; X32-NEXT: movb %bl, -1(%ecx,%eax) +; X32-NEXT: movzbl (%esi,%eax), %edi +; X32-NEXT: movzbl (%edx,%eax), %ebx ; X32-NEXT: addl %edi, %ebx -; X32-NEXT: movb %bl, 403(%ecx,%eax) +; X32-NEXT: movb %bl, (%ecx,%eax) ; X32-NEXT: addl $4, %eax +; X32-NEXT: cmpl $403, %eax # imm = 0x193 ; X32-NEXT: jne .LBB3_1 ; X32-NEXT: # %bb.2: # %for.end ; X32-NEXT: popl %esi Index: test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll 
=================================================================== --- test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll +++ test/Transforms/LoopStrengthReduce/X86/macro-fuse-cmp.ll @@ -0,0 +1,63 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=BASE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=macrofusion | FileCheck %s --check-prefix=FUSE + +; PR35681 - https://bugs.llvm.org/show_bug.cgi?id=35681 +; If a CPU can macro-fuse a compare and branch, then we discount that +; cost in LSR and avoid generating large offsets in each memory access. +; This reduces code size and may improve decode throughput. + +define void @maxArray(double* noalias nocapture %x, double* noalias nocapture readonly %y) { +; BASE-LABEL: maxArray: +; BASE: # %bb.0: # %entry +; BASE-NEXT: movq $-524288, %rax # imm = 0xFFF80000 +; BASE-NEXT: .p2align 4, 0x90 +; BASE-NEXT: .LBB0_1: # %vector.body +; BASE-NEXT: # =>This Inner Loop Header: Depth=1 +; BASE-NEXT: movupd 524288(%rsi,%rax), %xmm0 +; BASE-NEXT: movupd 524288(%rdi,%rax), %xmm1 +; BASE-NEXT: maxpd %xmm1, %xmm0 +; BASE-NEXT: movupd %xmm0, 524288(%rdi,%rax) +; BASE-NEXT: addq $16, %rax +; BASE-NEXT: jne .LBB0_1 +; BASE-NEXT: # %bb.2: # %for.cond.cleanup +; BASE-NEXT: retq +; +; FUSE-LABEL: maxArray: +; FUSE: # %bb.0: # %entry +; FUSE-NEXT: xorl %eax, %eax +; FUSE-NEXT: .p2align 4, 0x90 +; FUSE-NEXT: .LBB0_1: # %vector.body +; FUSE-NEXT: # =>This Inner Loop Header: Depth=1 +; FUSE-NEXT: movupd (%rsi,%rax,8), %xmm0 +; FUSE-NEXT: movupd (%rdi,%rax,8), %xmm1 +; FUSE-NEXT: maxpd %xmm1, %xmm0 +; FUSE-NEXT: movupd %xmm0, (%rdi,%rax,8) +; FUSE-NEXT: addq $2, %rax +; FUSE-NEXT: cmpq $65536, %rax # imm = 0x10000 +; FUSE-NEXT: jne .LBB0_1 +; FUSE-NEXT: # %bb.2: # %for.cond.cleanup +; FUSE-NEXT: retq +entry: + br label %vector.body + +vector.body: + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = 
getelementptr inbounds double, double* %y, i64 %index + %1 = bitcast double* %0 to <2 x double>* + %wide.load = load <2 x double>, <2 x double>* %1, align 8 + %2 = getelementptr inbounds double, double* %x, i64 %index + %3 = bitcast double* %2 to <2 x double>* + %wide.load21 = load <2 x double>, <2 x double>* %3, align 8 + %4 = fcmp ogt <2 x double> %wide.load, %wide.load21 + %5 = select <2 x i1> %4, <2 x double> %wide.load, <2 x double> %wide.load21 + %6 = bitcast double* %2 to <2 x double>* + store <2 x double> %5, <2 x double>* %6, align 8 + %index.next = add i64 %index, 2 + %7 = icmp eq i64 %index.next, 65536 + br i1 %7, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: + ret void +} +