Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -349,6 +349,7 @@
     unsigned ImmCost;
     unsigned SetupCost;
     unsigned ScaleCost;
+    unsigned FoldedAddress;
   };
 
   /// Parameters that control the generic loop unrolling transformation.
Index: lib/Target/X86/X86TargetTransformInfo.cpp
===================================================================
--- lib/Target/X86/X86TargetTransformInfo.cpp
+++ lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2469,10 +2469,12 @@
 bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
                                TargetTransformInfo::LSRCost &C2) {
   // X86 specific here are "instruction number 1st priority".
-  return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+  unsigned C1Insns = C1.Insns + (C1.FoldedAddress >> 3);
+  unsigned C2Insns = C2.Insns + (C2.FoldedAddress >> 3);
+  return std::tie(C1Insns, C1.NumRegs, C1.AddRecCost,
                   C1.NumIVMuls, C1.NumBaseAdds,
                   C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
-         std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+         std::tie(C2Insns, C2.NumRegs, C2.AddRecCost,
                   C2.NumIVMuls, C2.NumBaseAdds,
                   C2.ScaleCost, C2.ImmCost, C2.SetupCost);
 }
Index: lib/Transforms/Scalar/LoopStrengthReduce.cpp
===================================================================
--- lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -976,6 +976,7 @@
     C.ImmCost = 0;
     C.SetupCost = 0;
     C.ScaleCost = 0;
+    C.FoldedAddress = 0;
   }
 
   bool isLess(Cost &Other, const TargetTransformInfo &TTI);
@@ -986,9 +987,9 @@
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
     return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
-             | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
+             | C.ImmCost | C.SetupCost | C.ScaleCost | C.FoldedAddress) != ~0u)
       || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
-           & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
+           & C.ImmCost & C.SetupCost & C.ScaleCost & C.FoldedAddress) == ~0u);
   }
 #endif
 
@@ -1298,6 +1299,9 @@
                               Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
       C.NumBaseAdds++;
   }
+  if (NumBaseParts > 1 && LU.Kind == LSRUse::Address &&
+      isAMCompletelyFolded(TTI, LU, F))
+    C.FoldedAddress += LU.Fixups.size();
 
   // If we don't count instruction cost exit here.
   if (!InsnsCost) {
@@ -1347,6 +1351,7 @@
   C.ImmCost = ~0u;
   C.SetupCost = ~0u;
   C.ScaleCost = ~0u;
+  C.FoldedAddress = ~0u;
 }
 
 /// Choose the lower cost.
@@ -1376,6 +1381,9 @@
     OS << ", plus " << C.ImmCost << " imm cost";
   if (C.SetupCost != 0)
     OS << ", plus " << C.SetupCost << " setup cost";
+  if (C.FoldedAddress != 0)
+    OS << ", plus " << C.FoldedAddress << " folded address"
+       << (C.FoldedAddress == 1 ? "" : "es");
 }
 
 LLVM_DUMP_METHOD void Cost::dump() const {
Index: test/CodeGen/X86/misched-matrix.ll
===================================================================
--- test/CodeGen/X86/misched-matrix.ll
+++ test/CodeGen/X86/misched-matrix.ll
@@ -16,19 +16,19 @@
 ; alias analysis ability (that doesn't require any AliasAnalysis pass).
 ;
 ; TOPDOWN-LABEL: %for.body
-; TOPDOWN: movl %{{.*}}, (
+; TOPDOWN: movl %{{.*}}, -12(
 ; TOPDOWN-NOT: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 4(
+; TOPDOWN: movl %{{.*}}, -8(
 ; TOPDOWN-NOT: imull {{[0-9]*}}(
-; TOPDOWN: movl %{{.*}}, 8(
-; TOPDOWN: movl %{{.*}}, 12(
+; TOPDOWN: movl %{{.*}}, -4(
+; TOPDOWN: movl %{{.*}}, (
 ; TOPDOWN-LABEL: %for.end
 ;
 ; For -misched=ilpmin, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are interleaved.
 ;
 ; ILPMIN-LABEL: %for.body
-; ILPMIN: movl %{{.*}}, (
+; ILPMIN: movl %{{.*}}, -12(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -36,7 +36,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 4(
+; ILPMIN: movl %{{.*}}, -8(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -44,7 +44,7 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 8(
+; ILPMIN: movl %{{.*}}, -4(
 ; ILPMIN: imull
 ; ILPMIN: imull
 ; ILPMIN: addl
@@ -52,14 +52,14 @@
 ; ILPMIN: addl
 ; ILPMIN: imull
 ; ILPMIN: addl
-; ILPMIN: movl %{{.*}}, 12(
+; ILPMIN: movl %{{.*}}, (
 ; ILPMIN-LABEL: %for.end
 ;
 ; For -misched=ilpmax, verify that each expression subtree is
 ; scheduled independently, and that the imull/adds are clustered.
 ;
 ; ILPMAX-LABEL: %for.body
-; ILPMAX: movl %{{.*}}, (
+; ILPMAX: movl %{{.*}}, -12(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -67,7 +67,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 4(
+; ILPMAX: movl %{{.*}}, -8(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -75,7 +75,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 8(
+; ILPMAX: movl %{{.*}}, -4(
 ; ILPMAX: imull
 ; ILPMAX: imull
 ; ILPMAX: imull
@@ -83,7 +83,7 @@
 ; ILPMAX: addl
 ; ILPMAX: addl
 ; ILPMAX: addl
-; ILPMAX: movl %{{.*}}, 12(
+; ILPMAX: movl %{{.*}}, (
 ; ILPMAX-LABEL: %for.end
 
 define void @mmult([4 x i32]* noalias nocapture %m1, [4 x i32]* noalias nocapture %m2,
Index: test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
===================================================================
--- test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
+++ test/Transforms/LoopStrengthReduce/X86/folded_addresses.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-reduce -mtriple=x86_64 -S %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32, float, float* noalias nocapture readonly, float* noalias nocapture) local_unnamed_addr #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x float> undef, float [[TMP1:%.*]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x float> [[TMP9]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x float> undef, float [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x float> [[TMP11]], <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr float, float* [[TMP2:%.*]], i64 56
+; CHECK-NEXT: [[SCEVGEP11:%.*]] = getelementptr float, float* [[TMP3:%.*]], i64 56
+; CHECK-NEXT: br label [[TMP13:%.*]]
+; CHECK: [[LSR_IV12:%.*]] = phi float* [ [[SCEVGEP13:%.*]], [[TMP13]] ], [ [[SCEVGEP11]], [[TMP4:%.*]] ]
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi float* [ [[SCEVGEP2:%.*]], [[TMP13]] ], [ [[SCEVGEP]], [[TMP4]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[TMP13]] ], [ 4096, [[TMP4]] ]
+; CHECK-NEXT: [[LSR_IV1214:%.*]] = bitcast float* [[LSR_IV12]] to <8 x float>*
+; CHECK-NEXT: [[LSR_IV13:%.*]] = bitcast float* [[LSR_IV1]] to <8 x float>*
+; CHECK-NEXT: [[SCEVGEP28:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -7
+; CHECK-NEXT: [[TMP14:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP28]], align 4
+; CHECK-NEXT: [[SCEVGEP27:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -6
+; CHECK-NEXT: [[TMP15:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP27]], align 4
+; CHECK-NEXT: [[SCEVGEP25:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -5
+; CHECK-NEXT: [[TMP16:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP25]], align 4
+; CHECK-NEXT: [[SCEVGEP23:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -4
+; CHECK-NEXT: [[TMP17:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP23]], align 4
+; CHECK-NEXT: [[SCEVGEP10:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -7
+; CHECK-NEXT: [[TMP18:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP10]], align 4
+; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -6
+; CHECK-NEXT: [[TMP19:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP9]], align 4
+; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -5
+; CHECK-NEXT: [[TMP20:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP8]], align 4
+; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -4
+; CHECK-NEXT: [[TMP21:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP7]], align 4
+; CHECK-NEXT: [[TMP22:%.*]] = fmul <8 x float> [[TMP18]], [[TMP6]]
+; CHECK-NEXT: [[TMP23:%.*]] = fmul <8 x float> [[TMP19]], [[TMP8]]
+; CHECK-NEXT: [[TMP24:%.*]] = fmul <8 x float> [[TMP20]], [[TMP10]]
+; CHECK-NEXT: [[TMP25:%.*]] = fmul <8 x float> [[TMP21]], [[TMP12]]
+; CHECK-NEXT: [[TMP26:%.*]] = fadd <8 x float> [[TMP14]], [[TMP22]]
+; CHECK-NEXT: [[TMP27:%.*]] = fadd <8 x float> [[TMP15]], [[TMP23]]
+; CHECK-NEXT: [[TMP28:%.*]] = fadd <8 x float> [[TMP16]], [[TMP24]]
+; CHECK-NEXT: [[TMP29:%.*]] = fadd <8 x float> [[TMP17]], [[TMP25]]
+; CHECK-NEXT: [[SCEVGEP21:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -7
+; CHECK-NEXT: store <8 x float> [[TMP26]], <8 x float>* [[SCEVGEP21]], align 4
+; CHECK-NEXT: [[SCEVGEP26:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -6
+; CHECK-NEXT: store <8 x float> [[TMP27]], <8 x float>* [[SCEVGEP26]], align 4
+; CHECK-NEXT: [[SCEVGEP24:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -5
+; CHECK-NEXT: store <8 x float> [[TMP28]], <8 x float>* [[SCEVGEP24]], align 4
+; CHECK-NEXT: [[SCEVGEP22:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -4
+; CHECK-NEXT: store <8 x float> [[TMP29]], <8 x float>* [[SCEVGEP22]], align 4
+; CHECK-NEXT: [[SCEVGEP20:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -3
+; CHECK-NEXT: [[TMP30:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP20]], align 4
+; CHECK-NEXT: [[SCEVGEP19:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -2
+; CHECK-NEXT: [[TMP31:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP19]], align 4
+; CHECK-NEXT: [[SCEVGEP17:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -1
+; CHECK-NEXT: [[TMP32:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP17]], align 4
+; CHECK-NEXT: [[TMP33:%.*]] = load <8 x float>, <8 x float>* [[LSR_IV1214]], align 4
+; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -3
+; CHECK-NEXT: [[TMP34:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP6]], align 4
+; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -2
+; CHECK-NEXT: [[TMP35:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP5]], align 4
+; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV13]], i64 -1
+; CHECK-NEXT: [[TMP36:%.*]] = load <8 x float>, <8 x float>* [[SCEVGEP4]], align 4
+; CHECK-NEXT: [[TMP37:%.*]] = load <8 x float>, <8 x float>* [[LSR_IV13]], align 4
+; CHECK-NEXT: [[TMP38:%.*]] = fmul <8 x float> [[TMP34]], [[TMP6]]
+; CHECK-NEXT: [[TMP39:%.*]] = fmul <8 x float> [[TMP35]], [[TMP8]]
+; CHECK-NEXT: [[TMP40:%.*]] = fmul <8 x float> [[TMP36]], [[TMP10]]
+; CHECK-NEXT: [[TMP41:%.*]] = fmul <8 x float> [[TMP37]], [[TMP12]]
+; CHECK-NEXT: [[TMP42:%.*]] = fadd <8 x float> [[TMP30]], [[TMP38]]
+; CHECK-NEXT: [[TMP43:%.*]] = fadd <8 x float> [[TMP31]], [[TMP39]]
+; CHECK-NEXT: [[TMP44:%.*]] = fadd <8 x float> [[TMP32]], [[TMP40]]
+; CHECK-NEXT: [[TMP45:%.*]] = fadd <8 x float> [[TMP33]], [[TMP41]]
+; CHECK-NEXT: [[SCEVGEP15:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -3
+; CHECK-NEXT: store <8 x float> [[TMP42]], <8 x float>* [[SCEVGEP15]], align 4
+; CHECK-NEXT: [[SCEVGEP18:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -2
+; CHECK-NEXT: store <8 x float> [[TMP43]], <8 x float>* [[SCEVGEP18]], align 4
+; CHECK-NEXT: [[SCEVGEP16:%.*]] = getelementptr <8 x float>, <8 x float>* [[LSR_IV1214]], i64 -1
+; CHECK-NEXT: store <8 x float> [[TMP44]], <8 x float>* [[SCEVGEP16]], align 4
+; CHECK-NEXT: store <8 x float> [[TMP45]], <8 x float>* [[LSR_IV1214]], align 4
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], -64
+; CHECK-NEXT: [[SCEVGEP2]] = getelementptr float, float* [[LSR_IV1]], i64 64
+; CHECK-NEXT: [[SCEVGEP13]] = getelementptr float, float* [[LSR_IV12]], i64 64
+; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[TMP46]], label [[TMP47:%.*]], label [[TMP13]]
+; CHECK: ret void
+;
+  %5 = insertelement <8 x float> undef, float %1, i32 0
+  %6 = shufflevector <8 x float> %5, <8 x float> undef, <8 x i32> zeroinitializer
+  %7 = insertelement <8 x float> undef, float %1, i32 0
+  %8 = shufflevector <8 x float> %7, <8 x float> undef, <8 x i32> zeroinitializer
+  %9 = insertelement <8 x float> undef, float %1, i32 0
+  %10 = shufflevector <8 x float> %9, <8 x float> undef, <8 x i32> zeroinitializer
+  %11 = insertelement <8 x float> undef, float %1, i32 0
+  %12 = shufflevector <8 x float> %11, <8 x float> undef, <8 x i32> zeroinitializer
+  br label %13

;