diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -126,6 +126,8 @@ std::function SimplifyAndSetOp) const; + bool isAllowLSRDropSolution() const; + /// \name Scalar TTI Implementations /// @{ diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -2430,3 +2430,5 @@ } return -1; } + +bool ARMTTIImpl::isAllowLSRDropSolution() const { return true; } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-fabs.ll @@ -4,21 +4,25 @@ define arm_aapcs_vfpcc void @fabs(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %blockSize) { ; CHECK-LABEL: fabs: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.ph +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: add.w r4, r0, r3, lsl #2 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vabs.f32 q0, q0 -; CHECK-NEXT: vstrw.32 q0, [r1], #16 +; CHECK-NEXT: vstrw.32 q0, [r12] ; CHECK-NEXT: letp lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %while.end -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp3 = icmp eq i32 %blockSize, 0 br i1 %cmp3, label %while.end, label %vector.ph diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-intrinsic-round.ll @@ -4,21 +4,25 @@ define arm_aapcs_vfpcc void @round(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 { ; CHECK-LABEL: round: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB0_1: @ %vector.ph +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB0_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: add.w r4, r0, r3, lsl #2 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vrinta.f32 q0, q0 -; CHECK-NEXT: vstrw.32 q0, [r1], #16 +; CHECK-NEXT: vstrw.32 q0, [r12] ; CHECK-NEXT: letp lr, .LBB0_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp5 = icmp eq i32 %n, 0 br i1 %cmp5, label %for.cond.cleanup, label %vector.ph @@ -49,21 +53,25 @@ define arm_aapcs_vfpcc void @rint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 { ; CHECK-LABEL: rint: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB1_1: @ %vector.ph +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: add.w r4, r0, r3, lsl #2 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vrintx.f32 q0, q0 -; CHECK-NEXT: vstrw.32 q0, [r1], #16 +; CHECK-NEXT: vstrw.32 q0, [r12] ; CHECK-NEXT: letp lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp5 = icmp eq i32 %n, 0 br i1 %cmp5, label %for.cond.cleanup, label %vector.ph @@ -94,21 +102,25 @@ define arm_aapcs_vfpcc void @trunc(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 { ; CHECK-LABEL: trunc: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB2_1: @ %vector.ph +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB2_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: add.w r4, r0, r3, lsl #2 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vrintz.f32 q0, q0 -; CHECK-NEXT: vstrw.32 q0, [r1], #16 +; CHECK-NEXT: vstrw.32 q0, [r12] ; CHECK-NEXT: letp lr, .LBB2_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp5 = icmp eq i32 %n, 0 br i1 %cmp5, label %for.cond.cleanup, label %vector.ph @@ -139,21 +151,25 @@ define arm_aapcs_vfpcc void @ceil(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 { ; CHECK-LABEL: ceil: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB3_1: @ %vector.ph +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: add.w r4, r0, r3, lsl #2 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vrintp.f32 q0, q0 -; CHECK-NEXT: vstrw.32 q0, [r1], #16 +; CHECK-NEXT: vstrw.32 q0, [r12] ; CHECK-NEXT: letp lr, .LBB3_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp5 = icmp eq i32 %n, 0 br i1 %cmp5, label %for.cond.cleanup, label %vector.ph @@ -184,21 +200,25 @@ define arm_aapcs_vfpcc void @floor(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 { ; CHECK-LABEL: floor: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB4_1: @ %vector.ph +; CHECK-NEXT: movs r3, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB4_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: add.w r4, r0, r3, lsl #2 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: adds r3, #4 ; CHECK-NEXT: vrintm.f32 q0, q0 -; CHECK-NEXT: vstrw.32 q0, [r1], #16 +; CHECK-NEXT: vstrw.32 q0, [r12] ; CHECK-NEXT: letp lr, .LBB4_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp5 = icmp eq i32 %n, 0 br i1 %cmp5, label %for.cond.cleanup, label %vector.ph @@ -230,24 +250,28 @@ define arm_aapcs_vfpcc void @nearbyint(float* noalias nocapture readonly %pSrcA, float* noalias nocapture %pDst, i32 %n) #0 { ; CHECK-LABEL: nearbyint: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: it eq -; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: popeq {r4, pc} ; CHECK-NEXT: .LBB5_1: @ %vector.ph +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r0], #16 +; CHECK-NEXT: add.w r12, r0, r4, lsl #2 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: add.w r3, r1, r4, lsl #2 +; CHECK-NEXT: adds r4, #4 ; CHECK-NEXT: vrintr.f32 s3, s3 ; CHECK-NEXT: vrintr.f32 s2, s2 ; CHECK-NEXT: vrintr.f32 s1, s1 ; CHECK-NEXT: vrintr.f32 s0, s0 -; CHECK-NEXT: vstrw.32 q0, [r1], #16 +; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: letp lr, .LBB5_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, pc} entry: %cmp5 = icmp eq i32 %n, 0 br i1 %cmp5, label %for.cond.cleanup, label %vector.ph