diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -615,6 +615,9 @@ /// cost should return false, otherwise return true. bool isNumRegsMajorCostOfLSR() const; + /// Return true if the target allows LSR to drop a less profitable solution. + bool isAllowLSRDropSolution() const; + /// \returns true if LSR should not optimize a chain that includes \p I. bool isProfitableLSRChainElement(Instruction *I) const; @@ -1629,6 +1632,7 @@ virtual bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) = 0; virtual bool isNumRegsMajorCostOfLSR() = 0; + virtual bool isAllowLSRDropSolution() = 0; virtual bool isProfitableLSRChainElement(Instruction *I) = 0; virtual bool canMacroFuseCmp() = 0; virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE, @@ -2046,6 +2050,9 @@ bool isNumRegsMajorCostOfLSR() override { return Impl.isNumRegsMajorCostOfLSR(); } + bool isAllowLSRDropSolution() override { + return Impl.isAllowLSRDropSolution(); + } bool isProfitableLSRChainElement(Instruction *I) override { return Impl.isProfitableLSRChainElement(I); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -222,6 +222,8 @@ bool isNumRegsMajorCostOfLSR() const { return true; } + bool isAllowLSRDropSolution() const { return false; } + bool isProfitableLSRChainElement(Instruction *I) const { return false; } bool canMacroFuseCmp() const { return false; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -362,6 +362,10 @@ return 
TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); } + bool isAllowLSRDropSolution() const { + return TargetTransformInfoImplBase::isAllowLSRDropSolution(); + } + bool isProfitableLSRChainElement(Instruction *I) { return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -368,6 +368,10 @@ return TTIImpl->isNumRegsMajorCostOfLSR(); } +bool TargetTransformInfo::isAllowLSRDropSolution() const { + return TTIImpl->isAllowLSRDropSolution(); +} + bool TargetTransformInfo::isProfitableLSRChainElement(Instruction *I) const { return TTIImpl->isProfitableLSRChainElement(I); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -315,6 +315,8 @@ } llvm_unreachable("unknown register class"); } + + bool isAllowLSRDropSolution() const; }; } // end namespace llvm diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1019,3 +1019,5 @@ // TODO: Figure out constant materialization cost modeling and remove. 
return SLPMaxVF; } + +bool RISCVTTIImpl::isAllowLSRDropSolution() const { return true; } diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -5187,7 +5187,7 @@ if (BaselineCost.isLess(SolutionCost)) { LLVM_DEBUG(dbgs() << "The baseline solution requires "; BaselineCost.print(dbgs()); dbgs() << "\n"); - if (!AllowDropSolutionIfLessProfitable) + if (!AllowDropSolutionIfLessProfitable && !TTI.isAllowLSRDropSolution()) LLVM_DEBUG( dbgs() << "Baseline is more profitable than chosen solution, " "add option 'lsr-drop-solution' to drop LSR solution.\n"); diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll --- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll +++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll @@ -257,43 +257,42 @@ define void @sink_splat_mul_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_mul_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB7_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB7_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB7_5 ; CHECK-NEXT: .LBB7_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB7_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; 
CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vmul.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB7_3 +; CHECK-NEXT: bne a5, a3, .LBB7_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB7_7 ; CHECK-NEXT: .LBB7_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB7_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: mulw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: mulw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB7_6 +; CHECK-NEXT: bnez a2, .LBB7_6 ; CHECK-NEXT: .LBB7_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -350,43 +349,42 @@ define void @sink_splat_add_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_add_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB8_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB8_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB8_5 ; CHECK-NEXT: .LBB8_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB8_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop 
Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vadd.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB8_3 +; CHECK-NEXT: bne a5, a3, .LBB8_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB8_7 ; CHECK-NEXT: .LBB8_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB8_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: addw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: addw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB8_6 +; CHECK-NEXT: bnez a2, .LBB8_6 ; CHECK-NEXT: .LBB8_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -443,43 +441,42 @@ define void @sink_splat_sub_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_sub_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB9_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB9_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB9_5 ; CHECK-NEXT: .LBB9_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 
; CHECK-NEXT: .LBB9_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vsub.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB9_3 +; CHECK-NEXT: bne a5, a3, .LBB9_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB9_7 ; CHECK-NEXT: .LBB9_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB9_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: addw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: addw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB9_6 +; CHECK-NEXT: bnez a2, .LBB9_6 ; CHECK-NEXT: .LBB9_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -536,43 +533,42 @@ define void @sink_splat_rsub_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_rsub_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB10_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB10_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB10_5 ; CHECK-NEXT: .LBB10_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: 
vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB10_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vrsub.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB10_3 +; CHECK-NEXT: bne a5, a3, .LBB10_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB10_7 ; CHECK-NEXT: .LBB10_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB10_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: subw a2, a1, a2 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: subw a3, a1, a3 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB10_6 +; CHECK-NEXT: bnez a2, .LBB10_6 ; CHECK-NEXT: .LBB10_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -629,43 +625,42 @@ define void @sink_splat_and_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_and_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB11_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB11_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB11_5 ; CHECK-NEXT: .LBB11_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, 
a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB11_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB11_3 +; CHECK-NEXT: bne a5, a3, .LBB11_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB11_7 ; CHECK-NEXT: .LBB11_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB11_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: and a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: and a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB11_6 +; CHECK-NEXT: bnez a2, .LBB11_6 ; CHECK-NEXT: .LBB11_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -722,43 +717,42 @@ define void @sink_splat_or_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_or_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB12_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB12_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB12_5 ; CHECK-NEXT: .LBB12_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 
1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB12_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vor.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB12_3 +; CHECK-NEXT: bne a5, a3, .LBB12_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB12_7 ; CHECK-NEXT: .LBB12_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB12_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: or a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: or a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB12_6 +; CHECK-NEXT: bnez a2, .LBB12_6 ; CHECK-NEXT: .LBB12_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -815,43 +809,42 @@ define void @sink_splat_xor_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_xor_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB13_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB13_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB13_5 ; CHECK-NEXT: .LBB13_2: # %vector.ph -; 
CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB13_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB13_3 +; CHECK-NEXT: bne a5, a3, .LBB13_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB13_7 ; CHECK-NEXT: .LBB13_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB13_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: xor a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: xor a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB13_6 +; CHECK-NEXT: bnez a2, .LBB13_6 ; CHECK-NEXT: .LBB13_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1016,43 +1009,42 @@ define void @sink_splat_shl_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_shl_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB17_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB17_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; 
CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB17_5 ; CHECK-NEXT: .LBB17_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB17_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vsll.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB17_3 +; CHECK-NEXT: bne a5, a3, .LBB17_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB17_7 ; CHECK-NEXT: .LBB17_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB17_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: sllw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: sllw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB17_6 +; CHECK-NEXT: bnez a2, .LBB17_6 ; CHECK-NEXT: .LBB17_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1109,43 +1101,42 @@ define void @sink_splat_lshr_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_lshr_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB18_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 
1024 +; CHECK-NEXT: bgeu a3, a2, .LBB18_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB18_5 ; CHECK-NEXT: .LBB18_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB18_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vsrl.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB18_3 +; CHECK-NEXT: bne a5, a3, .LBB18_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB18_7 ; CHECK-NEXT: .LBB18_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB18_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: srlw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: srlw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB18_6 +; CHECK-NEXT: bnez a2, .LBB18_6 ; CHECK-NEXT: .LBB18_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1202,43 +1193,42 @@ define void @sink_splat_ashr_scalable(i32* nocapture %a) { ; CHECK-LABEL: sink_splat_ashr_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a4, vlenb -; CHECK-NEXT: srli a2, a4, 1 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a2, .LBB19_2 +; 
CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 1 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a1, .LBB19_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB19_5 ; CHECK-NEXT: .LBB19_2: # %vector.ph -; CHECK-NEXT: addiw a1, a2, -1 -; CHECK-NEXT: andi a3, a1, 1024 -; CHECK-NEXT: xori a1, a3, 1024 -; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: addiw a2, a1, -1 +; CHECK-NEXT: andi a3, a2, 1024 +; CHECK-NEXT: xori a2, a3, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: .LBB19_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a5, a4, 2 +; CHECK-NEXT: add a5, a0, a5 ; CHECK-NEXT: vl2re32.v v8, (a5) ; CHECK-NEXT: vsra.vi v8, v8, 2 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vs2r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a2 -; CHECK-NEXT: add a5, a5, a4 -; CHECK-NEXT: bnez a6, .LBB19_3 +; CHECK-NEXT: bne a4, a2, .LBB19_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a3, .LBB19_7 ; CHECK-NEXT: .LBB19_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB19_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a1, 0(a0) -; CHECK-NEXT: srli a1, a1, 2 -; CHECK-NEXT: sw a1, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: lw a2, 0(a0) +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: sw a2, 0(a0) +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB19_6 +; CHECK-NEXT: bnez a1, .LBB19_6 ; CHECK-NEXT: .LBB19_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1511,42 +1501,42 @@ define void @sink_splat_fmul_scalable(float* nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fmul_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: 
srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB26_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a1, .LBB26_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB26_5 ; CHECK-NEXT: .LBB26_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: addiw a2, a1, -1 +; CHECK-NEXT: andi a3, a2, 1024 +; CHECK-NEXT: xori a2, a3, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: .LBB26_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a5, a4, 2 +; CHECK-NEXT: add a5, a0, a5 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfmul.vf v8, v8, fa0 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 -; CHECK-NEXT: bnez a6, .LBB26_3 +; CHECK-NEXT: bne a4, a2, .LBB26_3 ; CHECK-NEXT: # %bb.4: # %middle.block -; CHECK-NEXT: beqz a4, .LBB26_7 +; CHECK-NEXT: beqz a3, .LBB26_7 ; CHECK-NEXT: .LBB26_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB26_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw ft0, 0(a0) ; CHECK-NEXT: fmul.s ft0, ft0, fa0 ; CHECK-NEXT: fsw ft0, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB26_6 +; CHECK-NEXT: bnez a1, .LBB26_6 ; CHECK-NEXT: .LBB26_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1603,42 +1593,42 @@ define void @sink_splat_fdiv_scalable(float* nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fdiv_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; 
CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB27_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a1, .LBB27_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB27_5 ; CHECK-NEXT: .LBB27_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: addiw a2, a1, -1 +; CHECK-NEXT: andi a3, a2, 1024 +; CHECK-NEXT: xori a2, a3, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: .LBB27_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a5, a4, 2 +; CHECK-NEXT: add a5, a0, a5 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfdiv.vf v8, v8, fa0 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 -; CHECK-NEXT: bnez a6, .LBB27_3 +; CHECK-NEXT: bne a4, a2, .LBB27_3 ; CHECK-NEXT: # %bb.4: # %middle.block -; CHECK-NEXT: beqz a4, .LBB27_7 +; CHECK-NEXT: beqz a3, .LBB27_7 ; CHECK-NEXT: .LBB27_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB27_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw ft0, 0(a0) ; CHECK-NEXT: fdiv.s ft0, ft0, fa0 ; CHECK-NEXT: fsw ft0, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB27_6 +; CHECK-NEXT: bnez a1, .LBB27_6 ; CHECK-NEXT: .LBB27_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1695,42 +1685,42 @@ define void @sink_splat_frdiv_scalable(float* nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frdiv_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr 
a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB28_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a1, .LBB28_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB28_5 ; CHECK-NEXT: .LBB28_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: addiw a2, a1, -1 +; CHECK-NEXT: andi a3, a2, 1024 +; CHECK-NEXT: xori a2, a3, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: .LBB28_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a5, a4, 2 +; CHECK-NEXT: add a5, a0, a5 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfrdiv.vf v8, v8, fa0 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 -; CHECK-NEXT: bnez a6, .LBB28_3 +; CHECK-NEXT: bne a4, a2, .LBB28_3 ; CHECK-NEXT: # %bb.4: # %middle.block -; CHECK-NEXT: beqz a4, .LBB28_7 +; CHECK-NEXT: beqz a3, .LBB28_7 ; CHECK-NEXT: .LBB28_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB28_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw ft0, 0(a0) ; CHECK-NEXT: fdiv.s ft0, fa0, ft0 ; CHECK-NEXT: fsw ft0, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB28_6 +; CHECK-NEXT: bnez a1, .LBB28_6 ; CHECK-NEXT: .LBB28_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1787,42 +1777,42 @@ define void @sink_splat_fadd_scalable(float* nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fadd_scalable: ; CHECK: # %bb.0: # %entry -; 
CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB29_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a1, .LBB29_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB29_5 ; CHECK-NEXT: .LBB29_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: addiw a2, a1, -1 +; CHECK-NEXT: andi a3, a2, 1024 +; CHECK-NEXT: xori a2, a3, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: .LBB29_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a5, a4, 2 +; CHECK-NEXT: add a5, a0, a5 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfadd.vf v8, v8, fa0 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 -; CHECK-NEXT: bnez a6, .LBB29_3 +; CHECK-NEXT: bne a4, a2, .LBB29_3 ; CHECK-NEXT: # %bb.4: # %middle.block -; CHECK-NEXT: beqz a4, .LBB29_7 +; CHECK-NEXT: beqz a3, .LBB29_7 ; CHECK-NEXT: .LBB29_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB29_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw ft0, 0(a0) ; CHECK-NEXT: fadd.s ft0, ft0, fa0 ; CHECK-NEXT: fsw ft0, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB29_6 +; CHECK-NEXT: bnez a1, .LBB29_6 ; CHECK-NEXT: .LBB29_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1879,42 +1869,42 @@ define void @sink_splat_fsub_scalable(float* nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_fsub_scalable: ; CHECK: # %bb.0: # 
%entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB30_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a1, .LBB30_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB30_5 ; CHECK-NEXT: .LBB30_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: addiw a2, a1, -1 +; CHECK-NEXT: andi a3, a2, 1024 +; CHECK-NEXT: xori a2, a3, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: .LBB30_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a5, a4, 2 +; CHECK-NEXT: add a5, a0, a5 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfsub.vf v8, v8, fa0 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 -; CHECK-NEXT: bnez a6, .LBB30_3 +; CHECK-NEXT: bne a4, a2, .LBB30_3 ; CHECK-NEXT: # %bb.4: # %middle.block -; CHECK-NEXT: beqz a4, .LBB30_7 +; CHECK-NEXT: beqz a3, .LBB30_7 ; CHECK-NEXT: .LBB30_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB30_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw ft0, 0(a0) ; CHECK-NEXT: fsub.s ft0, ft0, fa0 ; CHECK-NEXT: fsw ft0, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB30_6 +; CHECK-NEXT: bnez a1, .LBB30_6 ; CHECK-NEXT: .LBB30_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -1971,42 +1961,42 @@ define void @sink_splat_frsub_scalable(float* nocapture %a, float %x) { ; CHECK-LABEL: sink_splat_frsub_scalable: ; 
CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a1, 1024 -; CHECK-NEXT: bgeu a1, a3, .LBB31_2 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: li a2, 1024 +; CHECK-NEXT: bgeu a2, a1, .LBB31_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: j .LBB31_5 ; CHECK-NEXT: .LBB31_2: # %vector.ph -; CHECK-NEXT: addiw a1, a3, -1 -; CHECK-NEXT: andi a4, a1, 1024 -; CHECK-NEXT: xori a1, a4, 1024 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: addiw a2, a1, -1 +; CHECK-NEXT: andi a3, a2, 1024 +; CHECK-NEXT: xori a2, a3, 1024 ; CHECK-NEXT: vsetvli a5, zero, e32, m1, ta, ma -; CHECK-NEXT: mv a5, a0 -; CHECK-NEXT: mv a6, a1 ; CHECK-NEXT: .LBB31_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a5, a4, 2 +; CHECK-NEXT: add a5, a0, a5 ; CHECK-NEXT: vl1re32.v v8, (a5) ; CHECK-NEXT: vfrsub.vf v8, v8, fa0 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vs1r.v v8, (a5) -; CHECK-NEXT: sub a6, a6, a3 -; CHECK-NEXT: add a5, a5, a2 -; CHECK-NEXT: bnez a6, .LBB31_3 +; CHECK-NEXT: bne a4, a2, .LBB31_3 ; CHECK-NEXT: # %bb.4: # %middle.block -; CHECK-NEXT: beqz a4, .LBB31_7 +; CHECK-NEXT: beqz a3, .LBB31_7 ; CHECK-NEXT: .LBB31_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a1, -1024 -; CHECK-NEXT: slli a1, a1, 2 -; CHECK-NEXT: add a0, a0, a1 +; CHECK-NEXT: addi a1, a2, -1024 +; CHECK-NEXT: slli a2, a2, 2 +; CHECK-NEXT: add a0, a0, a2 ; CHECK-NEXT: .LBB31_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: flw ft0, 0(a0) ; CHECK-NEXT: fsub.s ft0, fa0, ft0 ; CHECK-NEXT: fsw ft0, 0(a0) -; CHECK-NEXT: addi a2, a2, 1 +; CHECK-NEXT: addi a1, a1, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a2, .LBB31_6 +; CHECK-NEXT: bnez a1, .LBB31_6 ; CHECK-NEXT: .LBB31_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2146,35 +2136,34 @@ ; CHECK-LABEL: sink_splat_fma_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a2, vlenb -; 
CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a4, 1024 -; CHECK-NEXT: bgeu a4, a3, .LBB34_2 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB34_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB34_5 ; CHECK-NEXT: .LBB34_2: # %vector.ph -; CHECK-NEXT: li a6, 0 -; CHECK-NEXT: addiw a4, a3, -1 -; CHECK-NEXT: andi a5, a4, 1024 -; CHECK-NEXT: xori a4, a5, 1024 -; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma -; CHECK-NEXT: mv a7, a4 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB34_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add t0, a0, a6 -; CHECK-NEXT: vl1re32.v v8, (t0) -; CHECK-NEXT: add t1, a1, a6 -; CHECK-NEXT: vl1re32.v v9, (t1) +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a7, a0, a6 +; CHECK-NEXT: vl1re32.v v8, (a7) +; CHECK-NEXT: add a6, a1, a6 +; CHECK-NEXT: vl1re32.v v9, (a6) ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 -; CHECK-NEXT: vs1r.v v9, (t0) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a2 -; CHECK-NEXT: bnez a7, .LBB34_3 +; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: vs1r.v v9, (a7) +; CHECK-NEXT: bne a5, a3, .LBB34_3 ; CHECK-NEXT: # %bb.4: # %middle.block -; CHECK-NEXT: beqz a5, .LBB34_7 +; CHECK-NEXT: beqz a4, .LBB34_7 ; CHECK-NEXT: .LBB34_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a4, -1024 -; CHECK-NEXT: slli a3, a4, 2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 ; CHECK-NEXT: add a1, a1, a3 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB34_6: # %for.body @@ -2249,35 +2238,34 @@ ; CHECK-LABEL: sink_splat_fma_commute_scalable: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: csrr a2, vlenb -; CHECK-NEXT: srli a3, a2, 2 -; CHECK-NEXT: li a4, 1024 -; CHECK-NEXT: bgeu a4, a3, .LBB35_2 +; CHECK-NEXT: srli a2, a2, 2 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, 
.LBB35_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB35_5 ; CHECK-NEXT: .LBB35_2: # %vector.ph -; CHECK-NEXT: li a6, 0 -; CHECK-NEXT: addiw a4, a3, -1 -; CHECK-NEXT: andi a5, a4, 1024 -; CHECK-NEXT: xori a4, a5, 1024 -; CHECK-NEXT: vsetvli a7, zero, e32, m1, ta, ma -; CHECK-NEXT: mv a7, a4 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 +; CHECK-NEXT: vsetvli a6, zero, e32, m1, ta, ma ; CHECK-NEXT: .LBB35_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: add t0, a0, a6 -; CHECK-NEXT: vl1re32.v v8, (t0) -; CHECK-NEXT: add t1, a1, a6 -; CHECK-NEXT: vl1re32.v v9, (t1) +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a7, a0, a6 +; CHECK-NEXT: vl1re32.v v8, (a7) +; CHECK-NEXT: add a6, a1, a6 +; CHECK-NEXT: vl1re32.v v9, (a6) ; CHECK-NEXT: vfmacc.vf v9, fa0, v8 -; CHECK-NEXT: vs1r.v v9, (t0) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a2 -; CHECK-NEXT: bnez a7, .LBB35_3 +; CHECK-NEXT: add a5, a5, a2 +; CHECK-NEXT: vs1r.v v9, (a7) +; CHECK-NEXT: bne a5, a3, .LBB35_3 ; CHECK-NEXT: # %bb.4: # %middle.block -; CHECK-NEXT: beqz a5, .LBB35_7 +; CHECK-NEXT: beqz a4, .LBB35_7 ; CHECK-NEXT: .LBB35_5: # %for.body.preheader -; CHECK-NEXT: addi a2, a4, -1024 -; CHECK-NEXT: slli a3, a4, 2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 ; CHECK-NEXT: add a1, a1, a3 ; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB35_6: # %for.body @@ -2576,43 +2564,42 @@ define void @sink_splat_udiv_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_udiv_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB42_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB42_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 
; CHECK-NEXT: j .LBB42_5 ; CHECK-NEXT: .LBB42_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB42_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vdivu.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB42_3 +; CHECK-NEXT: bne a5, a3, .LBB42_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB42_7 ; CHECK-NEXT: .LBB42_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB42_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: divuw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: divuw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB42_6 +; CHECK-NEXT: bnez a2, .LBB42_6 ; CHECK-NEXT: .LBB42_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2669,43 +2656,42 @@ define void @sink_splat_sdiv_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_sdiv_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB43_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: 
bgeu a3, a2, .LBB43_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB43_5 ; CHECK-NEXT: .LBB43_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB43_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vdiv.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB43_3 +; CHECK-NEXT: bne a5, a3, .LBB43_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB43_7 ; CHECK-NEXT: .LBB43_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB43_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: divw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: divw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB43_6 +; CHECK-NEXT: bnez a2, .LBB43_6 ; CHECK-NEXT: .LBB43_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2762,43 +2748,42 @@ define void @sink_splat_urem_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_urem_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB44_2 +; 
CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB44_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB44_5 ; CHECK-NEXT: .LBB44_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB44_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vremu.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB44_3 +; CHECK-NEXT: bne a5, a3, .LBB44_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB44_7 ; CHECK-NEXT: .LBB44_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB44_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: remuw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: remuw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB44_6 +; CHECK-NEXT: bnez a2, .LBB44_6 ; CHECK-NEXT: .LBB44_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: @@ -2855,43 +2840,42 @@ define void @sink_splat_srem_scalable(i32* nocapture %a, i32 signext %x) { ; CHECK-LABEL: sink_splat_srem_scalable: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: csrr a5, vlenb -; 
CHECK-NEXT: srli a3, a5, 1 -; CHECK-NEXT: li a2, 1024 -; CHECK-NEXT: bgeu a2, a3, .LBB45_2 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a2, a2, 1 +; CHECK-NEXT: li a3, 1024 +; CHECK-NEXT: bgeu a3, a2, .LBB45_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a3, 0 ; CHECK-NEXT: j .LBB45_5 ; CHECK-NEXT: .LBB45_2: # %vector.ph -; CHECK-NEXT: addiw a2, a3, -1 -; CHECK-NEXT: andi a4, a2, 1024 -; CHECK-NEXT: xori a2, a4, 1024 -; CHECK-NEXT: slli a5, a5, 1 +; CHECK-NEXT: li a5, 0 +; CHECK-NEXT: addiw a3, a2, -1 +; CHECK-NEXT: andi a4, a3, 1024 +; CHECK-NEXT: xori a3, a4, 1024 ; CHECK-NEXT: vsetvli a6, zero, e32, m2, ta, ma -; CHECK-NEXT: mv a6, a0 -; CHECK-NEXT: mv a7, a2 ; CHECK-NEXT: .LBB45_3: # %vector.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a6, a5, 2 +; CHECK-NEXT: add a6, a0, a6 ; CHECK-NEXT: vl2re32.v v8, (a6) ; CHECK-NEXT: vrem.vx v8, v8, a1 +; CHECK-NEXT: add a5, a5, a2 ; CHECK-NEXT: vs2r.v v8, (a6) -; CHECK-NEXT: sub a7, a7, a3 -; CHECK-NEXT: add a6, a6, a5 -; CHECK-NEXT: bnez a7, .LBB45_3 +; CHECK-NEXT: bne a5, a3, .LBB45_3 ; CHECK-NEXT: # %bb.4: # %middle.block ; CHECK-NEXT: beqz a4, .LBB45_7 ; CHECK-NEXT: .LBB45_5: # %for.body.preheader -; CHECK-NEXT: addi a3, a2, -1024 -; CHECK-NEXT: slli a2, a2, 2 -; CHECK-NEXT: add a0, a0, a2 +; CHECK-NEXT: addi a2, a3, -1024 +; CHECK-NEXT: slli a3, a3, 2 +; CHECK-NEXT: add a0, a0, a3 ; CHECK-NEXT: .LBB45_6: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: lw a2, 0(a0) -; CHECK-NEXT: remw a2, a2, a1 -; CHECK-NEXT: sw a2, 0(a0) -; CHECK-NEXT: addi a3, a3, 1 +; CHECK-NEXT: lw a3, 0(a0) +; CHECK-NEXT: remw a3, a3, a1 +; CHECK-NEXT: sw a3, 0(a0) +; CHECK-NEXT: addi a2, a2, 1 ; CHECK-NEXT: addi a0, a0, 4 -; CHECK-NEXT: bnez a3, .LBB45_6 +; CHECK-NEXT: bnez a2, .LBB45_6 ; CHECK-NEXT: .LBB45_7: # %for.cond.cleanup ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll 
b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll --- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll @@ -643,14 +643,14 @@ ; CHECK-NEXT: blez a0, .LBB12_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader ; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: slli a4, a2, 3 ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB12_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a4, a3, 3 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: add a3, a3, a2 -; CHECK-NEXT: add a1, a1, a4 +; CHECK-NEXT: vse64.v v8, (a4) ; CHECK-NEXT: blt a3, a0, .LBB12_2 ; CHECK-NEXT: .LBB12_3: # %for.end ; CHECK-NEXT: ret @@ -680,15 +680,15 @@ ; CHECK-NEXT: blez a0, .LBB13_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader ; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: slli a4, a2, 3 -; CHECK-NEXT: vsetvli a5, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB13_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a4, a3, 3 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vsetvli zero, a2, e64, m1, ta, ma -; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: add a3, a3, a2 -; CHECK-NEXT: add a1, a1, a4 +; CHECK-NEXT: vse64.v v8, (a4) ; CHECK-NEXT: blt a3, a0, .LBB13_2 ; CHECK-NEXT: .LBB13_3: # %for.end ; CHECK-NEXT: ret @@ -716,15 +716,15 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: li a2, 0 ; CHECK-NEXT: vsetivli a3, 4, e64, m1, ta, mu -; CHECK-NEXT: slli a4, a3, 3 -; CHECK-NEXT: vsetvli a5, zero, e64, m1, ta, ma +; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: .LBB14_1: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: slli a4, a2, 3 +; CHECK-NEXT: add a4, a4, a1 ; CHECK-NEXT: vsetivli zero, 4, e64, m1, ta, ma -; CHECK-NEXT: vse64.v v8, (a1) ; CHECK-NEXT: add a2, a2, a3 -; CHECK-NEXT: add a1, 
a1, a4 +; CHECK-NEXT: vse64.v v8, (a4) ; CHECK-NEXT: blt a2, a0, .LBB14_1 ; CHECK-NEXT: # %bb.2: # %for.end ; CHECK-NEXT: ret diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/icmp-zero.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/icmp-zero.ll --- a/llvm/test/Transforms/LoopStrengthReduce/RISCV/icmp-zero.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/icmp-zero.ll @@ -357,11 +357,11 @@ ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[T1:%.*]] ; CHECK: t1: -; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], [[T1]] ], [ -1, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LSR_IV_NEXT]] = add nsw i64 [[LSR_IV]], 1 +; CHECK-NEXT: [[T2:%.*]] = phi i64 [ [[T3:%.*]], [[T1]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[T3]] = add nuw i64 [[T2]], 1 ; CHECK-NEXT: br i1 true, label [[T4:%.*]], label [[T1]] ; CHECK: t4: -; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[LSR_IV_NEXT]] to i32 +; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T2]] to i32 ; CHECK-NEXT: [[T6:%.*]] = add i32 [[T5]], 1 ; CHECK-NEXT: [[T7:%.*]] = icmp eq i32 [[T5]], [[T6]] ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution-dbg-msg.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution-dbg-msg.ll --- a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution-dbg-msg.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution-dbg-msg.ll @@ -1,6 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -O3 -mattr=+v -debug -lsr-drop-solution 2>&1 | FileCheck --check-prefix=DEBUG %s -; RUN: llc < %s -O3 -mattr=+v -debug 2>&1 | FileCheck --check-prefix=DEBUG2 %s +; RUN: llc < %s -O3 -mattr=+v -debug 2>&1 | FileCheck --check-prefix=DEBUG %s target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" target triple = "riscv64-unknown-linux-gnu" @@ -10,7 +9,6 @@ ;DEBUG: The baseline solution requires 2 instructions 4 regs, with addrec cost 2, plus 3 setup cost ;DEBUG: Baseline is more profitable than chosen solution, dropping LSR solution. 
-;DEBUG2: Baseline is more profitable than chosen solution, add option 'lsr-drop-solution' to drop LSR solution. entry: %0 = ptrtoint ptr %a0 to i64 %1 = tail call i64 @llvm.riscv.vsetvli.i64(i64 %a2, i64 0, i64 3) diff --git a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll --- a/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/RISCV/lsr-drop-solution.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -O3 -mattr=+v -lsr-drop-solution | FileCheck --check-prefix=CHECK %s +; RUN: llc < %s -O3 -mattr=+v | FileCheck --check-prefix=CHECK %s target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" target triple = "riscv64-unknown-linux-gnu"