diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -546,6 +546,9 @@
     return BypassSlowDivWidths;
   }
 
+  /// Return true only if vscale must be a power of two.
+  virtual bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
+
   /// Return true if Flow Control is an expensive operation that should be
   /// avoided.
   bool isJumpExpensive() const { return JumpIsExpensive; }
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3869,6 +3869,12 @@
     if (C->getAPIntValue().zextOrTrunc(BitWidth).isPowerOf2())
       return true;
 
+  // vscale(power-of-two) is a power-of-two for some targets
+  if (Val.getOpcode() == ISD::VSCALE &&
+      getTargetLoweringInfo().isVScaleKnownToBeAPowerOfTwo() &&
+      isKnownToBeAPowerOfTwo(Val.getOperand(0)))
+    return true;
+
   // More could be done here, though the above checks are enough
   // to handle some common cases.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -597,6 +597,8 @@
                                           unsigned uid,
                                           MCContext &Ctx) const override;
 
+  bool isVScaleKnownToBeAPowerOfTwo() const override;
+
 private:
   /// RISCVCCAssignFn - This target-specific function extends the default
   /// CCValAssign with additional information used to lower RISC-V calling
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12130,6 +12130,17 @@
   return MCSymbolRefExpr::create(MBB->getSymbol(), Ctx);
 }
 
+bool RISCVTargetLowering::isVScaleKnownToBeAPowerOfTwo() const {
+  // We define vscale to be VLEN/RVVBitsPerBlock. VLEN is always a power
+  // of two >= 64, and RVVBitsPerBlock is 64. Thus, vscale must be
+  // a power of two as well.
+  // FIXME: This doesn't work for zve32, but that's already broken
+  // elsewhere for the same reason.
+  assert(Subtarget.getRealMinVLen() >= 64 && "zve32* unsupported");
+  assert(RISCV::RVVBitsPerBlock == 64 && "RVVBitsPerBlock changed, audit needed");
+  return true;
+}
+
 bool RISCVTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
                                                      EVT VT) const {
   VT = VT.getScalarType();
diff --git a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
--- a/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/sink-splat-operands.ll
@@ -266,8 +266,9 @@
 ; CHECK-NEXT:    j .LBB7_5
 ; CHECK-NEXT:  .LBB7_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -358,8 +359,9 @@
 ; CHECK-NEXT:    j .LBB8_5
 ; CHECK-NEXT:  .LBB8_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -450,8 +452,9 @@
 ; CHECK-NEXT:    j .LBB9_5
 ; CHECK-NEXT:  .LBB9_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -542,8 +545,9 @@
 ; CHECK-NEXT:    j .LBB10_5
 ; CHECK-NEXT:  .LBB10_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -634,8 +638,9 @@
 ; CHECK-NEXT:    j .LBB11_5
 ; CHECK-NEXT:  .LBB11_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -726,8 +731,9 @@
 ; CHECK-NEXT:    j .LBB12_5
 ; CHECK-NEXT:  .LBB12_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -818,8 +824,9 @@
 ; CHECK-NEXT:    j .LBB13_5
 ; CHECK-NEXT:  .LBB13_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -1018,8 +1025,9 @@
 ; CHECK-NEXT:    j .LBB17_5
 ; CHECK-NEXT:  .LBB17_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -1110,8 +1118,9 @@
 ; CHECK-NEXT:    j .LBB18_5
 ; CHECK-NEXT:  .LBB18_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -1202,8 +1211,9 @@
 ; CHECK-NEXT:    j .LBB19_5
 ; CHECK-NEXT:  .LBB19_2: # %vector.ph
 ; CHECK-NEXT:    li a5, 0
-; CHECK-NEXT:    remu a3, a2, a1
-; CHECK-NEXT:    sub a2, a2, a3
+; CHECK-NEXT:    addiw a2, a1, -1
+; CHECK-NEXT:    andi a3, a2, 1024
+; CHECK-NEXT:    xori a2, a3, 1024
 ; CHECK-NEXT:    slli a4, a4, 1
 ; CHECK-NEXT:    vsetvli a6, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a6, a0
@@ -1510,8 +1520,9 @@
 ; CHECK-NEXT:    j .LBB26_5
 ; CHECK-NEXT:  .LBB26_2: # %vector.ph
 ; CHECK-NEXT:    li a5, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    vsetvli a6, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    mv a6, a0
 ; CHECK-NEXT:  .LBB26_3: # %vector.body
@@ -1601,8 +1612,9 @@
 ; CHECK-NEXT:    j .LBB27_5
 ; CHECK-NEXT:  .LBB27_2: # %vector.ph
 ; CHECK-NEXT:    li a5, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    vsetvli a6, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    mv a6, a0
 ; CHECK-NEXT:  .LBB27_3: # %vector.body
@@ -1692,8 +1704,9 @@
 ; CHECK-NEXT:    j .LBB28_5
 ; CHECK-NEXT:  .LBB28_2: # %vector.ph
 ; CHECK-NEXT:    li a5, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    vsetvli a6, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    mv a6, a0
 ; CHECK-NEXT:  .LBB28_3: # %vector.body
@@ -1783,8 +1796,9 @@
 ; CHECK-NEXT:    j .LBB29_5
 ; CHECK-NEXT:  .LBB29_2: # %vector.ph
 ; CHECK-NEXT:    li a5, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    vsetvli a6, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    mv a6, a0
 ; CHECK-NEXT:  .LBB29_3: # %vector.body
@@ -1874,8 +1888,9 @@
 ; CHECK-NEXT:    j .LBB30_5
 ; CHECK-NEXT:  .LBB30_2: # %vector.ph
 ; CHECK-NEXT:    li a5, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    vsetvli a6, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    mv a6, a0
 ; CHECK-NEXT:  .LBB30_3: # %vector.body
@@ -1965,8 +1980,9 @@
 ; CHECK-NEXT:    j .LBB31_5
 ; CHECK-NEXT:  .LBB31_2: # %vector.ph
 ; CHECK-NEXT:    li a5, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    vsetvli a6, zero, e32, m1, ta, mu
 ; CHECK-NEXT:    mv a6, a0
 ; CHECK-NEXT:  .LBB31_3: # %vector.body
@@ -2139,8 +2155,9 @@
 ; CHECK-NEXT:  .LBB34_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
 ; CHECK-NEXT:    li a7, 0
-; CHECK-NEXT:    remu a5, a4, a3
-; CHECK-NEXT:    sub a4, a4, a5
+; CHECK-NEXT:    addiw a4, a3, -1
+; CHECK-NEXT:    andi a5, a4, 1024
+; CHECK-NEXT:    xori a4, a5, 1024
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:  .LBB34_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -2241,8 +2258,9 @@
 ; CHECK-NEXT:  .LBB35_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
 ; CHECK-NEXT:    li a7, 0
-; CHECK-NEXT:    remu a5, a4, a3
-; CHECK-NEXT:    sub a4, a4, a5
+; CHECK-NEXT:    addiw a4, a3, -1
+; CHECK-NEXT:    andi a5, a4, 1024
+; CHECK-NEXT:    xori a4, a5, 1024
 ; CHECK-NEXT:    vsetvli t0, zero, e32, m1, ta, mu
 ; CHECK-NEXT:  .LBB35_3: # %vector.body
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -2567,8 +2585,9 @@
 ; CHECK-NEXT:    j .LBB42_5
 ; CHECK-NEXT:  .LBB42_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -2659,8 +2678,9 @@
 ; CHECK-NEXT:    j .LBB43_5
 ; CHECK-NEXT:  .LBB43_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -2751,8 +2771,9 @@
 ; CHECK-NEXT:    j .LBB44_5
 ; CHECK-NEXT:  .LBB44_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
@@ -2843,8 +2864,9 @@
 ; CHECK-NEXT:    j .LBB45_5
 ; CHECK-NEXT:  .LBB45_2: # %vector.ph
 ; CHECK-NEXT:    li a6, 0
-; CHECK-NEXT:    remu a4, a3, a2
-; CHECK-NEXT:    sub a3, a3, a4
+; CHECK-NEXT:    addiw a3, a2, -1
+; CHECK-NEXT:    andi a4, a3, 1024
+; CHECK-NEXT:    xori a3, a4, 1024
 ; CHECK-NEXT:    slli a5, a5, 1
 ; CHECK-NEXT:    vsetvli a7, zero, e32, m2, ta, mu
 ; CHECK-NEXT:    mv a7, a0
diff --git a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vscale-power-of-two.ll
@@ -8,7 +8,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 6
-; CHECK-NEXT:    remu a0, a0, a1
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    ret
   %vscale = call i64 @llvm.vscale.i64()
   %shifted = lshr i64 %vscale, 3
@@ -21,7 +22,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    remu a0, a0, a1
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    ret
   %vscale = call i64 @llvm.vscale.i64()
   %urem = urem i64 %TC, %vscale
@@ -32,7 +34,8 @@
 ; CHECK-LABEL: vscale_shl:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    remu a0, a0, a1
+; CHECK-NEXT:    addi a1, a1, -1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    ret
   %vscale = call i64 @llvm.vscale.i64()
   %shifted = shl i64 %vscale, 3
@@ -45,8 +48,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    remu a1, a0, a1
-; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    ret
   %vscale = call i64 @llvm.vscale.i64()
   %urem = urem i64 %TC, %vscale
@@ -58,8 +61,8 @@
 ; CHECK-LABEL: TC_minus_rem_shl:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    remu a1, a0, a1
-; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    neg a1, a1
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    ret
   %vscale = call i64 @llvm.vscale.i64()
   %shifted = shl i64 %vscale, 3
@@ -73,9 +76,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 3
-; CHECK-NEXT:    li a1, 1024
-; CHECK-NEXT:    remu a0, a1, a0
-; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    negw a0, a0
+; CHECK-NEXT:    andi a0, a0, 1024
 ; CHECK-NEXT:    ret
   %vscale = call i64 @llvm.vscale.i64()
   %urem = urem i64 1024, %vscale
@@ -90,10 +92,10 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    neg a0, a0
 ; CHECK-NEXT:    lui a1, 1
 ; CHECK-NEXT:    addiw a1, a1, -2048
-; CHECK-NEXT:    remu a0, a1, a0
-; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    and a0, a0, a1
 ; CHECK-NEXT:    ret
   %vscale = call i64 @llvm.vscale.i64()
   %urem = urem i64 2048, %vscale
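
For context only (not part of the patch): a minimal IR sketch of the pattern this change targets, mirroring the TC_minus_rem_shl test above. The function name @round_down_to_vf_multiple, the parameter %n, and the factor of 8 are illustrative; the expected lowering is taken from the updated CHECK lines in the diff.

declare i64 @llvm.vscale.i64()

; Rounds %n down to a multiple of the scalable VF (vscale * 8). Once the
; target reports vscale as a power of two, the urem/sub pair should lower
; to a mask (csrr vlenb; neg; and) instead of a remu, per the
; TC_minus_rem_shl checks above.
define i64 @round_down_to_vf_multiple(i64 %n) {
  %vscale = call i64 @llvm.vscale.i64()
  %vf = shl i64 %vscale, 3
  %rem = urem i64 %n, %vf
  %rounded = sub i64 %n, %rem
  ret i64 %rounded
}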