diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2001,6 +2001,9 @@
     return SplitVector(N, DL, LoVT, HiVT);
   }
 
+  /// Split the explicit vector length parameter of a VP operation.
+  std::pair<SDValue, SDValue> SplitEVL(SDValue N, EVT VecVT, const SDLoc &DL);
+
   /// Split the node's operand with EXTRACT_SUBVECTOR and
   /// return the low/high part.
   std::pair<SDValue, SDValue> SplitVectorOperand(const SDNode *N, unsigned OpNo)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -1193,7 +1193,7 @@
     llvm_unreachable("Do not know how to expand the result of this operator!");
 
   case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
-  case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+  case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break;
   case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
   case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2252,7 +2252,7 @@
   case ISD::ARITH_FENCE: SplitRes_ARITH_FENCE(N, Lo, Hi); break;
   case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
-  case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+  case ISD::SELECT: SplitRes_Select(N, Lo, Hi); break;
   case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
   case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
   case ISD::FREEZE: SplitRes_FREEZE(N, Lo, Hi); break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -1022,7 +1022,7 @@
   void SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
                              SDValue &Lo, SDValue &Hi);
   void SplitRes_ARITH_FENCE (SDNode *N, SDValue &Lo, SDValue &Hi);
-  void SplitRes_SELECT      (SDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitRes_Select      (SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_SELECT_CC   (SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_UNDEF       (SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitRes_FREEZE      (SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -506,9 +506,10 @@
   GetSplitOp(Op, Lo, Hi);
 }
 
-void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
+void DAGTypeLegalizer::SplitRes_Select(SDNode *N, SDValue &Lo, SDValue &Hi) {
   SDValue LL, LH, RL, RH, CL, CH;
   SDLoc dl(N);
+  unsigned Opcode = N->getOpcode();
   GetSplitOp(N->getOperand(1), LL, LH);
   GetSplitOp(N->getOperand(2), RL, RH);
 
@@ -539,8 +540,18 @@
     std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
   }
 
-  Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
-  Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH);
+  if (Opcode != ISD::VP_SELECT) {
+    Lo = DAG.getNode(Opcode, dl, LL.getValueType(), CL, LL, RL);
+    Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH);
+    return;
+  }
+
+  SDValue EVLLo, EVLHi;
+  std::tie(EVLLo, EVLHi) =
+      DAG.SplitEVL(N->getOperand(3), N->getValueType(0), dl);
+
+  Lo = DAG.getNode(Opcode, dl, LL.getValueType(), CL, LL, RL, EVLLo);
+  Hi = DAG.getNode(Opcode, dl, LH.getValueType(), CH, LH, RH, EVLHi);
 }
 
 void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -914,7 +914,8 @@
   case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
   case ISD::VSELECT:
-  case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+  case ISD::SELECT:
+  case ISD::VP_SELECT: SplitRes_Select(N, Lo, Hi); break;
   case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
   case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
   case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break;
@@ -1140,21 +1141,9 @@
   else
     std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, SDLoc(Mask));
 
-  // Split the vector length parameter.
-  // %evl -> umin(%evl, %halfnumelts) and usubsat(%evl - %halfnumelts).
-  SDValue EVL = N->getOperand(3);
-  EVT VecVT = N->getValueType(0);
-  EVT EVLVT = EVL.getValueType();
-  assert(VecVT.getVectorElementCount().isKnownEven() &&
-         "Expecting the mask to be an evenly-sized vector");
-  unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2;
-  SDValue HalfNumElts =
-      VecVT.isFixedLengthVector()
-          ? DAG.getConstant(HalfMinNumElts, dl, EVLVT)
-          : DAG.getVScale(dl, EVLVT,
-                          APInt(EVLVT.getScalarSizeInBits(), HalfMinNumElts));
-  SDValue EVLLo = DAG.getNode(ISD::UMIN, dl, EVLVT, EVL, HalfNumElts);
-  SDValue EVLHi = DAG.getNode(ISD::USUBSAT, dl, EVLVT, EVL, HalfNumElts);
+  SDValue EVLLo, EVLHi;
+  std::tie(EVLLo, EVLHi) =
+      DAG.SplitEVL(N->getOperand(3), N->getValueType(0), dl);
 
   Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(),
                    {LHSLo, RHSLo, MaskLo, EVLLo}, Flags);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -10648,6 +10648,23 @@
   return std::make_pair(Lo, Hi);
 }
 
+std::pair<SDValue, SDValue> SelectionDAG::SplitEVL(SDValue N, EVT VecVT,
+                                                   const SDLoc &DL) {
+  // Split the vector length parameter.
+  // %evl -> umin(%evl, %halfnumelts) and usubsat(%evl - %halfnumelts).
+  EVT VT = N.getValueType();
+  assert(VecVT.getVectorElementCount().isKnownEven() &&
+         "Expecting the mask to be an evenly-sized vector");
+  unsigned HalfMinNumElts = VecVT.getVectorMinNumElements() / 2;
+  SDValue HalfNumElts =
+      VecVT.isFixedLengthVector()
+          ? getConstant(HalfMinNumElts, DL, VT)
+          : getVScale(DL, VT, APInt(VT.getScalarSizeInBits(), HalfMinNumElts));
+  SDValue Lo = getNode(ISD::UMIN, DL, VT, N, HalfNumElts);
+  SDValue Hi = getNode(ISD::USUBSAT, DL, VT, N, HalfNumElts);
+  return std::make_pair(Lo, Hi);
+}
+
 /// Widen the vector up to the next power of two using INSERT_SUBVECTOR.
 SDValue SelectionDAG::WidenVector(const SDValue &N, const SDLoc &DL) {
   EVT VT = N.getValueType();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vselect-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=ilp32d -riscv-v-vector-bits-min=128 \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v,+m -target-abi=ilp32d -riscv-v-vector-bits-min=128 \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=lp64d -riscv-v-vector-bits-min=128 \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v,+m -target-abi=lp64d -riscv-v-vector-bits-min=128 \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s
 
 declare <1 x i1> @llvm.vp.select.v1i1(<1 x i1>, <1 x i1>, <1 x i1>, i32)
@@ -146,6 +146,131 @@
   ret <16 x i8> %v
 }
 
+declare <256 x i8> @llvm.vp.select.v256i8(<256 x i1>, <256 x i8>, <256 x i8>, i32)
+
+define <256 x i8> @select_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i32 zeroext %evl) {
+; CHECK-LABEL: select_v256i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    li a4, 24
+; CHECK-NEXT:    mul a2, a2, a4
+; CHECK-NEXT:    sub sp, sp, a2
+; CHECK-NEXT:    li a4, 128
+; CHECK-NEXT:    vsetvli zero, a4, e8, m8, ta, mu
+; CHECK-NEXT:    vle8.v v24, (a1)
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 4
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v24, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    slli a2, a2, 3
+; CHECK-NEXT:    add a2, sp, a2
+; CHECK-NEXT:    addi a2, a2, 16
+; CHECK-NEXT:    vs8r.v v16, (a2) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v1, v8
+; CHECK-NEXT:    addi a1, a1, 128
+; CHECK-NEXT:    mv a2, a3
+; CHECK-NEXT:    bltu a3, a4, .LBB11_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:  .LBB11_2:
+; CHECK-NEXT:    li a4, 0
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vle8.v v24, (a1)
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
+; CHECK-NEXT:    addi a0, a3, -128
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8re8.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8re8.v v16, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v8, v16, v0
+; CHECK-NEXT:    bltu a3, a0, .LBB11_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv a4, a0
+; CHECK-NEXT:  .LBB11_4:
+; CHECK-NEXT:    vsetvli zero, a4, e8, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v1
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8re8.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = call <256 x i8> @llvm.vp.select.v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i32 %evl)
+  ret <256 x i8> %v
+}
+
+define <256 x i8> @select_evl_v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c) {
+; CHECK-LABEL: select_evl_v256i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a2, a2, a3
+; CHECK-NEXT:    sub sp, sp, a2
+; CHECK-NEXT:    li a2, 128
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
+; CHECK-NEXT:    vle8.v v24, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, a1, 128
+; CHECK-NEXT:    vle8.v v24, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vle8.v v16, (a1)
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetivli zero, 1, e8, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8re8.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v24, v24, v16, v0
+; CHECK-NEXT:    vsetvli zero, a2, e8, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8re8.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8re8.v v16, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v16, v8, v0
+; CHECK-NEXT:    vmv8r.v v16, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = call <256 x i8> @llvm.vp.select.v256i8(<256 x i1> %a, <256 x i8> %b, <256 x i8> %c, i32 129)
+  ret <256 x i8> %v
+}
+
 declare <2 x i16> @llvm.vp.select.v2i16(<2 x i1>, <2 x i16>, <2 x i16>, i32)
 
 define <2 x i16> @select_v2i16(<2 x i1> %a, <2 x i16> %b, <2 x i16> %c, i32 zeroext %evl) {
@@ -290,6 +415,91 @@
   ret <16 x i64> %v
 }
 
+declare <32 x i64> @llvm.vp.select.v32i64(<32 x i1>, <32 x i64>, <32 x i64>, i32)
+
+define <32 x i64> @select_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 zeroext %evl) {
+; CHECK-LABEL: select_v32i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    li a3, 24
+; CHECK-NEXT:    mul a1, a1, a3
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    addi a1, a0, 128
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
+; CHECK-NEXT:    vle64.v v24, (a1)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a3, a2, -16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v24, v0
+; CHECK-NEXT:    li a1, 0
+; CHECK-NEXT:    bltu a2, a3, .LBB25_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a1, a3
+; CHECK-NEXT:  .LBB25_2:
+; CHECK-NEXT:    vle64.v v8, (a0)
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
+; CHECK-NEXT:    vslidedown.vi v0, v24, 2
+; CHECK-NEXT:    vsetvli zero, a1, e64, m8, ta, mu
+; CHECK-NEXT:    li a0, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8re8.v v8, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    bltu a2, a0, .LBB25_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    li a2, 16
+; CHECK-NEXT:  .LBB25_4:
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v24
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8re8.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v8, v24, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 24
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = call <32 x i64> @llvm.vp.select.v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 %evl)
+  ret <32 x i64> %v
+}
+
+define <32 x i64> @select_evl_v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c) {
+; CHECK-LABEL: select_evl_v32i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, mu
+; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    vle64.v v24, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, mu
+; CHECK-NEXT:    vslidedown.vi v0, v0, 2
+; CHECK-NEXT:    vsetivli zero, 1, e64, m8, ta, mu
+; CHECK-NEXT:    vmerge.vvm v16, v24, v16, v0
+; CHECK-NEXT:    ret
+  %v = call <32 x i64> @llvm.vp.select.v32i64(<32 x i1> %a, <32 x i64> %b, <32 x i64> %c, i32 17)
+  ret <32 x i64> %v
+}
+
 declare <2 x half> @llvm.vp.select.v2f16(<2 x i1>, <2 x half>, <2 x half>, i32)
 
 define <2 x half> @select_v2f16(<2 x i1> %a, <2 x half> %b, <2 x half> %c, i32 zeroext %evl) {
@@ -386,6 +596,61 @@
   ret <16 x float> %v
 }
 
+declare <64 x float> @llvm.vp.select.v64f32(<64 x i1>, <64 x float>, <64 x float>, i32)
+
+define <64 x float> @select_v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> %c, i32 zeroext %evl) {
+; CHECK-LABEL: select_v64f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    li a3, 32
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, mu
+; CHECK-NEXT:    vle32.v v24, (a0)
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v24, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vs8r.v v16, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    addi a0, a0, 128
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:    bltu a2, a3, .LBB35_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a1, 32
+; CHECK-NEXT:  .LBB35_2:
+; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    vle32.v v16, (a0)
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    addi a0, a2, -32
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vl8re8.v v24, (a1) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    bltu a2, a0, .LBB35_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv a3, a0
+; CHECK-NEXT:  .LBB35_4:
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, ta, mu
+; CHECK-NEXT:    vslidedown.vi v0, v0, 4
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, mu
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v16, v16, v24, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = call <64 x float> @llvm.vp.select.v64f32(<64 x i1> %a, <64 x float> %b, <64 x float> %c, i32 %evl)
+  ret <64 x float> %v
+}
+
 declare <2 x double> @llvm.vp.select.v2f64(<2 x i1>, <2 x double>, <2 x double>, i32)
 
 define <2 x double> @select_v2f64(<2 x i1> %a, <2 x double> %b, <2 x double> %c, i32 zeroext %evl) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vselect-vp.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=ilp32d \
+; RUN: llc -mtriple=riscv32 -mattr=+d,+m,+experimental-zfh,+experimental-v -target-abi=ilp32d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=riscv64 -mattr=+d,+experimental-zfh,+experimental-v -target-abi=lp64d \
+; RUN: llc -mtriple=riscv64 -mattr=+d,+m,+experimental-zfh,+experimental-v -target-abi=lp64d \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s
 
 declare <vscale x 1 x i1> @llvm.vp.select.nxv1i1(<vscale x 1 x i1>, <vscale x 1 x i1>, <vscale x 1 x i1>, i32)
@@ -342,6 +342,124 @@
   ret <vscale x 16 x i32> %v
 }
 
+declare <vscale x 32 x i32> @llvm.vp.select.nxv32i32(<vscale x 32 x i1>, <vscale x 32 x i32>, <vscale x 32 x i32>, i32)
+
+define <vscale x 32 x i32> @select_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv32i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v1, v0
+; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a4, a1, 3
+; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    vl8re32.v v8, (a4)
+; CHECK-NEXT:    srli a5, a1, 2
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    sub a4, a2, a1
+; CHECK-NEXT:    vslidedown.vx v0, v0, a5
+; CHECK-NEXT:    bltu a2, a4, .LBB27_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a3, a4
+; CHECK-NEXT:  .LBB27_2:
+; CHECK-NEXT:    vl8re32.v v24, (a0)
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, mu
+; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    bltu a2, a1, .LBB27_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:  .LBB27_4:
+; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8re8.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x i32> @llvm.vp.select.nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c, i32 %evl)
+  ret <vscale x 32 x i32> %v
+}
+
+declare i32 @llvm.vscale.i32()
+
+define <vscale x 32 x i32> @select_evl_nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c) {
+; CHECK-LABEL: select_evl_nxv32i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v1, v0
+; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a2, a1, 3
+; CHECK-NEXT:    add a2, a0, a2
+; CHECK-NEXT:    vl8re32.v v8, (a2)
+; CHECK-NEXT:    srli a5, a1, 2
+; CHECK-NEXT:    vsetvli a2, zero, e8, mf2, ta, mu
+; CHECK-NEXT:    slli a2, a1, 1
+; CHECK-NEXT:    sub a4, a1, a2
+; CHECK-NEXT:    vslidedown.vx v0, v0, a5
+; CHECK-NEXT:    bltu a1, a4, .LBB28_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a3, a4
+; CHECK-NEXT:  .LBB28_2:
+; CHECK-NEXT:    vl8re32.v v24, (a0)
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, mu
+; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    bltu a1, a2, .LBB28_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv a1, a2
+; CHECK-NEXT:  .LBB28_4:
+; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8re8.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %evl = call i32 @llvm.vscale.i32()
+  %evl0 = mul i32 %evl, 8
+  %v = call <vscale x 32 x i32> @llvm.vp.select.nxv32i32(<vscale x 32 x i1> %a, <vscale x 32 x i32> %b, <vscale x 32 x i32> %c, i32 %evl0)
+  ret <vscale x 32 x i32> %v
+}
+
 declare <vscale x 1 x i64> @llvm.vp.select.nxv1i64(<vscale x 1 x i1>, <vscale x 1 x i64>, <vscale x 1 x i64>, i32)
 
 define <vscale x 1 x i64> @select_nxv1i64(<vscale x 1 x i1> %a, <vscale x 1 x i64> %b, <vscale x 1 x i64> %c, i32 zeroext %evl) {
@@ -569,3 +687,60 @@
   %v = call <vscale x 8 x double> @llvm.vp.select.nxv8f64(<vscale x 8 x i1> %a, <vscale x 8 x double> %b, <vscale x 8 x double> %c, i32 %evl)
   ret <vscale x 8 x double> %v
 }
+
+declare <vscale x 16 x double> @llvm.vp.select.nxv16f64(<vscale x 16 x i1>, <vscale x 16 x double>, <vscale x 16 x double>, i32)
+
+define <vscale x 16 x double> @select_nxv16f64(<vscale x 16 x i1> %a, <vscale x 16 x double> %b, <vscale x 16 x double> %c, i32 zeroext %evl) {
+; CHECK-LABEL: select_nxv16f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 4
+; CHECK-NEXT:    sub sp, sp, a1
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a1) # Unknown-size Folded Spill
+; CHECK-NEXT:    vmv1r.v v1, v0
+; CHECK-NEXT:    li a3, 0
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a4, a1, 3
+; CHECK-NEXT:    add a4, a0, a4
+; CHECK-NEXT:    vl8re64.v v8, (a4)
+; CHECK-NEXT:    srli a5, a1, 3
+; CHECK-NEXT:    vsetvli a4, zero, e8, mf4, ta, mu
+; CHECK-NEXT:    sub a4, a2, a1
+; CHECK-NEXT:    vslidedown.vx v0, v0, a5
+; CHECK-NEXT:    bltu a2, a4, .LBB48_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a3, a4
+; CHECK-NEXT:  .LBB48_2:
+; CHECK-NEXT:    vl8re64.v v24, (a0)
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
+; CHECK-NEXT:    vsetvli zero, a3, e64, m8, ta, mu
+; CHECK-NEXT:    vmerge.vvm v16, v8, v16, v0
+; CHECK-NEXT:    bltu a2, a1, .LBB48_4
+; CHECK-NEXT:  # %bb.3:
+; CHECK-NEXT:    mv a2, a1
+; CHECK-NEXT:  .LBB48_4:
+; CHECK-NEXT:    vsetvli zero, a2, e64, m8, ta, mu
+; CHECK-NEXT:    vmv1r.v v0, v1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vl8re8.v v8, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vl8re8.v v24, (a0) # Unknown-size Folded Reload
+; CHECK-NEXT:    vmerge.vvm v8, v24, v8, v0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x double> @llvm.vp.select.nxv16f64(<vscale x 16 x i1> %a, <vscale x 16 x double> %b, <vscale x 16 x double> %c, i32 %evl)
+  ret <vscale x 16 x double> %v
+}
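
For reference, a minimal standalone C++ sketch (not part of the patch) of the EVL-splitting arithmetic that SplitEVL performs for a fixed-length vector. It uses the <32 x i64> case from select_evl_v32i64 above, where an EVL of 17 over 32 elements is split into 16 for the low half and 1 for the high half, matching the "vsetivli zero, 16" / "vsetivli zero, 1" pair in the CHECK lines; the variable names are illustrative, not taken from the patch.

// Sketch (assumption: fixed-length vector, so HalfNumElts is a plain constant
// rather than a vscale-scaled value).
#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  uint64_t EVL = 17;          // explicit vector length operand of the VP node
  uint64_t HalfNumElts = 16;  // half of the 32-element result vector
  uint64_t EVLLo = std::min(EVL, HalfNumElts);                // ISD::UMIN
  uint64_t EVLHi = EVL > HalfNumElts ? EVL - HalfNumElts : 0; // ISD::USUBSAT
  assert(EVLLo == 16 && EVLHi == 1);                          // low=16, high=1
  return 0;
}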