diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3096,15 +3096,14 @@
   return false;
 }
 
-// Return true if we can make sure mask of N is all-ones mask.
-static bool usesAllOnesMask(SDNode *N, unsigned MaskOpIdx) {
+static bool usesAllOnesMask(SDValue MaskOp, SDValue GlueOp) {
   // Check that we're using V0 as a mask register.
-  if (!isa<RegisterSDNode>(N->getOperand(MaskOpIdx)) ||
-      cast<RegisterSDNode>(N->getOperand(MaskOpIdx))->getReg() != RISCV::V0)
+  if (!isa<RegisterSDNode>(MaskOp) ||
+      cast<RegisterSDNode>(MaskOp)->getReg() != RISCV::V0)
     return false;
 
   // The glued user defines V0.
-  const auto *Glued = N->getGluedNode();
+  const auto *Glued = GlueOp.getNode();
   if (!Glued || Glued->getOpcode() != ISD::CopyToReg)
     return false;
 
@@ -3131,6 +3130,12 @@
          IsVMSet(MaskSetter.getMachineOpcode());
 }
 
+// Return true if we can make sure mask of N is all-ones mask.
+static bool usesAllOnesMask(SDNode *N, unsigned MaskOpIdx) {
+  return usesAllOnesMask(N->getOperand(MaskOpIdx),
+                         N->getOperand(N->getNumOperands() - 1));
+}
+
 static bool isImplicitDef(SDValue V) {
   return V.isMachineOpcode() &&
          V.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
 }
 
@@ -3188,6 +3193,58 @@
   return true;
 }
 
+// A vmv.v.v is equivalent to a vmerge with an all-ones mask. If N is a
+// PseudoVMV_V_V, this places the operands for an equivalent PseudoVMERGE into
+// Ops.
+static bool getVMergeOpsFromVMv(SDNode *N, SelectionDAG *DAG,
+                                SmallVectorImpl<SDValue> &Ops) {
+  unsigned VmsetOpc;
+  switch (N->getMachineOpcode()) {
+  case RISCV::PseudoVMV_V_V_MF8:
+    VmsetOpc = RISCV::PseudoVMSET_M_B1;
+    break;
+  case RISCV::PseudoVMV_V_V_MF4:
+    VmsetOpc = RISCV::PseudoVMSET_M_B2;
+    break;
+  case RISCV::PseudoVMV_V_V_MF2:
+    VmsetOpc = RISCV::PseudoVMSET_M_B4;
+    break;
+  case RISCV::PseudoVMV_V_V_M1:
+    VmsetOpc = RISCV::PseudoVMSET_M_B8;
+    break;
+  case RISCV::PseudoVMV_V_V_M2:
+    VmsetOpc = RISCV::PseudoVMSET_M_B16;
+    break;
+  case RISCV::PseudoVMV_V_V_M4:
+    VmsetOpc = RISCV::PseudoVMSET_M_B32;
+    break;
+  case RISCV::PseudoVMV_V_V_M8:
+    VmsetOpc = RISCV::PseudoVMSET_M_B64;
+    break;
+  default:
+    return false;
+  }
+
+  SDLoc DL(N);
+  ElementCount EC = N->getValueType(0).getVectorElementCount();
+  MVT MaskVT = MVT::getVectorVT(MVT::i1, EC);
+  SDValue Merge = N->getOperand(0);
+  SDValue Src = N->getOperand(1);
+  SDValue VL = N->getOperand(2);
+  SDValue SEW = N->getOperand(3);
+  SDValue Policy = N->getOperand(4);
+
+  SDValue AllOnesMask =
+      SDValue(DAG->getMachineNode(VmsetOpc, DL, MaskVT, VL, SEW), 0);
+  SDValue MaskCopy = DAG->getCopyToReg(DAG->getEntryNode(), DL, RISCV::V0,
+                                       AllOnesMask, SDValue());
+  SDValue Mask = DAG->getRegister(RISCV::V0, MaskVT);
+
+  Ops = SmallVector<SDValue>(
+      {Merge, Merge, Src, Mask, VL, SEW, Policy, MaskCopy.getValue(1)});
+  return true;
+}
+
 // Try to fold away VMERGE_VVM instructions. We handle these cases:
 // -Masked TU VMERGE_VVM combined with an unmasked TA instruction
 //  folds to a masked TU instruction. VMERGE_VVM must have merge operand
@@ -3202,15 +3259,19 @@
 // form with an IMPLICIT_DEF passthrough operand or the unsuffixed (TA) pseudo
 // form.
 bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
-
-  SDValue Merge = N->getOperand(0);
-  SDValue False = N->getOperand(1);
-  SDValue True = N->getOperand(2);
-  SDValue Mask = N->getOperand(3);
-  SDValue VL = N->getOperand(4);
+  SmallVector<SDValue> VMergeOps;
+  // Handle vmv.v.v's, since they are equivalent to a vmerge with an all-ones
+  // mask.
+  if (!getVMergeOpsFromVMv(N, CurDAG, VMergeOps))
+    VMergeOps = SmallVector<SDValue>(N->ops());
+  SDValue Merge = VMergeOps[0];
+  SDValue False = VMergeOps[1];
+  SDValue True = VMergeOps[2];
+  SDValue Mask = VMergeOps[3];
+  SDValue VL = VMergeOps[4];
   // We always have a glue node for the mask at v0
+  SDValue Glue = VMergeOps[VMergeOps.size() - 1];
   assert(cast<RegisterSDNode>(Mask)->getReg() == RISCV::V0);
-  SDValue Glue = N->getOperand(N->getNumOperands() - 1);
   assert(Glue.getValueType() == MVT::Glue);
 
   // We require that either merge and false are the same, or that merge
@@ -3293,7 +3354,7 @@
     // the mask from the True instruction.
     // FIXME: Support mask agnostic True instruction which would have an
     // undef merge operand.
-    if (!usesAllOnesMask(N, /* MaskOpIdx */ 3))
+    if (!usesAllOnesMask(Mask, Glue))
       return false;
 
     Mask = True->getOperand(Info->MaskOpIdx);
@@ -3443,8 +3504,18 @@
            Opcode == RISCV::PseudoVMERGE_VVM_M8;
   };
 
+  auto IsVMv = [](unsigned Opcode) {
+    return Opcode == RISCV::PseudoVMV_V_V_MF8 ||
+           Opcode == RISCV::PseudoVMV_V_V_MF4 ||
+           Opcode == RISCV::PseudoVMV_V_V_MF2 ||
+           Opcode == RISCV::PseudoVMV_V_V_M1 ||
+           Opcode == RISCV::PseudoVMV_V_V_M2 ||
+           Opcode == RISCV::PseudoVMV_V_V_M4 ||
+           Opcode == RISCV::PseudoVMV_V_V_M8;
+  };
+
   unsigned Opc = N->getMachineOpcode();
-  if (IsVMerge(Opc))
+  if (IsVMerge(Opc) || IsVMv(Opc))
     MadeChange |= performCombineVMergeAndVOps(N);
   if (IsVMerge(Opc) && N->getOperand(0) == N->getOperand(1))
     MadeChange |= performVMergeToVMv(N);
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll
--- a/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll
@@ -80,10 +80,8 @@
 define <vscale x 4 x i32> @foldable_load(<vscale x 4 x i32> %passthru, ptr %p) {
 ; CHECK-LABEL: foldable_load:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.riscv.vle.nxv4i32(<vscale x 4 x i32> poison, ptr %p, iXLen 4)
   %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen 2)
diff --git a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -470,13 +470,12 @@
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    srli a0, a0, 2
 ; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v11, v10, a0
 ; CHECK-NEXT:    vslidedown.vx v8, v9, a0
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v9, v11
 ; CHECK-NEXT:    add a1, a0, a0
 ; CHECK-NEXT:    vsetvli zero, a1, e16, m1, tu, ma
 ; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, tu, ma
+; CHECK-NEXT:    vslidedown.vx v9, v10, a0
 ; CHECK-NEXT:    ret
   %res = call <vscale x 6 x half> @llvm.vector.extract.nxv6f16.nxv12f16(<vscale x 12 x half> %in, i64 6)
   ret <vscale x 6 x half> %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
@@ -5,10 +5,8 @@
 define <4 x i32> @insert_subvector_load_v4i32_v4i32(<4 x i32> %v1, ptr %p) {
 ; CHECK-LABEL: insert_subvector_load_v4i32_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v2 = load <4 x i32>, ptr %p
   %v3 = shufflevector <4 x i32> %v2, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -19,10 +17,8 @@
 define <4 x i32> @insert_subvector_vp_load_v4i32_v4i32(<4 x i32> %v1, ptr %p, <4 x i1> %mask) {
 ; CHECK-LABEL: insert_subvector_vp_load_v4i32_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, mu
+; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
   %v2 = call <4 x i32> @llvm.vp.load.v4i32(ptr %p, <4 x i1> %mask, i32 4)
   %v3 = shufflevector <4 x i32> %v2, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -48,11 +44,8 @@
 define <4 x i32> @insert_subvector_load_foldable_passthru_v4i32_v4i32(<4 x i32> %v1, ptr %p, <4 x i1> %mask) {
 ; CHECK-LABEL: insert_subvector_load_foldable_passthru_v4i32_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vmv1r.v v9, v8
-; CHECK-NEXT:    vle32.v v9, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, mu
+; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
   %v2 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4, <4 x i1> %mask, <4 x i32> %v1)
   %v3 = shufflevector <4 x i32> %v2, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -64,9 +57,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vadd.vv v9, v9, v10
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vadd.vv v8, v9, v10
 ; CHECK-NEXT:    ret
   %v3 = add <4 x i32> %v2, <i32 0, i32 1, i32 2, i32 3>
   %v4 = shufflevector <4 x i32> %v3, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -77,10 +69,8 @@
 define <4 x i32> @insert_subvector_vp_add_v4i32_v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i1> %mask) {
 ; CHECK-LABEL: insert_subvector_vp_add_v4i32_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v9, 1, v0.t
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, mu
+; CHECK-NEXT:    vadd.vi v8, v9, 1, v0.t
 ; CHECK-NEXT:    ret
   %v3 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %v2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i1> %mask, i32 4)
   %v4 = shufflevector <4 x i32> %v3, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -149,10 +139,8 @@
 define <4 x i32> @insert_subvector_load_v4i32_v8i32(<4 x i32> %v1, ptr %p) {
 ; CHECK-LABEL: insert_subvector_load_v4i32_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v2 = load <8 x i32>, ptr %p
   %v3 = shufflevector <8 x i32> %v2, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -180,9 +168,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    vadd.vv v9, v10, v9
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vadd.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %v3 = add <8 x i32> %v2, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v4 = shufflevector <8 x i32> %v3, <8 x i32> poison, <4 x i32>
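
Note on the underlying equivalence (illustrative only, not part of the patch): a vmv.v.v writes the first VL lanes of its source over the passthru and leaves the tail undisturbed, which is exactly what a vmerge with an all-ones mask does; that is the fact getVMergeOpsFromVMv encodes. The IR sketch below assumes i64 for the VL argument (the tests' iXLen is a RUN-line substitution), and the function names @vmv_form/@vmerge_form are hypothetical:

  declare <vscale x 4 x i1> @llvm.riscv.vmset.nxv4i1(i64)
  declare <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, i64)
  declare <vscale x 4 x i32> @llvm.riscv.vmerge.nxv4i32.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i64)

  define <vscale x 4 x i32> @vmv_form(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v) {
    ; Writes lanes 0..1 of %v over %passthru; the tail stays undisturbed.
    %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, i64 2)
    ret <vscale x 4 x i32> %w
  }

  define <vscale x 4 x i32> @vmerge_form(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v) {
    ; Same result: an all-ones mask selects the true operand %v in every active
    ; lane, and %passthru doubles as the false operand, mirroring the new
    ; Ops = {Merge, Merge, Src, Mask, VL, SEW, Policy, ...} operand list.
    %m = call <vscale x 4 x i1> @llvm.riscv.vmset.nxv4i1(i64 2)
    %w = call <vscale x 4 x i32> @llvm.riscv.vmerge.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, <vscale x 4 x i1> %m, i64 2)
    ret <vscale x 4 x i32> %w
  }

Because the two forms are interchangeable, handing the vmv.v.v's operands to performCombineVMergeAndVOps lets every existing vmerge fold (into loads, vadds, etc., as the test diffs above show) apply to vmv.v.v for free.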