diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -183,6 +183,7 @@
   bool doPeepholeMergeVVMFold();
   bool performVMergeToVMv(SDNode *N);
   bool performCombineVMergeAndVOps(SDNode *N, bool IsTA);
+  bool performCombineVMvAndVOps(SDNode *N, bool IsTA);
 };
 
 namespace RISCV {
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3452,6 +3452,171 @@
   return true;
 }
 
+// A vmv.v.v can be thought of as a vmerge, where instead of masking with a
+// mask, we're masking the tail with VL. For example, in the sequence below
+// the vmv.v.v copies over the first 2 elements from the vadd.vv:
+//
+// vsetivli zero, 4, e32, m1, ta, ma
+// vadd.vv v9, v10, v11
+// vsetivli zero, 2, e32, m1, tu, ma
+// vmv.v.v v8, v9
+//
+// This optimisation folds the vmv.v.v into the preceding op if it only has
+// one use:
+//
+// vsetivli zero, 2, e32, m1, ta, ma
+// vadd.vv v8, v10, v11
+//
+// In general, we can just replace the VL of the op with the VL of the
+// vmv.v.v, unless it's a load, in which case we make sure we're only loading
+// a VL less than or equal to the original, so we don't end up loading more
+// elements than before.
+bool RISCVDAGToDAGISel::performCombineVMvAndVOps(SDNode *N, bool IsTA) {
+  SDLoc DL(N);
+  SDValue Passthru;
+  if (IsTA)
+    Passthru = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL,
+                                              N->getValueType(0)),
+                       0);
+  else
+    Passthru = N->getOperand(0);
+  SDValue Src = N->getOperand(IsTA ? 0 : 1);
+  SDValue VL = N->getOperand(IsTA ? 1 : 2);
+  assert(Src.getResNo() == 0 && "Src should be the first value of a node.");
+
+  // TODO: We should peel off layers of COPY_TO_REGCLASS so we can handle
+  // merges of different vector lengths.
+
+  // Src can only have one user, N.
+  if (!Src.hasOneUse())
+    return false;
+
+  if (!Src.isMachineOpcode())
+    return false;
+
+  unsigned SrcOpc = Src.getMachineOpcode();
+
+  const MCInstrDesc &SrcMCID = TII->get(SrcOpc);
+  bool HasTiedDest = RISCVII::isFirstDefTiedToFirstUse(SrcMCID);
+  // The last operand of a masked instruction may be glued.
+  bool HasGlueOp = Src->getGluedNode() != nullptr;
+  // The chain operand may exist either before the glued operands or in the
+  // last position.
+  unsigned SrcChainOpIdx = Src.getNumOperands() - HasGlueOp - 1;
+  bool HasChainOp = Src.getOperand(SrcChainOpIdx).getValueType() == MVT::Other;
+  bool HasVecPolicyOp = RISCVII::hasVecPolicyOp(SrcMCID.TSFlags);
+  unsigned SrcVLIndex =
+      Src.getNumOperands() - HasVecPolicyOp - HasChainOp - HasGlueOp - 2;
+
+  unsigned NewOpc;
+
+  // If Src has a passthru operand, then we can reuse the same pseudo opcode.
+  if (HasTiedDest) {
+    NewOpc = SrcOpc;
+
+    // Make sure there's a corresponding MaskedPseudo, otherwise it's not safe
+    // to change VL.
+    if (!RISCV::lookupMaskedIntrinsicByUnmaskedTU(SrcOpc) &&
+        !RISCV::getMaskedPseudoInfo(SrcOpc))
+      return false;
+
+    // The Src passthru must be IMPLICIT_DEF or the same as the vmv.v.v
+    // passthru.
+    SDValue SrcPassthru = Src.getOperand(0);
+    if (!isImplicitDef(SrcPassthru) && SrcPassthru != Passthru)
+      return false;
+  }
+  // Otherwise if the Src pseudo doesn't have a passthru operand, see if we
+  // can find a variant of it that does.
+  else if (const RISCV::RISCVMaskedPseudoInfo *Info =
+               RISCV::lookupMaskedIntrinsicByUnmaskedTA(SrcOpc)) {
+    NewOpc = Info->UnmaskedTUPseudo;
+  }
+  // Otherwise, there's no pseudo we can use to preserve the tail.
+  else {
+    return false;
+  }
+
+  if (SrcMCID.hasUnmodeledSideEffects())
+    return false;
+
+  if (this->mayRaiseFPException(Src.getNode()) &&
+      !Src->getFlags().hasNoFPExcept())
+    return false;
+
+  // If the src is a load, we can only fold in the VL if it's a constant
+  // that's the same size or smaller; otherwise we might end up loading more
+  // elements than before.
+  if (SrcMCID.mayLoad()) {
+    ConstantSDNode *VLN = dyn_cast<ConstantSDNode>(VL);
+    if (!VLN)
+      return false;
+
+    SDValue SrcVL = Src.getOperand(SrcVLIndex);
+    if (!isa<ConstantSDNode>(SrcVL) ||
+        Src.getConstantOperandVal(SrcVLIndex) < VLN->getZExtValue())
+      return false;
+  }
+
+  if (HasChainOp) {
+    // Avoid creating cycles in the DAG. We must ensure that none of the other
+    // operands depend on Src through its chain.
+    SmallVector<const SDNode *, 8> LoopWorklist;
+    SmallPtrSet<const SDNode *, 16> Visited;
+    LoopWorklist.push_back(Passthru.getNode());
+    LoopWorklist.push_back(VL.getNode());
+    if (SDNode *Glued = N->getGluedNode())
+      LoopWorklist.push_back(Glued);
+    if (SDNode::hasPredecessorHelper(Src.getNode(), Visited, LoopWorklist))
+      return false;
+  }
+
+  // Make sure the policy is tail undisturbed.
+  uint64_t Policy;
+  if (HasVecPolicyOp) {
+    // Pseudo operands are ..., VL, SEW, [Policy].
+    uint64_t SrcPolicy = Src.getConstantOperandVal(SrcVLIndex + 2);
+    Policy = SrcPolicy & ~RISCVII::TAIL_AGNOSTIC;
+  } else {
+    Policy = RISCVII::MASK_AGNOSTIC;
+  }
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Passthru);
+  Ops.append(HasTiedDest ? Src->op_begin() + 1 : Src->op_begin(),
+             Src->op_begin() + SrcVLIndex);
+  Ops.append({VL, /* SEW */ Src.getOperand(SrcVLIndex + 1)});
+  if (RISCVII::hasVecPolicyOp(TII->get(NewOpc).TSFlags))
+    Ops.push_back(
+        CurDAG->getTargetConstant(Policy, DL, Subtarget->getXLenVT()));
+
+  // Result node should have the chain operand of Src.
+  if (HasChainOp)
+    Ops.push_back(Src.getOperand(SrcChainOpIdx));
+
+  // Copy over Src's glue operand, or N's glue operand if Src doesn't have one.
+  if (HasGlueOp)
+    Ops.push_back(Src.getOperand(Src->getNumOperands() - 1));
+  else if (N->getGluedNode())
+    Ops.push_back(N->getOperand(N->getNumOperands() - 1));
+
+  SDNode *Result = CurDAG->getMachineNode(NewOpc, DL, Src->getVTList(), Ops);
+
+  Result->setFlags(Src->getFlags());
+
+  // Replace the old vmv.v.v node with Result.
+  ReplaceUses(SDValue(N, 0), SDValue(Result, 0));
+
+  // Replace Src's other values, e.g. the chain.
+  for (unsigned Idx = 1; Idx < Src->getNumValues(); ++Idx)
+    ReplaceUses(Src.getValue(Idx), SDValue(Result, Idx));
+
+  // Try to transform Result to an unmasked pseudo, if it's masked.
+  doPeepholeMaskedRVV(Result);
+  return true;
+}
+
 bool RISCVDAGToDAGISel::doPeepholeMergeVVMFold() {
   bool MadeChange = false;
   SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
@@ -3481,11 +3646,33 @@
              Opcode == RISCV::PseudoVMERGE_VVM_M8;
     };
 
+    auto IsVMvTU = [](unsigned Opcode) {
+      return Opcode == RISCV::PseudoVMV_V_V_MF8_TU ||
+             Opcode == RISCV::PseudoVMV_V_V_MF4_TU ||
+             Opcode == RISCV::PseudoVMV_V_V_MF2_TU ||
+             Opcode == RISCV::PseudoVMV_V_V_M1_TU ||
+             Opcode == RISCV::PseudoVMV_V_V_M2_TU ||
+             Opcode == RISCV::PseudoVMV_V_V_M4_TU ||
+             Opcode == RISCV::PseudoVMV_V_V_M8_TU;
+    };
+
+    auto IsVMvTA = [](unsigned Opcode) {
+      return Opcode == RISCV::PseudoVMV_V_V_MF8 ||
+             Opcode == RISCV::PseudoVMV_V_V_MF4 ||
+             Opcode == RISCV::PseudoVMV_V_V_MF2 ||
+             Opcode == RISCV::PseudoVMV_V_V_M1 ||
+             Opcode == RISCV::PseudoVMV_V_V_M2 ||
+             Opcode == RISCV::PseudoVMV_V_V_M4 ||
+             Opcode == RISCV::PseudoVMV_V_V_M8;
+    };
+
     unsigned Opc = N->getMachineOpcode();
     if (IsVMergeTU(Opc) || IsVMergeTA(Opc))
       MadeChange |= performCombineVMergeAndVOps(N, IsVMergeTA(Opc));
     if (IsVMergeTU(Opc) && N->getOperand(0) == N->getOperand(1))
       MadeChange |= performVMergeToVMv(N);
+    if (IsVMvTU(Opc) || IsVMvTA(Opc))
+      MadeChange |= performCombineVMvAndVOps(N, IsVMvTA(Opc));
   }
   return MadeChange;
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll b/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll
--- a/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/combine-vmv.ll
@@ -9,10 +9,8 @@
 define <vscale x 4 x i32> @vadd(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
 ; CHECK-LABEL: vadd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vv v10, v10, v12
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vadd.vv v8, v10, v12
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1)
   %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl2)
@@ -22,25 +20,19 @@
 define <vscale x 4 x i32> @vadd_undef(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
 ; CHECK-LABEL: vadd_undef:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vv v8, v8, v10
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
-; CHECK-NEXT:    vmv.v.v v8, v8
+; CHECK-NEXT:    vadd.vv v8, v8, v10
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1)
   %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %v, iXLen %vl2)
   ret <vscale x 4 x i32> %w
 }
 
-; TODO: Is this correct if there's already a passthru in the src?
 define <vscale x 4 x i32> @vadd_same_passthru(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1, iXLen %vl2) {
 ; CHECK-LABEL: vadd_same_passthru:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, tu, ma
-; CHECK-NEXT:    vmv2r.v v14, v8
-; CHECK-NEXT:    vadd.vv v14, v10, v12
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v14
+; CHECK-NEXT:    vadd.vv v8, v10, v12
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.riscv.vadd.nxv4i32.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, iXLen %vl1)
   %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl2)
@@ -52,10 +44,8 @@
 define <vscale x 4 x i32> @vadd_mask_ma(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl1, iXLen %vl2) {
 ; CHECK-LABEL: vadd_mask_ma:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vv v10, v10, v12, v0.t
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vadd.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.riscv.vadd.mask.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl1, iXLen 2)
   %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl2)
@@ -65,10 +55,8 @@
 define <vscale x 4 x i32> @vadd_mask_mu(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl1, iXLen %vl2) {
 ; CHECK-LABEL: vadd_mask_mu:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vv v10, v10, v12, v0.t
-; CHECK-NEXT:    vsetvli zero, a1, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vsetvli zero, a1, e32, m2, tu, mu
+; CHECK-NEXT:    vadd.vv v8, v10, v12, v0.t
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.riscv.vadd.mask.nxv4i32.nxv4i32(<vscale x 4 x i32> poison, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %mask, iXLen %vl1, iXLen 0)
   %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen %vl2)
@@ -80,10 +68,8 @@
 define <vscale x 4 x i32> @foldable_load(<vscale x 4 x i32> %passthru, ptr %p) {
 ; CHECK-LABEL: foldable_load:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x i32> @llvm.riscv.vle.nxv4i32(<vscale x 4 x i32> poison, ptr %p, iXLen 4)
   %w = call <vscale x 4 x i32> @llvm.riscv.vmv.v.v.nxv4i32(<vscale x 4 x i32> %passthru, <vscale x 4 x i32> %v, iXLen 2)
@@ -111,12 +97,28 @@
 define <vscale x 4 x float> @vfadd(<vscale x 4 x float> %passthru, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl1, iXLen %vl2) {
 ; CHECK-LABEL: vfadd:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
-; CHECK-NEXT:    vfadd.vv v10, v10, v12
 ; CHECK-NEXT:    vsetvli zero, a1, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vfadd.vv v8, v10, v12
 ; CHECK-NEXT:    ret
   %v = call <vscale x 4 x float> @llvm.riscv.vfadd.nxv4f32.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %b, iXLen %vl1)
   %w = call <vscale x 4 x float> @llvm.riscv.vmv.v.v.nxv4f32(<vscale x 4 x float> %passthru, <vscale x 4 x float> %v, iXLen %vl2)
   ret <vscale x 4 x float> %w
 }
+
+declare <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, iXLen)
+
+declare <vscale x 2 x i32> @llvm.riscv.vredsum.nxv2i32.nxv4i32(<vscale x 2 x i32>, <vscale x 4 x i32>, <vscale x 2 x i32>, iXLen)
+
+; This shouldn't be folded because VL affects the result.
+define <vscale x 2 x i32> @vredsum(<vscale x 2 x i32> %passthru, <vscale x 4 x i32> %v1, <vscale x 2 x i32> %v2, iXLen %vl1, iXLen %vl2) {
+; CHECK-LABEL: vredsum:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT:    vredsum.vs v9, v10, v9
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, ma
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %v = call <vscale x 2 x i32> @llvm.riscv.vredsum.nxv2i32.nxv4i32(<vscale x 2 x i32> poison, <vscale x 4 x i32> %v1, <vscale x 2 x i32> %v2, iXLen %vl1)
+  %w = call <vscale x 2 x i32> @llvm.riscv.vmv.v.v.nxv2i32(<vscale x 2 x i32> %passthru, <vscale x 2 x i32> %v, iXLen %vl2)
+  ret <vscale x 2 x i32> %w
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
@@ -5,10 +5,8 @@
 define <4 x i32> @insert_subvector_load_v4i32_v4i32(<4 x i32> %v1, ptr %p) {
 ; CHECK-LABEL: insert_subvector_load_v4i32_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v2 = load <4 x i32>, ptr %p
   %v3 = shufflevector <4 x i32> %v2, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -19,10 +17,8 @@
 define <4 x i32> @insert_subvector_vp_load_v4i32_v4i32(<4 x i32> %v1, ptr %p, <4 x i1> %mask) {
 ; CHECK-LABEL: insert_subvector_vp_load_v4i32_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
   %v2 = call <4 x i32> @llvm.vp.load.v4i32(ptr %p, <4 x i1> %mask, i32 4)
   %v3 = shufflevector <4 x i32> %v2, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -48,11 +44,8 @@
 define <4 x i32> @insert_subvector_load_foldable_passthru_v4i32_v4i32(<4 x i32> %v1, ptr %p, <4 x i1> %mask) {
 ; CHECK-LABEL: insert_subvector_load_foldable_passthru_v4i32_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, mu
-; CHECK-NEXT:    vmv1r.v v9, v8
-; CHECK-NEXT:    vle32.v v9, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
   %v2 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %p, i32 4, <4 x i1> %mask, <4 x i32> %v1)
   %v3 = shufflevector <4 x i32> %v2, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -64,9 +57,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vadd.vv v9, v9, v10
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vadd.vv v8, v9, v10
 ; CHECK-NEXT:    ret
   %v3 = add <4 x i32> %v2, <i32 0, i32 1, i32 2, i32 3>
   %v4 = shufflevector <4 x i32> %v3, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -77,10 +69,8 @@
 define <4 x i32> @insert_subvector_vp_add_v4i32_v4i32(<4 x i32> %v1, <4 x i32> %v2, <4 x i1> %mask) {
 ; CHECK-LABEL: insert_subvector_vp_add_v4i32_v4i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v9, 1, v0.t
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, mu
+; CHECK-NEXT:    vadd.vi v8, v9, 1, v0.t
 ; CHECK-NEXT:    ret
   %v3 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %v2, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i1> %mask, i32 4)
   %v4 = shufflevector <4 x i32> %v3, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
@@ -149,10 +139,8 @@
 define <4 x i32> @insert_subvector_load_v4i32_v8i32(<4 x i32> %v1, ptr %p) {
 ; CHECK-LABEL: insert_subvector_load_v4i32_v8i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v2 = load <8 x i32>, ptr %p
   %v3 = shufflevector <8 x i32> %v2, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -180,9 +168,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT:    vid.v v9
-; CHECK-NEXT:    vadd.vv v9, v10, v9
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vadd.vv v8, v10, v9
 ; CHECK-NEXT:    ret
   %v3 = add <8 x i32> %v2, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %v4 = shufflevector <8 x i32> %v3, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>