diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3352,6 +3352,61 @@
   assert(True.getResNo() == 0 &&
          "Expect True is the first output of an instruction.");
 
+  // True and False may not be the same vector type because of bitcasts, but
+  // they should always be the same size.
+  assert(True.getSimpleValueType().getSizeInBits() ==
+         False.getSimpleValueType().getSizeInBits());
+
+  // We want to be able to handle vmerges and vmv.v.vs where True is a subreg,
+  // e.g.:
+  //
+  // t22: nxv4i32 = PseudoVMV_V_V_M2 t42, t38, ...
+  //   t38: nxv4i32 = INSERT_SUBREG IMPLICIT_DEF:nxv4i32, t40,
+  //          TargetConstant:i32<4>
+  //     t40: v2i32 = COPY_TO_REGCLASS t41, TargetConstant:i64<22>
+  //       t41: nxv1i32,ch = PseudoVLE32_V_MF2 ...
+  //
+  // If we're inserting into the bottom subregister of an implicit_def, then we
+  // can unwrap True (t38) down to the underlying operation, in this case
+  // PseudoVLE32. We just need to make sure to match up the types in False and
+  // Result, and then we end up with something like this:
+  //
+  // t48: nxv4i32 = INSERT_SUBREG t42, t47, TargetConstant:i32<4>
+  //   t47: nxv1i32,ch = PseudoVLE32_V_MF2_MASK t46, ...
+  //     t46: nxv1i32 = EXTRACT_SUBREG t42, TargetConstant:i32<4>
+  std::optional<unsigned> SubRegIdx;
+
+  auto UnwrapSubReg = [this, &SubRegIdx](SDValue V) {
+    if (!V->isMachineOpcode())
+      return SDValue();
+    if (V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS)
+      return V->getOperand(0);
+    // If we're inserting into the bottom subreg of a vector register, unwrap
+    // it.
+    if (V->getMachineOpcode() == TargetOpcode::INSERT_SUBREG &&
+        isImplicitDef(V->getOperand(0))) {
+      MVT SubVecVT = V->getOperand(1).getSimpleValueType();
+      if (SubVecVT.isFixedLengthVector())
+        SubVecVT =
+            Subtarget->getTargetLowering()->getContainerForFixedLengthVector(
+                SubVecVT);
+      unsigned LeftoverIdx;
+      std::tie(SubRegIdx, LeftoverIdx) =
+          RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+              V->getSimpleValueType(0), SubVecVT, 0,
+              Subtarget->getRegisterInfo());
+      if (SubRegIdx == V->getConstantOperandVal(2) && LeftoverIdx == 0)
+        return V->getOperand(1);
+    }
+    return SDValue();
+  };
+
+  while (SDValue SubRegTrue = UnwrapSubReg(True)) {
+    if (True->use_empty() || !True->use_begin()->isOnlyUserOf(True.getNode()))
+      return false;
+    True = SubRegTrue;
+  }
+
   // N must be the only user of True.
   if (!True.hasOneUse())
     return false;
@@ -3504,9 +3559,14 @@
   SDValue PolicyOp =
       CurDAG->getTargetConstant(Policy, DL, Subtarget->getXLenVT());
 
   SmallVector<SDValue, 8> Ops;
-  Ops.push_back(False);
+  // If True is operating on a subreg, then we need to extract out a subreg of
+  // False so the types match.
+  if (SubRegIdx)
+    Ops.push_back(CurDAG->getTargetExtractSubreg(
+        *SubRegIdx, DL, True->getSimpleValueType(0), False));
+  else
+    Ops.push_back(False);
 
   const bool HasRoundingMode = RISCVII::hasRoundModeOp(TrueTSFlags);
   const unsigned NormalOpsEnd = TrueVLIndex - IsMasked - HasRoundingMode;
@@ -3539,7 +3599,16 @@
   CurDAG->setNodeMemRefs(Result, cast<MachineSDNode>(True)->memoperands());
 
   // Replace vmerge.vvm node by Result.
-  ReplaceUses(SDValue(N, 0), SDValue(Result, 0));
+  if (SubRegIdx) {
+    // If True was operating on a subreg, then we need to insert the subreg
+    // back into the full-size False.
+    MVT MergeVT = N->getSimpleValueType(0);
+    SDValue Insert = CurDAG->getTargetInsertSubreg(*SubRegIdx, DL, MergeVT,
+                                                   False, SDValue(Result, 0));
+    ReplaceUses(SDValue(N, 0), Insert);
+  } else {
+    ReplaceUses(SDValue(N, 0), SDValue(Result, 0));
+  }
 
   // Replace the remaining values of True, e.g. chain and VL.
   for (unsigned Idx = 1; Idx < True->getNumValues(); ++Idx)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector-shuffle.ll
@@ -80,10 +80,8 @@
 define <4 x i32> @insert_subvector_load_v4i32_v2i32(<4 x i32> %v1, ptr %p) {
 ; CHECK-LABEL: insert_subvector_load_v4i32_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, tu, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %v2 = load <2 x i32>, ptr %p
   %v3 = shufflevector <2 x i32> %v2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -95,10 +93,8 @@
 define <4 x i32> @insert_subvector_vp_load_v4i32_v2i32(<4 x i32> %v1, ptr %p, <2 x i1> %mask) {
 ; CHECK-LABEL: insert_subvector_vp_load_v4i32_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0), v0.t
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, tu, mu
+; CHECK-NEXT:    vle32.v v8, (a0), v0.t
 ; CHECK-NEXT:    ret
   %v2 = call <2 x i32> @llvm.vp.load.v2i32.p0(ptr %p, <2 x i1> %mask, i32 2)
   %v3 = shufflevector <2 x i32> %v2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -111,9 +107,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
 ; CHECK-NEXT:    vid.v v10
-; CHECK-NEXT:    vadd.vv v9, v9, v10
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v9, v10
 ; CHECK-NEXT:    ret
   %v3 = add <2 x i32> %v2, <i32 0, i32 1>
   %v4 = shufflevector <2 x i32> %v3, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -125,10 +120,8 @@
 define <4 x i32> @insert_subvector_vp_add_v4i32_v2i32(<4 x i32> %v1, <2 x i32> %v2, <2 x i1> %mask) {
 ; CHECK-LABEL: insert_subvector_vp_add_v4i32_v2i32:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vadd.vi v9, v9, 1, v0.t
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, tu, mu
+; CHECK-NEXT:    vadd.vi v8, v9, 1, v0.t
 ; CHECK-NEXT:    ret
   %v3 = call <2 x i32> @llvm.vp.add.v2i32(<2 x i32> %v2, <2 x i32> <i32 1, i32 1>, <2 x i1> %mask, i32 2)
   %v4 = shufflevector <2 x i32> %v3, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -195,10 +188,8 @@
 define <4 x i8> @insert_subvector_add_mf8(<4 x i8> %v1, <2 x i8> %a, <2 x i8> %b) {
 ; CHECK-LABEL: insert_subvector_add_mf8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v10
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v9, v10
 ; CHECK-NEXT:    ret
   %v2 = add <2 x i8> %a, %b
   %v3 = shufflevector <2 x i8> %v2, <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -209,10 +200,8 @@
 define <4 x i16> @insert_subvector_add_mf4(<4 x i16> %v1, <2 x i16> %a, <2 x i16> %b) {
 ; CHECK-LABEL: insert_subvector_add_mf4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v10
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v9, v10
 ; CHECK-NEXT:    ret
   %v2 = add <2 x i16> %a, %b
   %v3 = shufflevector <2 x i16> %v2, <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -223,10 +212,8 @@
 define <4 x i32> @insert_subvector_add_mf2(<4 x i32> %v1, <2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: insert_subvector_add_mf2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vadd.vv v9, v9, v10
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v9, v10
 ; CHECK-NEXT:    ret
   %v2 = add <2 x i32> %a, %b
   %v3 = shufflevector <2 x i32> %v2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
@@ -237,10 +224,8 @@
 define <8 x i32> @insert_subvector_add_m1(<8 x i32> %v1, <4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: insert_subvector_add_m1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vadd.vv v10, v10, v11
-; CHECK-NEXT:    vsetivli zero, 4, e32, m2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v10, v11
 ; CHECK-NEXT:    ret
   %v2 = add <4 x i32> %a, %b
   %v3 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -251,10 +236,8 @@
 define <16 x i32> @insert_subvector_add_m2(<16 x i32> %v1, <8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: insert_subvector_add_m2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; CHECK-NEXT:    vadd.vv v12, v12, v14
-; CHECK-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v12, v14
 ; CHECK-NEXT:    ret
   %v2 = add <8 x i32> %a, %b
   %v3 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
@@ -265,10 +248,8 @@
 define <32 x i32> @insert_subvector_add_m4(<32 x i32> %v1, <16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: insert_subvector_add_m4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
-; CHECK-NEXT:    vadd.vv v16, v16, v20
-; CHECK-NEXT:    vsetivli zero, 16, e32, m8, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v16
+; CHECK-NEXT:    vsetivli zero, 16, e32, m4, tu, ma
+; CHECK-NEXT:    vadd.vv v8, v16, v20
 ; CHECK-NEXT:    ret
   %v2 = add <16 x i32> %a, %b
   %v3 = shufflevector <16 x i32> %v2, <16 x i32> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert-subvector.ll
@@ -12,10 +12,8 @@
 define <vscale x 8 x i32> @insert_nxv8i32_v2i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
 ; CHECK-LABEL: insert_nxv8i32_v2i32_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m4, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, tu, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v2i32.nxv8i32(<vscale x 8 x i32> %vec, <2 x i32> %sv, i64 0)
@@ -51,22 +49,19 @@
 define <vscale x 8 x i32> @insert_nxv8i32_v8i32_0(<vscale x 8 x i32> %vec, ptr %svp) {
 ; LMULMAX2-LABEL: insert_nxv8i32_v8i32_0:
 ; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-NEXT:    vle32.v v12, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; LMULMAX2-NEXT:    vmv.v.v v8, v12
+; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, tu, ma
+; LMULMAX2-NEXT:    vle32.v v8, (a0)
 ; LMULMAX2-NEXT:    ret
 ;
 ; LMULMAX1-LABEL: insert_nxv8i32_v8i32_0:
 ; LMULMAX1:       # %bb.0:
+; LMULMAX1-NEXT:    addi a1, a0, 16
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT:    vle32.v v12, (a0)
-; LMULMAX1-NEXT:    addi a0, a0, 16
-; LMULMAX1-NEXT:    vle32.v v16, (a0)
-; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m4, tu, ma
-; LMULMAX1-NEXT:    vmv.v.v v8, v12
+; LMULMAX1-NEXT:    vle32.v v12, (a1)
+; LMULMAX1-NEXT:    vsetvli zero, zero, e32, m1, tu, ma
+; LMULMAX1-NEXT:    vle32.v v8, (a0)
 ; LMULMAX1-NEXT:    vsetivli zero, 8, e32, m4, tu, ma
-; LMULMAX1-NEXT:    vslideup.vi v8, v16, 4
+; LMULMAX1-NEXT:    vslideup.vi v8, v12, 4
 ; LMULMAX1-NEXT:    ret
   %sv = load <8 x i32>, ptr %svp
   %v = call <vscale x 8 x i32> @llvm.vector.insert.v8i32.nxv8i32(<vscale x 8 x i32> %vec, <8 x i32> %sv, i64 0)
@@ -112,14 +107,12 @@
 define void @insert_v4i32_v2i32_0(ptr %vp, ptr %svp) {
 ; CHECK-LABEL: insert_v4i32_v2i32_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; CHECK-NEXT:    vmv.v.v v9, v8
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, tu, ma
+; CHECK-NEXT:    vle32.v v8, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vse32.v v9, (a0)
+; CHECK-NEXT:    vse32.v v8, (a0)
 ; CHECK-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <4 x i32>, ptr %vp
@@ -162,26 +155,22 @@
 define void @insert_v8i32_v2i32_0(ptr %vp, ptr %svp) {
 ; LMULMAX2-LABEL: insert_v8i32_v2i32_0:
 ; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX2-NEXT:    vle32.v v8, (a1)
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-NEXT:    vle32.v v10, (a0)
-; LMULMAX2-NEXT:    vsetivli zero, 2, e32, m2, tu, ma
-; LMULMAX2-NEXT:    vmv.v.v v10, v8
+; LMULMAX2-NEXT:    vle32.v v8, (a0)
+; LMULMAX2-NEXT:    vsetivli zero, 2, e32, mf2, tu, ma
+; LMULMAX2-NEXT:    vle32.v v8, (a1)
 ; LMULMAX2-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
-; LMULMAX2-NEXT:    vse32.v v10, (a0)
+; LMULMAX2-NEXT:    vse32.v v8, (a0)
 ; LMULMAX2-NEXT:    ret
 ;
 ; LMULMAX1-LABEL: insert_v8i32_v2i32_0:
 ; LMULMAX1:       # %bb.0:
-; LMULMAX1-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; LMULMAX1-NEXT:    vle32.v v8, (a1)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT:    vle32.v v9, (a0)
-; LMULMAX1-NEXT:    vsetivli zero, 2, e32, m1, tu, ma
-; LMULMAX1-NEXT:    vmv.v.v v9, v8
+; LMULMAX1-NEXT:    vle32.v v8, (a0)
+; LMULMAX1-NEXT:    vsetivli zero, 2, e32, mf2, tu, ma
+; LMULMAX1-NEXT:    vle32.v v8, (a1)
 ; LMULMAX1-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; LMULMAX1-NEXT:    vse32.v v9, (a0)
+; LMULMAX1-NEXT:    vse32.v v8, (a0)
 ; LMULMAX1-NEXT:    ret
   %sv = load <2 x i32>, ptr %svp
   %vec = load <8 x i32>, ptr %vp
@@ -277,10 +266,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vle16.v v9, (a1)
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, tu, ma
+; CHECK-NEXT:    vle16.v v8, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vse16.v v8, (a0)
 ; CHECK-NEXT:    ret
@@ -430,10 +417,8 @@
 define <vscale x 2 x i16> @insert_nxv2i16_v2i16_0(<vscale x 2 x i16> %v, ptr %svp) {
 ; CHECK-LABEL: insert_nxv2i16_v2i16_0:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vle16.v v9, (a0)
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
-; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, tu, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
 ; CHECK-NEXT:    ret
   %sv = load <2 x i16>, ptr %svp
   %c = call <vscale x 2 x i16> @llvm.vector.insert.v2i16.nxv2i16(<vscale x 2 x i16> %v, <2 x i16> %sv, i64 0)
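
For reviewers who want to exercise the vmv.v.v path of this peephole in isolation, here is a reduced standalone test. It is a sketch: the function name and RUN line are illustrative (not taken from the patch), and the CHECK lines simply mirror the updated output of insert_subvector_load_v4i32_v2i32 above. The shuffle pair widens the loaded <2 x i32> with poison and then takes the high lanes from %v1, i.e. an insert at index 0, so after this patch the masked-off vmv.v.v disappears and the load itself becomes a single tail-undisturbed vle32.v into v8:

; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s

define <4 x i32> @insert_lo_v4i32(<4 x i32> %v1, ptr %p) {
; CHECK-LABEL: insert_lo_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, tu, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    ret
  ; Load two elements, widen with poison, then merge the low half into %v1.
  %v2 = load <2 x i32>, ptr %p
  %widened = shufflevector <2 x i32> %v2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  %res = shufflevector <4 x i32> %widened, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i32> %res
}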
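The masked variant goes through the vmerge.vvm path instead of vmv.v.v: the inner operation is selected as a masked pseudo and the merge folds into its mask and policy operands (tu, mu). Again a sketch with illustrative names, mirroring insert_subvector_vp_load_v4i32_v2i32 above:

declare <2 x i32> @llvm.vp.load.v2i32.p0(ptr, <2 x i1>, i32)

define <4 x i32> @insert_lo_v4i32_masked(<4 x i32> %v1, ptr %p, <2 x i1> %mask) {
; CHECK-LABEL: insert_lo_v4i32_masked:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, tu, mu
; CHECK-NEXT:    vle32.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  ; The vp.load's masked-off lanes are poison, so the whole merge reduces to a
  ; masked, tail-undisturbed load directly into the destination register.
  %v2 = call <2 x i32> @llvm.vp.load.v2i32.p0(ptr %p, <2 x i1> %mask, i32 2)
  %widened = shufflevector <2 x i32> %v2, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  %res = shufflevector <4 x i32> %widened, <4 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
  ret <4 x i32> %res
}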