diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -3275,6 +3275,48 @@
     }
   }
 
+  // For very small build_vectors, use a single scalar insert of a constant.
+  // TODO: Base this on constant rematerialization cost, not size.
+  const unsigned EltBitSize = VT.getScalarSizeInBits();
+  if (VT.getSizeInBits() <= 32 &&
+      ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+    MVT ViaIntVT = MVT::getIntegerVT(VT.getSizeInBits());
+    assert((ViaIntVT == MVT::i16 || ViaIntVT == MVT::i32) &&
+           "Unexpected sequence type");
+    // If we can use the original VL with the modified element type, this
+    // means we only have a VTYPE toggle, not a VL toggle.  TODO: Should this
+    // be moved into InsertVSETVLI?
+    unsigned ViaVecLen =
+        (Subtarget.getRealMinVLen() >= VT.getSizeInBits() * NumElts) ? NumElts : 1;
+    MVT ViaVecVT = MVT::getVectorVT(ViaIntVT, ViaVecLen);
+
+    uint64_t EltMask = maskTrailingOnes<uint64_t>(EltBitSize);
+    uint64_t SplatValue = 0;
+    // Construct the amalgamated value at this larger vector type.
+    for (const auto &OpIdx : enumerate(Op->op_values())) {
+      const auto &SeqV = OpIdx.value();
+      if (!SeqV.isUndef())
+        SplatValue |= ((cast<ConstantSDNode>(SeqV)->getZExtValue() & EltMask)
+                       << (OpIdx.index() * EltBitSize));
+    }
+
+    // On RV64, sign-extend from 32 to 64 bits where possible in order to
+    // achieve better constant materialization.
+    if (Subtarget.is64Bit() && ViaIntVT == MVT::i32)
+      SplatValue = SignExtend64<32>(SplatValue);
+
+    SDValue Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ViaVecVT,
+                              DAG.getUNDEF(ViaVecVT),
+                              DAG.getConstant(SplatValue, DL, XLenVT),
+                              DAG.getConstant(0, DL, XLenVT));
+    if (ViaVecLen != 1)
+      Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
+                        MVT::getVectorVT(ViaIntVT, 1), Vec,
+                        DAG.getConstant(0, DL, XLenVT));
+    return DAG.getBitcast(VT, Vec);
+  }
+
+
   // Attempt to detect "hidden" splats, which only reveal themselves as splats
   // when re-interpreted as a vector with a larger element type. For example,
   // v4i16 = build_vector i16 0, i16 1, i16 0, i16 1
@@ -3283,7 +3325,6 @@
   // TODO: This optimization could also work on non-constant splats, but it
   // would require bit-manipulation instructions to construct the splat value.
   SmallVector<SDValue> Sequence;
-  unsigned EltBitSize = VT.getScalarSizeInBits();
   const auto *BV = cast<BuildVectorSDNode>(Op);
   if (VT.isInteger() && EltBitSize < 64 &&
       ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll
@@ -233,27 +233,49 @@
 %x6v4i8 = type {<4 x i8>, <4 x i8>, <4 x i8>, <4 x i8>, <4 x i8>, <4 x i8>}
 
 define %x6v4i8 @buildvec_no_vid_v4i8() {
-; CHECK-LABEL: buildvec_no_vid_v4i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI14_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI14_0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    lui a0, %hi(.LCPI14_1)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI14_1)
-; CHECK-NEXT:    vle8.v v9, (a0)
-; CHECK-NEXT:    li a0, 1
-; CHECK-NEXT:    slli a0, a0, 11
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vmv.v.x v10, a0
-; CHECK-NEXT:    li a0, 2047
-; CHECK-NEXT:    vmv.v.x v11, a0
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    lui a0, %hi(.LCPI14_2)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI14_2)
-; CHECK-NEXT:    vle8.v v13, (a0)
-; CHECK-NEXT:    vmv.v.i v12, -2
-; CHECK-NEXT:    ret
+; RV32-LABEL: buildvec_no_vid_v4i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a0, 28768
+; RV32-NEXT:    addi a0, a0, 769
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.s.x v8, a0
+; RV32-NEXT:    lui a0, 28752
+; RV32-NEXT:    addi a0, a0, 512
+; RV32-NEXT:    vmv.s.x v9, a0
+; RV32-NEXT:    lui a0, 32768
+; RV32-NEXT:    vmv.s.x v10, a0
+; RV32-NEXT:    lui a0, 28672
+; RV32-NEXT:    addi a0, a0, 255
+; RV32-NEXT:    vmv.s.x v11, a0
+; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vmv.v.i v12, -2
+; RV32-NEXT:    lui a0, 1032144
+; RV32-NEXT:    addi a0, a0, -257
+; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv.s.x v13, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: buildvec_no_vid_v4i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a0, 28768
+; RV64-NEXT:    addiw a0, a0, 769
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v8, a0
+; RV64-NEXT:    lui a0, 28752
+; RV64-NEXT:    addiw a0, a0, 512
+; RV64-NEXT:    vmv.s.x v9, a0
+; RV64-NEXT:    lui a0, 32768
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    lui a0, 28672
+; RV64-NEXT:    addiw a0, a0, 255
+; RV64-NEXT:    vmv.s.x v11, a0
+; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vmv.v.i v12, -2
+; RV64-NEXT:    lui a0, 1032144
+; RV64-NEXT:    addiw a0, a0, -257
+; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v13, a0
+; RV64-NEXT:    ret
   %1 = insertvalue %x6v4i8 poison, <4 x i8> , 0
   %2 = insertvalue %x6v4i8 %1, <4 x i8> , 1
   %3 = insertvalue %x6v4i8 %2, <4 x i8> , 2
@@ -662,22 +684,29 @@
 
 define <4 x i8> @buildvec_not_vid_v4i8_1() {
 ; CHECK-LABEL: buildvec_not_vid_v4i8_1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI37_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI37_0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    lui a0, 12320
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, a0
 ; CHECK-NEXT:    ret
   ret <4 x i8> 
 }
 
 define <4 x i8> @buildvec_not_vid_v4i8_2() {
-; CHECK-LABEL: buildvec_not_vid_v4i8_2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI38_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI38_0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    ret
+; RV32-LABEL: buildvec_not_vid_v4i8_2:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a0, 16
+; RV32-NEXT:    addi a0, a0, 771
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.s.x v8, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: buildvec_not_vid_v4i8_2:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a0, 16
+; RV64-NEXT:    addiw a0, a0, 771
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v8, a0
+; RV64-NEXT:    ret
   ret <4 x i8> 
 }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -551,25 +551,49 @@
 
 ; This shouldn't be interleaved
 define <4 x i8> @unary_interleave_v4i8_invalid(<4 x i8> %x) {
-; V128-LABEL: unary_interleave_v4i8_invalid:
-; V128:       # %bb.0:
-; V128-NEXT:    lui a0, %hi(.LCPI19_0)
-; V128-NEXT:    addi a0, a0, %lo(.LCPI19_0)
-; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; V128-NEXT:    vle8.v v10, (a0)
-; V128-NEXT:    vrgather.vv v9, v8, v10
-; V128-NEXT:    vmv1r.v v8, v9
-; V128-NEXT:    ret
+; RV32-V128-LABEL: unary_interleave_v4i8_invalid:
+; RV32-V128:       # %bb.0:
+; RV32-V128-NEXT:    lui a0, 16
+; RV32-V128-NEXT:    addi a0, a0, 768
+; RV32-V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-V128-NEXT:    vmv.s.x v10, a0
+; RV32-V128-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32-V128-NEXT:    vrgather.vv v9, v8, v10
+; RV32-V128-NEXT:    vmv1r.v v8, v9
+; RV32-V128-NEXT:    ret
 ;
-; V512-LABEL: unary_interleave_v4i8_invalid:
-; V512:       # %bb.0:
-; V512-NEXT:    lui a0, %hi(.LCPI19_0)
-; V512-NEXT:    addi a0, a0, %lo(.LCPI19_0)
-; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
-; V512-NEXT:    vle8.v v10, (a0)
-; V512-NEXT:    vrgather.vv v9, v8, v10
-; V512-NEXT:    vmv1r.v v8, v9
-; V512-NEXT:    ret
+; RV64-V128-LABEL: unary_interleave_v4i8_invalid:
+; RV64-V128:       # %bb.0:
+; RV64-V128-NEXT:    lui a0, 16
+; RV64-V128-NEXT:    addiw a0, a0, 768
+; RV64-V128-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-V128-NEXT:    vmv.s.x v10, a0
+; RV64-V128-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-V128-NEXT:    vrgather.vv v9, v8, v10
+; RV64-V128-NEXT:    vmv1r.v v8, v9
+; RV64-V128-NEXT:    ret
+;
+; RV32-V512-LABEL: unary_interleave_v4i8_invalid:
+; RV32-V512:       # %bb.0:
+; RV32-V512-NEXT:    lui a0, 16
+; RV32-V512-NEXT:    addi a0, a0, 768
+; RV32-V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; RV32-V512-NEXT:    vmv.s.x v10, a0
+; RV32-V512-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV32-V512-NEXT:    vrgather.vv v9, v8, v10
+; RV32-V512-NEXT:    vmv1r.v v8, v9
+; RV32-V512-NEXT:    ret
+;
+; RV64-V512-LABEL: unary_interleave_v4i8_invalid:
+; RV64-V512:       # %bb.0:
+; RV64-V512-NEXT:    lui a0, 16
+; RV64-V512-NEXT:    addiw a0, a0, 768
+; RV64-V512-NEXT:    vsetivli zero, 4, e32, mf2, ta, ma
+; RV64-V512-NEXT:    vmv.s.x v10, a0
+; RV64-V512-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
+; RV64-V512-NEXT:    vrgather.vv v9, v8, v10
+; RV64-V512-NEXT:    vmv1r.v v8, v9
+; RV64-V512-NEXT:    ret
   %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> 
   ret <4 x i8> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-vslide1up.ll
@@ -373,10 +373,10 @@
 define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert(<4 x i8> %v, i8 %b) {
 ; CHECK-LABEL: vslide1up_4xi8_neg_incorrect_insert:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI23_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI23_0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
+; CHECK-NEXT:    lui a0, 8208
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.s.x v10, a0
+; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
 ; CHECK-NEXT:    vrgather.vv v9, v8, v10
 ; CHECK-NEXT:    vmv1r.v v8, v9
 ; CHECK-NEXT:    ret
@@ -397,15 +397,27 @@
 }
 
 define <4 x i8> @vslide1up_4xi8_neg_incorrect_insert3(<4 x i8> %v, i8 %b) {
-; CHECK-LABEL: vslide1up_4xi8_neg_incorrect_insert3:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    lui a0, %hi(.LCPI25_0)
-; CHECK-NEXT:    addi a0, a0, %lo(.LCPI25_0)
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v10, (a0)
-; CHECK-NEXT:    vrgather.vv v9, v8, v10
-; CHECK-NEXT:    vmv1r.v v8, v9
-; CHECK-NEXT:    ret
+; RV32-LABEL: vslide1up_4xi8_neg_incorrect_insert3:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a0, 8208
+; RV32-NEXT:    addi a0, a0, 1
+; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV32-NEXT:    vmv.s.x v10, a0
+; RV32-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV32-NEXT:    vrgather.vv v9, v8, v10
+; RV32-NEXT:    vmv1r.v v8, v9
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vslide1up_4xi8_neg_incorrect_insert3:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a0, 8208
+; RV64-NEXT:    addiw a0, a0, 1
+; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; RV64-NEXT:    vmv.s.x v10, a0
+; RV64-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
+; RV64-NEXT:    vrgather.vv v9, v8, v10
+; RV64-NEXT:    vmv1r.v v8, v9
+; RV64-NEXT:    ret
   %v2 = shufflevector <4 x i8> poison, <4 x i8> %v, <4 x i32> 
   ret <4 x i8> %v2
 }
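As a rough illustration of what the new lowering computes (a standalone sketch under assumptions, not part of the patch): the core of the change is the amalgamation loop, which masks each build_vector constant to its element width and shifts it into its lane of a single XLEN scalar that one vmv.s.x can insert. The element values {0, 0, 2, 3}, the reused names EltBitSize/EltMask/SplatValue, and the main() driver below are illustrative assumptions, chosen so that the packed result matches the "lui a0, 12320" materialization checked in buildvec_not_vid_v4i8_1 above.

// Standalone sketch (illustrative, not part of the patch) of the
// constant-amalgamation step in the new lowering.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t Elts[4] = {0, 0, 2, 3}; // assumed v4i8 constants
  const unsigned EltBitSize = 8;         // element width in bits
  const uint64_t EltMask = (uint64_t(1) << EltBitSize) - 1;

  // Shift each masked element into its lane of the wider integer.
  uint64_t SplatValue = 0;
  for (unsigned I = 0; I < 4; ++I)
    SplatValue |= (Elts[I] & EltMask) << (I * EltBitSize);

  // The patch additionally sign-extends 32-bit values on RV64 so the scalar
  // is cheaper to materialize; that is a no-op here since bit 31 is clear.
  std::printf("0x%08llx\n", (unsigned long long)SplatValue); // prints 0x03020000
  return 0;
}

The test diffs show the payoff of this scheme: each small constant vector now costs at most a lui/addi(w) pair plus a single vmv.s.x (often just a lui), instead of a constant-pool address computation (lui/addi of an .LCPI symbol) followed by a vle8.v load.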