diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1038,7 +1038,8 @@
   if (Subtarget.hasVInstructions())
     setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
                          ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL,
-                         ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR});
+                         ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR,
+                         ISD::VECTOR_DEINTERLEAVE});
 
   if (Subtarget.useRVVForFixedLengthVectors())
     setTargetDAGCombine(ISD::BITCAST);
@@ -10233,6 +10234,48 @@
   return tryFoldSelectIntoOp(N, DAG, FalseVal, TrueVal, /*Swapped*/true);
 }
 
+static LoadSDNode *getLoadFromVectorDeinterleave(SDNode *N) {
+  SDValue L = N->getOperand(0), R = N->getOperand(1);
+  if (L->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      R->getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return nullptr;
+  if (L->getOperand(0) != R->getOperand(0))
+    return nullptr;
+  auto *Load = dyn_cast<LoadSDNode>(L->getOperand(0));
+  if (!Load)
+    return nullptr;
+  if (L->getConstantOperandVal(1) != 0)
+    return nullptr;
+  if (R->getConstantOperandVal(1) !=
+      Load->getValueType(0).getVectorMinNumElements() / 2)
+    return nullptr;
+  return Load;
+}
+
+static SDNode *getVectorInterleaveFromStore(StoreSDNode *N) {
+  SDValue Val = N->getValue();
+
+  if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val)) {
+    // Shuffles are used on fixed nodes
+    auto Mask = createInterleaveMask(
+        Val.getValueType().getVectorMinNumElements() / 2, 2);
+    if (Mask == Shuffle->getMask() &&
+        Shuffle->getOperand(0).getOpcode() == ISD::CONCAT_VECTORS)
+      return Shuffle->getOperand(0).getNode();
+    return nullptr;
+  }
+
+  if (Val.getOpcode() != ISD::CONCAT_VECTORS)
+    return nullptr;
+  if (Val.getOperand(0).getNode() != Val.getOperand(1).getNode())
+    return nullptr;
+  if (Val.getOperand(0).getNode()->getOpcode() != ISD::VECTOR_INTERLEAVE)
+    return nullptr;
+  if (Val.getOperand(0).getResNo() != 0 || Val.getOperand(1).getResNo() != 1)
+    return nullptr;
+  return Val.getOperand(0).getNode();
+}
+
 SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -10675,6 +10718,7 @@
   case ISD::STORE: {
     auto *Store = cast<StoreSDNode>(N);
     SDValue Val = Store->getValue();
+    SDLoc DL(N);
     // Combine store of vmv.x.s/vfmv.f.s to vse with VL of 1.
     // vfmv.f.s is represented as extract element from 0. Match it late to avoid
     // any illegal types.
@@ -10686,9 +10730,7 @@
       MVT VecVT = Src.getSimpleValueType();
       EVT MemVT = Store->getMemoryVT();
       // VecVT should be scalable and memory VT should match the element type.
-      if (VecVT.isScalableVector() &&
-          MemVT == VecVT.getVectorElementType()) {
-        SDLoc DL(N);
+      if (VecVT.isScalableVector() && MemVT == VecVT.getVectorElementType()) {
         MVT MaskVT = getMaskTypeFor(VecVT);
         return DAG.getStoreVP(
             Store->getChain(), DL, Src, Store->getBasePtr(), Store->getOffset(),
@@ -10698,9 +10740,54 @@
             Store->isTruncatingStore(), /*IsCompress*/ false);
       }
     }
-
+    // Combine store of vector_interleave into vsseg2
+    if (auto *Interleave = getVectorInterleaveFromStore(Store)) {
+      MVT VecVT = Val.getSimpleValueType();
+      // Don't try to combine loads of elements that don't have a valid SEW
+      if (!RISCVVType::isValidSEW(VecVT.getScalarSizeInBits()))
+        break;
+      MVT ContainerVT = VecVT;
+      SDValue Op0 = Interleave->getOperand(0), Op1 = Interleave->getOperand(1);
+      if (VecVT.isFixedLengthVector()) {
+        ContainerVT = getContainerForFixedLengthVector(VecVT);
+        Op0 = convertToScalableVector(ContainerVT, Op0, DAG, Subtarget);
+        Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget);
+      }
+      auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+      SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vsseg2, DL,
+                                            Subtarget.getXLenVT());
+      SDValue Ops[] = {Store->getChain(), IntID, Op0, Op1,
+                       Store->getBasePtr(), VL};
+      SDValue VSSeg2 = DAG.getMemIntrinsicNode(
+          ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops,
+          Store->getMemoryVT(), Store->getMemOperand());
+      DAG.makeEquivalentMemoryOrdering(SDValue(Store, 0), VSSeg2);
+      return VSSeg2;
+    }
     break;
   }
+  case ISD::VECTOR_DEINTERLEAVE:
+    if (auto *Load = getLoadFromVectorDeinterleave(N)) {
+      SDLoc DL(Load);
+      MVT VecVT = Load->getSimpleValueType(0);
+      MVT SubVecVT = N->getSimpleValueType(0);
+      // Don't try to combine loads of elements that don't have a valid SEW
+      if (!RISCVVType::isValidSEW(VecVT.getScalarSizeInBits()))
+        break;
+      auto [Mask, VL] = getDefaultVLOps(VecVT, VecVT, DL, DAG, Subtarget);
+      SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vlseg2, DL,
+                                            Subtarget.getXLenVT());
+      SDValue Passthru = DAG.getUNDEF(VecVT);
+      SDValue Ops[] = {Load->getChain(), IntID, Passthru, Passthru,
+                       Load->getBasePtr(), VL};
+      SDValue VLSeg2 = DAG.getMemIntrinsicNode(
+          ISD::INTRINSIC_W_CHAIN, DL,
+          DAG.getVTList({SubVecVT, SubVecVT, MVT::Other}), Ops, VecVT,
+          Load->getMemOperand());
+      DAG.makeEquivalentMemoryOrdering(SDValue(Load, 1), VLSeg2.getValue(2));
+      return DAG.getMergeValues({VLSeg2.getValue(0), VLSeg2.getValue(1)}, DL);
+    }
+    break;
  case ISD::SPLAT_VECTOR: {
    EVT VT = N->getValueType(0);
    // Only perform this combine on legal MVT types.
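The two combines above target the IR patterns exercised by the tests that follow. As a minimal sketch (it mirrors the nxv8i32 cases in the added tests; the function names here are illustrative only), IR of this shape is expected to select to a single segment access once the combine fires:

define {<vscale x 4 x i32>, <vscale x 4 x i32>} @deinterleaving_load(ptr %p) {
  ; load + deinterleave2 -> expected to become one vlseg2e32.v
  %wide = load <vscale x 8 x i32>, ptr %p
  %dei = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %dei
}

define void @interleaving_store(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
  ; interleave2 + store -> expected to become one vsseg2e32.v
  %wide = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  store <vscale x 8 x i32> %wide, ptr %p
  ret void
}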
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-combine-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-combine-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-combine-load.ll
@@ -0,0 +1,163 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+
+; Integers
+
+define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i1_nxv32i1(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv16i1_nxv32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT: vlm.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmerge.vim v14, v10, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT: vid.v v8
+; CHECK-NEXT: vadd.vv v8, v8, v8
+; CHECK-NEXT: vrgather.vv v16, v12, v8
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vand.vi v16, v16, 1
+; CHECK-NEXT: vmsne.vi v0, v16, 0
+; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
+; CHECK-NEXT: vadd.vi v8, v8, 1
+; CHECK-NEXT: vrgather.vv v16, v12, v8
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vand.vi v10, v16, 1
+; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: ret
+%vec = load <vscale x 32 x i1>, ptr %p
+%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_load_nxv16i8_nxv32i8(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv16i8_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 32 x i8>, ptr %p
+%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_load_nxv8i16_nxv16i16(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv8i16_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 16 x i16>, ptr %p
+%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_load_nxv4i32_nxvv8i32(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv4i32_nxvv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 8 x i32>, ptr %p
+%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_load_nxv2i64_nxv4i64(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv2i64_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 4 x i64>, ptr %p
+%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+
+; Floats
+
+define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_load_nxv2f16_nxv4f16(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv2f16_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 4 x half>, ptr %p
+%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_load_nxv4f16_nxv8f16(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv4f16_nxv8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 8 x half>, ptr %p
+%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_load_nxv2f32_nxv4f32(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv2f32_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 4 x float>, ptr %p
+%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_load_nxv8f16_nxv16f16(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv8f16_nxv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 16 x half>, ptr %p
+%retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_load_nxv4f32_nxv8f32(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv4f32_nxv8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 8 x float>, ptr %p
+%retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_load_nxv2f64_nxv4f64(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv2f64_nxv4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 4 x double>, ptr %p
+%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
+}
+
+declare {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed-combine-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed-combine-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed-combine-load.ll
@@ -0,0 +1,182 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+
+; Integers
+
+define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
+%vec = load <32 x i1>, ptr %p
+%retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec)
+ret {<16 x i1>, <16 x i1>} %retval
+}
+
+define {<16 x i8>, <16 x i8>} @vector_deinterleave_load_v16i8_v32i8(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_v16i8_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 32
+;
CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v9, v10, 8 +; CHECK-NEXT: ret +%vec = load <32 x i8>, ptr %p +%retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec) +ret {<16 x i8>, <16 x i8>} %retval +} + +define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v8i16_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v9, v10, 16 +; CHECK-NEXT: ret +%vec = load <16 x i16>, ptr %p +%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec) +ret {<8 x i16>, <8 x i16>} %retval +} + +define {<4 x i32>, <4 x i32>} @vector_deinterleave_load_v4i32_vv8i32(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v4i32_vv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v10, a0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: ret +%vec = load <8 x i32>, ptr %p +%retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec) +ret {<4 x i32>, <4 x i32>} %retval +} + +define {<2 x i64>, <2 x i64>} @vector_deinterleave_load_v2i64_v4i64(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v2i64_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v10, 2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v8, v10, 0 +; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v9, v10, 1 +; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t +; CHECK-NEXT: ret +%vec = load <4 x i64>, ptr %p +%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec) +ret {<2 x i64>, <2 x i64>} %retval +} + +declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>) +declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>) +declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>) +declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>) +declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>) + +; Floats + +define {<2 x half>, <2 x half>} @vector_deinterleave_load_v2f16_v4f16(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v2f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v9, 0 +; CHECK-NEXT: vnsrl.wi v9, v9, 16 +; CHECK-NEXT: ret +%vec = load <4 x half>, ptr %p +%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec) +ret {<2 x half>, <2 x half>} %retval +} + +define {<4 x half>, <4 x half>} @vector_deinterleave_load_v4f16_v8f16(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v4f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; 
CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v9, 0 +; CHECK-NEXT: vnsrl.wi v9, v9, 16 +; CHECK-NEXT: ret +%vec = load <8 x half>, ptr %p +%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec) +ret {<4 x half>, <4 x half>} %retval +} + +define {<2 x float>, <2 x float>} @vector_deinterleave_load_v2f32_v4f32(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v2f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v8, a0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: ret +%vec = load <4 x float>, ptr %p +%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec) +ret {<2 x float>, <2 x float>} %retval +} + +define {<8 x half>, <8 x half>} @vector_deinterleave_load_v8f16_v16f16(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v8f16_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v9, v10, 16 +; CHECK-NEXT: ret +%vec = load <16 x half>, ptr %p +%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec) +ret {<8 x half>, <8 x half>} %retval +} + +define {<4 x float>, <4 x float>} @vector_deinterleave_load_v4f32_v8f32(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v4f32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v10, a0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: ret +%vec = load <8 x float>, ptr %p +%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec) +ret {<4 x float>, <4 x float>} %retval +} + +define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v2f64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v10, 2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v8, v10, 0 +; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v9, v10, 1 +; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t +; CHECK-NEXT: ret +%vec = load <4 x double>, ptr %p +%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec) +ret {<2 x double>, <2 x double>} %retval +} + +declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) +declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>) +declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>) +declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>) +declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>) +declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-combine-store.ll 
b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-combine-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-combine-store.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+
+; Integers
+
+define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv32i1_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vslidedown.vx v14, v12, a1
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vand.vi v16, v16, 1
+; CHECK-NEXT: vmsne.vi v9, v16, 0
+; CHECK-NEXT: viota.m v16, v9
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrgather.vv v18, v14, v16, v0.t
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v10, v10, 1, v0
+; CHECK-NEXT: vslidedown.vx v14, v10, a1
+; CHECK-NEXT: vmnot.m v8, v9
+; CHECK-NEXT: viota.m v20, v8
+; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vrgather.vv v18, v14, v20, v0.t
+; CHECK-NEXT: vand.vi v14, v18, 1
+; CHECK-NEXT: vmsne.vi v18, v14, 0
+; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vrgather.vv v14, v12, v16, v0.t
+; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vrgather.vv v14, v10, v20, v0.t
+; CHECK-NEXT: vand.vi v8, v14, 1
+; CHECK-NEXT: vmsne.vi v10, v8, 0
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vsetvli zero, a2, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v10, v18, a1
+; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT: vsm.v v10, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
+  store <vscale x 32 x i1> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv16i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv16i16_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  store <vscale x 16 x i16> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv8i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv8i32_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  store <vscale x 8 x i32> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv4i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv4i64_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e64.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+  store <vscale x 4 x i64> %res, ptr %p
+  ret void
+}
+
+declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+
+; Floats
+
+define void @vector_interleave_store_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv4f16_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+  store <vscale x 4 x half> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv8f16_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+  store <vscale x 8 x half> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv4f32_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+  store <vscale x 4 x float> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv16f16_nxv8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  store <vscale x 16 x half> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv8f32_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  store <vscale x 8 x float> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv4f64_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv4f64_nxv2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e64.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  store <vscale x 4 x double> %res, ptr %p
+  ret void
+}
+
+
+declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed-combine-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed-combine-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed-combine-store.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+
+; Integers
+
+define void @vector_interleave_store_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b, ptr %p) {
+  %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
+  store <32 x i1> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_v16i16_v8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v9
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v8
+; CHECK-NEXT: vsseg2e16.v v10, (a0)
+; CHECK-NEXT: ret
+  %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b)
+  store <16 x i16> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_v8i32_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v9
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v8
+; CHECK-NEXT: vsseg2e32.v v10, (a0)
+; CHECK-NEXT: ret
+  %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b)
+  store <8 x i32> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_v4i64_v2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v9
+; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v8
+; CHECK-NEXT: vsseg2e64.v v10, (a0)
+; CHECK-NEXT: ret
+  %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b)
+  store <4 x i64> %res, ptr %p
+  ret void
+}
+
+declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
+declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
+declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
+declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+
+; Floats
+
+define void @vector_interleave_store_v4f16_v2f16(<2 x half> %a, <2 x half> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_v4f16_v2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b)
+  store <4 x half> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_v8f16_v4f16(<4 x half> %a, <4 x half> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_v8f16_v4f16:
+; CHECK: # %bb.0:
+;
CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9 +; CHECK-NEXT: vsseg2e16.v v8, (a0) +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b) + store <8 x half> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v4f32_v2f32(<2 x float> %a, <2 x float> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v4f32_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9 +; CHECK-NEXT: vsseg2e32.v v8, (a0) +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b) + store <4 x float> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v16f16_v8f16(<8 x half> %a, <8 x half> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v16f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsseg2e16.v v10, (a0) +; CHECK-NEXT: ret + %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b) + store <16 x half> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v8f32_v4f32(<4 x float> %a, <4 x float> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v8f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsseg2e32.v v10, (a0) +; CHECK-NEXT: ret + %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b) + store <8 x float> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v4f64_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsseg2e64.v v10, (a0) +; CHECK-NEXT: ret + %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b) + store <4 x double> %res, ptr %p + ret void +} + + +declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>) +declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>) +declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>) +declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>) +declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)
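The CHECK lines in these four tests were produced with utils/update_llc_test_checks.py (see the NOTE headers), so if the combine's output changes they can be regenerated rather than edited by hand. The invocation below is a sketch that assumes an LLVM source checkout and a locally built llc at build/bin/llc:

llvm/utils/update_llc_test_checks.py --llc-binary build/bin/llc \
  llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-combine-load.ll \
  llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed-combine-load.ll \
  llvm/test/CodeGen/RISCV/rvv/vector-interleave-combine-store.ll \
  llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed-combine-store.ll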