diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -14,6 +14,7 @@
 #include "RISCVISelLowering.h"
 #include "MCTargetDesc/RISCVMatInt.h"
 #include "RISCV.h"
+#include "RISCVISelDAGToDAG.h"
 #include "RISCVMachineFunctionInfo.h"
 #include "RISCVRegisterInfo.h"
 #include "RISCVSubtarget.h"
@@ -21,6 +22,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -1073,7 +1075,8 @@
   if (Subtarget.hasVInstructions())
     setTargetDAGCombine({ISD::FCOPYSIGN, ISD::MGATHER, ISD::MSCATTER,
                          ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL,
-                         ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR});
+                         ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR,
+                         ISD::VECTOR_DEINTERLEAVE});

   if (Subtarget.useRVVForFixedLengthVectors())
     setTargetDAGCombine(ISD::BITCAST);
@@ -10375,6 +10378,52 @@
   return tryFoldSelectIntoOp(N, DAG, FalseVal, TrueVal, /*Swapped*/true);
 }

+static LoadSDNode *getLoadFromVectorDeinterleave(SDNode *N) {
+  SDValue L = N->getOperand(0), R = N->getOperand(1);
+  if (L->getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+      R->getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return nullptr;
+  if (L->getOperand(0) != R->getOperand(0))
+    return nullptr;
+  auto *Load = dyn_cast<LoadSDNode>(L->getOperand(0));
+  if (!Load || !Load->isSimple())
+    return nullptr;
+  if (Load->getOpcode() != ISD::LOAD)
+    return nullptr;
+  if (L->getConstantOperandVal(1) != 0)
+    return nullptr;
+  if (R->getConstantOperandVal(1) !=
+      Load->getValueType(0).getVectorMinNumElements() / 2)
+    return nullptr;
+  return Load;
+}
+
+// Returns either a vector_interleave if operating on scalable vectors, or a
+// concat_vectors used in a vector_shuffle if operating on fixed vectors.
+static SDNode *getVectorInterleaveFromStore(StoreSDNode *N) {
+  SDValue Val = N->getValue();
+
+  if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Val)) {
+    // Shuffles are used on fixed nodes
+    auto Mask = createInterleaveMask(
+        Val.getValueType().getVectorMinNumElements() / 2, 2);
+    if (Mask == Shuffle->getMask() &&
+        Shuffle->getOperand(0).getOpcode() == ISD::CONCAT_VECTORS)
+      return Shuffle->getOperand(0).getNode();
+    return nullptr;
+  }
+
+  if (Val.getOpcode() != ISD::CONCAT_VECTORS)
+    return nullptr;
+  if (Val.getOperand(0).getNode() != Val.getOperand(1).getNode())
+    return nullptr;
+  if (Val.getOperand(0).getNode()->getOpcode() != ISD::VECTOR_INTERLEAVE)
+    return nullptr;
+  if (Val.getOperand(0).getResNo() != 0 || Val.getOperand(1).getResNo() != 1)
+    return nullptr;
+  return Val.getOperand(0).getNode();
+}
+
 SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -10817,6 +10866,7 @@
   case ISD::STORE: {
     auto *Store = cast<StoreSDNode>(N);
     SDValue Val = Store->getValue();
+    SDLoc DL(N);
     // Combine store of vmv.x.s/vfmv.f.s to vse with VL of 1.
     // vfmv.f.s is represented as extract element from 0. Match it late to avoid
     // any illegal types.
@@ -10828,9 +10878,7 @@
       MVT VecVT = Src.getSimpleValueType();
       EVT MemVT = Store->getMemoryVT();
       // VecVT should be scalable and memory VT should match the element type.
-      if (VecVT.isScalableVector() &&
-          MemVT == VecVT.getVectorElementType()) {
-        SDLoc DL(N);
+      if (VecVT.isScalableVector() && MemVT == VecVT.getVectorElementType()) {
         MVT MaskVT = getMaskTypeFor(VecVT);
         return DAG.getStoreVP(
             Store->getChain(), DL, Src, Store->getBasePtr(), Store->getOffset(),
@@ -10840,9 +10888,78 @@
             Store->isTruncatingStore(), /*IsCompress*/ false);
       }
     }
+    // Combine interleaved store of vector_interleave or vector_shuffle into
+    // vsseg2
+    if (auto *Interleave = getVectorInterleaveFromStore(Store)) {
+      MVT VecVT = Val.getSimpleValueType();
+      if (!isTypeLegal(VecVT))
+        break;
+      if (!Store->isSimple() || !ISD::isNormalStore(Store))
+        break;
+      MVT ContainerVT = VecVT;
+      SDValue Op0 = Interleave->getOperand(0), Op1 = Interleave->getOperand(1);
+      if (VecVT.isFixedLengthVector()) {
+        ContainerVT = getContainerForFixedLengthVector(VecVT);
+        Op0 = convertToScalableVector(ContainerVT, Op0, DAG, Subtarget);
+        Op1 = convertToScalableVector(ContainerVT, Op1, DAG, Subtarget);
+      }
+
+      // Check that a VSSEG pseudo exists for VecVT's LMUL and SEW
+      unsigned Log2SEW = Log2_32(ContainerVT.getScalarSizeInBits());
+      RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(ContainerVT);
+      const RISCV::VSSEGPseudo *P =
+          RISCV::getVSSEGPseudo(2, /*IsMasked*/ false, /*IsStrided*/ false,
+                                Log2SEW, static_cast<unsigned>(LMUL));
+      if (!P)
+        break;
+
+      auto [Mask, VL] = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget);
+      SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vsseg2, DL,
+                                            Subtarget.getXLenVT());
+      SDValue Ops[] = {Store->getChain(), IntID, Op0, Op1,
+                       Store->getBasePtr(), VL};
+      SDValue VSSeg2 = DAG.getMemIntrinsicNode(
+          ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops,
+          Store->getMemoryVT(), Store->getMemOperand());
+      DAG.makeEquivalentMemoryOrdering(SDValue(Store, 0), VSSeg2);
+      return VSSeg2;
+    }
     break;
   }
+  case ISD::VECTOR_DEINTERLEAVE:
+    if (auto *Load = getLoadFromVectorDeinterleave(N)) {
+      SDLoc DL(Load);
+      EVT VecVT = Load->getValueType(0);
+      if (!isTypeLegal(VecVT))
+        break;
+
+      assert(VecVT.isScalableVector());
+
+      // Check that a VLSEG pseudo exists for VecVT's LMUL and SEW
+      unsigned Log2SEW = Log2_32(VecVT.getScalarSizeInBits());
+      RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VecVT.getSimpleVT());
+      const RISCV::VLSEGPseudo *P = RISCV::getVLSEGPseudo(
+          2, /*IsMasked*/ false, /*IsTU*/ false, /*IsStrided*/ false,
+          /*FF*/ false, Log2SEW, static_cast<unsigned>(LMUL));
+      if (!P)
+        break;
+
+      SDValue VL = DAG.getRegister(RISCV::X0, Subtarget.getXLenVT());
+      SDValue IntID = DAG.getTargetConstant(Intrinsic::riscv_vlseg2, DL,
+                                            Subtarget.getXLenVT());
+      SDValue Passthru = DAG.getUNDEF(VecVT);
+      SDValue Ops[] = {Load->getChain(), IntID, Passthru, Passthru,
+                       Load->getBasePtr(), VL};
+      EVT SubVecVT = N->getValueType(0);
+      SDValue VLSeg2 = DAG.getMemIntrinsicNode(
+          ISD::INTRINSIC_W_CHAIN, DL,
+          DAG.getVTList({SubVecVT, SubVecVT, MVT::Other}), Ops, VecVT,
+          Load->getMemOperand());
+      DAG.makeEquivalentMemoryOrdering(SDValue(Load, 1), VLSeg2.getValue(2));
+      return DAG.getMergeValues({VLSeg2.getValue(0), VLSeg2.getValue(1)}, DL);
+    }
+    break;
   case ISD::SPLAT_VECTOR: {
     EVT VT = N->getValueType(0);
     // Only perform this combine on legal MVT types.
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-combine-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-combine-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-combine-load.ll
@@ -0,0 +1,156 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+
+; Integers
+
+define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_load_nxv16i1_nxv32i1(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv16i1_nxv32i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT: vlm.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmerge.vim v14, v10, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT: vnsrl.wi v8, v12, 0
+; CHECK-NEXT: vand.vi v8, v8, 1
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: vnsrl.wi v8, v12, 8
+; CHECK-NEXT: vand.vi v10, v8, 1
+; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: ret
+%vec = load <vscale x 32 x i1>, ptr %p
+%retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1> %vec)
+ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_load_nxv16i8_nxv32i8(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv16i8_nxv32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vlseg2e8.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 32 x i8>, ptr %p
+%retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %vec)
+ret {<vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_load_nxv8i16_nxv16i16(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv8i16_nxv16i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 16 x i16>, ptr %p
+%retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %vec)
+ret {<vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_load_nxv4i32_nxvv8i32(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv4i32_nxvv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 8 x i32>, ptr %p
+%retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_load_nxv2i64_nxv4i64(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv2i64_nxv4i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 4 x i64>, ptr %p
+%retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %vec)
+ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.experimental.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
+declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+
+; Floats
+
+define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_load_nxv2f16_nxv4f16(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv2f16_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 4 x half>, ptr %p
+%retval = call {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %vec)
+ret {<vscale x 2 x half>, <vscale x 2 x half>} %retval
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_load_nxv4f16_nxv8f16(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv4f16_nxv8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 8 x half>, ptr %p
+%retval = call {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %vec)
+ret {<vscale x 4 x half>, <vscale x 4 x half>} %retval
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_load_nxv2f32_nxv4f32(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv2f32_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 4 x float>, ptr %p
+%retval = call {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %vec)
+ret {<vscale x 2 x float>, <vscale x 2 x float>} %retval
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_load_nxv8f16_nxv16f16(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv8f16_nxv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vlseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 16 x half>, ptr %p
+%retval = call {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %vec)
+ret {<vscale x 8 x half>, <vscale x 8 x half>} %retval
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_load_nxv4f32_nxv8f32(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv4f32_nxv8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vlseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 8 x float>, ptr %p
+%retval = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %vec)
+ret {<vscale x 4 x float>, <vscale x 4 x float>} %retval
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_load_nxv2f64_nxv4f64(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv2f64_nxv4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vlseg2e64.v v8, (a0)
+; CHECK-NEXT: ret
+%vec = load <vscale x 4 x double>, ptr %p
+%retval = call {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %vec)
+ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
+}
+
+declare {<vscale x 2 x half>, <vscale x 2 x half>} @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed-combine-load.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed-combine-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed-combine-load.ll
@@ -0,0 +1,182 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+
+; Integers
+
+define {<16 x i1>, <16 x i1>} @vector_deinterleave_load_v16i1_v32i1(ptr %p) {
+%vec = load <32 x i1>, ptr %p
+%retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec)
+ret {<16 x i1>, <16 x i1>} %retval
+}
+
+define {<16 x i8>, <16 x i8>} @vector_deinterleave_load_v16i8_v32i8(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_v16i8_v32i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 32
+; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma
+; CHECK-NEXT: vle8.v v10, (a0)
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v10, 0
+; CHECK-NEXT: vnsrl.wi v9, v10, 8
+; CHECK-NEXT: ret
+%vec = load <32 x i8>, ptr %p
+%retval = call {<16 x i8>, <16
x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec) +ret {<16 x i8>, <16 x i8>} %retval +} + +define {<8 x i16>, <8 x i16>} @vector_deinterleave_load_v8i16_v16i16(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v8i16_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v9, v10, 16 +; CHECK-NEXT: ret +%vec = load <16 x i16>, ptr %p +%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec) +ret {<8 x i16>, <8 x i16>} %retval +} + +define {<4 x i32>, <4 x i32>} @vector_deinterleave_load_v4i32_vv8i32(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v4i32_vv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v10, a0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: ret +%vec = load <8 x i32>, ptr %p +%retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec) +ret {<4 x i32>, <4 x i32>} %retval +} + +define {<2 x i64>, <2 x i64>} @vector_deinterleave_load_v2i64_v4i64(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v2i64_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v10, 2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v8, v10, 0 +; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v9, v10, 1 +; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t +; CHECK-NEXT: ret +%vec = load <4 x i64>, ptr %p +%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec) +ret {<2 x i64>, <2 x i64>} %retval +} + +declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>) +declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>) +declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>) +declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>) +declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>) + +; Floats + +define {<2 x half>, <2 x half>} @vector_deinterleave_load_v2f16_v4f16(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v2f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v9, 0 +; CHECK-NEXT: vnsrl.wi v9, v9, 16 +; CHECK-NEXT: ret +%vec = load <4 x half>, ptr %p +%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec) +ret {<2 x half>, <2 x half>} %retval +} + +define {<4 x half>, <4 x half>} @vector_deinterleave_load_v4f16_v8f16(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v4f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v9, 0 +; CHECK-NEXT: vnsrl.wi v9, v9, 16 +; CHECK-NEXT: ret +%vec = load <8 x half>, ptr %p +%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec) +ret {<4 x 
half>, <4 x half>} %retval +} + +define {<2 x float>, <2 x float>} @vector_deinterleave_load_v2f32_v4f32(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v2f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v8, a0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: ret +%vec = load <4 x float>, ptr %p +%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec) +ret {<2 x float>, <2 x float>} %retval +} + +define {<8 x half>, <8 x half>} @vector_deinterleave_load_v8f16_v16f16(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v8f16_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: vnsrl.wi v9, v10, 16 +; CHECK-NEXT: ret +%vec = load <16 x half>, ptr %p +%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec) +ret {<8 x half>, <8 x half>} %retval +} + +define {<4 x float>, <4 x float>} @vector_deinterleave_load_v4f32_v8f32(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v4f32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v10, a0 +; CHECK-NEXT: vnsrl.wi v8, v10, 0 +; CHECK-NEXT: ret +%vec = load <8 x float>, ptr %p +%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec) +ret {<4 x float>, <4 x float>} %retval +} + +define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p) { +; CHECK-LABEL: vector_deinterleave_load_v2f64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v10, 2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v8, v10, 0 +; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v9, v10, 1 +; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t +; CHECK-NEXT: ret +%vec = load <4 x double>, ptr %p +%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec) +ret {<2 x double>, <2 x double>} %retval +} + +declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) +declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>) +declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>) +declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>) +declare {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>) +declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-combine-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-combine-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-combine-store.ll @@ -0,0 +1,167 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | 
FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+
+; Integers
+
+define void @vector_interleave_store_nxv32i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv32i1_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v12, v10, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v8, v10, 1, v0
+; CHECK-NEXT: vwaddu.vv v16, v8, v12
+; CHECK-NEXT: li a1, -1
+; CHECK-NEXT: vwmaccu.vx v16, a1, v12
+; CHECK-NEXT: vand.vi v8, v18, 1
+; CHECK-NEXT: vmsne.vi v10, v8, 0
+; CHECK-NEXT: vand.vi v8, v16, 1
+; CHECK-NEXT: vmsne.vi v11, v8, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vsetvli zero, a2, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v11, v10, a1
+; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; CHECK-NEXT: vsm.v v11, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
+  store <vscale x 32 x i1> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv16i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv16i16_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  store <vscale x 16 x i16> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv8i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv8i32_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  store <vscale x 8 x i32> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv4i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv4i64_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e64.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b)
+  store <vscale x 4 x i64> %res, ptr %p
+  ret void
+}
+
+declare <vscale x 32 x i1> @llvm.experimental.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
+declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+
+; Floats
+
+define void @vector_interleave_store_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv4f16_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+  store <vscale x 4 x half> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv8f16_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+  store <vscale x 8 x half> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv4f32_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9
+; CHECK-NEXT: vsseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+  store <vscale x 4 x float> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv16f16_nxv8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e16.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  store <vscale x 16 x half> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv8f32_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e32.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  store <vscale x 8 x float> %res, ptr %p
+  ret void
+}
+
+define void @vector_interleave_store_nxv4f64_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b, ptr %p) {
+; CHECK-LABEL: vector_interleave_store_nxv4f64_nxv2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: # kill: def $v10m2 killed $v10m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: # kill: def $v8m2 killed $v8m2 killed $v8m2_v10m2 def $v8m2_v10m2
+; CHECK-NEXT: vsseg2e64.v v8, (a0)
+; CHECK-NEXT: ret
+  %res = call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  store <vscale x 4 x double> %res, ptr %p
+  ret void
+}
+
+
+declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed-combine-store.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed-combine-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed-combine-store.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s
+
+; Integers
+
+define void @vector_interleave_store_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b, ptr %p) {
+  %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b)
+ 
store <32 x i1> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v16i16_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsseg2e16.v v10, (a0) +; CHECK-NEXT: ret + %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b) + store <16 x i16> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v8i32_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsseg2e32.v v10, (a0) +; CHECK-NEXT: ret + %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b) + store <8 x i32> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v4i64_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsseg2e64.v v10, (a0) +; CHECK-NEXT: ret + %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b) + store <4 x i64> %res, ptr %p + ret void +} + +declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>) +declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) +declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) + +; Floats + +define void @vector_interleave_store_v4f16_v2f16(<2 x half> %a, <2 x half> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v4f16_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9 +; CHECK-NEXT: vsseg2e16.v v8, (a0) +; CHECK-NEXT: ret + %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b) + store <4 x half> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v8f16_v4f16(<4 x half> %a, <4 x half> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v8f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9 +; CHECK-NEXT: vsseg2e16.v v8, (a0) +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b) + store <8 x half> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v4f32_v2f32(<2 x float> %a, <2 x float> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v4f32_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $v9 killed $v9 killed $v8_v9 def $v8_v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: # kill: def $v8 killed $v8 killed $v8_v9 def $v8_v9 +; CHECK-NEXT: vsseg2e32.v v8, (a0) +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b) + store <4 x float> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v16f16_v8f16(<8 x half> %a, <8 x half> %b, ptr %p) 
{ +; CHECK-LABEL: vector_interleave_store_v16f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsseg2e16.v v10, (a0) +; CHECK-NEXT: ret + %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b) + store <16 x half> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v8f32_v4f32(<4 x float> %a, <4 x float> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v8f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsseg2e32.v v10, (a0) +; CHECK-NEXT: ret + %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b) + store <8 x float> %res, ptr %p + ret void +} + +define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> %b, ptr %p) { +; CHECK-LABEL: vector_interleave_store_v4f64_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v9 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vsseg2e64.v v10, (a0) +; CHECK-NEXT: ret + %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b) + store <4 x double> %res, ptr %p + ret void +} + + +declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>) +declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>) +declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>) +declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>) +declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>)