diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -678,6 +678,8 @@
   SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECTOR_DEINTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerVECTOR_INTERLEAVE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSTEP_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECTOR_REVERSE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECTOR_SPLICE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -583,6 +583,10 @@
       setOperationAction({ISD::VP_FP_TO_SINT, ISD::VP_FP_TO_UINT,
                           ISD::VP_TRUNCATE, ISD::VP_SETCC},
                          VT, Custom);
+
+      setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
+      setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
+
       setOperationAction(ISD::VECTOR_REVERSE, VT, Custom);

       setOperationPromotedToType(
@@ -674,6 +678,9 @@
                          VT, Expand);
     }

+    setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
+    setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
+
     // Splice
     setOperationAction(ISD::VECTOR_SPLICE, VT, Custom);
@@ -755,6 +762,9 @@
           {ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR},
           VT, Custom);
+
+      setOperationAction(ISD::VECTOR_DEINTERLEAVE, VT, Custom);
+      setOperationAction(ISD::VECTOR_INTERLEAVE, VT, Custom);
+
       setOperationAction({ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE}, VT, Custom);

       setOperationAction(FloatingPointVPOps, VT, Custom);
@@ -4046,6 +4056,10 @@
     return lowerINSERT_SUBVECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return lowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::VECTOR_DEINTERLEAVE:
+    return lowerVECTOR_DEINTERLEAVE(Op, DAG);
+  case ISD::VECTOR_INTERLEAVE:
+    return lowerVECTOR_INTERLEAVE(Op, DAG);
   case ISD::STEP_VECTOR:
     return lowerSTEP_VECTOR(Op, DAG);
   case ISD::VECTOR_REVERSE:
@@ -6432,6 +6446,155 @@
   return DAG.getBitcast(Op.getSimpleValueType(), Slidedown);
 }

+// Widen a vector's operands to i8, then truncate its results back to the
+// original type, typically i1. All operand and result types must be the same.
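+// This is used for i1-element vectors: e.g. an i1 VECTOR_DEINTERLEAVE or
+// VECTOR_INTERLEAVE is lowered by zero-extending its operands to i8 vectors,
+// performing the operation on those, and truncating the results back to i1.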
+static SDValue wideVectorOpToi8(SDValue N, SDLoc &DL, SelectionDAG &DAG) {
+  MVT Ty = N.getSimpleValueType();
+  MVT WideTy = Ty.changeVectorElementType(MVT::i8);
+  SmallVector<SDValue> WideOps;
+  for (SDValue Op : N.getNode()->ops()) {
+    assert(Op.getSimpleValueType() == Ty &&
+           "Operands and result must be same type");
+    WideOps.push_back(DAG.getNode(ISD::ZERO_EXTEND, DL, WideTy, Op));
+  }
+
+  unsigned NumVals = N.getNode()->getNumValues();
+
+  SDVTList VTs = DAG.getVTList(SmallVector<EVT>(
+      NumVals, N.getValueType().changeVectorElementType(MVT::i8)));
+  SDValue WideN = DAG.getNode(N.getOpcode(), DL, VTs, WideOps);
+  SmallVector<SDValue> TruncVals;
+  for (unsigned I = 0; I < WideN.getNode()->getNumValues(); I++) {
+    TruncVals.push_back(
+        DAG.getNode(ISD::TRUNCATE, DL, Ty, SDValue(WideN.getNode(), I)));
+  }
+
+  if (TruncVals.size() > 1)
+    return DAG.getMergeValues(TruncVals, DL);
+  return TruncVals.front();
+}
+
+SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VecVT = Op.getSimpleValueType();
+
+  // 1 bit element vectors need to be widened to e8
+  if (VecVT.getVectorElementType() == MVT::i1)
+    return wideVectorOpToi8(Op, DL, DAG);
+
+  // Reconstruct the concatenated array to deinterleave
+  MVT WideVT = MVT::getScalableVectorVT(VecVT.getVectorElementType(),
+                                        VecVT.getVectorMinNumElements() * 2);
+  SDValue Wide = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideVT, Op.getOperand(0),
+                             Op.getOperand(1));
+
+  MVT IdxVT = WideVT.changeVectorElementTypeToInteger();
+  // Create a vector of even indices {0, 2, 4, ...}
+  SDValue EvenIdx =
+      DAG.getStepVector(DL, IdxVT, APInt(WideVT.getScalarSizeInBits(), 2));
+  // Create a vector of odd indices {1, 3, 5, ... }
+  SDValue OddIdx =
+      DAG.getNode(ISD::ADD, DL, IdxVT, EvenIdx, DAG.getConstant(1, DL, IdxVT));
+
+  auto [Mask, VL] = getDefaultScalableVLOps(WideVT, DL, DAG, Subtarget);
+
+  // Gather the even and odd elements into two separate vectors
+  SDValue EvenWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, WideVT, Wide,
+                                 EvenIdx, DAG.getUNDEF(WideVT), Mask, VL);
+  SDValue OddWide = DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, WideVT, Wide,
+                                OddIdx, DAG.getUNDEF(WideVT), Mask, VL);
+
+  SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide,
+                             DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+  SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide,
+                            DAG.getConstant(0, DL, Subtarget.getXLenVT()));
+
+  return DAG.getMergeValues({Even, Odd}, DL);
+}
+
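+// Lower VECTOR_INTERLEAVE by "decompressing" each source vector onto the even
+// and odd lanes of the result. The low half of the result is built from the
+// low halves of the two sources, and the high half from their high halves,
+// which are first brought down with a vslidedown.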
+SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  MVT VecVT = Op.getSimpleValueType();
+
+  // 1 bit element vectors need to be widened to e8
+  if (VecVT.getVectorElementType() == MVT::i1)
+    return wideVectorOpToi8(Op, DL, DAG);
+
+  MVT MaskTy = getMaskTypeFor(VecVT);
+  MVT XLenVT = Subtarget.getXLenVT();
+
+  auto VLOps = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
+  SDValue Mask = VLOps.first, VL = VLOps.second;
+
+  // ... 0 1 0 1 0 1
+  // Build a mask on even indices by creating a vector of [1, 1, 1, 1], then
+  // bitcasting it into [0, 1, 0, 1, 0, 1, 0, 1]
+  SDValue EvenMask = DAG.getSplatVector(
+      MVT::getVectorVT(MVT::i16,
+                       MaskTy.getVectorElementCount().divideCoefficientBy(2)),
+      DL, DAG.getConstant(1, DL, XLenVT));
+  EvenMask = DAG.getBitcast(MaskTy.changeVectorElementType(MVT::i8), EvenMask);
+  EvenMask = DAG.getNode(ISD::TRUNCATE, DL, MaskTy, EvenMask);
+
+  SDValue AllOnesMask = getAllOnesMask(VecVT, VL, DL, DAG);
+
+  // ... 1 0 1 0 1 0
+  // (xor x 1) = (not x)
+  SDValue OddMask =
+      DAG.getNode(RISCVISD::VMXOR_VL, DL, MaskTy, EvenMask, AllOnesMask, VL);
+
+  // Interleave the source vectors by decompressing them:
+  // h g f e d c b a <- Source vec 0
+  // 0 1 0 1 0 1 0 1 <- Even mask
+  // ---------------
+  // x d x c x b x a <- Decompressed
+  //
+  // Then taking the odd mask:
+  // w v u t s r q p <- Op 1
+  // 1 0 1 0 1 0 1 0 <- Odd mask
+  // x d x c x b x a <- Passthru (decompressed source vec 0)
+  // ---------------
+  // s d r c q b p a <- Decompressed lower half
+  // Note that decompression isn't an instruction, instead it's emulated through
+  // viota.m + vrgather.vv
+  // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#1651-synthesizing-vdecompress
+  auto GetDecompress = [&DAG, &DL, &VecVT, &AllOnesMask,
+                        &VL](SDValue Passthru, SDValue Op, SDValue Mask) {
+    SDValue Iota = DAG.getNode(RISCVISD::VIOTA_VL, DL,
+                               VecVT.changeVectorElementTypeToInteger(),
+                               Passthru, Mask, AllOnesMask, VL);
+    return DAG.getNode(RISCVISD::VRGATHER_VV_VL, DL, VecVT, Op, Iota, Passthru,
+                       Mask, VL);
+  };
+
+  // x d x c x b x a
+  SDValue Lo = GetDecompress(DAG.getUNDEF(VecVT), Op.getOperand(0), EvenMask);
+  // s d r c q b p a
+  Lo = GetDecompress(Lo, Op.getOperand(1), OddMask);
+
+  // Now interleave the upper half by shifting down the source vectors halfway
+  SDValue SlideAmt =
+      DAG.getNode(ISD::VSCALE, DL, XLenVT,
+                  getVLOp(VecVT.getVectorMinNumElements(), DL, DAG, Subtarget));
+  SlideAmt = DAG.getNode(ISD::SRL, DL, XLenVT, SlideAmt,
+                         DAG.getConstant(1, DL, XLenVT));
+
+  auto ExtractUpperHalf = [&DAG, &DL, &SlideAmt, &Mask, &VL, this](SDValue Op) {
+    return getVSlidedown(DAG, Subtarget, DL, Op.getValueType(),
+                         DAG.getUNDEF(Op.getValueType()), Op, SlideAmt, Mask,
+                         VL);
+  };
+  // x h x g x f x e
+  SDValue Hi = GetDecompress(DAG.getUNDEF(VecVT),
+                             ExtractUpperHalf(Op.getOperand(0)), EvenMask);
+  // w h v g u f t e
+  Hi = GetDecompress(Hi, ExtractUpperHalf(Op.getOperand(1)), OddMask);
+
+  return DAG.getMergeValues({Lo, Hi}, DL);
+}
+
 // Lower step_vector to the vid instruction. Any non-identity step value must
 // be accounted for my manual expansion.
SDValue RISCVTargetLowering::lowerSTEP_VECTOR(SDValue Op, diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -0,0 +1,166 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define {<16 x i1>, <16 x i1>} @vector_deinterleave_v16i1_v32i1(<32 x i1> %vec) { +%retval = call {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1> %vec) +ret {<16 x i1>, <16 x i1>} %retval +} + +define {<16 x i8>, <16 x i8>} @vector_deinterleave_v16i8_v32i8(<32 x i8> %vec) { +; CHECK-LABEL: vector_deinterleave_v16i8_v32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 8 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %vec) +ret {<16 x i8>, <16 x i8>} %retval +} + +define {<8 x i16>, <8 x i16>} @vector_deinterleave_v8i16_v16i16(<16 x i16> %vec) { +; CHECK-LABEL: vector_deinterleave_v8i16_v16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %vec) +ret {<8 x i16>, <8 x i16>} %retval +} + +define {<4 x i32>, <4 x i32>} @vector_deinterleave_v4i32_vv8i32(<8 x i32> %vec) { +; CHECK-LABEL: vector_deinterleave_v4i32_vv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %vec) +ret {<4 x i32>, <4 x i32>} %retval +} + +define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) { +; CHECK-LABEL: vector_deinterleave_v2i64_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vrgather.vi v10, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v11, v8, 1 +; CHECK-NEXT: vrgather.vi v11, v12, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec) +ret {<2 x i64>, <2 x i64>} %retval +} + +declare {<16 x i1>, <16 x i1>} @llvm.experimental.vector.deinterleave2.v32i1(<32 x i1>) +declare {<16 x i8>, <16 x i8>} @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>) +declare {<8 x i16>, <8 x i16>} @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>) +declare {<4 x i32>, <4 x i32>} @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>) +declare {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>) + +; Floats + +define {<2 x half>, <2 x half>} 
@vector_deinterleave_v2f16_v4f16(<4 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {<2 x half>, <2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %vec) +ret {<2 x half>, <2 x half>} %retval +} + +define {<4 x half>, <4 x half>} @vector_deinterleave_v4f16_v8f16(<8 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v9, v8, 16 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret +%retval = call {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %vec) +ret {<4 x half>, <4 x half>} %retval +} + +define {<2 x float>, <2 x float>} @vector_deinterleave_v2f32_v4f32(<4 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vnsrl.wx v9, v8, a0 +; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: ret +%retval = call {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float> %vec) +ret {<2 x float>, <2 x float>} %retval +} + +define {<8 x half>, <8 x half>} @vector_deinterleave_v8f16_v16f16(<16 x half> %vec) { +; CHECK-LABEL: vector_deinterleave_v8f16_v16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vnsrl.wi v10, v8, 0 +; CHECK-NEXT: vnsrl.wi v11, v8, 16 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %vec) +ret {<8 x half>, <8 x half>} %retval +} + +define {<4 x float>, <4 x float>} @vector_deinterleave_v4f32_v8f32(<8 x float> %vec) { +; CHECK-LABEL: vector_deinterleave_v4f32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: li a0, 32 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vnsrl.wx v10, v8, a0 +; CHECK-NEXT: vnsrl.wi v11, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v11 +; CHECK-NEXT: vmv.v.v v9, v10 +; CHECK-NEXT: ret +%retval = call {<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %vec) +ret {<4 x float>, <4 x float>} %retval +} + +define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double> %vec) { +; CHECK-LABEL: vector_deinterleave_v2f64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v8, 2 +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vrgather.vi v10, v12, 0, v0.t +; CHECK-NEXT: vrgather.vi v11, v8, 1 +; CHECK-NEXT: vrgather.vi v11, v12, 1, v0.t +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv.v.v v9, v11 +; CHECK-NEXT: ret +%retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec) +ret {<2 x double>, <2 x double>} %retval +} + +declare {<2 x half>,<2 x half>} @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>) +declare {<4 x half>, <4 x half>} @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>) +declare {<2 x float>, <2 x float>} @llvm.experimental.vector.deinterleave2.v4f32(<4 x float>) +declare {<8 x half>, <8 x half>} @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>) +declare 
{<4 x float>, <4 x float>} @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>) +declare {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll @@ -0,0 +1,210 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define {, } @vector_deinterleave_nxv16i1_nxv32i1( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v0, a0 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vim v14, v10, 1, v0 +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 +; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vadd.vv v8, v8, v8 +; CHECK-NEXT: vrgather.vv v16, v12, v8 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vand.vi v16, v16, 1 +; CHECK-NEXT: vmsne.vi v0, v16, 0 +; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; CHECK-NEXT: vadd.vi v8, v8, 1 +; CHECK-NEXT: vrgather.vv v16, v12, v8 +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vand.vi v10, v16, 1 +; CHECK-NEXT: vmsne.vi v8, v10, 0 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i1( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv16i8_nxv32i8( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv32i8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv32i8( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv8i16_nxv16i16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv16i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv16i16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4i32_nxvv8i32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8i32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2i64_nxv4i64( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv4i64: +; CHECK: # %bb.0: +; 
CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4i64( %vec) +ret {, } %retval +} + +declare {, } @llvm.experimental.vector.deinterleave2.nxv32i1() +declare {, } @llvm.experimental.vector.deinterleave2.nxv32i8() +declare {, } @llvm.experimental.vector.deinterleave2.nxv16i16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8i32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4i64() + +; Floats + +define {, } @vector_deinterleave_nxv2f16_nxv4f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vid.v v9 +; CHECK-NEXT: vadd.vv v9, v9, v9 +; CHECK-NEXT: vrgather.vv v10, v8, v9 +; CHECK-NEXT: vadd.vi v11, v9, 1 +; CHECK-NEXT: vrgather.vv v9, v8, v11 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4f16_nxv8f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vadd.vv v12, v10, v10 +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vadd.vi v12, v12, 1 +; CHECK-NEXT: vrgather.vv v14, v8, v12 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2f32_nxv4f32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vadd.vv v12, v10, v10 +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vadd.vi v12, v12, 1 +; CHECK-NEXT: vrgather.vv v14, v8, v12 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v14 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv8f16_nxv16f16( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv16f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv16f16( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv4f32_nxv8f32( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv8f32( %vec) +ret {, } %retval +} + +define {, } @vector_deinterleave_nxv2f64_nxv4f64( %vec) { +; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, 
e64, m4, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vv v16, v12, v12 +; CHECK-NEXT: vrgather.vv v12, v8, v16 +; CHECK-NEXT: vadd.vi v16, v16, 1 +; CHECK-NEXT: vrgather.vv v20, v8, v16 +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: vmv2r.v v10, v20 +; CHECK-NEXT: ret +%retval = call {, } @llvm.experimental.vector.deinterleave2.nxv4f64( %vec) +ret {, } %retval +} + +declare {,} @llvm.experimental.vector.deinterleave2.nxv4f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4f32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv16f16() +declare {, } @llvm.experimental.vector.deinterleave2.nxv8f32() +declare {, } @llvm.experimental.vector.deinterleave2.nxv4f64() diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll @@ -0,0 +1,176 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define <32 x i1> @vector_interleave_v32i1_v16i1(<16 x i1> %a, <16 x i1> %b) { + %res = call <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1> %a, <16 x i1> %b) + ret <32 x i1> %res +} + +define <16 x i16> @vector_interleave_v16i16_v8i16(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: vector_interleave_v16i16_v8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 8 +; CHECK-NEXT: lui a0, %hi(.LCPI1_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI1_0) +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %a, <8 x i16> %b) + ret <16 x i16> %res +} + +define <8 x i32> @vector_interleave_v8i32_v4i32(<4 x i32> %a, <4 x i32> %b) { +; CHECK-LABEL: vector_interleave_v8i32_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: lui a0, %hi(.LCPI2_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI2_0) +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %a, <4 x i32> %b) + ret <8 x i32> %res +} + +define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) { + %res = call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %a, <2 x i64> %b) + ret <4 x i64> %res +} + +declare <32 x i1> @llvm.experimental.vector.interleave2.v32i1(<16 x i1>, <16 x i1>) +declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) +declare <8 x i32> 
@llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) +declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) + +; Floats + +define <4 x half> @vector_interleave_v4f16_v2f16(<2 x half> %a, <2 x half> %b) { +; CHECK-LABEL: vector_interleave_v4f16_v2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v10, v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: lui a0, %hi(.LCPI4_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI4_0) +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: ret + %res = call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %a, <2 x half> %b) + ret <4 x half> %res +} + +define <8 x half> @vector_interleave_v8f16_v4f16(<4 x half> %a, <4 x half> %b) { +; CHECK-LABEL: vector_interleave_v8f16_v4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 4, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 4 +; CHECK-NEXT: lui a0, %hi(.LCPI5_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0) +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %a, <4 x half> %b) + ret <8 x half> %res +} + +define <4 x float> @vector_interleave_v4f32_v2f32(<2 x float> %a, <2 x float> %b) { +; CHECK-LABEL: vector_interleave_v4f32_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vsetivli zero, 2, e32, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v8, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, tu, ma +; CHECK-NEXT: vslideup.vi v10, v9, 2 +; CHECK-NEXT: lui a0, %hi(.LCPI6_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI6_0) +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vle32.v v9, (a0) +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float> %a, <2 x float> %b) + ret <4 x float> %res +} + +define <16 x half> @vector_interleave_v16f16_v8f16(<8 x half> %a, <8 x half> %b) { +; CHECK-LABEL: vector_interleave_v16f16_v8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 8, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 8 +; CHECK-NEXT: lui a0, %hi(.LCPI7_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %a, <8 x half> %b) + ret <16 x half> %res +} + +define <8 x float> @vector_interleave_v8f32_v4f32(<4 x float> %a, <4 x float> %b) { +; CHECK-LABEL: vector_interleave_v8f32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v9 +; CHECK-NEXT: # kill: def $v8 killed $v8 def $v8m2 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; 
CHECK-NEXT: vmv.v.i v12, 0 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, ma +; CHECK-NEXT: vslideup.vi v12, v10, 4 +; CHECK-NEXT: lui a0, %hi(.LCPI8_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vrgather.vv v8, v12, v10 +; CHECK-NEXT: ret + %res = call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %a, <4 x float> %b) + ret <8 x float> %res +} + +define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double> %b) { + %res = call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b) + ret <4 x double> %res +} + + +declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>) +declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.experimental.vector.interleave2.v4f32(<2 x float>, <2 x float>) +declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>) +declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>) +declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -0,0 +1,379 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh | FileCheck %s + +; Integers + +define @vector_interleave_nxv32i1_nxv16i1( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv32i1_nxv16i1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 0 +; CHECK-NEXT: vmerge.vim v12, v10, 1, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: vslidedown.vx v14, v12, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv.v.i v16, 1 +; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma +; CHECK-NEXT: vand.vi v16, v16, 1 +; CHECK-NEXT: vmsne.vi v9, v16, 0 +; CHECK-NEXT: viota.m v16, v9 +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrgather.vv v18, v14, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vmerge.vim v10, v10, 1, v0 +; CHECK-NEXT: vslidedown.vx v14, v10, a0 +; CHECK-NEXT: vmnot.m v8, v9 +; CHECK-NEXT: viota.m v20, v8 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vrgather.vv v18, v14, v20, v0.t +; CHECK-NEXT: vand.vi v14, v18, 1 +; CHECK-NEXT: vmsne.vi v18, v14, 0 +; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v9 +; CHECK-NEXT: vrgather.vv v14, v12, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v8 +; CHECK-NEXT: vrgather.vv v14, v10, v20, v0.t +; CHECK-NEXT: vand.vi v8, v14, 1 +; CHECK-NEXT: vmsne.vi v0, v8, 0 +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e8, mf2, tu, ma +; CHECK-NEXT: vslideup.vx v0, v18, a0 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv32i1( %a, %b) + ret %res +} + +define @vector_interleave_nxv16i16_nxv8i16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv16i16_nxv8i16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv2r.v 
v14, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v18, v8, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 1 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: viota.m v20, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v10, v18, v20, v0.t +; CHECK-NEXT: vslidedown.vx v18, v12, a0 +; CHECK-NEXT: vmnot.m v17, v16 +; CHECK-NEXT: viota.m v22, v17 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v10, v18, v22, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v8, v14, v20, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v8, v12, v22, v0.t +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv16i16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8i32_nxv4i32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8i32_nxv4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v18, v8, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: viota.m v20, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v10, v18, v20, v0.t +; CHECK-NEXT: vslidedown.vx v18, v12, a0 +; CHECK-NEXT: vmnot.m v17, v16 +; CHECK-NEXT: viota.m v22, v17 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v10, v18, v22, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v8, v14, v20, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v8, v12, v22, v0.t +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8i32( %a, %b) + ret %res +} + +define @vector_interleave_nxv4i64_nxv2i64( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4i64_nxv2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v18, v8, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v8, 1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: viota.m v20, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v10, v18, v20, v0.t +; CHECK-NEXT: vslidedown.vx v18, v12, a0 +; CHECK-NEXT: vmnot.m v17, v16 +; CHECK-NEXT: viota.m v22, v17 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v10, v18, v22, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v8, v14, v20, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: 
vrgather.vv v8, v12, v22, v0.t +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4i64( %a, %b) + ret %res +} + +declare @llvm.experimental.vector.interleave2.nxv32i1(, ) +declare @llvm.experimental.vector.interleave2.nxv16i16(, ) +declare @llvm.experimental.vector.interleave2.nxv8i32(, ) +declare @llvm.experimental.vector.interleave2.nxv4i64(, ) + +; Floats + +define @vector_interleave_nxv4f16_nxv2f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f16_nxv2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a1, a0, 3 +; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v8, a1 +; CHECK-NEXT: vsetvli a2, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vsetvli a2, zero, e8, mf4, ta, ma +; CHECK-NEXT: vand.vi v10, v10, 1 +; CHECK-NEXT: vmsne.vi v10, v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: viota.m v12, v10 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vrgather.vv v13, v11, v12, v0.t +; CHECK-NEXT: vslidedown.vx v14, v9, a1 +; CHECK-NEXT: vmnot.m v11, v10 +; CHECK-NEXT: viota.m v15, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v11 +; CHECK-NEXT: vrgather.vv v13, v14, v15, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vrgather.vv v10, v8, v12, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v11 +; CHECK-NEXT: vrgather.vv v10, v9, v15, v0.t +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: add a1, a0, a0 +; CHECK-NEXT: vsetvli zero, a1, e16, m1, tu, ma +; CHECK-NEXT: vslideup.vx v10, v13, a0 +; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8f16_nxv4f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8f16_nxv4f16: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v8, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vand.vi v10, v10, 1 +; CHECK-NEXT: vmsne.vi v10, v10, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: viota.m v14, v10 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vrgather.vv v13, v11, v14, v0.t +; CHECK-NEXT: vslidedown.vx v15, v9, a0 +; CHECK-NEXT: vmnot.m v11, v10 +; CHECK-NEXT: viota.m v16, v11 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.v v0, v11 +; CHECK-NEXT: vrgather.vv v13, v15, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vrgather.vv v12, v8, v14, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vmv.v.v v0, v11 +; CHECK-NEXT: vrgather.vv v12, v9, v16, v0.t +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv4f32_nxv2f32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f32_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v11, v8, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v10, 1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; CHECK-NEXT: vand.vi v10, v10, 1 +; CHECK-NEXT: vmsne.vi v10, v10, 0 +; CHECK-NEXT: 
vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: viota.m v14, v10 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vrgather.vv v13, v11, v14, v0.t +; CHECK-NEXT: vslidedown.vx v15, v9, a0 +; CHECK-NEXT: vmnot.m v11, v10 +; CHECK-NEXT: viota.m v16, v11 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.v v0, v11 +; CHECK-NEXT: vrgather.vv v13, v15, v16, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vrgather.vv v12, v8, v14, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vmv.v.v v0, v11 +; CHECK-NEXT: vrgather.vv v12, v9, v16, v0.t +; CHECK-NEXT: vmv2r.v v8, v12 +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f32( %a, %b) + ret %res +} + +define @vector_interleave_nxv16f16_nxv8f16( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv16f16_nxv8f16: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v18, v8, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 1 +; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: viota.m v20, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v10, v18, v20, v0.t +; CHECK-NEXT: vslidedown.vx v18, v12, a0 +; CHECK-NEXT: vmnot.m v17, v16 +; CHECK-NEXT: viota.m v22, v17 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v10, v18, v22, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v8, v14, v20, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v8, v12, v22, v0.t +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv16f16( %a, %b) + ret %res +} + +define @vector_interleave_nxv8f32_nxv4f32( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv8f32_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v18, v8, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: viota.m v20, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v10, v18, v20, v0.t +; CHECK-NEXT: vslidedown.vx v18, v12, a0 +; CHECK-NEXT: vmnot.m v17, v16 +; CHECK-NEXT: viota.m v22, v17 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v10, v18, v22, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v8, v14, v20, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v8, v12, v22, v0.t +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv8f32( %a, %b) + ret %res +} + +define @vector_interleave_nxv4f64_nxv2f64( %a, %b) { +; CHECK-LABEL: vector_interleave_nxv4f64_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv2r.v v12, v10 +; CHECK-NEXT: vmv2r.v v14, v8 +; CHECK-NEXT: 
csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v18, v8, a0 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vmv.v.i v8, 1 +; CHECK-NEXT: vsetvli a1, zero, e8, mf4, ta, ma +; CHECK-NEXT: vand.vi v8, v8, 1 +; CHECK-NEXT: vmsne.vi v16, v8, 0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: viota.m v20, v16 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v10, v18, v20, v0.t +; CHECK-NEXT: vslidedown.vx v18, v12, a0 +; CHECK-NEXT: vmnot.m v17, v16 +; CHECK-NEXT: viota.m v22, v17 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v10, v18, v22, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vrgather.vv v8, v14, v20, v0.t +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vrgather.vv v8, v12, v22, v0.t +; CHECK-NEXT: ret + %res = call @llvm.experimental.vector.interleave2.nxv4f64( %a, %b) + ret %res +} + + +declare @llvm.experimental.vector.interleave2.nxv4f16(, ) +declare @llvm.experimental.vector.interleave2.nxv8f16(, ) +declare @llvm.experimental.vector.interleave2.nxv4f32(, ) +declare @llvm.experimental.vector.interleave2.nxv16f16(, ) +declare @llvm.experimental.vector.interleave2.nxv8f32(, ) +declare @llvm.experimental.vector.interleave2.nxv4f64(, )