diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1568,6 +1568,27 @@
     return 0;
   }
 
+  /// Checks if the shuffle is a bit rotation of the first operand, e.g.:
+  ///
+  /// shuffle %a:v8i8, %b:v8i8, <1, 0, 3, 2, 5, 4, 7, 6>
+  ///
+  /// could be expressed as
+  ///
+  /// rotl (bitcast %a):v4i16, 8
+  ///
+  /// If it can be expressed as a rotation, returns the type that should be used
+  /// for the rotation and the number of bits to rotate by.
+  std::optional<std::pair<EVT, unsigned>> isBitRotate() {
+    EVT VT = getValueType(0);
+    ArrayRef<int> Mask = getMask();
+    return isBitRotate(VT.getScalarSizeInBits(), Mask, 2, Mask.size());
+  }
+
+  static std::optional<std::pair<EVT, unsigned>> isBitRotate(int EltSizeInBits,
+                                                             ArrayRef<int> Mask,
+                                                             int MinSubElts,
+                                                             int MaxSubElts);
+
   static bool isSplatMask(const int *Mask, EVT VT);
 
   /// Change values in a shuffle permute mask assuming
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -12388,6 +12388,48 @@
   return std::make_pair(Start, Stride);
 }
 
+/// Try to match a vector shuffle as a bit rotation.
+///
+/// Look for a repeated rotation pattern in each sub group.
+/// Returns an ISD::ROTL element rotation amount or -1 on failure.
+static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
+  int NumElts = Mask.size();
+  assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
+
+  int RotateAmt = -1;
+  for (int i = 0; i != NumElts; i += NumSubElts) {
+    for (int j = 0; j != NumSubElts; ++j) {
+      int M = Mask[i + j];
+      if (M < 0)
+        continue;
+      if (M < i || M >= i + NumSubElts)
+        return -1;
+      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
+      if (0 <= RotateAmt && Offset != RotateAmt)
+        return -1;
+      RotateAmt = Offset;
+    }
+  }
+  return RotateAmt;
+}
+
+std::optional<std::pair<EVT, unsigned>>
+ShuffleVectorSDNode::isBitRotate(int EltSizeInBits, ArrayRef<int> Mask,
+                                 int MinSubElts, int MaxSubElts) {
+  int NumElts = Mask.size();
+  for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
+    int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
+    if (RotateAmt < 0)
+      continue;
+
+    MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
+    MVT RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+    return std::make_pair(RotateVT, RotateAmt * EltSizeInBits);
+  }
+
+  return std::nullopt;
+}
+
 bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
   // Find the first non-undef value in the shuffle mask.
   unsigned i, e;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4118,6 +4118,48 @@
   return Interleaved;
 }
 
+// Given a shuffle mask like <3, 0, 1, 2, 7, 4, 5, 6> for v8i8, we can
+// reinterpret it as a shuffle of v2i32 where the two i32s are bit rotated, and
+// lower it as a vror.vi (if legal with zvbb enabled).
+static SDValue lowerVECTOR_SHUFFLEAsRotate(ShuffleVectorSDNode *SVN,
+                                           SelectionDAG &DAG,
+                                           const RISCVSubtarget &Subtarget) {
+  SDLoc DL(SVN);
+
+  EVT VT = SVN->getValueType(0);
+  auto MaybeRotate = SVN->isBitRotate();
+  if (!MaybeRotate)
+    return SDValue();
+  auto [RotateEVT, RotateAmt] = *MaybeRotate;
+  MVT RotateVT = RotateEVT.getSimpleVT();
+
+  // We might have a RotateVT that isn't legal, e.g. v4i64 on zve32x.
+  if (!Subtarget.getTargetLowering()->isOperationLegalOrCustom(ISD::ROTL,
+                                                               RotateVT))
+    return SDValue();
+
+  // If we just create the shift amount with
+  //
+  // DAG.getConstant(RotateAmt, DL, RotateVT)
+  //
+  // then for e64 we get a weird bitcasted build_vector on RV32 that we're
+  // unable to detect as a splat during pattern matching. So lower it directly
+  // to a vmv.v.x, which gets picked up and matched to a vror.vi.
+  MVT ContainerVT = getContainerForFixedLengthVector(DAG, RotateVT, Subtarget);
+  SDValue VL =
+      getDefaultVLOps(RotateVT, ContainerVT, DL, DAG, Subtarget).second;
+  SDValue RotateAmtSplat = DAG.getNode(
+      RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT),
+      DAG.getConstant(RotateAmt, DL, Subtarget.getXLenVT()), VL);
+  RotateAmtSplat =
+      convertFromScalableVector(RotateVT, RotateAmtSplat, DAG, Subtarget);
+
+  SDValue Rotate =
+      DAG.getNode(ISD::ROTL, DL, RotateVT,
+                  DAG.getBitcast(RotateVT, SVN->getOperand(0)), RotateAmtSplat);
+  return DAG.getBitcast(VT, Rotate);
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
@@ -4128,6 +4170,11 @@
   unsigned NumElts = VT.getVectorNumElements();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
 
+  // Lower rotations of groups of elements to element-wise bit rotations. Do
+  // this before we promote i1s to i8s.
+  if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
+    return V;
+
   // Promote i1 shuffle to i8 shuffle.
   if (VT.getVectorElementType() == MVT::i1) {
     MVT WidenVT = MVT::getVectorVT(MVT::i8, VT.getVectorElementCount());
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10971,31 +10971,6 @@
   return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask);
 }
 
-/// Try to lower a vector shuffle as a bit rotation.
-///
-/// Look for a repeated rotation pattern in each sub group.
-/// Returns a ISD::ROTL element rotation amount or -1 if failed.
-static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
-  int NumElts = Mask.size();
-  assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
-
-  int RotateAmt = -1;
-  for (int i = 0; i != NumElts; i += NumSubElts) {
-    for (int j = 0; j != NumSubElts; ++j) {
-      int M = Mask[i + j];
-      if (M < 0)
-        continue;
-      if (!isInRange(M, i, i + NumSubElts))
-        return -1;
-      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
-      if (0 <= RotateAmt && Offset != RotateAmt)
-        return -1;
-      RotateAmt = Offset;
-    }
-  }
-  return RotateAmt;
-}
-
 static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
                                    const X86Subtarget &Subtarget,
                                    ArrayRef<int> Mask) {
@@ -11005,18 +10980,13 @@
   // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
   int MinSubElts = Subtarget.hasAVX512() ?
std::max(32 / EltSizeInBits, 2) : 2; int MaxSubElts = 64 / EltSizeInBits; - for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) { - int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts); - if (RotateAmt < 0) - continue; - - int NumElts = Mask.size(); - MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts); - RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts); - return RotateAmt * EltSizeInBits; - } - - return -1; + auto BitRotate = ShuffleVectorSDNode::isBitRotate(EltSizeInBits, Mask, + MinSubElts, MaxSubElts); + if (!BitRotate) + return -1; + auto [RotateEVT, RotateAmt] = *BitRotate; + RotateVT = RotateEVT.getSimpleVT(); + return RotateAmt; } /// Lower shuffle using X86ISD::VROTLI rotations. diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -0,0 +1,767 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB_V +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB_V +; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB_ZVE32X +; RUN: llc -mtriple=riscv64 -mattr=+zve32x,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB_ZVE32X + +define <8 x i1> @shuffle_v8i1_as_i8_1(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v9, v8, 7 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_1: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 1 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_1: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 1 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_2(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vslideup.vi v9, v8, 6 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_2: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 2 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_2: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 2 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_3(<8 x i1> %v) { +; CHECK-LABEL: 
shuffle_v8i1_as_i8_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_3: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 3 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_3: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 3 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_4(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 4 +; CHECK-NEXT: vslideup.vi v9, v8, 4 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_4: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 4 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_4: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 4 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_5(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 5 +; CHECK-NEXT: vslideup.vi v9, v8, 3 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_5: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 5 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_5: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 5 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_6(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 6 +; CHECK-NEXT: vslideup.vi v9, v8, 2 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_6: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 6 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_6: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 6 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_7(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 7 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: 
shuffle_v8i1_as_i8_7: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 7 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_7: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 7 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i16(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI7_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i16: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 8 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i16: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 4, e16, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 8 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i32_8(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i32_8: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI8_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i32_8: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 8 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i32_8: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 8 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i32_16(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i32_16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI9_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i32_16: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i32_16: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 16 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i32_24(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i32_24: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI10_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i32_24: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 24 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i32_24: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; 
ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 24 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_8(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v9, v8, 7 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_8: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 8 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_8: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 1 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 7 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_16(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vslideup.vi v9, v8, 6 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_16: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_16: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 2 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 6 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_24(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_24: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_24: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 24 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_24: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 3 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 5 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_32(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 4 +; CHECK-NEXT: vslideup.vi v9, v8, 4 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_32: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 32 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_32: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 4 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 4 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_40(<8 x i8> %v) { +; 
CHECK-LABEL: shuffle_v8i8_as_i64_40: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 5 +; CHECK-NEXT: vslideup.vi v9, v8, 3 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_40: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 40 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_40: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 5 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 3 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_48(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_48: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 6 +; CHECK-NEXT: vslideup.vi v9, v8, 2 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_48: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 48 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_48: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 6 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 2 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_56(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_56: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 7 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_56: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 56 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_56: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 7 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 1 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i16> @shuffle_v8i16_as_i32(<8 x i16> %v) { +; CHECK-LABEL: shuffle_v8i16_as_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI18_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i16_as_i32: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i16_as_i32: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 4, e32, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 16 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { +; CHECK-LABEL: shuffle_v8i16_as_i64_16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; 
CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i16_as_i64_16: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i16_as_i64_16: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI19_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI19_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { +; CHECK-LABEL: shuffle_v8i16_as_i64_32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI20_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i16_as_i64_32: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 32 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i16_as_i64_32: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI20_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI20_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { +; CHECK-LABEL: shuffle_v8i16_as_i64_48: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI21_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI21_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i16_as_i64_48: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 48 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i16_as_i64_48: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI21_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI21_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { +; CHECK-LABEL: shuffle_v8i32_as_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI22_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI22_0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i32_as_i64: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 32 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i32_as_i64: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI22_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI22_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e32, m8, ta, ma +; ZVBB_ZVE32X-NEXT: vle32.v v24, (a0) +; ZVBB_ZVE32X-NEXT: 
vrgather.vv v16, v8, v24 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v16 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> + ret <8 x i32> %shuffle +} + +define <8 x half> @shuffle_v8f16_as_i32(<8 x half> %v) { +; CHECK-LABEL: shuffle_v8f16_as_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI23_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI23_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8f16_as_i32: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8f16_as_i32: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 4, e32, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 16 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> + ret <8 x half> %shuffle +} + +define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { +; CHECK-LABEL: shuffle_v8f16_as_i64_16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI24_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI24_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8f16_as_i64_16: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8f16_as_i64_16: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI24_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI24_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> + ret <8 x half> %shuffle +} + +define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { +; CHECK-LABEL: shuffle_v8f16_as_i64_32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI25_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI25_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8f16_as_i64_32: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 32 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8f16_as_i64_32: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI25_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI25_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> + ret <8 x half> %shuffle +} + +define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { +; CHECK-LABEL: shuffle_v8f16_as_i64_48: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI26_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8f16_as_i64_48: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi 
v8, v8, 48 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8f16_as_i64_48: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI26_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI26_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> + ret <8 x half> %shuffle +} + +define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { +; CHECK-LABEL: shuffle_v8f32_as_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI27_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI27_0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8f32_as_i64: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 32 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8f32_as_i64: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI27_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI27_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e32, m8, ta, ma +; ZVBB_ZVE32X-NEXT: vle32.v v24, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v16, v8, v24 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v16 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x float> %v, <8 x float> poison, <8 x i32> + ret <8 x float> %shuffle +}
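For readers following the mask arithmetic, the snippet below is a standalone sketch of the matching rule shared by matchShuffleAsBitRotate and ShuffleVectorSDNode::isBitRotate above, applied to the two example masks from the patch comments. It is not part of the change: the main() driver, the printed format, and the assumed i8 element size are illustrative only.

// Standalone sketch (not part of the patch): mirrors the matching loop from
// matchShuffleAsBitRotate in SelectionDAG.cpp and applies it the way
// ShuffleVectorSDNode::isBitRotate does, trying the smallest sub-group first.
#include <cstdio>
#include <vector>

// Returns the per-group element rotate amount (ROTL, in elements) if every
// NumSubElts-sized group of Mask rotates its own elements by the same amount,
// or -1 otherwise.
static int matchShuffleAsBitRotate(const std::vector<int> &Mask,
                                   int NumSubElts) {
  int NumElts = static_cast<int>(Mask.size());
  int RotateAmt = -1;
  for (int i = 0; i != NumElts; i += NumSubElts) {
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue; // undef lanes are compatible with any rotate amount
      if (M < i || M >= i + NumSubElts)
        return -1; // source element crosses a sub-group boundary
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (0 <= RotateAmt && Offset != RotateAmt)
        return -1; // lanes disagree on the rotate amount
      RotateAmt = Offset;
    }
  }
  return RotateAmt;
}

int main() {
  const int EltSizeInBits = 8; // pretend the shuffles operate on v8i8
  const std::vector<std::vector<int>> Masks = {
      {1, 0, 3, 2, 5, 4, 7, 6}, // SelectionDAGNodes.h comment example
      {3, 0, 1, 2, 7, 4, 5, 6}, // RISCVISelLowering.cpp comment example
  };
  for (const std::vector<int> &Mask : Masks) {
    int NumElts = static_cast<int>(Mask.size());
    for (int NumSubElts = 2; NumSubElts <= NumElts; NumSubElts *= 2) {
      int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
      if (RotateAmt < 0)
        continue;
      std::printf("sub-groups of %d x i%d -> rotl by %d bits\n", NumSubElts,
                  EltSizeInBits, RotateAmt * EltSizeInBits);
      break; // isBitRotate also stops at the smallest matching sub-group
    }
  }
  return 0;
}

Compiled with any standard C++ compiler, it reports an 8-bit rotate over 2-element (i16) groups for the first mask and over 4-element (i32) groups for the second, matching the rotl (bitcast %a):v4i16, 8 and v2i32 bit-rotate examples in the comments above.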