diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h --- a/llvm/include/llvm/IR/Instructions.h +++ b/llvm/include/llvm/IR/Instructions.h @@ -2444,6 +2444,21 @@ return isInterleaveMask(Mask, Factor, NumInputElts, StartIndexes); } + /// Checks if the shuffle is a bit rotation of the first operand across + /// multiple subelements, e.g.: + /// + /// shuffle <8 x i8> %a, <8 x i8> poison, <8 x i32> <1, 0, 3, 2, 5, 4, 7, 6> + /// + /// could be expressed as + /// + /// rotl <4 x i16> %a, 8 + /// + /// If it can be expressed as a rotation, returns the number of subelements to + /// group by in NumSubElts and the number of bits to rotate left in RotateAmt. + static bool isBitRotateMask(ArrayRef<int> Mask, unsigned EltSizeInBits, + unsigned MinSubElts, unsigned MaxSubElts, + unsigned &NumSubElts, unsigned &RotateAmt); + // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const Instruction *I) { return I->getOpcode() == Instruction::ShuffleVector; diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -2806,6 +2806,45 @@ return true; } +/// Try to lower a vector shuffle as a bit rotation. +/// +/// Look for a repeated rotation pattern in each sub group. +/// Returns an element-wise left bit rotation amount or -1 on failure. +static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) { + int NumElts = Mask.size(); + assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask"); + + int RotateAmt = -1; + for (int i = 0; i != NumElts; i += NumSubElts) { + for (int j = 0; j != NumSubElts; ++j) { + int M = Mask[i + j]; + if (M < 0) + continue; + if (M < i || M >= i + NumSubElts) + return -1; + int Offset = (NumSubElts - (M - (i + j))) % NumSubElts; + if (0 <= RotateAmt && Offset != RotateAmt) + return -1; + RotateAmt = Offset; + } + } + return RotateAmt; +} + +bool ShuffleVectorInst::isBitRotateMask( + ArrayRef<int> Mask, unsigned EltSizeInBits, unsigned MinSubElts, + unsigned MaxSubElts, unsigned &NumSubElts, unsigned &RotateAmt) { + for (NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) { + int EltRotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts); + if (EltRotateAmt < 0) + continue; + RotateAmt = EltRotateAmt * EltSizeInBits; + return true; + } + + return false; +} + //===----------------------------------------------------------------------===// // InsertValueInst Class //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4118,6 +4118,51 @@ return Interleaved; } +// Given a shuffle mask like <3, 0, 1, 2, 7, 4, 5, 6> for v8i8, we can +// reinterpret it as a shuffle of v2i32 where the two i32s are bit rotated, and +// lower it as a vror.vi (if legal with zvbb enabled).
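+// For the mask above, isBitRotateMask reports NumSubElts = 4 and RotateAmt = 8, +// i.e. the v8i8 shuffle is equivalent to rotating each i32 left by 8 bits.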
+static SDValue lowerVECTOR_SHUFFLEAsRotate(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG, + const RISCVSubtarget &Subtarget) { + SDLoc DL(SVN); + + EVT VT = SVN->getValueType(0); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + unsigned NumSubElts, RotateAmt; + if (!ShuffleVectorInst::isBitRotateMask(SVN->getMask(), EltSizeInBits, 2, + NumElts, NumSubElts, RotateAmt)) + return SDValue(); + MVT RotateVT = MVT::getVectorVT(MVT::getIntegerVT(EltSizeInBits * NumSubElts), + NumElts / NumSubElts); + + // We might have a RotateVT that isn't legal, e.g. v4i64 on zve32x. + if (!Subtarget.getTargetLowering()->isOperationLegalOrCustom(ISD::ROTL, + RotateVT)) + return SDValue(); + + // If we just create the shift amount with + // + // DAG.getConstant(RotateAmt, DL, RotateVT) + // + // then for e64 we get a weird bitcasted build_vector on RV32 that we're + // unable to detect as a splat during pattern matching. So directly lower it + // to a vmv.v.x which gets matched to vror.vi. + MVT ContainerVT = getContainerForFixedLengthVector(DAG, RotateVT, Subtarget); + SDValue VL = + getDefaultVLOps(RotateVT, ContainerVT, DL, DAG, Subtarget).second; + SDValue RotateAmtSplat = DAG.getNode( + RISCVISD::VMV_V_X_VL, DL, ContainerVT, DAG.getUNDEF(ContainerVT), + DAG.getConstant(RotateAmt, DL, Subtarget.getXLenVT()), VL); + RotateAmtSplat = + convertFromScalableVector(RotateVT, RotateAmtSplat, DAG, Subtarget); + + SDValue Rotate = + DAG.getNode(ISD::ROTL, DL, RotateVT, + DAG.getBitcast(RotateVT, SVN->getOperand(0)), RotateAmtSplat); + return DAG.getBitcast(VT, Rotate); +} + static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const RISCVSubtarget &Subtarget) { SDValue V1 = Op.getOperand(0); @@ -4128,6 +4173,11 @@ unsigned NumElts = VT.getVectorNumElements(); ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode()); + // Lower to a vror.vi of a larger element type if possible. Do this before we + // promote i1s to i8s. + if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget)) + return V; + // Promote i1 shuffle to i8 shuffle. if (VT.getVectorElementType() == MVT::i1) { MVT WidenVT = MVT::getVectorVT(MVT::i8, VT.getVectorElementCount()); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -10971,31 +10971,6 @@ return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask); } -/// Try to lower a vector shuffle as a bit rotation. -/// -/// Look for a repeated rotation pattern in each sub group. -/// Returns a ISD::ROTL element rotation amount or -1 if failed. -static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) { - int NumElts = Mask.size(); - assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask"); - - int RotateAmt = -1; - for (int i = 0; i != NumElts; i += NumSubElts) { - for (int j = 0; j != NumSubElts; ++j) { - int M = Mask[i + j]; - if (M < 0) - continue; - if (!isInRange(M, i, i + NumSubElts)) - return -1; - int Offset = (NumSubElts - (M - (i + j))) % NumSubElts; - if (0 <= RotateAmt && Offset != RotateAmt) - return -1; - RotateAmt = Offset; - } - } - return RotateAmt; -} - static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits, const X86Subtarget &Subtarget, ArrayRef<int> Mask) { @@ -11005,18 +10980,14 @@ // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size. int MinSubElts = Subtarget.hasAVX512() ?
std::max(32 / EltSizeInBits, 2) : 2; int MaxSubElts = 64 / EltSizeInBits; - for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) { - int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts); - if (RotateAmt < 0) - continue; - - int NumElts = Mask.size(); - MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts); - RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts); - return RotateAmt * EltSizeInBits; - } - - return -1; + unsigned RotateAmt, NumSubElts; + if (!ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, MinSubElts, + MaxSubElts, NumSubElts, RotateAmt)) + return -1; + unsigned NumElts = Mask.size(); + MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts); + RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts); + return RotateAmt * EltSizeInBits; } /// Lower shuffle using X86ISD::VROTLI rotations. diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll @@ -0,0 +1,767 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB_V +; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB_V +; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB_ZVE32X +; RUN: llc -mtriple=riscv64 -mattr=+zve32x,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB_ZVE32X + +define <8 x i1> @shuffle_v8i1_as_i8_1(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v9, v8, 7 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_1: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 1 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_1: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 1 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_2(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vslideup.vi v9, v8, 6 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_2: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 2 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_2: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 2 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 
x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_3(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_3: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 3 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_3: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 3 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_4(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 4 +; CHECK-NEXT: vslideup.vi v9, v8, 4 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_4: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 4 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_4: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 4 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_5(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 5 +; CHECK-NEXT: vslideup.vi v9, v8, 3 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_5: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 5 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_5: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 5 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_6(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi v9, v8, 6 +; CHECK-NEXT: vslideup.vi v9, v8, 2 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_6: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 6 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_6: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 6 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i1> @shuffle_v8i1_as_i8_7(<8 x i1> %v) { +; CHECK-LABEL: shuffle_v8i1_as_i8_7: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-NEXT: vslidedown.vi 
v9, v8, 7 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i1_as_i8_7: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; ZVBB_V-NEXT: vror.vi v0, v0, 7 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i1_as_i8_7: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v0, v0, 7 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i1> %v, <8 x i1> poison, <8 x i32> + ret <8 x i1> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i16(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI7_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i16: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 8 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i16: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 4, e16, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 8 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i32_8(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i32_8: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI8_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i32_8: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 8 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i32_8: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 8 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i32_16(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i32_16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI9_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i32_16: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i32_16: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 16 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i32_24(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i32_24: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI10_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vle8.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i32_24: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 24 +; ZVBB_V-NEXT: ret +; +; 
ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i32_24: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 2, e32, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 24 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_8(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vslideup.vi v9, v8, 7 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_8: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 8 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_8: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 1 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 7 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_16(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 2 +; CHECK-NEXT: vslideup.vi v9, v8, 6 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_16: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_16: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 2 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 6 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_24(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_24: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 3 +; CHECK-NEXT: vslideup.vi v9, v8, 5 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_24: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 24 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_24: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 3 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 5 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_32(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 4 +; CHECK-NEXT: vslideup.vi v9, v8, 4 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_32: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 32 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_32: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 4 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 4 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> 
%v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_40(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_40: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 5 +; CHECK-NEXT: vslideup.vi v9, v8, 3 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_40: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 40 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_40: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 5 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 3 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_48(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_48: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 6 +; CHECK-NEXT: vslideup.vi v9, v8, 2 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_48: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 48 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_48: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 6 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 2 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i8> @shuffle_v8i8_as_i64_56(<8 x i8> %v) { +; CHECK-LABEL: shuffle_v8i8_as_i64_56: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v9, v8, 7 +; CHECK-NEXT: vslideup.vi v9, v8, 1 +; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i8_as_i64_56: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 1, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 56 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i8_as_i64_56: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e8, m2, ta, ma +; ZVBB_ZVE32X-NEXT: vslidedown.vi v10, v8, 7 +; ZVBB_ZVE32X-NEXT: vslideup.vi v10, v8, 1 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v10 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> + ret <8 x i8> %shuffle +} + +define <8 x i16> @shuffle_v8i16_as_i32(<8 x i16> %v) { +; CHECK-LABEL: shuffle_v8i16_as_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI18_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI18_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i16_as_i32: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i16_as_i32: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 4, e32, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 16 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { +; CHECK-LABEL: shuffle_v8i16_as_i64_16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI19_0) +; 
CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i16_as_i64_16: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i16_as_i64_16: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI19_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI19_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { +; CHECK-LABEL: shuffle_v8i16_as_i64_32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI20_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI20_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i16_as_i64_32: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 32 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i16_as_i64_32: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI20_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI20_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { +; CHECK-LABEL: shuffle_v8i16_as_i64_48: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI21_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI21_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i16_as_i64_48: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 48 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i16_as_i64_48: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI21_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI21_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> + ret <8 x i16> %shuffle +} + +define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { +; CHECK-LABEL: shuffle_v8i32_as_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI22_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI22_0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8i32_as_i64: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 32 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8i32_as_i64: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI22_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, 
%lo(.LCPI22_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e32, m8, ta, ma +; ZVBB_ZVE32X-NEXT: vle32.v v24, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v16, v8, v24 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v16 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> + ret <8 x i32> %shuffle +} + +define <8 x half> @shuffle_v8f16_as_i32(<8 x half> %v) { +; CHECK-LABEL: shuffle_v8f16_as_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI23_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI23_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8f16_as_i32: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8f16_as_i32: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: vsetivli zero, 4, e32, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vror.vi v8, v8, 16 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> + ret <8 x half> %shuffle +} + +define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { +; CHECK-LABEL: shuffle_v8f16_as_i64_16: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI24_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI24_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8f16_as_i64_16: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 16 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8f16_as_i64_16: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI24_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI24_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> + ret <8 x half> %shuffle +} + +define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { +; CHECK-LABEL: shuffle_v8f16_as_i64_32: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI25_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI25_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8f16_as_i64_32: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 32 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8f16_as_i64_32: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI25_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI25_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> + ret <8 x half> %shuffle +} + +define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { +; CHECK-LABEL: shuffle_v8f16_as_i64_48: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI26_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vrgather.vv v9, v8, v10 +; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: ret +; +; 
ZVBB_V-LABEL: shuffle_v8f16_as_i64_48: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 48 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8f16_as_i64_48: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI26_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI26_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e16, m4, ta, ma +; ZVBB_ZVE32X-NEXT: vle16.v v16, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v12, v8, v16 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v12 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x half> %v, <8 x half> poison, <8 x i32> + ret <8 x half> %shuffle +} + +define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { +; CHECK-LABEL: shuffle_v8f32_as_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI27_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI27_0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v12, (a0) +; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret +; +; ZVBB_V-LABEL: shuffle_v8f32_as_i64: +; ZVBB_V: # %bb.0: +; ZVBB_V-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; ZVBB_V-NEXT: vror.vi v8, v8, 32 +; ZVBB_V-NEXT: ret +; +; ZVBB_ZVE32X-LABEL: shuffle_v8f32_as_i64: +; ZVBB_ZVE32X: # %bb.0: +; ZVBB_ZVE32X-NEXT: lui a0, %hi(.LCPI27_0) +; ZVBB_ZVE32X-NEXT: addi a0, a0, %lo(.LCPI27_0) +; ZVBB_ZVE32X-NEXT: vsetivli zero, 8, e32, m8, ta, ma +; ZVBB_ZVE32X-NEXT: vle32.v v24, (a0) +; ZVBB_ZVE32X-NEXT: vrgather.vv v16, v8, v24 +; ZVBB_ZVE32X-NEXT: vmv.v.v v8, v16 +; ZVBB_ZVE32X-NEXT: ret + %shuffle = shufflevector <8 x float> %v, <8 x float> poison, <8 x i32> + ret <8 x float> %shuffle +}
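A minimal standalone sketch of how the new ShuffleVectorInst::isBitRotateMask helper can be queried, mirroring the calls made in the RISC-V and X86 lowerings above; the describeRotate wrapper, its printed strings, and the choice of MinSubElts/MaxSubElts here are illustrative only and not part of the patch:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
#include <cstdio>

using namespace llvm;

// Prints how a shuffle mask decomposes into a bit rotation of wider elements.
// Assumes a power-of-two mask length, as in the tests above.
static void describeRotate(ArrayRef<int> Mask, unsigned EltSizeInBits) {
  unsigned NumSubElts, RotateAmt;
  // Allow sub-groups from 2 elements up to the whole vector; isBitRotateMask
  // tries power-of-two group sizes within this range.
  if (ShuffleVectorInst::isBitRotateMask(Mask, EltSizeInBits, /*MinSubElts=*/2,
                                         /*MaxSubElts=*/Mask.size(),
                                         NumSubElts, RotateAmt))
    std::printf("rotl of i%u elements by %u bits\n",
                EltSizeInBits * NumSubElts, RotateAmt);
  else
    std::printf("not a bit rotation\n");
}

int main() {
  // The v8i8 example from the Instructions.h comment: swap adjacent bytes.
  int Mask[] = {1, 0, 3, 2, 5, 4, 7, 6};
  describeRotate(Mask, /*EltSizeInBits=*/8); // rotl of i16 elements by 8 bits
  return 0;
}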