diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4230,8 +4230,8 @@
 }
 
 // Given a shuffle mask like <3, 0, 1, 2, 7, 4, 5, 6> for v8i8, we can
-// reinterpret it as a shuffle of v2i32 where the two i32s are bit rotated, and
-// lower it as a vror.vi (if legal with zvbb enabled).
+// reinterpret it as a v2i32 and rotate it right by 8 instead. We can lower this
+// as a vror.vi if we have zvbb, or otherwise as a vsll, vsrl and vor.
 static SDValue lowerVECTOR_SHUFFLEAsRotate(ShuffleVectorSDNode *SVN,
                                            SelectionDAG &DAG,
                                            const RISCVSubtarget &Subtarget) {
@@ -4248,8 +4248,7 @@
                                   NumElts / NumSubElts);
 
   // We might have a RotateVT that isn't legal, e.g. v4i64 on zve32x.
-  if (!Subtarget.getTargetLowering()->isOperationLegalOrCustom(ISD::ROTL,
-                                                               RotateVT))
+  if (!Subtarget.getTargetLowering()->isTypeLegal(RotateVT))
     return SDValue();
 
   SDValue Op = DAG.getBitcast(RotateVT, SVN->getOperand(0));
@@ -4276,12 +4275,11 @@
   unsigned NumElts = VT.getVectorNumElements();
   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
 
-  // Lower to a vror.vi of a larger element type if possible. Do this before we
-  // promote i1s to i8s.
-  if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
-    return V;
-
   if (VT.getVectorElementType() == MVT::i1) {
+    // Lower to a vror.vi of a larger element type if possible before we promote
+    // i1s to i8s.
+    if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
+      return V;
     if (SDValue V = lowerBitreverseShuffle(SVN, DAG, Subtarget))
       return V;
 
@@ -4384,6 +4382,12 @@
           lowerVECTOR_SHUFFLEAsVSlidedown(DL, VT, V1, V2, Mask, Subtarget, DAG))
     return V;
 
+  // A bitrotate will be one instruction on zvbb, so try to lower to it first if
+  // available.
+  if (Subtarget.hasStdExtZvbb())
+    if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
+      return V;
+
   // Lower rotations to a SLIDEDOWN and a SLIDEUP. One of the source vectors may
   // be undef which can be handled with a single SLIDEDOWN/UP.
   int LoSrc, HiSrc;
@@ -4510,6 +4514,12 @@
   if (IsSelect)
     return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2);
 
+  // We might be able to express the shuffle as a bitrotate. But even if we
+  // don't have zvbb and have to expand, the expanded sequence of approx. 2
+  // shifts and a vor will have a higher throughput than a vrgather.
+  if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
+    return V;
+
   if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) {
     // On such a large vector we're unable to use i8 as the index type.
     // FIXME: We could promote the index to i16 and use vrgatherei16, but that
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll
@@ -787,23 +787,13 @@
 ; This interleaves the first 2 elements of a vector in opposite order, with
 ; undefs for the remaining elements. We used to miscompile this.
 define <4 x i8> @unary_interleave_10uu_v4i8(<4 x i8> %x) {
-; V128-LABEL: unary_interleave_10uu_v4i8:
-; V128:       # %bb.0:
-; V128-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; V128-NEXT:    vid.v v9
-; V128-NEXT:    vrsub.vi v10, v9, 1
-; V128-NEXT:    vrgather.vv v9, v8, v10
-; V128-NEXT:    vmv1r.v v8, v9
-; V128-NEXT:    ret
-;
-; V512-LABEL: unary_interleave_10uu_v4i8:
-; V512:       # %bb.0:
-; V512-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
-; V512-NEXT:    vid.v v9
-; V512-NEXT:    vrsub.vi v10, v9, 1
-; V512-NEXT:    vrgather.vv v9, v8, v10
-; V512-NEXT:    vmv1r.v v8, v9
-; V512-NEXT:    ret
+; CHECK-LABEL: unary_interleave_10uu_v4i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; CHECK-NEXT:    vsrl.vi v9, v8, 8
+; CHECK-NEXT:    vsll.vi v8, v8, 8
+; CHECK-NEXT:    vor.vv v8, v8, v9
+; CHECK-NEXT:    ret
   %a = shufflevector <4 x i8> %x, <4 x i8> poison, <4 x i32> <i32 1, i32 0, i32 undef, i32 undef>
   ret <4 x i8> %a
 }
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-rotate.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
-; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK
+; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK,RV64
 ; RUN: llc -mtriple=riscv32 -mattr=+v,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB-V
 ; RUN: llc -mtriple=riscv64 -mattr=+v,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB-V
 ; RUN: llc -mtriple=riscv32 -mattr=+zve32x,+zvfh,+experimental-zvbb -verify-machineinstrs < %s | FileCheck %s -check-prefixes=ZVBB-ZVE32X
@@ -9,12 +9,10 @@
 define <8 x i1> @shuffle_v8i1_as_i8_1(<8 x i1> %v) {
 ; CHECK-LABEL: shuffle_v8i1_as_i8_1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vslidedown.vi v9, v8, 1
-; CHECK-NEXT:    vslideup.vi v9, v8, 7
-; CHECK-NEXT:    vmsne.vi v0, v9, 0
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsrl.vi v8, v0, 1
+; CHECK-NEXT:    vsll.vi v9, v0, 7
+; CHECK-NEXT:    vor.vv v0, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-V-LABEL: shuffle_v8i1_as_i8_1:
@@ -35,12 +33,10 @@
 define <8 x i1> @shuffle_v8i1_as_i8_2(<8 x i1> %v) {
 ; CHECK-LABEL: shuffle_v8i1_as_i8_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vslidedown.vi v9, v8, 2
-; CHECK-NEXT:    vslideup.vi v9, v8, 6
-; CHECK-NEXT:    vmsne.vi v0, v9, 0
+; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
+; CHECK-NEXT:    vsrl.vi v8, v0, 2
+; CHECK-NEXT:    vsll.vi v9, v0, 6
+; CHECK-NEXT:    vor.vv v0, v9, v8
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-V-LABEL: shuffle_v8i1_as_i8_2:
@@ -61,12 +57,10 @@
 define <8 x i1> @shuffle_v8i1_as_i8_3(<8 x i1> %v) {
 ; CHECK-LABEL: shuffle_v8i1_as_i8_3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vslidedown.vi v9, v8, 3
-; CHECK-NEXT:    vslideup.vi v9, v8, 5
-; CHECK-NEXT:
vmsne.vi v0, v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vsrl.vi v8, v0, 3 +; CHECK-NEXT: vsll.vi v9, v0, 5 +; CHECK-NEXT: vor.vv v0, v9, v8 ; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i1_as_i8_3: @@ -87,12 +81,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_4(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_4: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vslidedown.vi v9, v8, 4 -; CHECK-NEXT: vslideup.vi v9, v8, 4 -; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vsrl.vi v8, v0, 4 +; CHECK-NEXT: vsll.vi v9, v0, 4 +; CHECK-NEXT: vor.vv v0, v9, v8 ; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i1_as_i8_4: @@ -113,12 +105,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_5(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_5: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vslidedown.vi v9, v8, 5 -; CHECK-NEXT: vslideup.vi v9, v8, 3 -; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vsrl.vi v8, v0, 5 +; CHECK-NEXT: vsll.vi v9, v0, 3 +; CHECK-NEXT: vor.vv v0, v9, v8 ; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i1_as_i8_5: @@ -139,12 +129,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_6(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_6: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vslidedown.vi v9, v8, 6 -; CHECK-NEXT: vslideup.vi v9, v8, 2 -; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vsrl.vi v8, v0, 6 +; CHECK-NEXT: vsll.vi v9, v0, 2 +; CHECK-NEXT: vor.vv v0, v9, v8 ; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i1_as_i8_6: @@ -165,12 +153,10 @@ define <8 x i1> @shuffle_v8i1_as_i8_7(<8 x i1> %v) { ; CHECK-LABEL: shuffle_v8i1_as_i8_7: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vmv.v.i v8, 0 -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vslidedown.vi v9, v8, 7 -; CHECK-NEXT: vslideup.vi v9, v8, 1 -; CHECK-NEXT: vmsne.vi v0, v9, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vsrl.vi v8, v0, 7 +; CHECK-NEXT: vadd.vv v9, v0, v0 +; CHECK-NEXT: vor.vv v0, v9, v8 ; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i1_as_i8_7: @@ -191,12 +177,10 @@ define <8 x i8> @shuffle_v8i8_as_i16(<8 x i8> %v) { ; CHECK-LABEL: shuffle_v8i8_as_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI7_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI7_0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i8_as_i16: @@ -217,12 +201,10 @@ define <8 x i8> @shuffle_v8i8_as_i32_8(<8 x i8> %v) { ; CHECK-LABEL: shuffle_v8i8_as_i32_8: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI8_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI8_0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 8 +; CHECK-NEXT: vsll.vi v8, v8, 24 +; CHECK-NEXT: vor.vv v8, v8, v9 
; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i8_as_i32_8: @@ -243,12 +225,10 @@ define <8 x i8> @shuffle_v8i8_as_i32_16(<8 x i8> %v) { ; CHECK-LABEL: shuffle_v8i8_as_i32_16: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI9_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI9_0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 16 +; CHECK-NEXT: vsll.vi v8, v8, 16 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i8_as_i32_16: @@ -269,12 +249,10 @@ define <8 x i8> @shuffle_v8i8_as_i32_24(<8 x i8> %v) { ; CHECK-LABEL: shuffle_v8i8_as_i32_24: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI10_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI10_0) -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv1r.v v8, v9 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 24 +; CHECK-NEXT: vsll.vi v8, v8, 8 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i8_as_i32_24: @@ -477,12 +455,10 @@ define <8 x i16> @shuffle_v8i16_as_i32(<8 x i16> %v) { ; CHECK-LABEL: shuffle_v8i16_as_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI18_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI18_0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 16 +; CHECK-NEXT: vsll.vi v8, v8, 16 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i16_as_i32: @@ -501,15 +477,28 @@ } define <8 x i16> @shuffle_v8i16_as_i64_16(<8 x i16> %v) { -; CHECK-LABEL: shuffle_v8i16_as_i64_16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI19_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv.v.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: shuffle_v8i16_as_i64_16: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 48 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: shuffle_v8i16_as_i64_16: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 48 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vsll.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 16 +; RV64-NEXT: vor.vv v8, v9, v8 +; RV64-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i16_as_i64_16: ; ZVBB-V: # %bb.0: @@ -531,15 +520,28 @@ } define <8 x i16> @shuffle_v8i16_as_i64_32(<8 x i16> %v) { -; CHECK-LABEL: shuffle_v8i16_as_i64_32: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI20_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI20_0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv.v.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: shuffle_v8i16_as_i64_32: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vrsub.vi 
v9, v9, 0 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: shuffle_v8i16_as_i64_32: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i16_as_i64_32: ; ZVBB-V: # %bb.0: @@ -561,15 +563,28 @@ } define <8 x i16> @shuffle_v8i16_as_i64_48(<8 x i16> %v) { -; CHECK-LABEL: shuffle_v8i16_as_i64_48: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI21_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI21_0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv.v.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: shuffle_v8i16_as_i64_48: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: shuffle_v8i16_as_i64_48: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 48 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: vsll.vi v8, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i16_as_i64_48: ; ZVBB-V: # %bb.0: @@ -591,15 +606,28 @@ } define <8 x i32> @shuffle_v8i32_as_i64(<8 x i32> %v) { -; CHECK-LABEL: shuffle_v8i32_as_i64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI22_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI22_0) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 -; CHECK-NEXT: ret +; RV32-LABEL: shuffle_v8i32_as_i64: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v12, v10, a0 +; RV32-NEXT: vsll.vv v12, v8, v12 +; RV32-NEXT: vrsub.vi v10, v10, 0 +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: shuffle_v8i32_as_i64: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8i32_as_i64: ; ZVBB-V: # %bb.0: @@ -623,12 +651,10 @@ define <8 x half> @shuffle_v8f16_as_i32(<8 x half> %v) { ; CHECK-LABEL: shuffle_v8f16_as_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI23_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI23_0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv.v.v v8, v9 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsrl.vi v9, v8, 16 +; CHECK-NEXT: vsll.vi v8, v8, 16 +; CHECK-NEXT: vor.vv v8, v8, v9 ; CHECK-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8f16_as_i32: @@ -647,15 +673,28 @@ } define <8 x half> @shuffle_v8f16_as_i64_16(<8 x half> %v) { -; CHECK-LABEL: shuffle_v8f16_as_i64_16: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI24_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI24_0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vrgather.vv 
v9, v8, v10 -; CHECK-NEXT: vmv.v.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: shuffle_v8f16_as_i64_16: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 48 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: shuffle_v8f16_as_i64_16: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 48 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vsll.vx v9, v8, a0 +; RV64-NEXT: vsrl.vi v8, v8, 16 +; RV64-NEXT: vor.vv v8, v9, v8 +; RV64-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8f16_as_i64_16: ; ZVBB-V: # %bb.0: @@ -677,15 +716,28 @@ } define <8 x half> @shuffle_v8f16_as_i64_32(<8 x half> %v) { -; CHECK-LABEL: shuffle_v8f16_as_i64_32: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI25_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI25_0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv.v.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: shuffle_v8f16_as_i64_32: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: shuffle_v8f16_as_i64_32: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8f16_as_i64_32: ; ZVBB-V: # %bb.0: @@ -707,15 +759,28 @@ } define <8 x half> @shuffle_v8f16_as_i64_48(<8 x half> %v) { -; CHECK-LABEL: shuffle_v8f16_as_i64_48: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI26_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI26_0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vle16.v v10, (a0) -; CHECK-NEXT: vrgather.vv v9, v8, v10 -; CHECK-NEXT: vmv.v.v v8, v9 -; CHECK-NEXT: ret +; RV32-LABEL: shuffle_v8f16_as_i64_48: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 16 +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vmv.v.x v9, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v10, v9, a0 +; RV32-NEXT: vsll.vv v10, v8, v10 +; RV32-NEXT: vrsub.vi v9, v9, 0 +; RV32-NEXT: vand.vx v9, v9, a0 +; RV32-NEXT: vsrl.vv v8, v8, v9 +; RV32-NEXT: vor.vv v8, v10, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: shuffle_v8f16_as_i64_48: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 48 +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: vsrl.vx v9, v8, a0 +; RV64-NEXT: vsll.vi v8, v8, 16 +; RV64-NEXT: vor.vv v8, v8, v9 +; RV64-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8f16_as_i64_48: ; ZVBB-V: # %bb.0: @@ -737,15 +802,28 @@ } define <8 x float> @shuffle_v8f32_as_i64(<8 x float> %v) { -; CHECK-LABEL: shuffle_v8f32_as_i64: -; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI27_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI27_0) -; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v12, (a0) -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 -; CHECK-NEXT: ret +; RV32-LABEL: shuffle_v8f32_as_i64: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: 
li a0, 63 +; RV32-NEXT: vand.vx v12, v10, a0 +; RV32-NEXT: vsll.vv v12, v8, v12 +; RV32-NEXT: vrsub.vi v10, v10, 0 +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: shuffle_v8f32_as_i64: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret ; ; ZVBB-V-LABEL: shuffle_v8f32_as_i64: ; ZVBB-V: # %bb.0: