diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -660,6 +660,16 @@
   /// returns the address of that location. Otherwise, returns nullptr.
   Value *getIRStackGuard(IRBuilderBase &IRB) const override;
 
+  unsigned getMaxSupportedInterleaveFactor() const override { return 8; }
+
+  bool lowerInterleavedLoad(LoadInst *LI,
+                            ArrayRef<ShuffleVectorInst *> Shuffles,
+                            ArrayRef<unsigned> Indices,
+                            unsigned Factor) const override;
+
+  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+                             unsigned Factor) const override;
+
 private:
   /// RISCVCCAssignFn - This target-specific function extends the default
   /// CCValAssign with additional information used to lower RISC-V calling
@@ -794,6 +804,11 @@
   /// For available scheduling models FDIV + two independent FMULs are much
   /// faster than two FDIVs.
   unsigned combineRepeatedFPDivisors() const override;
+
+  /// Returns whether or not generating a fixed length interleaved load/store
+  /// intrinsic for this type will be legal.
+  bool isLegalInterleavedAccessType(FixedVectorType *, unsigned Factor,
+                                    bool IsVsseg, const DataLayout &) const;
 };
 
 namespace RISCVVIntrinsicsTable {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18,9 +18,11 @@
 #include "RISCVRegisterInfo.h"
 #include "RISCVSubtarget.h"
 #include "RISCVTargetMachine.h"
+#include "RISCVISelDAGToDAG.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -1799,7 +1801,7 @@
 
 bool RISCVTargetLowering::isLegalElementTypeForRVV(Type *ScalarTy) const {
   if (ScalarTy->isPointerTy())
-    return true;
+    return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true;
 
   if (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
       ScalarTy->isIntegerTy(32))
@@ -14856,6 +14858,137 @@
   return TargetLowering::getIRStackGuard(IRB);
 }
 
+bool RISCVTargetLowering::isLegalInterleavedAccessType(
+    FixedVectorType *VTy, unsigned Factor, bool IsVsseg,
+    const DataLayout &DL) const {
+  if (!Subtarget.useRVVForFixedLengthVectors())
+    return false;
+  if (!isLegalElementTypeForRVV(VTy->getElementType()))
+    return false;
+  EVT VT = getValueType(DL, VTy);
+  // Don't lower vlseg/vsseg for fixed length vector types that can't be split.
+  if (!isTypeLegal(VT))
+    return false;
+  // Sometimes the interleaved access pass picks up splats as interleaves of
+  // one element. Don't lower these.
+  if (VTy->getNumElements() < 2)
+    return false;
+
+  // Need to make sure that EMUL * NFIELDS ≤ 8.
+  MVT ContainerVT = getContainerForFixedLengthVector(VT.getSimpleVT());
+  unsigned LMUL = RISCVVType::decodeVLMUL(getLMUL(ContainerVT)).first;
+  if (LMUL * Factor > 8)
+    return false;
+
+  return true;
+}
+
+/// Lower an interleaved load into a vlsegN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+///   %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+///   %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
+///   %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
+///
+/// Into:
+///   %ld2 = { <4 x i32>, <4 x i32> } call llvm.riscv.vlseg.v4i32.p0.i64(
+///                                        %ptr, i64 4)
+///   %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
+///   %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
+bool RISCVTargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  IRBuilder<> Builder(LI);
+
+  auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
+  if (!isLegalInterleavedAccessType(VTy, Factor, /*IsVsseg=*/false,
+                                    LI->getModule()->getDataLayout()))
+    return false;
+
+  auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+
+  static const Intrinsic::ID FixedLenIntrIds[] = {
+      Intrinsic::riscv_seg2_load, Intrinsic::riscv_seg3_load,
+      Intrinsic::riscv_seg4_load, Intrinsic::riscv_seg5_load,
+      Intrinsic::riscv_seg6_load, Intrinsic::riscv_seg7_load,
+      Intrinsic::riscv_seg8_load};
+  Function *VlsegNFunc =
+      Intrinsic::getDeclaration(LI->getModule(), FixedLenIntrIds[Factor - 2],
+                                {VTy, LI->getPointerOperandType(), XLenTy});
+
+  Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
+
+  CallInst *VlsegN =
+      Builder.CreateCall(VlsegNFunc, {LI->getPointerOperand(), VL});
+
+  for (unsigned i = 0; i < Shuffles.size(); i++) {
+    Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
+    Shuffles[i]->replaceAllUsesWith(SubVec);
+  }
+
+  return true;
+}
+
+/// Lower an interleaved store into a vssegN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 3):
+///   %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+///            <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+///   store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+///   %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
+///   %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
+///   %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
+///   call void llvm.riscv.vsseg3.v4i32.p0.i64(%sub.v0, %sub.v1, %sub.v2,
+///                                            %ptr, i32 4)
+///
+/// Note that the new shufflevectors will be removed and we'll only generate
+/// one vsseg3 instruction in CodeGen.
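+//
+// Worked example of the mask decomposition done below (illustrative only):
+// for the factor-3 case above, SVI's shuffle mask is
+// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>, so Mask[0], Mask[1] and Mask[2] are
+// 0, 4 and 8, and createSequentialMask(Mask[i], 4, 0) in the loop produces
+// <0, 1, 2, 3>, <4, 5, 6, 7> and <8, 9, 10, 11>, i.e. the %sub.v0, %sub.v1
+// and %sub.v2 operands shown in the comment.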
+bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
+                                                ShuffleVectorInst *SVI,
+                                                unsigned Factor) const {
+  IRBuilder<> Builder(SI);
+  auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
+  // Given SVI : <n*factor x ty>, then VTy : <n x ty>
+  auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
+                                   ShuffleVTy->getNumElements() / Factor);
+  if (!isLegalInterleavedAccessType(VTy, Factor, /*IsVsseg=*/true,
+                                    SI->getModule()->getDataLayout()))
+    return false;
+
+  auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+
+  static const Intrinsic::ID FixedLenIntrIds[] = {
+      Intrinsic::riscv_seg2_store, Intrinsic::riscv_seg3_store,
+      Intrinsic::riscv_seg4_store, Intrinsic::riscv_seg5_store,
+      Intrinsic::riscv_seg6_store, Intrinsic::riscv_seg7_store,
+      Intrinsic::riscv_seg8_store};
+
+  Function *VssegNFunc =
+      Intrinsic::getDeclaration(SI->getModule(), FixedLenIntrIds[Factor - 2],
+                                {VTy, SI->getPointerOperandType(), XLenTy});
+
+  auto Mask = SVI->getShuffleMask();
+  SmallVector<Value *, 10> Ops;
+
+  for (unsigned i = 0; i < Factor; i++) {
+    Value *Shuffle = Builder.CreateShuffleVector(
+        SVI->getOperand(0), SVI->getOperand(1),
+        createSequentialMask(Mask[i], VTy->getNumElements(), 0));
+    Ops.push_back(Shuffle);
+  }
+  // This VL should be OK (should be executable in one vsseg instruction,
+  // potentially under larger LMULs) because we checked that the fixed vector
+  // type fits in isLegalInterleavedAccessType.
+  Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
+  Ops.append({SI->getPointerOperand(), VL});
+
+  Builder.CreateCall(VssegNFunc, Ops);
+
+  return true;
+}
+
 #define GET_REGISTER_MATCHER
 #include "RISCVGenAsmMatcher.inc"
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -278,6 +278,9 @@
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createRISCVGatherScatterLoweringPass());
 
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createInterleavedAccessPass());
+
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createRISCVCodeGenPreparePass());
 
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -29,6 +29,7 @@
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: RISCV gather/scatter lowering
+; CHECK-NEXT: Interleaved Access Pass
 ; CHECK-NEXT: RISCV CodeGenPrepare
 ; CHECK-NEXT: Module Verifier
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -0,0 +1,79 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zve32x,+zvl1024b -O2 | FileCheck %s -check-prefix=ZVE32X
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zve64x,+zvl1024b -O2 | FileCheck %s -check-prefix=ZVE64X
+
+; This checks to make sure that ptr types aren't lowered if XLEN isn't a
+; supported SEW
+
+define <4 x i1> @load_large_vector(ptr %p) {
+; ZVE32X-LABEL: load_large_vector:
+; ZVE32X: # %bb.0:
+; ZVE32X-NEXT: addi sp, sp, -16
+; ZVE32X-NEXT: .cfi_def_cfa_offset 16
+; ZVE32X-NEXT: ld a1, 80(a0)
+; ZVE32X-NEXT: ld a2, 72(a0)
+; ZVE32X-NEXT: ld a3, 56(a0)
+; ZVE32X-NEXT: ld a4, 
32(a0) +; ZVE32X-NEXT: ld a5, 24(a0) +; ZVE32X-NEXT: ld a6, 48(a0) +; ZVE32X-NEXT: ld a7, 8(a0) +; ZVE32X-NEXT: ld a0, 0(a0) +; ZVE32X-NEXT: xor a4, a5, a4 +; ZVE32X-NEXT: snez a4, a4 +; ZVE32X-NEXT: sb a4, 12(sp) +; ZVE32X-NEXT: xor a0, a0, a7 +; ZVE32X-NEXT: snez a0, a0 +; ZVE32X-NEXT: sb a0, 15(sp) +; ZVE32X-NEXT: xor a0, a6, a3 +; ZVE32X-NEXT: snez a0, a0 +; ZVE32X-NEXT: sb a0, 13(sp) +; ZVE32X-NEXT: xor a1, a2, a1 +; ZVE32X-NEXT: snez a0, a1 +; ZVE32X-NEXT: sb a0, 14(sp) +; ZVE32X-NEXT: addi a0, sp, 12 +; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVE32X-NEXT: vlm.v v0, (a0) +; ZVE32X-NEXT: addi a0, sp, 15 +; ZVE32X-NEXT: vlm.v v8, (a0) +; ZVE32X-NEXT: vmv.v.i v9, 0 +; ZVE32X-NEXT: vmerge.vim v10, v9, 1, v0 +; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vmv.v.i v11, 0 +; ZVE32X-NEXT: vmv1r.v v0, v8 +; ZVE32X-NEXT: vmerge.vim v8, v11, 1, v0 +; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; ZVE32X-NEXT: vslideup.vi v8, v10, 1 +; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vmsne.vi v0, v8, 0 +; ZVE32X-NEXT: vmerge.vim v8, v11, 1, v0 +; ZVE32X-NEXT: addi a0, sp, 13 +; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVE32X-NEXT: vlm.v v0, (a0) +; ZVE32X-NEXT: vmerge.vim v10, v9, 1, v0 +; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma +; ZVE32X-NEXT: vslideup.vi v8, v10, 2 +; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vmsne.vi v0, v8, 0 +; ZVE32X-NEXT: vmerge.vim v8, v11, 1, v0 +; ZVE32X-NEXT: addi a0, sp, 14 +; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVE32X-NEXT: vlm.v v0, (a0) +; ZVE32X-NEXT: vmerge.vim v9, v9, 1, v0 +; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vslideup.vi v8, v9, 3 +; ZVE32X-NEXT: vmsne.vi v0, v8, 0 +; ZVE32X-NEXT: addi sp, sp, 16 +; ZVE32X-NEXT: ret +; +; ZVE64X-LABEL: load_large_vector: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; ZVE64X-NEXT: vlseg3e64.v v8, (a0) +; ZVE64X-NEXT: vmsne.vv v0, v8, v9 +; ZVE64X-NEXT: ret + %l = load <12 x ptr>, ptr %p + %s1 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> + %s2 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> + %ret = icmp ne <4 x ptr> %s1, %s2 + ret <4 x i1> %ret +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zvl32b.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zvl32b.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zvl32b.ll @@ -0,0 +1,377 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+zve32x,+zvl32b -O2 | FileCheck %s -check-prefix=ZVL32B +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+zve32x,+zvl64b -O2 | FileCheck %s -check-prefix=ZVL64B +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+zve32x,+zvl128b -O2 | FileCheck %s -check-prefix=ZVL128B +; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+zve32x,+zvl256b -O2 | FileCheck %s -check-prefix=ZVL256B + +; Make sure that we don't lower interleaved loads that won't fit into the minimum vlen +; In the test case below we are trying to load two 16 * 32 = 512 bit vectors +; +; With a minimum vlen of 32, we have 32 registers * 32 bits = 1024 bits total +; We can't use LMUL=8 since v0-v7 is used for the mask, so we really only have 768 bits +; +; With a minimum vlen of 64, we can't use LMUL=8 because in 7.8 of the spec: +; > The EMUL setting must be such that EMUL * NFIELDS ≤ 8, otherwise the +; > instruction encoding is reserved. 
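+;
+; Worked example of that rule for the test below, which extracts two
+; <16 x i32> fields (512 bits each): with a minimum VLEN of 64, each field
+; needs EMUL = 512 / 64 = 8, so EMUL * NFIELDS = 8 * 2 = 16 > 8 and a vlseg2
+; cannot be used. Only from VLEN = 128 upwards (EMUL = 4, and 4 * 2 = 8) do
+; the checks below expect a vlseg2e32.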
+define {<16 x i32>, <16 x i32>} @load_factor2_large(ptr %ptr) { +; ZVL32B-LABEL: load_factor2_large: +; ZVL32B: # %bb.0: +; ZVL32B-NEXT: addi sp, sp, -16 +; ZVL32B-NEXT: .cfi_def_cfa_offset 16 +; ZVL32B-NEXT: csrr a2, vlenb +; ZVL32B-NEXT: li a3, 72 +; ZVL32B-NEXT: mul a2, a2, a3 +; ZVL32B-NEXT: sub sp, sp, a2 +; ZVL32B-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 72 * vlenb +; ZVL32B-NEXT: addi a2, a1, 96 +; ZVL32B-NEXT: vsetivli zero, 8, e32, m8, ta, mu +; ZVL32B-NEXT: vle32.v v8, (a2) +; ZVL32B-NEXT: csrr a2, vlenb +; ZVL32B-NEXT: li a3, 40 +; ZVL32B-NEXT: mul a2, a2, a3 +; ZVL32B-NEXT: add a2, sp, a2 +; ZVL32B-NEXT: addi a2, a2, 16 +; ZVL32B-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVL32B-NEXT: addi a2, a1, 32 +; ZVL32B-NEXT: vle32.v v8, (a2) +; ZVL32B-NEXT: csrr a2, vlenb +; ZVL32B-NEXT: slli a2, a2, 5 +; ZVL32B-NEXT: add a2, sp, a2 +; ZVL32B-NEXT: addi a2, a2, 16 +; ZVL32B-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVL32B-NEXT: addi a2, a1, 64 +; ZVL32B-NEXT: vle32.v v24, (a1) +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: slli a1, a1, 3 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: vid.v v8 +; ZVL32B-NEXT: vadd.vv v8, v8, v8 +; ZVL32B-NEXT: li a1, 240 +; ZVL32B-NEXT: vmv.s.x v0, a1 +; ZVL32B-NEXT: vle32.v v16, (a2) +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 24 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: vmv.v.v v16, v8 +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 56 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: vrgather.vv v8, v24, v16 +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: slli a1, a1, 6 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: vadd.vi v8, v16, -8 +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 48 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: slli a1, a1, 5 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 48 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: slli a1, a1, 6 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vrgather.vv v8, v24, v16, v0.t +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: slli a1, a1, 6 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 24 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 56 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; 
ZVL32B-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vrgather.vv v24, v16, v8 +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 40 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 48 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vrgather.vv v24, v16, v8, v0.t +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: slli a1, a1, 4 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 56 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vadd.vi v8, v8, 1 +; ZVL32B-NEXT: addi a1, sp, 16 +; ZVL32B-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: slli a1, a1, 3 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vrgather.vv v16, v24, v8 +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 48 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 56 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vadd.vi v24, v8, -7 +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 56 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: slli a1, a1, 5 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 48 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vrgather.vv v16, v8, v24, v0.t +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 48 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 24 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: addi a1, sp, 16 +; ZVL32B-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vrgather.vv v16, v24, v8 +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 40 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vmv.v.v v8, v16 +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: li a2, 56 +; ZVL32B-NEXT: mul a1, a1, a2 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vrgather.vv v8, v24, v16, v0.t +; ZVL32B-NEXT: addi a1, a0, 96 +; ZVL32B-NEXT: 
vse32.v v8, (a1) +; ZVL32B-NEXT: addi a1, a0, 64 +; ZVL32B-NEXT: csrr a2, vlenb +; ZVL32B-NEXT: li a3, 48 +; ZVL32B-NEXT: mul a2, a2, a3 +; ZVL32B-NEXT: add a2, sp, a2 +; ZVL32B-NEXT: addi a2, a2, 16 +; ZVL32B-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVL32B-NEXT: vse32.v v8, (a1) +; ZVL32B-NEXT: addi a1, a0, 32 +; ZVL32B-NEXT: csrr a2, vlenb +; ZVL32B-NEXT: slli a2, a2, 4 +; ZVL32B-NEXT: add a2, sp, a2 +; ZVL32B-NEXT: addi a2, a2, 16 +; ZVL32B-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVL32B-NEXT: vse32.v v8, (a1) +; ZVL32B-NEXT: csrr a1, vlenb +; ZVL32B-NEXT: slli a1, a1, 6 +; ZVL32B-NEXT: add a1, sp, a1 +; ZVL32B-NEXT: addi a1, a1, 16 +; ZVL32B-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; ZVL32B-NEXT: vse32.v v8, (a0) +; ZVL32B-NEXT: csrr a0, vlenb +; ZVL32B-NEXT: li a1, 72 +; ZVL32B-NEXT: mul a0, a0, a1 +; ZVL32B-NEXT: add sp, sp, a0 +; ZVL32B-NEXT: addi sp, sp, 16 +; ZVL32B-NEXT: ret +; +; ZVL64B-LABEL: load_factor2_large: +; ZVL64B: # %bb.0: +; ZVL64B-NEXT: addi sp, sp, -16 +; ZVL64B-NEXT: .cfi_def_cfa_offset 16 +; ZVL64B-NEXT: csrr a1, vlenb +; ZVL64B-NEXT: li a2, 42 +; ZVL64B-NEXT: mul a1, a1, a2 +; ZVL64B-NEXT: sub sp, sp, a1 +; ZVL64B-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb +; ZVL64B-NEXT: addi a1, a0, 64 +; ZVL64B-NEXT: vsetivli zero, 16, e32, m8, ta, mu +; ZVL64B-NEXT: vle32.v v0, (a0) +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: slli a2, a0, 4 +; ZVL64B-NEXT: add a0, a2, a0 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; ZVL64B-NEXT: vle32.v v8, (a1) +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: li a1, 25 +; ZVL64B-NEXT: mul a0, a0, a1 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVL64B-NEXT: vid.v v8 +; ZVL64B-NEXT: vadd.vv v16, v8, v8 +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVL64B-NEXT: vrgather.vv v8, v0, v16 +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: slli a1, a0, 5 +; ZVL64B-NEXT: add a0, a1, a0 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVL64B-NEXT: lui a0, 16 +; ZVL64B-NEXT: addi a0, a0, -256 +; ZVL64B-NEXT: vmv.s.x v0, a0 +; ZVL64B-NEXT: addi a0, sp, 16 +; ZVL64B-NEXT: vs1r.v v0, (a0) # Unknown-size Folded Spill +; ZVL64B-NEXT: vadd.vi v8, v16, -16 +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: slli a1, a0, 3 +; ZVL64B-NEXT: add a0, a1, a0 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: slli a1, a0, 3 +; ZVL64B-NEXT: add a0, a1, a0 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: li a1, 25 +; ZVL64B-NEXT: mul a0, a0, a1 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: slli a1, a0, 5 +; ZVL64B-NEXT: add a0, a1, a0 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVL64B-NEXT: vrgather.vv v16, v8, v24, v0.t +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: 
slli a1, a0, 5 +; ZVL64B-NEXT: add a0, a1, a0 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload +; ZVL64B-NEXT: vadd.vi v24, v0, 1 +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: slli a1, a0, 4 +; ZVL64B-NEXT: add a0, a1, a0 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVL64B-NEXT: vrgather.vv v16, v8, v24 +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: slli a1, a0, 3 +; ZVL64B-NEXT: add a0, a1, a0 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; ZVL64B-NEXT: vadd.vi v8, v0, -15 +; ZVL64B-NEXT: addi a0, sp, 16 +; ZVL64B-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: slli a1, a0, 3 +; ZVL64B-NEXT: add a0, a1, a0 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: li a1, 25 +; ZVL64B-NEXT: mul a0, a0, a1 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVL64B-NEXT: vrgather.vv v16, v24, v8, v0.t +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: slli a1, a0, 5 +; ZVL64B-NEXT: add a0, a1, a0 +; ZVL64B-NEXT: add a0, sp, a0 +; ZVL64B-NEXT: addi a0, a0, 16 +; ZVL64B-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVL64B-NEXT: csrr a0, vlenb +; ZVL64B-NEXT: li a1, 42 +; ZVL64B-NEXT: mul a0, a0, a1 +; ZVL64B-NEXT: add sp, sp, a0 +; ZVL64B-NEXT: addi sp, sp, 16 +; ZVL64B-NEXT: ret +; +; ZVL128B-LABEL: load_factor2_large: +; ZVL128B: # %bb.0: +; ZVL128B-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; ZVL128B-NEXT: vlseg2e32.v v8, (a0) +; ZVL128B-NEXT: ret +; +; ZVL256B-LABEL: load_factor2_large: +; ZVL256B: # %bb.0: +; ZVL256B-NEXT: vsetivli zero, 16, e32, m2, ta, ma +; ZVL256B-NEXT: vlseg2e32.v v8, (a0) +; ZVL256B-NEXT: ret + %interleaved.vec = load <32 x i32>, ptr %ptr + %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <16 x i32> + %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <16 x i32> + %res0 = insertvalue {<16 x i32>, <16 x i32>} undef, <16 x i32> %v0, 0 + %res1 = insertvalue {<16 x i32>, <16 x i32>} %res0, <16 x i32> %v1, 1 + ret {<16 x i32>, <16 x i32>} %res1 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -0,0 +1,46 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v -O2 | FileCheck %s + +; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3 +define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) { +; CHECK-LABEL: load_factor2_v3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vadd.vv v9, v8, v8 +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v10, 4 +; CHECK-NEXT: 
vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t +; CHECK-NEXT: vadd.vi v14, v9, 1 +; CHECK-NEXT: vrgather.vv v9, v10, v14 +; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t +; CHECK-NEXT: ret + %interleaved.vec = load <6 x i32>, ptr %ptr + %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> + %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> + %res0 = insertvalue {<3 x i32>, <3 x i32>} undef, <3 x i32> %v0, 0 + %res1 = insertvalue {<3 x i32>, <3 x i32>} %res0, <3 x i32> %v1, 1 + ret {<3 x i32>, <3 x i32>} %res1 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>} @load_factor3(ptr %ptr) { +; CHECK-LABEL: load_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = load <12 x i32>, ptr %ptr + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -7,10 +7,8 @@ define void @vnsrl_0_i8(ptr %in, ptr %out) { ; CHECK-LABEL: vnsrl_0_i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v8, 0 +; CHECK-NEXT: vlseg2e8.v v8, (a0) ; CHECK-NEXT: vse8.v v8, (a1) ; CHECK-NEXT: ret entry: @@ -23,11 +21,9 @@ define void @vnsrl_8_i8(ptr %in, ptr %out) { ; CHECK-LABEL: vnsrl_8_i8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, ma -; CHECK-NEXT: vle8.v v8, (a0) ; CHECK-NEXT: vsetivli zero, 8, e8, mf4, ta, ma -; CHECK-NEXT: vnsrl.wi v8, v8, 8 -; CHECK-NEXT: vse8.v v8, (a1) +; CHECK-NEXT: vlseg2e8.v v8, (a0) +; CHECK-NEXT: vse8.v v9, (a1) ; CHECK-NEXT: ret entry: %0 = load <16 x i8>, ptr %in, align 1 @@ -39,19 +35,15 @@ define void @vnsrl_0_i16(ptr %in, ptr %out) { ; V-LABEL: vnsrl_0_i16: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; V-NEXT: vle16.v v8, (a0) ; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vlseg2e16.v v8, (a0) ; V-NEXT: vse16.v v8, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_0_i16: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVE32F-NEXT: vle16.v v8, (a0) ; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; ZVE32F-NEXT: vlseg2e16.v v8, (a0) ; ZVE32F-NEXT: vse16.v v8, (a1) ; ZVE32F-NEXT: ret entry: @@ -64,20 +56,16 @@ define void @vnsrl_16_i16(ptr %in, ptr %out) { ; V-LABEL: vnsrl_16_i16: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; V-NEXT: vle16.v v8, (a0) ; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; V-NEXT: vnsrl.wi v8, v8, 16 -; V-NEXT: vse16.v v8, (a1) +; V-NEXT: vlseg2e16.v v8, (a0) +; V-NEXT: vse16.v v9, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_16_i16: ; 
ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVE32F-NEXT: vle16.v v8, (a0) ; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVE32F-NEXT: vnsrl.wi v8, v8, 16 -; ZVE32F-NEXT: vse16.v v8, (a1) +; ZVE32F-NEXT: vlseg2e16.v v8, (a0) +; ZVE32F-NEXT: vse16.v v9, (a1) ; ZVE32F-NEXT: ret entry: %0 = load <8 x i16>, ptr %in, align 2 @@ -89,19 +77,15 @@ define void @vnsrl_0_half(ptr %in, ptr %out) { ; V-LABEL: vnsrl_0_half: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; V-NEXT: vle16.v v8, (a0) ; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vlseg2e16.v v8, (a0) ; V-NEXT: vse16.v v8, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_0_half: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVE32F-NEXT: vle16.v v8, (a0) ; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVE32F-NEXT: vnsrl.wi v8, v8, 0 +; ZVE32F-NEXT: vlseg2e16.v v8, (a0) ; ZVE32F-NEXT: vse16.v v8, (a1) ; ZVE32F-NEXT: ret entry: @@ -114,20 +98,16 @@ define void @vnsrl_16_half(ptr %in, ptr %out) { ; V-LABEL: vnsrl_16_half: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; V-NEXT: vle16.v v8, (a0) ; V-NEXT: vsetivli zero, 4, e16, mf4, ta, ma -; V-NEXT: vnsrl.wi v8, v8, 16 -; V-NEXT: vse16.v v8, (a1) +; V-NEXT: vlseg2e16.v v8, (a0) +; V-NEXT: vse16.v v9, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_16_half: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: vsetivli zero, 8, e16, mf2, ta, ma -; ZVE32F-NEXT: vle16.v v8, (a0) ; ZVE32F-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; ZVE32F-NEXT: vnsrl.wi v8, v8, 16 -; ZVE32F-NEXT: vse16.v v8, (a1) +; ZVE32F-NEXT: vlseg2e16.v v8, (a0) +; ZVE32F-NEXT: vse16.v v9, (a1) ; ZVE32F-NEXT: ret entry: %0 = load <8 x half>, ptr %in, align 2 @@ -139,24 +119,16 @@ define void @vnsrl_0_i32(ptr %in, ptr %out) { ; V-LABEL: vnsrl_0_i32: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 4, e32, mf2, ta, ma -; V-NEXT: vle32.v v8, (a0) ; V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vlseg2e32.v v8, (a0) ; V-NEXT: vse32.v v8, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_0_i32: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZVE32F-NEXT: vle32.v v8, (a0) -; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; ZVE32F-NEXT: li a0, 2 -; ZVE32F-NEXT: vmv.s.x v0, a0 -; ZVE32F-NEXT: vrgather.vi v10, v8, 0 -; ZVE32F-NEXT: vrgather.vi v10, v9, 0, v0.t -; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; ZVE32F-NEXT: vlseg2e32.v v8, (a0) +; ZVE32F-NEXT: vse32.v v8, (a1) ; ZVE32F-NEXT: ret entry: %0 = load <4 x i32>, ptr %in, align 4 @@ -168,25 +140,16 @@ define void @vnsrl_32_i32(ptr %in, ptr %out) { ; V-LABEL: vnsrl_32_i32: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 4, e32, mf2, ta, ma -; V-NEXT: vle32.v v8, (a0) -; V-NEXT: li a0, 32 ; V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; V-NEXT: vnsrl.wx v8, v8, a0 -; V-NEXT: vse32.v v8, (a1) +; V-NEXT: vlseg2e32.v v8, (a0) +; V-NEXT: vse32.v v9, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_32_i32: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZVE32F-NEXT: vle32.v v8, (a0) -; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; ZVE32F-NEXT: li a0, 2 -; ZVE32F-NEXT: vmv.s.x v0, a0 -; ZVE32F-NEXT: vrgather.vi v10, v8, 1 -; ZVE32F-NEXT: vrgather.vi v10, v9, 1, v0.t -; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: vsetivli zero, 2, 
e32, m1, ta, ma +; ZVE32F-NEXT: vlseg2e32.v v8, (a0) +; ZVE32F-NEXT: vse32.v v9, (a1) ; ZVE32F-NEXT: ret entry: %0 = load <4 x i32>, ptr %in, align 4 @@ -198,24 +161,16 @@ define void @vnsrl_0_float(ptr %in, ptr %out) { ; V-LABEL: vnsrl_0_float: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 4, e32, mf2, ta, ma -; V-NEXT: vle32.v v8, (a0) ; V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; V-NEXT: vnsrl.wi v8, v8, 0 +; V-NEXT: vlseg2e32.v v8, (a0) ; V-NEXT: vse32.v v8, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_0_float: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZVE32F-NEXT: vle32.v v8, (a0) -; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; ZVE32F-NEXT: li a0, 2 -; ZVE32F-NEXT: vmv.s.x v0, a0 -; ZVE32F-NEXT: vrgather.vi v10, v8, 0 -; ZVE32F-NEXT: vrgather.vi v10, v9, 0, v0.t -; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; ZVE32F-NEXT: vlseg2e32.v v8, (a0) +; ZVE32F-NEXT: vse32.v v8, (a1) ; ZVE32F-NEXT: ret entry: %0 = load <4 x float>, ptr %in, align 4 @@ -227,25 +182,16 @@ define void @vnsrl_32_float(ptr %in, ptr %out) { ; V-LABEL: vnsrl_32_float: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 4, e32, mf2, ta, ma -; V-NEXT: vle32.v v8, (a0) -; V-NEXT: li a0, 32 ; V-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; V-NEXT: vnsrl.wx v8, v8, a0 -; V-NEXT: vse32.v v8, (a1) +; V-NEXT: vlseg2e32.v v8, (a0) +; V-NEXT: vse32.v v9, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_32_float: ; ZVE32F: # %bb.0: # %entry -; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; ZVE32F-NEXT: vle32.v v8, (a0) -; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 -; ZVE32F-NEXT: li a0, 2 -; ZVE32F-NEXT: vmv.s.x v0, a0 -; ZVE32F-NEXT: vrgather.vi v10, v8, 1 -; ZVE32F-NEXT: vrgather.vi v10, v9, 1, v0.t -; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; ZVE32F-NEXT: vlseg2e32.v v8, (a0) +; ZVE32F-NEXT: vse32.v v9, (a1) ; ZVE32F-NEXT: ret entry: %0 = load <4 x float>, ptr %in, align 4 @@ -257,15 +203,9 @@ define void @vnsrl_0_i64(ptr %in, ptr %out) { ; V-LABEL: vnsrl_0_i64: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma -; V-NEXT: vle64.v v8, (a0) -; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; V-NEXT: vslidedown.vi v9, v8, 2 -; V-NEXT: li a0, 2 -; V-NEXT: vmv.s.x v0, a0 -; V-NEXT: vrgather.vi v10, v8, 0 -; V-NEXT: vrgather.vi v10, v9, 0, v0.t -; V-NEXT: vse64.v v10, (a1) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; V-NEXT: vlseg2e64.v v8, (a0) +; V-NEXT: vse64.v v8, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_0_i64: @@ -285,15 +225,9 @@ define void @vnsrl_64_i64(ptr %in, ptr %out) { ; V-LABEL: vnsrl_64_i64: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma -; V-NEXT: vle64.v v8, (a0) -; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; V-NEXT: vslidedown.vi v9, v8, 2 -; V-NEXT: li a0, 2 -; V-NEXT: vmv.s.x v0, a0 -; V-NEXT: vrgather.vi v10, v8, 1 -; V-NEXT: vrgather.vi v10, v9, 1, v0.t -; V-NEXT: vse64.v v10, (a1) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; V-NEXT: vlseg2e64.v v8, (a0) +; V-NEXT: vse64.v v9, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_64_i64: @@ -313,15 +247,9 @@ define void @vnsrl_0_double(ptr %in, ptr %out) { ; V-LABEL: vnsrl_0_double: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma -; V-NEXT: vle64.v v8, (a0) -; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; V-NEXT: vslidedown.vi v9, v8, 2 -; V-NEXT: li a0, 2 -; V-NEXT: vmv.s.x v0, a0 -; V-NEXT: vrgather.vi 
v10, v8, 0 -; V-NEXT: vrgather.vi v10, v9, 0, v0.t -; V-NEXT: vse64.v v10, (a1) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; V-NEXT: vlseg2e64.v v8, (a0) +; V-NEXT: vse64.v v8, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_0_double: @@ -341,15 +269,9 @@ define void @vnsrl_64_double(ptr %in, ptr %out) { ; V-LABEL: vnsrl_64_double: ; V: # %bb.0: # %entry -; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma -; V-NEXT: vle64.v v8, (a0) -; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; V-NEXT: vslidedown.vi v9, v8, 2 -; V-NEXT: li a0, 2 -; V-NEXT: vmv.s.x v0, a0 -; V-NEXT: vrgather.vi v10, v8, 1 -; V-NEXT: vrgather.vi v10, v9, 1, v0.t -; V-NEXT: vse64.v v10, (a1) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; V-NEXT: vlseg2e64.v v8, (a0) +; V-NEXT: vse64.v v9, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_64_double: diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -0,0 +1,376 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=riscv32 -mattr=+v -interleaved-access -S | FileCheck %s --check-prefix=RV32 +; RUN: opt < %s -mtriple=riscv64 -mattr=+v -interleaved-access -S | FileCheck %s --check-prefix=RV64 + +define void @load_factor2(ptr %ptr) { +; RV32-LABEL: @load_factor2( +; RV32-NEXT: [[TMP1:%.*]] = call { <8 x i32>, <8 x i32> } @llvm.riscv.seg2.load.v8i32.p0.i32(ptr [[PTR:%.*]], i32 8) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor2( +; RV64-NEXT: [[TMP1:%.*]] = call { <8 x i32>, <8 x i32> } @llvm.riscv.seg2.load.v8i32.p0.i64(ptr [[PTR:%.*]], i64 8) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <16 x i32>, ptr %ptr + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> + ret void +} + +define void @load_factor3(ptr %ptr) { +; RV32-LABEL: @load_factor3( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg3.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor3( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg3.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <12 x i32>, ptr %ptr + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + ret void +} + 
+define void @load_factor4(ptr %ptr) { +; RV32-LABEL: @load_factor4( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg4.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor4( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg4.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <16 x i32>, ptr %ptr + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + ret void +} + +define void @load_factor5(ptr %ptr) { +; RV32-LABEL: @load_factor5( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg5.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor5( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg5.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <20 x i32>, ptr %ptr + %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 
x i32> + %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + ret void +} + +define void @load_factor6(ptr %ptr) { +; RV32-LABEL: @load_factor6( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg6.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV32-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor6( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg6.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV64-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <24 x i32>, ptr %ptr + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + %v3 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + %v4 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + %v5 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + ret void +} + +define void @load_factor7(ptr %ptr) { +; RV32-LABEL: @load_factor7( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg7.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 6 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV32-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV32-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP7:%.*]] = extractvalue { 
<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor7( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg7.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 6 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV64-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV64-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <28 x i32>, ptr %ptr + %v0 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v1 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v2 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v3 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v4 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v5 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v6 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + ret void +} + +define void @load_factor8(ptr %ptr) { +; RV32-LABEL: @load_factor8( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg8.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 7 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 6 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV32-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV32-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV32-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor8( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x 
i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg8.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 7 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 6 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV64-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV64-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV64-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <32 x i32>, ptr %ptr + %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v4 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v5 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v6 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v7 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + ret void +} + + +define void @store_factor2(ptr %ptr, <8 x i8> %v0, <8 x i8> %v1) { +; RV32-LABEL: @store_factor2( +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> +; RV32-NEXT: call void @llvm.riscv.seg2.store.v8i8.p0.i32(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr [[PTR:%.*]], i32 8) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor2( +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> +; RV64-NEXT: call void @llvm.riscv.seg2.store.v8i8.p0.i64(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr [[PTR:%.*]], i64 8) +; RV64-NEXT: ret void +; + %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> + store <16 x i8> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { +; RV32-LABEL: @store_factor3( +; RV32-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; RV32-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <8 x i32> +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: call 
void @llvm.riscv.seg3.store.v4i32.p0.i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor3( +; RV64-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; RV64-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <8 x i32> +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: call void @llvm.riscv.seg3.store.v4i32.p0.i64(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: ret void +; + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> + store <12 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; RV32-LABEL: @store_factor4( +; RV32-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; RV32-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: call void @llvm.riscv.seg4.store.v4i32.p0.i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor4( +; RV64-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; RV64-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: call void @llvm.riscv.seg4.store.v4i32.p0.i64(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: ret void +; + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> + store <16 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + + +define void @store_factor2_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1) { +; RV32-LABEL: @store_factor2_wide( +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <8 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <8 x i32> +; RV32-NEXT: call void @llvm.riscv.seg2.store.v8i32.p0.i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], ptr [[PTR:%.*]], i32 8) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor2_wide( +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> 
[[V1:%.*]], <8 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <8 x i32> +; RV64-NEXT: call void @llvm.riscv.seg2.store.v8i32.p0.i64(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], ptr [[PTR:%.*]], i64 8) +; RV64-NEXT: ret void +; + %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + store <16 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_factor3_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) { +; RV32-LABEL: @store_factor3_wide( +; RV32-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> +; RV32-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <16 x i32> +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: call void @llvm.riscv.seg3.store.v8i32.p0.i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], ptr [[PTR:%.*]], i32 8) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor3_wide( +; RV64-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> +; RV64-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <16 x i32> +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: call void @llvm.riscv.seg3.store.v8i32.p0.i64(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], ptr [[PTR:%.*]], i64 8) +; RV64-NEXT: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> + store <24 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_factor4_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) { +; RV32-LABEL: @store_factor4_wide( +; RV32-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> +; RV32-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: call void @llvm.riscv.seg4.store.v8i32.p0.i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], ptr [[PTR:%.*]], i32 8) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor4_wide( +; RV64-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> +; RV64-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> 
[[S0]], <16 x i32> [[S1]], <8 x i32> <i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; RV64-NEXT: call void @llvm.riscv.seg4.store.v8i32.p0.i64(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], ptr [[PTR:%.*]], i64 8)
+; RV64-NEXT: ret void
+;
+ %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+ store <32 x i32> %interleaved.vec, ptr %ptr, align 4
+ ret void
+}
+
+define void @load_factor2_fp128(ptr %ptr) {
+; RV32-LABEL: @load_factor2_fp128(
+; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, ptr [[PTR:%.*]], align 16
+; RV32-NEXT: [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> <i32 0, i32 2>
+; RV32-NEXT: [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
+; RV32-NEXT: ret void
+;
+; RV64-LABEL: @load_factor2_fp128(
+; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, ptr [[PTR:%.*]], align 16
+; RV64-NEXT: [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> <i32 0, i32 2>
+; RV64-NEXT: [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
+; RV64-NEXT: ret void
+;
+ %interleaved.vec = load <4 x fp128>, ptr %ptr, align 16
+ %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 0, i32 2>
+ %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> <i32 1, i32 3>
+ ret void
+}
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/zve32x.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/zve32x.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/RISCV/zve32x.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=riscv64 -mattr=+zve32x,+zvl128b -interleaved-access -S | FileCheck %s -check-prefix=ZVE32X
+; RUN: opt < %s -mtriple=riscv64 -mattr=+zve64x,+zvl128b -interleaved-access -S | FileCheck %s -check-prefix=ZVE64X
+
+define <4 x i1> @load_large_vector(ptr %p) {
+; ZVE32X-LABEL: @load_large_vector(
+; ZVE32X-NEXT: [[L:%.*]] = load <12 x ptr>, ptr [[P:%.*]], align 128
+; ZVE32X-NEXT: [[S1:%.*]] = shufflevector <12 x ptr> [[L]], <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+; ZVE32X-NEXT: [[S2:%.*]] = shufflevector <12 x ptr> [[L]], <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+; ZVE32X-NEXT: [[RET:%.*]] = icmp ne <4 x ptr> [[S1]], [[S2]]
+; ZVE32X-NEXT: ret <4 x i1> [[RET]]
+;
+; ZVE64X-LABEL: @load_large_vector(
+; ZVE64X-NEXT: [[TMP1:%.*]] = call { <4 x ptr>, <4 x ptr>, <4 x ptr> } @llvm.riscv.seg3.load.v4p0.p0.i64(ptr [[P:%.*]], i64 4)
+; ZVE64X-NEXT: [[TMP2:%.*]] = extractvalue { <4 x ptr>, <4 x ptr>, <4 x ptr> } [[TMP1]], 1
+; ZVE64X-NEXT: [[TMP3:%.*]] = extractvalue { <4 x ptr>, <4 x ptr>, <4 x ptr> } [[TMP1]], 0
+; ZVE64X-NEXT: [[RET:%.*]] = icmp ne <4 x ptr> [[TMP3]], [[TMP2]]
+; ZVE64X-NEXT: ret <4 x i1> [[RET]]
+;
+ %l = load <12 x ptr>, ptr %p
+ %s1 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
+ %s2 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
+ %ret = icmp ne <4 x ptr> %s1, %s2
+ ret <4 x i1> %ret
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/zvl32b.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/zvl32b.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/RISCV/zvl32b.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -march=riscv32 -mattr=+zve32x,+zvl32b -interleaved-access -S | FileCheck %s -check-prefix=ZVL32B
+; RUN: opt < %s -mtriple=riscv32 -mattr=+zve32x,+zvl128b -interleaved-access -S | FileCheck %s -check-prefix=ZVL128B
+
+; Make sure that we don't lower interleaved loads that won't fit into the minimum vlen
+
+define {<16 x i32>, <16 x i32>} @load_factor2_large(ptr %ptr) {
+; ZVL32B-LABEL: @load_factor2_large(
+; ZVL32B-NEXT: [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, ptr [[PTR:%.*]], align 128
+; ZVL32B-NEXT: [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+; ZVL32B-NEXT: [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+; ZVL32B-NEXT: [[RES0:%.*]] = insertvalue { <16 x i32>, <16 x i32> } undef, <16 x i32> [[V0]], 0
+; ZVL32B-NEXT: [[RES1:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES0]], <16 x i32> [[V1]], 1
+; ZVL32B-NEXT: ret { <16 x i32>, <16 x i32> } [[RES1]]
+;
+; ZVL128B-LABEL: @load_factor2_large(
+; ZVL128B-NEXT: [[TMP1:%.*]] = call { <16 x i32>, <16 x i32> } @llvm.riscv.seg2.load.v16i32.p0.i32(ptr [[PTR:%.*]], i32 16)
+; ZVL128B-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i32>, <16 x i32> } [[TMP1]], 1
+; ZVL128B-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i32>, <16 x i32> } [[TMP1]], 0
+; ZVL128B-NEXT: [[RES0:%.*]] = insertvalue { <16 x i32>, <16 x i32> } undef, <16 x i32> [[TMP3]], 0
+; ZVL128B-NEXT: [[RES1:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES0]], <16 x i32> [[TMP2]], 1
+; ZVL128B-NEXT: ret { <16 x i32>, <16 x i32> } [[RES1]]
+;
+ %interleaved.vec = load <32 x i32>, ptr %ptr
+ %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ %res0 = insertvalue {<16 x i32>, <16 x i32>} undef, <16 x i32> %v0, 0
+ %res1 = insertvalue {<16 x i32>, <16 x i32>} %res0, <16 x i32> %v1, 1
+ ret {<16 x i32>, <16 x i32>} %res1
+ ; ret void
+}
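For context on the zvl32b.ll test above: with +zve32x,+zvl32b the guaranteed VLEN is only 32 bits, so a fixed <16 x i32> value (512 bits) would need a register group of LMUL = 16, beyond the RVV maximum of 8, and the pass has to leave the plain load/shufflevector sequence alone; with +zvl128b the same value fits in LMUL = 4, and 4 * 2 = 8 still satisfies the segment-access limit EMUL * NFIELDS <= 8, so the factor-2 access is rewritten to a seg2 load. The following is a minimal standalone sketch of that arithmetic only, not code from this patch; the helper name and the whole-register rounding (fractional LMUL is ignored) are illustrative assumptions.

#include <cstdio>

// Illustrative sketch: can a fixed vector of NumElts elements of EltBits bits
// each be used as one field of a Factor-way segment access, given a minimum
// VLEN of MinVLen bits? (Whole-register LMUL only; fractional LMUL ignored.)
static bool fitsSegmentAccess(unsigned NumElts, unsigned EltBits,
                              unsigned MinVLen, unsigned Factor) {
  unsigned TotalBits = NumElts * EltBits;
  unsigned LMUL = (TotalBits + MinVLen - 1) / MinVLen; // round up to a register
  return LMUL <= 8 && LMUL * Factor <= 8;
}

int main() {
  // zvl32b:  <16 x i32> needs LMUL 16 -> not lowered, load/shuffles remain.
  std::printf("%d\n", fitsSegmentAccess(16, 32, 32, 2));  // prints 0
  // zvl128b: <16 x i32> needs LMUL 4, and 4 * 2 <= 8 -> becomes a seg2 load.
  std::printf("%d\n", fitsSegmentAccess(16, 32, 128, 2)); // prints 1
  return 0;
}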