diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -674,6 +674,16 @@
   /// returns the address of that location. Otherwise, returns nullptr.
   Value *getIRStackGuard(IRBuilderBase &IRB) const override;
 
+  unsigned getMaxSupportedInterleaveFactor() const override { return 8; }
+
+  bool lowerInterleavedLoad(LoadInst *LI,
+                            ArrayRef<ShuffleVectorInst *> Shuffles,
+                            ArrayRef<unsigned> Indices,
+                            unsigned Factor) const override;
+
+  bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+                             unsigned Factor) const override;
+
 private:
   /// RISCVCCAssignFn - This target-specific function extends the default
   /// CCValAssign with additional information used to lower RISC-V calling
@@ -810,6 +820,11 @@
   /// For available scheduling models FDIV + two independent FMULs are much
   /// faster than two FDIVs.
   unsigned combineRepeatedFPDivisors() const override;
+
+  /// Returns whether or not generating a fixed length interleaved load/store
+  /// intrinsic for this type will be legal.
+  bool isLegalInterleavedAccessType(FixedVectorType *, unsigned Factor,
+                                    bool IsVsseg, const DataLayout &) const;
 };
 
 namespace RISCVVIntrinsicsTable {
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -1835,7 +1836,7 @@
 
 bool RISCVTargetLowering::isLegalElementTypeForRVV(Type *ScalarTy) const {
   if (ScalarTy->isPointerTy())
-    return true;
+    return Subtarget.is64Bit() ? Subtarget.hasVInstructionsI64() : true;
 
   if (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
       ScalarTy->isIntegerTy(32))
@@ -14947,6 +14948,136 @@
   return TargetLowering::getIRStackGuard(IRB);
 }
 
+bool RISCVTargetLowering::isLegalInterleavedAccessType(
+    FixedVectorType *VTy, unsigned Factor, bool IsVsseg,
+    const DataLayout &DL) const {
+  if (!Subtarget.useRVVForFixedLengthVectors())
+    return false;
+  if (!isLegalElementTypeForRVV(VTy->getElementType()))
+    return false;
+  EVT VT = getValueType(DL, VTy);
+  // Don't lower vlseg/vsseg for fixed length vector types that can't be split.
+  if (!isTypeLegal(VT))
+    return false;
+  // Sometimes the interleaved access pass picks up splats as interleaves of
+  // one element. Don't lower these.
+  if (VTy->getNumElements() < 2)
+    return false;
+
+  // Need to make sure that EMUL * NFIELDS ≤ 8
+  MVT ContainerVT = getContainerForFixedLengthVector(VT.getSimpleVT());
+  auto [LMUL, Fractional] = RISCVVType::decodeVLMUL(getLMUL(ContainerVT));
+  if (Fractional)
+    return Factor / LMUL <= 8;
+  return Factor * LMUL <= 8;
+}
+
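For reference, the EMUL * NFIELDS <= 8 check above works out as follows for the configurations exercised by the new tests. This is a standalone sketch, not part of the patch: it assumes the container LMUL of a legal fixed-length vector is simply its bit width relative to a 128-bit minimum VLEN (the +v baseline), whereas the real code derives it via getContainerForFixedLengthVector and RISCVVType::decodeVLMUL, and the helper names below are hypothetical.

// Standalone sketch of the EMUL * NFIELDS <= 8 rule, assuming container LMUL
// is just vector width relative to a 128-bit minimum VLEN (fractional LMUL is
// modeled as a divisor, mirroring the Factor / LMUL integer division above).
#include <cstdio>

struct SketchLMUL {
  unsigned Val;    // multiplier (m1/m2/m4/m8) or divisor (mf2/mf4/mf8)
  bool Fractional;
};

static SketchLMUL containerLMUL(unsigned VecBits, unsigned MinVLen = 128) {
  return VecBits >= MinVLen ? SketchLMUL{VecBits / MinVLen, false}
                            : SketchLMUL{MinVLen / VecBits, true};
}

static bool segAccessFits(unsigned VecBits, unsigned Factor) {
  SketchLMUL L = containerLMUL(VecBits);
  return L.Fractional ? Factor / L.Val <= 8 : Factor * L.Val <= 8;
}

int main() {
  printf("%d\n", segAccessFits(128, 5)); // <4 x i32>, factor 5: m1 * 5 = 5   -> legal (vlseg5e32)
  printf("%d\n", segAccessFits(32, 6));  // <2 x i16>, factor 6: mf4, 6/4 = 1 -> legal (vlseg6e16)
  printf("%d\n", segAccessFits(512, 6)); // <8 x i64>, factor 6: m4 * 6 = 24  -> rejected (load_factor6_too_big)
}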
+/// Lower an interleaved load into a vlsegN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
+/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
+///
+/// Into:
+/// %ld2 = { <4 x i32>, <4 x i32> } call llvm.riscv.seg2.load.v4i32.p0.i64(
+///                                        %ptr, i64 4)
+/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
+/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
+bool RISCVTargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  IRBuilder<> Builder(LI);
+
+  auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
+  if (!isLegalInterleavedAccessType(VTy, Factor, false,
+                                    LI->getModule()->getDataLayout()))
+    return false;
+
+  auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+
+  static const Intrinsic::ID FixedLenIntrIds[] = {
+      Intrinsic::riscv_seg2_load, Intrinsic::riscv_seg3_load,
+      Intrinsic::riscv_seg4_load, Intrinsic::riscv_seg5_load,
+      Intrinsic::riscv_seg6_load, Intrinsic::riscv_seg7_load,
+      Intrinsic::riscv_seg8_load};
+  Function *VlsegNFunc =
+      Intrinsic::getDeclaration(LI->getModule(), FixedLenIntrIds[Factor - 2],
+                                {VTy, LI->getPointerOperandType(), XLenTy});
+
+  Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
+
+  CallInst *VlsegN =
+      Builder.CreateCall(VlsegNFunc, {LI->getPointerOperand(), VL});
+
+  for (unsigned i = 0; i < Shuffles.size(); i++) {
+    Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
+    Shuffles[i]->replaceAllUsesWith(SubVec);
+  }
+
+  return true;
+}
+
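The Shuffles/Indices pair handled above is produced by the target-independent InterleavedAccess pass (added to the RISC-V pipeline later in this patch): shuffle i extracts field Indices[i] as a strided slice of the wide load, and the hook rewrites it to an extractvalue of the segN load result. Below is a standalone sketch of that mask shape, using a hypothetical helper name rather than LLVM's API:

// Standalone sketch: the de-interleave mask for field `Index` of an access
// with `Factor` fields and `NumElts` elements per field is
//   <Index, Index + Factor, Index + 2*Factor, ...>
// lowerInterleavedLoad above replaces the shuffle carrying this mask with
// `extractvalue %segN_load, Index`.
#include <cstdio>
#include <vector>

static std::vector<int> deinterleaveMask(unsigned Index, unsigned Factor,
                                         unsigned NumElts) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < NumElts; ++i)
    Mask.push_back(Index + i * Factor);
  return Mask;
}

int main() {
  // Factor-3 load of <12 x i32> (see load_factor3 in the new tests):
  // field 0 -> <0,3,6,9>, field 1 -> <1,4,7,10>, field 2 -> <2,5,8,11>.
  for (unsigned Field = 0; Field < 3; ++Field) {
    for (int Elt : deinterleaveMask(Field, 3, 4))
      printf("%d ", Elt);
    printf("\n");
  }
}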
+/// Lower an interleaved store into a vssegN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 3):
+/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+///                  <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+/// store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
+/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
+/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
+/// call void llvm.riscv.seg3.store.v4i32.p0.i64(%sub.v0, %sub.v1, %sub.v2,
+///                                              %ptr, i64 4)
+///
+/// Note that the new shufflevectors will be removed and we'll only generate
+/// one vsseg3 instruction in CodeGen.
+bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
+                                                ShuffleVectorInst *SVI,
+                                                unsigned Factor) const {
+  IRBuilder<> Builder(SI);
+  auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
+  // Given SVI : <n*factor x ty>, then VTy : <n x ty>
+  auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
+                                   ShuffleVTy->getNumElements() / Factor);
+  if (!isLegalInterleavedAccessType(VTy, Factor, true,
+                                    SI->getModule()->getDataLayout()))
+    return false;
+
+  auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+
+  static const Intrinsic::ID FixedLenIntrIds[] = {
+      Intrinsic::riscv_seg2_store, Intrinsic::riscv_seg3_store,
+      Intrinsic::riscv_seg4_store, Intrinsic::riscv_seg5_store,
+      Intrinsic::riscv_seg6_store, Intrinsic::riscv_seg7_store,
+      Intrinsic::riscv_seg8_store};
+
+  Function *VssegNFunc =
+      Intrinsic::getDeclaration(SI->getModule(), FixedLenIntrIds[Factor - 2],
+                                {VTy, SI->getPointerOperandType(), XLenTy});
+
+  auto Mask = SVI->getShuffleMask();
+  SmallVector<Value *, 10> Ops;
+
+  for (unsigned i = 0; i < Factor; i++) {
+    Value *Shuffle = Builder.CreateShuffleVector(
+        SVI->getOperand(0), SVI->getOperand(1),
+        createSequentialMask(Mask[i], VTy->getNumElements(), 0));
+    Ops.push_back(Shuffle);
+  }
+  // This VL should be OK (it should be executable in one vsseg instruction,
+  // potentially under a larger LMUL) because we checked that the fixed vector
+  // type fits in isLegalInterleavedAccessType.
+  Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
+  Ops.append({SI->getPointerOperand(), VL});
+
+  Builder.CreateCall(VssegNFunc, Ops);
+
+  return true;
+}
+
 #define GET_REGISTER_MATCHER
 #include "RISCVGenAsmMatcher.inc"
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -283,6 +283,9 @@
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createRISCVGatherScatterLoweringPass());
 
+  if (getOptLevel() != CodeGenOpt::None)
+    addPass(createInterleavedAccessPass());
+
   if (getOptLevel() != CodeGenOpt::None)
     addPass(createRISCVCodeGenPreparePass());
 
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -29,6 +29,7 @@
 ; CHECK-NEXT: Dominator Tree Construction
 ; CHECK-NEXT: Natural Loop Information
 ; CHECK-NEXT: RISCV gather/scatter lowering
+; CHECK-NEXT: Interleaved Access Pass
 ; CHECK-NEXT: RISCV CodeGenPrepare
 ; CHECK-NEXT: Module Verifier
 ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access-zve32x.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zve32x,+zvl1024b -O2 | FileCheck %s -check-prefix=ZVE32X
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zve64x,+zvl1024b -O2 | FileCheck %s -check-prefix=ZVE64X
+
+; TODO: Currently we don't lower interleaved accesses of ptr types if XLEN isn't
+; a supported SEW. We should improve this with a wide load and a set of shuffles.
+define <4 x i1> @load_large_vector(ptr %p) { +; ZVE32X-LABEL: load_large_vector: +; ZVE32X: # %bb.0: +; ZVE32X-NEXT: addi sp, sp, -16 +; ZVE32X-NEXT: .cfi_def_cfa_offset 16 +; ZVE32X-NEXT: ld a1, 80(a0) +; ZVE32X-NEXT: ld a2, 72(a0) +; ZVE32X-NEXT: ld a3, 56(a0) +; ZVE32X-NEXT: ld a4, 32(a0) +; ZVE32X-NEXT: ld a5, 24(a0) +; ZVE32X-NEXT: ld a6, 48(a0) +; ZVE32X-NEXT: ld a7, 8(a0) +; ZVE32X-NEXT: ld a0, 0(a0) +; ZVE32X-NEXT: xor a4, a5, a4 +; ZVE32X-NEXT: snez a4, a4 +; ZVE32X-NEXT: sb a4, 12(sp) +; ZVE32X-NEXT: xor a0, a0, a7 +; ZVE32X-NEXT: snez a0, a0 +; ZVE32X-NEXT: sb a0, 15(sp) +; ZVE32X-NEXT: xor a0, a6, a3 +; ZVE32X-NEXT: snez a0, a0 +; ZVE32X-NEXT: sb a0, 13(sp) +; ZVE32X-NEXT: xor a1, a2, a1 +; ZVE32X-NEXT: snez a0, a1 +; ZVE32X-NEXT: sb a0, 14(sp) +; ZVE32X-NEXT: addi a0, sp, 12 +; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVE32X-NEXT: vlm.v v0, (a0) +; ZVE32X-NEXT: addi a0, sp, 15 +; ZVE32X-NEXT: vlm.v v8, (a0) +; ZVE32X-NEXT: vmv.v.i v9, 0 +; ZVE32X-NEXT: vmerge.vim v10, v9, 1, v0 +; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vmv.v.i v11, 0 +; ZVE32X-NEXT: vmv1r.v v0, v8 +; ZVE32X-NEXT: vmerge.vim v8, v11, 1, v0 +; ZVE32X-NEXT: vsetivli zero, 2, e8, mf4, tu, ma +; ZVE32X-NEXT: vslideup.vi v8, v10, 1 +; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vmsne.vi v0, v8, 0 +; ZVE32X-NEXT: vmerge.vim v8, v11, 1, v0 +; ZVE32X-NEXT: addi a0, sp, 13 +; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVE32X-NEXT: vlm.v v0, (a0) +; ZVE32X-NEXT: vmerge.vim v10, v9, 1, v0 +; ZVE32X-NEXT: vsetivli zero, 3, e8, mf4, tu, ma +; ZVE32X-NEXT: vslideup.vi v8, v10, 2 +; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vmsne.vi v0, v8, 0 +; ZVE32X-NEXT: vmerge.vim v8, v11, 1, v0 +; ZVE32X-NEXT: addi a0, sp, 14 +; ZVE32X-NEXT: vsetivli zero, 1, e8, mf4, ta, ma +; ZVE32X-NEXT: vlm.v v0, (a0) +; ZVE32X-NEXT: vmerge.vim v9, v9, 1, v0 +; ZVE32X-NEXT: vsetivli zero, 4, e8, mf4, ta, ma +; ZVE32X-NEXT: vslideup.vi v8, v9, 3 +; ZVE32X-NEXT: vmsne.vi v0, v8, 0 +; ZVE32X-NEXT: addi sp, sp, 16 +; ZVE32X-NEXT: ret +; +; ZVE64X-LABEL: load_large_vector: +; ZVE64X: # %bb.0: +; ZVE64X-NEXT: vsetivli zero, 4, e64, m1, ta, ma +; ZVE64X-NEXT: vlseg3e64.v v8, (a0) +; ZVE64X-NEXT: vmsne.vv v0, v8, v9 +; ZVE64X-NEXT: ret + %l = load <12 x ptr>, ptr %p + %s1 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> + %s2 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> + %ret = icmp ne <4 x ptr> %s1, %s2 + ret <4 x i1> %ret +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -0,0 +1,1305 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=riscv32 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV32 %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+v,m -O2 | FileCheck -check-prefixes=CHECK,RV64 %s + +; ------------------------------------------------------------------------------ +; Loads +; ------------------------------------------------------------------------------ + +; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3 +define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) { +; CHECK-LABEL: load_factor2_v3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: 
vid.v v8 +; CHECK-NEXT: vadd.vv v9, v8, v8 +; CHECK-NEXT: vrgather.vv v8, v10, v9 +; CHECK-NEXT: li a0, 4 +; CHECK-NEXT: vmv.s.x v0, a0 +; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v12, v10, 4 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t +; CHECK-NEXT: vadd.vi v11, v9, 1 +; CHECK-NEXT: vrgather.vv v9, v10, v11 +; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t +; CHECK-NEXT: ret + %interleaved.vec = load <6 x i32>, ptr %ptr + %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> + %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> + %res0 = insertvalue {<3 x i32>, <3 x i32>} undef, <3 x i32> %v0, 0 + %res1 = insertvalue {<3 x i32>, <3 x i32>} %res0, <3 x i32> %v1, 1 + ret {<3 x i32>, <3 x i32>} %res1 +} + +define {<4 x i32>, <4 x i32>} @load_factor2(ptr %ptr) { +; CHECK-LABEL: load_factor2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg2e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = load <8 x i32>, ptr %ptr + %v0 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> + %v1 = shufflevector <8 x i32> %interleaved.vec, <8 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + ret {<4 x i32>, <4 x i32>} %res1 +} + + +define {<4 x i32>, <4 x i32>, <4 x i32>} @load_factor3(ptr %ptr) { +; CHECK-LABEL: load_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = load <12 x i32>, ptr %ptr + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + ret {<4 x i32>, <4 x i32>, <4 x i32>} %res2 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @load_factor4(ptr %ptr) { +; CHECK-LABEL: load_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = load <16 x i32>, ptr %ptr + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3 + ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3 +} + +define {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} @load_factor5(ptr %ptr) { +; CHECK-LABEL: load_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vlseg5e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = load <20 x i32>, ptr %ptr + %v0 = 
shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %res0 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} undef, <4 x i32> %v0, 0 + %res1 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res0, <4 x i32> %v1, 1 + %res2 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res1, <4 x i32> %v2, 2 + %res3 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res2, <4 x i32> %v3, 3 + %res4 = insertvalue {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res3, <4 x i32> %v4, 4 + ret {<4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>} %res4 +} + +define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @load_factor6(ptr %ptr) { +; CHECK-LABEL: load_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vlseg6e16.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = load <12 x i16>, ptr %ptr + %v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v1 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v2 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v3 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v4 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %v5 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> + %res0 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} undef, <2 x i16> %v0, 0 + %res1 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res0, <2 x i16> %v1, 1 + %res2 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res1, <2 x i16> %v2, 2 + %res3 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res2, <2 x i16> %v3, 3 + %res4 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res3, <2 x i16> %v4, 4 + %res5 = insertvalue {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res4, <2 x i16> %v5, 5 + ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res5 +} + +; LMUL * NF is > 8 here and so shouldn't be lowered to a vlseg +define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_factor6_too_big(ptr %ptr) { +; RV32-LABEL: load_factor6_too_big: +; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 78 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: sub sp, sp, a2 +; RV32-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xce, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 78 * vlenb +; RV32-NEXT: addi a3, a1, 256 +; RV32-NEXT: li a2, 32 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vle32.v v24, (a3) +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vid.v v12 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 37 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs4r.v v12, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vi v8, v12, -4 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: 
slli a4, a3, 4 +; RV32-NEXT: add a3, a4, a3 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs4r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vrgather.vv v4, v24, v8 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 41 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs4r.v v4, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vadd.vi v8, v12, -10 +; RV32-NEXT: lui a3, 12 +; RV32-NEXT: vmv.s.x v1, a3 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v16, v24, 16 +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 45 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vrgather.vv v4, v16, v8, v0.t +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 69 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill +; RV32-NEXT: lui a3, %hi(.LCPI6_0) +; RV32-NEXT: addi a3, a3, %lo(.LCPI6_0) +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vle32.v v16, (a3) +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 53 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a1, a1, 128 +; RV32-NEXT: vrgather.vv v8, v24, v16 +; RV32-NEXT: lui a3, %hi(.LCPI6_1) +; RV32-NEXT: addi a3, a3, %lo(.LCPI6_1) +; RV32-NEXT: lui a4, 1 +; RV32-NEXT: addi a5, a4, -64 +; RV32-NEXT: vle32.v v16, (a3) +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 61 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v2, a5 +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: vmv1r.v v0, v2 +; RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; RV32-NEXT: addi a4, a4, -1 +; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; RV32-NEXT: vmv.s.x v3, a4 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vmerge.vvm v8, v4, v8, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 37 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vi v16, v20, -2 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v4, v8, v16 +; RV32-NEXT: vadd.vi v16, v20, -8 +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 69 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v4, v8, v16, v0.t +; RV32-NEXT: lui a1, %hi(.LCPI6_2) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: lui a3, %hi(.LCPI6_3) +; RV32-NEXT: addi a3, a3, 
%lo(.LCPI6_3) +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: vle32.v v8, (a3) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 25 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 53 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v16, v8, v24 +; RV32-NEXT: vmv1r.v v0, v2 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 61 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 25 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v16, v24, v8, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: lui a1, %hi(.LCPI6_4) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: vmv1r.v v0, v3 +; RV32-NEXT: vmerge.vvm v16, v4, v16, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 25 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v16, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v4, v16, v24 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 37 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vi v24, v16, -6 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 69 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v4, v16, v24, v0.t +; RV32-NEXT: lui a1, %hi(.LCPI6_5) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 53 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v16, v8, v24 +; RV32-NEXT: lui a1, %hi(.LCPI6_6) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_6) +; RV32-NEXT: li a3, 960 +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 61 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v16, v8, v24, v0.t +; RV32-NEXT: li a1, 1023 +; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, ma +; RV32-NEXT: vmv.s.x v0, a1 
+; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 37 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: lui a1, %hi(.LCPI6_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) +; RV32-NEXT: vle32.v v12, (a1) +; RV32-NEXT: vmerge.vvm v8, v4, v16, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v8, v16, v12 +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 69 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v8, v16, v12, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI6_8) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_8) +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: lui a3, %hi(.LCPI6_9) +; RV32-NEXT: addi a3, a3, %lo(.LCPI6_9) +; RV32-NEXT: vle32.v v24, (a1) +; RV32-NEXT: vle32.v v8, (a3) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 53 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v16, v0, v24 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 61 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v16, v24, v8, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 37 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI6_10) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: lui a1, 15 +; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 2 +; 
RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v20, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v8, v24, v20 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 69 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI6_11) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 53 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v24, v0, v16 +; RV32-NEXT: lui a1, %hi(.LCPI6_12) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_12) +; RV32-NEXT: li a3, 1008 +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a4, a1, 2 +; RV32-NEXT: add a1, a4, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma +; RV32-NEXT: vmv.s.x v0, a3 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 45 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 61 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v24, v16, v8, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: lui a1, %hi(.LCPI6_13) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_13) +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 37 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 2 +; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 69 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 41 +; RV32-NEXT: mul a1, 
a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v8, v24, v16, v0.t +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a3, 41 +; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI6_14) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_14) +; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; RV32-NEXT: lui a2, %hi(.LCPI6_15) +; RV32-NEXT: addi a2, a2, %lo(.LCPI6_15) +; RV32-NEXT: vle32.v v16, (a1) +; RV32-NEXT: vle32.v v8, (a2) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 69 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 53 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v24, v0, v16 +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 45 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 61 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 69 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgather.vv v24, v16, v8, v0.t +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 37 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: li a2, 41 +; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV32-NEXT: addi a1, a0, 320 +; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: addi a1, a0, 256 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a3, a2, 2 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: addi a1, a0, 192 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: slli a3, a2, 4 +; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: addi a1, a0, 128 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 13 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: addi a1, a0, 64 +; RV32-NEXT: csrr a2, vlenb +; RV32-NEXT: li a3, 25 +; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: add a2, sp, a2 +; RV32-NEXT: addi a2, a2, 16 +; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV32-NEXT: vse32.v v8, (a1) +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a2, a1, 5 +; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: 
vse32.v v8, (a0) +; RV32-NEXT: csrr a0, vlenb +; RV32-NEXT: li a1, 78 +; RV32-NEXT: mul a0, a0, a1 +; RV32-NEXT: add sp, sp, a0 +; RV32-NEXT: addi sp, sp, 16 +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor6_too_big: +; RV64: # %bb.0: +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 82 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: sub sp, sp, a2 +; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0xd2, 0x00, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 82 * vlenb +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: addi a2, a1, 128 +; RV64-NEXT: vle64.v v16, (a2) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 6 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v16, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vle64.v v0, (a1) +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 73 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v0, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vid.v v8 +; RV64-NEXT: li a2, 6 +; RV64-NEXT: vmul.vx v8, v8, a2 +; RV64-NEXT: vrgather.vv v24, v0, v8 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 49 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: li a2, 56 +; RV64-NEXT: vmv.s.x v0, a2 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 5 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v8, v8, -16 +; RV64-NEXT: vrgather.vv v24, v16, v8, v0.t +; RV64-NEXT: vmv4r.v v8, v24 +; RV64-NEXT: addi a1, a1, 256 +; RV64-NEXT: vle64.v v16, (a1) +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: li a1, 128 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vrgather.vi v12, v16, 4 +; RV64-NEXT: vsetivli zero, 8, e64, m8, ta, ma +; RV64-NEXT: vslidedown.vi v24, v16, 8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: li a1, 63 +; RV64-NEXT: vmv.s.x v16, a1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs1r.v v16, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vrgather.vi v12, v24, 2, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 41 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv1r.v v0, v16 +; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 25 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 49 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: 
vadd.vi v16, v0, 1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 73 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v24, v8, v16 +; RV64-NEXT: vadd.vi v16, v0, -15 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v24, v8, v16, v0.t +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v12, v8, 5 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v28, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmv1r.v v0, v28 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 41 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v12, v16, 3, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v8, v12, v24, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 21 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv.v.i v8, 6 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 13 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v24, zero +; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma +; RV64-NEXT: vslideup.vi v8, v24, 5 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v12, v0, v8 +; RV64-NEXT: vmv1r.v v0, v28 +; RV64-NEXT: vrgather.vi v12, v16, 4, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 49 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v16, v8, 2 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 73 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; 
RV64-NEXT: vrgather.vv v24, v0, v16 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; RV64-NEXT: li a1, 24 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v24, v8, -14 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v8, v16, v24, v0.t +; RV64-NEXT: li a1, 31 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 4 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: li a1, 1 +; RV64-NEXT: vmv.v.i v8, 7 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 3 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vs4r.v v8, (a2) # Unknown-size Folded Spill +; RV64-NEXT: vmv.s.x v12, a1 +; RV64-NEXT: vsetivli zero, 6, e64, m4, tu, ma +; RV64-NEXT: vslideup.vi v8, v12, 5 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v20, v16, v8 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 41 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v20, v8, 5, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 49 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v0, v8, 3 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 73 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded 
Reload +; RV64-NEXT: vrgather.vv v24, v16, v0 +; RV64-NEXT: vadd.vi v8, v8, -13 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v24, v16, v8, v0.t +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 29 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 7, e64, m4, tu, ma +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 13 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vslideup.vi v16, v8, 6 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: li a1, 192 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 13 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v20, v8, 2 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 41 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v20, v8, v16, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 49 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v16, v8, 4 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 73 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v24, v0, v16 +; RV64-NEXT: li a1, 28 +; RV64-NEXT: vmv.s.x v0, a1 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vadd.vi v8, v8, -12 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v24, v16, v8, v0.t +; RV64-NEXT: 
vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v8, v8, v24, v0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 2 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 7, e64, m4, tu, ma +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 3 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vslideup.vi v16, v8, 6 +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vi v20, v8, 3 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 13 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 41 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v20, v8, v16, v0.t +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill +; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, mu +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 49 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vadd.vi v16, v24, 5 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 73 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v8, v0, v16 +; RV64-NEXT: vadd.vi v24, v24, -11 +; RV64-NEXT: addi a1, sp, 16 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 6 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vrgather.vv v8, v16, v24, v0.t +; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: slli a2, a1, 5 +; RV64-NEXT: add a1, a2, a1 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 57 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vmerge.vvm v8, v12, v8, v0 +; RV64-NEXT: addi a1, a0, 320 +; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: addi a1, a0, 256 +; 
RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 2 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: addi a1, a0, 192 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 29 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: addi a1, a0, 128 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: slli a3, a2, 4 +; RV64-NEXT: add a2, a3, a2 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: addi a1, a0, 64 +; RV64-NEXT: csrr a2, vlenb +; RV64-NEXT: li a3, 21 +; RV64-NEXT: mul a2, a2, a3 +; RV64-NEXT: add a2, sp, a2 +; RV64-NEXT: addi a2, a2, 16 +; RV64-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload +; RV64-NEXT: vse64.v v8, (a1) +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: li a2, 25 +; RV64-NEXT: mul a1, a1, a2 +; RV64-NEXT: add a1, sp, a1 +; RV64-NEXT: addi a1, a1, 16 +; RV64-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload +; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: csrr a0, vlenb +; RV64-NEXT: li a1, 82 +; RV64-NEXT: mul a0, a0, a1 +; RV64-NEXT: add sp, sp, a0 +; RV64-NEXT: addi sp, sp, 16 +; RV64-NEXT: ret + %interleaved.vec = load <48 x i64>, ptr %ptr + %v0 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> + %v1 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> + %v2 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> + %v3 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> + %v4 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> + %v5 = shufflevector <48 x i64> %interleaved.vec, <48 x i64> poison, <8 x i32> + %res0 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} undef, <8 x i64> %v0, 0 + %res1 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res0, <8 x i64> %v1, 1 + %res2 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res1, <8 x i64> %v2, 2 + %res3 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res2, <8 x i64> %v3, 3 + %res4 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res3, <8 x i64> %v4, 4 + %res5 = insertvalue {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res4, <8 x i64> %v5, 5 + ret {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} %res5 +} + + +; ------------------------------------------------------------------------------ +; Stores +; ------------------------------------------------------------------------------ + +define void @store_factor2(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1) { +; CHECK-LABEL: store_factor2: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg2e32.v v8, (a0) +; CHECK-NEXT: ret + %interleaved.vec = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + store <8 x i32> %interleaved.vec, ptr %ptr + ret void +} + +define void @store_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { +; CHECK-LABEL: store_factor3: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg3e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = 
shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> + store <12 x i32> %interleaved.vec, ptr %ptr + ret void +} + +define void @store_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; CHECK-LABEL: store_factor4: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg4e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> + store <16 x i32> %interleaved.vec, ptr %ptr + ret void +} + +define void @store_factor5(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { +; CHECK-LABEL: store_factor5: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vsseg5e32.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %s2 = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> + %s3 = shufflevector <4 x i32> %v4, <4 x i32> poison, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s2, <16 x i32> %s3, <20 x i32> + store <20 x i32> %interleaved.vec, ptr %ptr + ret void +} + +define void @store_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5) { +; CHECK-LABEL: store_factor6: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vsseg6e16.v v8, (a0) +; CHECK-NEXT: ret + %s0 = shufflevector <2 x i16> %v0, <2 x i16> %v1, <4 x i32> + %s1 = shufflevector <2 x i16> %v2, <2 x i16> %v3, <4 x i32> + %s2 = shufflevector <4 x i16> %s0, <4 x i16> %s1, <8 x i32> + %s3 = shufflevector <2 x i16> %v4, <2 x i16> %v5, <8 x i32> + %interleaved.vec = shufflevector <8 x i16> %s2, <8 x i16> %s3, <12 x i32> + store <12 x i16> %interleaved.vec, ptr %ptr + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -1,8 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+experimental-zvfh,+zvl256b \ -; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,V +; RUN: -lower-interleaved-accesses=false -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefixes=CHECK,V ; RUN: llc < %s -mtriple=riscv64 -mattr=+f,+zve32f,+zfh,+experimental-zvfh,+zvl256b \ -; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,ZVE32F +; RUN: -lower-interleaved-accesses=false -verify-machineinstrs \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ZVE32F define void @vnsrl_0_i8(ptr %in, ptr %out) { ; CHECK-LABEL: vnsrl_0_i8: diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/interleaved-accesses.ll @@ -0,0 +1,376 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=riscv32 -mattr=+v -interleaved-access -S | FileCheck %s --check-prefix=RV32 +; RUN: opt < %s -mtriple=riscv64 -mattr=+v 
-interleaved-access -S | FileCheck %s --check-prefix=RV64 + +define void @load_factor2(ptr %ptr) { +; RV32-LABEL: @load_factor2( +; RV32-NEXT: [[TMP1:%.*]] = call { <8 x i32>, <8 x i32> } @llvm.riscv.seg2.load.v8i32.p0.i32(ptr [[PTR:%.*]], i32 8) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor2( +; RV64-NEXT: [[TMP1:%.*]] = call { <8 x i32>, <8 x i32> } @llvm.riscv.seg2.load.v8i32.p0.i64(ptr [[PTR:%.*]], i64 8) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i32>, <8 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <16 x i32>, ptr %ptr + %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <8 x i32> + ret void +} + +define void @load_factor3(ptr %ptr) { +; RV32-LABEL: @load_factor3( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg3.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor3( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg3.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <12 x i32>, ptr %ptr + %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v1 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + %v2 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> + ret void +} + +define void @load_factor4(ptr %ptr) { +; RV32-LABEL: @load_factor4( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg4.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor4( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg4.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <16 x i32>, ptr %ptr + %v0 = shufflevector <16 x 
i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v2 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + %v3 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> + ret void +} + +define void @load_factor5(ptr %ptr) { +; RV32-LABEL: @load_factor5( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg5.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor5( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg5.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <20 x i32>, ptr %ptr + %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v1 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v2 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v3 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + %v4 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> + ret void +} + +define void @load_factor6(ptr %ptr) { +; RV32-LABEL: @load_factor6( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg6.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV32-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor6( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg6.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: 
[[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV64-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <24 x i32>, ptr %ptr + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + %v3 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + %v4 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + %v5 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <4 x i32> + ret void +} + +define void @load_factor7(ptr %ptr) { +; RV32-LABEL: @load_factor7( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg7.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 6 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV32-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV32-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor7( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg7.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 6 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV64-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV64-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP8:%.*]] = extractvalue { <4 x 
i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV64-NEXT: ret void +; + %interleaved.vec = load <28 x i32>, ptr %ptr + %v0 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v1 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v2 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v3 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v4 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v5 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + %v6 = shufflevector <28 x i32> %interleaved.vec, <28 x i32> poison, <4 x i32> + ret void +} + +define void @load_factor8(ptr %ptr) { +; RV32-LABEL: @load_factor8( +; RV32-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg8.load.v4i32.p0.i32(ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 7 +; RV32-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 6 +; RV32-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV32-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV32-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV32-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV32-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV32-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor8( +; RV64-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.riscv.seg8.load.v4i32.p0.i64(ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 7 +; RV64-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 6 +; RV64-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 5 +; RV64-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 4 +; RV64-NEXT: [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 3 +; RV64-NEXT: [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 2 +; RV64-NEXT: [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 1 +; RV64-NEXT: [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[TMP1]], 0 +; 
RV64-NEXT: ret void +; + %interleaved.vec = load <32 x i32>, ptr %ptr + %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v4 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v5 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v6 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + %v7 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <4 x i32> + ret void +} + + +define void @store_factor2(ptr %ptr, <8 x i8> %v0, <8 x i8> %v1) { +; RV32-LABEL: @store_factor2( +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> +; RV32-NEXT: call void @llvm.riscv.seg2.store.v8i8.p0.i32(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr [[PTR:%.*]], i32 8) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor2( +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <8 x i8> [[V0:%.*]], <8 x i8> [[V1:%.*]], <8 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <8 x i8> [[V0]], <8 x i8> [[V1]], <8 x i32> +; RV64-NEXT: call void @llvm.riscv.seg2.store.v8i8.p0.i64(<8 x i8> [[TMP1]], <8 x i8> [[TMP2]], ptr [[PTR:%.*]], i64 8) +; RV64-NEXT: ret void +; + %interleaved.vec = shufflevector <8 x i8> %v0, <8 x i8> %v1, <16 x i32> + store <16 x i8> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_factor3(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2) { +; RV32-LABEL: @store_factor3( +; RV32-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; RV32-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <8 x i32> +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: call void @llvm.riscv.seg3.store.v4i32.p0.i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor3( +; RV64-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; RV64-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> poison, <8 x i32> +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: call void @llvm.riscv.seg3.store.v4i32.p0.i64(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: ret void +; + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> poison, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <12 x i32> + store <12 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_factor4(ptr %ptr, <4 x i32> %v0, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +; RV32-LABEL: @store_factor4( +; RV32-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> 
+; RV32-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV32-NEXT: call void @llvm.riscv.seg4.store.v4i32.p0.i32(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[PTR:%.*]], i32 4) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor4( +; RV64-NEXT: [[S0:%.*]] = shufflevector <4 x i32> [[V0:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; RV64-NEXT: [[S1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[S0]], <8 x i32> [[S1]], <4 x i32> +; RV64-NEXT: call void @llvm.riscv.seg4.store.v4i32.p0.i64(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], ptr [[PTR:%.*]], i64 4) +; RV64-NEXT: ret void +; + %s0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <8 x i32> + %s1 = shufflevector <4 x i32> %v2, <4 x i32> %v3, <8 x i32> + %interleaved.vec = shufflevector <8 x i32> %s0, <8 x i32> %s1, <16 x i32> + store <16 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + + +define void @store_factor2_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1) { +; RV32-LABEL: @store_factor2_wide( +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <8 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <8 x i32> +; RV32-NEXT: call void @llvm.riscv.seg2.store.v8i32.p0.i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], ptr [[PTR:%.*]], i32 8) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor2_wide( +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <8 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <8 x i32> +; RV64-NEXT: call void @llvm.riscv.seg2.store.v8i32.p0.i64(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], ptr [[PTR:%.*]], i64 8) +; RV64-NEXT: ret void +; + %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + store <16 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_factor3_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) { +; RV32-LABEL: @store_factor3_wide( +; RV32-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> +; RV32-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <16 x i32> +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: call void @llvm.riscv.seg3.store.v8i32.p0.i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], ptr [[PTR:%.*]], i32 8) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor3_wide( +; RV64-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> +; RV64-NEXT: 
[[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> poison, <16 x i32> +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: call void @llvm.riscv.seg3.store.v8i32.p0.i64(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], ptr [[PTR:%.*]], i64 8) +; RV64-NEXT: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> + store <24 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_factor4_wide(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) { +; RV32-LABEL: @store_factor4_wide( +; RV32-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> +; RV32-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> +; RV32-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV32-NEXT: call void @llvm.riscv.seg4.store.v8i32.p0.i32(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], ptr [[PTR:%.*]], i32 8) +; RV32-NEXT: ret void +; +; RV64-LABEL: @store_factor4_wide( +; RV64-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <16 x i32> +; RV64-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2:%.*]], <8 x i32> [[V3:%.*]], <16 x i32> +; RV64-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> +; RV64-NEXT: call void @llvm.riscv.seg4.store.v8i32.p0.i64(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], ptr [[PTR:%.*]], i64 8) +; RV64-NEXT: ret void +; + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> + store <32 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @load_factor2_fp128(ptr %ptr) { +; RV32-LABEL: @load_factor2_fp128( +; RV32-NEXT: [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, ptr [[PTR:%.*]], align 16 +; RV32-NEXT: [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> +; RV32-NEXT: [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> +; RV32-NEXT: ret void +; +; RV64-LABEL: @load_factor2_fp128( +; RV64-NEXT: [[INTERLEAVED_VEC:%.*]] = load <4 x fp128>, ptr [[PTR:%.*]], align 16 +; RV64-NEXT: [[V0:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> +; RV64-NEXT: [[V1:%.*]] = shufflevector <4 x fp128> [[INTERLEAVED_VEC]], <4 x fp128> poison, <2 x i32> +; RV64-NEXT: ret void +; + %interleaved.vec = load <4 x fp128>, ptr %ptr, align 16 
+ %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> + %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> poison, <2 x i32> + ret void +} + + + + + + + + + + + + diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/zve32x.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/zve32x.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/zve32x.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -mtriple=riscv64 -mattr=+zve32x,+zvl128b -interleaved-access -S | FileCheck %s -check-prefix=ZVE32X +; RUN: opt < %s -mtriple=riscv64 -mattr=+zve64x,+zvl128b -interleaved-access -S | FileCheck %s -check-prefix=ZVE64X + +define <4 x i1> @load_large_vector(ptr %p) { +; ZVE32X-LABEL: @load_large_vector( +; ZVE32X-NEXT: [[L:%.*]] = load <12 x ptr>, ptr [[P:%.*]], align 128 +; ZVE32X-NEXT: [[S1:%.*]] = shufflevector <12 x ptr> [[L]], <12 x ptr> poison, <4 x i32> +; ZVE32X-NEXT: [[S2:%.*]] = shufflevector <12 x ptr> [[L]], <12 x ptr> poison, <4 x i32> +; ZVE32X-NEXT: [[RET:%.*]] = icmp ne <4 x ptr> [[S1]], [[S2]] +; ZVE32X-NEXT: ret <4 x i1> [[RET]] +; +; ZVE64X-LABEL: @load_large_vector( +; ZVE64X-NEXT: [[TMP1:%.*]] = call { <4 x ptr>, <4 x ptr>, <4 x ptr> } @llvm.riscv.seg3.load.v4p0.p0.i64(ptr [[P:%.*]], i64 4) +; ZVE64X-NEXT: [[TMP2:%.*]] = extractvalue { <4 x ptr>, <4 x ptr>, <4 x ptr> } [[TMP1]], 1 +; ZVE64X-NEXT: [[TMP3:%.*]] = extractvalue { <4 x ptr>, <4 x ptr>, <4 x ptr> } [[TMP1]], 0 +; ZVE64X-NEXT: [[RET:%.*]] = icmp ne <4 x ptr> [[TMP3]], [[TMP2]] +; ZVE64X-NEXT: ret <4 x i1> [[RET]] +; + %l = load <12 x ptr>, ptr %p + %s1 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> + %s2 = shufflevector <12 x ptr> %l, <12 x ptr> poison, <4 x i32> + %ret = icmp ne <4 x ptr> %s1, %s2 + ret <4 x i1> %ret +} diff --git a/llvm/test/Transforms/InterleavedAccess/RISCV/zvl32b.ll b/llvm/test/Transforms/InterleavedAccess/RISCV/zvl32b.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InterleavedAccess/RISCV/zvl32b.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -march=riscv32 -mattr=+zve32x,+zvl32b -interleaved-access -S | FileCheck %s -check-prefix=ZVL32B +; RUN: opt < %s -mtriple=riscv32 -mattr=+zve32x,+zvl128b -interleaved-access -S | FileCheck %s -check-prefix=ZVL128B + +; Make sure that we don't lower interleaved loads that won't fit into the minimum vlen + +define {<16 x i32>, <16 x i32>} @load_factor2_large(ptr %ptr) { +; ZVL32B-LABEL: @load_factor2_large( +; ZVL32B-NEXT: [[INTERLEAVED_VEC:%.*]] = load <32 x i32>, ptr [[PTR:%.*]], align 128 +; ZVL32B-NEXT: [[V0:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <16 x i32> +; ZVL32B-NEXT: [[V1:%.*]] = shufflevector <32 x i32> [[INTERLEAVED_VEC]], <32 x i32> poison, <16 x i32> +; ZVL32B-NEXT: [[RES0:%.*]] = insertvalue { <16 x i32>, <16 x i32> } undef, <16 x i32> [[V0]], 0 +; ZVL32B-NEXT: [[RES1:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES0]], <16 x i32> [[V1]], 1 +; ZVL32B-NEXT: ret { <16 x i32>, <16 x i32> } [[RES1]] +; +; ZVL128B-LABEL: @load_factor2_large( +; ZVL128B-NEXT: [[TMP1:%.*]] = call { <16 x i32>, <16 x i32> } @llvm.riscv.seg2.load.v16i32.p0.i32(ptr [[PTR:%.*]], i32 16) +; ZVL128B-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i32>, <16 x i32> } [[TMP1]], 1 +; ZVL128B-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i32>, <16 x i32> } [[TMP1]], 0 +; ZVL128B-NEXT: [[RES0:%.*]] = insertvalue { <16 x i32>, <16 x 
i32> } undef, <16 x i32> [[TMP3]], 0 +; ZVL128B-NEXT: [[RES1:%.*]] = insertvalue { <16 x i32>, <16 x i32> } [[RES0]], <16 x i32> [[TMP2]], 1 +; ZVL128B-NEXT: ret { <16 x i32>, <16 x i32> } [[RES1]] +; + %interleaved.vec = load <32 x i32>, ptr %ptr + %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <16 x i32> + %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> poison, <16 x i32> + %res0 = insertvalue {<16 x i32>, <16 x i32>} undef, <16 x i32> %v0, 0 + %res1 = insertvalue {<16 x i32>, <16 x i32>} %res0, <16 x i32> %v1, 1 + ret {<16 x i32>, <16 x i32>} %res1 +}
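
A worked size calculation for the zvl32b case above, offered as a sketch only: the actual legality check in the patch also gates on whether fixed-length RVV codegen is enabled for the given VLEN and on the element type, so the arithmetic below is an approximation of why the two RUN lines diverge, not the exact condition tested.

; Each field extracted from the factor-2 load in @load_factor2_large is
; <16 x i32>, i.e. 16 * 32 = 512 bits.
; With +zvl32b, VLEN is only guaranteed to be 32 bits, so even the widest
; register group (LMUL = 8) holds at most 8 * 32 = 256 bits; 512 > 256 means
; the fixed-length type cannot be made legal, and the ZVL32B run keeps the
; plain load + shufflevector sequence.
; With +zvl128b, a <16 x i32> field fits in LMUL = 4 (4 * 128 = 512 bits) and
; 2 fields * EMUL 4 = 8 <= 8 satisfies the segment-access constraint, so the
; shuffles are replaced by the @llvm.riscv.seg2.load intrinsic shown in the
; ZVL128B check lines.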