diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -1245,4 +1245,15 @@ defm vsuxseg # nf : RISCVISegStore; } + // Strided loads/stores for fixed vectors. + def int_riscv_masked_strided_load + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyptr_ty, + llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [NoCapture>, IntrReadMem]>; + def int_riscv_masked_strided_store + : Intrinsic<[], + [llvm_anyvector_ty, llvm_anyptr_ty, + llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], + [NoCapture>, IntrWriteMem]>; } // TargetPrefix = "riscv" diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt --- a/llvm/lib/Target/RISCV/CMakeLists.txt +++ b/llvm/lib/Target/RISCV/CMakeLists.txt @@ -24,6 +24,7 @@ RISCVExpandAtomicPseudoInsts.cpp RISCVExpandPseudoInsts.cpp RISCVFrameLowering.cpp + RISCVGatherScatterLowering.cpp RISCVInsertVSETVLI.cpp RISCVInstrInfo.cpp RISCVInstructionSelector.cpp @@ -50,6 +51,7 @@ SelectionDAG Support Target + TransformUtils GlobalISel ADD_TO_COMPONENT diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h --- a/llvm/lib/Target/RISCV/RISCV.h +++ b/llvm/lib/Target/RISCV/RISCV.h @@ -37,6 +37,9 @@ FunctionPass *createRISCVISelDag(RISCVTargetMachine &TM); +FunctionPass *createRISCVGatherScatterLoweringPass(); +void initializeRISCVGatherScatterLoweringPass(PassRegistry &); + FunctionPass *createRISCVMergeBaseOffsetOptPass(); void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &); diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp @@ -0,0 +1,488 @@ +//===- RISCVGatherScatterLowering.cpp - Gather/Scatter lowering -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass custom lowers llvm.gather and llvm.scatter instructions to +// RISCV intrinsics. 
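+//
+// As a rough sketch (illustrative names, mirroring the gather/scatter tests
+// added below), a gather over a constant-strided address sequence such as
+//
+//   %ptrs = getelementptr inbounds i8, i8* %B, <32 x i64> %strided.idx
+//   %v = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %ptrs,
+//            i32 1, <32 x i1> %mask, <32 x i8> %passthru)
+//
+// is rewritten to use a scalar base pointer and a byte stride:
+//
+//   %v = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(
+//            <32 x i8> %passthru, i8* %base, i64 5, <32 x i1> %mask)
+//
+// which the ISel changes below select to strided vlse/vsse instructions.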
+// +//===----------------------------------------------------------------------===// + +#include "RISCV.h" +#include "RISCVTargetMachine.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/GetElementPtrTypeIterator.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsRISCV.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +#define DEBUG_TYPE "riscv-gather-scatter-lowering" + +namespace { + +class RISCVGatherScatterLowering : public FunctionPass { + const RISCVSubtarget *ST = nullptr; + const TargetLowering *TLI = nullptr; + LoopInfo *LI = nullptr; + const DataLayout *DL = nullptr; + + SmallVector MaybeDeadPHIs; + +public: + static char ID; // Pass identification, replacement for typeid + + RISCVGatherScatterLowering() : FunctionPass(ID) {} + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + } + + StringRef getPassName() const override { + return "RISCV gather/scatter lowering"; + } + +private: + bool isLegalTypeAndAlignment(Type *DataType, Value *AlignOp); + + bool tryCreateStridedLoadStore(IntrinsicInst *II, Type *DataType, Value *Ptr, + Value *AlignOp); + + std::pair determineBaseAndStride(GetElementPtrInst *GEP, + IRBuilder<> &Builder); + + bool matchStridedRecurrence(Value *Index, Loop *L, Value *&Stride, + PHINode *&BasePtr, BinaryOperator *&Inc, + IRBuilder<> &Builder); +}; + +} // end anonymous namespace + +char RISCVGatherScatterLowering::ID = 0; + +INITIALIZE_PASS(RISCVGatherScatterLowering, DEBUG_TYPE, + "RISCV gather/scatter lowering pass", false, false) + +FunctionPass *llvm::createRISCVGatherScatterLoweringPass() { + return new RISCVGatherScatterLowering(); +} + +static bool isLegalElementType(Type *ScalarTy, const RISCVSubtarget *ST) { + if (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) || + ScalarTy->isIntegerTy(32) || ScalarTy->isIntegerTy(64)) + return true; + + if (ScalarTy->isHalfTy()) + return ST->hasStdExtZfh(); + if (ScalarTy->isFloatTy()) + return ST->hasStdExtF(); + if (ScalarTy->isDoubleTy()) + return ST->hasStdExtD(); + + return false; +} + +bool RISCVGatherScatterLowering::isLegalTypeAndAlignment(Type *DataType, + Value *AlignOp) { + Type *ScalarType = DataType->getScalarType(); + if (!isLegalElementType(ScalarType, ST)) + return false; + + MaybeAlign MA = cast(AlignOp)->getMaybeAlignValue(); + if (MA && MA->value() < DL->getTypeStoreSize(ScalarType).getFixedSize()) + return false; + + // FIXME: Let the backend type legalize by splitting/widening? + EVT DataVT = TLI->getValueType(*DL, DataType); + if (!TLI->isTypeLegal(DataVT)) + return false; + + return true; +} + +// TODO: Should we consider the mask when looking for a stride? +static std::pair matchStridedConstant(Constant *StartC) { + unsigned NumElts = cast(StartC->getType())->getNumElements(); + + // Check that the start value is a strided constant. 
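+  // For example, <0, 2, 4, 6> has start 0 and stride 2, while something like
+  // <0, 2, 4, 7> is rejected because the final difference does not match the
+  // earlier ones (this is what the gather_broken_stride test exercises).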
+ auto *StartVal = + dyn_cast_or_null(StartC->getAggregateElement((unsigned)0)); + if (!StartVal) + return std::make_pair(nullptr, nullptr); + APInt StrideVal(StartVal->getValue().getBitWidth(), 0); + ConstantInt *Prev = StartVal; + for (unsigned i = 1; i != NumElts; ++i) { + auto *C = dyn_cast_or_null(StartC->getAggregateElement(i)); + if (!C) + return std::make_pair(nullptr, nullptr); + + APInt LocalStride = C->getValue() - Prev->getValue(); + if (i == 1) + StrideVal = LocalStride; + else if (StrideVal != LocalStride) + return std::make_pair(nullptr, nullptr); + + Prev = C; + } + + Value *Stride = ConstantInt::get(StartVal->getType(), StrideVal); + + return std::make_pair(StartVal, Stride); +} + +// Recursively, walk about the use-def chain until we find a Phi with a strided +// start value. Build and update a scalar recurrence as we unwind the recursion. +// We also update the Stride as we unwind. Our goal is to move all of the +// arithmetic out of the loop. +bool RISCVGatherScatterLowering::matchStridedRecurrence(Value *Index, Loop *L, + Value *&Stride, + PHINode *&BasePtr, + BinaryOperator *&Inc, + IRBuilder<> &Builder) { + // Our base case is a Phi. + if (auto *Phi = dyn_cast(Index)) { + // A phi node we want to perform this function on should be from the + // loop header. + if (Phi->getParent() != L->getHeader()) + return false; + + Value *Step, *Start; + if (!matchSimpleRecurrence(Phi, Inc, Start, Step) || + Inc->getOpcode() != Instruction::Add) + return false; + unsigned IncrementingBlock = Phi->getIncomingValue(0) == Inc ? 0 : 1; + + // Only proceed if the step is loop invariant. + if (!L->isLoopInvariant(Step)) + return false; + + // Step should be a splat. + Step = getSplatValue(Step); + if (!Step) + return false; + + // Start should be a strided constant. + auto *StartC = dyn_cast(Start); + if (!StartC) + return false; + + std::tie(Start, Stride) = matchStridedConstant(StartC); + if (!Start) + return false; + assert(Stride != nullptr); + + // Build scalar phi and increment. + BasePtr = + PHINode::Create(Start->getType(), 2, Phi->getName() + ".scalar", Phi); + Inc = BinaryOperator::CreateAdd(BasePtr, Step, Inc->getName() + ".scalar", + Inc); + BasePtr->addIncoming(Start, Phi->getIncomingBlock(1 - IncrementingBlock)); + BasePtr->addIncoming(Inc, Phi->getIncomingBlock(IncrementingBlock)); + + // Note that this Phi might be eligible for removal. + MaybeDeadPHIs.push_back(Phi); + return true; + } + + // Otherwise look for binary operator. + auto *BO = dyn_cast(Index); + if (!BO) + return false; + + if (BO->getOpcode() != Instruction::Add && + BO->getOpcode() != Instruction::Or && + BO->getOpcode() != Instruction::Mul && + BO->getOpcode() != Instruction::Shl) + return false; + + // Only support shift by constant. + if (BO->getOpcode() == Instruction::Shl && !isa(BO->getOperand(1))) + return false; + + // We need to be able to treat Or as Add. + if (BO->getOpcode() == Instruction::Or && + !haveNoCommonBitsSet(BO->getOperand(0), BO->getOperand(1), *DL)) + return false; + + // We should have one operand in the loop and one splat. + Value *OtherOp; + if (isa(BO->getOperand(0)) && + L->contains(cast(BO->getOperand(0)))) { + Index = cast(BO->getOperand(0)); + OtherOp = BO->getOperand(1); + } else if (isa(BO->getOperand(1)) && + L->contains(cast(BO->getOperand(1)))) { + Index = cast(BO->getOperand(1)); + OtherOp = BO->getOperand(0); + } else { + return false; + } + + // Make sure other op is loop invariant. 
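+  // (In the gather() test below, for example, OtherOp is the loop-invariant
+  // splat of 5 coming from the "B[i * 5]" indexing; after the Mul case below
+  // rescales the recurrence, the scalar phi is stepped by 32 * 5 = 160 per
+  // iteration and the extracted stride is 5.)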
+ if (!L->isLoopInvariant(OtherOp)) + return false; + + // Make sure we have a splat. + Value *SplatOp = getSplatValue(OtherOp); + if (!SplatOp) + return false; + + // Recurse up the use-def chain. + if (!matchStridedRecurrence(Index, L, Stride, BasePtr, Inc, Builder)) + return false; + + // Locate the Step and Start values from the recurrence. + unsigned StepIndex = Inc->getOperand(0) == BasePtr ? 1 : 0; + unsigned StartBlock = BasePtr->getOperand(0) == Inc ? 1 : 0; + Value *Step = Inc->getOperand(StepIndex); + Value *Start = BasePtr->getOperand(StartBlock); + + // We need to adjust the start value in the preheader. + Builder.SetInsertPoint( + BasePtr->getIncomingBlock(StartBlock)->getTerminator()); + Builder.SetCurrentDebugLocation(DebugLoc()); + + switch (BO->getOpcode()) { + default: + llvm_unreachable("Unexpected opcode!"); + case Instruction::Add: + case Instruction::Or: { + // An add only affects the start value. It's ok to do this for Or because + // we already checked that there are no common set bits. + + // If the start value is Zero, just take the SplatOp. + if (isa(Start) && cast(Start)->isZero()) + Start = SplatOp; + else + Start = Builder.CreateAdd(Start, SplatOp, "start"); + BasePtr->setIncomingValue(StartBlock, Start); + break; + } + case Instruction::Mul: { + // If the start is zero we don't need to multiply. + if (!isa(Start) || !cast(Start)->isZero()) + Start = Builder.CreateMul(Start, SplatOp, "start"); + + Step = Builder.CreateMul(Step, SplatOp, "step"); + + // If the Stride is 1 just take the SplatOpt. + if (isa(Stride) && cast(Stride)->isOne()) + Stride = SplatOp; + else + Stride = Builder.CreateMul(Stride, SplatOp, "stride"); + Inc->setOperand(StepIndex, Step); + BasePtr->setIncomingValue(StartBlock, Start); + break; + } + case Instruction::Shl: { + // If the start is zero we don't need to shift. + if (!isa(Start) || !cast(Start)->isZero()) + Start = Builder.CreateShl(Start, SplatOp, "start"); + Step = Builder.CreateShl(Step, SplatOp, "step"); + Stride = Builder.CreateShl(Stride, SplatOp, "stride"); + Inc->setOperand(StepIndex, Step); + BasePtr->setIncomingValue(StartBlock, Start); + break; + } + } + + return true; +} + +std::pair +RISCVGatherScatterLowering::determineBaseAndStride(GetElementPtrInst *GEP, + IRBuilder<> &Builder) { + + SmallVector Ops(GEP->operands()); + + // Base pointer needs to be a scalar. + if (Ops[0]->getType()->isVectorTy()) + return std::make_pair(nullptr, nullptr); + + // Make sure we're in a loop and it is in loop simplify form. + Loop *L = LI->getLoopFor(GEP->getParent()); + if (!L || !L->isLoopSimplifyForm()) + return std::make_pair(nullptr, nullptr); + + int VecOperand = -1; + unsigned TypeScale = 0; + + // Look for a vector operand and scale. + gep_type_iterator GTI = gep_type_begin(GEP); + for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i, ++GTI) { + if (!Ops[i]->getType()->isVectorTy()) + continue; + + if (VecOperand >= 0) + return std::make_pair(nullptr, nullptr); + + VecOperand = i; + + TypeSize TS = DL->getTypeAllocSize(GTI.getIndexedType()); + if (TS.isScalable()) + return std::make_pair(nullptr, nullptr); + + TypeScale = TS.getFixedSize(); + } + + // We need to find a vector index to simplify. + if (VecOperand < 0) + return std::make_pair(nullptr, nullptr); + + // We can't extract the stride if the arithmetic is done at a different size + // than the pointer type. Adding the stride later may not wrap correctly. + // Technically we could handle wider indices, but I don't expect that in + // practice. 
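+  // For example, a <32 x i32> index used with 64-bit pointers (as in the
+  // gather_narrow_index negative test) is rejected here.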
+ Value *VecIndex = Ops[VecOperand]; + Type *VecIntPtrTy = DL->getIntPtrType(GEP->getType()); + if (VecIndex->getType() != VecIntPtrTy) + return std::make_pair(nullptr, nullptr); + + Value *Stride; + BinaryOperator *Inc; + PHINode *BasePhi; + if (!matchStridedRecurrence(VecIndex, L, Stride, BasePhi, Inc, Builder)) + return std::make_pair(nullptr, nullptr); + + unsigned IncrementingBlock = BasePhi->getOperand(0) == Inc ? 0 : 1; + + Builder.SetInsertPoint(GEP); + + // Replace the vector index with the scalar phi and build a scalar GEP. + Ops[VecOperand] = BasePhi; + Type *SourceTy = GEP->getSourceElementType(); + Value *BasePtr = + Builder.CreateGEP(SourceTy, Ops[0], makeArrayRef(Ops).drop_front()); + + // Cast the GEP to an i8*. + LLVMContext &Ctx = GEP->getContext(); + Type *I8PtrTy = + Type::getInt8PtrTy(Ctx, GEP->getType()->getPointerAddressSpace()); + if (BasePtr->getType() != I8PtrTy) + BasePtr = Builder.CreatePointerCast(BasePtr, I8PtrTy); + + // Final adjustments to stride should go in the start block. + Builder.SetInsertPoint( + BasePhi->getIncomingBlock(1 - IncrementingBlock)->getTerminator()); + + // Convert stride to pointer size if needed. + Type *IntPtrTy = DL->getIntPtrType(BasePtr->getType()); + assert(Stride->getType() == IntPtrTy && "Unexpected type"); + + // Scale the stride by the size of the indexed type. + if (TypeScale != 1) + Stride = Builder.CreateMul(Stride, ConstantInt::get(IntPtrTy, TypeScale)); + + auto BaseAndStride = std::make_pair(BasePtr, Stride); + + return BaseAndStride; +} + +bool RISCVGatherScatterLowering::tryCreateStridedLoadStore(IntrinsicInst *II, + Type *DataType, + Value *Ptr, + Value *AlignOp) { + // Make sure the operation will be supported by the backend. + if (!isLegalTypeAndAlignment(DataType, AlignOp)) + return false; + + // Pointer should be a GEP. + auto *GEP = dyn_cast(Ptr); + if (!GEP) + return false; + + IRBuilder<> Builder(GEP); + + Value *BasePtr, *Stride; + std::tie(BasePtr, Stride) = determineBaseAndStride(GEP, Builder); + if (!BasePtr) + return false; + assert(Stride != nullptr); + + Builder.SetInsertPoint(II); + + CallInst *Call; + if (II->getIntrinsicID() == Intrinsic::masked_gather) + Call = Builder.CreateIntrinsic( + Intrinsic::riscv_masked_strided_load, + {DataType, BasePtr->getType(), Stride->getType()}, + {II->getArgOperand(3), BasePtr, Stride, II->getArgOperand(2)}); + else + Call = Builder.CreateIntrinsic( + Intrinsic::riscv_masked_strided_store, + {DataType, BasePtr->getType(), Stride->getType()}, + {II->getArgOperand(0), BasePtr, Stride, II->getArgOperand(3)}); + + Call->takeName(II); + II->replaceAllUsesWith(Call); + II->eraseFromParent(); + + if (GEP->use_empty()) + RecursivelyDeleteTriviallyDeadInstructions(GEP); + + return true; +} + +bool RISCVGatherScatterLowering::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + auto &TPC = getAnalysis(); + auto &TM = TPC.getTM(); + ST = &TM.getSubtarget(F); + if (!ST->hasStdExtV() || !ST->useRVVForFixedLengthVectors()) + return false; + + TLI = ST->getTargetLowering(); + DL = &F.getParent()->getDataLayout(); + LI = &getAnalysis().getLoopInfo(); + + SmallVector Gathers; + SmallVector Scatters; + + bool Changed = false; + + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + // Do an initial optimization pass to push out as much address arithmetic + // as possible to get a more canonical IR. 
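+      // Only fixed-vector gathers/scatters are collected as candidates, e.g.
+      // the @llvm.masked.gather.v32i8.v32p0i8 calls in the tests below.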
+ IntrinsicInst *II = dyn_cast(&I); + if (II && II->getIntrinsicID() == Intrinsic::masked_gather && + isa(II->getType())) { + Gathers.push_back(II); + } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter && + isa(II->getArgOperand(0)->getType())) { + Scatters.push_back(II); + } + } + } + + // Rewrite gather/scatter to form strided load/store if possible. + for (auto *II : Gathers) + Changed |= tryCreateStridedLoadStore( + II, II->getType(), II->getArgOperand(0), II->getArgOperand(1)); + for (auto *II : Scatters) + Changed |= + tryCreateStridedLoadStore(II, II->getArgOperand(0)->getType(), + II->getArgOperand(1), II->getArgOperand(2)); + + // Remove any dead phis. + while (!MaybeDeadPHIs.empty()) { + if (auto *Phi = dyn_cast_or_null(MaybeDeadPHIs.pop_back_val())) + RecursivelyDeleteDeadPHINode(Phi); + } + + return Changed; +} diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -553,6 +553,7 @@ SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVectorMaskVECREDUCE(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -429,6 +429,7 @@ } setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); static unsigned IntegerVPOps[] = { ISD::VP_ADD, ISD::VP_SUB, ISD::VP_MUL, ISD::VP_SDIV, ISD::VP_UDIV, @@ -914,6 +915,23 @@ MachineMemOperand::MOVolatile; return true; } + case Intrinsic::riscv_masked_strided_load: + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.ptrVal = nullptr; + Info.memVT = MVT::getVT(I.getType()->getScalarType()); + Info.align = Align(I.getType()->getScalarSizeInBits() / 8); + Info.size = ~UINT64_C(0); + Info.flags |= MachineMemOperand::MOLoad; + return true; + case Intrinsic::riscv_masked_strided_store: + Info.opc = ISD::INTRINSIC_VOID; + Info.ptrVal = nullptr; + Info.memVT = MVT::getVT(I.getArgOperand(0)->getType()->getScalarType()); + Info.align = + Align(I.getArgOperand(0)->getType()->getScalarSizeInBits() / 8); + Info.size = ~UINT64_C(0); + Info.flags |= MachineMemOperand::MOStore; + return true; } } @@ -2240,6 +2258,8 @@ return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); + case ISD::INTRINSIC_VOID: + return LowerINTRINSIC_VOID(Op, DAG); case ISD::BSWAP: case ISD::BITREVERSE: { // Convert BSWAP/BITREVERSE to GREVI to enable GREVI combinining. @@ -3787,9 +3807,109 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { + unsigned IntNo = Op.getConstantOperandVal(1); + switch (IntNo) { + default: + break; + case Intrinsic::riscv_masked_strided_load: { + SDLoc DL(Op); + MVT XLenVT = Subtarget.getXLenVT(); + + // If the mask is known to be all ones, optimize to an unmasked intrinsic; + // the selection of the masked intrinsics doesn't do this for us. 
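+    // For example, the all-ones mask in the gather() test selects a plain
+    // vlse8.v, while gather_masked keeps the masked form (vlse8.v ..., v0.t)
+    // with the merge operand passed through.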
+ SDValue Mask = Op.getOperand(5); + bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode()); + + MVT VT = Op->getSimpleValueType(0); + MVT ContainerVT = getContainerForFixedLengthVector(VT); + + SDValue PassThru = Op.getOperand(2); + if (!IsUnmasked) { + MVT MaskVT = + MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + PassThru = convertToScalableVector(ContainerVT, PassThru, DAG, Subtarget); + } + + SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + + SDValue IntID = DAG.getTargetConstant( + IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL, + XLenVT); + + auto *Load = cast(Op); + SmallVector Ops{Load->getChain(), IntID}; + if (!IsUnmasked) + Ops.push_back(PassThru); + Ops.push_back(Op.getOperand(3)); // Ptr + Ops.push_back(Op.getOperand(4)); // Stride + if (!IsUnmasked) + Ops.push_back(Mask); + Ops.push_back(VL); + + SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other}); + SDValue Result = + DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops, + Load->getMemoryVT(), Load->getMemOperand()); + SDValue Chain = Result.getValue(1); + Result = convertFromScalableVector(VT, Result, DAG, Subtarget); + return DAG.getMergeValues({Result, Chain}, DL); + } + } + return lowerVectorIntrinsicSplats(Op, DAG, Subtarget); } +SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntNo = Op.getConstantOperandVal(1); + switch (IntNo) { + default: + break; + case Intrinsic::riscv_masked_strided_store: { + SDLoc DL(Op); + MVT XLenVT = Subtarget.getXLenVT(); + + // If the mask is known to be all ones, optimize to an unmasked intrinsic; + // the selection of the masked intrinsics doesn't do this for us. + SDValue Mask = Op.getOperand(5); + bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode()); + + SDValue Val = Op.getOperand(2); + MVT VT = Val.getSimpleValueType(); + MVT ContainerVT = getContainerForFixedLengthVector(VT); + + Val = convertToScalableVector(ContainerVT, Val, DAG, Subtarget); + if (!IsUnmasked) { + MVT MaskVT = + MVT::getVectorVT(MVT::i1, ContainerVT.getVectorElementCount()); + Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget); + } + + SDValue VL = DAG.getConstant(VT.getVectorNumElements(), DL, XLenVT); + + SDValue IntID = DAG.getTargetConstant( + IsUnmasked ? 
Intrinsic::riscv_vsse : Intrinsic::riscv_vsse_mask, DL, + XLenVT); + + auto *Store = cast(Op); + SmallVector Ops{Store->getChain(), IntID}; + Ops.push_back(Val); + Ops.push_back(Op.getOperand(3)); // Ptr + Ops.push_back(Op.getOperand(4)); // Stride + if (!IsUnmasked) + Ops.push_back(Mask); + Ops.push_back(VL); + + return DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, Store->getVTList(), + Ops, Store->getMemoryVT(), + Store->getMemOperand()); + } + } + + return SDValue(); +} + static MVT getLMUL1VT(MVT VT) { assert(VT.getVectorElementType().getSizeInBits() <= 64 && "Unexpected vector MVT"); diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp --- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp @@ -37,6 +37,7 @@ RegisterTargetMachine Y(getTheRISCV64Target()); auto *PR = PassRegistry::getPassRegistry(); initializeGlobalISel(*PR); + initializeRISCVGatherScatterLoweringPass(*PR); initializeRISCVMergeBaseOffsetOptPass(*PR); initializeRISCVExpandPseudoPass(*PR); initializeRISCVInsertVSETVLIPass(*PR); @@ -149,6 +150,9 @@ void RISCVPassConfig::addIRPasses() { addPass(createAtomicExpandPass()); + + addPass(createRISCVGatherScatterLoweringPass()); + TargetPassConfig::addIRPasses(); } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-negative.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-negative.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-negative.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=256 | FileCheck %s + +; This contains negative tests for the strided load/store recognition in +; RISCVGatherScatterLowering.cpp + +; Negative test for treating OR as ADD. 
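+; The pass only treats an OR in the index computation as an ADD when
+; haveNoCommonBitsSet can prove the operands share no set bits; here it
+; cannot, so the masked.gather below must survive unchanged.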
+define void @gather_bad_or(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_bad_or( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[I:%.*]] = mul nuw nsw <32 x i64> [[VEC_IND]], +; CHECK-NEXT: [[OR:%.*]] = or <32 x i64> [[I]], +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], <32 x i64> [[OR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> [[I1]], i32 1, <32 x i1> , <32 x i8> undef) +; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[I3:%.*]] = bitcast i8* [[I2]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[I3]], align 1 +; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[I5:%.*]] = bitcast i8* [[I2]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[I4]], <32 x i8>* [[I5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], +; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %i = mul nuw nsw <32 x i64> %vec.ind, + %or = or <32 x i64> %i, + %i1 = getelementptr inbounds i8, i8* %B, <32 x i64> %or + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %i1, i32 1, <32 x i1> , <32 x i8> undef) + %i2 = getelementptr inbounds i8, i8* %A, i64 %index + %i3 = bitcast i8* %i2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %i3, align 1 + %i4 = add <32 x i8> %wide.load, %wide.masked.gather + %i5 = bitcast i8* %i2 to <32 x i8>* + store <32 x i8> %i4, <32 x i8>* %i5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %i6 = icmp eq i64 %index.next, 1024 + br i1 %i6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; Don't transform since we might not handle wrap correctly with narrow indices. 
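+; The <32 x i32> index is narrower than the 64-bit pointer type, so
+; determineBaseAndStride gives up before any recurrence matching.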
+define void @gather_narrow_index(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_narrow_index( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i32> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = mul nuw nsw <32 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], <32 x i32> [[TMP0]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> [[TMP1]], i32 1, <32 x i1> , <32 x i8> undef) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[TMP2]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[TMP2]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[TMP4]], <32 x i8>* [[TMP5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i32> [[VEC_IND]], +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i32> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i32> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i32> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i32> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; The last element of the start value of the phi has the wrong stride. 
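+; matchStridedConstant requires every element-to-element difference in the
+; start vector to be equal, so this phi is rejected and the masked.gather
+; remains.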
+define void @gather_broken_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_broken_stride( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <32 x i64> [ , [[ENTRY]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[I:%.*]] = mul nuw nsw <32 x i64> [[VEC_IND]], +; CHECK-NEXT: [[OR:%.*]] = or <32 x i64> [[I]], +; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], <32 x i64> [[OR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> [[I1]], i32 1, <32 x i1> , <32 x i8> undef) +; CHECK-NEXT: [[I2:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[I3:%.*]] = bitcast i8* [[I2]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[I3]], align 1 +; CHECK-NEXT: [[I4:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[I5:%.*]] = bitcast i8* [[I2]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[I4]], <32 x i8>* [[I5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <32 x i64> [[VEC_IND]], +; CHECK-NEXT: [[I6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[I6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %i = mul nuw nsw <32 x i64> %vec.ind, + %or = or <32 x i64> %i, + %i1 = getelementptr inbounds i8, i8* %B, <32 x i64> %or + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %i1, i32 1, <32 x i1> , <32 x i8> undef) + %i2 = getelementptr inbounds i8, i8* %A, i64 %index + %i3 = bitcast i8* %i2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %i3, align 1 + %i4 = add <32 x i8> %wide.load, %wide.masked.gather + %i5 = bitcast i8* %i2 to <32 x i8>* + store <32 x i8> %i4, <32 x i8>* %i5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %i6 = icmp eq i64 %index.next, 1024 + br i1 %i6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32 immarg, <32 x i1>, <32 x i8>) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll @@ -0,0 +1,831 @@ +; RUN: opt %s -S -riscv-gather-scatter-lowering -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=256 | FileCheck %s +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+experimental-v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefix=CHECK-ASM + +%struct.foo = type { i32, i32, i32, i32 } + +; void gather(signed char * __restrict A, signed char * __restrict B) { +; for (int i = 0; i != 1024; ++i) +; A[i] += B[i * 5]; +; } +define void @gather(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] 
= phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP0]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: gather: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: addi a6, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, 5 +; CHECK-ASM-NEXT: addi a5, zero, 1024 +; CHECK-ASM-NEXT: .LBB0_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: vsetvli zero, a6, e8, m1, ta, mu +; CHECK-ASM-NEXT: vlse8.v v25, (a1), a4 +; CHECK-ASM-NEXT: add a3, a0, a2 +; CHECK-ASM-NEXT: vle8.v v26, (a3) +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vse8.v v25, (a3) +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a1, a1, 160 +; CHECK-ASM-NEXT: bne a2, a5, .LBB0_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @gather_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) { +; CHECK-LABEL: @gather_masked( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> [[MASKEDOFF:%.*]], 
i8* [[TMP0]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: gather_masked: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: lui a3, 983765 +; CHECK-ASM-NEXT: addiw a3, a3, 873 +; CHECK-ASM-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-ASM-NEXT: vmv.s.x v0, a3 +; CHECK-ASM-NEXT: addi a6, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, 5 +; CHECK-ASM-NEXT: addi a5, zero, 1024 +; CHECK-ASM-NEXT: .LBB1_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: vsetvli zero, a6, e8, m1, tu, mu +; CHECK-ASM-NEXT: vmv1r.v v25, v8 +; CHECK-ASM-NEXT: vlse8.v v25, (a1), a4, v0.t +; CHECK-ASM-NEXT: add a3, a0, a2 +; CHECK-ASM-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; CHECK-ASM-NEXT: vle8.v v26, (a3) +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vse8.v v25, (a3) +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a1, a1, 160 +; CHECK-ASM-NEXT: bne a2, a5, .LBB1_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> %maskedoff) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @gather_negative_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_negative_stride( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 155, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP0]], i64 -5, <32 x i1> ) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: 
[[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: gather_negative_stride: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: addi a1, a1, 155 +; CHECK-ASM-NEXT: addi a6, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, -5 +; CHECK-ASM-NEXT: addi a5, zero, 1024 +; CHECK-ASM-NEXT: .LBB2_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: vsetvli zero, a6, e8, m1, ta, mu +; CHECK-ASM-NEXT: vlse8.v v25, (a1), a4 +; CHECK-ASM-NEXT: add a3, a0, a2 +; CHECK-ASM-NEXT: vle8.v v26, (a3) +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vse8.v v25, (a3) +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a1, a1, 160 +; CHECK-ASM-NEXT: bne a2, a5, .LBB2_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @gather_zero_stride(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_zero_stride( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP0]], i64 0, <32 x i1> ) +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP2]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = add <32 x i8> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[TMP1]] to <32 x i8>* +; CHECK-NEXT: store <32 x i8> [[TMP3]], <32 x i8>* [[TMP4]], align 1 +; CHECK-NEXT: 
[[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: gather_zero_stride: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: addi a3, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, 1024 +; CHECK-ASM-NEXT: .LBB3_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; CHECK-ASM-NEXT: vlse8.v v25, (a1), zero +; CHECK-ASM-NEXT: add a5, a0, a2 +; CHECK-ASM-NEXT: vle8.v v26, (a5) +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vse8.v v25, (a5) +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a1, a1, 160 +; CHECK-ASM-NEXT: bne a2, a4, .LBB3_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <32 x i64> %vec.ind, + %1 = getelementptr inbounds i8, i8* %B, <32 x i64> %0 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %1, i32 1, <32 x i1> , <32 x i8> undef) + %2 = getelementptr inbounds i8, i8* %A, i64 %index + %3 = bitcast i8* %2 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %3, align 1 + %4 = add <32 x i8> %wide.load, %wide.masked.gather + %5 = bitcast i8* %2 to <32 x i8>* + store <32 x i8> %4, <32 x i8>* %5, align 1 + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +;void scatter(signed char * __restrict A, signed char * __restrict B) { +; for (int i = 0; i < 1024; ++i) +; A[i * 5] += B[i]; +;} +define void @scatter(i8* noalias nocapture %A, i8* noalias nocapture readonly %B) { +; CHECK-LABEL: @scatter( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[A:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[A]], i64 [[VEC_IND_SCALAR1]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> undef, i8* [[TMP2]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]] +; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v32i8.p0i8.i64(<32 x i8> [[TMP4]], i8* [[TMP3]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; 
CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: scatter: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: addi a6, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, 5 +; CHECK-ASM-NEXT: addi a5, zero, 1024 +; CHECK-ASM-NEXT: .LBB4_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: add a3, a1, a2 +; CHECK-ASM-NEXT: vsetvli zero, a6, e8, m1, ta, mu +; CHECK-ASM-NEXT: vle8.v v25, (a3) +; CHECK-ASM-NEXT: vlse8.v v26, (a0), a4 +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vsse8.v v25, (a0), a4 +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a0, a0, 160 +; CHECK-ASM-NEXT: bne a2, a5, .LBB4_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %B, i64 %index + %1 = bitcast i8* %0 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %1, align 1 + %2 = mul nuw nsw <32 x i64> %vec.ind, + %3 = getelementptr inbounds i8, i8* %A, <32 x i64> %2 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %3, i32 1, <32 x i1> , <32 x i8> undef) + %4 = add <32 x i8> %wide.masked.gather, %wide.load + call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %4, <32 x i8*> %3, i32 1, <32 x i1> ) + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %5 = icmp eq i64 %index.next, 1024 + br i1 %5, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +define void @scatter_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) { +; CHECK-LABEL: @scatter_masked( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <32 x i8>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, <32 x i8>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, i8* [[A:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, i8* [[A]], i64 [[VEC_IND_SCALAR1]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <32 x i8> @llvm.riscv.masked.strided.load.v32i8.p0i8.i64(<32 x i8> [[MASKEDOFF:%.*]], i8* [[TMP2]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[TMP4:%.*]] = add <32 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]] +; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v32i8.p0i8.i64(<32 x i8> [[TMP4]], i8* [[TMP3]], i64 5, <32 x i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 160 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 160 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP5]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: scatter_masked: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: mv a2, zero +; CHECK-ASM-NEXT: addi a6, zero, 32 +; CHECK-ASM-NEXT: lui a4, 983765 +; CHECK-ASM-NEXT: addiw a4, a4, 873 +; CHECK-ASM-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-ASM-NEXT: vmv.s.x v0, a4 +; CHECK-ASM-NEXT: addi a4, zero, 5 +; CHECK-ASM-NEXT: addi a5, zero, 1024 +; CHECK-ASM-NEXT: .LBB5_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: add a3, a1, a2 +; CHECK-ASM-NEXT: vsetvli zero, a6, e8, m1, ta, mu +; CHECK-ASM-NEXT: vle8.v v25, (a3) +; CHECK-ASM-NEXT: vsetvli zero, zero, e8, m1, tu, mu +; CHECK-ASM-NEXT: vmv1r.v v26, v8 +; CHECK-ASM-NEXT: vlse8.v v26, (a0), a4, v0.t +; CHECK-ASM-NEXT: vsetvli zero, zero, e8, m1, ta, mu +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vsse8.v v25, (a0), a4, v0.t +; CHECK-ASM-NEXT: addi a2, a2, 32 +; CHECK-ASM-NEXT: addi a0, a0, 160 +; CHECK-ASM-NEXT: bne a2, a5, .LBB5_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <32 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %B, i64 %index + %1 = bitcast i8* %0 to <32 x i8>* + %wide.load = load <32 x i8>, <32 x i8>* %1, align 1 + %2 = mul nuw nsw <32 x i64> %vec.ind, + %3 = getelementptr inbounds i8, i8* %A, <32 x i64> %2 + %wide.masked.gather = call <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*> %3, i32 1, <32 x i1> , <32 x i8> %maskedoff) + %4 = add <32 x i8> %wide.masked.gather, %wide.load + call void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8> %4, <32 x i8*> %3, i32 1, <32 x i1> ) + %index.next = add nuw i64 %index, 32 + %vec.ind.next = add <32 x i64> %vec.ind, + %5 = icmp eq i64 %index.next, 1024 + br i1 %5, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +; void gather_pow2(signed char * __restrict A, signed char * __restrict B) { +; for (int i = 0; i != 1024; ++i) +; A[i] += B[i * 4]; +; } +define void @gather_pow2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_pow2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP1]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP3]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP2]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[TMP4]], <8 x i32>* [[TMP5]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 
[[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: gather_pow2: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: addi a2, zero, 1024 +; CHECK-ASM-NEXT: addi a3, zero, 16 +; CHECK-ASM-NEXT: addi a4, zero, 32 +; CHECK-ASM-NEXT: .LBB6_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; CHECK-ASM-NEXT: vlse32.v v25, (a1), a3 +; CHECK-ASM-NEXT: vsetvli zero, a4, e8, m1, ta, mu +; CHECK-ASM-NEXT: vle8.v v26, (a0) +; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vsetvli zero, a4, e8, m1, ta, mu +; CHECK-ASM-NEXT: vse8.v v25, (a0) +; CHECK-ASM-NEXT: addi a2, a2, -8 +; CHECK-ASM-NEXT: addi a0, a0, 32 +; CHECK-ASM-NEXT: addi a1, a1, 128 +; CHECK-ASM-NEXT: bnez a2, .LBB6_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = shl nsw <8 x i64> %vec.ind, + %1 = getelementptr inbounds i32, i32* %B, <8 x i64> %0 + %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> , <8 x i32> undef) + %2 = getelementptr inbounds i32, i32* %A, i64 %index + %3 = bitcast i32* %2 to <8 x i32>* + %wide.load = load <8 x i32>, <8 x i32>* %3, align 1 + %4 = add <8 x i32> %wide.load, %wide.masked.gather + %5 = bitcast i32* %2 to <8 x i32>* + store <8 x i32> %4, <8 x i32>* %5, align 1 + %index.next = add nuw i64 %index, 8 + %vec.ind.next = add <8 x i64> %vec.ind, + %6 = icmp eq i64 %index.next, 1024 + br i1 %6, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +;void scatter_pow2(signed char * __restrict A, signed char * __restrict B) { +; for (int i = 0; i < 1024; ++i) +; A[i * 4] += B[i]; +;} +define void @scatter_pow2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) { +; CHECK-LABEL: @scatter_pow2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to <8 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to i8* +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR1]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP3]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[WIDE_MASKED_GATHER]], [[WIDE_LOAD]] +; CHECK-NEXT: call 
void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP6]], i8* [[TMP5]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 32 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP7]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: scatter_pow2: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: addi a2, zero, 1024 +; CHECK-ASM-NEXT: addi a3, zero, 32 +; CHECK-ASM-NEXT: addi a4, zero, 16 +; CHECK-ASM-NEXT: .LBB7_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: vsetvli zero, a3, e8, m1, ta, mu +; CHECK-ASM-NEXT: vle8.v v25, (a1) +; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; CHECK-ASM-NEXT: vlse32.v v26, (a0), a4 +; CHECK-ASM-NEXT: vadd.vv v25, v26, v25 +; CHECK-ASM-NEXT: vsse32.v v25, (a0), a4 +; CHECK-ASM-NEXT: addi a2, a2, -8 +; CHECK-ASM-NEXT: addi a1, a1, 32 +; CHECK-ASM-NEXT: addi a0, a0, 128 +; CHECK-ASM-NEXT: bnez a2, .LBB7_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %B, i64 %index + %1 = bitcast i32* %0 to <8 x i32>* + %wide.load = load <8 x i32>, <8 x i32>* %1, align 1 + %2 = shl nuw nsw <8 x i64> %vec.ind, + %3 = getelementptr inbounds i32, i32* %A, <8 x i64> %2 + %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %3, i32 4, <8 x i1> , <8 x i32> undef) + %4 = add <8 x i32> %wide.masked.gather, %wide.load + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %4, <8 x i32*> %3, i32 4, <8 x i1> ) + %index.next = add nuw i64 %index, 8 + %vec.ind.next = add <8 x i64> %vec.ind, + %5 = icmp eq i64 %index.next, 1024 + br i1 %5, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +;struct foo { +; int a, b, c, d; +;}; +; +;void struct_gather(int * __restrict A, struct foo * __restrict B) { +; for (int i = 0; i < 1024; ++i) +; A[i] += B[i].b; +;} +define void @struct_gather(i32* noalias nocapture %A, %struct.foo* noalias nocapture readonly %B) { +; CHECK-LABEL: @struct_gather( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr [[STRUCT_FOO:%.*]], %struct.foo* [[B:%.*]], i64 [[VEC_IND_SCALAR]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8* +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [[STRUCT_FOO]], %struct.foo* [[B]], i64 [[VEC_IND_SCALAR1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP1]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[WIDE_MASKED_GATHER9:%.*]] = call <8 x i32> 
@llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP3]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i64 8 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <8 x i32>, <8 x i32>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_LOAD]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <8 x i32> [[WIDE_LOAD10]], [[WIDE_MASKED_GATHER9]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[TMP8]], <8 x i32>* [[TMP10]], align 4 +; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>* +; CHECK-NEXT: store <8 x i32> [[TMP9]], <8 x i32>* [[TMP11]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 16 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP12]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; +; CHECK-ASM-LABEL: struct_gather: +; CHECK-ASM: # %bb.0: # %entry +; CHECK-ASM-NEXT: addi a0, a0, 32 +; CHECK-ASM-NEXT: addi a1, a1, 132 +; CHECK-ASM-NEXT: addi a2, zero, 1024 +; CHECK-ASM-NEXT: addi a3, zero, 16 +; CHECK-ASM-NEXT: .LBB8_1: # %vector.body +; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-ASM-NEXT: addi a4, a1, -128 +; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu +; CHECK-ASM-NEXT: vlse32.v v25, (a4), a3 +; CHECK-ASM-NEXT: vlse32.v v26, (a1), a3 +; CHECK-ASM-NEXT: addi a4, a0, -32 +; CHECK-ASM-NEXT: vle32.v v27, (a4) +; CHECK-ASM-NEXT: vle32.v v28, (a0) +; CHECK-ASM-NEXT: vadd.vv v25, v27, v25 +; CHECK-ASM-NEXT: vadd.vv v26, v28, v26 +; CHECK-ASM-NEXT: vse32.v v25, (a4) +; CHECK-ASM-NEXT: vse32.v v26, (a0) +; CHECK-ASM-NEXT: addi a2, a2, -16 +; CHECK-ASM-NEXT: addi a0, a0, 64 +; CHECK-ASM-NEXT: addi a1, a1, 256 +; CHECK-ASM-NEXT: bnez a2, .LBB8_1 +; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup +; CHECK-ASM-NEXT: ret +entry: + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ] + %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ] + %step.add = add <8 x i64> %vec.ind, + %0 = getelementptr inbounds %struct.foo, %struct.foo* %B, <8 x i64> %vec.ind, i32 1 + %1 = getelementptr inbounds %struct.foo, %struct.foo* %B, <8 x i64> %step.add, i32 1 + %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %0, i32 4, <8 x i1> , <8 x i32> undef) + %wide.masked.gather9 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> , <8 x i32> undef) + %2 = getelementptr inbounds i32, i32* %A, i64 %index + %3 = bitcast i32* %2 to <8 x i32>* + %wide.load = load <8 x i32>, <8 x i32>* %3, align 4 + %4 = getelementptr inbounds i32, i32* %2, i64 8 + %5 = bitcast i32* %4 to <8 x i32>* + %wide.load10 = load <8 x i32>, <8 x i32>* %5, align 4 + %6 = add nsw <8 x i32> %wide.load, %wide.masked.gather + %7 = add nsw <8 x i32> %wide.load10, %wide.masked.gather9 + %8 = bitcast i32* %2 to <8 x i32>* + store <8 x i32> %6, <8 x i32>* %8, align 4 + %9 = 
bitcast i32* %4 to <8 x i32>* + store <8 x i32> %7, <8 x i32>* %9, align 4 + %index.next = add nuw i64 %index, 16 + %vec.ind.next = add <8 x i64> %vec.ind, + %10 = icmp eq i64 %index.next, 1024 + br i1 %10, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body + ret void +} + +;void gather_unroll(int * __restrict A, int * __restrict B) { +; for (int i = 0; i < 1024; i+= 4 ) { +; A[i] += B[i * 4]; +; A[i+1] += B[(i+1) * 4]; +; A[i+2] += B[(i+2) * 4]; +; A[i+3] += B[(i+3) * 4]; +; } +;} +define void @gather_unroll(i32* noalias nocapture %A, i32* noalias nocapture readonly %B) { +; CHECK-LABEL: @gather_unroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR1:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR3:%.*]] = phi i64 [ 4, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR4:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR5:%.*]] = phi i64 [ 1, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR7:%.*]] = phi i64 [ 8, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR8:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR9:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR10:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR11:%.*]] = phi i64 [ 12, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR12:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR13:%.*]] = phi i64 [ 3, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR14:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR15:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR17:%.*]] = phi i64 [ 1, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR19:%.*]] = phi i64 [ 2, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR20:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND_SCALAR21:%.*]] = phi i64 [ 3, [[ENTRY]] ], [ [[VEC_IND_NEXT_SCALAR22:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[VEC_IND_SCALAR]] +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32* [[TMP0]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP1]], i64 64, <8 x i1> ) +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[VEC_IND_SCALAR1]] +; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to i8* +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR15]] +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER52:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP3]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER52]], [[WIDE_MASKED_GATHER]] +; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP6]], i8* [[TMP5]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[B]], i64 [[VEC_IND_SCALAR3]] +; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER53:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP8]], i64 64, <8 x i1> ) +; CHECK-NEXT: 
[[TMP9:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR5]] +; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8* +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR17]] +; CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[TMP11]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER54:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP10]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER54]], [[WIDE_MASKED_GATHER53]] +; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP13]], i8* [[TMP12]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, i32* [[B]], i64 [[VEC_IND_SCALAR7]] +; CHECK-NEXT: [[TMP15:%.*]] = bitcast i32* [[TMP14]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER55:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP15]], i64 64, <8 x i1> ) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR9]] +; CHECK-NEXT: [[TMP17:%.*]] = bitcast i32* [[TMP16]] to i8* +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR19]] +; CHECK-NEXT: [[TMP19:%.*]] = bitcast i32* [[TMP18]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER56:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP17]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[TMP20:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER56]], [[WIDE_MASKED_GATHER55]] +; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP20]], i8* [[TMP19]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i32, i32* [[B]], i64 [[VEC_IND_SCALAR11]] +; CHECK-NEXT: [[TMP22:%.*]] = bitcast i32* [[TMP21]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER57:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP22]], i64 64, <8 x i1> ) +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR13]] +; CHECK-NEXT: [[TMP24:%.*]] = bitcast i32* [[TMP23]] to i8* +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i32, i32* [[A]], i64 [[VEC_IND_SCALAR21]] +; CHECK-NEXT: [[TMP26:%.*]] = bitcast i32* [[TMP25]] to i8* +; CHECK-NEXT: [[WIDE_MASKED_GATHER58:%.*]] = call <8 x i32> @llvm.riscv.masked.strided.load.v8i32.p0i8.i64(<8 x i32> undef, i8* [[TMP24]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[TMP27:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_GATHER58]], [[WIDE_MASKED_GATHER57]] +; CHECK-NEXT: call void @llvm.riscv.masked.strided.store.v8i32.p0i8.i64(<8 x i32> [[TMP27]], i8* [[TMP26]], i64 16, <8 x i1> ) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR]] = add i64 [[VEC_IND_SCALAR]], 128 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR2]] = add i64 [[VEC_IND_SCALAR1]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR4]] = add i64 [[VEC_IND_SCALAR3]], 128 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR6]] = add i64 [[VEC_IND_SCALAR5]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR8]] = add i64 [[VEC_IND_SCALAR7]], 128 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR10]] = add i64 [[VEC_IND_SCALAR9]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR12]] = add i64 [[VEC_IND_SCALAR11]], 128 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR14]] = add i64 [[VEC_IND_SCALAR13]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR16]] = add i64 [[VEC_IND_SCALAR15]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR18]] = add i64 [[VEC_IND_SCALAR17]], 32 +; CHECK-NEXT: [[VEC_IND_NEXT_SCALAR20]] = add i64 [[VEC_IND_SCALAR19]], 32 +; CHECK-NEXT: 
[[VEC_IND_NEXT_SCALAR22]] = add i64 [[VEC_IND_SCALAR21]], 32
+; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
+; CHECK-NEXT: br i1 [[TMP28]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
+; CHECK: for.cond.cleanup:
+; CHECK-NEXT: ret void
+;
+; CHECK-ASM-LABEL: gather_unroll:
+; CHECK-ASM: # %bb.0: # %entry
+; CHECK-ASM-NEXT: addi a2, zero, 256
+; CHECK-ASM-NEXT: addi a3, zero, 64
+; CHECK-ASM-NEXT: addi a4, zero, 16
+; CHECK-ASM-NEXT: .LBB9_1: # %vector.body
+; CHECK-ASM-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-ASM-NEXT: vsetivli zero, 8, e32, m1, ta, mu
+; CHECK-ASM-NEXT: vlse32.v v25, (a1), a3
+; CHECK-ASM-NEXT: vlse32.v v26, (a0), a4
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsse32.v v25, (a0), a4
+; CHECK-ASM-NEXT: addi a5, a1, 16
+; CHECK-ASM-NEXT: vlse32.v v25, (a5), a3
+; CHECK-ASM-NEXT: addi a5, a0, 4
+; CHECK-ASM-NEXT: vlse32.v v26, (a5), a4
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsse32.v v25, (a5), a4
+; CHECK-ASM-NEXT: addi a5, a1, 32
+; CHECK-ASM-NEXT: vlse32.v v25, (a5), a3
+; CHECK-ASM-NEXT: addi a5, a0, 8
+; CHECK-ASM-NEXT: vlse32.v v26, (a5), a4
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsse32.v v25, (a5), a4
+; CHECK-ASM-NEXT: addi a5, a1, 48
+; CHECK-ASM-NEXT: vlse32.v v25, (a5), a3
+; CHECK-ASM-NEXT: addi a5, a0, 12
+; CHECK-ASM-NEXT: vlse32.v v26, (a5), a4
+; CHECK-ASM-NEXT: vadd.vv v25, v26, v25
+; CHECK-ASM-NEXT: vsse32.v v25, (a5), a4
+; CHECK-ASM-NEXT: addi a2, a2, -8
+; CHECK-ASM-NEXT: addi a1, a1, 512
+; CHECK-ASM-NEXT: addi a0, a0, 128
+; CHECK-ASM-NEXT: bnez a2, .LBB9_1
+; CHECK-ASM-NEXT: # %bb.2: # %for.cond.cleanup
+; CHECK-ASM-NEXT: ret
+entry:
+  br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.ind = phi <8 x i64> [ , %entry ], [ %vec.ind.next, %vector.body ]
+  %0 = shl nuw nsw <8 x i64> %vec.ind, 
+  %1 = getelementptr inbounds i32, i32* %B, <8 x i64> %0
+  %wide.masked.gather = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %1, i32 4, <8 x i1> , <8 x i32> undef)
+  %2 = getelementptr inbounds i32, i32* %A, <8 x i64> %vec.ind
+  %wide.masked.gather52 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %2, i32 4, <8 x i1> , <8 x i32> undef)
+  %3 = add nsw <8 x i32> %wide.masked.gather52, %wide.masked.gather
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %3, <8 x i32*> %2, i32 4, <8 x i1> )
+  %4 = or <8 x i64> %vec.ind, 
+  %5 = shl nsw <8 x i64> %4, 
+  %6 = getelementptr inbounds i32, i32* %B, <8 x i64> %5
+  %wide.masked.gather53 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %6, i32 4, <8 x i1> , <8 x i32> undef)
+  %7 = getelementptr inbounds i32, i32* %A, <8 x i64> %4
+  %wide.masked.gather54 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %7, i32 4, <8 x i1> , <8 x i32> undef)
+  %8 = add nsw <8 x i32> %wide.masked.gather54, %wide.masked.gather53
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %8, <8 x i32*> %7, i32 4, <8 x i1> )
+  %9 = or <8 x i64> %vec.ind, 
+  %10 = shl nsw <8 x i64> %9, 
+  %11 = getelementptr inbounds i32, i32* %B, <8 x i64> %10
+  %wide.masked.gather55 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %11, i32 4, <8 x i1> , <8 x i32> undef)
+  %12 = getelementptr inbounds i32, i32* %A, <8 x i64> %9
+  %wide.masked.gather56 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %12, i32 4, <8 x i1> , <8 x i32> undef)
+  %13 = add nsw <8 x i32> %wide.masked.gather56, %wide.masked.gather55
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %13, <8 x i32*> %12, i32 4, <8 x i1> )
+  %14 = or <8 x i64> %vec.ind, 
+  %15 = shl nsw <8 x i64> %14, 
+  %16 = getelementptr inbounds i32, i32* %B, <8 x i64> %15
+  %wide.masked.gather57 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %16, i32 4, <8 x i1> , <8 x i32> undef)
+  %17 = getelementptr inbounds i32, i32* %A, <8 x i64> %14
+  %wide.masked.gather58 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %17, i32 4, <8 x i1> , <8 x i32> undef)
+  %18 = add nsw <8 x i32> %wide.masked.gather58, %wide.masked.gather57
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %18, <8 x i32*> %17, i32 4, <8 x i1> )
+  %index.next = add nuw i64 %index, 8
+  %vec.ind.next = add <8 x i64> %vec.ind, 
+  %19 = icmp eq i64 %index.next, 256
+  br i1 %19, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup: ; preds = %vector.body
+  ret void
+}
+
+declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32 immarg, <32 x i1>, <32 x i8>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32 immarg, <8 x i1>, <8 x i32>)
+declare void @llvm.masked.scatter.v32i8.v32p0i8(<32 x i8>, <32 x i8*>, i32 immarg, <32 x i1>)
+declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32 immarg, <8 x i1>)
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -507,9 +507,10 @@
   return false;
   std::vector<StringRef> PassNamePrefix = {
-      "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", "nvptx-",
-      "mips-", "lanai-", "hexagon-", "bpf-", "avr-", "thumb2-", "arm-",
-      "si-", "gcn-", "amdgpu-", "aarch64-", "amdgcn-", "polly-"};
+      "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-",
+      "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-",
+      "thumb2-", "arm-", "si-", "gcn-", "amdgpu-", "aarch64-",
+      "amdgcn-", "polly-", "riscv-"};
   std::vector<StringRef> PassNameContain = {"ehprepare"};
   std::vector<StringRef> PassNameExact = {
       "safe-stack", "cost-model",
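
Note on wiring: the hunks above add the IR-level lowering pass, its fixed-vector tests, and a "riscv-" entry in opt's target pass-name prefix list, presumably so that "riscv-"-prefixed legacy passes such as riscv-gather-scatter-lowering can be invoked directly from opt and pinned to the legacy pass manager. For orientation only, below is a minimal sketch of how such a pass is typically hooked into the codegen pipeline; the RISCVTargetMachine/RISCVPassConfig change is not among the hunks shown here, so the exact placement is an assumption, not this patch's actual change.

  // Hypothetical sketch, assuming the usual TargetPassConfig pattern in
  // llvm/lib/Target/RISCV/RISCVTargetMachine.cpp: run the new IR pass
  // before instruction selection, and only when optimizing.
  void RISCVPassConfig::addIRPasses() {
    addPass(createAtomicExpandPass());
    if (getOptLevel() != CodeGenOpt::None)
      addPass(createRISCVGatherScatterLoweringPass()); // factory added by this patch
    TargetPassConfig::addIRPasses();
  }

With something along those lines in place, llc exercises the lowering as part of normal codegen, and the .ll tests above can also run the pass in isolation through opt once the "riscv-" prefix is recognized.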