diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -102,22 +102,11 @@
                                    Instruction *&Root, IRBuilder<> &Builder);
   // Create a gather from a vector of pointers
   Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
-                                   IRBuilder<> &Builder,
-                                   unsigned Increment = 0);
-  // Create a gather from a vector of pointers
+                                   IRBuilder<> &Builder, int64_t Increment = 0);
+  // Create an incrementing gather from a vector of pointers
   Value *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr,
                                      IRBuilder<> &Builder,
-                                     unsigned Increment = 0);
-  // QI gathers can increment their offsets on their own if the increment is
-  // a constant value (digit)
-  Value *tryCreateIncrementingGather(IntrinsicInst *I, Value *BasePtr,
-                                     Value *Ptr, GetElementPtrInst *GEP,
-                                     IRBuilder<> &Builder);
-  // QI gathers can increment their offsets on their own if the increment is
-  // a constant value (digit) - this creates a writeback QI gather
-  Value *tryCreateIncrementingWBGather(IntrinsicInst *I, Value *BasePtr,
-                                       Value *Ptr, unsigned TypeScale,
-                                       IRBuilder<> &Builder);
+                                     int64_t Increment = 0);
   Value *lowerScatter(IntrinsicInst *I);
   // Create a scatter to a base + vector of offsets
@@ -125,8 +114,24 @@
                                         IRBuilder<> &Builder);
   // Create a scatter to a vector of pointers
   Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
-                                    IRBuilder<> &Builder);
-
+                                    IRBuilder<> &Builder,
+                                    int64_t Increment = 0);
+  // Create an incrementing scatter from a vector of pointers
+  Value *tryCreateMaskedScatterBaseWB(IntrinsicInst *I, Value *Ptr,
+                                      IRBuilder<> &Builder,
+                                      int64_t Increment = 0);
+
+  // QI gathers and scatters can increment their offsets on their own if
+  // the increment is a constant value (digit)
+  Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *BasePtr,
+                                      Value *Ptr, GetElementPtrInst *GEP,
+                                      IRBuilder<> &Builder);
+  // QI gathers/scatters can increment their offsets on their own if the
+  // increment is a constant value (digit) - this creates a writeback QI
+  // gather/scatter
+  Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr,
+                                        Value *Ptr, unsigned TypeScale,
+                                        IRBuilder<> &Builder);
   // Check whether these offsets could be moved out of the loop they're in
   bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
   // Pushes the given add out of the loop
@@ -342,7 +347,7 @@
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I,
                                                            Value *Ptr,
                                                            IRBuilder<> &Builder,
-                                                           unsigned Increment) {
+                                                           int64_t Increment) {
   using namespace PatternMatch;
   auto *Ty = cast<VectorType>(I->getType());
   LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
@@ -362,7 +367,7 @@
 }
 
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB(
-    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, unsigned Increment) {
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
   using namespace PatternMatch;
   auto *Ty = cast<VectorType>(I->getType());
   LLVM_DEBUG(
@@ -426,8 +431,7 @@
     return nullptr;
   // Check whether the offset is a constant increment that could be merged into
   // a QI gather
-  Value *Load =
-      tryCreateIncrementingGather(I, BasePtr, Offsets, GEP, Builder);
+  Value *Load = tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
   if (Load)
     return Load;
 
@@ -453,19 +457,165 @@
        Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
 }
 
-Value *MVEGatherScatterLowering::tryCreateIncrementingGather(
+Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
+  using namespace PatternMatch;
+  LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
+
+  // @llvm.masked.scatter.*(data, ptrs, alignment, mask)
+  // Attempt to turn the masked scatter in I into a MVE intrinsic
+  // Potentially optimising the addressing modes as we do so.
+  Value *Input = I->getArgOperand(0);
+  Value *Ptr = I->getArgOperand(1);
+  unsigned Alignment = cast<ConstantInt>(I->getArgOperand(2))->getZExtValue();
+  auto *Ty = cast<VectorType>(Input->getType());
+
+  if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(),
+                               Alignment))
+    return nullptr;
+
+  lookThroughBitcast(Ptr);
+  assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
+
+  IRBuilder<> Builder(I->getContext());
+  Builder.SetInsertPoint(I);
+  Builder.SetCurrentDebugLocation(I->getDebugLoc());
+
+  Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+  if (!Store)
+    Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
+  if (!Store)
+    return nullptr;
+
+  LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
+  I->eraseFromParent();
+  return Store;
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
+  using namespace PatternMatch;
+  Value *Input = I->getArgOperand(0);
+  auto *Ty = cast<VectorType>(Input->getType());
+  // Only QR variants allow truncating
+  if (!(Ty->getNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) {
+    // Can't build an intrinsic for this
+    return nullptr;
+  }
+  Value *Mask = I->getArgOperand(3);
+  // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask)
+  LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n");
+  if (match(Mask, m_One()))
+    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base,
+                                   {Ptr->getType(), Input->getType()},
+                                   {Ptr, Builder.getInt32(Increment), Input});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_base_predicated,
+        {Ptr->getType(), Input->getType(), Mask->getType()},
+        {Ptr, Builder.getInt32(Increment), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBaseWB(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
+  using namespace PatternMatch;
+  Value *Input = I->getArgOperand(0);
+  auto *Ty = cast<VectorType>(Input->getType());
+  LLVM_DEBUG(
+      dbgs()
+      << "masked scatters: storing to a vector of pointers with writeback\n");
+  if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+    // Can't build an intrinsic for this
+    return nullptr;
+  Value *Mask = I->getArgOperand(3);
+  if (match(Mask, m_One()))
+    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base_wb,
+                                   {Ptr->getType(), Input->getType()},
+                                   {Ptr, Builder.getInt32(Increment), Input});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_base_wb_predicated,
+        {Ptr->getType(), Input->getType(), Mask->getType()},
+        {Ptr, Builder.getInt32(Increment), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
+  using namespace PatternMatch;
+  Value *Input = I->getArgOperand(0);
+  Value *Mask = I->getArgOperand(3);
+  Type *InputTy = Input->getType();
+  Type *MemoryTy = InputTy;
+  LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
+                    << " to base + vector of offsets\n");
+  // If the input has been truncated, try to integrate that trunc into the
+  // scatter instruction (we don't care about alignment here)
+  if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) {
+    Value *PreTrunc = Trunc->getOperand(0);
+    Type *PreTruncTy = PreTrunc->getType();
+    if (PreTruncTy->getPrimitiveSizeInBits() == 128) {
+      Input = PreTrunc;
+      InputTy = PreTruncTy;
+    }
+  }
+  if (InputTy->getPrimitiveSizeInBits() != 128) {
+    LLVM_DEBUG(
+        dbgs() << "masked scatters: cannot create scatters for non-standard"
+               << " input types. Expanding.\n");
+    return nullptr;
+  }
+
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  Value *Offsets;
+  Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder);
+  if (!BasePtr)
+    return nullptr;
+  // Check whether the offset is a constant increment that could be merged into
+  // a QI gather
+  Value *Store =
+      tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
+  if (Store)
+    return Store;
+  int Scale = computeScale(
+      BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+      MemoryTy->getScalarSizeInBits());
+  if (Scale == -1)
+    return nullptr;
+
+  if (!match(Mask, m_One()))
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_offset_predicated,
+        {BasePtr->getType(), Offsets->getType(), Input->getType(),
+         Mask->getType()},
+        {BasePtr, Offsets, Input,
+         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale), Mask});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_offset,
+        {BasePtr->getType(), Offsets->getType(), Input->getType()},
+        {BasePtr, Offsets, Input,
+         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale)});
+}
+
+Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
     IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP,
     IRBuilder<> &Builder) {
-  auto *Ty = cast<VectorType>(I->getType());
+  VectorType *Ty;
+  if (I->getIntrinsicID() == Intrinsic::masked_gather)
+    Ty = cast<VectorType>(I->getType());
+  else
+    Ty = cast<VectorType>(I->getArgOperand(0)->getType());
   // Incrementing gathers only exist for v4i32
-  if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+  if (Ty->getNumElements() != 4 ||
+      Ty->getScalarSizeInBits() != 32)
     return nullptr;
   Loop *L = LI->getLoopFor(I->getParent());
   if (L == nullptr)
     // Incrementing gathers are not beneficial outside of a loop
     return nullptr;
-  LLVM_DEBUG(
-      dbgs() << "masked gathers: trying to build incrementing wb gather\n");
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
+                       "wb gather/scatter\n");
 
   // The gep was in charge of making sure the offsets are scaled correctly
   // - calculate that factor so it can be applied by hand
@@ -482,12 +632,12 @@
     // change the phi which does affect other users of the gep (which will still
     // be using the phi in the old way)
     Value *Load =
-        tryCreateIncrementingWBGather(I, BasePtr, Offsets, TypeScale, Builder);
+        tryCreateIncrementingWBGatScat(I, BasePtr, Offsets, TypeScale, Builder);
     if (Load != nullptr)
       return Load;
   }
-  LLVM_DEBUG(
-      dbgs() << "masked gathers: trying to build incrementing non-wb gather\n");
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
+                       "non-wb gather/scatter\n");
 
   std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale);
   if (Add.first == nullptr)
@@ -510,11 +660,15 @@
               cast<VectorType>(ScaledOffsets->getType())->getElementType())),
       "StartIndex", I);
 
-  return cast<IntrinsicInst>(
-      tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate));
+  if (I->getIntrinsicID() == Intrinsic::masked_gather)
+    return cast<IntrinsicInst>(
+        tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate));
+  else
+    return cast<IntrinsicInst>(
+        tryCreateMaskedScatterBase(I, OffsetsIncoming, Builder, Immediate));
 }
 
-Value *MVEGatherScatterLowering::tryCreateIncrementingWBGather(
+Value *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat(
     IntrinsicInst *I, Value *BasePtr, Value *Offsets, unsigned TypeScale,
     IRBuilder<> &Builder) {
   // Check whether this gather's offset is incremented by a constant - if so,
@@ -574,132 +728,26 @@
 
   Builder.SetInsertPoint(I);
 
-  // Build the incrementing gather
-  Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate);
-
-  // One value to be handed to whoever uses the gather, one is the loop
-  // increment
-  Value *ExtractedLoad = Builder.CreateExtractValue(Load, 0, "Gather");
-  Value *Inc = Builder.CreateExtractValue(Load, 1, "GatherIncrement");
+  Value *EndResult;
+  Value *NewInduction;
+  if (I->getIntrinsicID() == Intrinsic::masked_gather) {
+    // Build the incrementing gather
+    Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate);
+    // One value to be handed to whoever uses the gather, one is the loop
+    // increment
+    EndResult = Builder.CreateExtractValue(Load, 0, "Gather");
+    NewInduction = Builder.CreateExtractValue(Load, 1, "GatherIncrement");
+  } else {
+    // Build the incrementing scatter
+    NewInduction = tryCreateMaskedScatterBaseWB(I, Phi, Builder, Immediate);
+    EndResult = NewInduction;
+  }
   Instruction *AddInst = cast<Instruction>(Offsets);
-  AddInst->replaceAllUsesWith(Inc);
+  AddInst->replaceAllUsesWith(NewInduction);
   AddInst->eraseFromParent();
-  Phi->setIncomingValue(IncrementIndex, Inc);
-
-  return ExtractedLoad;
-}
-
-Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
-  using namespace PatternMatch;
-  LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
-
-  // @llvm.masked.scatter.*(data, ptrs, alignment, mask)
-  // Attempt to turn the masked scatter in I into a MVE intrinsic
-  // Potentially optimising the addressing modes as we do so.
-  Value *Input = I->getArgOperand(0);
-  Value *Ptr = I->getArgOperand(1);
-  unsigned Alignment = cast<ConstantInt>(I->getArgOperand(2))->getZExtValue();
-  auto *Ty = cast<VectorType>(Input->getType());
-
-  if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(),
-                               Alignment))
-    return nullptr;
-
-  lookThroughBitcast(Ptr);
-  assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
-
-  IRBuilder<> Builder(I->getContext());
-  Builder.SetInsertPoint(I);
-  Builder.SetCurrentDebugLocation(I->getDebugLoc());
-
-  Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
-  if (!Store)
-    Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
-  if (!Store)
-    return nullptr;
-
-  LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
-  I->replaceAllUsesWith(Store);
-  I->eraseFromParent();
-  return Store;
-}
-
-Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
-    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
-  using namespace PatternMatch;
-  Value *Input = I->getArgOperand(0);
-  Value *Mask = I->getArgOperand(3);
-  auto *Ty = cast<VectorType>(Input->getType());
-  // Only QR variants allow truncating
-  if (!(Ty->getNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) {
-    // Can't build an intrinsic for this
-    return nullptr;
-  }
-  // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask)
-  LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n");
-  if (match(Mask, m_One()))
-    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base,
-                                   {Ptr->getType(), Input->getType()},
-                                   {Ptr, Builder.getInt32(0), Input});
-  else
-    return Builder.CreateIntrinsic(
-        Intrinsic::arm_mve_vstr_scatter_base_predicated,
-        {Ptr->getType(), Input->getType(), Mask->getType()},
-        {Ptr, Builder.getInt32(0), Input, Mask});
-}
-
-Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
-    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
-  using namespace PatternMatch;
-  Value *Input = I->getArgOperand(0);
-  Value *Mask = I->getArgOperand(3);
-  Type *InputTy = Input->getType();
-  Type *MemoryTy = InputTy;
-  LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
-                    << " to base + vector of offsets\n");
-  // If the input has been truncated, try to integrate that trunc into the
-  // scatter instruction (we don't care about alignment here)
-  if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) {
-    Value *PreTrunc = Trunc->getOperand(0);
-    Type *PreTruncTy = PreTrunc->getType();
-    if (PreTruncTy->getPrimitiveSizeInBits() == 128) {
-      Input = PreTrunc;
-      InputTy = PreTruncTy;
-    }
-  }
-  if (InputTy->getPrimitiveSizeInBits() != 128) {
-    LLVM_DEBUG(
-        dbgs() << "masked scatters: cannot create scatters for non-standard"
-               << " input types. Expanding.\n");
-    return nullptr;
-  }
-
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
-  Value *Offsets;
-  Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder);
-  if (!BasePtr)
-    return nullptr;
-  int Scale = computeScale(
-      BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
-      MemoryTy->getScalarSizeInBits());
-  if (Scale == -1)
-    return nullptr;
+  Phi->setIncomingValue(IncrementIndex, NewInduction);
 
-  if (!match(Mask, m_One()))
-    return Builder.CreateIntrinsic(
-        Intrinsic::arm_mve_vstr_scatter_offset_predicated,
-        {BasePtr->getType(), Offsets->getType(), Input->getType(),
-         Mask->getType()},
-        {BasePtr, Offsets, Input,
-         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
-         Builder.getInt32(Scale), Mask});
-  else
-    return Builder.CreateIntrinsic(
-        Intrinsic::arm_mve_vstr_scatter_offset,
-        {BasePtr->getType(), Offsets->getType(), Input->getType()},
-        {BasePtr, Offsets, Input,
-         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
-         Builder.getInt32(Scale)});
+  return EndResult;
 }
 
 void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi,
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -153,23 +153,22 @@
 ; CHECK-LABEL: push_out_mul_scatter:
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    adr r1, .LCPI3_0
-; CHECK-NEXT:    vmov.i32 q1, #0x18
-; CHECK-NEXT:    vldrw.u32 q2, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vstrw.32 q0, [r0, q2, uxtw #2]
-; CHECK-NEXT:    vadd.i32 q2, q2, q1
+; CHECK-NEXT:    vstrw.32 q0, [q1, #96]!
 ; CHECK-NEXT:    bne .LBB3_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI3_0:
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 6 @ 0x6
-; CHECK-NEXT:    .long 12 @ 0xc
-; CHECK-NEXT:    .long 18 @ 0x12
+; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
+; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
+; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
+; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
                                   i32* noalias nocapture %dst, i32 %n.vec,
                                   <4 x i32> %to.store) {
@@ -196,23 +195,22 @@
 ; CHECK-LABEL: push_out_add_scatter:
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    adr r1, .LCPI4_0
-; CHECK-NEXT:    vmov.i32 q2, #0x8
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
-; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vstrw.32 q0, [q1, #32]!
 ; CHECK-NEXT:    bne .LBB4_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI4_0:
-; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
+; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 10 @ 0xa
-; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 16 @ 0x10
                                   i32* noalias nocapture %dst, i32 %n.vec,
                                   <4 x i32> %to.store) {
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -0,0 +1,244 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -enable-arm-maskedgatscat %s -o - | FileCheck %s
+
+
+define arm_aapcs_vfpcc void @scatter_inc_minipred_4i32(<4 x i32> %data, i32* %dst, <4 x i32> %offs) {
+; CHECK-LABEL: scatter_inc_minipred_4i32:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    movw r1, #3855
+; CHECK-NEXT:    vmov.i32 q2, #0x4
+; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vstrwt.32 q0, [r0, q1, uxtw #2]
+; CHECK-NEXT:    bx lr
+  %1 = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>
+  %2 = getelementptr inbounds i32, i32* %dst, <4 x i32> %1
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data, <4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 false, i1 true, i1 false>)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @scatter_inc_mini_8i16(<8 x i16> %data, i16* %dst, <8 x i32> %offs) {
+; CHECK-LABEL: scatter_inc_mini_8i16:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vshl.i32 q1, q1, #1
+; CHECK-NEXT:    vmov.i32 q3, #0x10
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov.u16 r2, q0[0]
+; CHECK-NEXT:    vadd.i32 q4, q1, q3
+; CHECK-NEXT:    vshl.i32 q1, q2, #1
+; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vadd.i32 q1, q1, q3
+; CHECK-NEXT:    strh r2, [r1]
+; CHECK-NEXT:    vmov.u16 r1, q0[1]
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov.u16 r1, q0[2]
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    vmov.u16 r1, q0[3]
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.u16 r1, q0[4]
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.u16 r1, q0[5]
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.u16 r1, q0[6]
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.u16 r1, q0[7]
+; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+  %1 = add <8 x i32> %offs, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  %2 = getelementptr inbounds i16, i16* %dst, <8 x i32> %1
+  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %data, <8 x i16*> %2, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @scatter_inc_mini_16i8(<16 x i8> %data, i8* %dst, <16 x i32> %offs) {
+; CHECK-LABEL: scatter_inc_mini_16i8:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vmov.i32 q5, #0x10
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vadd.i32 q4, q1, q5
+; CHECK-NEXT:    vmov.u8 r2, q0[0]
+; CHECK-NEXT:    vmov r1, s16
+; CHECK-NEXT:    vadd.i32 q3, q3, r0
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    vadd.i32 q3, q3, q5
+; CHECK-NEXT:    vadd.i32 q2, q2, q5
+; CHECK-NEXT:    strb r2, [r1]
+; CHECK-NEXT:    add r1, sp, #32
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vmov.u8 r1, q0[1]
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r0, s17
+; CHECK-NEXT:    vadd.i32 q1, q1, q5
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s18
+; CHECK-NEXT:    vmov.u8 r1, q0[2]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s19
+; CHECK-NEXT:    vmov.u8 r1, q0[3]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s8
+; CHECK-NEXT:    vmov.u8 r1, q0[4]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s9
+; CHECK-NEXT:    vmov.u8 r1, q0[5]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s10
+; CHECK-NEXT:    vmov.u8 r1, q0[6]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s11
+; CHECK-NEXT:    vmov.u8 r1, q0[7]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s12
+; CHECK-NEXT:    vmov.u8 r1, q0[8]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s13
+; CHECK-NEXT:    vmov.u8 r1, q0[9]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s14
+; CHECK-NEXT:    vmov.u8 r1, q0[10]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s15
+; CHECK-NEXT:    vmov.u8 r1, q0[11]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov.u8 r1, q0[12]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s5
+; CHECK-NEXT:    vmov.u8 r1, q0[13]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s6
+; CHECK-NEXT:    vmov.u8 r1, q0[14]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vmov r0, s7
+; CHECK-NEXT:    vmov.u8 r1, q0[15]
+; CHECK-NEXT:    strb r1, [r0]
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+  %1 = add <16 x i32> %offs, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+  %2 = getelementptr inbounds i8, i8* %dst, <16 x i32> %1
+  call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %data, <16 x i8*> %2, i32 2, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @scatter_inc_v4i32_complex(<4 x i32> %data1, <4 x i32> %data2, <4 x i32> %data3, i32* %dst, i32 %n) {
+; CHECK-LABEL: scatter_inc_v4i32_complex:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
+; CHECK-NEXT:    cmp r1, #1
+; CHECK-NEXT:    blt .LBB3_5
+; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
+; CHECK-NEXT:    adr r4, .LCPI3_2
+; CHECK-NEXT:    bic r2, r1, #3
+; CHECK-NEXT:    vldrw.u32 q3, [r4]
+; CHECK-NEXT:    sub.w r12, r2, #4
+; CHECK-NEXT:    adr.w lr, .LCPI3_1
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    vadd.i32 q3, q3, r0
+; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [lr]
+; CHECK-NEXT:    adr.w r12, .LCPI3_0
+; CHECK-NEXT:    vadd.i32 q4, q3, r0
+; CHECK-NEXT:    vldrw.u32 q3, [r12]
+; CHECK-NEXT:    vadd.i32 q3, q3, r0
+; CHECK-NEXT:  .LBB3_2: @ %vector.ph
+; CHECK-NEXT:    @ =>This Loop Header: Depth=1
+; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
+; CHECK-NEXT:    vmov q6, q4
+; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    dls lr, r3
+; CHECK-NEXT:  .LBB3_3: @ %vector.body
+; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
+; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
+; CHECK-NEXT:    vstrw.32 q0, [q5, #48]!
+; CHECK-NEXT:    vstrw.32 q1, [q6, #48]!
+; CHECK-NEXT:    vstrw.32 q2, [q7, #48]!
+; CHECK-NEXT:    le lr, .LBB3_3
+; CHECK-NEXT:  @ %bb.4: @ %middle.block
+; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
+; CHECK-NEXT:    cmp r2, r1
+; CHECK-NEXT:    bne .LBB3_2
+; CHECK-NEXT:  .LBB3_5: @ %for.cond.cleanup
+; CHECK-NEXT:    add sp, #24
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    pop {r4, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.6:
+; CHECK-NEXT:  .LCPI3_0:
+; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
+; CHECK-NEXT:    .long 4294967260 @ 0xffffffdc
+; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
+; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
+; CHECK-NEXT:  .LCPI3_1:
+; CHECK-NEXT:    .long 4294967252 @ 0xffffffd4
+; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
+; CHECK-NEXT:    .long 4294967276 @ 0xffffffec
+; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
+; CHECK-NEXT:  .LCPI3_2:
+; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
+; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
+; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
+; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
+entry:
+  %cmp22 = icmp sgt i32 %n, 0
+  br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.vec = and i32 %n, -4
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %0 = mul nuw nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %1 = getelementptr inbounds i32, i32* %dst, <4 x i32> %0
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data1, <4 x i32*> %1, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %2 = add nuw nsw <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  %3 = getelementptr inbounds i32, i32* %dst, <4 x i32> %2
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data2, <4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %4 = add nuw nsw <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
+  %5 = getelementptr inbounds i32, i32* %dst, <4 x i32> %4
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data3, <4 x i32*> %5, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+  %6 = icmp eq i32 %index.next, %n.vec
+  br i1 %6, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %cmp.n = icmp eq i32 %n.vec, %n
+  br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph
+
+for.cond.cleanup:                                 ; preds = %for.body, %middle.block, %entry
+  ret void
+}
+
+
+declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
+declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
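A note on the addressing-mode win this patch targets, tied to the push_out_mul_scatter checks above: once a constant-stride store loop has been vectorised into a masked scatter whose offsets grow by a loop-invariant constant, the new tryCreateIncrementingGatScat/tryCreateMaskedScatterBaseWB path folds that increment into a single pre-indexed writeback store (vstrw.32 q0, [q1, #96]!) instead of keeping an explicit offset vector plus a separate vadd inside the loop. The C loop below is an illustrative sketch only; the function name and signature are invented for this note and are not taken from the patch or its tests.

    /* Illustrative source shape (not from the patch): a stride-6 store.      */
    /* After vectorisation with 4 lanes, each pointer lane advances by        */
    /* 4 * 6 * sizeof(int) = 96 bytes per vector iteration, which is the      */
    /* constant the writeback scatter absorbs as its #96 immediate.           */
    void stride6_store(int *restrict dst, int v, int n) {
      for (int i = 0; i < n; i++)
        dst[6 * i] = v;
    }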