diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -158,7 +158,9 @@
   bool isLegalMaskedGather(Type *Ty, MaybeAlign Alignment);
 
-  bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) { return false; }
+  bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) {
+    return isLegalMaskedGather(Ty, Alignment);
+  }
 
   int getMemcpyCost(const Instruction *I);
 
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -80,6 +80,8 @@
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
   Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> Builder);
+  // Compute the scale of this gather/scatter instruction
+  int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
 
   bool lowerGather(IntrinsicInst *I);
   // Create a gather from a base + vector of offsets
@@ -88,6 +90,14 @@
   // Create a gather from a vector of pointers
   Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
                                    IRBuilder<> Builder);
+
+  bool lowerScatter(IntrinsicInst *I);
+  // Create a scatter to a base + vector of offsets
+  Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Ptr,
+                                      IRBuilder<> Builder);
+  // Create a scatter to a vector of pointers
+  Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
+                                    IRBuilder<> Builder);
 };
 
 } // end anonymous namespace
@@ -110,8 +120,8 @@
        (NumElements == 16 && ElemSize == 8)) &&
       ElemSize / 8 <= Alignment)
     return true;
-  LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid "
-                    << "alignment or vector type \n");
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: instruction does not have "
+                    << "valid alignment or vector type \n");
   return false;
 }
 
@@ -119,17 +129,18 @@
                                           IRBuilder<> Builder) {
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP) {
-    LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found\n");
+    LLVM_DEBUG(
+        dbgs() << "masked gathers/scatters: no getelementpointer found\n");
     return nullptr;
   }
-  LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading"
-                    << " from base + vector of offsets\n");
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found."
+                    << " Looking at intrinsic for base + vector of offsets\n");
   Value *GEPPtr = GEP->getPointerOperand();
   if (GEPPtr->getType()->isVectorTy()) {
     return nullptr;
   }
   if (GEP->getNumOperands() != 2) {
-    LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with too many"
+    LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many"
                       << " operands. Expanding.\n");
     return nullptr;
   }
@@ -140,16 +151,16 @@
   Offsets = ZextOffs->getOperand(0);
   Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
   // If the offset we found does not have the type the intrinsic expects,
-  // i.e., the same type as the gather itself, we need to convert it (only i
-  // types) or fall back to expanding the gather
+  // i.e., the same type as the gather (or scatter input) itself, we need to
+  // convert it (only i types) or fall back to expanding the gather
   if (OffsType != Offsets->getType()) {
     if (OffsType->getScalarSizeInBits() >
         Offsets->getType()->getScalarSizeInBits()) {
-      LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n");
+      LLVM_DEBUG(dbgs() << "masked gathers/scatters: extending offsets\n");
       Offsets = Builder.CreateZExt(Offsets, OffsType, "");
     } else {
-      LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. Can't"
-                        << " create masked gather\n");
+      LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
+                        << " Can't create intrinsic.\n");
       return nullptr;
     }
   }
@@ -163,12 +174,28 @@
     Type *BCTy = BitCast->getType();
     Type *BCSrcTy = BitCast->getOperand(0)->getType();
     if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) {
-      LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n");
+      LLVM_DEBUG(
+          dbgs() << "masked gathers/scatters: looking through bitcast\n");
       Ptr = BitCast->getOperand(0);
     }
   }
 }
 
+int MVEGatherScatterLowering::computeScale(unsigned GEPElemSize,
+                                           unsigned MemoryElemSize) {
+  // This can be a 32bit load/store scaled by 4, a 16bit load/store scaled by 2,
+  // or a 8bit, 16bit or 32bit load/store scaled by 1
+  if (GEPElemSize == 32 && MemoryElemSize == 32)
+    return 2;
+  else if (GEPElemSize == 16 && MemoryElemSize == 16)
+    return 1;
+  else if (GEPElemSize == 8)
+    return 0;
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: incorrect scale. Can't "
+                    << "create intrinsic\n");
+  return -1;
+}
+
 bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
   using namespace PatternMatch;
   LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
@@ -193,7 +220,6 @@
   Builder.SetCurrentDebugLocation(I->getDebugLoc());
 
   Instruction *Root = I;
-
   Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
   if (!Load)
     Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
@@ -219,9 +245,7 @@
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
     IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
   using namespace PatternMatch;
-
   Type *Ty = I->getType();
-
   LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
   if (Ty->getVectorNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
     // Can't build an intrinsic for this
@@ -279,23 +303,11 @@
   if (!BasePtr)
     return nullptr;
 
-  unsigned Scale;
-  int GEPElemSize =
-      BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits();
-  int MemoryElemSize = OriginalTy->getScalarSizeInBits();
-  // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a
-  // 8bit, 16bit or 32bit load scaled by 1
-  if (GEPElemSize == 32 && MemoryElemSize == 32) {
-    Scale = 2;
-  } else if (GEPElemSize == 16 && MemoryElemSize == 16) {
-    Scale = 1;
-  } else if (GEPElemSize == 8) {
-    Scale = 0;
-  } else {
-    LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. Can't"
-                      << " create masked gather\n");
+  int Scale = computeScale(
+      BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+      OriginalTy->getScalarSizeInBits());
+  if (Scale == -1)
     return nullptr;
-  }
 
   Root = Extend;
   Value *Mask = I->getArgOperand(2);
@@ -313,6 +325,117 @@
        Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
 }
 
+bool MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
+  using namespace PatternMatch;
+  LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
+
+  // @llvm.masked.scatter.*(data, ptrs, alignment, mask)
+  // Attempt to turn the masked scatter in I into a MVE intrinsic
+  // Potentially optimising the addressing modes as we do so.
+  Value *Input = I->getArgOperand(0);
+  Value *Ptr = I->getArgOperand(1);
+  unsigned Alignment = cast<ConstantInt>(I->getArgOperand(2))->getZExtValue();
+  Type *Ty = Input->getType();
+
+  if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
+                               Ty->getScalarSizeInBits(), Alignment))
+    return false;
+  lookThroughBitcast(Ptr);
+  assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
+
+  IRBuilder<> Builder(I->getContext());
+  Builder.SetInsertPoint(I);
+  Builder.SetCurrentDebugLocation(I->getDebugLoc());
+
+  Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+  if (!Store)
+    Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
+  if (!Store)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
+  I->replaceAllUsesWith(Store);
+  I->eraseFromParent();
+  return true;
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+  using namespace PatternMatch;
+  Value *Input = I->getArgOperand(0);
+  Value *Mask = I->getArgOperand(3);
+  Type *Ty = Input->getType();
+  // Only QR variants allow truncating
+  if (!(Ty->getVectorNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) {
+    // Can't build an intrinsic for this
+    return nullptr;
+  }
+  // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask)
+  LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n");
+  if (match(Mask, m_One()))
+    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base,
+                                   {Ptr->getType(), Input->getType()},
+                                   {Ptr, Builder.getInt32(0), Input});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_base_predicated,
+        {Ptr->getType(), Input->getType(), Mask->getType()},
+        {Ptr, Builder.getInt32(0), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+  using namespace PatternMatch;
+  Value *Input = I->getArgOperand(0);
+  Value *Mask = I->getArgOperand(3);
+  Type *InputTy = Input->getType();
+  Type *MemoryTy = InputTy;
+  LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
+                    << " to base + vector of offsets\n");
+  // If the input has been truncated, try to integrate that trunc into the
+  // scatter instruction (we don't care about alignment here)
+  if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) {
+    Value *PreTrunc = Trunc->getOperand(0);
+    Type *PreTruncTy = PreTrunc->getType();
+    if (PreTruncTy->getPrimitiveSizeInBits() == 128) {
+      Input = PreTrunc;
+      InputTy = PreTruncTy;
+    }
+  }
+  if (InputTy->getPrimitiveSizeInBits() != 128) {
+    LLVM_DEBUG(
+        dbgs() << "masked scatters: cannot create scatters for non-standard"
+               << " input types. Expanding.\n");
+    return nullptr;
+  }
+
+  Value *Offsets;
+  Value *BasePtr = checkGEP(Offsets, InputTy, Ptr, Builder);
+  if (!BasePtr)
+    return nullptr;
+  int Scale = computeScale(
+      BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+      MemoryTy->getScalarSizeInBits());
+  if (Scale == -1)
+    return nullptr;
+
+  if (!match(Mask, m_One()))
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_offset_predicated,
+        {BasePtr->getType(), Offsets->getType(), Input->getType(),
+         Mask->getType()},
+        {BasePtr, Offsets, Input,
+         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale), Mask});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_offset,
+        {BasePtr->getType(), Offsets->getType(), Input->getType()},
+        {BasePtr, Offsets, Input,
+         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale)});
+}
+
 bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   if (!EnableMaskedGatherScatters)
     return false;
@@ -322,19 +445,22 @@
   if (!ST->hasMVEIntegerOps())
     return false;
   SmallVector<IntrinsicInst *, 4> Gathers;
+  SmallVector<IntrinsicInst *, 4> Scatters;
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
       if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
         Gathers.push_back(II);
+      else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter)
+        Scatters.push_back(II);
     }
   }
-  if (Gathers.empty())
-    return false;
-
+  bool Changed = false;
   for (IntrinsicInst *I : Gathers)
-    lowerGather(I);
+    Changed |= lowerGather(I);
+  for (IntrinsicInst *I : Scatters)
+    Changed |= lowerScatter(I);
 
-  return true;
+  return Changed;
 }
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck --check-prefix NOGATSCAT %s
 ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-mve -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck --check-prefix NOMVE %s
 
-define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr) {
-; NOGATSCAT-LABEL: unscaled_i32_i32:
+define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32_gather(i8* %base, <4 x i32>* %offptr) {
+; NOGATSCAT-LABEL: unscaled_i32_i32_gather:
 ; NOGATSCAT: @ %bb.0: @ %entry
 ; NOGATSCAT-NEXT: vldrw.u32 q0, [r1]
 ; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0
@@ -21,7 +21,7 @@
 ; NOGATSCAT-NEXT: vmov.32 q0[3], r2
 ; NOGATSCAT-NEXT: bx lr
 ;
-; NOMVE-LABEL: unscaled_i32_i32:
+; NOMVE-LABEL: unscaled_i32_i32_gather:
 ; NOMVE: @ %bb.0: @ %entry
 ; NOMVE-NEXT: .save {r4, lr}
 ; NOMVE-NEXT: push {r4, lr}
@@ -35,6 +35,7 @@
 ; NOMVE-NEXT: pop {r4, pc}
+
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
   %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs
@@ -44,3 +45,51 @@
 }
 
 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+
+
+define arm_aapcs_vfpcc void @unscaled_i32_i8_scatter(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
+; NOGATSCAT-LABEL: unscaled_i32_i8_scatter:
+; NOGATSCAT: @ %bb.0: @ %entry
+; NOGATSCAT-NEXT: vldrb.u32 q1, [r1]
+; NOGATSCAT-NEXT: vmov r1, s0
+; NOGATSCAT-NEXT: vadd.i32 q1, q1, r0
+; NOGATSCAT-NEXT: vmov r0, s4
+; NOGATSCAT-NEXT: str r1, [r0]
+; NOGATSCAT-NEXT: vmov r0, s5
+; NOGATSCAT-NEXT: vmov r1, s1
+; NOGATSCAT-NEXT: str r1, [r0]
+; NOGATSCAT-NEXT: vmov r0, s6
+; NOGATSCAT-NEXT: vmov r1, s2
+;
NOGATSCAT-NEXT: str r1, [r0] +; NOGATSCAT-NEXT: vmov r0, s7 +; NOGATSCAT-NEXT: vmov r1, s3 +; NOGATSCAT-NEXT: str r1, [r0] +; NOGATSCAT-NEXT: bx lr +; +; NOMVE-LABEL: unscaled_i32_i8_scatter: +; NOMVE: @ %bb.0: @ %entry +; NOMVE-NEXT: .save {r4, lr} +; NOMVE-NEXT: push {r4, lr} +; NOMVE-NEXT: ldrb.w r12, [r1] +; NOMVE-NEXT: ldrb.w lr, [r1, #1] +; NOMVE-NEXT: ldrb r4, [r1, #2] +; NOMVE-NEXT: ldrb r1, [r1, #3] +; NOMVE-NEXT: str.w r2, [r0, r12] +; NOMVE-NEXT: ldr r2, [sp, #8] +; NOMVE-NEXT: str.w r3, [r0, lr] +; NOMVE-NEXT: str r2, [r0, r4] +; NOMVE-NEXT: ldr r2, [sp, #12] +; NOMVE-NEXT: str r2, [r0, r1] +; NOMVE-NEXT: pop {r4, pc} + + +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.zext = zext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll @@ -0,0 +1,181 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s + +; VLDRH.16 Qd, [base, offs, uxtw #1] +define arm_aapcs_vfpcc void @scaled_v8i16_i16(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: scaled_v8i16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; VLDRH.16 Qd, [base, offs, uxtw #1] +define arm_aapcs_vfpcc void @scaled_v8f16_i16(i16* %base, <8 x i16>* %offptr, <8 x half> %input) { +; CHECK-LABEL: scaled_v8f16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*> + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; VLDRH.16 Qd, [base, offs, uxtw #1] +define arm_aapcs_vfpcc void @scaled_v8f16_half(half* %base, <8 x i16>* %offptr, <8 x half> %input) { +; CHECK-LABEL: scaled_v8f16_half: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds half, half* %base, <8 x i32> %offs.zext + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: scaled_v8i16_sext: +; CHECK: @ %bb.0: @ %entry +; 
CHECK-NEXT: vldrh.s32 q2, [r1] +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.sext = sext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr, <8 x half> %input) { +; CHECK-LABEL: scaled_v8f16_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vshl.i32 q2, q1, #1 +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vstr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vstr.16 s12, [r1] +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vstr.16 s1, [r1] +; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s2, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vstr.16 s8, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vstr.16 s3, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.sext = sext <8 x i16> %offs to <8 x i32> + %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext + %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*> + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; VLDRH.16 Qd, [base, zext(offs), uxtw #1] +define arm_aapcs_vfpcc void @unsigned_scaled_v8i16_i8(i16* %base, <8 x i8>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: unsigned_scaled_v8i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; VLDRH.16 Qd, [base, zext(offs), uxtw #1] +define arm_aapcs_vfpcc void @unsigned_scaled_v8f16_i8(i16* %base, <8 x i8>* %offptr, <8 x half> %input) { +; CHECK-LABEL: unsigned_scaled_v8f16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: 
bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*> + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @scaled_v8i16_i16_passthru_icmp0(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vpt.s16 gt, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext + %mask = icmp sgt <8 x i16> %offs, zeroinitializer + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> %mask) + ret void +} + +declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll @@ -0,0 +1,460 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s + +; VLDRB.u16 Qd, [base, offs] +define arm_aapcs_vfpcc void @ext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: ext_unscaled_i8_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %t = trunc <8 x i16> %input to <8 x i8> + call void @llvm.masked.scatter.v8i8(<8 x i8> %t, <8 x i8*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; VLDRB.u16 Qd, [base, offs] +define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %input.trunc = trunc <8 x i16> %input to <8 x i8> + call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x i8*> %byte_ptrs, i32 2, <8 x i1> ) + ret void +} + +; VLDRH.16 Qd, [base, offs] +define arm_aapcs_vfpcc void @unscaled_i16_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: unscaled_i16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> + call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} 
+ +; VLDRH.s16 Qd, [base, offs] +define arm_aapcs_vfpcc void @unscaled_v8f16_i16(i8* %base, <8 x i16>* %offptr, <8 x half> %input) { +; CHECK-LABEL: unscaled_v8f16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*> + call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: unscaled_v8i16_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q2, [r1] +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.sext = sext <8 x i16> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> + call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr, <8 x half> %input) { +; CHECK-LABEL: unscaled_v8f16_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q2, [r1] +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vstr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vstr.16 s12, [r1] +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vstr.16 s1, [r1] +; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s2, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vstr.16 s8, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vstr.16 s3, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.sext = sext <8 x i16> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*> + call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: unscaled_v8i16_noext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q2, [r1] 
+; CHECK-NEXT: vldrw.u32 q1, [r1, #16] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i32>, <8 x i32>* %offptr, align 4 + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> + call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr, <8 x half> %input) { +; CHECK-LABEL: unscaled_v8f16_noext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r1, #16] +; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vstr.16 s0, [r1] +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vstr.16 s12, [r1] +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: vstr.16 s1, [r1] +; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vstr.16 s8, [r1] +; CHECK-NEXT: vstr.16 s2, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vstr.16 s8, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vstr.16 s3, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i32>, <8 x i32>* %offptr, align 4 + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*> + call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; VLDRH.16 Qd, [base, zext(offs)] +define arm_aapcs_vfpcc void @unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: unsigned_unscaled_i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> + call void @llvm.masked.scatter.v8i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; VLDRH.16 Qd, [base, zext(offs)] +define arm_aapcs_vfpcc void @unsigned_unscaled_f16_i8(i8* %base, <8 x i8>* %offptr, <8 x half> %input) { +; CHECK-LABEL: unsigned_unscaled_f16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i8*> 
%byte_ptrs to <8 x half*> + call void @llvm.masked.scatter.v8f16(<8 x half> %input, <8 x half*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand ? +define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) { +; CHECK-LABEL: trunc_signed_unscaled_i64_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrb.s32 q5, [r1] +; CHECK-NEXT: vldrb.s32 q4, [r1, #4] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vadd.i32 q5, q5, r0 +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.sext = sext <8 x i8> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> + %input.trunc = trunc <8 x i64> %input to <8 x i16> + call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand ? +define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) { +; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q4[0], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.16 q4[1], r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q4[2], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov.16 q4[3], r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.16 q4[4], r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov.16 q4[5], r3 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov.16 q4[6], r3 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vmov.16 q4[7], r2 +; CHECK-NEXT: vstrh.16 q4, [r0, q0] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> + %input.trunc = trunc <8 x i64> %input to <8 x i16> + call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand ? 
+define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) { +; CHECK-LABEL: trunc_signed_unscaled_i32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q3, [r1] +; CHECK-NEXT: vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.sext = sext <8 x i8> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> + %input.trunc = trunc <8 x i32> %input to <8 x i16> + call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand ? +define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) { +; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov.16 q2[1], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.16 q2[2], r3 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov.16 q2[3], r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q2[4], r3 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov.16 q2[5], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.16 q2[6], r3 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vstrh.16 q2, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> + %input.trunc = trunc <8 x i32> %input to <8 x i16> + call void @llvm.masked.scatter.v8i16(<8 x i16> %input.trunc, <8 x i16*> %ptrs, i32 2, <8 x i1> ) + ret void +} + +; Expand ? 
+define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: trunc_signed_unscaled_i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q2, [r1] +; CHECK-NEXT: vldrb.s32 q1, [r1, #4] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.sext = sext <8 x i8> %offs to <8 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.sext + %input.trunc = trunc <8 x i16> %input to <8 x i8> + call void @llvm.masked.scatter.v8i8(<8 x i8> %input.trunc, <8 x i8*> %byte_ptrs, i32 2, <8 x i1> ) + ret void +} + +declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8f16(<8 x half>, <8 x half*>, i32, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll @@ -0,0 +1,240 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck %s + +; VLDRH.u32 Qd, [base, offs, #uxtw #1] +define arm_aapcs_vfpcc void @ext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_scaled_i16_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 + %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs, uxtw #2] +define arm_aapcs_vfpcc void @scaled_i32_i32(i32* %base, <4 x i32>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: scaled_i32_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 + %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs, uxtw #2] +define arm_aapcs_vfpcc void @scaled_f32_i32(i32* %base, <4 x i32>* %offptr, <4 x float> %input) { +; CHECK-LABEL: scaled_f32_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x 
i32>, <4 x i32>* %offptr, align 4 + %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs + %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*> + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.zext, uxtw #2] +define arm_aapcs_vfpcc void @unsigned_scaled_b_i32_i16(i32* %base, <4 x i16>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: unsigned_scaled_b_i32_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.zext = zext <4 x i16> %offs to <4 x i32> + %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.sext, uxtw #2] +define arm_aapcs_vfpcc void @signed_scaled_i32_i16(i32* %base, <4 x i16>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: signed_scaled_i32_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.sext = sext <4 x i16> %offs to <4 x i32> + %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.zext, uxtw #2] +define arm_aapcs_vfpcc void @a_unsigned_scaled_f32_i16(i32* %base, <4 x i16>* %offptr, <4 x float> %input) { +; CHECK-LABEL: a_unsigned_scaled_f32_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.zext = zext <4 x i16> %offs to <4 x i32> + %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext + %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*> + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.sext, uxtw #2] +define arm_aapcs_vfpcc void @b_signed_scaled_f32_i16(i32* %base, <4 x i16>* %offptr, <4 x float> %input) { +; CHECK-LABEL: b_signed_scaled_f32_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.sext = sext <4 x i16> %offs to <4 x i32> + %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext + %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*> + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VLDRH.u32 Qd, [base, offs.sext, uxtw #1] +define arm_aapcs_vfpcc void @ext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_signed_scaled_i16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.sext = sext <4 x i16> %offs to <4 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; 
VSTRH.32 Qd, [base, offs.sext, uxtw #1] +define arm_aapcs_vfpcc void @ext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_unsigned_scaled_i16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.zext = zext <4 x i16> %offs to <4 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.zext, uxtw #2] +define arm_aapcs_vfpcc void @unsigned_scaled_b_i32_i8(i32* %base, <4 x i8>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: unsigned_scaled_b_i32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.zext = zext <4 x i8> %offs to <4 x i32> + %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.sext, uxtw #2] +define arm_aapcs_vfpcc void @signed_scaled_i32_i8(i32* %base, <4 x i8>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: signed_scaled_i32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.sext = sext <4 x i8> %offs to <4 x i32> + %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.zext, uxtw #2] +define arm_aapcs_vfpcc void @a_unsigned_scaled_f32_i8(i32* %base, <4 x i8>* %offptr, <4 x float> %input) { +; CHECK-LABEL: a_unsigned_scaled_f32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.zext = zext <4 x i8> %offs to <4 x i32> + %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext + %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*> + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.sext, uxtw #2] +define arm_aapcs_vfpcc void @b_signed_scaled_f32_i8(i32* %base, <4 x i8>* %offptr, <4 x float> %input) { +; CHECK-LABEL: b_signed_scaled_f32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.sext = sext <4 x i8> %offs to <4 x i32> + %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext + %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*> + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VLDRH.z32 Qd, [base, offs.sext, uxtw #1] +define arm_aapcs_vfpcc void @ext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_signed_scaled_i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] +; 
CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.sext = sext <4 x i8> %offs to <4 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; VLDRH.z32 Qd, [base, offs.zext, uxtw #1] +define arm_aapcs_vfpcc void @ext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_unsigned_scaled_i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.zext = zext <4 x i8> %offs to <4 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll @@ -0,0 +1,419 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck %s + +; VLDRB.u32 Qd, [base, offs] +define arm_aapcs_vfpcc void @ext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_unscaled_i8_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 + %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs + %t = trunc <4 x i32> %input to <4 x i8> + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; VLDRH.u32 Qd, [base, offs] +define arm_aapcs_vfpcc void @ext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_unscaled_i16_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs] +define arm_aapcs_vfpcc void @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: unscaled_i32_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x 
i32*> + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs] +define arm_aapcs_vfpcc void @unscaled_f32_i32(i8* %base, <4 x i32>* %offptr, <4 x float> %input) { +; CHECK-LABEL: unscaled_f32_i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*> + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.zext] +define arm_aapcs_vfpcc void @unsigned_unscaled_b_i32_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: unsigned_unscaled_b_i32_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.zext = zext <4 x i16> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.sext] +define arm_aapcs_vfpcc void @signed_unscaled_i32_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: signed_unscaled_i32_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.sext = sext <4 x i16> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.zext] +define arm_aapcs_vfpcc void @a_unsigned_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr, <4 x float> %input) { +; CHECK-LABEL: a_unsigned_unscaled_f32_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.zext = zext <4 x i16> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*> + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.sext] +define arm_aapcs_vfpcc void @b_signed_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr, <4 x float> %input) { +; CHECK-LABEL: b_signed_unscaled_f32_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.sext = sext <4 x i16> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*> + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VLDRH.u32 Qd, [base, offs.sext] +define arm_aapcs_vfpcc void @ext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: 
ext_signed_unscaled_i16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.sext = sext <4 x i16> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; VLDRH.u32 Qd, [base, offs.zext] +define arm_aapcs_vfpcc void @ext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_unsigned_unscaled_i16_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.zext = zext <4 x i16> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; VLDRB.u32 Qd, [base, offs.sext] +define arm_aapcs_vfpcc void @ext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_signed_unscaled_i8_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.sext = sext <4 x i16> %offs to <4 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext + %t = trunc <4 x i32> %input to <4 x i8> + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; VLDRB.s32 Qd, [base, offs.zext] +define arm_aapcs_vfpcc void @ext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_unsigned_unscaled_i8_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 + %offs.zext = zext <4 x i16> %offs to <4 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %t = trunc <4 x i32> %input to <4 x i8> + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.zext] +define arm_aapcs_vfpcc void @unsigned_unscaled_b_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: unsigned_unscaled_b_i32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.zext = zext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.sext] +define arm_aapcs_vfpcc void @signed_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: signed_unscaled_i32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + 
%offs.sext = sext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.zext] +define arm_aapcs_vfpcc void @a_unsigned_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr, <4 x float> %input) { +; CHECK-LABEL: a_unsigned_unscaled_f32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.zext = zext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*> + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VSTRW.32 Qd, [base, offs.sext] +define arm_aapcs_vfpcc void @b_signed_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr, <4 x float> %input) { +; CHECK-LABEL: b_signed_unscaled_f32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.sext = sext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*> + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %input, <4 x float*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +; VLDRH.u32 Qd, [base, offs.sext] +define arm_aapcs_vfpcc void @ext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_signed_unscaled_i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.sext = sext <4 x i8> %offs to <4 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext + %t = trunc <4 x i32> %input to <4 x i8> + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +; VLDRH.u32 Qd, [base, offs.zext] +define arm_aapcs_vfpcc void @ext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_unsigned_unscaled_i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vstrb.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.zext = zext <4 x i8> %offs to <4 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %t = trunc <4 x i32> %input to <4 x i8> + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %t, <4 x i8*> %ptrs, i32 2, <4 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) { +; CHECK-LABEL: trunc_signed_unscaled_i64_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vldrb.s32 q2, [r1] +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vstrw.32 q0, [r0, q2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.sext = sext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> + %input.trunc = trunc <4 x i64> %input to <4 x i32> + call void 
@llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input.trunc, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) { +; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s1, s2 +; CHECK-NEXT: vldrb.u32 q2, [r1] +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vstrw.32 q0, [r0, q2] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.zext = zext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> + %input.trunc = trunc <4 x i64> %input to <4 x i32> + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input.trunc, <4 x i32*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: trunc_signed_unscaled_i32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.sext = sext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> + %input.trunc = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %input.trunc, <4 x i16*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.zext = zext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> + %input.trunc = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %input.trunc, <4 x i16*> %ptrs, i32 4, <4 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) { +; CHECK-LABEL: trunc_signed_unscaled_i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.sext = sext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext + %input.trunc = trunc <4 x i16> %input to <4 x i8> + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %input.trunc, <4 x i8*> %byte_ptrs, i32 4, <4 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr, <4 x i16> %input) { +; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, s4 +; 
CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 + %offs.zext = zext <4 x i8> %offs to <4 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext + %input.trunc = trunc <4 x i16> %input to <4 x i8> + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %input.trunc, <4 x i8*> %byte_ptrs, i32 4, <4 x i1> ) + ret void +} + +declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -0,0 +1,540 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s + +; VLDRB.8 +define arm_aapcs_vfpcc void @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { +; CHECK-LABEL: unscaled_v16i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q1, [r1] +; CHECK-NEXT: vstrb.8 q0, [r0, q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %offs.zext = zext <16 x i8> %offs to <16 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) { +; CHECK-LABEL: unscaled_v8i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q2, [r1] +; CHECK-NEXT: vldrb.u32 q1, [r1, #4] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 + %offs.zext = zext <8 x i8> %offs to <8 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext + call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %input, <8 x i8*> %ptrs, i32 1, <8 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @unscaled_v2i8_i8(i8* %base, <2 x i8>* %offptr, <2 x i8> %input) { +; CHECK-LABEL: 
unscaled_v2i8_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrb r2, [r1] +; CHECK-NEXT: vmov.i32 q1, #0xff +; CHECK-NEXT: ldrb r1, [r1, #1] +; CHECK-NEXT: vmov.32 q2[0], r2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov.32 q2[2], r1 +; CHECK-NEXT: vand q1, q2, q1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: strb r2, [r0, r1] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: strb r2, [r0, r1] +; CHECK-NEXT: bx lr +entry: + %offs = load <2 x i8>, <2 x i8>* %offptr, align 1 + %offs.zext = zext <2 x i8> %offs to <2 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext + call void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8> %input, <2 x i8*> %ptrs, i32 1, <2 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { +; CHECK-LABEL: unscaled_v16i8_sext: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrb.s32 q4, [r1] +; CHECK-NEXT: vldrb.s32 q1, [r1, #12] +; CHECK-NEXT: vldrb.s32 q2, [r1, #8] +; CHECK-NEXT: vldrb.s32 q3, [r1, #4] +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %offs.sext = sext <16 x i8> %offs to <16 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) { +; CHECK-LABEL: unscaled_v16i8_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrh.s32 q4, [r1] +; CHECK-NEXT: vldrh.s32 q1, [r1, #24] +; CHECK-NEXT: vldrh.s32 q2, [r1, #16] +; CHECK-NEXT: vldrh.s32 q3, [r1, #8] +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 
q3, q3, r0 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i16>, <16 x i16>* %offptr, align 2 + %offs.sext = sext <16 x i16> %offs to <16 x i32> + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) { +; CHECK-LABEL: unscaled_v16i8_scaled: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrb.u32 q4, [r1] +; CHECK-NEXT: vldrb.u32 q1, [r1, #12] +; CHECK-NEXT: vldrb.u32 q2, [r1, #8] +; CHECK-NEXT: vldrb.u32 q3, [r1, #4] +; CHECK-NEXT: vshl.i32 q4, q4, #2 +; CHECK-NEXT: vshl.i32 q1, q1, #2 +; CHECK-NEXT: vshl.i32 q2, q2, #2 +; CHECK-NEXT: vshl.i32 q3, q3, #2 +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: strb r1, [r0] 
+; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 4 + %offs.zext = zext <16 x i8> %offs to <16 x i32> + %ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext + %ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*> + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) { +; CHECK-LABEL: unscaled_v16i8_i8_next: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r1, #48] +; CHECK-NEXT: vldrw.u32 q2, [r1, #32] +; CHECK-NEXT: vldrw.u32 q3, [r1, #16] +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i32>, <16 x i32>* %offptr, align 4 + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs, i32 1, <16 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) { +; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: add r3, sp, #40 +; CHECK-NEXT: vmov.8 q5[0], r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.8 q5[1], r4 +; CHECK-NEXT: vmov r4, s4 
+; CHECK-NEXT: vmov.8 q5[2], r4 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov.8 q5[3], r4 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov.8 q5[4], r4 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vmov.8 q5[5], r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: add.w lr, sp, #56 +; CHECK-NEXT: vmov.8 q5[6], r4 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov.8 q5[7], r4 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.8 q5[8], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vldrw.u32 q0, [lr] +; CHECK-NEXT: vmov.8 q5[9], r3 +; CHECK-NEXT: add.w r12, sp, #72 +; CHECK-NEXT: add r2, sp, #88 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q4, [r2] +; CHECK-NEXT: vmov.8 q5[10], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vmov.8 q5[11], r3 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.8 q5[12], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.8 q5[13], r3 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov.8 q5[14], r3 +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vmov.8 q5[15], r2 +; CHECK-NEXT: vstrb.8 q5, [r0, q0] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %offs.zext = zext <16 x i8> %offs to <16 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext + %input.trunc = trunc <16 x i64> %input to <16 x i8> + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) { +; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.8 q4[0], r3 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov.8 q4[1], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.8 q4[2], r3 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov.8 q4[3], r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.8 q4[4], r3 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov.8 q4[5], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov.8 q4[6], r3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vmov.8 q4[7], r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.8 q4[8], r3 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov.8 q4[9], r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov.8 q4[10], r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vmov.8 q4[11], r3 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov.8 q4[12], r3 +; CHECK-NEXT: vmov r3, s13 +; CHECK-NEXT: vmov.8 q4[13], r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmov.8 q4[14], r3 +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vmov.8 q4[15], r2 +; CHECK-NEXT: vstrb.8 q4, [r0, q0] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %offs.zext = zext <16 x i8> %offs to <16 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext + %input.trunc = trunc <16 x i32> %input to <16 x i8> + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 1, <16 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) { +; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.u16 r3, q0[0] +; CHECK-NEXT: vmov.u16 r2, q1[7] +; CHECK-NEXT: vmov.8 q2[0], r3 
+; CHECK-NEXT: vmov.u16 r3, q0[1] +; CHECK-NEXT: vmov.8 q2[1], r3 +; CHECK-NEXT: vmov.u16 r3, q0[2] +; CHECK-NEXT: vmov.8 q2[2], r3 +; CHECK-NEXT: vmov.u16 r3, q0[3] +; CHECK-NEXT: vmov.8 q2[3], r3 +; CHECK-NEXT: vmov.u16 r3, q0[4] +; CHECK-NEXT: vmov.8 q2[4], r3 +; CHECK-NEXT: vmov.u16 r3, q0[5] +; CHECK-NEXT: vmov.8 q2[5], r3 +; CHECK-NEXT: vmov.u16 r3, q0[6] +; CHECK-NEXT: vmov.8 q2[6], r3 +; CHECK-NEXT: vmov.u16 r3, q0[7] +; CHECK-NEXT: vmov.8 q2[7], r3 +; CHECK-NEXT: vmov.u16 r3, q1[0] +; CHECK-NEXT: vmov.8 q2[8], r3 +; CHECK-NEXT: vmov.u16 r3, q1[1] +; CHECK-NEXT: vmov.8 q2[9], r3 +; CHECK-NEXT: vmov.u16 r3, q1[2] +; CHECK-NEXT: vmov.8 q2[10], r3 +; CHECK-NEXT: vmov.u16 r3, q1[3] +; CHECK-NEXT: vmov.8 q2[11], r3 +; CHECK-NEXT: vmov.u16 r3, q1[4] +; CHECK-NEXT: vmov.8 q2[12], r3 +; CHECK-NEXT: vmov.u16 r3, q1[5] +; CHECK-NEXT: vmov.8 q2[13], r3 +; CHECK-NEXT: vmov.u16 r3, q1[6] +; CHECK-NEXT: vmov.8 q2[14], r3 +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vmov.8 q2[15], r2 +; CHECK-NEXT: vstrb.8 q2, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %offs.zext = zext <16 x i8> %offs to <16 x i32> + %byte_ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext + %input.trunc = trunc <16 x i16> %input to <16 x i8> + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input.trunc, <16 x i8*> %byte_ptrs, i32 4, <16 x i1> ) + ret void +} + + +declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll @@ -0,0 +1,633 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -enable-arm-maskedgatscat %s -o - | FileCheck %s + +; i32 + +; Expand +define arm_aapcs_vfpcc void @ptr_v2i32(<2 x i32> %v, <2 x i32*>* %offptr) { +; CHECK-LABEL: ptr_v2i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrd r1, r0, [r0] +; CHECK-NEXT: str r2, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <2 x i32*>, <2 x i32*>* %offptr, align 4 + call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %v, <2 x i32*> %offs, i32 4, <2 x i1> ) + ret void +} + +; VSTRW.32 Qd, [offs, 0] +define arm_aapcs_vfpcc void @ptr_v4i32(<4 x i32> %v, <4 x i32*>* %offptr) { +; CHECK-LABEL: ptr_v4i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vstrw.32 q0, [q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32*>, <4 x i32*>* %offptr, align 4 + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v, <4 x i32*> %offs, i32 4, <4 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @ptr_v8i32(<8 x i32> %v, <8 x i32*>* %offptr) { +; CHECK-LABEL: ptr_v8i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: 
str r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i32*>, <8 x i32*>* %offptr, align 4 + call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %v, <8 x i32*> %offs, i32 4, <8 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @ptr_v16i32(<16 x i32> %v, <16 x i32*>* %offptr) { +; CHECK-LABEL: ptr_v16i32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vldrw.u32 q7, [r0] +; CHECK-NEXT: vldrw.u32 q4, [r0, #48] +; CHECK-NEXT: vldrw.u32 q5, [r0, #32] +; CHECK-NEXT: vldrw.u32 q6, [r0, #16] +; CHECK-NEXT: vmov r0, s28 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s29 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s30 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s31 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s20 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s21 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s22 +; CHECK-NEXT: vmov r1, s10 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s23 +; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov r1, s13 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov r1, s15 +; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i32*>, <16 x i32*>* %offptr, align 4 + call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %v, <16 x i32*> %offs, i32 4, <16 x i1> ) + ret void +} + +; f32 + +; Expand +define arm_aapcs_vfpcc void @ptr_v2f32(<2 x float> %v, <2 x float*>* %offptr) { +; CHECK-LABEL: ptr_v2f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: ldrd r1, r0, [r0] +; CHECK-NEXT: vstr s0, [r1] +; CHECK-NEXT: vstr s1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <2 x float*>, <2 x float*>* %offptr, align 4 + call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %v, <2 x float*> %offs, i32 4, <2 x i1> ) + ret void +} + +; VSTRW.32 Qd, [offs, 0] +define arm_aapcs_vfpcc void @ptr_v4f32(<4 x float> %v, <4 x float*>* %offptr) { +; CHECK-LABEL: ptr_v4f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vstrw.32 q0, [q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x float*>, <4 x float*>* %offptr, align 4 + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %v, <4 x float*> %offs, i32 4, <4 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @ptr_v8f32(<8 x float> %v, <8 x float*>* %offptr) { +; CHECK-LABEL: 
ptr_v8f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov r12, s11 +; CHECK-NEXT: vmov lr, s10 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: vstr s0, [r5] +; CHECK-NEXT: vstr s1, [r4] +; CHECK-NEXT: vstr s2, [r2] +; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstr s4, [r1] +; CHECK-NEXT: vstr s5, [r3] +; CHECK-NEXT: vstr s6, [lr] +; CHECK-NEXT: vstr s7, [r12] +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x float*>, <8 x float*>* %offptr, align 4 + call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %v, <8 x float*> %offs, i32 4, <8 x i1> ) + ret void +} + +; i16 + +; Expand. +define arm_aapcs_vfpcc void @ptr_i16(<8 x i16> %v, <8 x i16*>* %offptr) { +; CHECK-LABEL: ptr_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4 + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v, <8 x i16*> %offs, i32 2, <8 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @ptr_v2i16_trunc(<2 x i32> %v, <2 x i16*>* %offptr) { +; CHECK-LABEL: ptr_v2i16_trunc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrd r1, r0, [r0] +; CHECK-NEXT: strh r2, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <2 x i16*>, <2 x i16*>* %offptr, align 4 + %ext = trunc <2 x i32> %v to <2 x i16> + call void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16> %ext, <2 x i16*> %offs, i32 2, <2 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @ptr_v4i16_trunc(<4 x i32> %v, <4 x i16*>* %offptr) { +; CHECK-LABEL: ptr_v4i16_trunc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4 + %ext = trunc <4 x i32> %v to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %ext, <4 x i16*> %offs, i32 2, <4 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, <8 x i16*>* %offptr) { +; CHECK-LABEL: ptr_v8i16_trunc: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov r1, 
s0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16*>, <8 x i16*>* %offptr, align 4 + %ext = trunc <8 x i32> %v to <8 x i16> + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %ext, <8 x i16*> %offs, i32 2, <8 x i1> ) + ret void +} + +; f16 + +; Expand. +define arm_aapcs_vfpcc void @ptr_f16(<8 x half> %v, <8 x half*>* %offptr) { +; CHECK-LABEL: ptr_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmovx.f16 s12, s0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vstr.16 s12, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vstr.16 s1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmovx.f16 s8, s1 +; CHECK-NEXT: vmovx.f16 s0, s3 +; CHECK-NEXT: vstr.16 s8, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vstr.16 s2, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmovx.f16 s8, s2 +; CHECK-NEXT: vstr.16 s8, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vstr.16 s3, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x half*>, <8 x half*>* %offptr, align 4 + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %v, <8 x half*> %offs, i32 2, <8 x i1> ) + ret void +} + +; i8 + +; Expand. 
+define arm_aapcs_vfpcc void @ptr_i8(<16 x i8> %v, <16 x i8*>* %offptr) { +; CHECK-LABEL: ptr_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vldrw.u32 q4, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #48] +; CHECK-NEXT: vldrw.u32 q2, [r0, #32] +; CHECK-NEXT: vldrw.u32 q3, [r0, #16] +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i8*>, <16 x i8*>* %offptr, align 4 + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v, <16 x i8*> %offs, i32 2, <16 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @ptr_v8i8_trunc16(<8 x i16> %v, <8 x i8*>* %offptr) { +; CHECK-LABEL: ptr_v8i8_trunc16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q2, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r0, #16] +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4 + %ext = trunc <8 x i16> %v to <8 x i8> + call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %ext, <8 x i8*> %offs, i32 1, <8 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @ptr_v4i8_trunc32(<4 x i32> %v, <4 x i8*>* %offptr) { +; CHECK-LABEL: ptr_v4i8_trunc32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: strb r1, [r0] +; 
CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4 + %ext = trunc <4 x i32> %v to <4 x i8> + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %ext, <4 x i8*> %offs, i32 1, <4 x i1> ) + ret void +} + +; Expand +define arm_aapcs_vfpcc void @ptr_v8i8_trunc32(<8 x i32> %v, <8 x i8*>* %offptr) { +; CHECK-LABEL: ptr_v8i8_trunc32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q3, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r0, #16] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4 + %ext = trunc <8 x i32> %v to <8 x i8> + call void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8> %ext, <8 x i8*> %offs, i32 1, <8 x i1> ) + ret void +} + +; loops + +define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) { +; CHECK-LABEL: foo_ptr_p_int32_t: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bic r3, r2, #15 +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB16_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vptt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 +; CHECK-NEXT: vstrwt.32 q1, [q0] +; CHECK-NEXT: bne .LBB16_1 +; CHECK-NEXT: @ %bb.2: @ %for.end +; CHECK-NEXT: bx lr +entry: + %and = and i32 %n, -16 + %cmp11 = icmp sgt i32 %and, 0 + br i1 %cmp11, label %vector.body, label %for.end + +vector.body: ; preds = %entry, %vector.body + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i32*, i32** %src, i32 %index + %1 = bitcast i32** %0 to <4 x i32*>* + %wide.load = load <4 x i32*>, <4 x i32*>* %1, align 4 + %2 = icmp ne <4 x i32*> %wide.load, zeroinitializer + %3 = getelementptr inbounds i32, i32* %dest, i32 %index + %4 = bitcast i32* %3 to <4 x i32>* + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.v4p0i32(<4 x i32>* %4, i32 4, <4 x i1> %2, <4 x i32> undef) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %wide.masked.load, <4 x i32*> %wide.load, i32 4, <4 x i1> %2) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, %n + br i1 %5, label %for.end, label %vector.body + +for.end: ; preds = %vector.body, %entry + ret void +} + +define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) { +; CHECK-LABEL: foo_ptr_p_float: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: bic r3, r2, #15 +; CHECK-NEXT: cmp r3, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: bxlt lr +; CHECK-NEXT: .LBB17_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q0, [r1], #16 +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vptt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q1, [r0], #16 +; CHECK-NEXT: vstrwt.32 
q1, [q0] +; CHECK-NEXT: bne .LBB17_1 +; CHECK-NEXT: @ %bb.2: @ %for.end +; CHECK-NEXT: bx lr +entry: + %and = and i32 %n, -16 + %cmp11 = icmp sgt i32 %and, 0 + br i1 %cmp11, label %vector.body, label %for.end + +vector.body: ; preds = %entry, %vector.body + %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds float*, float** %src, i32 %index + %1 = bitcast float** %0 to <4 x float*>* + %wide.load = load <4 x float*>, <4 x float*>* %1, align 4 + %2 = icmp ne <4 x float*> %wide.load, zeroinitializer + %3 = getelementptr inbounds float, float* %dest, i32 %index + %4 = bitcast float* %3 to <4 x i32>* + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.v4p0i32(<4 x i32>* %4, i32 4, <4 x i1> %2, <4 x i32> undef) + %5 = bitcast <4 x float*> %wide.load to <4 x i32*> + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %wide.masked.load, <4 x i32*> %5, i32 4, <4 x i1> %2) + %index.next = add i32 %index, 4 + %6 = icmp eq i32 %index.next, %n + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body, %entry + ret void +} + +; VLSTW.u32 Qd, [P, 4] +define arm_aapcs_vfpcc void @qi4(<4 x i32> %v, <4 x i32*> %p) { +; CHECK-LABEL: qi4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.i32 q2, #0x10 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vstrw.32 q0, [q1] +; CHECK-NEXT: bx lr +entry: + %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4 + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v, <4 x i32*> %g, i32 4, <4 x i1> ) + ret void +} + +declare void @llvm.masked.scatter.v2i16.v2p0i16(<2 x i16>, <2 x i16*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32>, <2 x i32*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float>, <2 x float*>, i32, <2 x i1>) +declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float>, <8 x float*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>) +declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>, <16 x i32*>, i32, <16 x i1>) +declare <4 x i32> @llvm.masked.load.v4i32.v4p0i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
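For reference, a minimal self-contained sketch of the base-plus-offsets scatter pattern these tests exercise, with the all-true mask operand written out explicitly. The function name is illustrative and this snippet is not part of the patch; it assumes the same typed-pointer IR and the same RUN line flags (-mattr=+mve.fp -enable-arm-maskedgatscat) as the test files above.

; Sketch (not part of the patch): i8 offsets zero-extended to i32, used as byte
; offsets from a common base, then scattered as 32-bit elements. With an
; all-true mask the expected lowering is an unpredicated pair such as
;   vldrb.u32 q1, [r1]
;   vstrw.32  q0, [r0, q1]
define arm_aapcs_vfpcc void @sketch_unscaled_scatter_i32_i8(i8* %base, <4 x i8>* %offptr, <4 x i32> %input) {
entry:
  %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
  %offs.zext = zext <4 x i8> %offs to <4 x i32>
  %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext
  %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*>
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %input, <4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}

declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)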