Index: llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp =================================================================== --- llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -84,8 +84,11 @@ static bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize, unsigned Alignment) { - // Do only allow non-extending v4i32 gathers for now - return NumElements == 4 && ElemSize == 32 && Alignment >= 4; + // Do only allow non-extending gathers for now + return ((NumElements == 4 && ElemSize == 32) || + (NumElements == 8 && ElemSize == 16) || + (NumElements == 16 && ElemSize == 8)) && + ElemSize / 8 <= Alignment; } static bool LowerGather(IntrinsicInst *I) { @@ -125,18 +128,87 @@ } assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); - if (Ty->getVectorNumElements() != 4) - // Can't build an intrinsic for this - return false; - if (match(Mask, m_One())) - Load = Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base, - {Ty, Ptr->getType()}, - {Ptr, Builder.getInt32(0)}); - else - Load = Builder.CreateIntrinsic( - Intrinsic::arm_mve_vldr_gather_base_predicated, - {Ty, Ptr->getType(), Mask->getType()}, - {Ptr, Builder.getInt32(0), Mask}); + Value *BasePtr = Ptr; + GetElementPtrInst *GEP = dyn_cast(Ptr); + if (!GEP) { + if (Ty->getVectorNumElements() != 4) + // Can't build an intrinsic for this + return false; + + LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found. Loading" + << " from vector of pointers\n"); + if (match(Mask, m_One())) + Load = Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base, + {Ty, Ptr->getType()}, + {Ptr, Builder.getInt32(0)}); + else + Load = Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_base_predicated, + {Ty, Ptr->getType(), Mask->getType()}, + {Ptr, Builder.getInt32(0), Mask}); + } else { + LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading" + << " from base + vector of offsets\n"); + // Load from base + vector of offsets + Value *Offsets = GEP->getOperand(1); + + if (ZExtInst *ZextOffs = dyn_cast(Offsets)) + Offsets = ZextOffs->getOperand(0); + Type *OffsType = VectorType::getInteger(cast(Ty)); + // If the offset we found does not have the type the intrinsic expects, + // i.e., the same type as the gather itself, we need to convert it (only i + // types) or fall back to expanding the gather + if (OffsType != Offsets->getType()) { + if (OffsType->getScalarSizeInBits() >= + Offsets->getType()->getScalarSizeInBits()) { + LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n"); + Offsets = Builder.CreateZExt(Offsets, OffsType, ""); + } else { + LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. can't" + << " create masked gather\n"); + return false; + } + } + + // SExt offsets inside masked gathers are not permitted by the architecture; + // we therefore can't fold them + Value *GEPPtr = GEP->getPointerOperand(); + if (!GEPPtr->getType()->isVectorTy()) + BasePtr = GEPPtr; + + if (GEP->getNumOperands() != 2) { + LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with incorrect number" + << " of operands. Expanding.\n"); + return false; + } + + unsigned Scale; + LLVMContext &C = I->getContext(); + if (BasePtr->getType() == Type::getInt32PtrTy(C) && + Ty->getScalarSizeInBits() == 32) { + Scale = 2; + } else if (BasePtr->getType() == Type::getInt16PtrTy(C) && + Ty->getScalarSizeInBits() == 16) { + Scale = 1; + } else if (BasePtr->getType() == Type::getInt8PtrTy(C)) { + Scale = 0; + } else { + return false; + } + + if (!match(Mask, m_One())) + Load = Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_offset_predicated, + {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()}, + {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()), + Builder.getInt32(Scale), Builder.getInt32(1), Mask}); + else + Load = Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_offset, + {Ty, BasePtr->getType(), Offsets->getType()}, + {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()), + Builder.getInt32(Scale), Builder.getInt32(1)}); + } if (!isa(PassThru) && !match(PassThru, m_Zero())) { LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - " Index: llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -5,44 +5,14 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16(i16* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: scaled_v8i16_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) ret <8 x i16> %gather } @@ -50,51 +20,15 @@ define arm_aapcs_vfpcc <8 x half> @scaled_v8f16_i16(i16* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: scaled_v8f16_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldr.16 s0, [r3] -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vldr.16 s8, [r2] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vldr.16 s4, [r2] -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr.16 s4, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*> - %gather = call <8 x half> @llvm.masked.gather.v8f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) ret <8 x half> %gather } @@ -145,7 +79,7 @@ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds half, half* %base, <8 x i32> %offs.zext - %gather = call <8 x half> @llvm.masked.gather.v8f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) ret <8 x half> %gather } @@ -190,7 +124,7 @@ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.sext = sext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) ret <8 x i16> %gather } @@ -242,7 +176,7 @@ %offs.sext = sext <8 x i16> %offs to <8 x i32> %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*> - %gather = call <8 x half> @llvm.masked.gather.v8f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) ret <8 x half> %gather } @@ -250,44 +184,14 @@ define arm_aapcs_vfpcc <8 x i16> @unsigned_scaled_v8i16_i8(i16* %base, <8 x i8>* %offptr) { ; CHECK-LABEL: unsigned_scaled_v8i16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.zext = zext <8 x i8> %offs to <8 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) ret <8 x i16> %gather } @@ -295,460 +199,116 @@ define arm_aapcs_vfpcc <8 x half> @unsigned_scaled_v8f16_i8(i16* %base, <8 x i8>* %offptr) { ; CHECK-LABEL: unsigned_scaled_v8f16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldr.16 s0, [r3] -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: vmov.16 q0[1], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vldr.16 s8, [r2] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vldr.16 s4, [r2] -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr.16 s4, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.zext = zext <8 x i8> %offs to <8 x i32> %i16_ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext %ptrs = bitcast <8 x i16*> %i16_ptrs to <8 x half*> - %gather = call <8 x half> @llvm.masked.gather.v8f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) ret <8 x half> %gather } define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru0t(i16* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: scaled_v8i16_i16_passthru0t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> zeroinitializer) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> zeroinitializer) ret <8 x i16> %gather } define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru1t(i16* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: scaled_v8i16_i16_passthru1t: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> ) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> ) ret <8 x i16> %gather } define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru1f(i16* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: scaled_v8i16_i16_passthru1f: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r4, s1 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh.w lr, [r0] -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[0], r0 -; CHECK-NEXT: vmov.16 q0[1], r4 -; CHECK-NEXT: vmov.16 q0[2], r5 -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: vmov.16 q0[4], lr -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: movw r2, #65487 +; CHECK-NEXT: vmov.i16 q0, #0x1 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q2, [r0, q1, uxtw #1] +; CHECK-NEXT: vpsel q0, q2, q0 +; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> ) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> ) ret <8 x i16> %gather } define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru0f(i16* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: scaled_v8i16_i16_passthru0f: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh.w lr, [r0] -; CHECK-NEXT: vmov r0, s3 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[0], r4 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[3], r0 -; CHECK-NEXT: vmov.16 q0[4], lr -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: movw r2, #65523 +; CHECK-NEXT: vmsr p0, r2 +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> ) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> ) ret <8 x i16> %gather } define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru_icmp0(i16* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vcmp.s16 gt, q0, zr -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.u16 lr, q0[7] -; CHECK-NEXT: vmov.32 q1[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r2 -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.32 q2[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.32 q2[3], r2 -; CHECK-NEXT: vmovlb.u16 q0, q2 -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q2, q0, r0 -; CHECK-NEXT: and r3, r1, #1 -; CHECK-NEXT: rsbs r2, r3, #0 -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: bfi r3, r2, #0, #1 -; CHECK-NEXT: ubfx r2, r1, #2, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #1, #1 -; CHECK-NEXT: ubfx r2, r1, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #2, #1 -; CHECK-NEXT: ubfx r2, r1, #6, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #3, #1 -; CHECK-NEXT: ubfx r2, r1, #8, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #4, #1 -; CHECK-NEXT: ubfx r2, r1, #10, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #5, #1 -; CHECK-NEXT: ubfx r2, r1, #12, #1 -; CHECK-NEXT: ubfx r1, r1, #14, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #6, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #7, #1 -; CHECK-NEXT: uxtb r1, r3 -; CHECK-NEXT: lsls r2, r3, #31 -; CHECK-NEXT: beq .LBB11_2 -; CHECK-NEXT: @ %bb.1: @ %cond.load -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vdup.16 q0, r12 -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: b .LBB11_3 -; CHECK-NEXT: .LBB11_2: -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: .LBB11_3: @ %else -; CHECK-NEXT: vmov.32 q1[3], lr -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r2, s9 -; CHECK-NEXT: ldrhmi r2, [r2] -; CHECK-NEXT: vmovmi.16 q0[1], r2 -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r2, s10 -; CHECK-NEXT: ldrhmi r2, [r2] -; CHECK-NEXT: vmovmi.16 q0[2], r2 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: lsls r2, r1, #28 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r2, s11 -; CHECK-NEXT: ldrhmi r2, [r2] -; CHECK-NEXT: vmovmi.16 q0[3], r2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: lsls r0, r1, #27 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r0, s4 -; CHECK-NEXT: ldrhmi r0, [r0] -; CHECK-NEXT: vmovmi.16 q0[4], r0 -; CHECK-NEXT: lsls r0, r1, #26 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r0, s5 -; CHECK-NEXT: ldrhmi r0, [r0] -; CHECK-NEXT: vmovmi.16 q0[5], r0 -; CHECK-NEXT: lsls r0, r1, #25 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r0, s6 -; CHECK-NEXT: ldrhmi r0, [r0] -; CHECK-NEXT: vmovmi.16 q0[6], r0 -; CHECK-NEXT: lsls r0, r1, #24 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r0, s7 -; CHECK-NEXT: ldrhmi r0, [r0] -; CHECK-NEXT: vmovmi.16 q0[7], r0 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vpt.s16 gt, q1, zr +; CHECK-NEXT: vldrht.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext %mask = icmp sgt <8 x i16> %offs, zeroinitializer - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %mask, <8 x i16> ) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %mask, <8 x i16> ) ret <8 x i16> %gather } define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_passthru_icmp1(i16* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp1: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrh.u16 q0, [r1] -; CHECK-NEXT: movs r3, #0 -; CHECK-NEXT: vcmp.s16 gt, q0, zr -; CHECK-NEXT: vmov.u16 r2, q0[4] -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: vmov.32 q1[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[5] -; CHECK-NEXT: vmov.u16 r12, q0[7] -; CHECK-NEXT: vmov.32 q1[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r2 -; CHECK-NEXT: vmov.u16 r2, q0[0] -; CHECK-NEXT: vmov.32 q2[0], r2 -; CHECK-NEXT: vmov.u16 r2, q0[1] -; CHECK-NEXT: vmov.32 q2[1], r2 -; CHECK-NEXT: vmov.u16 r2, q0[2] -; CHECK-NEXT: vmov.32 q2[2], r2 -; CHECK-NEXT: vmov.u16 r2, q0[3] -; CHECK-NEXT: vmov.32 q2[3], r2 -; CHECK-NEXT: vmovlb.u16 q0, q2 -; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vadd.i32 q2, q0, r0 -; CHECK-NEXT: and lr, r1, #1 -; CHECK-NEXT: rsb.w r2, lr, #0 -; CHECK-NEXT: bfi r3, r2, #0, #1 -; CHECK-NEXT: ubfx r2, r1, #2, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #1, #1 -; CHECK-NEXT: ubfx r2, r1, #4, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #2, #1 -; CHECK-NEXT: ubfx r2, r1, #6, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #3, #1 -; CHECK-NEXT: ubfx r2, r1, #8, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #4, #1 -; CHECK-NEXT: ubfx r2, r1, #10, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #5, #1 -; CHECK-NEXT: ubfx r2, r1, #12, #1 -; CHECK-NEXT: ubfx r1, r1, #14, #1 -; CHECK-NEXT: rsbs r2, r2, #0 -; CHECK-NEXT: bfi r3, r2, #6, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r3, r1, #7, #1 -; CHECK-NEXT: uxtb r1, r3 -; CHECK-NEXT: lsls r2, r3, #31 -; CHECK-NEXT: beq .LBB12_2 -; CHECK-NEXT: @ %bb.1: @ %cond.load -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: vdup.16 q0, r3 -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: b .LBB12_3 -; CHECK-NEXT: .LBB12_2: ; CHECK-NEXT: vmov.i16 q0, #0x1 -; CHECK-NEXT: .LBB12_3: @ %else -; CHECK-NEXT: vmov.32 q1[3], r12 -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r2, s9 -; CHECK-NEXT: ldrhmi r2, [r2] -; CHECK-NEXT: vmovmi.16 q0[1], r2 -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r2, s10 -; CHECK-NEXT: ldrhmi r2, [r2] -; CHECK-NEXT: vmovmi.16 q0[2], r2 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: lsls r2, r1, #28 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r2, s11 -; CHECK-NEXT: ldrhmi r2, [r2] -; CHECK-NEXT: vmovmi.16 q0[3], r2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: lsls r0, r1, #27 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r0, s4 -; CHECK-NEXT: ldrhmi r0, [r0] -; CHECK-NEXT: vmovmi.16 q0[4], r0 -; CHECK-NEXT: lsls r0, r1, #26 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r0, s5 -; CHECK-NEXT: ldrhmi r0, [r0] -; CHECK-NEXT: vmovmi.16 q0[5], r0 -; CHECK-NEXT: lsls r0, r1, #25 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r0, s6 -; CHECK-NEXT: ldrhmi r0, [r0] -; CHECK-NEXT: vmovmi.16 q0[6], r0 -; CHECK-NEXT: lsls r0, r1, #24 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r0, s7 -; CHECK-NEXT: ldrhmi r0, [r0] -; CHECK-NEXT: vmovmi.16 q0[7], r0 -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vpt.s16 gt, q1, zr +; CHECK-NEXT: vldrht.u16 q2, [r0, q1, uxtw #1] +; CHECK-NEXT: vpsel q0, q2, q0 +; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.zext %mask = icmp sgt <8 x i16> %offs, zeroinitializer - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %mask, <8 x i16> ) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> %mask, <8 x i16> ) ret <8 x i16> %gather } -declare <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1 -declare <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1 -declare <8 x half> @llvm.masked.gather.v8f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1 +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1 +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1 +declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1 Index: llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll @@ -40,7 +40,7 @@ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext - %gather = call <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) + %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) %gather.zext = zext <8 x i8> %gather to <8 x i16> ret <8 x i16> %gather.zext } @@ -84,7 +84,7 @@ %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext - %gather = call <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) + %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) %gather.sext = sext <8 x i8> %gather to <8 x i16> ret <8 x i16> %gather.sext } @@ -92,92 +92,30 @@ define arm_aapcs_vfpcc <8 x i16> @unscaled_i16_i16(i8* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: unscaled_i16_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1] +; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) ret <8 x i16> %gather } define arm_aapcs_vfpcc <8 x half> @unscaled_f16_i16(i8* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: unscaled_f16_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldr.16 s0, [r3] -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vldr.16 s8, [r2] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vldr.16 s4, [r2] -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr.16 s4, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*> - %gather = call <8 x half> @llvm.masked.gather.v8f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) ret <8 x half> %gather } @@ -220,7 +158,7 @@ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.zext = zext <8 x i8> %offs to <8 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext - %gather = call <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) + %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) %gather.zext = zext <8 x i8> %gather to <8 x i16> ret <8 x i16> %gather.zext } @@ -264,7 +202,7 @@ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.zext = zext <8 x i8> %offs to <8 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext - %gather = call <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) + %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) %gather.sext = sext <8 x i8> %gather to <8 x i16> ret <8 x i16> %gather.sext } @@ -272,95 +210,33 @@ define arm_aapcs_vfpcc <8 x i16> @unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr) { ; CHECK-LABEL: unsigned_unscaled_i16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: vmov.16 q0[2], r12 -; CHECK-NEXT: vmov.16 q0[3], lr -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r3 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1] +; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.zext = zext <8 x i8> %offs to <8 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x i16*> - %gather = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) ret <8 x i16> %gather } define arm_aapcs_vfpcc <8 x half> @unsigned_unscaled_f16_i8(i8* %base, <8 x i8>* %offptr) { ; CHECK-LABEL: unsigned_unscaled_f16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vldr.16 s0, [r2] -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: vldr.16 s0, [r3] -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.16 q0[0], r2 -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vldr.16 s8, [r2] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov r2, s7 -; CHECK-NEXT: vldr.16 s4, [r2] -; CHECK-NEXT: vmov r2, s4 -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vmov.16 q0[3], r2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[5], r0 -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.16 q0[6], r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr.16 s4, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.zext = zext <8 x i8> %offs to <8 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext %ptrs = bitcast <8 x i8*> %byte_ptrs to <8 x half*> - %gather = call <8 x half> @llvm.masked.gather.v8f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) + %gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %ptrs, i32 2, <8 x i1> , <8 x half> undef) ret <8 x half> %gather } -declare <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1 -declare <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1 -declare <8 x half> @llvm.masked.gather.v8f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1 +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1 +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1 +declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1 Index: llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll @@ -24,7 +24,7 @@ entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.zext = zext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -52,7 +52,7 @@ entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.sext = sext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -60,15 +60,13 @@ define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32(i32* %base, <4 x i32>* %offptr) { ; CHECK-LABEL: scaled_i32_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } @@ -77,82 +75,72 @@ define arm_aapcs_vfpcc <4 x float> @scaled_f32_i32(i32* %base, <4 x i32>* %offptr) { ; CHECK-LABEL: scaled_f32_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*> - %gather = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) + %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float> %gather } define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i16(i32* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: unsigned_scaled_b_i32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.zext = zext <4 x i16> %offs to <4 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i16(i32* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: signed_scaled_i32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.sext = sext <4 x i16> %offs to <4 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: a_unsigned_scaled_f32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.zext = zext <4 x i16> %offs to <4 x i32> %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*> - %gather = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) + %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float> %gather } define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i16(i32* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: b_signed_scaled_f32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.sext = sext <4 x i16> %offs to <4 x i32> %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*> - %gather = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) + %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float> %gather } @@ -180,7 +168,7 @@ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.sext = sext <4 x i16> %offs to <4 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.zext = zext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -209,7 +197,7 @@ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.sext = sext <4 x i16> %offs to <4 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.sext = sext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -238,7 +226,7 @@ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.zext = zext <4 x i16> %offs to <4 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.zext = zext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -267,7 +255,7 @@ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.zext = zext <4 x i16> %offs to <4 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.sext = sext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -275,66 +263,58 @@ define arm_aapcs_vfpcc <4 x i32> @unsigned_scaled_b_i32_i8(i32* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: unsigned_scaled_b_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.zext = zext <4 x i8> %offs to <4 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } define arm_aapcs_vfpcc <4 x i32> @signed_scaled_i32_i8(i32* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: signed_scaled_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.sext = sext <4 x i8> %offs to <4 x i32> %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } define arm_aapcs_vfpcc <4 x float> @a_unsigned_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: a_unsigned_scaled_f32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.zext = zext <4 x i8> %offs to <4 x i32> %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.zext %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*> - %gather = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) + %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float> %gather } define arm_aapcs_vfpcc <4 x float> @b_signed_scaled_f32_i8(i32* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: b_signed_scaled_f32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.sext = sext <4 x i8> %offs to <4 x i32> %i32_ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs.sext %ptrs = bitcast <4 x i32*> %i32_ptrs to <4 x float*> - %gather = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) + %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float> %gather } @@ -362,7 +342,7 @@ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.sext = sext <4 x i8> %offs to <4 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.zext = zext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -391,7 +371,7 @@ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.sext = sext <4 x i8> %offs to <4 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.sext - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.sext = sext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -420,7 +400,7 @@ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.zext = zext <4 x i8> %offs to <4 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.zext = zext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -449,13 +429,13 @@ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.zext = zext <4 x i8> %offs to <4 x i32> %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs.zext - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.sext = sext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.sext } -declare <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) -declare <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) -declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) -declare <4 x half> @llvm.masked.gather.v4f16(<4 x half*>, i32, <4 x i1>, <4 x half>) -declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>) +declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) +declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>) Index: llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll @@ -24,7 +24,7 @@ entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs - %gather = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) + %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) %gather.zext = zext <4 x i8> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -52,7 +52,7 @@ entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs - %gather = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) + %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) %gather.sext = sext <4 x i8> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -80,7 +80,7 @@ %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.zext = zext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -108,7 +108,7 @@ %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.sext = sext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -116,94 +116,88 @@ define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr) { ; CHECK-LABEL: unscaled_i32_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } define arm_aapcs_vfpcc <4 x float> @unscaled_f32_i32(i8* %base, <4 x i32>* %offptr) { ; CHECK-LABEL: unscaled_f32_i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*> - %gather = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) + %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float> %gather } define arm_aapcs_vfpcc <4 x i32> @unsigned_unscaled_b_i32_i16(i8* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: unsigned_unscaled_b_i32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.zext = zext <4 x i16> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } define arm_aapcs_vfpcc <4 x i32> @signed_unscaled_i32_i16(i8* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: signed_unscaled_i32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.sext = sext <4 x i16> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } define arm_aapcs_vfpcc <4 x float> @a_unsigned_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: a_unsigned_unscaled_f32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.zext = zext <4 x i16> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*> - %gather = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) + %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float> %gather } define arm_aapcs_vfpcc <4 x float> @b_signed_unscaled_f32_i16(i8* %base, <4 x i16>* %offptr) { ; CHECK-LABEL: b_signed_unscaled_f32_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrh.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.sext = sext <4 x i16> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*> - %gather = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) + %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float> %gather } @@ -231,7 +225,7 @@ %offs.sext = sext <4 x i16> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.zext = zext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -260,7 +254,7 @@ %offs.sext = sext <4 x i16> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.sext = sext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -289,7 +283,7 @@ %offs.zext = zext <4 x i16> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.zext = zext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -318,7 +312,7 @@ %offs.zext = zext <4 x i16> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.sext = sext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -347,7 +341,7 @@ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.sext = sext <4 x i16> %offs to <4 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext - %gather = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) + %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) %gather.zext = zext <4 x i8> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -376,7 +370,7 @@ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.sext = sext <4 x i16> %offs to <4 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext - %gather = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) + %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) %gather.sext = sext <4 x i8> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -405,7 +399,7 @@ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.zext = zext <4 x i16> %offs to <4 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext - %gather = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) + %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) %gather.zext = zext <4 x i8> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -434,7 +428,7 @@ %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 %offs.zext = zext <4 x i16> %offs to <4 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext - %gather = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) + %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) %gather.sext = sext <4 x i8> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -442,64 +436,60 @@ define arm_aapcs_vfpcc <4 x i32> @unsigned_unscaled_b_i32_i8(i8* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: unsigned_unscaled_b_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.zext = zext <4 x i8> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } define arm_aapcs_vfpcc <4 x i32> @signed_unscaled_i32_i8(i8* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: signed_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.sext = sext <4 x i8> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } define arm_aapcs_vfpcc <4 x float> @a_unsigned_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: a_unsigned_unscaled_f32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.zext = zext <4 x i8> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*> - %gather = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) + %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float> %gather } define arm_aapcs_vfpcc <4 x float> @b_signed_unscaled_f32_i8(i8* %base, <4 x i8>* %offptr) { ; CHECK-LABEL: b_signed_unscaled_f32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrb.s32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.sext = sext <4 x i8> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x float*> - %gather = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) + %gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %ptrs, i32 4, <4 x i1> , <4 x float> undef) ret <4 x float> %gather } @@ -527,7 +517,7 @@ %offs.sext = sext <4 x i8> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.zext = zext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -556,7 +546,7 @@ %offs.sext = sext <4 x i8> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.sext = sext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -585,7 +575,7 @@ %offs.zext = zext <4 x i8> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.zext = zext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -614,7 +604,7 @@ %offs.zext = zext <4 x i8> %offs to <4 x i32> %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i16*> - %gather = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) + %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %ptrs, i32 2, <4 x i1> , <4 x i16> undef) %gather.sext = sext <4 x i16> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -643,7 +633,7 @@ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.sext = sext <4 x i8> %offs to <4 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext - %gather = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) + %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) %gather.zext = zext <4 x i8> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -672,7 +662,7 @@ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.sext = sext <4 x i8> %offs to <4 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.sext - %gather = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) + %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) %gather.sext = sext <4 x i8> %gather to <4 x i32> ret <4 x i32> %gather.sext } @@ -701,7 +691,7 @@ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.zext = zext <4 x i8> %offs to <4 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext - %gather = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) + %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) %gather.zext = zext <4 x i8> %gather to <4 x i32> ret <4 x i32> %gather.zext } @@ -730,13 +720,13 @@ %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 %offs.zext = zext <4 x i8> %offs to <4 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs.zext - %gather = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) + %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %ptrs, i32 1, <4 x i1> , <4 x i8> undef) %gather.sext = sext <4 x i8> %gather to <4 x i32> ret <4 x i32> %gather.sext } -declare <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) -declare <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) -declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) -declare <4 x half> @llvm.masked.gather.v4f16(<4 x half*>, i32, <4 x i1>, <4 x half>) -declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>) +declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) +declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>) Index: llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -5,70 +5,14 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr) { ; CHECK-LABEL: unscaled_v16i8_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} -; CHECK-NEXT: vldrb.u32 q0, [r1, #8] -; CHECK-NEXT: vldrb.u32 q2, [r1, #4] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrb.u32 q0, [r1, #12] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov r4, s3 -; CHECK-NEXT: vmov r5, s7 -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s0 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb.w lr, [r2] -; CHECK-NEXT: vmov r2, s1 -; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r6 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r5 -; CHECK-NEXT: vmov.8 q0[12], lr -; CHECK-NEXT: vmov.8 q0[13], r2 -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov.8 q0[15], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vldrb.u8 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %offs.zext = zext <16 x i8> %offs to <16 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.zext - %gather = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) ret <16 x i8> %gather } @@ -111,7 +55,7 @@ %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 %offs.zext = zext <8 x i8> %offs to <8 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext - %gather = call <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) + %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> , <8 x i8> undef) ret <8 x i8> %gather } @@ -136,7 +80,7 @@ %offs = load <2 x i8>, <2 x i8>* %offptr, align 1 %offs.zext = zext <2 x i8> %offs to <2 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <2 x i32> %offs.zext - %gather = call <2 x i8> @llvm.masked.gather.v2i8(<2 x i8*> %ptrs, i32 1, <2 x i1> , <2 x i8> undef) + %gather = call <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*> %ptrs, i32 1, <2 x i1> , <2 x i8> undef) ret <2 x i8> %gather } @@ -207,7 +151,7 @@ %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %offs.sext = sext <16 x i8> %offs to <16 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext - %gather = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) ret <16 x i8> %gather } @@ -278,7 +222,7 @@ %offs = load <16 x i16>, <16 x i16>* %offptr, align 2 %offs.sext = sext <16 x i16> %offs to <16 x i32> %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs.sext - %gather = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) ret <16 x i8> %gather } @@ -354,7 +298,7 @@ %offs.zext = zext <16 x i8> %offs to <16 x i32> %ptrs32 = getelementptr inbounds i32, i32* %base, <16 x i32> %offs.zext %ptrs = bitcast <16 x i32*> %ptrs32 to <16 x i8*> - %gather = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) ret <16 x i8> %gather } @@ -424,10 +368,10 @@ entry: %offs = load <16 x i32>, <16 x i32>* %offptr, align 4 %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %offs - %gather = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) ret <16 x i8> %gather } -declare <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) -declare <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) -declare <2 x i8> @llvm.masked.gather.v2i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) +declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>) Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-opt.ll @@ -1,11 +1,12 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o 2>/dev/null - | FileCheck --check-prefix NOGATSCAT %s ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-mve -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck --check-prefix NOMVE %s define arm_aapcs_vfpcc <4 x i32> @unscaled_i32_i32(i8* %base, <4 x i32>* %offptr) { ; NOGATSCAT-LABEL: unscaled_i32_i32: ; NOGATSCAT: @ %bb.0: @ %entry -; NOGATSCAT-NEXT: vldrw.u32 q0, [r1] -; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0 +; NOGATSCAT-NEXT: vldrw.u32 q0, [r1] +; NOGATSCAT-NEXT: vadd.i32 q0, q0, r0 ; NOGATSCAT-NEXT: vmov r0, s0 ; NOGATSCAT-NEXT: vmov r3, s1 ; NOGATSCAT-NEXT: vmov r1, s2 @@ -19,26 +20,27 @@ ; NOGATSCAT-NEXT: vmov.32 q0[2], r1 ; NOGATSCAT-NEXT: vmov.32 q0[3], r2 ; NOGATSCAT-NEXT: bx lr - +; ; NOMVE-LABEL: unscaled_i32_i32: ; NOMVE: @ %bb.0: @ %entry -; NOMVE-NEXT: .save {r4, lr} -; NOMVE-NEXT: push {r4, lr} -; NOMVE-NEXT: ldm.w r1, {r2, r3, lr} -; NOMVE-NEXT: ldr r4, [r1, #12] -; NOMVE-NEXT: ldr.w r12, [r0, r2] -; NOMVE-NEXT: ldr r1, [r0, r3] -; NOMVE-NEXT: ldr.w r2, [r0, lr] -; NOMVE-NEXT: ldr r3, [r0, r4] -; NOMVE-NEXT: mov r0, r12 -; NOMVE-NEXT: pop {r4, pc} +; NOMVE-NEXT: .save {r4, lr} +; NOMVE-NEXT: push {r4, lr} +; NOMVE-NEXT: ldm.w r1, {r2, r3, lr} +; NOMVE-NEXT: ldr r4, [r1, #12] +; NOMVE-NEXT: ldr.w r12, [r0, r2] +; NOMVE-NEXT: ldr r1, [r0, r3] +; NOMVE-NEXT: ldr.w r2, [r0, lr] +; NOMVE-NEXT: ldr r3, [r0, r4] +; NOMVE-NEXT: mov r0, r12 +; NOMVE-NEXT: pop {r4, pc} + entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 %byte_ptrs = getelementptr inbounds i8, i8* %base, <4 x i32> %offs %ptrs = bitcast <4 x i8*> %byte_ptrs to <4 x i32*> - %gather = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> , <4 x i32> undef) ret <4 x i32> %gather } -declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)