Diff 309872

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

Show First 20 Lines • Show All 1,740 Lines • ▼ Show 20 Lines	void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));		std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));

SDValue Ch = MGT->getChain();		SDValue Ch = MGT->getChain();
SDValue Ptr = MGT->getBasePtr();		SDValue Ptr = MGT->getBasePtr();
SDValue Mask = MGT->getMask();		SDValue Mask = MGT->getMask();
SDValue PassThru = MGT->getPassThru();		SDValue PassThru = MGT->getPassThru();
SDValue Index = MGT->getIndex();		SDValue Index = MGT->getIndex();
SDValue Scale = MGT->getScale();		SDValue Scale = MGT->getScale();
		EVT MemoryVT = MGT->getMemoryVT();
Align Alignment = MGT->getOriginalAlign();		Align Alignment = MGT->getOriginalAlign();

// Split Mask operand		// Split Mask operand
SDValue MaskLo, MaskHi;		SDValue MaskLo, MaskHi;
if (Mask.getOpcode() == ISD::SETCC) {		if (Mask.getOpcode() == ISD::SETCC) {
SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);		SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
} else {		} else {
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)		if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Mask, MaskLo, MaskHi);		GetSplitVector(Mask, MaskLo, MaskHi);
else		else
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);		std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
}		}

		EVT LoMemVT, HiMemVT;
		// Split MemoryVT
		std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
		yubing Unsubmitted Not Done Reply Inline Actions Do we really need to split the MemoryType here? I observed that in WidenVecRes_MGATHER we don't widen the MemoryType. If we have a v17i32 masked_gather on AVX512, we widen it to a v32i32 masked_gather with a v17i32 MemoryType. When SplitVecRes_MGATHER processes this v32i32 masked_gather, line 1765 will hit an assertion failure, since the type being split is v17i32. yubing: Do we really need to split the MemoryType here? I observed that in WidenVecRes_MGATHER, we…

SDValue PassThruLo, PassThruHi;		SDValue PassThruLo, PassThruHi;
if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)		if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(PassThru, PassThruLo, PassThruHi);		GetSplitVector(PassThru, PassThruLo, PassThruHi);
else		else
std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);		std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);

SDValue IndexHi, IndexLo;		SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)		if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(Index, IndexLo, IndexHi);		GetSplitVector(Index, IndexLo, IndexHi);
else		else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);		std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);

MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(		MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MGT->getPointerInfo(), MachineMemOperand::MOLoad,		MGT->getPointerInfo(), MachineMemOperand::MOLoad,
MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),		MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
MGT->getRanges());		MGT->getRanges());

SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};		SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,		Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl, OpsLo,
MMO, MGT->getIndexType());		MMO, MGT->getIndexType());

SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};		SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi,		Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl, OpsHi,
MMO, MGT->getIndexType());		MMO, MGT->getIndexType());

// Build a factor node to remember that this load is independent of the		// Build a factor node to remember that this load is independent of the
// other one.		// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),		Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));		Hi.getValue(1));

// Legalize the chain result - switch anything that used the old chain to		// Legalize the chain result - switch anything that used the old chain to
▲ Show 20 Lines • Show All 623 Lines • ▼ Show 20 Lines	else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);		std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);

MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(		MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MGT->getPointerInfo(), MachineMemOperand::MOLoad,		MGT->getPointerInfo(), MachineMemOperand::MOLoad,
MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),		MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
MGT->getRanges());		MGT->getRanges());

SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};		SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,		SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoMemVT, dl,
OpsLo, MMO, MGT->getIndexType());		OpsLo, MMO, MGT->getIndexType());

SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};		SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,		SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiMemVT, dl,
OpsHi, MMO, MGT->getIndexType());		OpsHi, MMO, MGT->getIndexType());

// Build a factor node to remember that this load is independent of the		// Build a factor node to remember that this load is independent of the
// other one.		// other one.
Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),		Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
Hi.getValue(1));		Hi.getValue(1));

// Legalize the chain result - switch anything that used the old chain to		// Legalize the chain result - switch anything that used the old chain to
▲ Show 20 Lines • Show All 2,903 Lines • Show Last 20 Lines

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,304 Lines • ▼ Show 20 Lines	ID.AddInteger(getSyntheticNodeSubclassData<MaskedGatherSDNode>(
dl.getIROrder(), VTs, VT, MMO, IndexType));		dl.getIROrder(), VTs, VT, MMO, IndexType));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());		ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;		void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {		if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
cast<MaskedGatherSDNode>(E)->refineAlignment(MMO);		cast<MaskedGatherSDNode>(E)->refineAlignment(MMO);
return SDValue(E, 0);		return SDValue(E, 0);
}		}

		IndexType = TLI->getCanonicalIndexType(IndexType, VT, Ops[4]);
auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),		auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
VTs, VT, MMO, IndexType);		VTs, VT, MMO, IndexType);
createOperands(N, Ops);		createOperands(N, Ops);

assert(N->getPassThru().getValueType() == N->getValueType(0) &&		assert(N->getPassThru().getValueType() == N->getValueType(0) &&
"Incompatible type of the PassThru value in MaskedGatherSDNode");		"Incompatible type of the PassThru value in MaskedGatherSDNode");
assert(N->getMask().getValueType().getVectorNumElements() ==		assert(N->getMask().getValueType().getVectorElementCount() ==
N->getValueType(0).getVectorNumElements() &&		N->getValueType(0).getVectorElementCount() &&
"Vector width mismatch between mask and data");		"Vector width mismatch between mask and data");
assert(N->getIndex().getValueType().getVectorNumElements() >=		assert(N->getIndex().getValueType().getVectorElementCount().isScalable() ==
N->getValueType(0).getVectorNumElements() &&		N->getValueType(0).getVectorElementCount().isScalable() &&
		"Scalable flags of index and data do not match");
		assert(ElementCount::isKnownGE(
		N->getIndex().getValueType().getVectorElementCount(),
		N->getValueType(0).getVectorElementCount()) &&
"Vector width mismatch between index and data");		"Vector width mismatch between index and data");
assert(isa<ConstantSDNode>(N->getScale()) &&		assert(isa<ConstantSDNode>(N->getScale()) &&
cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&		cast<ConstantSDNode>(N->getScale())->getAPIntValue().isPowerOf2() &&
"Scale should be a constant power of 2");		"Scale should be a constant power of 2");

CSEMap.InsertNode(N, IP);		CSEMap.InsertNode(N, IP);
InsertNode(N);		InsertNode(N);
SDValue V(N, 0);		SDValue V(N, 0);
▲ Show 20 Lines • Show All 2,804 Lines • Show Last 20 Lines

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,410 Lines • ▼ Show 20 Lines	MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo(AS), MachineMemOperand::MOLoad,		MachinePointerInfo(AS), MachineMemOperand::MOLoad,
// TODO: Make MachineMemOperands aware of scalable		// TODO: Make MachineMemOperands aware of scalable
// vectors.		// vectors.
MemoryLocation::UnknownSize, Alignment, AAInfo, Ranges);		MemoryLocation::UnknownSize, Alignment, AAInfo, Ranges);

if (!UniformBase) {		if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));		Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);		Index = getValue(Ptr);
IndexType = ISD::SIGNED_SCALED;		IndexType = ISD::SIGNED_UNSCALED;
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));		Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}		}
SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };		SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };
SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,		SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,
Ops, MMO, IndexType);		Ops, MMO, IndexType);

PendingLoads.push_back(Gather.getValue(1));		PendingLoads.push_back(Gather.getValue(1));
setValue(&I, Gather);		setValue(&I, Gather);
▲ Show 20 Lines • Show All 6,280 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Show First 20 Lines • Show All 799 Lines • ▼ Show 20 Lines	SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,		CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,		const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &DL, SelectionDAG &DAG,		const SDLoc &DL, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals, bool isThisReturn,		SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
SDValue ThisVal) const;		SDValue ThisVal) const;

SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;

		SDValue LowerMGATHER(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerMSCATTER(SDValue Op, SelectionDAG &DAG) const;

SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;		SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;

bool isEligibleForTailCallOptimization(		bool isEligibleForTailCallOptimization(
SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,		SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,		const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,		const SmallVectorImpl<SDValue> &OutVals,
▲ Show 20 Lines • Show All 208 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 107 Lines • ▼ Show 20 Lines	cl::opt<bool> EnableAArch64ELFLocalDynamicTLSGeneration(
cl::init(false));		cl::init(false));

static cl::opt<bool>		static cl::opt<bool>
EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,		EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
cl::desc("Enable AArch64 logical imm instruction "		cl::desc("Enable AArch64 logical imm instruction "
"optimization"),		"optimization"),
cl::init(true));		cl::init(true));

		// Temporary option added for the purpose of testing functionality added
		sdesmalen Unsubmitted Not Done Reply Inline Actions nit: can you add a comment here to describe that this option is temporary and only exists for the purpose of testing functionality added to DAGCombiner.cpp? Perhaps you can also say that there is the expectation to remove it in the future, when both implementations will be based off MGATHER rather than relying on the ISD nodes we added for the `llvm.aarch64.sve.ld1.gather` intrinsics. sdesmalen: nit: can you add a comment here to describe that this option is temporary and only exists for…
		// to DAGCombiner.cpp in D92230. It is expected that this can be removed
		// in future when both implementations will be based off MGATHER rather
		// than the GLD1 nodes added for the SVE gather load intrinsics.
		// Hidden boolean flag "aarch64-enable-mgather-combine", on by default.
		// When it is false, the combine code that checks it returns an empty
		// SDValue() before reaching the gather-load extend combines; the new SVE
		// masked-gather test passes -aarch64-enable-mgather-combine=0.
		static cl::opt<bool>
		EnableCombineMGatherIntrinsics("aarch64-enable-mgather-combine", cl::Hidden,
		cl::desc("Combine extends of AArch64 masked "
		"gather intrinsics"),
		cl::init(true));

/// Value type used for condition codes.		/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;		static const MVT MVT_CC = MVT::i32;

static inline EVT getPackedSVEVectorVT(EVT VT) {		static inline EVT getPackedSVEVectorVT(EVT VT) {
switch (VT.getSimpleVT().SimpleTy) {		switch (VT.getSimpleVT().SimpleTy) {
default:		default:
llvm_unreachable("unexpected element type for vector");		llvm_unreachable("unexpected element type for vector");
case MVT::i8:		case MVT::i8:
▲ Show 20 Lines • Show All 930 Lines • ▼ Show 20 Lines	if (Subtarget->hasSVE()) {
// splat of 0 or undef) once vector selects supported in SVE codegen. See		// splat of 0 or undef) once vector selects supported in SVE codegen. See
// D68877 for more details.		// D68877 for more details.
for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {		for (auto VT : {MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64}) {
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);		setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::UINT_TO_FP, VT, Custom);		setOperationAction(ISD::UINT_TO_FP, VT, Custom);
setOperationAction(ISD::SINT_TO_FP, VT, Custom);		setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::FP_TO_UINT, VT, Custom);		setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::FP_TO_SINT, VT, Custom);		setOperationAction(ISD::FP_TO_SINT, VT, Custom);
		setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);		setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);		setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);		setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);		setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SDIV, VT, Custom);		setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);		setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::SMIN, VT, Custom);		setOperationAction(ISD::SMIN, VT, Custom);
setOperationAction(ISD::UMIN, VT, Custom);		setOperationAction(ISD::UMIN, VT, Custom);
Show All 36 Lines	for (auto VT : {MVT::nxv16i1, MVT::nxv8i1, MVT::nxv4i1, MVT::nxv2i1}) {
AddPromotedToType(ISD::UINT_TO_FP, VT, getPromotedVTForPredicate(VT));		AddPromotedToType(ISD::UINT_TO_FP, VT, getPromotedVTForPredicate(VT));
}		}
}		}

for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,		for (auto VT : {MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32,
MVT::nxv4f32, MVT::nxv2f64}) {		MVT::nxv4f32, MVT::nxv2f64}) {
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);		setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);		setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
		setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);		setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);		setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);		setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::FADD, VT, Custom);		setOperationAction(ISD::FADD, VT, Custom);
setOperationAction(ISD::FDIV, VT, Custom);		setOperationAction(ISD::FDIV, VT, Custom);
setOperationAction(ISD::FMA, VT, Custom);		setOperationAction(ISD::FMA, VT, Custom);
setOperationAction(ISD::FMUL, VT, Custom);		setOperationAction(ISD::FMUL, VT, Custom);
setOperationAction(ISD::FNEG, VT, Custom);		setOperationAction(ISD::FNEG, VT, Custom);
▲ Show 20 Lines • Show All 2,648 Lines • ▼ Show 20 Lines	bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {

return false;		return false;
}		}

bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {		bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector();		return ExtVal.getValueType().isScalableVector();
}		}

		unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
		std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
		{std::make_tuple(/Scaled/ false, /Signed/ false, /Extend/ false),
		AArch64ISD::GLD1_MERGE_ZERO},
		{std::make_tuple(/Scaled/ false, /Signed/ false, /Extend/ true),
		AArch64ISD::GLD1_UXTW_MERGE_ZERO},
		{std::make_tuple(/Scaled/ false, /Signed/ true, /Extend/ false),
		AArch64ISD::GLD1_MERGE_ZERO},
		{std::make_tuple(/Scaled/ false, /Signed/ true, /Extend/ true),
		AArch64ISD::GLD1_SXTW_MERGE_ZERO},
		{std::make_tuple(/Scaled/ true, /Signed/ false, /Extend/ false),
		AArch64ISD::GLD1_SCALED_MERGE_ZERO},
		{std::make_tuple(/Scaled/ true, /Signed/ false, /Extend/ true),
		AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO},
		{std::make_tuple(/Scaled/ true, /Signed/ true, /Extend/ false),
		AArch64ISD::GLD1_SCALED_MERGE_ZERO},
		{std::make_tuple(/Scaled/ true, /Signed/ true, /Extend/ true),
		AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO},
		};
		auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
		return AddrModes.find(Key)->second;
		}

unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {		unsigned getScatterVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {		std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
{std::make_tuple(/Scaled/ false, /Signed/ false, /Extend/ false),		{std::make_tuple(/Scaled/ false, /Signed/ false, /Extend/ false),
AArch64ISD::SST1_PRED},		AArch64ISD::SST1_PRED},
{std::make_tuple(/Scaled/ false, /Signed/ false, /Extend/ true),		{std::make_tuple(/Scaled/ false, /Signed/ false, /Extend/ true),
AArch64ISD::SST1_UXTW_PRED},		AArch64ISD::SST1_UXTW_PRED},
{std::make_tuple(/Scaled/ false, /Signed/ true, /Extend/ false),		{std::make_tuple(/Scaled/ false, /Signed/ true, /Extend/ false),
AArch64ISD::SST1_PRED},		AArch64ISD::SST1_PRED},
{std::make_tuple(/Scaled/ false, /Signed/ true, /Extend/ true),		{std::make_tuple(/Scaled/ false, /Signed/ true, /Extend/ true),
AArch64ISD::SST1_SXTW_PRED},		AArch64ISD::SST1_SXTW_PRED},
{std::make_tuple(/Scaled/ true, /Signed/ false, /Extend/ false),		{std::make_tuple(/Scaled/ true, /Signed/ false, /Extend/ false),
AArch64ISD::SST1_SCALED_PRED},		AArch64ISD::SST1_SCALED_PRED},
{std::make_tuple(/Scaled/ true, /Signed/ false, /Extend/ true),		{std::make_tuple(/Scaled/ true, /Signed/ false, /Extend/ true),
AArch64ISD::SST1_UXTW_SCALED_PRED},		AArch64ISD::SST1_UXTW_SCALED_PRED},
{std::make_tuple(/Scaled/ true, /Signed/ true, /Extend/ false),		{std::make_tuple(/Scaled/ true, /Signed/ true, /Extend/ false),
AArch64ISD::SST1_SCALED_PRED},		AArch64ISD::SST1_SCALED_PRED},
{std::make_tuple(/Scaled/ true, /Signed/ true, /Extend/ true),		{std::make_tuple(/Scaled/ true, /Signed/ true, /Extend/ true),
AArch64ISD::SST1_SXTW_SCALED_PRED},		AArch64ISD::SST1_SXTW_SCALED_PRED},
};		};
auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);		auto Key = std::make_tuple(IsScaled, IsSigned, NeedsExtend);
return AddrModes.find(Key)->second;		return AddrModes.find(Key)->second;
}		}

bool getScatterIndexIsExtended(SDValue Index) {		bool getGatherScatterIndexIsExtended(SDValue Index) {
unsigned Opcode = Index.getOpcode();		unsigned Opcode = Index.getOpcode();
if (Opcode == ISD::SIGN_EXTEND_INREG)		if (Opcode == ISD::SIGN_EXTEND_INREG)
return true;		return true;

if (Opcode == ISD::AND) {		if (Opcode == ISD::AND) {
SDValue Splat = Index.getOperand(1);		SDValue Splat = Index.getOperand(1);
if (Splat.getOpcode() != ISD::SPLAT_VECTOR)		if (Splat.getOpcode() != ISD::SPLAT_VECTOR)
return false;		return false;
ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));		ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(Splat.getOperand(0));
if (!Mask \|\| Mask->getZExtValue() != 0xFFFFFFFF)		if (!Mask \|\| Mask->getZExtValue() != 0xFFFFFFFF)
return false;		return false;
return true;		return true;
}		}

return false;		return false;
}		}

		// Custom-lowers an ISD::MGATHER node to one of the AArch64ISD::GLD1*
		// gather-load nodes chosen by getGatherVecOpcode().
		//
		// Op  - the node to lower; it is cast to MaskedGatherSDNode.
		// DAG - the SelectionDAG used to create the replacement nodes.
		//
		// Returns the new gather node, whose value types are the (possibly
		// integer-reinterpreted) passthru type and MVT::Other. Returns an empty
		// SDValue() for bf16 element types when the subtarget lacks BF16.
		SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
		                                            SelectionDAG &DAG) const {
		  SDLoc DL(Op);
		  MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
		  assert(MGT && "Can only custom lower gather load nodes");

		  SDValue Index = MGT->getIndex();
		  SDValue Chain = MGT->getChain();
		  SDValue PassThru = MGT->getPassThru();
		  SDValue Mask = MGT->getMask();
		  SDValue BasePtr = MGT->getBasePtr();

		  // Decompose the index type into the two flags getGatherVecOpcode() expects.
		  ISD::MemIndexType IndexType = MGT->getIndexType();
		  bool IsScaled =
		      IndexType == ISD::SIGNED_SCALED || IndexType == ISD::UNSIGNED_SCALED;
		  bool IsSigned =
		      IndexType == ISD::SIGNED_SCALED || IndexType == ISD::SIGNED_UNSCALED;
		  // An extending opcode is requested when the index is itself an extend
		  // node, or when its elements are i32.
		  bool IdxNeedsExtend =
		      getGatherScatterIndexIsExtended(Index) ||
		      Index.getSimpleValueType().getVectorElementType() == MVT::i32;

		  EVT VT = PassThru.getSimpleValueType();
		  EVT MemVT = MGT->getMemoryVT();
		  SDValue InputVT = DAG.getValueType(MemVT);

		  if (VT.getVectorElementType() == MVT::bf16 &&
		      !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
		    return SDValue();

		  // Handle FP data: reinterpret the passthru as an integer vector whose
		  // element width is SVEBitsPerBlock divided by the minimum element count,
		  // and describe the memory type to the gather node as its integer
		  // equivalent.
		  if (VT.isFloatingPoint()) {
		    VT = VT.changeVectorElementTypeToInteger();
		    ElementCount EC = VT.getVectorElementCount();
		    auto ScalarIntVT =
		        MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
		    PassThru = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL,
		                           MVT::getVectorVT(ScalarIntVT, EC), PassThru);

		    InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
		  }

		  // The result type follows the passthru, which is already the integer
		  // reinterpretation for FP data.
		  SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);

		  // NOTE(review): LowerMSCATTER replaces Index with Index.getOperand(0) when
		  // getGatherScatterIndexIsExtended(Index) holds; here the extend node is
		  // kept as the operand even though IdxNeedsExtend already selects an
		  // extending opcode. Confirm whether that difference is intended.
		  SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
		  return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
		                     VTs, Ops);
		}

SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,		SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc DL(Op);		SDLoc DL(Op);
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);		MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
assert(MSC && "Can only custom lower scatter store nodes");		assert(MSC && "Can only custom lower scatter store nodes");

SDValue Index = MSC->getIndex();		SDValue Index = MSC->getIndex();
SDValue Chain = MSC->getChain();		SDValue Chain = MSC->getChain();
SDValue StoreVal = MSC->getValue();		SDValue StoreVal = MSC->getValue();
SDValue Mask = MSC->getMask();		SDValue Mask = MSC->getMask();
SDValue BasePtr = MSC->getBasePtr();		SDValue BasePtr = MSC->getBasePtr();

ISD::MemIndexType IndexType = MSC->getIndexType();		ISD::MemIndexType IndexType = MSC->getIndexType();
bool IsScaled =		bool IsScaled =
IndexType == ISD::SIGNED_SCALED \|\| IndexType == ISD::UNSIGNED_SCALED;		IndexType == ISD::SIGNED_SCALED \|\| IndexType == ISD::UNSIGNED_SCALED;
bool IsSigned =		bool IsSigned =
IndexType == ISD::SIGNED_SCALED \|\| IndexType == ISD::SIGNED_UNSCALED;		IndexType == ISD::SIGNED_SCALED \|\| IndexType == ISD::SIGNED_UNSCALED;
bool NeedsExtend =		bool NeedsExtend =
getScatterIndexIsExtended(Index) \|\|		getGatherScatterIndexIsExtended(Index) \|\|
Index.getSimpleValueType().getVectorElementType() == MVT::i32;		Index.getSimpleValueType().getVectorElementType() == MVT::i32;

EVT VT = StoreVal.getSimpleValueType();		EVT VT = StoreVal.getSimpleValueType();
SDVTList VTs = DAG.getVTList(MVT::Other);		SDVTList VTs = DAG.getVTList(MVT::Other);
EVT MemVT = MSC->getMemoryVT();		EVT MemVT = MSC->getMemoryVT();
SDValue InputVT = DAG.getValueType(MemVT);		SDValue InputVT = DAG.getValueType(MemVT);

if (VT.getVectorElementType() == MVT::bf16 &&		if (VT.getVectorElementType() == MVT::bf16 &&
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())		!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();		return SDValue();

// Handle FP data		// Handle FP data
if (VT.isFloatingPoint()) {		if (VT.isFloatingPoint()) {
VT = VT.changeVectorElementTypeToInteger();		VT = VT.changeVectorElementTypeToInteger();
ElementCount EC = VT.getVectorElementCount();		ElementCount EC = VT.getVectorElementCount();
auto ScalarIntVT =		auto ScalarIntVT =
MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue());		MVT::getIntegerVT(AArch64::SVEBitsPerBlock / EC.getKnownMinValue());
StoreVal = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL,		StoreVal = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL,
MVT::getVectorVT(ScalarIntVT, EC), StoreVal);		MVT::getVectorVT(ScalarIntVT, EC), StoreVal);

InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());		InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
}		}

if (getScatterIndexIsExtended(Index))		if (getGatherScatterIndexIsExtended(Index))
Index = Index.getOperand(0);		Index = Index.getOperand(0);

SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};		SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
return DAG.getNode(getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend), DL,		return DAG.getNode(getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend), DL,
VTs, Ops);		VTs, Ops);
}		}

// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.		// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
▲ Show 20 Lines • Show All 284 Lines • ▼ Show 20 Lines	SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::FLT_ROUNDS_:		case ISD::FLT_ROUNDS_:
return LowerFLT_ROUNDS_(Op, DAG);		return LowerFLT_ROUNDS_(Op, DAG);
case ISD::MUL:		case ISD::MUL:
return LowerMUL(Op, DAG);		return LowerMUL(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN:		case ISD::INTRINSIC_WO_CHAIN:
return LowerINTRINSIC_WO_CHAIN(Op, DAG);		return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::STORE:		case ISD::STORE:
return LowerSTORE(Op, DAG);		return LowerSTORE(Op, DAG);
		case ISD::MGATHER:
		return LowerMGATHER(Op, DAG);
case ISD::MSCATTER:		case ISD::MSCATTER:
return LowerMSCATTER(Op, DAG);		return LowerMSCATTER(Op, DAG);
case ISD::VECREDUCE_SEQ_FADD:		case ISD::VECREDUCE_SEQ_FADD:
return LowerVECREDUCE_SEQ_FADD(Op, DAG);		return LowerVECREDUCE_SEQ_FADD(Op, DAG);
case ISD::VECREDUCE_ADD:		case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_AND:		case ISD::VECREDUCE_AND:
case ISD::VECREDUCE_OR:		case ISD::VECREDUCE_OR:
case ISD::VECREDUCE_XOR:		case ISD::VECREDUCE_XOR:
▲ Show 20 Lines • Show All 7,844 Lines • ▼ Show 20 Lines	Dup = DAG.getNode(AArch64ISD::DUP, DL,
DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));		DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));

SDValue And = DAG.getNode(ISD::AND, DL,		SDValue And = DAG.getNode(ISD::AND, DL,
UnpkOp->getValueType(0), UnpkOp, Dup);		UnpkOp->getValueType(0), UnpkOp, Dup);

return DAG.getNode(Opc, DL, N->getValueType(0), And);		return DAG.getNode(Opc, DL, N->getValueType(0), And);
}		}

		if (!EnableCombineMGatherIntrinsics)
		return SDValue();

SDValue Mask = N->getOperand(1);		SDValue Mask = N->getOperand(1);

if (!Src.hasOneUse())		if (!Src.hasOneUse())
return SDValue();		return SDValue();

EVT MemVT;		EVT MemVT;

// SVE load instructions perform an implicit zero-extend, which makes them		// SVE load instructions perform an implicit zero-extend, which makes them
▲ Show 20 Lines • Show All 2,947 Lines • ▼ Show 20 Lines	if (Opc == AArch64ISD::UUNPKHI \|\| Opc == AArch64ISD::UUNPKLO) {
EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());		EVT ExtVT = VT.getDoubleNumVectorElementsVT(*DAG.getContext());

SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),		SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
ExtOp, DAG.getValueType(ExtVT));		ExtOp, DAG.getValueType(ExtVT));

return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);		return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
}		}

		if (!EnableCombineMGatherIntrinsics)
		return SDValue();

// SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates		// SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
// for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.		// for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
unsigned NewOpc;		unsigned NewOpc;
unsigned MemVTOpNum = 4;		unsigned MemVTOpNum = 4;
switch (Opc) {		switch (Opc) {
case AArch64ISD::LD1_MERGE_ZERO:		case AArch64ISD::LD1_MERGE_ZERO:
NewOpc = AArch64ISD::LD1S_MERGE_ZERO;		NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
MemVTOpNum = 3;		MemVTOpNum = 3;
▲ Show 20 Lines • Show All 1,756 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s

				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
				; scaled unpacked 32-bit offsets
				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

				; Gathers i16 elements via a GEP over i16 with <vscale x 2 x i32> offsets, then zero-extends the result to i64.
				define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
				; CHECK-NEXT: and z0.d, z0.d, #0xffff
				; CHECK-NEXT: ret
				%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gathers i32 elements via a GEP over i32 with <vscale x 2 x i32> offsets, then zero-extends the result to i64.
				define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ret
				%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gathers i64 elements via a GEP over i64 with <vscale x 2 x i32> offsets; the result is returned without extension.
				define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
				; CHECK-NEXT: ret
				%ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
				%vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
				ret <vscale x 2 x i64> %vals
				}

				; Gathers half elements via a GEP over half with <vscale x 2 x i32> offsets.
				define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
				; CHECK-NEXT: ret
				%ptrs = getelementptr half, half* %base, <vscale x 2 x i32> %offsets
				%vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
				ret <vscale x 2 x half> %vals
				}

				; Gathers float elements via a GEP over float with <vscale x 2 x i32> offsets.
				define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
				; CHECK-NEXT: ret
				%ptrs = getelementptr float, float* %base, <vscale x 2 x i32> %offsets
				%vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
				ret <vscale x 2 x float> %vals
				}

				; Gathers double elements via a GEP over double with <vscale x 2 x i32> offsets.
				define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
				; CHECK-NEXT: ret
				%ptrs = getelementptr double, double* %base, <vscale x 2 x i32> %offsets
				%vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
				ret <vscale x 2 x double> %vals
				}

				; Gathers i16 elements via a GEP over i16 with <vscale x 2 x i32> offsets, then sign-extends the result to i64.
				define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
				; CHECK-NEXT: sxth z0.d, p1/m, z0.d
				; CHECK-NEXT: ret
				%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i32> %offsets
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				; Gathers i32 elements via a GEP over i32 with <vscale x 2 x i32> offsets, then sign-extends the result to i64.
				define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ret
				%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i32> %offsets
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
				; scaled packed 32-bit offsets
				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

				; Gathers i16 elements via a GEP over i16 with <vscale x 4 x i32> offsets, then zero-extends the result to i32.
				define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
				; CHECK-NEXT: and z0.s, z0.s, #0xffff
				; CHECK-NEXT: ret
				%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
				%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
				%vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.zext
				}

				; Gathers i32 elements via a GEP over i32 with <vscale x 4 x i32> offsets; the result is returned without extension.
				define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
				; CHECK-NEXT: ret
				%ptrs = getelementptr i32, i32* %base, <vscale x 4 x i32> %offsets
				%vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
				ret <vscale x 4 x i32> %vals
				}

				; Gathers half elements via a GEP over half with <vscale x 4 x i32> offsets.
				define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
				; CHECK-NEXT: ret
				%ptrs = getelementptr half, half* %base, <vscale x 4 x i32> %offsets
				%vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
				ret <vscale x 4 x half> %vals
				}

				; Gathers float elements via a GEP over float with <vscale x 4 x i32> offsets.
				define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw #2]
				; CHECK-NEXT: ret
				%ptrs = getelementptr float, float* %base, <vscale x 4 x i32> %offsets
				%vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
				ret <vscale x 4 x float> %vals
				}

				; Gathers i16 elements via a GEP over i16 with <vscale x 4 x i32> offsets, then sign-extends the result to i32.
				define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv4i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
				; CHECK-NEXT: ptrue p0.s
				; CHECK-NEXT: sxth z0.s, p0/m, z0.s
				; CHECK-NEXT: ret
				%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i32> %offsets
				%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
				%vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.sext
				}

				declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
				declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
				declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
				declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
				declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
				declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

				declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
				declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
				declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
				declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s

				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
				; unscaled unpacked 32-bit offsets
				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

				; Gathers i8 elements via a GEP over i8 with <vscale x 2 x i32> offsets, then zero-extends the result to i64.
				define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i8:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
				; CHECK-NEXT: and z0.d, z0.d, #0xff
				; CHECK-NEXT: ret
				%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
				%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
				%vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gathers i16 elements from byte offsets: the i8 GEP with <vscale x 2 x i32> offsets is bitcast to a vector of
				; i16 pointers, and the loaded value is zero-extended to i64.
				define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: and z0.d, z0.d, #0xffff
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gathers i32 elements from byte offsets: the i8 GEP with <vscale x 2 x i32> offsets is bitcast to a vector of
				; i32 pointers, and the loaded value is zero-extended to i64.
				define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gathers i64 elements from byte offsets: the i8 GEP with <vscale x 2 x i32> offsets is bitcast to a vector of
				; i64 pointers; the loaded value is returned without extension.
				define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
				%vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
				ret <vscale x 2 x i64> %vals
				}

				; Gathers half elements from byte offsets: the i8 GEP with <vscale x 2 x i32> offsets is bitcast to a vector of
				; half pointers before the gather.
				define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
				%vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
				ret <vscale x 2 x half> %vals
				}

				; Gathers float elements from byte offsets: the i8 GEP with <vscale x 2 x i32> offsets is bitcast to a vector of
				; float pointers before the gather.
				define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
				%vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
				ret <vscale x 2 x float> %vals
				}

				; Gathers double elements from byte offsets: the i8 GEP with <vscale x 2 x i32> offsets is bitcast to a vector of
				; double pointers before the gather.
				define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
				%vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
				ret <vscale x 2 x double> %vals
				}

				; Gathers i8 elements via a GEP over i8 with <vscale x 2 x i32> offsets, then sign-extends the result to i64.
				define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i8:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
				; CHECK-NEXT: sxtb z0.d, p1/m, z0.d
				; CHECK-NEXT: ret
				%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
				%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
				%vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				; Gathers i16 elements from byte offsets: the i8 GEP with <vscale x 2 x i32> offsets is bitcast to a vector of
				; i16 pointers, and the loaded value is sign-extended to i64.
				define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: sxth z0.d, p1/m, z0.d
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				; Gathers i32 elements from byte offsets: the i8 GEP with <vscale x 2 x i32> offsets is bitcast to a vector of
				; i32 pointers, and the loaded value is sign-extended to i64.
				define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ptrue p1.d
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i32> %offsets
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
				; unscaled packed 32-bit offsets
				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

				; Gathers i8 elements via a GEP over i8 with <vscale x 4 x i32> offsets, then zero-extends the result to i32.
				define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4i8:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
				; CHECK-NEXT: and z0.s, z0.s, #0xff
				; CHECK-NEXT: ret
				%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
				%vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
				%vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.zext
				}

				; Gathers i16 elements from byte offsets: the i8 GEP with <vscale x 4 x i32> offsets is bitcast to a vector of
				; i16 pointers, and the loaded value is zero-extended to i32.
				define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: sunpklo z2.d, z0.s
				; CHECK-NEXT: sunpkhi z0.d, z0.s
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z2.d, z1.d, z2.d
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
				; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: and z0.s, z0.s, #0xffff
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
				%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
				%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
				%vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.zext
				}

				; Gathers i32 elements from byte offsets: the i8 GEP with <vscale x 4 x i32> offsets is bitcast to a vector of
				; i32 pointers; the loaded value is returned without extension.
				define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: sunpklo z2.d, z0.s
				; CHECK-NEXT: sunpkhi z0.d, z0.s
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z2.d, z1.d, z2.d
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
				; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
				%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
				%vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
				ret <vscale x 4 x i32> %vals
				}

				; Gathers half elements from byte offsets: the i8 GEP with <vscale x 4 x i32> offsets is bitcast to a vector of
				; half pointers before the gather.
				define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: sunpklo z2.d, z0.s
				; CHECK-NEXT: sunpkhi z0.d, z0.s
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z2.d, z1.d, z2.d
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
				; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
				%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
				%vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
				ret <vscale x 4 x half> %vals
				}

				; Gathers float elements from byte offsets: the i8 GEP with <vscale x 4 x i32> offsets is bitcast to a vector of
				; float pointers before the gather.
				define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: sunpklo z2.d, z0.s
				; CHECK-NEXT: sunpkhi z0.d, z0.s
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z2.d, z1.d, z2.d
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
				; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
				%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
				%vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
				ret <vscale x 4 x float> %vals
				}

				; Gathers i8 elements via a GEP over i8 with <vscale x 4 x i32> offsets, then sign-extends the result to i32.
				define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv4i8:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
				; CHECK-NEXT: ptrue p0.s
				; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
				; CHECK-NEXT: ret
				%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
				%vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
				%vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.sext
				}

				; Gathers i16 elements from byte offsets: the i8 GEP with <vscale x 4 x i32> offsets is bitcast to a vector of
				; i16 pointers, and the loaded value is sign-extended to i32.
				define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv4i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: sunpklo z2.d, z0.s
				; CHECK-NEXT: sunpkhi z0.d, z0.s
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z2.d, z1.d, z2.d
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
				; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
				; CHECK-NEXT: ptrue p0.s
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: sxth z0.s, p0/m, z0.s
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i32> %offsets
				%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
				%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
				%vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.sext
				}

				declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
				declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
				declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
				declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
				declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
				declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
				declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

				declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
				declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
				declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
				declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
				declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s

				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
				; scaled unpacked 32-bit offsets
				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

				; Gathers i16 elements via a GEP over i16 whose <vscale x 2 x i32> offsets are first zero-extended to i64,
				; then zero-extends the result to i64.
				define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
				sdesmalenUnsubmitted Done Reply Inline Actions Another thing I spotted, this should use `uxtw`, or use `sxtw` but then still keep the original zero-extend (`and`). So something in this patch is a bit too eager in removing the extend. sdesmalen: Another thing I spotted, this should use `uxtw`, or use `sxtw` but then still keep the original…
				kmclaughlinAuthorUnsubmitted Done Reply Inline Actions Good spot! I've removed the following lines from LowerMGATHER which was removing the extends: if (getGatherScatterIndexIsExtended(Index)) Index = Index.getOperand(0); I will move this to D92319, where it should be safe to remove the extend after refineIndexType has already folded it into the index type kmclaughlin: Good spot! I've removed the following lines from LowerMGATHER which was removing the extends…
				; CHECK-NEXT: and z0.d, z0.d, #0xffff
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gathers i32 elements via a GEP over i32 whose <vscale x 2 x i32> offsets are first zero-extended to i64,
				; then zero-extends the result to i64.
				define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gathers i64 elements via a GEP over i64 whose <vscale x 2 x i32> offsets are first zero-extended to i64;
				; the result is returned without extension.
				define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets.zext
				%vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
				ret <vscale x 2 x i64> %vals
				}

				; Gathers half elements via a GEP over half whose <vscale x 2 x i32> offsets are first zero-extended to i64.
				define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets.zext
				%vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
				ret <vscale x 2 x half> %vals
				}

				; Gathers float elements via a GEP over float whose <vscale x 2 x i32> offsets are first zero-extended to i64.
				define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets.zext
				%vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
				ret <vscale x 2 x float> %vals
				}

				; Gathers double elements through a GEP on double whose indices are the
				; 32-bit offsets zero-extended to i64. The CHECK lines expect an 'and' mask
				; followed by a single ld1d gather scaled by 8 (#3).
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets.zext
				%vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
				ret <vscale x 2 x double> %vals
				}

				; Gathers i16 elements (GEP on i16, zero-extended 32-bit offsets) and
				; sign-extends the result to i64. The CHECK lines expect an 'and' mask, a
				; scaled ld1h gather, then a predicated sxth under an all-true predicate.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: sxth z0.d, p0/m, z0.d
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets.zext
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				; Gathers i32 elements (GEP on i32, zero-extended 32-bit offsets) and
				; sign-extends the result to i64. The CHECK lines expect an 'and' mask, a
				; scaled ld1w gather, then a predicated sxtw under an all-true predicate.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets.zext
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
				; unscaled packed 32-bit offsets
				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

				; Packed (nxv4) i16 gather, zero-extended to i32, using a GEP on i16 with
				; zero-extended 32-bit offsets. The CHECK lines expect the offsets unpacked
				; (uunpklo/uunpkhi) and the predicate split (zip1/zip2 against pfalse) into
				; two scaled ld1h gathers, rejoined with uzp1 and masked to 16 bits.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 4 x i32> @masked_gather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: uunpklo z1.d, z0.s
				; CHECK-NEXT: uunpkhi z0.d, z0.s
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
				; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: and z0.s, z0.s, #0xffff
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
				%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
				%vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.zext
				}

				; Packed (nxv4) i32 gather using a GEP on i32 with zero-extended 32-bit
				; offsets. The CHECK lines expect the offsets unpacked (uunpklo/uunpkhi)
				; and the predicate split (zip1/zip2 against pfalse) into two scaled ld1w
				; gathers, rejoined with uzp1.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 4 x i32> @masked_gather_nxv4i32(i32* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: uunpklo z1.d, z0.s
				; CHECK-NEXT: uunpkhi z0.d, z0.s
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2]
				; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2]
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %offsets.zext
				%vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
				ret <vscale x 4 x i32> %vals
				}

				; Packed (nxv4) half gather using a GEP on half with zero-extended 32-bit
				; offsets. The CHECK lines expect the offsets unpacked (uunpklo/uunpkhi)
				; and the predicate split (zip1/zip2 against pfalse) into two scaled ld1h
				; gathers, rejoined with uzp1.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 4 x half> @masked_gather_nxv4f16(half* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: uunpklo z1.d, z0.s
				; CHECK-NEXT: uunpkhi z0.d, z0.s
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
				; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %offsets.zext
				%vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
				ret <vscale x 4 x half> %vals
				}

				; Packed (nxv4) float gather using a GEP on float with zero-extended 32-bit
				; offsets. The CHECK lines expect the offsets unpacked (uunpklo/uunpkhi)
				; and the predicate split (zip1/zip2 against pfalse) into two scaled ld1w
				; gathers, rejoined with uzp1.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 4 x float> @masked_gather_nxv4f32(float* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: uunpklo z1.d, z0.s
				; CHECK-NEXT: uunpkhi z0.d, z0.s
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2]
				; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2]
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %offsets.zext
				%vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
				ret <vscale x 4 x float> %vals
				}

				; Packed (nxv4) i16 gather, sign-extended to i32, using a GEP on i16 with
				; zero-extended 32-bit offsets. The CHECK lines expect the offsets unpacked
				; and the predicate split into two scaled ld1h gathers, rejoined with uzp1
				; and then sign-extended by a predicated sxth.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 4 x i32> @masked_sgather_nxv4i16(i16* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv4i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: uunpklo z1.d, z0.s
				; CHECK-NEXT: uunpkhi z0.d, z0.s
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
				; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
				; CHECK-NEXT: ptrue p0.s
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: sxth z0.s, p0/m, z0.s
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %offsets.zext
				%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
				%vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.sext
				}

				declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
				declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
				declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
				declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
				declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
				declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

				declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
				declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
				declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
				declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s

				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
				; unscaled unpacked 32-bit offsets
				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

				; Gathers i8 elements through a GEP on i8 (so the offsets are byte
				; offsets) with zero-extended 32-bit offsets, then zero-extends the result
				; to i64. The CHECK lines expect an 'and' mask on the offsets, one unscaled
				; ld1b gather, and an 'and' with 0xff for the result.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i8:
				; CHECK: // %bb.0:
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
				; CHECK-NEXT: and z0.d, z0.d, #0xff
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
				%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
				%vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Unscaled i16 gather: byte offsets (zero-extended from 32 bits) are applied
				; with a GEP on i8 and the resulting pointer vector is bitcast to i16
				; pointers; the loaded values are zero-extended to i64. The CHECK lines
				; expect the base splatted and added to the masked offsets, an ld1h gather
				; from a zero scalar base, and an 'and' with 0xffff for the result.
				define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: and z0.d, z0.d, #0xffff
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Unscaled i32 gather: byte offsets (zero-extended from 32 bits) are applied
				; with a GEP on i8 and the resulting pointer vector is bitcast to i32
				; pointers; the loaded values are zero-extended to i64. The CHECK lines
				; expect the base splatted and added to the masked offsets, an ld1w gather
				; from a zero scalar base, and an 'and' with 0xffffffff for the result.
				define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Unscaled i64 gather: byte offsets (zero-extended from 32 bits) are applied
				; with a GEP on i8 and the resulting pointer vector is bitcast to i64
				; pointers. The CHECK lines expect the base splatted and added to the
				; masked offsets, then an ld1d gather from a zero scalar base.
				define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
				%vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
				ret <vscale x 2 x i64> %vals
				}

				; Unscaled half gather: byte offsets (zero-extended from 32 bits) are
				; applied with a GEP on i8 and the resulting pointer vector is bitcast to
				; half pointers. The CHECK lines expect the base splatted and added to the
				; masked offsets, then an ld1h gather from a zero scalar base.
				define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
				%vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
				ret <vscale x 2 x half> %vals
				}

				; Unscaled float gather: byte offsets (zero-extended from 32 bits) are
				; applied with a GEP on i8 and the resulting pointer vector is bitcast to
				; float pointers. The CHECK lines expect the base splatted and added to the
				; masked offsets, then an ld1w gather from a zero scalar base.
				define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
				%vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
				ret <vscale x 2 x float> %vals
				}

				; Unscaled double gather: byte offsets (zero-extended from 32 bits) are
				; applied with a GEP on i8 and the resulting pointer vector is bitcast to
				; double pointers. The CHECK lines expect the base splatted and added to
				; the masked offsets, then an ld1d gather from a zero scalar base.
				define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
				%vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
				ret <vscale x 2 x double> %vals
				}

				; Gathers i8 elements through a GEP on i8 with zero-extended 32-bit
				; offsets and sign-extends the result to i64. The CHECK lines expect an
				; 'and' mask on the offsets, one unscaled ld1b gather, then a predicated
				; sxtb under an all-true predicate.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i8:
				; CHECK: // %bb.0:
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
				%vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
				%vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				; Unscaled i16 gather, sign-extended to i64: byte offsets (zero-extended
				; from 32 bits) are applied with a GEP on i8 and the resulting pointer
				; vector is bitcast to i16 pointers. The CHECK lines expect the base
				; splatted and added to the masked offsets, an ld1h gather from a zero
				; scalar base, then a predicated sxth under an all-true predicate.
				define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: sxth z0.d, p0/m, z0.d
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				; Unscaled i32 gather, sign-extended to i64: byte offsets (zero-extended
				; from 32 bits) are applied with a GEP on i8 and the resulting pointer
				; vector is bitcast to i32 pointers. The CHECK lines expect the base
				; splatted and added to the masked offsets, an ld1w gather from a zero
				; scalar base, then a predicated sxtw under an all-true predicate.
				define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 2 x i32> %offsets to <vscale x 2 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
				; unscaled packed 32-bit offsets
				;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

				; Packed (nxv4) i8 gather, zero-extended to i32, using a GEP on i8 with
				; zero-extended 32-bit offsets. The CHECK lines expect the offsets unpacked
				; (uunpklo/uunpkhi) and the predicate split (zip1/zip2 against pfalse) into
				; two unscaled ld1b gathers, rejoined with uzp1 and masked to 8 bits.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 4 x i32> @masked_gather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4i8:
				; CHECK: // %bb.0:
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: uunpklo z1.d, z0.s
				; CHECK-NEXT: uunpkhi z0.d, z0.s
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw]
				; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw]
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: and z0.s, z0.s, #0xff
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
				%vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
				%vals.zext = zext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.zext
				}

				; Packed (nxv4) unscaled i16 gather, zero-extended to i32: byte offsets
				; (zero-extended from 32 bits) are applied with a GEP on i8 and the
				; resulting pointer vector is bitcast to i16 pointers. The CHECK lines
				; expect the offsets unpacked and added to the splatted base, the
				; predicate split with zip1/zip2 against pfalse, two ld1h gathers from a
				; zero scalar base, and the halves rejoined with uzp1 and masked to 16 bits.
				define <vscale x 4 x i32> @masked_gather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: uunpkhi z2.d, z0.s
				; CHECK-NEXT: uunpklo z0.d, z0.s
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: add z1.d, z1.d, z2.d
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
				; CHECK-NEXT: and z0.s, z0.s, #0xffff
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
				%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
				%vals.zext = zext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.zext
				}

				; Packed (nxv4) unscaled i32 gather: byte offsets (zero-extended from 32
				; bits) are applied with a GEP on i8 and the resulting pointer vector is
				; bitcast to i32 pointers. The CHECK lines expect the offsets unpacked and
				; added to the splatted base, the predicate split with zip1/zip2 against
				; pfalse, two ld1w gathers from a zero scalar base, and a uzp1 rejoin.
				define <vscale x 4 x i32> @masked_gather_nxv4i32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: uunpkhi z2.d, z0.s
				; CHECK-NEXT: uunpklo z0.d, z0.s
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: add z1.d, z1.d, z2.d
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d]
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i32*>
				%vals = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> undef)
				ret <vscale x 4 x i32> %vals
				}

				; Packed (nxv4) unscaled half gather: byte offsets (zero-extended from 32
				; bits) are applied with a GEP on i8 and the resulting pointer vector is
				; bitcast to half pointers. The CHECK lines expect the offsets unpacked and
				; added to the splatted base, the predicate split with zip1/zip2 against
				; pfalse, two ld1h gathers from a zero scalar base, and a uzp1 rejoin.
				define <vscale x 4 x half> @masked_gather_nxv4f16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: uunpkhi z2.d, z0.s
				; CHECK-NEXT: uunpklo z0.d, z0.s
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: add z1.d, z1.d, z2.d
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x half*>
				%vals = call <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x half> undef)
				ret <vscale x 4 x half> %vals
				}

				; Packed (nxv4) unscaled float gather: byte offsets (zero-extended from 32
				; bits) are applied with a GEP on i8 and the resulting pointer vector is
				; bitcast to float pointers. The CHECK lines expect the offsets unpacked
				; and added to the splatted base, the predicate split with zip1/zip2
				; against pfalse, two ld1w gathers from a zero scalar base, and a uzp1
				; rejoin.
				define <vscale x 4 x float> @masked_gather_nxv4f32(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv4f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: uunpkhi z2.d, z0.s
				; CHECK-NEXT: uunpklo z0.d, z0.s
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: add z1.d, z1.d, z2.d
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d]
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x float*>
				%vals = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*> %ptrs, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x float> undef)
				ret <vscale x 4 x float> %vals
				}

				; Packed (nxv4) i8 gather, sign-extended to i32, using a GEP on i8 with
				; zero-extended 32-bit offsets. The CHECK lines expect the offsets unpacked
				; and the predicate split into two unscaled ld1b gathers, rejoined with
				; uzp1 and then sign-extended by a predicated sxtb.
				; NOTE(review): the offsets are zext'd but the expected addressing mode is
				; sxtw -- confirm that uxtw is not what should be selected here.
				define <vscale x 4 x i32> @masked_sgather_nxv4i8(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv4i8:
				; CHECK: // %bb.0:
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: uunpklo z1.d, z0.s
				; CHECK-NEXT: uunpkhi z0.d, z0.s
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw]
				; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw]
				; CHECK-NEXT: ptrue p0.s
				; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
				; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
				%vals = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
				%vals.sext = sext <vscale x 4 x i8> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.sext
				}

				; Packed (nxv4) unscaled i16 gather, sign-extended to i32: byte offsets
				; (zero-extended from 32 bits) are applied with a GEP on i8 and the
				; resulting pointer vector is bitcast to i16 pointers. The CHECK lines
				; expect the offsets unpacked and added to the splatted base, the
				; predicate split with zip1/zip2 against pfalse, two ld1h gathers from a
				; zero scalar base, a uzp1 rejoin, and a predicated sxth.
				define <vscale x 4 x i32> @masked_sgather_nxv4i16(i8* %base, <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv4i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: uunpkhi z2.d, z0.s
				; CHECK-NEXT: uunpklo z0.d, z0.s
				; CHECK-NEXT: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: add z1.d, z1.d, z2.d
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ptrue p0.s
				; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
				; CHECK-NEXT: sxth z0.s, p0/m, z0.s
				; CHECK-NEXT: ret
				%offsets.zext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets.zext
				%ptrs = bitcast <vscale x 4 x i8*> %byte_ptrs to <vscale x 4 x i16*>
				%vals = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask, <vscale x 4 x i16> undef)
				%vals.sext = sext <vscale x 4 x i16> %vals to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %vals.sext
				}

				declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
				declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
				declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
				declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
				declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
				declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
				declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

				declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
				declare <vscale x 4 x i16> @llvm.masked.gather.nxv4i16(<vscale x 4 x i16*>, i32, <vscale x 4 x i1>, <vscale x 4 x i16>)
				declare <vscale x 4 x i32> @llvm.masked.gather.nxv4i32(<vscale x 4 x i32*>, i32, <vscale x 4 x i1>, <vscale x 4 x i32>)
				declare <vscale x 4 x half> @llvm.masked.gather.nxv4f16(<vscale x 4 x half*>, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
				declare <vscale x 4 x float> @llvm.masked.gather.nxv4f32(<vscale x 4 x float*>, i32, <vscale x 4 x i1>, <vscale x 4 x float>)

llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s

				; Gathers i16 elements through a GEP on i16 indexed directly by 64-bit
				; offsets, then zero-extends the result to i64. The CHECK lines expect one
				; ld1h gather with the offsets shifted left by 1, and an 'and' with 0xffff.
				define <vscale x 2 x i64> @masked_gather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
				; CHECK-NEXT: and z0.d, z0.d, #0xffff
				; CHECK-NEXT: ret
				%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gathers i32 elements through a GEP on i32 indexed directly by 64-bit
				; offsets, then zero-extends the result to i64. The CHECK lines expect one
				; ld1w gather with the offsets shifted left by 2, and an 'and' with
				; 0xffffffff.
				define <vscale x 2 x i64> @masked_gather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ret
				%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gathers i64 elements through a GEP on i64 indexed directly by 64-bit
				; offsets. The CHECK lines expect a single ld1d gather with the offsets
				; shifted left by 3.
				define <vscale x 2 x i64> @masked_gather_nxv2i64(i64* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
				; CHECK-NEXT: ret
				%ptrs = getelementptr i64, i64* %base, <vscale x 2 x i64> %offsets
				%vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
				ret <vscale x 2 x i64> %vals
				}

				; Gathers half elements through a GEP on half indexed directly by 64-bit
				; offsets. The CHECK lines expect a single ld1h gather with the offsets
				; shifted left by 1.
				define <vscale x 2 x half> @masked_gather_nxv2f16(half* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
				; CHECK-NEXT: ret
				%ptrs = getelementptr half, half* %base, <vscale x 2 x i64> %offsets
				%vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
				ret <vscale x 2 x half> %vals
				}

				; Gathers float elements through a GEP on float indexed directly by 64-bit
				; offsets. The CHECK lines expect a single ld1w gather with the offsets
				; shifted left by 2.
				define <vscale x 2 x float> @masked_gather_nxv2f32(float* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
				; CHECK-NEXT: ret
				%ptrs = getelementptr float, float* %base, <vscale x 2 x i64> %offsets
				%vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
				ret <vscale x 2 x float> %vals
				}

				; Gathers double elements through a GEP on double indexed directly by
				; 64-bit offsets. The CHECK lines expect a single ld1d gather with the
				; offsets shifted left by 3. No extension is applied to the result.
				define <vscale x 2 x double> @masked_gather_nxv2f64(double* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, lsl #3]
				; CHECK-NEXT: ret
				%ptrs = getelementptr double, double* %base, <vscale x 2 x i64> %offsets
				%vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
				ret <vscale x 2 x double> %vals
				}

				; Gathers i16 elements through a GEP on i16 indexed directly by 64-bit
				; offsets and sign-extends the result to i64. The CHECK lines expect one
				; ld1h gather with the offsets shifted left by 1, then a predicated sxth
				; under an all-true predicate.
				define <vscale x 2 x i64> @masked_sgather_nxv2i16(i16* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: sxth z0.d, p0/m, z0.d
				; CHECK-NEXT: ret
				%ptrs = getelementptr i16, i16* %base, <vscale x 2 x i64> %offsets
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				; Gathers i32 elements through a GEP on i32 indexed directly by 64-bit
				; offsets and sign-extends the result to i64. The CHECK lines expect one
				; ld1w gather with the offsets shifted left by 2, then a predicated sxtw
				; under an all-true predicate.
				define <vscale x 2 x i64> @masked_sgather_nxv2i32(i32* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
				; CHECK-NEXT: ret
				%ptrs = getelementptr i32, i32* %base, <vscale x 2 x i64> %offsets
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				; Masked gather intrinsic declarations, one per <vscale x 2 x T> element type.
				; Operands: vector of pointers, i32 alignment, i1 mask vector, pass-through vector.
				declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
				declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
				declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
				declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
				declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
				declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s

				; Gather of i8 elements addressed by byte offsets from %base, followed by a
				; zero extension of each element to i64.
				define <vscale x 2 x i64> @masked_gather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i8:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d]
				; CHECK-NEXT: and z0.d, z0.d, #0xff
				; CHECK-NEXT: ret
				%addrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
				%narrow = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %addrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
				%wide = zext <vscale x 2 x i8> %narrow to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %wide
				}

				; Gather of i16 elements addressed by byte offsets from %base: the i8* vector
				; produced by the GEP is cast to a vector of i16* before the gather, and the
				; loaded elements are zero-extended to i64.
				define <vscale x 2 x i64> @masked_gather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: and z0.d, z0.d, #0xffff
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
				; Both sides of the cast are pointer vectors; the source type must match %byte_ptrs.
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gather of i32 elements addressed by byte offsets from %base: the i8* vector
				; produced by the GEP is cast to a vector of i32* before the gather, and the
				; loaded elements are zero-extended to i64.
				define <vscale x 2 x i64> @masked_gather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
				; Both sides of the cast are pointer vectors; the source type must match %byte_ptrs.
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.zext
				}

				; Gather of i64 elements addressed by byte offsets from %base: the i8* vector
				; produced by the GEP is cast to a vector of i64* before the gather.
				define <vscale x 2 x i64> @masked_gather_nxv2i64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
				; Both sides of the cast are pointer vectors; the source type must match %byte_ptrs.
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
				%vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
				ret <vscale x 2 x i64> %vals
				}

				; Gather of half elements addressed by byte offsets from %base: the i8* vector
				; produced by the GEP is cast to a vector of half* before the gather.
				define <vscale x 2 x half> @masked_gather_nxv2f16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
				; Both sides of the cast are pointer vectors; the source type must match %byte_ptrs.
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
				%vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
				ret <vscale x 2 x half> %vals
				}

				; Gather of float elements addressed by byte offsets from %base: the i8* vector
				; produced by the GEP is cast to a vector of float* before the gather.
				define <vscale x 2 x float> @masked_gather_nxv2f32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
				; Both sides of the cast are pointer vectors; the source type must match %byte_ptrs.
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
				%vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
				ret <vscale x 2 x float> %vals
				}

				; Gather of double elements addressed by byte offsets from %base: the i8* vector
				; produced by the GEP is cast to a vector of double* before the gather.
				define <vscale x 2 x double> @masked_gather_nxv2f64(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2f64:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
				; Both sides of the cast are pointer vectors; the source type must match %byte_ptrs.
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
				%vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
				ret <vscale x 2 x double> %vals
				}

				; Gather of i8 elements addressed by byte offsets from %base, followed by a
				; sign extension of each element to i64.
				define <vscale x 2 x i64> @masked_sgather_nxv2i8(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i8:
				; CHECK: // %bb.0:
				; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d]
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
				; CHECK-NEXT: ret
				%addrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
				%narrow = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %addrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
				%wide = sext <vscale x 2 x i8> %narrow to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %wide
				}

				; Gather of i16 elements addressed by byte offsets from %base: the i8* vector
				; produced by the GEP is cast to a vector of i16* before the gather, and the
				; loaded elements are sign-extended to i64.
				define <vscale x 2 x i64> @masked_sgather_nxv2i16(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i16:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: sxth z0.d, p0/m, z0.d
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
				; Both sides of the cast are pointer vectors; the source type must match %byte_ptrs.
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
				%vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
				%vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				; Gather of i32 elements addressed by byte offsets from %base: the i8* vector
				; produced by the GEP is cast to a vector of i32* before the gather, and the
				; loaded elements are sign-extended to i64.
				define <vscale x 2 x i64> @masked_sgather_nxv2i32(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv2i32:
				; CHECK: // %bb.0:
				; CHECK-NEXT: mov z1.d, x0
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: add z0.d, z1.d, z0.d
				; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ptrue p0.d
				; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
				; CHECK-NEXT: ret
				%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
				; Both sides of the cast are pointer vectors; the source type must match %byte_ptrs.
				%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
				%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				%vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
				ret <vscale x 2 x i64> %vals.sext
				}

				; Masked gather intrinsic declarations, one per <vscale x 2 x T> element type
				; exercised by the tests above.
				; Operands: vector of pointers, i32 alignment, i1 mask vector, pass-through vector.
				declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
				declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
				declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
				declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
				declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
				declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
				declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)

llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s

				; Tests that exercise various type legalisation scenarios for ISD::MGATHER.

				; Code generate a load of an illegal data type via promotion.
				; The gather takes a vector of pointers directly (no base + index GEP) and
				; returns <vscale x 2 x i32>.
				define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv2i32:
				; CHECK-DAG: mov x8, xzr
				; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d]
				; CHECK: ret
				; NOTE(review): the next line is an inline Phabricator review remark captured
				; together with this diff; it is not LLVM IR and is not part of the test input.
				sdesmalenUnsubmitted Done Reply Inline Actions Sorry, just spotting this now while reviewing D92319. This instruction should not use the scaled addressing mode. That seems to be because `SelectionDAGBuilder::visitMaskedGather` defaults to `scaled`. In D90941 you've changed it to `unscaled` for MSCATTER. You'll need to do the same for MGATHER, because this leads to wrong code being generated.. sdesmalen: Sorry, just spotting this now while reviewing D92319. This instruction should not use the…
				%data = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
				ret <vscale x 2 x i32> %data
				}

				; Worst-case legalisation scenario: every vector type involved is illegal.
				; The checks expect eight ld1w gathers (indices z0-z7) and no unpkhi.
				define <vscale x 32 x i32> @masked_gather_nxv32i32(i32* %base, <vscale x 32 x i32> %indices, <vscale x 32 x i1> %mask) {
				; CHECK-LABEL: masked_gather_nxv32i32:
				; CHECK-NOT: unpkhi
				; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z0.s, sxtw #2]
				; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z1.s, sxtw #2]
				; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z2.s, sxtw #2]
				; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z3.s, sxtw #2]
				; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z4.s, sxtw #2]
				; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z5.s, sxtw #2]
				; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z6.s, sxtw #2]
				; CHECK-DAG: ld1w { {{z[0-9]+}}.s }, {{p[0-9]+}}/z, [x0, z7.s, sxtw #2]
				; CHECK: ret
				%addrs = getelementptr i32, i32* %base, <vscale x 32 x i32> %indices
				%gathered = call <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*> %addrs, i32 4, <vscale x 32 x i1> %mask, <vscale x 32 x i32> undef)
				ret <vscale x 32 x i32> %gathered
				}

				; TODO: The sign extend is currently applied to the values after a 'uzp1' of
				; two registers, so it is not folded away. The same holds for any other
				; vector-of-pointers style gather that does not fit in a single
				; <vscale x 2 x type*> register. Better folding is required before those
				; cases can be checked off.
				define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vscale x 4 x i1> %mask) {
				; CHECK-LABEL: masked_sgather_nxv4i8:
				; CHECK: pfalse p1.b
				; CHECK-NEXT: mov x8, xzr
				; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
				; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
				; CHECK-NEXT: ld1b { z1.d }, p2/z, [x8, z1.d]
				; CHECK-NEXT: ld1b { z0.d }, p0/z, [x8, z0.d]
				; CHECK-NEXT: ptrue p0.s
				; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
				; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
				; CHECK-NEXT: ret
				%narrow = call <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*> %ptrs, i32 1, <vscale x 4 x i1> %mask, <vscale x 4 x i8> undef)
				%wide = sext <vscale x 4 x i8> %narrow to <vscale x 4 x i32>
				ret <vscale x 4 x i32> %wide
				}

				; Masked gather intrinsic declarations.
				; NOTE(review): only the nxv2i32, nxv4i8 and nxv32i32 overloads are called by
				; the tests visible in this file; nxv2i8, nxv2i16 and nxv16i8 appear unused.
				declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
				declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
				declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)

				declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)

				declare <vscale x 16 x i8> @llvm.masked.gather.nxv16i8(<vscale x 16 x i8*>, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
				declare <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*>, i32, <vscale x 32 x i1>, <vscale x 32 x i32>)

This is an archive of the discontinued LLVM Phabricator instance.

[SVE][CodeGen] Lower scalable masked gathers
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 309872

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll

This is an archive of the discontinued LLVM Phabricator instance.

[SVE][CodeGen] Lower scalable masked gathers
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 309872

llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

llvm/lib/Target/AArch64/AArch64ISelLowering.h

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll

llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll

[SVE][CodeGen] Lower scalable masked gathers
ClosedPublic