
[RISCV] Expand unaligned fixed-length vector memory accesses
ClosedPublic

Authored by frasercrmck on May 14 2021, 6:27 AM.

Details

Summary

RVV vectors must be aligned to their element types, so anything less is
unaligned.

For regular loads and stores, our custom lowering of fixed-length
vectors meant that we opted out of LegalizeDAG's built-in unaligned
expansion. This patch adds that logic into our custom lowering function.
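For readers unfamiliar with LegalizeDAG's unaligned expansion: conceptually, it replaces the single wide access with a sequence of narrower accesses that only require the smaller natural alignment, then reassembles the value. A rough Python model of that idea for a load (illustrative only, not LLVM code; the helper name is made up):

```python
def expand_unaligned_load(mem: bytes, addr: int, num_elts: int, elt_size: int) -> bytes:
    """Model of expanding an unaligned vector load into element-wise loads.

    Instead of one wide access at `addr` (which may not be vector-aligned),
    emit num_elts loads of elt_size bytes each; each piece only requires
    element alignment, mirroring RVV's element-alignment requirement.
    """
    pieces = []
    for i in range(num_elts):
        off = addr + i * elt_size
        pieces.append(mem[off:off + elt_size])  # one element-sized load
    return b"".join(pieces)
```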

For masked intrinsics, we declare that anything unaligned is not legal,
leaving the ScalarizeMaskedMemIntrin pass to do the expansion for us.
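ScalarizeMaskedMemIntrin's expansion is, conceptually, a per-lane branch: each set mask bit performs a scalar element access, and cleared lanes keep the passthru value. A hypothetical Python model of a scalarized masked load (not the pass's actual implementation):

```python
def scalarize_masked_load(mem: bytes, addr: int, mask, passthru, elt_size: int):
    """Model of ScalarizeMaskedMemIntrin: a masked vector load becomes a
    per-lane sequence of branches, each doing a scalar load when the lane's
    mask bit is set, and keeping the passthru element otherwise."""
    result = []
    for i, bit in enumerate(mask):
        if bit:
            off = addr + i * elt_size
            result.append(mem[off:off + elt_size])  # scalar load for this lane
        else:
            result.append(passthru[i])  # lane disabled: keep passthru
    return result
```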

Note that neither of these methods can handle the expansion of
scalable-vector memory ops, so those cases are left alone by this patch.
Scalable loads and stores already go through expansion by default but
hit an assertion, and scalable masked intrinsics will silently generate
incorrect code. It may be prudent to return an error in both of these
cases.

Diff Detail

Event Timeline

frasercrmck created this revision. May 14 2021, 6:27 AM
frasercrmck requested review of this revision. May 14 2021, 6:27 AM
Herald added a project: Restricted Project. May 14 2021, 6:27 AM
frasercrmck added inline comments. May 14 2021, 6:29 AM
llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll
8–9

I just updated these tests naively, @craig.topper, but since I suspect the intention was only to test well-aligned values, perhaps we should add additional cases?

craig.topper added inline comments. May 14 2021, 10:05 AM
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
107

Does the project intend to eventually remove the implicit cast to unsigned from TypeSize? If so, should this call getFixedSize() before doing the compare?

llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll
8–9

Yeah I think we should have additional tests. I'm not sure why I got the alignment right on some tests but not others.

  • rebase
  • avoid implicit TypeSize -> unsigned cast
  • add explicit costmodel tests for unaligned
frasercrmck marked 2 inline comments as done. May 17 2021, 7:36 AM
frasercrmck added inline comments.
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
107

Good spot! My understanding is that the implicit cast is not long for this world. I've added the call to getFixedSize().

llvm/test/Analysis/CostModel/RISCV/fixed-vector-gather.ll
8–9

I've added those now. I pre-committed the updated alignment for these existing cases.

This revision is now accepted and ready to land. May 18 2021, 10:11 AM

Maybe I'm missing something, but <16 x i8> is always aligned, right? Can you convert a <4 x i32> load to a <16 x i8> load, and bitcast the result?
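The suggestion can be modelled in Python with struct (in codegen this would be a <16 x i8> load followed by a bitcast; little-endian layout assumed, and the function name is made up for illustration):

```python
import struct

def load_v4i32_via_bytes(mem: bytes, addr: int):
    """Model of the suggested transform: instead of an unaligned <4 x i32>
    load, do a <16 x i8> load (bytes only need align 1) and bitcast the
    result back to <4 x i32> (little-endian reinterpretation)."""
    raw = mem[addr:addr + 16]               # the <16 x i8> load, align 1
    return list(struct.unpack("<4I", raw))  # the bitcast to <4 x i32>
```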

> Maybe I'm missing something, but <16 x i8> is always aligned, right? Can you convert a <4 x i32> load to a <16 x i8> load, and bitcast the result?

I hadn't actually considered that. It should work in theory, but we currently have a cap on the size of the legal vector types, so this wouldn't currently work with something like <128 x i32>. We could maybe do it pre-legalization, but I don't know if it'd get legalized right back to the unaligned version. The good news is it'd presumably work for scalable vectors too. Do you know of other targets doing this?

Getting masked intrinsics to work via bitcasts may take a bit of doing though. I suppose you'd need to shuffle the indices and add on extra byte offsets like <0,1,2,3,0,1,...>. I don't know if that's safe to do since you may risk overflow.

@craig.topper can you see anything I've missed here?

craig.topper added a comment. Edited May 19 2021, 10:52 AM

>> Maybe I'm missing something, but <16 x i8> is always aligned, right? Can you convert a <4 x i32> load to a <16 x i8> load, and bitcast the result?

> I hadn't actually considered that. It should work in theory, but we currently have a cap on the size of the legal vector types, so this wouldn't currently work with something like <128 x i32>. We could maybe do it pre-legalization, but I don't know if it'd get legalized right back to the unaligned version. The good news is it'd presumably work for scalable vectors too. Do you know of other targets doing this?

Can we add the missing MVT types, or cap the vXi32/i64 vectors we support to the same total width as the longest vXi8 type?

> Getting masked intrinsics to work via bitcasts may take a bit of doing though. I suppose you'd need to shuffle the indices and add on extra byte offsets like <0,1,2,3,0,1,...>. I don't know if that's safe to do since you may risk overflow.

You would also need to duplicate bits in the mask, which isn't straightforward.

> @craig.topper can you see anything I've missed here?

The regular load/store case is the most interesting because InstCombine merges bitcasts into loads/stores if I recall correctly. So it might create a bad case without realizing it.
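The index and mask rewrites discussed in this exchange can be modelled as follows (illustrative Python, not LLVM code; the index-overflow risk raised above is deliberately ignored here):

```python
def bytewise_gather_operands(indices, mask, elt_size: int):
    """Rewrite element-granularity gather operands into byte-element form:
    each element index becomes elt_size byte indices (scaled base plus
    offsets 0..elt_size-1), and each mask bit is duplicated elt_size times.
    In real codegen, idx * elt_size + off may overflow the index type,
    which is why the review flags this as potentially unsafe."""
    byte_indices, byte_mask = [], []
    for idx, bit in zip(indices, mask):
        for off in range(elt_size):
            byte_indices.append(idx * elt_size + off)
            byte_mask.append(bit)
    return byte_indices, byte_mask
```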

> Do you know of other targets doing this?

We end up doing this sort of thing on ARM in some cases. For example https://reviews.llvm.org/D100527 , https://reviews.llvm.org/D70176 .

>> I hadn't actually considered that. It should work in theory, but we currently have a cap on the size of the legal vector types, so this wouldn't currently work with something like <128 x i32>. We could maybe do it pre-legalization, but I don't know if it'd get legalized right back to the unaligned version. The good news is it'd presumably work for scalable vectors too. Do you know of other targets doing this?

> Can we add the missing MVT types, or cap the vXi32/i64 vectors we support to the same total width as the longest vXi8 type?

I have a task to look into this which I may bring forward. I think this is what we want to see; I just hope we don't have too many legalization issues at the type "limits", like that shuffle I initially tried to fix by legalizing any_extend_vector_inreg.

>> Getting masked intrinsics to work via bitcasts may take a bit of doing though. I suppose you'd need to shuffle the indices and add on extra byte offsets like <0,1,2,3,0,1,...>. I don't know if that's safe to do since you may risk overflow.

> You would also need to duplicate bits in the mask, which isn't straightforward.

Indeed.

>> @craig.topper can you see anything I've missed here?

> The regular load/store case is the most interesting because InstCombine merges bitcasts into loads/stores if I recall correctly. So it might create a bad case without realizing it.

Yeah. The test case that uncovered this bug for us was a masked scatter, though, and that was the vectorizer's choice, which is slightly different.

> We end up doing this sort of thing on ARM in some cases. For example https://reviews.llvm.org/D100527 , https://reviews.llvm.org/D70176 .

Thanks for the pointers! Pun -- sadly -- intended.

I think if it's okay by you, @craig.topper, I'll merge this as-is (since we need the bug fix) and start working on the improvement patch. It seems like we may need to get some other changes in before the bitcasting will work across the board.

  • rebase for test changes
Matt added a subscriber: Matt. Thu, May 27, 9:46 AM

Ping @craig.topper, just making sure you're okay with this patch as-is before diving into the next stage.

I'm going to start looking into a total-size cap, but I think we'll need extra MVTs: with the max supported MVT size of v128i8 we'd be reducing our legal i64 vectors from v256i64 to v32i64, which feels like a sharp drop.

Perhaps this is something to discuss more broadly with the RISC-V group. What's a sensible max fixed-length size for RVV? We could go on adding new MVTs for a long time until everyone gets fed up with us hogging the enum values. 512-bit vectors? 1Kb? 4Kb? Though I've just checked, and an unspecified out-of-tree backend was able to add v128i32 up to v2048i32, so maybe there's more wiggle room than I thought.

> Ping @craig.topper, just making sure you're okay with this patch as-is before diving into the next stage.

> I'm going to start looking into a total-size cap, but I think we'll need extra MVTs: with the max supported MVT size of v128i8 we'd be reducing our legal i64 vectors from v256i64 to v32i64, which feels like a sharp drop.

> Perhaps this is something to discuss more broadly with the RISC-V group. What's a sensible max fixed-length size for RVV? We could go on adding new MVTs for a long time until everyone gets fed up with us hogging the enum values. 512-bit vectors? 1Kb? 4Kb? Though I've just checked, and an unspecified out-of-tree backend was able to add v128i32 up to v2048i32, so maybe there's more wiggle room than I thought.

I'm ok merging this as-is.

  • rebase
  • update test checks