This is an archive of the discontinued LLVM Phabricator instance.

Show First 20 Lines • Show All 131 Lines • ▼ Show 20 Lines	bool mayIgnoreSignedZero(SDValue Op) const {

const auto Flags = Op.getNode()->getFlags();		const auto Flags = Op.getNode()->getFlags();
if (Flags.isDefined())		if (Flags.isDefined())
return Flags.hasNoSignedZeros();		return Flags.hasNoSignedZeros();

return false;		return false;
}		}

		/// Look through a single ISD::BITCAST node.
		///
		/// \returns operand 0 of \p Val when \p Val is an ISD::BITCAST, otherwise
		/// \p Val itself.
		static inline SDValue stripBitcast(SDValue Val) {
		  if (Val.getOpcode() != ISD::BITCAST)
		    return Val;
		  return Val.getOperand(0);
		}

static bool allUsesHaveSourceMods(const SDNode *N,		static bool allUsesHaveSourceMods(const SDNode *N,
unsigned CostThreshold = 4);		unsigned CostThreshold = 4);
bool isFAbsFree(EVT VT) const override;		bool isFAbsFree(EVT VT) const override;
bool isFNegFree(EVT VT) const override;		bool isFNegFree(EVT VT) const override;
bool isTruncateFree(EVT Src, EVT Dest) const override;		bool isTruncateFree(EVT Src, EVT Dest) const override;
bool isTruncateFree(Type Src, Type Dest) const override;		bool isTruncateFree(Type Src, Type Dest) const override;

bool isZExtFree(Type Src, Type Dest) const override;		bool isZExtFree(Type Src, Type Dest) const override;
▲ Show 20 Lines • Show All 434 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 3,138 Lines • ▼ Show 20 Lines	if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
EltVT.changeTypeToInteger(), Elt0);		EltVT.changeTypeToInteger(), Elt0);
}		}

return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);		return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
}		}
}		}
}		}

		// Equivalent of above for accessing the high element of a vector as an
		// integer operation.
		// trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
		if (Src.getOpcode() == ISD::SRL) {
		if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
		if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
		SDValue BV = stripBitcast(Src.getOperand(0));
		if (BV.getOpcode() == ISD::BUILD_VECTOR &&
		BV.getValueType().getVectorNumElements() == 2) {
		SDValue SrcElt = BV.getOperand(1);
		EVT SrcEltVT = SrcElt.getValueType();
		if (SrcEltVT.isFloatingPoint()) {
		SrcElt = DAG.getNode(ISD::BITCAST, SL,
		SrcEltVT.changeTypeToInteger(), SrcElt);
		}

		return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
		}
		}
		}
		}

// Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.		// Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
//		//
// i16 (trunc (srl i64:x, K)), K <= 16 ->		// i16 (trunc (srl i64:x, K)), K <= 16 ->
// i16 (trunc (srl (i32 (trunc x), K)))		// i16 (trunc (srl (i32 (trunc x), K)))
if (VT.getScalarSizeInBits() < 32) {		if (VT.getScalarSizeInBits() < 32) {
EVT SrcVT = Src.getValueType();		EVT SrcVT = Src.getValueType();
if (SrcVT.getScalarSizeInBits() > 32 &&		if (SrcVT.getScalarSizeInBits() > 32 &&
(Src.getOpcode() == ISD::SRL \|\|		(Src.getOpcode() == ISD::SRL \|\|
▲ Show 20 Lines • Show All 1,274 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.h

Show First 20 Lines • Show All 78 Lines • ▼ Show 20 Lines	class SITargetLowering final : public AMDGPUTargetLowering {
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;

SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,		SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const;		SelectionDAG &DAG) const;

SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
		SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;		SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;

SDNode adjustWritemask(MachineSDNode &N, SelectionDAG &DAG) const;		SDNode adjustWritemask(MachineSDNode &N, SelectionDAG &DAG) const;

SDValue performUCharToFloatCombine(SDNode *N,		SDValue performUCharToFloatCombine(SDNode *N,
DAGCombinerInfo &DCI) const;		DAGCombinerInfo &DCI) const;
SDValue performSHLPtrCombine(SDNode *N,		SDValue performSHLPtrCombine(SDNode *N,
unsigned AS,		unsigned AS,
▲ Show 20 Lines • Show All 201 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 282 Lines • ▼ Show 20 Lines	for (MVT Vec64 : { MVT::v2i64, MVT::v2f64 }) {
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);		AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}		}

setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);		setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);

		setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
		setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);

// Avoid stack access for these.		// Avoid stack access for these.
// TODO: Generalize to more vector types.		// TODO: Generalize to more vector types.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);		setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);		setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
		setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
		setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);		setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);		setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);

		setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
		setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
		setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
		setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);

// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,		// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
// and output demarshalling		// and output demarshalling
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);		setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);		setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);

// We can't return success/failure, only the old value,		// We can't return success/failure, only the old value,
// let LLVM add the comparison		// let LLVM add the comparison
setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);		setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand);
▲ Show 20 Lines • Show All 3,022 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);		case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);		case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);		case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);		case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
case ISD::INSERT_VECTOR_ELT:		case ISD::INSERT_VECTOR_ELT:
return lowerINSERT_VECTOR_ELT(Op, DAG);		return lowerINSERT_VECTOR_ELT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT:		case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);		return lowerEXTRACT_VECTOR_ELT(Op, DAG);
		case ISD::BUILD_VECTOR:
		return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:		case ISD::FP_ROUND:
return lowerFP_ROUND(Op, DAG);		return lowerFP_ROUND(Op, DAG);
case ISD::TRAP:		case ISD::TRAP:
case ISD::DEBUGTRAP:		case ISD::DEBUGTRAP:
return lowerTRAP(Op, DAG);		return lowerTRAP(Op, DAG);
}		}
return SDValue();		return SDValue();
}		}
▲ Show 20 Lines • Show All 808 Lines • ▼ Show 20 Lines	DiagnosticInfoUnsupported InvalidAddrSpaceCast(
MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());		MF.getFunction(), "invalid addrspacecast", SL.getDebugLoc());
DAG.getContext()->diagnose(InvalidAddrSpaceCast);		DAG.getContext()->diagnose(InvalidAddrSpaceCast);

return DAG.getUNDEF(ASC->getValueType(0));		return DAG.getUNDEF(ASC->getValueType(0));
}		}

SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,		SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
		SDValue Vec = Op.getOperand(0);
		SDValue InsVal = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);		SDValue Idx = Op.getOperand(2);
		EVT VecVT = Vec.getValueType();

		assert(VecVT.getScalarSizeInBits() == 16);
		[Inline review comment by rampitec; unsubmitted, not done] Need to bail out if the vector size is not the expected 64 bits.

		unsigned NumElts = VecVT.getVectorNumElements();
		SDLoc SL(Op);
		auto KIdx = dyn_cast<ConstantSDNode>(Idx);

		if (NumElts == 4 && KIdx) {
		SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);

		SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
		DAG.getConstant(0, SL, MVT::i32));
		SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
		DAG.getConstant(1, SL, MVT::i32));

		SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
		SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);

		unsigned Idx = KIdx->getZExtValue();
		bool InsertLo = Idx < 2;
		SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
		InsertLo ? LoVec : HiVec,
		DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
		DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));

		InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);

		SDValue Concat = InsertLo ?
		DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
		DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });

		return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
		}

		assert(NumElts == 2 \|\| NumElts == 4);

if (isa<ConstantSDNode>(Idx))		if (isa<ConstantSDNode>(Idx))
return SDValue();		return SDValue();

		EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64;

// Avoid stack access for dynamic indexing.		// Avoid stack access for dynamic indexing.
SDLoc SL(Op);		SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
SDValue Vec = Op.getOperand(0);
SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));

// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec		// v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);		SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);

// Convert vector index to bit-index.		// Convert vector index to bit-index.
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,		SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
DAG.getConstant(4, SL, MVT::i32));		DAG.getConstant(4, SL, MVT::i32));

SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);		SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
		SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,		DAG.getConstant(0xffff, SL, IntVT),
DAG.getConstant(0xffff, SL, MVT::i32),
ScaledIdx);		ScaledIdx);

SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);		SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,		SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
DAG.getNOT(SL, BFM, MVT::i32), BCVec);		DAG.getNOT(SL, BFM, IntVT), BCVec);

SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);		SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);		return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
}		}

SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,		SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc SL(Op);		SDLoc SL(Op);

EVT ResultVT = Op.getValueType();		EVT ResultVT = Op.getValueType();
SDValue Vec = Op.getOperand(0);		SDValue Vec = Op.getOperand(0);
SDValue Idx = Op.getOperand(1);		SDValue Idx = Op.getOperand(1);
		EVT VecVT = Vec.getValueType();
		unsigned NumElts = VecVT.getVectorNumElements();
		assert(VecVT.getScalarSizeInBits() == 16 && (NumElts == 2 \|\| NumElts == 4));

DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);		DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);

// Make sure we do any optimizations that will make it easier to fold		// Make sure we do any optimizations that will make it easier to fold
// source modifiers before obscuring it with bit operations.		// source modifiers before obscuring it with bit operations.

// XXX - Why doesn't this get called when vector_shuffle is expanded?		// XXX - Why doesn't this get called when vector_shuffle is expanded?
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))		if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;		return Combined;

		EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64;
SDValue Four = DAG.getConstant(4, SL, MVT::i32);		SDValue Four = DAG.getConstant(4, SL, MVT::i32);

// Convert vector index to bit-index (* 16)		// Convert vector index to bit-index (* 16)
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Four);		SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Four);

SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);		SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);		SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);

SDValue Result = Elt;
if (ResultVT.bitsLT(MVT::i32))
Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);

		if (ResultVT == MVT::f16) {
		SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);		return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
}		}

		return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
		}

		/// Custom lowering for a v4i16 / v4f16 ISD::BUILD_VECTOR.
		///
		/// The four 16-bit operands are split into two 2-element build_vectors
		/// (operands 0-1 and operands 2-3). Each half is bitcast to i32, the two
		/// i32 values are combined into a v2i32 build_vector, and that vector is
		/// bitcast back to the original result type.
		///
		/// \param Op  The BUILD_VECTOR node; its result type must be v4i16 or v4f16.
		/// \param DAG The SelectionDAG in which the replacement nodes are created.
		/// \returns the replacement value, of the same type as \p Op.
		SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
		                                            SelectionDAG &DAG) const {
		  SDLoc SL(Op);
		  EVT VT = Op.getValueType();
		  assert((VT == MVT::v4i16 || VT == MVT::v4f16) &&
		         "custom BUILD_VECTOR lowering only handles v4i16 and v4f16");

		  // Two-element vector with the same element type as the result.
		  EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);

		  // Turn into pair of packed build_vectors.
		  // TODO: Special case for constants that can be materialized with s_mov_b64.
		  SDValue Lo = DAG.getBuildVector(HalfVT, SL,
		                                  { Op.getOperand(0), Op.getOperand(1) });
		  SDValue Hi = DAG.getBuildVector(HalfVT, SL,
		                                  { Op.getOperand(2), Op.getOperand(3) });

		  // View each 2 x 16-bit half as a single 32-bit integer.
		  SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
		  SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);

		  // Reassemble the halves as v2i32 and reinterpret as the requested type.
		  SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
		  return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
		}

bool		bool
SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {		SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
// We can fold offsets for anything that doesn't require a GOT relocation.		// We can fold offsets for anything that doesn't require a GOT relocation.
return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS \|\|		return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS \|\|
GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS \|\|		GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS \|\|
GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&		GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
!shouldEmitGOTReloc(GA->getGlobal());		!shouldEmitGOTReloc(GA->getGlobal());
}		}
▲ Show 20 Lines • Show All 3,435 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/extload-align.ll

	; RUN: llc -debug-only=machine-scheduler -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs %s -o - 2>&1\| FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC -check-prefix=DEBUG %s			; RUN: llc -debug-only=machine-scheduler -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs %s -o - 2>&1\| FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC -check-prefix=DEBUG %s
	target datalayout = "A5"			target datalayout = "A5"
	; REQUIRES: asserts			; REQUIRES: asserts

	; Verify that the extload generated from %eval has the default			; Verify that the extload generated from %eval has the default
	; alignment size (2) corresponding to the underlying memory size (i16)			; alignment size (2) corresponding to the underlying memory size (i16)
	; size and not 4 corresponding to the sign-extended size (i32).			; size and not 4 corresponding to the sign-extended size (i32).

	; DEBUG: {{^}}# Machine code for function extload_align:			; DEBUG: {{^}}# Machine code for function extload_align:
	; DEBUG: (load 2, addrspace 5)			; DEBUG: (volatile load 2 from %ir.a, addrspace 5)
	; DEBUG: {{^}}# End machine code for function extload_align.			; DEBUG: {{^}}# End machine code for function extload_align.

	define amdgpu_kernel void @extload_align(i32 addrspace(5)* %out, i32 %index) #0 {			define amdgpu_kernel void @extload_align(i32 addrspace(5)* %out, i32 %index) #0 {
	%v0 = alloca [4 x i16], addrspace(5)			%v0 = alloca [4 x i16], addrspace(5)
	%a1 = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 0			%a1 = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 0
	%a2 = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 1			%a2 = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 1
	store i16 0, i16 addrspace(5)* %a1			store volatile i16 0, i16 addrspace(5)* %a1
	store i16 1, i16 addrspace(5)* %a2			store volatile i16 1, i16 addrspace(5)* %a2
	%a = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 %index			%a = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 %index
	%val = load i16, i16 addrspace(5)* %a			%val = load volatile i16, i16 addrspace(5)* %a
	%eval = sext i16 %val to i32			%eval = sext i16 %val to i32
	store i32 %eval, i32 addrspace(5)* %out			store i32 %eval, i32 addrspace(5)* %out
	ret void			ret void
	}			}

test/CodeGen/AMDGPU/extract_vector_elt-f16.ll

Show First 20 Lines • Show All 64 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
%p0 = extractelement <3 x half> %foo, i32 0		%p0 = extractelement <3 x half> %foo, i32 0
%p1 = extractelement <3 x half> %foo, i32 2		%p1 = extractelement <3 x half> %foo, i32 2
%out1 = getelementptr half, half addrspace(1)* %out, i32 1		%out1 = getelementptr half, half addrspace(1)* %out, i32 1
store half %p1, half addrspace(1)* %out, align 2		store half %p1, half addrspace(1)* %out, align 2
store half %p0, half addrspace(1)* %out1, align 2		store half %p0, half addrspace(1)* %out1, align 2
ret void		ret void
}		}

; GCN-LABEL: {{^}}extract_vector_elt_v4f16:
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort
; GCN: buffer_store_short
; GCN: buffer_store_short
define amdgpu_kernel void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 {
%p0 = extractelement <4 x half> %foo, i32 0
%p1 = extractelement <4 x half> %foo, i32 2
%out1 = getelementptr half, half addrspace(1)* %out, i32 10
store half %p1, half addrspace(1)* %out, align 2
store half %p0, half addrspace(1)* %out1, align 2
ret void
}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:		; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
; GCN: buffer_load_ushort		; SICIVI: buffer_load_ushort
; GCN: buffer_load_ushort		; SICIVI: buffer_load_ushort
; GCN: buffer_load_ushort		; SICIVI: buffer_load_ushort

; GCN: buffer_store_short		; GFX9-DAG: global_load_short_d16_hi v
; GCN: buffer_store_short		; GFX9-DAG: global_load_short_d16 v
; GCN: buffer_store_short

; GCN: buffer_load_ushort		; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
; GCN: buffer_store_short		; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v

		; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}

		; GCN: {{buffer\|global}}_store_short
define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {		define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
%p0 = extractelement <3 x half> %foo, i32 %idx		%p0 = extractelement <3 x half> %foo, i32 %idx
%out1 = getelementptr half, half addrspace(1)* %out, i32 1		%out1 = getelementptr half, half addrspace(1)* %out, i32 1
store half %p0, half addrspace(1)* %out		store half %p0, half addrspace(1)* %out
ret void		ret void
}		}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4f16:		; GCN-LABEL: {{^}}v_extractelement_v4f16_2:
; GCN: buffer_load_ushort		; SI: buffer_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; GCN: buffer_load_ushort		; SI: buffer_store_short [[LOAD]]
; GCN: buffer_load_ushort
; GCN: buffer_load_ushort		; VI: flat_load_dword v
		; VI: flat_store_short

		; GFX9: global_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off offset:4
		; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]]
		define amdgpu_kernel void @v_extractelement_v4f16_2(half addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
		%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
		%vec.extract = extractelement <4 x half> %vec, i32 2
		store half %vec.extract, half addrspace(1)* %out.gep
		ret void
		}

; GCN: buffer_store_short		; GCN-LABEL: {{^}}v_insertelement_v4f16_dynamic_vgpr:
; GCN: buffer_store_short		; GCN-DAG: {{flat\|global\|buffer}}_load_dword [[IDX:v[0-9]+]],
; GCN: buffer_store_short		; GCN-DAG: {{flat\|global\|buffer}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GCN: buffer_store_short		; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]

; GCN: buffer_load_ushort		; GFX89: v_lshrrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], v{{\[}}[[LO]]:[[HI]]{{\]}}
; GCN: buffer_store_short		; GFX89: {{flat\|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[SHIFT_LO]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 {
%p0 = extractelement <4 x half> %foo, i32 %idx		; SI: v_lshr_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}, [[SCALED_IDX]]
%out1 = getelementptr half, half addrspace(1)* %out, i32 1		; SI: buffer_store_short v[[SHIFT_LO]]
store half %p0, half addrspace(1)* %out		define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(half addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
		%idx.val = load volatile i32, i32 addrspace(1)* undef
		%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
		%vec.extract = extractelement <4 x half> %vec, i32 %idx.val
		store half %vec.extract, half addrspace(1)* %out.gep
ret void		ret void
}		}

declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/extract_vector_elt-i16.ll

; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s		; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s		; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,GFX89 %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN -check-prefix=GFX9 %s		; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s

; GCN-LABEL: {{^}}extract_vector_elt_v2i16:		; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
; GCN: s_load_dword [[VEC:s[0-9]+]]		; GCN: s_load_dword [[VEC:s[0-9]+]]
; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16		; GCN: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]		; GCN-DAG: v_mov_b32_e32 [[VELT0:v[0-9]+]], [[VEC]]
; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]		; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
; GCN-DAG: buffer_store_short [[VELT0]]		; GCN-DAG: buffer_store_short [[VELT0]]
; GCN-DAG: buffer_store_short [[VELT1]]		; GCN-DAG: buffer_store_short [[VELT1]]
▲ Show 20 Lines • Show All 79 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
ret void		ret void
}		}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:		; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
; SICIVI: buffer_load_ushort		; SICIVI: buffer_load_ushort
; SICIVI: buffer_load_ushort		; SICIVI: buffer_load_ushort
; SICIVI: buffer_load_ushort		; SICIVI: buffer_load_ushort

; SICIVI: buffer_store_short		; GFX9-DAG: global_load_short_d16_hi v
; SICIVI: buffer_store_short		; GFX9-DAG: global_load_short_d16 v
; SICIVI: buffer_store_short

; SICIVI: buffer_load_ushort		; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
; SICIVI: buffer_store_short		; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v

		; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}

; GFX9: buffer_load_ushort		; GCN: {{buffer\|global}}_store_short
; GFX9: global_load_short_d16_hi
; GFX9: global_load_short_d16 v
; GFX9: buffer_store_dword
; GFX9: buffer_store_dword
; GFX9: buffer_load_ushort
; GFX9: buffer_store_short
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {		define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
%p0 = extractelement <3 x i16> %foo, i32 %idx		%p0 = extractelement <3 x i16> %foo, i32 %idx
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1		%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 %p0, i16 addrspace(1)* %out		store i16 %p0, i16 addrspace(1)* %out
ret void		ret void
}		}

; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i16:		; GCN-LABEL: {{^}}v_insertelement_v4i16_dynamic_sgpr:
; SICIVI: buffer_load_ushort		define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %idx) #0 {
; SICIVI: buffer_load_ushort		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
; SICIVI: buffer_load_ushort		%tid.ext = sext i32 %tid to i64
; SICIVI: buffer_load_ushort		%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
; SICIVI: buffer_store_short		%vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
; SICIVI: buffer_store_short		%vec.extract = extractelement <4 x i16> %vec, i32 %idx
; SICIVI: buffer_store_short		store i16 %vec.extract, i16 addrspace(1)* %out.gep
; SICIVI: buffer_store_short

; SICIVI: buffer_load_ushort
; SICIVI: buffer_store_short

; GFX9: s_load_dword
; GFX9: buffer_store_dword
; GFX9: buffer_store_dword
; GFX9: buffer_load_ushort
; GFX9: buffer_store_short
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
%p0 = extractelement <4 x i16> %foo, i32 %idx
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 %p0, i16 addrspace(1)* %out
ret void		ret void
}		}

declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/insert_vector_elt.ll

	Show First 20 Lines • Show All 195 Lines • ▼ Show 20 Lines

	; GCN-LABEL: {{^}}dynamic_insertelement_v3i16:			; GCN-LABEL: {{^}}dynamic_insertelement_v3i16:
	define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {			define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
	%vecins = insertelement <3 x i16> %a, i16 5, i32 %b			%vecins = insertelement <3 x i16> %a, i16 5, i32 %b
	store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8			store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}dynamic_insertelement_v4i16:
	; GCN: buffer_load_ushort v{{[0-9]+}}, off
	; GCN: buffer_load_ushort v{{[0-9]+}}, off
	; GCN: buffer_load_ushort v{{[0-9]+}}, off
	; GCN: buffer_load_ushort v{{[0-9]+}}, off

	; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 8{{$}}
	; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}}
	; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}}

	; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:14
	; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12
	; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:10
	; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8
	; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}

	; GCN-NO-TONGA: s_waitcnt expcnt

	; GCN: buffer_load_dwordx2

	; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
	define amdgpu_kernel void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
	%vecins = insertelement <4 x i16> %a, i16 5, i32 %b
	store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8
	ret void
	}

	; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:			; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
	; GCN: buffer_load_ubyte v{{[0-9]+}}, off			; GCN: buffer_load_ubyte v{{[0-9]+}}, off
	; GCN: buffer_load_ubyte v{{[0-9]+}}, off			; GCN: buffer_load_ubyte v{{[0-9]+}}, off

	; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5			; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
	; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4			; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4

	; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}			; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
	▲ Show 20 Lines • Show All 212 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s		; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s		; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s		; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s

; GCN-LABEL: {{^}}s_insertelement_v2i16_0:		; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
; GCN: s_load_dword [[VEC:s[0-9]+]]		; GCN: s_load_dword [[VEC:s[0-9]+]]

; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}		; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}		; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}

; GFX9-NOT: lshr		; GFX9-NOT: lshr
▲ Show 20 Lines • Show All 467 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
%out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext		%out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
%idx = load i32, i32 addrspace(1)* %idx.gep		%idx = load i32, i32 addrspace(1)* %idx.gep
%vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep		%vec = load <2 x half>, <2 x half> addrspace(1)* %in.gep
%vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx		%vecins = insertelement <2 x half> %vec, half 0xH1234, i32 %idx
store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep		store <2 x half> %vecins, <2 x half> addrspace(1)* %out.gep
ret void		ret void
}		}

		; Insert a half (the low 16 bits of the scalar %val) at constant index 0 of a
		; <4 x half> loaded per work-item. The final store check expects only the low
		; dword of the loaded pair to be replaced.
		; GCN-LABEL: {{^}}v_insertelement_v4f16_0:
		; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
		; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}

		; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
		; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]]

		; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]]
		; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL]], [[AND]]

		; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}}
		define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
		%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
		%val.trunc = trunc i32 %val to i16
		%val.cvt = bitcast i16 %val.trunc to half
		%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
		store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
		ret void
		}

		; Insert a half (the low 16 bits of the scalar %val) at constant index 1 of a
		; <4 x half> loaded per work-item. The final store check expects only the low
		; dword of the loaded pair to be replaced.
		; GCN-LABEL: {{^}}v_insertelement_v4f16_1:
		; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
		; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}

		; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
		; GFX9: v_lshl_or_b32 v[[INS_HALF:[0-9]+]], [[VAL]], 16, [[AND]]

		; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
		; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
		; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

		; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
		; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL]], [[AND]]

		; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_HALF]]:[[HI]]{{\]}}
		define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
		%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
		%val.trunc = trunc i32 %val to i16
		%val.cvt = bitcast i16 %val.trunc to half
		%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
		store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
		ret void
		}

		; Insert a half (the low 16 bits of the scalar %val) at constant index 2 of a
		; <4 x half> loaded per work-item. The final store check expects only the
		; high dword of the loaded pair to be replaced.
		; GCN-LABEL: {{^}}v_insertelement_v4f16_2:
		; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
		; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}

		; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
		; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]

		; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
		; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]

		; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
		define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
		%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
		%val.trunc = trunc i32 %val to i16
		%val.cvt = bitcast i16 %val.trunc to half
		%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
		store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
		ret void
		}

		; Insert a half (the low 16 bits of the scalar %val) at constant index 3 of a
		; <4 x half> loaded per work-item. The final store check expects only the
		; high dword of the loaded pair to be replaced.
		; GCN-LABEL: {{^}}v_insertelement_v4f16_3:
		; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
		; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}

		; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
		; GFX9: v_lshl_or_b32 v[[INS_HI:[0-9]+]], [[VAL]], 16, [[AND]]

		; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
		; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
		; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD

		; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
		; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]

		; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
		define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
		%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
		%val.trunc = trunc i32 %val to i16
		%val.cvt = bitcast i16 %val.trunc to half
		%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
		store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
		ret void
		}

		; Integer counterpart of the <4 x half> index-2 test. Inserts the low 16 bits
		; of the scalar %val at constant index 2 of a <4 x i16> loaded per work-item.
		; The final store check expects only the high dword of the pair to be replaced.
		; GCN-LABEL: {{^}}v_insertelement_v4i16_2:
		; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
		; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}

		; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
		; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]

		; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
		; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]

		; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
		define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
		%vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
		%val.trunc = trunc i32 %val to i16
		; i16-to-i16 bitcast is a no-op; presumably kept so the body mirrors the half
		; variants of this test.
		%val.cvt = bitcast i16 %val.trunc to i16
		%vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
		store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
		ret void
		}

		; FIXME: Better code on CI?
		; Insert the low 16 bits of the scalar %val into a <4 x i16> loaded per
		; work-item, at an index produced by a volatile load (so the index is not a
		; compile-time constant). The checks expect a 64-bit 0xffff mask shifted by
		; index * 16 and applied to both dwords of the vector.
		; GCN-LABEL: {{^}}v_insertelement_v4i16_dynamic_vgpr:
		; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]],
		; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
		; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}

		; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
		; GCN-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
		; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff{{$}}

		; The register-number captures use [0-9]+ (one or more digits) so that
		; multi-digit register numbers are captured in full.
		; GFX89: v_lshlrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}
		; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_LO:[0-9]+]], v[[SHIFT_LO]]
		; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_HI:[0-9]+]], v[[SHIFT_HI]]
		; GFX89-DAG: v_and_b32_e32 v[[MASK:[0-9]+]], [[VAL]], v[[SHIFT_LO]]

		; GFX89-DAG: v_and_b32_e32 v[[AND0:[0-9]+]], v[[NOT_SHIFT_LO]], v[[LO]]
		; GFX89-DAG: v_and_b32_e32 v[[AND1:[0-9]+]], v[[NOT_SHIFT_HI]], v[[HI]]
		; GFX89: v_or_b32_sdwa v[[OR_SDWA:[0-9]+]], v[[MASK]], v[[AND0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD


		; CI: v_lshl_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
		; CI-DAG: v_bfi_b32 v[[OR_SDWA:[0-9]+]], v[[SHIFT_LO]],
		; CI-DAG: v_bfi_b32 v[[AND1:[0-9]+]], v[[SHIFT_HI]], 0,

		; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[OR_SDWA]]:[[AND1]]{{\]}}
		define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
		; Volatile load from an undef address supplies the insertion index.
		%idx.val = load volatile i32, i32 addrspace(1)* undef
		%vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
		%val.trunc = trunc i32 %val to i16
		; i16-to-i16 bitcast is a no-op.
		%val.cvt = bitcast i16 %val.trunc to i16
		%vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
		store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
		ret void
		}

		; Insert a half (the low 16 bits of the scalar %val) into a <4 x half> loaded
		; per work-item, at the index given by the kernel argument %idxval. Only the
		; function label is checked; no instruction patterns are asserted here.
		; GCN-LABEL: {{^}}v_insertelement_v4f16_dynamic_sgpr:
		define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
		%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
		%tid.ext = sext i32 %tid to i64
		%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
		%out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
		%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
		%val.trunc = trunc i32 %val to i16
		%val.cvt = bitcast i16 %val.trunc to half
		%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
		store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
		ret void
		}

declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }		attributes #1 = { nounwind readnone }

test/CodeGen/AMDGPU/min.ll

	Show First 20 Lines • Show All 283 Lines • ▼ Show 20 Lines
	; FIXME: Reduce unused packed component to scalar			; FIXME: Reduce unused packed component to scalar
	; FUNC-LABEL: @v_test_umin_ule_v3i16{{$}}			; FUNC-LABEL: @v_test_umin_ule_v3i16{{$}}
	; SI: v_min_u32_e32			; SI: v_min_u32_e32
	; SI: v_min_u32_e32			; SI: v_min_u32_e32
	; SI: v_min_u32_e32			; SI: v_min_u32_e32
	; SI-NOT: v_min_u32_e32			; SI-NOT: v_min_u32_e32

	; VI: v_min_u16_e32			; VI: v_min_u16_e32
	; VI: v_min_u16_sdwa			; VI: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
	; VI: v_min_u16_e32			; VI: v_min_u16_e32
	; VI-NOT: v_min_u16_e32			; VI-NOT: v_min_u16

	; GFX9: v_pk_min_u16			; GFX9: v_pk_min_u16
	; GFX9: v_pk_min_u16			; GFX9: v_pk_min_u16

	; GCN: s_endpgm			; GCN: s_endpgm

	; EG: MIN_UINT			; EG: MIN_UINT
	; EG: MIN_UINT			; EG: MIN_UINT
	▲ Show 20 Lines • Show All 338 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Custom lower v4i16/v4f16 vector operations (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 146905

lib/Target/AMDGPU/AMDGPUISelLowering.h

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

lib/Target/AMDGPU/SIISelLowering.h

lib/Target/AMDGPU/SIISelLowering.cpp

test/CodeGen/AMDGPU/extload-align.ll

test/CodeGen/AMDGPU/extract_vector_elt-f16.ll

test/CodeGen/AMDGPU/extract_vector_elt-i16.ll

test/CodeGen/AMDGPU/insert_vector_elt.ll

test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

test/CodeGen/AMDGPU/min.ll

AMDGPU: Custom lower v4i16/v4f16 vector operations
Closed, Public