This allows us to avoid scratch use or indirect VGPR addressing for small vectors.
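For illustration, here is a minimal sketch of the technique, assuming a free-standing helper with made-up names and a simplified i1 setcc result type (this is not the patch's code): each lane is compared against the dynamic index and selected accordingly, and the results feed a build_vector.

```cpp
// Hypothetical sketch (not the patch itself): expand a dynamic
// insert_vector_elt into a compare+select per lane, so no scratch
// store/load or indirect (movrel) addressing is needed.
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue expandInsertEltAsSelects(SelectionDAG &DAG, const SDLoc &DL,
                                        SDValue Vec, SDValue Val, SDValue Idx) {
  EVT VecVT = Vec.getValueType();
  EVT EltVT = VecVT.getVectorElementType();
  EVT IdxVT = Idx.getValueType();

  SmallVector<SDValue, 8> Elts;
  for (unsigned I = 0, E = VecVT.getVectorNumElements(); I != E; ++I) {
    // Lane I keeps its old value unless the dynamic index selects it.
    SDValue OldElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vec,
                                 DAG.getConstant(I, DL, IdxVT));
    SDValue IsLane = DAG.getSetCC(DL, MVT::i1, Idx,
                                  DAG.getConstant(I, DL, IdxVT), ISD::SETEQ);
    Elts.push_back(DAG.getSelect(DL, EltVT, IsLane, Val, OldElt));
  }
  return DAG.getBuildVector(VecVT, DL, Elts);
}
```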
Repository: rL LLVM
Event Timeline
Mostly looks good to me.
However, why does code with undef vectors look so bad? For example, in float4_inselt, the fact that the initial vector is undef should allow us to just store a splat of 1.0.
| File | Lines | Comment |
|---|---|---|
| lib/Target/AMDGPU/SIISelLowering.cpp | 8118–8119 (On Diff #174295) | clang-format? |
Yes, I noticed that too. That needs to be a separate optimization. As far as I understand, "insert_vector_element undef, %var, %idx" should not even reach this point. It should be replaced by build_vector (n x %var) regardless of the thresholds and heuristics I am using, i.e. earlier (higher in the same function, I think).
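For illustration, a hedged sketch of that suggested fold (the helper name is hypothetical; this is not code from the patch): inserting into an undef vector simply splats the inserted value, since the remaining undef lanes may take any value.

```cpp
// Hypothetical sketch of the suggested fold, not code from this patch:
// insert_vector_elt undef, %var, %idx  ->  build_vector (n x %var)
#include <cassert>
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

static SDValue foldInsertIntoUndef(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue UndefVec, SDValue Val) {
  assert(UndefVec.isUndef() && "expected an undef vector operand");
  // A splat of the inserted value, i.e. build_vector (n x %var).
  return DAG.getSplatBuildVector(UndefVec.getValueType(), DL, Val);
}
```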
I disagree. When we end up using scratch memory for a vector, build_vector (n x %var) would imply n stores, while insert_vector_element undef, %var, %idx implies only 1 store, so doing the transform seems like a pretty terrible idea in general. I think exploiting undef is really specific to what you're doing here and should therefore be done here as well.
Another change I've wanted to look at is changing what AMDGPUPromoteAlloca tries to produce. The dynamic vector indexing is going to be worse if the waterfall loop is going to be required, but it's currently preferred if both are possible.
Essentially, it deals with the alloca to begin with. Scratch is always worse than movrel (even with the waterfall loop), and movrel is worse than a small set of selects. So it is quite natural that an alloca can become either a set of selects or a movrel. Unless we end up spilling later, but it is too early to estimate register pressure here.
Anyway, any ideas on what it could produce that would be better?
On second thought, I still believe it should be done elsewhere, in the common DAG combiner. This is the DAG this code produces:
    t28: f32 = select_cc t19, Constant:i32<0>, ConstantFP:f32<1.000000e+00>, undef:f32, seteq:ch
    t30: f32 = select_cc t19, Constant:i32<1>, ConstantFP:f32<1.000000e+00>, undef:f32, seteq:ch
    t32: f32 = select_cc t19, Constant:i32<2>, ConstantFP:f32<1.000000e+00>, undef:f32, seteq:ch
    t34: f32 = select_cc t19, Constant:i32<3>, ConstantFP:f32<1.000000e+00>, undef:f32, seteq:ch
    t35: v4f32 = BUILD_VECTOR t28, t30, t32, t34
and then:
    t40: i1 = setcc t19, Constant:i32<0>, seteq:ch
    t41: f32 = select t40, ConstantFP:f32<1.000000e+00>, undef:f32
    t42: i1 = setcc t19, Constant:i32<1>, seteq:ch
    t43: f32 = select t42, ConstantFP:f32<1.000000e+00>, undef:f32
    t44: i1 = setcc t19, Constant:i32<2>, seteq:ch
    t45: f32 = select t44, ConstantFP:f32<1.000000e+00>, undef:f32
    t46: i1 = setcc t19, Constant:i32<3>, seteq:ch
    t47: f32 = select t46, ConstantFP:f32<1.000000e+00>, undef:f32
    t35: v4f32 = BUILD_VECTOR t41, t43, t45, t47
So it looks like an appropriate optimization is:
select cc, x, undef => x
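A minimal sketch of that scalar combine in DAGCombiner style, assuming a standalone helper (names and placement are illustrative, not the committed code):

```cpp
// Sketch: fold a select whose untaken arm is undef. The undef operand
// may take any value, so the select can always become the defined one.
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

static SDValue foldSelectWithUndefArm(SDNode *N) {
  SDValue TrueV = N->getOperand(1);
  SDValue FalseV = N->getOperand(2);
  if (FalseV.isUndef())
    return TrueV;    // select cc, x, undef -> x
  if (TrueV.isUndef())
    return FalseV;   // select cc, undef, x -> x
  return SDValue();  // no fold
}
```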
Converted tests to non-undef vectors, added a test with an undef vector.
This is now based on D54646, and insert_vector_elt undef, ... now results in just a splat.
Yeah, the separate DAG combine for scalar select w/ undef is a better solution. LGTM.