Diff 426278

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 731 Lines • ▼ Show 20 Lines	if (Subtarget->has16BitInsts()) {
setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);		setOperationAction(ISD::FMAXNUM, MVT::v4f16, Expand);
setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand);		setOperationAction(ISD::FMINNUM, MVT::v8f16, Expand);
setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand);		setOperationAction(ISD::FMAXNUM, MVT::v8f16, Expand);

for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) {		for (MVT Vec16 : { MVT::v8i16, MVT::v8f16 }) {
setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom);		setOperationAction(ISD::BUILD_VECTOR, Vec16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom);		setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);		setOperationAction(ISD::INSERT_VECTOR_ELT, Vec16, Expand);
setOperationAction(ISD::SCALAR_TO_VECTOR, Vec16, Expand);
}		}
}		}

if (Subtarget->hasVOP3PInsts()) {		if (Subtarget->hasVOP3PInsts()) {
setOperationAction(ISD::ADD, MVT::v2i16, Legal);		setOperationAction(ISD::ADD, MVT::v2i16, Legal);
setOperationAction(ISD::SUB, MVT::v2i16, Legal);		setOperationAction(ISD::SUB, MVT::v2i16, Legal);
setOperationAction(ISD::MUL, MVT::v2i16, Legal);		setOperationAction(ISD::MUL, MVT::v2i16, Legal);
setOperationAction(ISD::SHL, MVT::v2i16, Legal);		setOperationAction(ISD::SHL, MVT::v2i16, Legal);
▲ Show 20 Lines • Show All 5,215 Lines • ▼ Show 20 Lines

SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,		SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SDLoc SL(Op);		SDLoc SL(Op);
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();

if (VT == MVT::v4i16 \|\| VT == MVT::v4f16 \|\|		if (VT == MVT::v4i16 \|\| VT == MVT::v4f16 \|\|
VT == MVT::v8i16 \|\| VT == MVT::v8f16) {		VT == MVT::v8i16 \|\| VT == MVT::v8f16) {
EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),		EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
		arsenmUnsubmitted Done Reply Inline Actions Not much point in the assert since this works for any size arsenm: Not much point in the assert since this works for any size
VT.getVectorNumElements() / 2);		VT.getVectorNumElements() / 2);
MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());		MVT HalfIntVT = MVT::getIntegerVT(HalfVT.getSizeInBits());

		rampitecUnsubmitted Done Reply Inline Actions call getUNDEF() once. rampitec: call getUNDEF() once.
// Turn into pair of packed build_vectors.		// Turn into pair of packed build_vectors.
		rampitecUnsubmitted Done Reply Inline Actions Why not undef? It is fewer operations. rampitec: Why not undef? It is fewer operations.
// TODO: Special case for constants that can be materialized with s_mov_b64.		// TODO: Special case for constants that can be materialized with s_mov_b64.
SmallVector<SDValue, 4> LoOps, HiOps;		SmallVector<SDValue, 4> LoOps, HiOps;
for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {		for (unsigned I = 0, E = VT.getVectorNumElements() / 2; I != E; ++I) {
LoOps.push_back(Op.getOperand(I));		LoOps.push_back(Op.getOperand(I));
HiOps.push_back(Op.getOperand(I + E));		HiOps.push_back(Op.getOperand(I + E));
		arsenmUnsubmitted Done Reply Inline Actions Just return the build_vector. You are potentially missing combine opportunities by directly lowering it. arsenm: Just return the build_vector. You are potentially missing combine opportunities by directly…
		hsmhsmAuthorUnsubmitted Done Reply Inline Actions This again gives the same ISA for the (new) lit tests that we were getting when lowering to insert_subreg. hsmhsm: This again gives the same ISA for the (new) lit tests that we were getting when lowering to…
}		}
SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);		SDValue Lo = DAG.getBuildVector(HalfVT, SL, LoOps);
SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);		SDValue Hi = DAG.getBuildVector(HalfVT, SL, HiOps);

SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);		SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Lo);
SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);		SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, HalfIntVT, Hi);

SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,		SDValue Blend = DAG.getBuildVector(MVT::getVectorVT(HalfIntVT, 2), SL,
▲ Show 20 Lines • Show All 6,700 Lines • Show Last 20 Lines

llvm/lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 2,700 Lines • ▼ Show 20 Lines
	>;			>;

	def : GCNPat <			def : GCNPat <
	(v4f16 (scalar_to_vector f16:$src0)),			(v4f16 (scalar_to_vector f16:$src0)),
	(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)			(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
	>;			>;

	def : GCNPat <			def : GCNPat <
				(v8i16 (scalar_to_vector i16:$src0)),
				(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
				arsenmUnsubmitted Not Done Reply Inline Actions I don’t think these should be legal. We don’t naturally have 8 X 16 operations. A lowering that splits the vector would avoid introducing the wider registers and may combine better arsenm: I don’t think these should be legal. We don’t naturally have 8 X 16 operations. A lowering that…
				rampitecUnsubmitted Not Done Reply Inline Actions We actually do have these operands: v_smfmac_f32_16x16x32_f16 v_smfmac_f32_32x32x16_f16 v_smfmac_f32_16x16x32_bf16 v_smfmac_f32_32x32x16_bf16 rampitec: We actually do have these operands: ``` v_smfmac_f32_16x16x32_f16 v_smfmac_f32_32x32x16_f16…
				hsmhsmAuthorUnsubmitted Done Reply Inline Actions And even if we think it is better handled by splitting the vector, we can just materialize scalar_to_vector as build_vector, since build_vector already has custom lowering for v8i16/v8f16 that splits these types. I experimented with it: I see better ISel output in this case, and the final ISA also looks good - one shift and one pack operation are eliminated. I will update the patch with this change. Let's take a look at it and discuss it. hsmhsm: And even if we think it is better handled by splitting the vector, we can just…
				rampitecUnsubmitted Not Done Reply Inline Actions Please do. Offhand the patch looks good to me. rampitec: Please do. Offhand the patch looks good to me.
				>;

				def : GCNPat <
				(v8f16 (scalar_to_vector f16:$src0)),
				(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
				arsenmUnsubmitted Not Done Reply Inline Actions It’s also not obvious what register class this will end up picking. arsenm: It’s also not obvious what register class this will end up picking.
				rampitecUnsubmitted Not Done Reply Inline Actions Legal class for VT? Note exactly the same code above. rampitec: Legal class for VT? Note exactly the same code above.
				>;

				def : GCNPat <
	(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,			(i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
	timm:$bank_mask, timm:$bound_ctrl)),			timm:$bank_mask, timm:$bound_ctrl)),
	(V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src,			(V_MOV_B64_DPP_PSEUDO VReg_64_Align2:$src, VReg_64_Align2:$src,
	(as_i32timm $dpp_ctrl), (as_i32timm $row_mask),			(as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
	(as_i32timm $bank_mask),			(as_i32timm $bank_mask),
	(as_i1timm $bound_ctrl))			(as_i1timm $bound_ctrl))
	>;			>;

	▲ Show 20 Lines • Show All 517 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

Show First 20 Lines • Show All 1,742 Lines • ▼ Show 20 Lines	; CI-NEXT: s_endpgm
%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval		%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep		store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
ret void		ret void
}		}

define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val) {		define amdgpu_kernel void @v_insertelement_v8f16_3(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v8f16_3:		; GFX9-LABEL: v_insertelement_v8f16_3:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0		; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10		; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0		; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)		; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[10:11]		; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:16
; GFX9-NEXT: s_waitcnt vmcnt(2)
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v1		; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]		; GFX9-NEXT: v_lshl_or_b32 v1, s6, 16, v1
		; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm		; GFX9-NEXT: s_endpgm
;		;
; VI-LABEL: v_insertelement_v8f16_3:		; VI-LABEL: v_insertelement_v8f16_3:
; VI: ; %bb.0:		; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0		; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10		; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0		; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)		; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11		; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v4		; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc		; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]		; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16		; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:16		; VI-NEXT: s_lshl_b32 s1, s4, 16
; VI-NEXT: v_mov_b32_e32 v5, s9		; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4		; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: s_mov_b32 s4, 0xffff		; VI-NEXT: v_mov_b32_e32 v6, s1
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc		; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_bfi_b32 v3, s4, v3, v3
; VI-NEXT: s_waitcnt vmcnt(0)		; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v6, 16, v6		; VI-NEXT: v_bfi_b32 v3, s2, v3, v3
; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD		; VI-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]		; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm		; VI-NEXT: s_endpgm
;		;
; CI-LABEL: v_insertelement_v8f16_3:		; CI-LABEL: v_insertelement_v8f16_3:
; CI: ; %bb.0:		; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0		; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4		; CI-NEXT: s_load_dword s4, s[4:5], 0x4
Show All 22 Lines	; CI-NEXT: s_endpgm
%vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3		%vecins = insertelement <8 x half> %vec, half %val.cvt, i32 3
store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep		store <8 x half> %vecins, <8 x half> addrspace(1)* %out.gep
ret void		ret void
}		}

define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %val) {		define amdgpu_kernel void @v_insertelement_v8i16_6(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in, i32 %val) {
; GFX9-LABEL: v_insertelement_v8i16_6:		; GFX9-LABEL: v_insertelement_v8i16_6:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0		; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10		; GFX9-NEXT: s_load_dword s6, s[4:5], 0x10
; GFX9-NEXT: s_add_u32 s0, s0, s7
; GFX9-NEXT: s_addc_u32 s1, s1, 0
; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0		; GFX9-NEXT: v_lshlrev_b32_e32 v4, 4, v0
		; GFX9-NEXT: v_mov_b32_e32 v5, 0xffff
; GFX9-NEXT: s_waitcnt lgkmcnt(0)		; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[10:11]		; GFX9-NEXT: global_load_dwordx4 v[0:3], v4, s[2:3]
; GFX9-NEXT: v_mov_b32_e32 v5, s6
; GFX9-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16
; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], 0 offset:16
; GFX9-NEXT: v_mov_b32_e32 v6, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v3, v6, v5, v3		; GFX9-NEXT: v_bfi_b32 v3, v5, s6, v3
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]		; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm		; GFX9-NEXT: s_endpgm
;		;
; VI-LABEL: v_insertelement_v8i16_6:		; VI-LABEL: v_insertelement_v8i16_6:
; VI: ; %bb.0:		; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0		; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; VI-NEXT: s_load_dword s4, s[4:5], 0x10		; VI-NEXT: s_load_dword s4, s[4:5], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0		; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
; VI-NEXT: s_add_u32 s0, s0, s7
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: s_waitcnt lgkmcnt(0)		; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s11		; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s10, v4		; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc		; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v5, s4
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]		; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: buffer_store_short v5, off, s[0:3], 0 offset:16		; VI-NEXT: s_mov_b32 s2, 0xffff
; VI-NEXT: buffer_load_dword v6, off, s[0:3], 0 offset:16		; VI-NEXT: v_mov_b32_e32 v5, s1
; VI-NEXT: s_mov_b32 s4, 0xffff		; VI-NEXT: v_mov_b32_e32 v6, s4
; VI-NEXT: v_mov_b32_e32 v5, s9		; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
; VI-NEXT: v_add_u32_e32 v4, vcc, s8, v4
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc		; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc
; VI-NEXT: s_waitcnt vmcnt(2)
; VI-NEXT: v_bfi_b32 v1, s4, v1, v1
; VI-NEXT: s_waitcnt vmcnt(0)		; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_bfi_b32 v3, s4, v6, v3		; VI-NEXT: v_bfi_b32 v3, s2, v6, v3
		; VI-NEXT: v_bfi_b32 v1, s2, v1, v1
; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]		; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; VI-NEXT: s_endpgm		; VI-NEXT: s_endpgm
;		;
; CI-LABEL: v_insertelement_v8i16_6:		; CI-LABEL: v_insertelement_v8i16_6:
; CI: ; %bb.0:		; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0		; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
; CI-NEXT: s_load_dword s4, s[4:5], 0x4		; CI-NEXT: s_load_dword s4, s[4:5], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0		; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
▲ Show 20 Lines • Show All 215 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
				; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s \| FileCheck -check-prefix=GFX900 %s
				; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 < %s \| FileCheck -check-prefixes=GFX906 %s
				; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s \| FileCheck -check-prefixes=GFX908 %s
				; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s \| FileCheck -check-prefixes=GFX90A %s

				define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, <8 x i16>* %out) #0 {
				; GFX900-LABEL: scalar_to_vector_v8i16:
				; GFX900: ; %bb.0: ; %entry
				; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
				; GFX900-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
				; GFX900-NEXT: v_lshlrev_b32_e32 v0, 4, v0
				; GFX900-NEXT: s_waitcnt lgkmcnt(0)
				; GFX900-NEXT: s_lshr_b32 s4, s0, 16
				; GFX900-NEXT: s_pack_lh_b32_b16 s5, s0, s0
				; GFX900-NEXT: s_pack_ll_b32_b16 s0, s0, s4
				; GFX900-NEXT: v_mov_b32_e32 v6, s3
				; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
				; GFX900-NEXT: v_mov_b32_e32 v1, s5
				; GFX900-NEXT: v_mov_b32_e32 v2, s1
				; GFX900-NEXT: v_mov_b32_e32 v4, s0
				; GFX900-NEXT: v_mov_b32_e32 v3, s5
				; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
				; GFX900-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
				; GFX900-NEXT: s_endpgm
				;
				; GFX906-LABEL: scalar_to_vector_v8i16:
				; GFX906: ; %bb.0: ; %entry
				; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
				; GFX906-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
				; GFX906-NEXT: v_lshlrev_b32_e32 v0, 4, v0
				; GFX906-NEXT: s_waitcnt lgkmcnt(0)
				; GFX906-NEXT: s_lshr_b32 s4, s0, 16
				; GFX906-NEXT: s_pack_lh_b32_b16 s5, s0, s0
				; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s4
				; GFX906-NEXT: v_mov_b32_e32 v6, s3
				; GFX906-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
				; GFX906-NEXT: v_mov_b32_e32 v1, s5
				; GFX906-NEXT: v_mov_b32_e32 v2, s1
				; GFX906-NEXT: v_mov_b32_e32 v4, s0
				; GFX906-NEXT: v_mov_b32_e32 v3, s5
				; GFX906-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
				; GFX906-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
				; GFX906-NEXT: s_endpgm
				;
				; GFX908-LABEL: scalar_to_vector_v8i16:
				; GFX908: ; %bb.0: ; %entry
				; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
				; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
				; GFX908-NEXT: v_lshlrev_b32_e32 v0, 4, v0
				; GFX908-NEXT: s_waitcnt lgkmcnt(0)
				; GFX908-NEXT: s_lshr_b32 s4, s0, 16
				; GFX908-NEXT: s_pack_lh_b32_b16 s5, s0, s0
				; GFX908-NEXT: s_pack_ll_b32_b16 s0, s0, s4
				; GFX908-NEXT: v_mov_b32_e32 v6, s3
				; GFX908-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
				; GFX908-NEXT: v_mov_b32_e32 v1, s5
				; GFX908-NEXT: v_mov_b32_e32 v2, s1
				; GFX908-NEXT: v_mov_b32_e32 v4, s0
				; GFX908-NEXT: v_mov_b32_e32 v3, s5
				; GFX908-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
				; GFX908-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
				; GFX908-NEXT: s_endpgm
				;
				; GFX90A-LABEL: scalar_to_vector_v8i16:
				; GFX90A: ; %bb.0: ; %entry
				; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
				; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
				; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
				; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
				; GFX90A-NEXT: s_lshr_b32 s4, s0, 16
				; GFX90A-NEXT: s_pack_lh_b32_b16 s5, s0, s0
				; GFX90A-NEXT: s_pack_ll_b32_b16 s0, s0, s4
				; GFX90A-NEXT: v_mov_b32_e32 v1, s3
				; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
				; GFX90A-NEXT: v_mov_b32_e32 v2, s5
				; GFX90A-NEXT: v_mov_b32_e32 v3, s1
				; GFX90A-NEXT: v_mov_b32_e32 v5, s0
				; GFX90A-NEXT: v_mov_b32_e32 v4, s5
				; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
				; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
				; GFX90A-NEXT: s_endpgm
				entry:
				%val.1.i32 = extractelement <2 x i32> %in, i64 0
				%val.2.vec2.i16 = bitcast i32 %val.1.i32 to <2 x i16>
				%val.3.vec8.i16 = shufflevector <2 x i16> %val.2.vec2.i16, <2 x i16> %val.2.vec2.i16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>

				%val.4.vec4.i32 = shufflevector <2 x i32> %in, <2 x i32> %in, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				%val.5.vec8.i16 = bitcast <4 x i32> %val.4.vec4.i32 to <8 x i16>

				%val.6.vec8.i16 = shufflevector <8 x i16> %val.5.vec8.i16, <8 x i16> %val.3.vec8.i16, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>

				%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
				%tid.ext = sext i32 %tid to i64
				%out.gep = getelementptr inbounds <8 x i16>, <8 x i16>* %out, i64 %tid.ext
				store <8 x i16> %val.6.vec8.i16, <8 x i16>* %out.gep, align 16

				ret void
				}

				define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, <8 x half>* %out) #0 {
				; GFX900-LABEL: scalar_to_vector_v8f16:
				; GFX900: ; %bb.0: ; %entry
				; GFX900-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
				; GFX900-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
				; GFX900-NEXT: v_lshlrev_b32_e32 v0, 4, v0
				; GFX900-NEXT: s_waitcnt lgkmcnt(0)
				; GFX900-NEXT: s_lshr_b32 s4, s0, 16
				; GFX900-NEXT: v_mov_b32_e32 v1, s0
				; GFX900-NEXT: s_pack_ll_b32_b16 s0, s0, s4
				; GFX900-NEXT: v_mov_b32_e32 v6, s3
				; GFX900-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
				; GFX900-NEXT: v_mov_b32_e32 v2, s1
				; GFX900-NEXT: v_mov_b32_e32 v4, s0
				; GFX900-NEXT: v_mov_b32_e32 v3, v1
				; GFX900-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
				; GFX900-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
				; GFX900-NEXT: s_endpgm
				;
				; GFX906-LABEL: scalar_to_vector_v8f16:
				; GFX906: ; %bb.0: ; %entry
				; GFX906-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
				; GFX906-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
				; GFX906-NEXT: v_lshlrev_b32_e32 v0, 4, v0
				; GFX906-NEXT: s_waitcnt lgkmcnt(0)
				; GFX906-NEXT: s_lshr_b32 s4, s0, 16
				; GFX906-NEXT: v_mov_b32_e32 v1, s0
				; GFX906-NEXT: s_pack_ll_b32_b16 s0, s0, s4
				; GFX906-NEXT: v_mov_b32_e32 v6, s3
				; GFX906-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
				; GFX906-NEXT: v_mov_b32_e32 v2, s1
				; GFX906-NEXT: v_mov_b32_e32 v4, s0
				; GFX906-NEXT: v_mov_b32_e32 v3, v1
				; GFX906-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
				; GFX906-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
				; GFX906-NEXT: s_endpgm
				;
				; GFX908-LABEL: scalar_to_vector_v8f16:
				; GFX908: ; %bb.0: ; %entry
				; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
				; GFX908-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
				; GFX908-NEXT: v_lshlrev_b32_e32 v0, 4, v0
				; GFX908-NEXT: s_waitcnt lgkmcnt(0)
				; GFX908-NEXT: s_lshr_b32 s4, s0, 16
				; GFX908-NEXT: v_mov_b32_e32 v1, s0
				; GFX908-NEXT: s_pack_ll_b32_b16 s0, s0, s4
				; GFX908-NEXT: v_mov_b32_e32 v6, s3
				; GFX908-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
				; GFX908-NEXT: v_mov_b32_e32 v2, s1
				; GFX908-NEXT: v_mov_b32_e32 v4, s0
				; GFX908-NEXT: v_mov_b32_e32 v3, v1
				; GFX908-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
				; GFX908-NEXT: flat_store_dwordx4 v[5:6], v[1:4]
				; GFX908-NEXT: s_endpgm
				;
				; GFX90A-LABEL: scalar_to_vector_v8f16:
				; GFX90A: ; %bb.0: ; %entry
				; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
				; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
				; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 4, v0
				; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
				; GFX90A-NEXT: s_lshr_b32 s4, s0, 16
				; GFX90A-NEXT: v_mov_b32_e32 v2, s0
				; GFX90A-NEXT: s_pack_ll_b32_b16 s0, s0, s4
				; GFX90A-NEXT: v_mov_b32_e32 v1, s3
				; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
				; GFX90A-NEXT: v_mov_b32_e32 v3, s1
				; GFX90A-NEXT: v_mov_b32_e32 v5, s0
				; GFX90A-NEXT: v_mov_b32_e32 v4, v2
				; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
				; GFX90A-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
				; GFX90A-NEXT: s_endpgm
				entry:
				%val.1.float = extractelement <2 x float> %in, i64 0
				%val.2.vec2.half = bitcast float %val.1.float to <2 x half>
				%val.3.vec8.half = shufflevector <2 x half> %val.2.vec2.half, <2 x half> %val.2.vec2.half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>

				%val.4.vec4.float = shufflevector <2 x float> %in, <2 x float> %in, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
				%val.5.vec8.half = bitcast <4 x float> %val.4.vec4.float to <8 x half>

				%val.6.vec8.half = shufflevector <8 x half> %val.5.vec8.half, <8 x half> %val.3.vec8.half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>

				%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
				%tid.ext = sext i32 %tid to i64
				%out.gep = getelementptr inbounds <8 x half>, <8 x half>* %out, i64 %tid.ext
				store <8 x half> %val.6.vec8.half, <8 x half>* %out.gep, align 16

				ret void
				}

				declare i32 @llvm.amdgcn.workitem.id.x() #1

				attributes #0 = { nounwind }
				attributes #1 = { nounwind readnone }

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Fix scalar_to_vector for v8i16/v8f16
ClosedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Revision Contents

Diff 426278

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Fix scalar_to_vector for v8i16/v8f16 ClosedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Revision Contents

Diff 426278

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll

[AMDGPU] Fix scalar_to_vector for v8i16/v8f16
ClosedPublic