This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Cast sub-dword elements to i32 in concat_vectors
ClosedPublic

Authored by rampitec on Jan 9 2023, 3:05 PM.

Download Raw Diff

Details

Reviewers

kerbowa
arsenm

Commits

rGc8ed36281a92: [AMDGPU] Cast sub-dword elements to i32 in concat_vectors

Summary

This produces better code by avoiding repacking in some cases.

Fixes: SWDEV-373436

Diff Detail

Event Timeline

rampitec created this revision. Jan 9 2023, 3:05 PM

Herald added a project: Restricted Project. · View Herald Transcript · Jan 9 2023, 3:05 PM

Herald added subscribers: kosarev, foad, hiraditya and 5 others. · View Herald Transcript

rampitec requested review of this revision. Jan 9 2023, 3:05 PM

Herald added a project: Restricted Project. · View Herald Transcript · Jan 9 2023, 3:05 PM

Herald added a subscriber: wdng. · View Herald Transcript

Commit message? Can?

llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
Line 2315: Not sure we'll reach here with i8 vectors

This revision is now accepted and ready to land. Jan 9 2023, 3:15 PM

rampitec retitled this revision from "[AMDGPU] Can sub-dword elements to i32 in concat_vectors" to "[AMDGPU] Cast sub-dword elements to i32 in concat_vectors". Jan 9 2023, 3:17 PM

rampitec marked an inline comment as done. Jan 9 2023, 3:21 PM

rampitec added inline comments.

llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll
Line 2315: Not now, but in case we make it legal it is better to have a test.

This revision was landed with ongoing or failed builds. Jan 9 2023, 3:36 PM

Closed by commit rGc8ed36281a92: [AMDGPU] Cast sub-dword elements to i32 in concat_vectors (authored by rampitec). · Explain Why

This revision was automatically updated to reflect the committed changes.

rampitec marked an inline comment as done.

rampitec added a commit: rGc8ed36281a92: [AMDGPU] Cast sub-dword elements to i32 in concat_vectors.

Harbormaster completed remote builds in B206633: Diff 487568. Jan 9 2023, 4:29 PM

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPUISelLowering.cpp

29 lines

test/

CodeGen/

AMDGPU/

vector_shuffle.packed.ll

52 lines

Diff 487568

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 1,338 Lines • ▼ Show 20 Lines	if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS \|\|
return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());		return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
}		}
return SDValue();		return SDValue();
}		}

SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,		SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
SmallVector<SDValue, 8> Args;		SmallVector<SDValue, 8> Args;
		SDLoc SL(Op);

EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
if (VT == MVT::v4i16 \|\| VT == MVT::v4f16) {		if (VT.getVectorElementType().getSizeInBits() < 32) {
SDLoc SL(Op);		unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));		if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));		unsigned NewNumElt = OpBitSize / 32;
		EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
		: EVT::getVectorVT(*DAG.getContext(),
		MVT::i32, NewNumElt);
		for (const SDUse &U : Op->ops()) {
		SDValue In = U.get();
		SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
		if (NewNumElt > 1)
		DAG.ExtractVectorElements(NewIn, Args);
		else
		Args.push_back(NewIn);
		}

SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });		EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
		NewNumElt * Op.getNumOperands());
		SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
return DAG.getNode(ISD::BITCAST, SL, VT, BV);		return DAG.getNode(ISD::BITCAST, SL, VT, BV);
}		}
		}

for (const SDUse &U : Op->ops())		for (const SDUse &U : Op->ops())
DAG.ExtractVectorElements(U.get(), Args);		DAG.ExtractVectorElements(U.get(), Args);

return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);		return DAG.getBuildVector(Op.getValueType(), SL, Args);
}		}

SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,		SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {

SmallVector<SDValue, 8> Args;		SmallVector<SDValue, 8> Args;
unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();		unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
EVT VT = Op.getValueType();		EVT VT = Op.getValueType();
▲ Show 20 Lines • Show All 3,580 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

Show First 20 Lines • Show All 2,267 Lines • ▼ Show 20 Lines	; GFX11-NEXT: s_setpc_b64 s[30:31]
store <8 x half> %shuffle, ptr addrspace(1) %out		store <8 x half> %shuffle, ptr addrspace(1) %out
ret void		ret void
}		}

define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {		define void @shuffle_v16f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
; GFX9-LABEL: shuffle_v16f16_concat:		; GFX9-LABEL: shuffle_v16f16_concat:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off		; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off		; GFX9-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX9-NEXT: s_mov_b32 s4, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_bfi_b32 v3, s4, v9, v9
; GFX9-NEXT: v_bfi_b32 v2, s4, v8, v8
; GFX9-NEXT: v_bfi_b32 v1, s4, v7, v7
; GFX9-NEXT: v_bfi_b32 v0, s4, v6, v6
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_bfi_b32 v9, s4, v13, v13
; GFX9-NEXT: v_bfi_b32 v8, s4, v12, v12
; GFX9-NEXT: v_bfi_b32 v7, s4, v11, v11
; GFX9-NEXT: v_bfi_b32 v6, s4, v10, v10
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16		; GFX9-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off		; GFX9-NEXT: s_waitcnt vmcnt(1)
		; GFX9-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
;		;
; GFX10-LABEL: shuffle_v16f16_concat:		; GFX10-LABEL: shuffle_v16f16_concat:
; GFX10: ; %bb.0:		; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0		; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_load_dwordx4 v[6:9], v[0:1], off		; GFX10-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX10-NEXT: global_load_dwordx4 v[10:13], v[2:3], off		; GFX10-NEXT: global_load_dwordx4 v[10:13], v[0:1], off
; GFX10-NEXT: s_waitcnt vmcnt(1)		; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_bfi_b32 v3, 0xffff, v9, v9		; GFX10-NEXT: global_store_dwordx4 v[4:5], v[6:9], off offset:16
; GFX10-NEXT: s_waitcnt vmcnt(0)		; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_bfi_b32 v13, 0xffff, v13, v13		; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off
; GFX10-NEXT: v_bfi_b32 v12, 0xffff, v12, v12
; GFX10-NEXT: v_bfi_b32 v11, 0xffff, v11, v11
; GFX10-NEXT: v_bfi_b32 v10, 0xffff, v10, v10
; GFX10-NEXT: v_bfi_b32 v2, 0xffff, v8, v8
; GFX10-NEXT: v_bfi_b32 v1, 0xffff, v7, v7
; GFX10-NEXT: v_bfi_b32 v0, 0xffff, v6, v6
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[10:13], off offset:16
; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0		; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: s_setpc_b64 s[30:31]		; GFX10-NEXT: s_setpc_b64 s[30:31]
;		;
; GFX11-LABEL: shuffle_v16f16_concat:		; GFX11-LABEL: shuffle_v16f16_concat:
; GFX11: ; %bb.0:		; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0		; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_load_b128 v[6:9], v[0:1], off		; GFX11-NEXT: global_load_b128 v[6:9], v[2:3], off
; GFX11-NEXT: global_load_b128 v[0:3], v[2:3], off		; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(1)		; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_bfi_b32 v9, 0xffff, v9, v9		; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off offset:16
; GFX11-NEXT: s_waitcnt vmcnt(0)		; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_bfi_b32 v3, 0xffff, v3, v3		; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off
; GFX11-NEXT: v_bfi_b32 v2, 0xffff, v2, v2
; GFX11-NEXT: v_bfi_b32 v1, 0xffff, v1, v1
; GFX11-NEXT: v_bfi_b32 v0, 0xffff, v0, v0
; GFX11-NEXT: v_bfi_b32 v8, 0xffff, v8, v8
; GFX11-NEXT: v_bfi_b32 v7, 0xffff, v7, v7
; GFX11-NEXT: v_bfi_b32 v6, 0xffff, v6, v6
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_store_b128 v[4:5], v[0:3], off offset:16
; GFX11-NEXT: global_store_b128 v[4:5], v[6:9], off
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0		; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_setpc_b64 s[30:31]		; GFX11-NEXT: s_setpc_b64 s[30:31]
%val0 = load <8 x half>, ptr addrspace(1) %arg0		%val0 = load <8 x half>, ptr addrspace(1) %arg0
%val1 = load <8 x half>, ptr addrspace(1) %arg1		%val1 = load <8 x half>, ptr addrspace(1) %arg1
%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>		%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x half> %shuffle, ptr addrspace(1) %out		store <16 x half> %shuffle, ptr addrspace(1) %out
ret void		ret void
}		}
		Inline comment (arsenm, marked done): Not sure we'll reach here with i8 vectors
		Inline comment (rampitec, author, marked done): Not now, but in case we make it legal it is better to have a test.

define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {		define void @shuffle_v32f16_concat(ptr addrspace(1) %arg0, ptr addrspace(1) %arg1, ptr addrspace(1) %out) {
; GFX9-LABEL: shuffle_v32f16_concat:		; GFX9-LABEL: shuffle_v32f16_concat:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off		; GFX9-NEXT: global_load_dwordx4 v[6:9], v[2:3], off
; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16		; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off		; GFX9-NEXT: global_load_dwordx4 v[14:17], v[0:1], off
▲ Show 20 Lines • Show All 526 Lines • Show Last 20 Lines