Diff 328660

llvm/lib/Target/AMDGPU/SIInstructions.td

	Show First 20 Lines • Show All 2,390 Lines • ▼ Show 20 Lines
	>;			>;

	// TODO: Should source modifiers be matched to v_pack_b32_f16?			// TODO: Should source modifiers be matched to v_pack_b32_f16?
	def : GCNPat <			def : GCNPat <
	(v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))),			(v2f16 (build_vector (f16 SReg_32:$src0), (f16 SReg_32:$src1))),
	(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)			(S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
	>;			>;

				def : GCNPat <
				(v2f16 (build_vector (f16 (bitconvert (i16 (trunc VGPR_32:$src0)))),
				(f16 (bitconvert (i16 (trunc VGPR_32:$src1)))))),
				(V_PACK_B32_F16_e64 SRCMODS.NONE, VGPR_32:$src0, SRCMODS.NONE, VGPR_32:$src1)
				[Inline comment — arsenm, not done] This isn't a simple bit-packing operation; v_pack_b32_f16 has FP output effects, such as flushing.
				[Inline comment — arsenm, done] I believe source modifiers should work as normal, so you can use the VOP3Mods complex patterns for the sources.
				>;

	} // End SubtargetPredicate = HasVOP3PInsts			} // End SubtargetPredicate = HasVOP3PInsts


	def : GCNPat <			def : GCNPat <
	(v2f16 (scalar_to_vector f16:$src0)),			(v2f16 (scalar_to_vector f16:$src0)),
	(COPY $src0)			(COPY $src0)
	>;			>;

	▲ Show 20 Lines • Show All 463 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll

Show First 20 Lines • Show All 173 Lines • ▼ Show 20 Lines	; GFX8-NEXT: s_setpc_b64 s[30:31]
ret void		ret void
}		}

define void @undef_lo2_v4f16(<2 x half> %arg0) {		define void @undef_lo2_v4f16(<2 x half> %arg0) {
; GFX9-LABEL: undef_lo2_v4f16:		; GFX9-LABEL: undef_lo2_v4f16:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0		; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0		; GFX9-NEXT: v_pack_b32_f16 v0, v0, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: ;;#ASMSTART		; GFX9-NEXT: ;;#ASMSTART
; GFX9-NEXT: ; use v[0:1]		; GFX9-NEXT: ; use v[0:1]
; GFX9-NEXT: ;;#ASMEND		; GFX9-NEXT: ;;#ASMEND
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
;		;
; GFX8-LABEL: undef_lo2_v4f16:		; GFX8-LABEL: undef_lo2_v4f16:
; GFX8: ; %bb.0:		; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
▲ Show 20 Lines • Show All 187 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll

	Show First 20 Lines • Show All 68 Lines • ▼ Show 20 Lines
	; SI-DAG: v_cvt_f16_f32_e32 v[[CVTHI:[0-9]+]], v[[A_F32_1]]			; SI-DAG: v_cvt_f16_f32_e32 v[[CVTHI:[0-9]+]], v[[A_F32_1]]
	; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[CVTHI]]			; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[CVTHI]]

	; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD			; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD

	; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]			; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

	; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]			; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
	; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]			; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
	; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]

	; GCN: buffer_store_dword v[[R_V2_F16]]			; GCN: buffer_store_dword v[[R_V2_F16]]

	define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(			define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
	<2 x half> addrspace(1)* %r,			<2 x half> addrspace(1)* %r,
	<2 x double> addrspace(1)* %a) {			<2 x double> addrspace(1)* %a) {
	entry:			entry:
	%a.val = load <2 x double>, <2 x double> addrspace(1)* %a			%a.val = load <2 x double>, <2 x double> addrspace(1)* %a
	▲ Show 20 Lines • Show All 112 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

	Show First 20 Lines • Show All 1,665 Lines • ▼ Show 20 Lines
	define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {			define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
	; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:			; GFX9-LABEL: v_insertelement_v4f16_dynamic_sgpr:
	; GFX9: ; %bb.0:			; GFX9: ; %bb.0:
	; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0			; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
	; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10			; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10
	; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0			; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0
	; GFX9-NEXT: s_waitcnt lgkmcnt(0)			; GFX9-NEXT: s_waitcnt lgkmcnt(0)
	; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]			; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
	; GFX9-NEXT: s_pack_ll_b32_b16 s5, s6, s6
	; GFX9-NEXT: s_mov_b32 s3, 0			; GFX9-NEXT: s_mov_b32 s3, 0
	; GFX9-NEXT: s_mov_b32 s2, 0xffff			; GFX9-NEXT: s_mov_b32 s2, 0xffff
	; GFX9-NEXT: s_lshl_b32 s4, s7, 4			; GFX9-NEXT: s_lshl_b32 s4, s7, 4
				; GFX9-NEXT: v_pack_b32_f16 v3, s6, s6
	; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4			; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s4
	; GFX9-NEXT: v_mov_b32_e32 v3, s5
	; GFX9-NEXT: v_mov_b32_e32 v4, s5
	; GFX9-NEXT: s_waitcnt vmcnt(0)			; GFX9-NEXT: s_waitcnt vmcnt(0)
	; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1			; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1
	; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0			; GFX9-NEXT: v_bfi_b32 v0, s2, v3, v0
	; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]			; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
	; GFX9-NEXT: s_endpgm			; GFX9-NEXT: s_endpgm
	;			;
	; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:			; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr:
	; VI: ; %bb.0:			; VI: ; %bb.0:
	; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0			; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
	; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10			; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10
	; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0			; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
	▲ Show 20 Lines • Show All 66 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/pack.v2f16.ll

; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s \| FileCheck --check-prefixes=GCN,GFX9 %s		; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=-flat-for-global -denormal-fp-math=preserve-sign -verify-machineinstrs < %s \| FileCheck --check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s		; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s		; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s


; GCN-LABEL: {{^}}s_pack_v2f16:		; GCN-LABEL: {{^}}s_pack_v2f16:
; GFX9: s_load_dword [[VAL0:s[0-9]+]]		; GFX9: s_load_dword [[VAL0:s[0-9]+]]
; GFX9: s_load_dword [[VAL1:s[0-9]+]]		; GFX9: s_load_dword [[VAL1:s[0-9]+]]
; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]		; GFX9: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[PACKED]]
; GFX9: ; use [[PACKED]]		; GFX9: ; use [[PACKED]]
define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 {		define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(4)* %in0, i32 addrspace(4)* %in1) #0 {
%val0 = load volatile i32, i32 addrspace(4)* %in0		%val0 = load volatile i32, i32 addrspace(4)* %in0
%val1 = load volatile i32, i32 addrspace(4)* %in1		%val1 = load volatile i32, i32 addrspace(4)* %in1
%lo.i = trunc i32 %val0 to i16		%lo.i = trunc i32 %val0 to i16
%hi.i = trunc i32 %val1 to i16		%hi.i = trunc i32 %val1 to i16
%lo = bitcast i16 %lo.i to half		%lo = bitcast i16 %lo.i to half
%hi = bitcast i16 %hi.i to half		%hi = bitcast i16 %hi.i to half
Show All 36 Lines	define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(4)* %in0) #0 {
call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0		call void asm sideeffect "; use $0", "s"(i32 %vec.i32) #0
ret void		ret void
}		}

; GCN-LABEL: {{^}}v_pack_v2f16:		; GCN-LABEL: {{^}}v_pack_v2f16:
; GFX9: global_load_dword [[VAL0:v[0-9]+]]		; GFX9: global_load_dword [[VAL0:v[0-9]+]]
; GFX9: global_load_dword [[VAL1:v[0-9]+]]		; GFX9: global_load_dword [[VAL1:v[0-9]+]]

; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]		; GFX9: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]]
; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
; GFX9: ; use [[PACKED]]		; GFX9: ; use [[PACKED]]
define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {		define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()		%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64		%tid.ext = sext i32 %tid to i64
%in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext		%in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
%in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext		%in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
%val0 = load volatile i32, i32 addrspace(1)* %in0.gep		%val0 = load volatile i32, i32 addrspace(1)* %in0.gep
%val1 = load volatile i32, i32 addrspace(1)* %in1.gep		%val1 = load volatile i32, i32 addrspace(1)* %in1.gep
%lo.i = trunc i32 %val0 to i16		%lo.i = trunc i32 %val0 to i16
%hi.i = trunc i32 %val1 to i16		%hi.i = trunc i32 %val1 to i16
%lo = bitcast i16 %lo.i to half		%lo = bitcast i16 %lo.i to half
%hi = bitcast i16 %hi.i to half		%hi = bitcast i16 %hi.i to half
%vec.0 = insertelement <2 x half> undef, half %lo, i32 0		%vec.0 = insertelement <2 x half> undef, half %lo, i32 0
%vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1		%vec.1 = insertelement <2 x half> %vec.0, half %hi, i32 1
%vec.i32 = bitcast <2 x half> %vec.1 to i32		%vec.i32 = bitcast <2 x half> %vec.1 to i32
call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0		call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
ret void		ret void
}		}

; GCN-LABEL: {{^}}v_pack_v2f16_user:		; GCN-LABEL: {{^}}v_pack_v2f16_user:
; GFX9: global_load_dword [[VAL0:v[0-9]+]]		; GFX9: global_load_dword [[VAL0:v[0-9]+]]
; GFX9: global_load_dword [[VAL1:v[0-9]+]]		; GFX9: global_load_dword [[VAL1:v[0-9]+]]

; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]		; GFX9: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]]
; GFX9: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]

; GFX9: v_add_u32_e32 v{{[0-9]+}}, 9, [[PACKED]]		; GFX9: v_add_u32_e32 v{{[0-9]+}}, 9, [[PACKED]]
define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {		define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()		%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64		%tid.ext = sext i32 %tid to i64
%in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext		%in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
%in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext		%in1.gep = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %tid.ext
%val0 = load volatile i32, i32 addrspace(1)* %in0.gep		%val0 = load volatile i32, i32 addrspace(1)* %in0.gep
▲ Show 20 Lines • Show All 123 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

Show First 20 Lines • Show All 107 Lines • ▼ Show 20 Lines
}		}

define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_35u5(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_35u5:		; GFX9-LABEL: shuffle_v4f16_35u5:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[2:3], off		; GFX9-NEXT: global_load_dword v4, v[2:3], off
; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4		; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0		; GFX9-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX9-NEXT: v_mov_b32_e32 v1, v4		; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 undef, i32 5>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_357u(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_357u:		; GFX9-LABEL: shuffle_v4f16_357u:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4		; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v6
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5		; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0		; GFX9-NEXT: v_pack_b32_f16 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_0101(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
▲ Show 20 Lines • Show All 243 Lines • ▼ Show 20 Lines
}		}

define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_2356(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2356:		; GFX9-LABEL: shuffle_v4f16_2356:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v0		; GFX9-NEXT: v_pack_b32_f16 v1, v0, v6
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, v4		; GFX9-NEXT: v_mov_b32_e32 v0, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 5, i32 6>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_5623(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5623:		; GFX9-LABEL: shuffle_v4f16_5623:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0		; GFX9-NEXT: v_pack_b32_f16 v0, v0, v6
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v4		; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 2, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_3456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_3456:		; GFX9-LABEL: shuffle_v4f16_3456:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_and_b32_sdwa v2, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_and_b32_sdwa v0, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v1		; GFX9-NEXT: v_lshl_or_b32 v0, v4, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v5, 16, v2		; GFX9-NEXT: v_pack_b32_f16 v1, v1, v5
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_5634(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5634:		; GFX9-LABEL: shuffle_v4f16_5634:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
		; GFX9-NEXT: s_waitcnt vmcnt(1)
		; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_pack_b32_f16 v0, v1, v5
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1		; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v2
; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 6, i32 3, i32 4>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_5734(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_5734:		; GFX9-LABEL: shuffle_v4f16_5734:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff		; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v5		; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
		; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v1, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_and_b32_sdwa v3, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_pack_b32_f16 v0, v2, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v1		; GFX9-NEXT: v_lshl_or_b32 v1, v4, 16, v3
; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 5, i32 7, i32 3, i32 4>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {		define <4 x i16> @shuffle_v4i16_2356(<4 x i16> addrspace(1)* %arg0, <4 x i16> addrspace(1)* %arg1) {
▲ Show 20 Lines • Show All 65 Lines • ▼ Show 20 Lines	; GFX9-NEXT: s_setpc_b64 s[30:31]
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_1100(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_1100:		; GFX9-LABEL: shuffle_v4f16_1100:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v1, v2, v0		; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 16, v0		; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1		; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1
; GFX9-NEXT: v_and_b32_e32 v0, v2, v3		; GFX9-NEXT: v_pack_b32_f16 v0, v2, v2
; GFX9-NEXT: v_lshl_or_b32 v0, v3, 16, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 1, i32 1, i32 0, i32 0>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_6161(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6161:		; GFX9-LABEL: shuffle_v4f16_6161:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[2:3], off offset:4		; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: global_load_dword v5, v[0:1], off		; GFX9-NEXT: global_load_dword v5, v[2:3], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v4		; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5		; GFX9-NEXT: v_pack_b32_f16 v0, v5, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, v1
; GFX9-NEXT: v_mov_b32_e32 v1, v0		; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 6, i32 1, i32 6, i32 1>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_2333(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_2333:		; GFX9-LABEL: shuffle_v4f16_2333:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0		; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1		; GFX9-NEXT: v_pack_b32_f16 v1, v1, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_6667(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_6667:		; GFX9-LABEL: shuffle_v4f16_6667:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0		; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1		; GFX9-NEXT: v_pack_b32_f16 v1, v1, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 2, i32 3, i32 3, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v8f16_0101(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
▲ Show 20 Lines • Show All 58 Lines • ▼ Show 20 Lines
}		}

; Shuffle test on two <8 x half> vectors producing a <4 x half> result.
; Mask <13, 14, 2, 3>: indices 13 and 14 select lanes 5 and 6 of %val1
; (indices 8..15 address the second operand); indices 2 and 3 select lanes
; 2 and 3 of %val0.
; In the expected output the first result dword is assembled from the high
; half of one loaded dword and the low half of the next; the second result
; dword is a plain copy of the dword loaded at offset 4 of %arg0.
define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v8f16_13_14_2_3(<8 x half> addrspace(1)* %arg0, <8 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v8f16_13_14_2_3:		; GFX9-LABEL: shuffle_v8f16_13_14_2_3:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8		; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off offset:8
; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4		; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:4
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(1)		; GFX9-NEXT: s_waitcnt vmcnt(1)
; GFX9-NEXT: v_and_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v5
; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v0		; GFX9-NEXT: v_pack_b32_f16 v0, v0, v6
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v1, v4		; GFX9-NEXT: v_mov_b32_e32 v1, v4
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0		%val0 = load <8 x half>, <8 x half> addrspace(1)* %arg0
%val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1		%val1 = load <8 x half>, <8 x half> addrspace(1)* %arg1
%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>		%shuffle = shufflevector <8 x half> %val0, <8 x half> %val1, <4 x i32> <i32 13, i32 14, i32 2, i32 3>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

; Shuffle test widening a <3 x half> input to a <4 x half> result.
; Mask <0, 1, 2, 2>: every index is below 3, so only %val0 is read; lane 2
; is duplicated into the last two result lanes. The expected output has a
; single load from %arg0 and builds the second result dword from v1 alone.
define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v3f16_0122(<3 x half> addrspace(1)* %arg0, <3 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v3f16_0122:		; GFX9-LABEL: shuffle_v3f16_0122:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v1		; GFX9-NEXT: v_pack_b32_f16 v1, v1, v1
; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0		%val0 = load <3 x half>, <3 x half> addrspace(1)* %arg0
%val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1		%val1 = load <3 x half>, <3 x half> addrspace(1)* %arg1
%shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>		%shuffle = shufflevector <3 x half> %val0, <3 x half> %val1, <4 x i32> <i32 0, i32 1, i32 2, i32 2>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

; Shuffle test widening a <2 x half> input to a <4 x half> result.
; Mask <0, 1, 1, 0>: every index is below 2, so only %val0 is read; the
; second result dword holds the two input lanes in swapped order.
; NOTE(review): the function name says 0122, but the mask is <0, 1, 1, 0>
; (index 2 would address lane 0 of %val1, which this mask never selects).
; Confirm which of the name or the mask is intended; changing the mask
; requires regenerating the check lines.
define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v2f16_0122(<2 x half> addrspace(1)* %arg0, <2 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v2f16_0122:		; GFX9-LABEL: shuffle_v2f16_0122:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v0, v[0:1], off		; GFX9-NEXT: global_load_dword v0, v[0:1], off
; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v0, 16, v1		; GFX9-NEXT: v_pack_b32_f16 v1, v1, v0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0		%val0 = load <2 x half>, <2 x half> addrspace(1)* %arg0
%val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1		%val1 = load <2 x half>, <2 x half> addrspace(1)* %arg1
%shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>		%shuffle = shufflevector <2 x half> %val0, <2 x half> %val1, <4 x i32> <i32 0, i32 1, i32 1, i32 0>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {		define <6 x half> @shuffle_v6f16_452367(<6 x half> addrspace(1)* %arg0, <6 x half> addrspace(1)* %arg1) {
▲ Show 20 Lines • Show All 66 Lines • ▼ Show 20 Lines
; Shuffle test mixing lanes of two <4 x half> vectors.
; Mask <0, 4, 5, 6>: lane 0 of %val0 followed by lanes 0, 1 and 2 of %val1
; (indices 4..7 address the second operand).
; In the expected output the first result dword pairs the low half of the
; %val0 load with the low half of the %val1 load; the second result dword
; pairs the high half of the first %val1 dword with the low half of the
; second %val1 dword.
define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {		define <4 x half> @shuffle_v4f16_0456(<4 x half> addrspace(1)* %arg0, <4 x half> addrspace(1)* %arg1) {
; GFX9-LABEL: shuffle_v4f16_0456:		; GFX9-LABEL: shuffle_v4f16_0456:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off		; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off		; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off
; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1		; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1
; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff
; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3		; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3
; GFX9-NEXT: v_and_b32_e32 v1, v0, v4		; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v4
; GFX9-NEXT: s_waitcnt vmcnt(0)		; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1		; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v5
; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1		; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2		; GFX9-NEXT: v_pack_b32_f16 v1, v1, v6
; GFX9-NEXT: s_setpc_b64 s[30:31]		; GFX9-NEXT: s_setpc_b64 s[30:31]
%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0		%val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0
%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1		%val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1
%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>		%shuffle = shufflevector <4 x half> %val0, <4 x half> %val1, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
ret <4 x half> %shuffle		ret <4 x half> %shuffle
}		}

define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out) {		define amdgpu_kernel void @shuffle_scalar_load_v8i32_0123(<8 x i32> addrspace(4)* %in, <4 x i32> addrspace(1)* %out) {
Show All 23 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Improve Codegen for build_vector
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 328660

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll

llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll

llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

llvm/test/CodeGen/AMDGPU/pack.v2f16.ll

llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Improve Codegen for build_vector
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 328660

llvm/lib/Target/AMDGPU/SIInstructions.td

llvm/test/CodeGen/AMDGPU/build-vector-packed-partial-undef.ll

llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll

llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll

llvm/test/CodeGen/AMDGPU/pack.v2f16.ll

llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll

[AMDGPU] Improve Codegen for build_vector
ClosedPublic