This is an archive of the discontinued LLVM Phabricator instance.

Differential D49255

AMDGPU: Split wide vectors of i16/f16 into 32-bit regs on calls
ClosedPublic

Authored by arsenm on Jul 12 2018, 10:37 AM.

Download Raw Diff

Details

Reviewers

rampitec

Summary

This improves code for the same reasons as scalarizing 32-bit
element vectors.

Diff Detail

Event Timeline

arsenm created this revision. Jul 12 2018, 10:37 AM

Herald added subscribers: t-tye, tpr, dstuttard and 4 others. · View Herald Transcript · Jul 12 2018, 10:37 AM

arsenm added a parent revision: D49254: AMDGPU: Scalarize vector argument types to calls. Jul 12 2018, 10:37 AM

LGTM

This revision is now accepted and ready to land. Jul 12 2018, 10:48 AM

r338418

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

SIISelLowering.cpp

26 lines

test/

CodeGen/

AMDGPU/

call-argument-types.ll

57 lines

mad-mix-lo.ll

25 lines

mul.i16.ll

2 lines

Diff 155215

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 696 Lines • ▼ Show 20 Lines	MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,		CallingConv::ID CC,
EVT VT) const {		EVT VT) const {
// TODO: Consider splitting all arguments into 32-bit pieces.		// TODO: Consider splitting all arguments into 32-bit pieces.
if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {		if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
EVT ScalarVT = VT.getScalarType();		EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();		unsigned Size = ScalarVT.getSizeInBits();
if (Size == 32 \|\| Size == 64)		if (Size == 32 \|\| Size == 64)
return ScalarVT.getSimpleVT();		return ScalarVT.getSimpleVT();

		if (Size == 16 &&
		Subtarget->has16BitInsts() &&
		isPowerOf2_32(VT.getVectorNumElements()))
		return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
}		}

return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);		return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}		}

unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,		unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,		CallingConv::ID CC,
EVT VT) const {		EVT VT) const {
if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {		if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
		unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();		EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();		unsigned Size = ScalarVT.getSizeInBits();

if (Size == 32 \|\| Size == 64)		if (Size == 32 \|\| Size == 64)
return VT.getVectorNumElements();		return NumElts;

		// FIXME: Fails to break down as we want with v3.
		if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
		return VT.getVectorNumElements() / 2;
}		}

return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);		return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}		}

unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(		unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
LLVMContext &Context, CallingConv::ID CC,		LLVMContext &Context, CallingConv::ID CC,
EVT VT, EVT &IntermediateVT,		EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const {		unsigned &NumIntermediates, MVT &RegisterVT) const {
if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {		if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
		unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();		EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();		unsigned Size = ScalarVT.getSizeInBits();
if (Size == 32 \|\| Size == 64) {		if (Size == 32 \|\| Size == 64) {
RegisterVT = ScalarVT.getSimpleVT();		RegisterVT = ScalarVT.getSimpleVT();
IntermediateVT = RegisterVT;		IntermediateVT = RegisterVT;
NumIntermediates = VT.getVectorNumElements();		NumIntermediates = NumElts;
		return NumIntermediates;
		}

		// FIXME: We should fix the ABI to be the same on targets without 16-bit
		// support, but unless we can properly handle 3-vectors, it will still be
		// inconsistent.
		if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) {
		RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
		IntermediateVT = RegisterVT;
		NumIntermediates = NumElts / 2;
return NumIntermediates;		return NumIntermediates;
}		}
}		}

return TargetLowering::getVectorTypeBreakdownForCallingConv(		return TargetLowering::getVectorTypeBreakdownForCallingConv(
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);		Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}		}

▲ Show 20 Lines • Show All 7,647 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/call-argument-types.ll

	Show All 18 Lines
	declare void @external_void_func_i64(i64) #0			declare void @external_void_func_i64(i64) #0
	declare void @external_void_func_v2i64(<2 x i64>) #0			declare void @external_void_func_v2i64(<2 x i64>) #0
	declare void @external_void_func_v3i64(<3 x i64>) #0			declare void @external_void_func_v3i64(<3 x i64>) #0
	declare void @external_void_func_v4i64(<4 x i64>) #0			declare void @external_void_func_v4i64(<4 x i64>) #0

	declare void @external_void_func_f16(half) #0			declare void @external_void_func_f16(half) #0
	declare void @external_void_func_f32(float) #0			declare void @external_void_func_f32(float) #0
	declare void @external_void_func_f64(double) #0			declare void @external_void_func_f64(double) #0
				declare void @external_void_func_v2f32(<2 x float>) #0

	declare void @external_void_func_v2i16(<2 x i16>) #0			declare void @external_void_func_v2i16(<2 x i16>) #0
	declare void @external_void_func_v2f16(<2 x half>) #0			declare void @external_void_func_v2f16(<2 x half>) #0
				declare void @external_void_func_v3i16(<3 x i16>) #0
				declare void @external_void_func_v3f16(<3 x half>) #0
				declare void @external_void_func_v4i16(<4 x i16>) #0
				declare void @external_void_func_v4f16(<4 x half>) #0

	declare void @external_void_func_v2i32(<2 x i32>) #0			declare void @external_void_func_v2i32(<2 x i32>) #0
	declare void @external_void_func_v3i32(<3 x i32>) #0			declare void @external_void_func_v3i32(<3 x i32>) #0
	declare void @external_void_func_v3i32_i32(<3 x i32>, i32) #0			declare void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
	declare void @external_void_func_v4i32(<4 x i32>) #0			declare void @external_void_func_v4i32(<4 x i32>) #0
	declare void @external_void_func_v8i32(<8 x i32>) #0			declare void @external_void_func_v8i32(<8 x i32>) #0
	declare void @external_void_func_v16i32(<16 x i32>) #0			declare void @external_void_func_v16i32(<16 x i32>) #0
	declare void @external_void_func_v32i32(<32 x i32>) #0			declare void @external_void_func_v32i32(<32 x i32>) #0
	▲ Show 20 Lines • Show All 276 Lines • ▼ Show 20 Lines
	; GCN: v_mov_b32_e32 v0, 4.0			; GCN: v_mov_b32_e32 v0, 4.0
	; GCN-NOT: v0			; GCN-NOT: v0
	; GCN: s_swappc_b64			; GCN: s_swappc_b64
	define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {			define amdgpu_kernel void @test_call_external_void_func_f32_imm() #0 {
	call void @external_void_func_f32(float 4.0)			call void @external_void_func_f32(float 4.0)
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}test_call_external_void_func_v2f32_imm:
				; GCN-DAG: v_mov_b32_e32 v0, 1.0
				; GCN-DAG: v_mov_b32_e32 v1, 2.0
				; GCN: s_swappc_b64
				define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
				call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
				ret void
				}

	; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:			; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:
	; GCN: v_mov_b32_e32 v0, 0{{$}}			; GCN: v_mov_b32_e32 v0, 0{{$}}
	; GCN: v_mov_b32_e32 v1, 0x40100000			; GCN: v_mov_b32_e32 v1, 0x40100000
	; GCN: s_swappc_b64			; GCN: s_swappc_b64
	define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {			define amdgpu_kernel void @test_call_external_void_func_f64_imm() #0 {
	call void @external_void_func_f64(double 4.0)			call void @external_void_func_f64(double 4.0)
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}test_call_external_void_func_v2i16:			; GCN-LABEL: {{^}}test_call_external_void_func_v2i16:
	; GFX9: buffer_load_dword v0			; GFX9: buffer_load_dword v0
	; GFX9-NOT: v0			; GFX9-NOT: v0
	; GFX9: s_swappc_b64			; GFX9: s_swappc_b64
	define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {			define amdgpu_kernel void @test_call_external_void_func_v2i16() #0 {
	%val = load <2 x i16>, <2 x i16> addrspace(1)* undef			%val = load <2 x i16>, <2 x i16> addrspace(1)* undef
	call void @external_void_func_v2i16(<2 x i16> %val)			call void @external_void_func_v2i16(<2 x i16> %val)
	ret void			ret void
	}			}

				; GCN-LABEL: {{^}}test_call_external_void_func_v3i16:
				; GFX9: buffer_load_dwordx2 v[0:1]
				; GFX9-NOT: v0
				; GFX9-NOT: v1
				; GFX9: s_swappc_b64
				define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
				%val = load <3 x i16>, <3 x i16> addrspace(1)* undef
				call void @external_void_func_v3i16(<3 x i16> %val)
				ret void
				}

				; FIXME: materialize constant directly in VGPR
				; GCN-LABEL: {{^}}test_call_external_void_func_v3i16_imm:
				; GFX9-DAG: s_mov_b32 [[K01:s[0-9]+]], 0x20001
				; GFX9-DAG: s_pack_ll_b32_b16 [[K23:s[0-9]+]], 3, s{{[0-9]+}}
				; GFX9: v_mov_b32_e32 v0, [[K01]]
				; GFX9: v_mov_b32_e32 v1, [[K23]]
				; GFX9: s_swappc_b64
				define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
				call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
				ret void
				}

				; GCN-LABEL: {{^}}test_call_external_void_func_v4i16:
				; GFX9: buffer_load_dwordx2 v[0:1]
				; GFX9-NOT: v0
				; GFX9-NOT: v1
				; GFX9: s_swappc_b64
				define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
				%val = load <4 x i16>, <4 x i16> addrspace(1)* undef
				call void @external_void_func_v4i16(<4 x i16> %val)
				ret void
				}

				; GCN-LABEL: {{^}}test_call_external_void_func_v4i16_imm:
				; GFX9-DAG: v_mov_b32_e32 v0, 0x20001
				; GFX9-DAG: v_mov_b32_e32 v1, 0x40003
				; GFX9: s_swappc_b64
				define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
				call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
				ret void
				}

	; GCN-LABEL: {{^}}test_call_external_void_func_v2f16:			; GCN-LABEL: {{^}}test_call_external_void_func_v2f16:
	; GFX9: buffer_load_dword v0			; GFX9: buffer_load_dword v0
	; GFX9-NOT: v0			; GFX9-NOT: v0
	; GFX9: s_swappc_b64			; GFX9: s_swappc_b64
	define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {			define amdgpu_kernel void @test_call_external_void_func_v2f16() #0 {
	%val = load <2 x half>, <2 x half> addrspace(1)* undef			%val = load <2 x half>, <2 x half> addrspace(1)* undef
	call void @external_void_func_v2f16(<2 x half> %val)			call void @external_void_func_v2f16(<2 x half> %val)
	ret void			ret void
	▲ Show 20 Lines • Show All 268 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/mad-mix-lo.ll

Show First 20 Lines • Show All 106 Lines • ▼ Show 20 Lines	define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
%src2.ext = fpext <3 x half> %src2 to <3 x float>		%src2.ext = fpext <3 x half> %src2 to <3 x float>
%result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)		%result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
%cvt.result = fptrunc <3 x float> %result to <3 x half>		%cvt.result = fptrunc <3 x float> %result to <3 x half>
ret <3 x half> %cvt.result		ret <3 x half> %cvt.result
}		}

; GCN-LABEL: {{^}}v_mad_mix_v4f32:		; GCN-LABEL: {{^}}v_mad_mix_v4f32:
; GCN: s_waitcnt		; GCN: s_waitcnt
; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]		; GFX9-NEXT: v_mad_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]		; GFX9-NEXT: v_mad_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]		; GFX9-NEXT: v_mad_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]		; GFX9-NEXT: v_mad_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mov_b32_e32 v0, v6		; GFX9-NEXT: v_mov_b32_e32 v0, v7
; GFX9-NEXT: v_mov_b32_e32 v1, v7		; GFX9-NEXT: v_mov_b32_e32 v1, v6
; GFX9-NEXT: s_setpc_b64		; GFX9-NEXT: s_setpc_b64
define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {		define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
%src0.ext = fpext <4 x half> %src0 to <4 x float>		%src0.ext = fpext <4 x half> %src0 to <4 x float>
%src1.ext = fpext <4 x half> %src1 to <4 x float>		%src1.ext = fpext <4 x half> %src1 to <4 x float>
%src2.ext = fpext <4 x half> %src2 to <4 x float>		%src2.ext = fpext <4 x half> %src2 to <4 x float>
%result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)		%result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
%cvt.result = fptrunc <4 x float> %result to <4 x half>		%cvt.result = fptrunc <4 x float> %result to <4 x half>
ret <4 x half> %cvt.result		ret <4 x half> %cvt.result
Show All 35 Lines	define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
%max = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %cvt.result, <3 x half> zeroinitializer)		%max = call <3 x half> @llvm.maxnum.v3f16(<3 x half> %cvt.result, <3 x half> zeroinitializer)
%clamp = call <3 x half> @llvm.minnum.v3f16(<3 x half> %max, <3 x half> <half 1.0, half 1.0, half 1.0>)		%clamp = call <3 x half> @llvm.minnum.v3f16(<3 x half> %max, <3 x half> <half 1.0, half 1.0, half 1.0>)
ret <3 x half> %clamp		ret <3 x half> %clamp
}		}

; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt:		; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt:
; GCN: s_waitcnt		; GCN: s_waitcnt
; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp		; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp		; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp		; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
		; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mov_b32_e32 v0, v6		; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v7		; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: s_setpc_b64		; GFX9-NEXT: s_setpc_b64
define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {		define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
%src0.ext = fpext <4 x half> %src0 to <4 x float>		%src0.ext = fpext <4 x half> %src0 to <4 x float>
%src1.ext = fpext <4 x half> %src1 to <4 x float>		%src1.ext = fpext <4 x half> %src1 to <4 x float>
%src2.ext = fpext <4 x half> %src2 to <4 x float>		%src2.ext = fpext <4 x half> %src2 to <4 x float>
%result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)		%result = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %src0.ext, <4 x float> %src1.ext, <4 x float> %src2.ext)
%cvt.result = fptrunc <4 x float> %result to <4 x half>		%cvt.result = fptrunc <4 x float> %result to <4 x half>
%max = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %cvt.result, <4 x half> zeroinitializer)		%max = call <4 x half> @llvm.maxnum.v4f16(<4 x half> %cvt.result, <4 x half> zeroinitializer)
▲ Show 20 Lines • Show All 77 Lines • ▼ Show 20 Lines	define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
%result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)		%result = tail call <3 x float> @llvm.fmuladd.v3f32(<3 x float> %src0.ext, <3 x float> %src1.ext, <3 x float> %src2.ext)
%max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)		%max = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %result, <3 x float> zeroinitializer)
%clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)		%clamp = call <3 x float> @llvm.minnum.v3f32(<3 x float> %max, <3 x float> <float 1.0, float 1.0, float 1.0>)
%cvt.result = fptrunc <3 x float> %clamp to <3 x half>		%cvt.result = fptrunc <3 x float> %clamp to <3 x half>
ret <3 x half> %cvt.result		ret <3 x half> %cvt.result
}		}

; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_precvt:		; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_precvt:
; GFX9: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp		; GFX9: v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; GFX9: v_mad_mix_f32 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp		; GFX9: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
		; GFX9: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
		; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp

; GFX9: v_cvt_f16_f32		; GFX9: v_cvt_f16_f32
; GFX9: v_cvt_f16_f32		; GFX9: v_cvt_f16_f32
; GFX9: v_cvt_f16_f32		; GFX9: v_cvt_f16_f32
; GFX9: v_cvt_f16_f32		; GFX9: v_cvt_f16_f32
define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {		define <4 x half> @v_mad_mix_v4f32_clamp_precvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
%src0.ext = fpext <4 x half> %src0 to <4 x float>		%src0.ext = fpext <4 x half> %src0 to <4 x float>
%src1.ext = fpext <4 x half> %src1 to <4 x float>		%src1.ext = fpext <4 x half> %src1 to <4 x float>
%src2.ext = fpext <4 x half> %src2 to <4 x float>		%src2.ext = fpext <4 x half> %src2 to <4 x float>
Show All 34 Lines

test/CodeGen/AMDGPU/mul.i16.ll

	Show First 20 Lines • Show All 84 Lines • ▼ Show 20 Lines
	; VI: v_mul_lo_u16_sdwa			; VI: v_mul_lo_u16_sdwa
	; VI: v_mul_lo_u16_e32			; VI: v_mul_lo_u16_e32
	; VI: v_mul_lo_u16_sdwa			; VI: v_mul_lo_u16_sdwa
	; VI: v_mul_lo_u16_e32			; VI: v_mul_lo_u16_e32
	; VI: v_or_b32_e32			; VI: v_or_b32_e32
	; VI: v_or_b32_e32			; VI: v_or_b32_e32

	; GFX9: s_waitcnt			; GFX9: s_waitcnt
	; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3
	; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2			; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
				; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3
	; GFX9-NEXT: s_setpc_b64			; GFX9-NEXT: s_setpc_b64
	define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) {			define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) {
	%r.val = mul <4 x i16> %a, %b			%r.val = mul <4 x i16> %a, %b
	ret <4 x i16> %r.val			ret <4 x i16> %r.val
	}			}