This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] performMinMaxCombine should not optimize patterns of vectors to min3/max3
ClosedPublic

Authored by FarhanaAleen on Apr 3 2018, 11:31 AM.

Download Raw Diff

Details

Reviewers

Commits

rGe80aeac0f2f5: [AMDGPU] performMinMaxCombine should not optimize patterns of vectors to…
rL329131: [AMDGPU] performMinMaxCombine should not optimize patterns of vectors to…

Summary

There are no packed instructions for min3 or max3. So, performMinMaxCombine should not optimize vectors of f16 to min3/max3.

Diff Detail

Repository: rL LLVM

Event Timeline

FarhanaAleen created this revision. · Apr 3 2018, 11:31 AM

Herald added subscribers: t-tye, tpr, dstuttard and 5 others. · View Herald Transcript · Apr 3 2018, 11:31 AM

LGTM except needs check lines

test/CodeGen/AMDGPU/fmax3.ll
89 ↗	(On Diff #140834)	Needs check lines
test/CodeGen/AMDGPU/fmin3.ll
87 ↗	(On Diff #140834)	Ditto

This revision was not accepted when it landed; it landed in state Needs Review. · Apr 3 2018, 4:03 PM

Closed by commit rL329131: [AMDGPU] performMinMaxCombine should not optimize patterns of vectors to… (authored by faaleen). · Explain Why

This revision was automatically updated to reflect the committed changes.

LGTM

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AMDGPU/

SIISelLowering.cpp

2 lines

test/

CodeGen/

AMDGPU/

fmax3.ll

29 lines

fmin3.ll

29 lines

Diff 140879

llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,440 Lines • ▼ Show 20 Lines	SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
SDValue Op0 = N->getOperand(0);		SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);		SDValue Op1 = N->getOperand(1);

// Only do this if the inner op has one use since this will just increases		// Only do this if the inner op has one use since this will just increases
// register pressure for no benefit.		// register pressure for no benefit.


if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&		if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
VT != MVT::f64 &&		!VT.isVector() && VT != MVT::f64 &&
((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {		((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) {
// max(max(a, b), c) -> max3(a, b, c)		// max(max(a, b), c) -> max3(a, b, c)
// min(min(a, b), c) -> min3(a, b, c)		// min(min(a, b), c) -> min3(a, b, c)
if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {		if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
SDLoc DL(N);		SDLoc DL(N);
return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),		return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc),
DL,		DL,
N->getValueType(0),		N->getValueType(0),
▲ Show 20 Lines • Show All 1,151 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/fmax3.ll

Show First 20 Lines • Show All 78 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @test_fmax3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
%b = load volatile half, half addrspace(1)* %bptr, align 2		%b = load volatile half, half addrspace(1)* %bptr, align 2
%c = load volatile half, half addrspace(1)* %cptr, align 2		%c = load volatile half, half addrspace(1)* %cptr, align 2
%f0 = call half @llvm.maxnum.f16(half %a, half %b)		%f0 = call half @llvm.maxnum.f16(half %a, half %b)
%f1 = call half @llvm.maxnum.f16(half %c, half %f0)		%f1 = call half @llvm.maxnum.f16(half %c, half %f0)
store half %f1, half addrspace(1)* %out, align 2		store half %f1, half addrspace(1)* %out, align 2
ret void		ret void
}		}

		; Checks that performMinMaxCombine() does not combine vector max patterns into max3,
		; since there are no packed instructions for fmax3.
		; GCN-LABEL: {{^}}no_fmax3_v2f16:

		; SI: v_cvt_f16_f32_e32
		; SI: v_max_f32_e32
		; SI-NEXT: v_max_f32_e32
		; SI-NEXT: v_max3_f32
		; SI-NEXT: v_max3_f32

		; VI: v_max_f16_e32
		; VI-NEXT: v_max_f16_e32
		; VI-NEXT: v_max_f16_e32
		; VI-NEXT: v_max_f16_e32
		; VI-NEXT: v_max_f16_e32
		; VI-NEXT: v_max_f16_e32

		; GFX9: v_pk_max_f16
		; GFX9-NEXT: v_pk_max_f16
		; GFX9-NEXT: v_pk_max_f16
		define <2 x half> @no_fmax3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
		entry:
		%max = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
		%max1 = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %c, <2 x half> %max)
		%res = tail call fast <2 x half> @llvm.maxnum.v2f16(<2 x half> %max1, <2 x half> %d)
		ret <2 x half> %res
		}

declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.maxnum.f32(float, float) #1		declare float @llvm.maxnum.f32(float, float) #1
declare half @llvm.maxnum.f16(half, half) #1		declare half @llvm.maxnum.f16(half, half) #1
		declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>)

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }		attributes #1 = { nounwind readnone speculatable }

llvm/trunk/test/CodeGen/AMDGPU/fmin3.ll

Show First 20 Lines • Show All 76 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @test_fmin3_olt_1_f16(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #0 {
%b = load volatile half, half addrspace(1)* %bptr, align 2		%b = load volatile half, half addrspace(1)* %bptr, align 2
%c = load volatile half, half addrspace(1)* %cptr, align 2		%c = load volatile half, half addrspace(1)* %cptr, align 2
%f0 = call half @llvm.minnum.f16(half %a, half %b)		%f0 = call half @llvm.minnum.f16(half %a, half %b)
%f1 = call half @llvm.minnum.f16(half %c, half %f0)		%f1 = call half @llvm.minnum.f16(half %c, half %f0)
store half %f1, half addrspace(1)* %out, align 2		store half %f1, half addrspace(1)* %out, align 2
ret void		ret void
}		}

		; Checks that performMinMaxCombine() does not combine vector min patterns into min3,
		; since there are no packed instructions for fmin3.
		; GCN-LABEL: {{^}}no_fmin3_v2f16:

		; SI: v_cvt_f16_f32_e32
		; SI: v_min_f32_e32
		; SI-NEXT: v_min_f32_e32
		; SI-NEXT: v_min3_f32
		; SI-NEXT: v_min3_f32

		; VI: v_min_f16_e32
		; VI-NEXT: v_min_f16_e32
		; VI-NEXT: v_min_f16_e32
		; VI-NEXT: v_min_f16_e32
		; VI-NEXT: v_min_f16_e32
		; VI-NEXT: v_min_f16_e32

		; GFX9: v_pk_min_f16
		; GFX9: v_pk_min_f16
		; GFX9: v_pk_min_f16
		define <2 x half> @no_fmin3_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) {
		entry:
		%min = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
		%min1 = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %c, <2 x half> %min)
		%res = tail call fast <2 x half> @llvm.minnum.v2f16(<2 x half> %min1, <2 x half> %d)
		ret <2 x half> %res
		}

declare i32 @llvm.amdgcn.workitem.id.x() #1		declare i32 @llvm.amdgcn.workitem.id.x() #1
declare float @llvm.minnum.f32(float, float) #1		declare float @llvm.minnum.f32(float, float) #1
declare half @llvm.minnum.f16(half, half) #1		declare half @llvm.minnum.f16(half, half) #1
		declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>)

attributes #0 = { nounwind }		attributes #0 = { nounwind }
attributes #1 = { nounwind readnone speculatable }		attributes #1 = { nounwind readnone speculatable }