This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/trunk/
-
trunk/
-
lib/Target/AMDGPU/
-
Target/
-
AMDGPU/
-
SIISelLowering.cpp
-
test/CodeGen/AMDGPU/
-
CodeGen/
-
AMDGPU/
-
dagcombine-setcc-select.ll
-
indirect-addressing-si-noopt.ll

Differential D54358

[AMDGPU] Disable DAG combine at -O0
ClosedPublic

Authored by rampitec on Nov 9 2018, 2:37 PM.

Download Raw Diff

Details

Reviewers

scott.linder
arsenm
nhaehnle

Commits

rG443a7f97882d: [AMDGPU] Disable DAG combine at -O0
rL347659: [AMDGPU] Disable DAG combine at -O0

Diff Detail

Repository: rL LLVM

Event Timeline

rampitec created this revision. · Nov 9 2018, 2:37 PM

Herald added subscribers: t-tye, tpr, dstuttard and 4 others. · View Herald Transcript · Nov 9 2018, 2:37 PM

arsenm added inline comments. · Nov 9 2018, 2:50 PM

lib/Target/AMDGPU/SIISelLowering.cpp
8581–8582 ↗	(On Diff #173448)	I don't see why we would bother disabling specifically the target combines at -O0, but not all of them. I thought combines were already partially skipped at -O0?

rampitec added inline comments. · Nov 9 2018, 2:57 PM

lib/Target/AMDGPU/SIISelLowering.cpp
8581–8582 ↗	(On Diff #173448)	I do not see it. Moreover, I see that the combiner runs regardless of optimization level.

Rebased.

Sorry for taking so long to look at this. I am running some OCL conformance tests to get an idea of what impact this will have in terms of regressions, but if we believe this is the correct thing to do I don't think that should have any impact on when it is merged anyway.

scott.linder added inline comments. · Nov 15 2018, 12:36 PM

lib/Target/AMDGPU/SIISelLowering.cpp

8704–8721 ↗

(On Diff #173939)

The only regression I see is for something of the form:

target triple = "amdgcn-amd-amdhsa"

; Function Attrs: noinline optnone
define void @spam() #0 {
  %tmp = load <3 x i16>, <3 x i16> addrspace(5)* undef, align 8
  %tmp1 = insertelement <3 x i16> %tmp, i16 0, i64 0
  store <3 x i16> %tmp1, <3 x i16> addrspace(5)* undef, align 8
  ret void
}

attributes #0 = { noinline optnone "target-cpu"="fiji" }

For which we produce:

Optimized legalized selection DAG: %bb.0 'spam:'
SelectionDAG has 19 nodes:
  t0: ch = EntryToken
          t43: v2i16 = scalar_to_vector Constant:i16<0>
        t57: i32 = bitcast t43
      t68: ch = store<(store 2 into `<3 x i16> addrspace(5)* undef`, align 8, addrspace 5), trunc to i16> t40:1, t57, undef:i32, undef:i32
              t41: v2i32 = BUILD_VECTOR t40, t40
            t63: i64 = bitcast t41
          t36: i64 = srl t63, Constant:i32<32>
        t37: i16 = truncate t36
      t21: ch = store<(store 2 into `<3 x i16> addrspace(5)* undef` + 4, align 4, addrspace 5)> t40:1, t37, undef:i32, undef:i32
    t22: ch = TokenFactor t68, t21
    t9: i64,ch = CopyFromReg t0, Register:i64 %0
  t11: ch,glue = CopyToReg t22, Register:i64 $sgpr30_sgpr31, t9
  t40: i32,ch = load<(load 4 from `<3 x i16> addrspace(5)* undef`, align 8, addrspace 5)> t0, undef:i32, undef:i32
  t12: ch = RET_FLAG t11, Register:i64 $sgpr30_sgpr31, t11:1

And then fail to select t43: v2i16 = scalar_to_vector Constant:i16<0>

rampitec marked an inline comment as done. · Nov 19 2018, 11:58 AM

rampitec added inline comments.

lib/Target/AMDGPU/SIISelLowering.cpp
8704–8721 ↗	(On Diff #173939)	The fix for it is here: D54718.

Ping.

LGTM; if there are other regressions we can address them as they come up, but I don't see any major ones now.

This revision is now accepted and ready to land. · Nov 27 2018, 7:10 AM

Closed by commit rL347659: [AMDGPU] Disable DAG combine at -O0 (authored by rampitec). · Explain Why · Nov 27 2018, 7:16 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

AMDGPU/

SIISelLowering.cpp

11 lines

test/

CodeGen/

AMDGPU/

dagcombine-setcc-select.ll

60 lines

indirect-addressing-si-noopt.ll

10 lines

Diff 175479

llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 8,631 Lines • ▼ Show 20 Lines	if (Cmp1 == APFloat::cmpGreaterThan)
return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));		return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));

return SDValue(CSrc, 0);		return SDValue(CSrc, 0);
}		}


SDValue SITargetLowering::PerformDAGCombine(SDNode *N,		SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {		DAGCombinerInfo &DCI) const {
		if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
		return SDValue();

switch (N->getOpcode()) {		switch (N->getOpcode()) {
default:		default:
return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);		return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
case ISD::ADD:		case ISD::ADD:
return performAddCombine(N, DCI);		return performAddCombine(N, DCI);
case ISD::SUB:		case ISD::SUB:
return performSubCombine(N, DCI);		return performSubCombine(N, DCI);
case ISD::ADDCARRY:		case ISD::ADDCARRY:
Show All 9 Lines	SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FMINNUM:		case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:		case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:		case ISD::FMINNUM_IEEE:
case ISD::SMAX:		case ISD::SMAX:
case ISD::SMIN:		case ISD::SMIN:
case ISD::UMAX:		case ISD::UMAX:
case ISD::UMIN:		case ISD::UMIN:
case AMDGPUISD::FMIN_LEGACY:		case AMDGPUISD::FMIN_LEGACY:
case AMDGPUISD::FMAX_LEGACY: {		case AMDGPUISD::FMAX_LEGACY:
if (//DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
getTargetMachine().getOptLevel() > CodeGenOpt::None)
return performMinMaxCombine(N, DCI);		return performMinMaxCombine(N, DCI);
break;
}
case ISD::FMA:		case ISD::FMA:
return performFMACombine(N, DCI);		return performFMACombine(N, DCI);
case ISD::LOAD: {		case ISD::LOAD: {
if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))		if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
return Widended;		return Widended;
LLVM_FALLTHROUGH;		LLVM_FALLTHROUGH;
}		}
case ISD::STORE:		case ISD::STORE:
▲ Show 20 Lines • Show All 720 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/dagcombine-setcc-select.ll

	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -O0 < %s \| FileCheck -enable-var-scope -check-prefix=GCN %s			; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefix=GCN %s

	; GCN-LABEL: {{^}}eq_t:			; GCN-LABEL: {{^}}eq_t:
	; GCN-DAG: s_load_dword [[X:s[0-9]+]]			; GCN-DAG: s_load_dword [[X:s[0-9]+]]
	; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0			; GCN: v_cmp_lt_f32_e{{32\|64}} [[CC:s\[[0-9]+:[0-9]+\]\|vcc]], [[X]], 1.0{{$}}
	; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
	; GCN: v_cmp_lt_f32_e{{32\|64}} [[CC:s\[[0-9]+:[0-9]+\]\|vcc]], [[X]], [[VONE]]{{$}}
	; GCN-NOT: 0xddd5			; GCN-NOT: 0xddd5
	; GCN-NOT: v_cndmask_b32			; GCN-NOT: v_cndmask_b32
	; GCN-NOT: v_cmp_eq_u32			; GCN-NOT: v_cmp_eq_u32
	; GCN-NOT: v_cndmask_b32			; GCN: v_cndmask_b32_e{{32\|64}} [[RES:v[0-9]+]], 2.0, 4.0, [[CC]]
	; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
	; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
	; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
	; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
	; GCN: v_cndmask_b32_e{{32\|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC]]
	; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}			; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
	define amdgpu_kernel void @eq_t(float %x) {			define amdgpu_kernel void @eq_t(float %x) {
	%c1 = fcmp olt float %x, 1.0			%c1 = fcmp olt float %x, 1.0
	%s1 = select i1 %c1, i32 56789, i32 1			%s1 = select i1 %c1, i32 56789, i32 1
	%c2 = icmp eq i32 %s1, 56789			%c2 = icmp eq i32 %s1, 56789
	%s2 = select i1 %c2, float 4.0, float 2.0			%s2 = select i1 %c2, float 4.0, float 2.0
	store float %s2, float* undef, align 4			store float %s2, float* undef, align 4
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}ne_t:			; GCN-LABEL: {{^}}ne_t:
	; GCN-DAG: s_load_dword [[X:s[0-9]+]]			; GCN-DAG: s_load_dword [[X:s[0-9]+]]
	; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0			; GCN: v_cmp_lt_f32_e{{32\|64}} [[CC:s\[[0-9]+:[0-9]+\]\|vcc]], [[X]], 1.0{{$}}
	; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
	; GCN: v_cmp_lt_f32_e{{32\|64}} [[CC:s\[[0-9]+:[0-9]+\]\|vcc]], [[X]], [[VONE]]{{$}}
	; GCN-NOT: 0xddd5			; GCN-NOT: 0xddd5
	; GCN-NOT: v_cndmask_b32			; GCN-NOT: v_cndmask_b32
	; GCN-NOT: v_cmp_eq_u32			; GCN-NOT: v_cmp_eq_u32
	; GCN-NOT: v_cndmask_b32			; GCN: v_cndmask_b32_e{{32\|64}} [[RES:v[0-9]+]], 4.0, 2.0, [[CC]]
	; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
	; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
	; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
	; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
	; GCN: v_cndmask_b32_e{{32\|64}} [[RES:v[0-9]+]], [[VFOUR]], [[VTWO]], [[CC]]
	; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}			; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
	define amdgpu_kernel void @ne_t(float %x) {			define amdgpu_kernel void @ne_t(float %x) {
	%c1 = fcmp olt float %x, 1.0			%c1 = fcmp olt float %x, 1.0
	%s1 = select i1 %c1, i32 56789, i32 1			%s1 = select i1 %c1, i32 56789, i32 1
	%c2 = icmp ne i32 %s1, 56789			%c2 = icmp ne i32 %s1, 56789
	%s2 = select i1 %c2, float 4.0, float 2.0			%s2 = select i1 %c2, float 4.0, float 2.0
	store float %s2, float* undef, align 4			store float %s2, float* undef, align 4
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}eq_f:			; GCN-LABEL: {{^}}eq_f:
	; GCN-DAG: s_load_dword [[X:s[0-9]+]]			; GCN-DAG: s_load_dword [[X:s[0-9]+]]
	; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0			; GCN: v_cmp_lt_f32_e{{32\|64}} [[CC:s\[[0-9]+:[0-9]+\]\|vcc]], [[X]], 1.0{{$}}
	; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
	; GCN: v_cmp_lt_f32_e{{32\|64}} [[CC:s\[[0-9]+:[0-9]+\]\|vcc]], [[X]], [[VONE]]{{$}}
	; GCN-NOT: 0xddd5			; GCN-NOT: 0xddd5
	; GCN-NOT: v_cndmask_b32			; GCN-NOT: v_cndmask_b32
	; GCN-NOT: v_cmp_eq_u32			; GCN-NOT: v_cmp_eq_u32
	; GCN-NOT: v_cndmask_b32			; GCN: v_cndmask_b32_e{{32\|64}} [[RES:v[0-9]+]], 4.0, 2.0, [[CC]]
	; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
	; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
	; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
	; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
	; GCN: v_cndmask_b32_e{{32\|64}} [[RES:v[0-9]+]], [[VFOUR]], [[VTWO]], [[CC]]
	; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}			; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
	define amdgpu_kernel void @eq_f(float %x) {			define amdgpu_kernel void @eq_f(float %x) {
	%c1 = fcmp olt float %x, 1.0			%c1 = fcmp olt float %x, 1.0
	%s1 = select i1 %c1, i32 1, i32 56789			%s1 = select i1 %c1, i32 1, i32 56789
	%c2 = icmp eq i32 %s1, 56789			%c2 = icmp eq i32 %s1, 56789
	%s2 = select i1 %c2, float 4.0, float 2.0			%s2 = select i1 %c2, float 4.0, float 2.0
	store float %s2, float* undef, align 4			store float %s2, float* undef, align 4
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}ne_f:			; GCN-LABEL: {{^}}ne_f:
	; GCN-DAG: s_load_dword [[X:s[0-9]+]]			; GCN-DAG: s_load_dword [[X:s[0-9]+]]
	; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0			; GCN: v_cmp_lt_f32_e{{32\|64}} [[CC:s\[[0-9]+:[0-9]+\]\|vcc]], [[X]], 1.0{{$}}
	; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
	; GCN: v_cmp_lt_f32_e{{32\|64}} [[CC:s\[[0-9]+:[0-9]+\]\|vcc]], [[X]], [[VONE]]{{$}}
	; GCN-NOT: 0xddd5			; GCN-NOT: 0xddd5
	; GCN-NOT: v_cndmask_b32			; GCN-NOT: v_cndmask_b32
	; GCN-NOT: v_cmp_eq_u32			; GCN-NOT: v_cmp_eq_u32
	; GCN-NOT: v_cndmask_b32			; GCN: v_cndmask_b32_e{{32\|64}} [[RES:v[0-9]+]], 2.0, 4.0, [[CC]]
	; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
	; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
	; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
	; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
	; GCN: v_cndmask_b32_e{{32\|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC]]
	; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}			; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
	define amdgpu_kernel void @ne_f(float %x) {			define amdgpu_kernel void @ne_f(float %x) {
	%c1 = fcmp olt float %x, 1.0			%c1 = fcmp olt float %x, 1.0
	%s1 = select i1 %c1, i32 1, i32 56789			%s1 = select i1 %c1, i32 1, i32 56789
	%c2 = icmp ne i32 %s1, 56789			%c2 = icmp ne i32 %s1, 56789
	%s2 = select i1 %c2, float 4.0, float 2.0			%s2 = select i1 %c2, float 4.0, float 2.0
	store float %s2, float* undef, align 4			store float %s2, float* undef, align 4
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}different_constants:			; GCN-LABEL: {{^}}different_constants:
	; GCN-DAG: s_load_dword [[X:s[0-9]+]]			; GCN: v_mov_b32_e32 [[RES:v[0-9]+]], 2.0
	; GCN-DAG: s_mov_b32 [[SONE:s[0-9]+]], 1.0
	; GCN-DAG: v_mov_b32_e32 [[VONE:v[0-9]+]], [[SONE]]
	; GCN-DAG: v_cmp_lt_f32_e{{32\|64}} [[CC1:s\[[0-9]+:[0-9]+\]\|vcc]], [[X]], [[VONE]]{{$}}
	; GCN-DAG: v_cndmask_b32_e{{32\|64}} [[CND1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}, [[CC1]]
	; GCN-DAG: v_cmp_eq_u32_e{{32\|64}} [[CC2:s\[[0-9]+:[0-9]+\]\|vcc]], s{{[0-9]+}}, v{{[0-9]+}}{{$}}
	; GCN-DAG: s_mov_b32 [[STWO:s[0-9]+]], 2.0
	; GCN-DAG: v_mov_b32_e32 [[VTWO:v[0-9]+]], [[STWO]]
	; GCN-DAG: s_mov_b32 [[SFOUR:s[0-9]+]], 4.0
	; GCN-DAG: v_mov_b32_e32 [[VFOUR:v[0-9]+]], [[SFOUR]]
	; GCN: v_cndmask_b32_e{{32\|64}} [[RES:v[0-9]+]], [[VTWO]], [[VFOUR]], [[CC2]]
	; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}			; GCN: store_dword v[{{[0-9:]+}}], [[RES]]{{$}}
	define amdgpu_kernel void @different_constants(float %x) {			define amdgpu_kernel void @different_constants(float %x) {
	%c1 = fcmp olt float %x, 1.0			%c1 = fcmp olt float %x, 1.0
	%s1 = select i1 %c1, i32 56789, i32 1			%s1 = select i1 %c1, i32 56789, i32 1
	%c2 = icmp eq i32 %s1, 5678			%c2 = icmp eq i32 %s1, 5678
	%s2 = select i1 %c2, float 4.0, float 2.0			%s2 = select i1 %c2, float 4.0, float 2.0
	store float %s2, float* undef, align 4			store float %s2, float* undef, align 4
	ret void			ret void
	}			}

llvm/trunk/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll

	Show All 23 Lines
	; extract with undef index.			; extract with undef index.

	; CHECK-LABEL: {{^}}extract_adjacent_blocks:			; CHECK-LABEL: {{^}}extract_adjacent_blocks:
	; CHECK: s_load_dword [[ARG:s[0-9]+]]			; CHECK: s_load_dword [[ARG:s[0-9]+]]
	; CHECK: s_cmp_lg_u32			; CHECK: s_cmp_lg_u32
	; CHECK: s_cbranch_scc1 [[BB4:BB[0-9]+_[0-9]+]]			; CHECK: s_cbranch_scc1 [[BB4:BB[0-9]+_[0-9]+]]

	; CHECK: buffer_load_dwordx4			; CHECK: buffer_load_dwordx4
	; CHECK: v_cndmask_b32_e64			; CHECK: s_mov_b32 m0,
	; CHECK: v_cndmask_b32_e64			; CHECK: v_movrels_b32_e32
	; CHECK: v_cndmask_b32_e64

	; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]			; CHECK: s_branch [[ENDBB:BB[0-9]+_[0-9]+]]

	; CHECK: [[BB4]]:			; CHECK: [[BB4]]:
	; CHECK: buffer_load_dwordx4			; CHECK: buffer_load_dwordx4
	; CHECK: v_cndmask_b32_e64			; CHECK: s_mov_b32 m0,
	; CHECK: v_cndmask_b32_e64			; CHECK: v_movrels_b32_e32
	; CHECK: v_cndmask_b32_e64

	; CHECK: [[ENDBB]]:			; CHECK: [[ENDBB]]:
	; CHECK: buffer_store_dword			; CHECK: buffer_store_dword
	; CHECK: s_endpgm			; CHECK: s_endpgm

	define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 {			define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 {
	bb:			bb:
	%tmp = icmp eq i32 %arg, 0			%tmp = icmp eq i32 %arg, 0
	Show All 19 Lines