This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Fixed i64 add/sub used in lowering of i64 srem
AbandonedPublic

Authored by tpr on Mar 19 2019, 1:07 PM.

Download Raw Diff

Details

Reviewers

Summary

My commit rL356399 "[AMDGPU] Asm/disasm clamp modifier on vop3 int arithmetic"
broke a case of i64 srem being lowered. Fixed.

Change-Id: Id274ae6ac3c8687a23999ea239f383b37d812fab

Diff Detail

Repository

rL LLVM

Build Status

Buildable 29368
Build 29367: arc lint + arc unit

Event Timeline

tpr created this revision.Mar 19 2019, 1:07 PM

Herald added a project: Restricted Project. · View Herald TranscriptMar 19 2019, 1:07 PM

Herald added subscribers: llvm-commits, t-tye, dstuttard and 6 others. · View Herald Transcript

Harbormaster completed remote builds in B29368: Diff 191377.Mar 19 2019, 1:07 PM

tpr added a reviewer: michel.daenzer.Mar 19 2019, 1:09 PM

Testcase should be reducible. You can use the uaddo intrinsic directly.

The test is already reduced as much as I can. Removing anything in there makes the problem disappear. Constructing a new test case using llvm.uadd.with.overflow does not show the problem. Can we go with this test case?

In D59556#1436285, @tpr wrote:

The test is already reduced as much as I can. Removing anything in there makes the problem disappear. Constructing a new test case using llvm.uadd.with.overflow does not show the problem. Can we go with this test case?

I managed with this:

; Reduced reproducer: performs an unsigned 32-bit add-with-overflow on two
; volatile-loaded values, stores the sum to %out and the carry-out bit to
; %carryout, and additionally uses an `i1 false` constant in an unrelated call
; (see the note before %fmas).
define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr, float %dummy.val) #0 {
  ; NOTE(review): %tid and %tid.ext are not referenced by anything below in
  ; this snippet.
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = sext i32 %tid to i64
  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr
  %a = load volatile i32, i32 addrspace(1)* %a.gep, align 4
  %b = load volatile i32, i32 addrspace(1)* %b.gep, align 4
  ; The operation under test: element 0 of the result is the sum, element 1 is
  ; the overflow (carry-out) flag.
  %uadd0 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
  %val0 = extractvalue { i32, i1 } %uadd0, 0
  %carry0 = extractvalue { i32, i1 } %uadd0, 1
  store volatile i32 %val0, i32 addrspace(1)* %out, align 4
  store i1 %carry0, i1 addrspace(1)* %carryout

  ; Force a use of an i1 0 that will be materialized in a register,
  ; which will be selected before the uaddo (so its operand is
  ; replaced with the materialized node)
  %fmas = call float @llvm.amdgcn.div.fmas.f32(float %dummy.val, float %dummy.val, float %dummy.val, i1 false)
  store volatile float %fmas, float addrspace(1)* null
  ret void
}

declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1)
declare i32 @llvm.amdgcn.workitem.id.x() #1
declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }

arsenm mentioned this in D59608: [AMDGPU] Fix clamp bit DAG operand.Mar 20 2019, 12:48 PM

Thanks for the better test Matt. But I'll abandon this one in favor of Michael's improved fix D59608.

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

AMDGPUISelDAGToDAG.cpp

7 lines

test/

CodeGen/

AMDGPU/

sremi64.ll

40 lines

Diff 191377

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

	Show First 20 Lines • Show All 925 Lines • ▼ Show 20 Lines

	void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {			void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
	// The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned			// The name of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
	// carry out despite the _i32 name. These were renamed in VI to _U32.			// carry out despite the _i32 name. These were renamed in VI to _U32.
	// FIXME: We should probably rename the opcodes here.			// FIXME: We should probably rename the opcodes here.
	unsigned Opc = N->getOpcode() == ISD::UADDO ?			unsigned Opc = N->getOpcode() == ISD::UADDO ?
	AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;			AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;

	CurDAG->SelectNodeTo(N, Opc, N->getVTList(),			CurDAG->SelectNodeTo(
				N, Opc, N->getVTList(),
	{N->getOperand(0), N->getOperand(1),			{N->getOperand(0), N->getOperand(1),
	CurDAG->getConstant(0, {}, MVT::i1)/*clamp bit*/});			CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
	}			}

	void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {			void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
	SDLoc SL(N);			SDLoc SL(N);
	// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod			// src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp, omod
	SDValue Ops[10];			SDValue Ops[10];

	SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);			SelectVOP3Mods0(N->getOperand(1), Ops[1], Ops[0], Ops[6], Ops[7]);
	▲ Show 20 Lines • Show All 1,508 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/sremi64.ll

This file was added.

				; RUN: llc -march=amdgcn -mcpu=gfx600 -verify-machineinstrs <%s | FileCheck -check-prefixes=GCN,GFX678 %s
				; RUN: llc -march=amdgcn -mcpu=gfx704 -verify-machineinstrs <%s | FileCheck -check-prefixes=GCN,GFX678 %s
				; RUN: llc -march=amdgcn -mcpu=gfx802 -verify-machineinstrs <%s | FileCheck -check-prefixes=GCN,GFX678 %s
				; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs <%s | FileCheck -check-prefixes=GCN,GFX9 %s

				; GCN-LABEL: wrapper:
				; GFX678: v_add_{{[iu]}}32_e32
				; GFX9: v_add_co_u32_e32
				; GFX678: v_addc_u32_e32
				; GFX9: v_addc_co_u32_e32

				; This was lowering the i64 srem with V_ADD_I32_e64 etc ops without the required clamp bit operand.

				declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32 immarg)
				declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1 immarg, i1 immarg)
				declare void @llvm.amdgcn.tbuffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg, i32 immarg, i1 immarg, i1 immarg)

				; Test function: computes an i64 srem of two values assembled from buffer
				; loads, then stores a value selected on whether the remainder is zero.
				; %arg4 is passed as an operand to both tbuffer stores.
				define amdgpu_gs void @wrapper(i32 inreg %arg4) {
				main_body:
				; Dividend: the loaded 32 bits are inserted as element 1 of a <2 x i32>
				; (element 0 stays undef), then bitcast to i64.
				%tmp = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 4, i32 0)
				%tmp14 = bitcast float %tmp to i32
				%tmp15 = insertelement <2 x i32> undef, i32 %tmp14, i32 1
				%tmp16 = bitcast <2 x i32> %tmp15 to i64
				; Divisor: the loaded 32 bits are inserted as element 0, element 1 is
				; explicitly set to undef, then bitcast to i64.
				%tmp17 = call nsz float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> undef, i32 32, i32 0)
				%tmp18 = bitcast float %tmp17 to i32
				%tmp19 = insertelement <2 x i32> undef, i32 %tmp18, i32 0
				%tmp20 = insertelement <2 x i32> %tmp19, i32 undef, i32 1
				%tmp21 = bitcast <2 x i32> %tmp20 to i64
				; The i64 srem whose lowering this test exercises.
				%tmp22 = srem i64 %tmp16, %tmp21
				%tmp23 = icmp eq i64 %tmp22, 0
				; %tmp24 and %tmp25 compare the constant 0 with itself; they are kept
				; as-is from the reduced test case.
				%tmp24 = icmp eq i64 0, 0
				%tmp25 = icmp eq i64 0, 0
				%tmp26 = and i1 %tmp24, %tmp25
				%tmp27 = and i1 %tmp23, %tmp26
				; Selects 0 when %tmp27 is true, otherwise 1065353216 (0x3F800000, the
				; bit pattern of float 1.0).
				%tmp28 = select i1 %tmp27, i32 0, i32 1065353216
				call void @llvm.amdgcn.tbuffer.store.i32(i32 undef, <4 x i32> undef, i32 0, i32 64, i32 %arg4, i32 0, i32 4, i32 4, i1 true, i1 true)
				call void @llvm.amdgcn.tbuffer.store.i32(i32 %tmp28, <4 x i32> undef, i32 0, i32 56, i32 %arg4, i32 0, i32 4, i32 4, i1 true, i1 true)
				ret void
				}