Diff 475723

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 3,196 Lines • ▼ Show 20 Lines	if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));		DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1), N->getOperand(1)));
}		}
}		}
}		}

if (VT != MVT::i64)		if (VT != MVT::i64)
return SDValue();		return SDValue();

		// fold (i64 (srl (add a, b), 32)) ->
		//   (i64 (zext (uaddo (i32 (trunc a)), (i32 (trunc b))).overflow))
		arsenm (Unsubmitted, Not Done): Should also move to generic code.
		Pierre-vh (Author, Unsubmitted, Done): What do you mean by "generic"? Do you mean not checking for specific types, and instead checking that the shift amount is half of the type's size in bits?
		arsenm (Unsubmitted, Not Done): Yes, and move it to DAGCombiner. You then just need to check that UADDO is legal for the target, or that the combine is running pre-legalization.
		arsenm (Unsubmitted, Done): This is missing the extends in the input and output.
		// iff a and b each have at least 32 known leading zero bits
		// (usually a and b are i32 -> i64 zero-extends)
		if (ShiftAmt == 32 && LHS.getOpcode() == ISD::ADD) {
		SDValue AddLHS = LHS->getOperand(0);
		SDValue AddRHS = LHS->getOperand(1);

		KnownBits AddLHSKnownBits = DAG.computeKnownBits(AddLHS);
		foad (Unsubmitted, Done): This could also accept 64-bit constants whose upper 32 bits are 0.
		KnownBits AddRHSKnownBits = DAG.computeKnownBits(AddRHS);
		if (AddLHSKnownBits.countMinLeadingZeros() >= 32 &&
		arsenm (Unsubmitted, Not Done): This should try to short-circuit the second known-bits call if the first one fails the countMinLeadingZeros check.
		AddRHSKnownBits.countMinLeadingZeros() >= 32) {

		// All users of the add must either be this shr, or truncs to i32.
		// If there are other users, don't do the transform.
		SmallVector<SDValue, 4> TruncsToReplace;
		bool CanCombine = true;
		for (SDNode *User : LHS->uses()) {
		arsenm (Unsubmitted, Done): Looking at uses is unusual, and I'm not sure why you're doing it.
		foad (Unsubmitted, Done): As mentioned below, the thinking is that this transform is not profitable unless every use either wants only the overflow bit or wants only the low 32 bits of the 64-bit result. Otherwise you might as well keep the full 64-bit add.
		Pierre-vh (Author, Unsubmitted, Done): Indeed, as Jay said, it's because the transformation is only profitable when the users care only about the lower 32 bits and the carry bit.
		if (User == N)
		foad (Unsubmitted, Done): This is not correct because it will lose the overflow bit. You should probably only do this if the ADD has a single use.
		Pierre-vh (Author, Unsubmitted, Done): Doing it only if the add has a single use negates the purpose of the combine, as the add will always have 2 uses in the cases we're interested in; the second use is a trunc to i32. I've adapted the combine so that it only fires when all users are either truncs to i32 or the srl.
		arsenm (Unsubmitted, Not Done): There's no point in looking for multiple TRUNCATE users. Those would have been automatically CSE'd (merged by common-subexpression elimination).
		continue;
		if (User->getOpcode() != ISD::TRUNCATE \|\|
		User->getValueType(0) != MVT::i32) {
		CanCombine = false;
		break;
		}

		TruncsToReplace.push_back(SDValue(User, 0));
		}

		if (CanCombine) {
		// (i32 (uaddo a, b))
		SDValue A = DAG.getNode(ISD::TRUNCATE, SL, {MVT::i32}, {AddLHS});
		SDValue B = DAG.getNode(ISD::TRUNCATE, SL, {MVT::i32}, {AddRHS});
		SDValue UADDO =
		DAG.getNode(ISD::UADDO, SL, {MVT::i32, MVT::i1}, {A, B});

		for (SDValue V : TruncsToReplace)
		DAG.ReplaceAllUsesOfValueWith(V, UADDO);

		// Replace this shift with (i64 (zext uaddo.overflow))
		return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, {UADDO.getValue(1)});
		}
		}
		}

if (ShiftAmt < 32)		if (ShiftAmt < 32)
return SDValue();		return SDValue();

// srl i64:x, C for C >= 32		// srl i64:x, C for C >= 32
// =>		// =>
// build_pair (srl hi_32(x), C - 32), 0		// build_pair (srl hi_32(x), C - 32), 0
SDValue Zero = DAG.getConstant(0, SL, MVT::i32);		SDValue Zero = DAG.getConstant(0, SL, MVT::i32);

▲ Show 20 Lines • Show All 1,662 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/add_shr_carry.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs \| FileCheck -check-prefixes=VI,SDAG-VI %s		; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs \| FileCheck -check-prefixes=VI,SDAG-VI %s
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs \| FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s		; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs \| FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs \| FileCheck -check-prefixes=GFX10,SDAG-GFX10 %s		; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs \| FileCheck -check-prefixes=GFX10,SDAG-GFX10 %s
; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs \| FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s		; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs \| FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s

		arsenm (Unsubmitted, Done): This test should be precommitted so that the patch shows the diff.
; RUN: llc -global-isel < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs \| FileCheck -check-prefixes=VI,GISEL-VI %s		; RUN: llc -global-isel < %s -mtriple=amdgcn-amd-mesa3d -mcpu=fiji -verify-machineinstrs \| FileCheck -check-prefixes=VI,GISEL-VI %s
; RUN: llc -global-isel < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs \| FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s		; RUN: llc -global-isel < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx900 -verify-machineinstrs \| FileCheck -check-prefixes=GFX9,GISEL-GFX9 %s
; RUN: llc -global-isel < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs \| FileCheck -check-prefixes=GFX10,GISEL-GFX10 %s		; RUN: llc -global-isel < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -verify-machineinstrs \| FileCheck -check-prefixes=GFX10,GISEL-GFX10 %s
; RUN: llc -global-isel < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs \| FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s		; RUN: llc -global-isel < %s -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs \| FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s

define i64 @basic_zext(i32 %a, i32 %b, i64 %c) {		define i64 @basic_zext(i32 %a, i32 %b, i64 %c) {
; VI-LABEL: basic_zext:		; SDAG-VI-LABEL: basic_zext:
; VI: ; %bb.0: ; %entry		; SDAG-VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1		; SDAG-VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
; VI-NEXT: v_addc_u32_e64 v0, s[4:5], 0, 0, vcc		; SDAG-VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: v_mov_b32_e32 v1, 0		; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_setpc_b64 s[30:31]		; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;		;
; GFX9-LABEL: basic_zext:		; SDAG-GFX9-LABEL: basic_zext:
; GFX9: ; %bb.0: ; %entry		; SDAG-GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1		; SDAG-GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], 0, 0, vcc		; SDAG-GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, 0		; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;		;
; GFX10-LABEL: basic_zext:		; SDAG-GFX10-LABEL: basic_zext:
; GFX10: ; %bb.0: ; %entry		; SDAG-GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0		; SDAG-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32 v0, s4, v0, v1		; SDAG-GFX10-NEXT: v_add_co_u32 v0, s4, v0, v1
; GFX10-NEXT: v_add_co_ci_u32_e64 v0, s4, 0, 0, s4		; SDAG-GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; GFX10-NEXT: v_mov_b32_e32 v1, 0		; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_setpc_b64 s[30:31]		; SDAG-GFX10-NEXT: s_setpc_b64 s[30:31]
;		;
; GFX11-LABEL: basic_zext:		; SDAG-GFX11-LABEL: basic_zext:
; GFX11: ; %bb.0: ; %entry		; SDAG-GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0		; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_add_co_u32 v0, s0, v0, v1		; SDAG-GFX11-NEXT: v_add_co_u32 v0, s0, v0, v1
; GFX11-NEXT: v_add_co_ci_u32_e64 v0, null, 0, 0, s0		; SDAG-GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; GFX11-NEXT: v_mov_b32_e32 v1, 0		; SDAG-GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: s_setpc_b64 s[30:31]		; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
		;
		; GISEL-VI-LABEL: basic_zext:
		; GISEL-VI: ; %bb.0: ; %entry
		; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GISEL-VI-NEXT: v_add_u32_e32 v0, vcc, v0, v1
		arsenm (Unsubmitted, Done): This second add is superfluous to the basic pattern: https://alive2.llvm.org/ce/z/aLg_Ki
		; GISEL-VI-NEXT: v_addc_u32_e64 v0, s[4:5], 0, 0, vcc
		; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
		; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
		;
		; GISEL-GFX9-LABEL: basic_zext:
		; GISEL-GFX9: ; %bb.0: ; %entry
		; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GISEL-GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v1
		; GISEL-GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], 0, 0, vcc
		; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0
		; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
		;
		; GISEL-GFX10-LABEL: basic_zext:
		; GISEL-GFX10: ; %bb.0: ; %entry
		; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
		; GISEL-GFX10-NEXT: v_add_co_u32 v0, s4, v0, v1
		; GISEL-GFX10-NEXT: v_add_co_ci_u32_e64 v0, s4, 0, 0, s4
		; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
		; GISEL-GFX10-NEXT: s_setpc_b64 s[30:31]
		;
		; GISEL-GFX11-LABEL: basic_zext:
		; GISEL-GFX11: ; %bb.0: ; %entry
		; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
		; GISEL-GFX11-NEXT: v_add_co_u32 v0, s0, v0, v1
		; GISEL-GFX11-NEXT: v_add_co_ci_u32_e64 v0, null, 0, 0, s0
		; GISEL-GFX11-NEXT: v_mov_b32_e32 v1, 0
		; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
entry:		entry:
%a.zext = zext i32 %a to i64		%a.zext = zext i32 %a to i64
%b.zext = zext i32 %b to i64		%b.zext = zext i32 %b to i64
%add.a.b = add i64 %a.zext, %b.zext		%add.a.b = add i64 %a.zext, %b.zext
%shr = lshr i64 %add.a.b, 32		%shr = lshr i64 %add.a.b, 32
ret i64 %shr		ret i64 %shr
}		}

define i64 @basic_cst_32leadingzeroes(i32 %b, i64 %c) {		define i64 @basic_cst_32leadingzeroes(i32 %b, i64 %c) {
; VI-LABEL: basic_cst_32leadingzeroes:		; SDAG-VI-LABEL: basic_cst_32leadingzeroes:
; VI: ; %bb.0: ; %entry		; SDAG-VI: ; %bb.0: ; %entry
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, -1, v0		; SDAG-VI-NEXT: v_add_u32_e32 v0, vcc, -1, v0
; VI-NEXT: v_addc_u32_e64 v0, s[4:5], 0, 0, vcc		; SDAG-VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; VI-NEXT: v_mov_b32_e32 v1, 0		; SDAG-VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: s_setpc_b64 s[30:31]		; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;		;
; GFX9-LABEL: basic_cst_32leadingzeroes:		; SDAG-GFX9-LABEL: basic_cst_32leadingzeroes:
; GFX9: ; %bb.0: ; %entry		; SDAG-GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v0		; SDAG-GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v0
; GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], 0, 0, vcc		; SDAG-GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX9-NEXT: v_mov_b32_e32 v1, 0		; SDAG-GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_setpc_b64 s[30:31]		; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;		;
; SDAG-GFX10-LABEL: basic_cst_32leadingzeroes:		; SDAG-GFX10-LABEL: basic_cst_32leadingzeroes:
; SDAG-GFX10: ; %bb.0: ; %entry		; SDAG-GFX10: ; %bb.0: ; %entry
; SDAG-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX10-NEXT: s_waitcnt_vscnt null, 0x0		; SDAG-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; SDAG-GFX10-NEXT: v_add_co_u32 v0, s4, v0, -1		; SDAG-GFX10-NEXT: v_add_co_u32 v0, s4, v0, -1
; SDAG-GFX10-NEXT: v_add_co_ci_u32_e64 v0, s4, 0, 0, s4		; SDAG-GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4
; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, 0		; SDAG-GFX10-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX10-NEXT: s_setpc_b64 s[30:31]		; SDAG-GFX10-NEXT: s_setpc_b64 s[30:31]
;		;
; SDAG-GFX11-LABEL: basic_cst_32leadingzeroes:		; SDAG-GFX11-LABEL: basic_cst_32leadingzeroes:
; SDAG-GFX11: ; %bb.0: ; %entry		; SDAG-GFX11: ; %bb.0: ; %entry
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0		; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; SDAG-GFX11-NEXT: v_add_co_u32 v0, s0, v0, -1		; SDAG-GFX11-NEXT: v_add_co_u32 v0, s0, v0, -1
; SDAG-GFX11-NEXT: v_add_co_ci_u32_e64 v0, null, 0, 0, s0		; SDAG-GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0
; SDAG-GFX11-NEXT: v_mov_b32_e32 v1, 0		; SDAG-GFX11-NEXT: v_mov_b32_e32 v1, 0
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]		; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
;		;
		; GISEL-VI-LABEL: basic_cst_32leadingzeroes:
		; GISEL-VI: ; %bb.0: ; %entry
		; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GISEL-VI-NEXT: v_add_u32_e32 v0, vcc, -1, v0
		; GISEL-VI-NEXT: v_addc_u32_e64 v0, s[4:5], 0, 0, vcc
		; GISEL-VI-NEXT: v_mov_b32_e32 v1, 0
		; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
		;
		; GISEL-GFX9-LABEL: basic_cst_32leadingzeroes:
		; GISEL-GFX9: ; %bb.0: ; %entry
		; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GISEL-GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -1, v0
		; GISEL-GFX9-NEXT: v_addc_co_u32_e64 v0, s[4:5], 0, 0, vcc
		; GISEL-GFX9-NEXT: v_mov_b32_e32 v1, 0
		; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
		;
; GISEL-GFX10-LABEL: basic_cst_32leadingzeroes:		; GISEL-GFX10-LABEL: basic_cst_32leadingzeroes:
; GISEL-GFX10: ; %bb.0: ; %entry		; GISEL-GFX10: ; %bb.0: ; %entry
; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0		; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GISEL-GFX10-NEXT: v_add_co_u32 v0, s4, -1, v0		; GISEL-GFX10-NEXT: v_add_co_u32 v0, s4, -1, v0
; GISEL-GFX10-NEXT: v_add_co_ci_u32_e64 v0, s4, 0, 0, s4		; GISEL-GFX10-NEXT: v_add_co_ci_u32_e64 v0, s4, 0, 0, s4
; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0		; GISEL-GFX10-NEXT: v_mov_b32_e32 v1, 0
; GISEL-GFX10-NEXT: s_setpc_b64 s[30:31]		; GISEL-GFX10-NEXT: s_setpc_b64 s[30:31]
▲ Show 20 Lines • Show All 83 Lines • ▼ Show 20 Lines
; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]		; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
entry:		entry:
%b.zext = zext i32 %b to i64		%b.zext = zext i32 %b to i64
%add.a.b = add i64 8589934591, %b.zext ; 0x1FFFFFFFF		%add.a.b = add i64 8589934591, %b.zext ; 0x1FFFFFFFF
%shr = lshr i64 %add.a.b, 32		%shr = lshr i64 %add.a.b, 32
ret i64 %shr		ret i64 %shr
}		}

define <3 x i32> @add3_i96(<3 x i32> %0, <3 x i32> %1) {		define <3 x i32> @add3_i96(<3 x i32> %0, <3 x i32> %1) {
; VI-LABEL: add3_i96:		; SDAG-VI-LABEL: add3_i96:
		arsenm (Unsubmitted, Not Done): Could you add a test case with multiple uses?
; VI: ; %bb.0:		; SDAG-VI: ; %bb.0:
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: v_add_u32_e32 v0, vcc, v3, v0		; SDAG-VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1
; VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc		; SDAG-VI-NEXT: v_addc_u32_e64 v4, s[4:5], 0, 0, vcc
; VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1		; SDAG-VI-NEXT: v_add_u32_e32 v0, vcc, v3, v0
; VI-NEXT: v_addc_u32_e64 v4, s[4:5], 0, 0, vcc		; SDAG-VI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3		; SDAG-VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc		; SDAG-VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
; VI-NEXT: v_add_u32_e32 v2, vcc, v5, v2		; SDAG-VI-NEXT: v_add_u32_e32 v2, vcc, v5, v2
; VI-NEXT: v_add_u32_e32 v2, vcc, v2, v3		; SDAG-VI-NEXT: v_add_u32_e32 v2, vcc, v2, v3
; VI-NEXT: s_setpc_b64 s[30:31]		; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;		;
; GFX9-LABEL: add3_i96:		; SDAG-GFX9-LABEL: add3_i96:
; GFX9: ; %bb.0:		; SDAG-GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0		; SDAG-GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1
; GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc		; SDAG-GFX9-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, vcc
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1		; SDAG-GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
; GFX9-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, vcc		; SDAG-GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3		; SDAG-GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc		; SDAG-GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
; GFX9-NEXT: v_add3_u32 v2, v5, v2, v3		; SDAG-GFX9-NEXT: v_add3_u32 v2, v5, v2, v3
; GFX9-NEXT: s_setpc_b64 s[30:31]		; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31]
;		;
; GFX10-LABEL: add3_i96:		; SDAG-GFX10-LABEL: add3_i96:
; GFX10: ; %bb.0:		; SDAG-GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0		; SDAG-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_add_co_u32 v0, s4, v3, v0		; SDAG-GFX10-NEXT: v_add_co_u32 v0, s4, v3, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, 0, 0, s4		; SDAG-GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4
; GFX10-NEXT: v_add_co_u32 v1, s4, v4, v1		; SDAG-GFX10-NEXT: v_add_co_u32 v1, s4, v4, v1
; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s4, 0, 0, s4		; SDAG-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s4, 0, 0, s4
; GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3		; SDAG-GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo		; SDAG-GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
; GFX10-NEXT: v_add3_u32 v2, v5, v2, v3		; SDAG-GFX10-NEXT: v_add3_u32 v2, v5, v2, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]		; SDAG-GFX10-NEXT: s_setpc_b64 s[30:31]
;		;
; GFX11-LABEL: add3_i96:		; SDAG-GFX11-LABEL: add3_i96:
; GFX11: ; %bb.0:		; SDAG-GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0		; SDAG-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: v_add_co_u32 v0, s0, v3, v0		; SDAG-GFX11-NEXT: v_add_co_u32 v0, s0, v3, v0
; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0		; SDAG-GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0
; GFX11-NEXT: v_add_co_u32 v1, s0, v4, v1		; SDAG-GFX11-NEXT: v_add_co_u32 v1, s0, v4, v1
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, 0, s0		; SDAG-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, 0, s0
; GFX11-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3		; SDAG-GFX11-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo		; SDAG-GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
; GFX11-NEXT: v_add3_u32 v2, v5, v2, v3		; SDAG-GFX11-NEXT: v_add3_u32 v2, v5, v2, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]		; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31]
		;
		; GISEL-VI-LABEL: add3_i96:
		; GISEL-VI: ; %bb.0:
		; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GISEL-VI-NEXT: v_add_u32_e32 v0, vcc, v3, v0
		; GISEL-VI-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
		; GISEL-VI-NEXT: v_add_u32_e32 v1, vcc, v4, v1
		; GISEL-VI-NEXT: v_addc_u32_e64 v4, s[4:5], 0, 0, vcc
		; GISEL-VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3
		; GISEL-VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc
		; GISEL-VI-NEXT: v_add_u32_e32 v2, vcc, v5, v2
		; GISEL-VI-NEXT: v_add_u32_e32 v2, vcc, v2, v3
		; GISEL-VI-NEXT: s_setpc_b64 s[30:31]
		;
		; GISEL-GFX9-LABEL: add3_i96:
		; GISEL-GFX9: ; %bb.0:
		; GISEL-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GISEL-GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v3, v0
		; GISEL-GFX9-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, vcc
		; GISEL-GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v4, v1
		; GISEL-GFX9-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, vcc
		; GISEL-GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v1, v3
		; GISEL-GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
		; GISEL-GFX9-NEXT: v_add3_u32 v2, v5, v2, v3
		; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31]
		;
		; GISEL-GFX10-LABEL: add3_i96:
		; GISEL-GFX10: ; %bb.0:
		; GISEL-GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GISEL-GFX10-NEXT: s_waitcnt_vscnt null, 0x0
		; GISEL-GFX10-NEXT: v_add_co_u32 v0, s4, v3, v0
		; GISEL-GFX10-NEXT: v_add_co_ci_u32_e64 v3, s4, 0, 0, s4
		; GISEL-GFX10-NEXT: v_add_co_u32 v1, s4, v4, v1
		; GISEL-GFX10-NEXT: v_add_co_ci_u32_e64 v4, s4, 0, 0, s4
		; GISEL-GFX10-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
		; GISEL-GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
		; GISEL-GFX10-NEXT: v_add3_u32 v2, v5, v2, v3
		; GISEL-GFX10-NEXT: s_setpc_b64 s[30:31]
		;
		; GISEL-GFX11-LABEL: add3_i96:
		; GISEL-GFX11: ; %bb.0:
		; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GISEL-GFX11-NEXT: s_waitcnt_vscnt null, 0x0
		; GISEL-GFX11-NEXT: v_add_co_u32 v0, s0, v3, v0
		; GISEL-GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, 0, 0, s0
		; GISEL-GFX11-NEXT: v_add_co_u32 v1, s0, v4, v1
		; GISEL-GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, 0, 0, s0
		; GISEL-GFX11-NEXT: v_add_co_u32 v1, vcc_lo, v1, v3
		; GISEL-GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
		; GISEL-GFX11-NEXT: v_add3_u32 v2, v5, v2, v3
		; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
%3 = extractelement <3 x i32> %0, i64 0		%3 = extractelement <3 x i32> %0, i64 0
%4 = zext i32 %3 to i64		%4 = zext i32 %3 to i64
%5 = extractelement <3 x i32> %1, i64 0		%5 = extractelement <3 x i32> %1, i64 0
%6 = zext i32 %5 to i64		%6 = zext i32 %5 to i64
%7 = add nuw nsw i64 %6, %4		%7 = add nuw nsw i64 %6, %4
%8 = extractelement <3 x i32> %0, i64 1		%8 = extractelement <3 x i32> %0, i64 1
%9 = zext i32 %8 to i64		%9 = zext i32 %8 to i64
%10 = extractelement <3 x i32> %1, i64 1		%10 = extractelement <3 x i32> %1, i64 1
Show All 9 Lines	; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
%20 = add i32 %17, %19		%20 = add i32 %17, %19
%21 = trunc i64 %7 to i32		%21 = trunc i64 %7 to i32
%22 = insertelement <3 x i32> undef, i32 %21, i32 0		%22 = insertelement <3 x i32> undef, i32 %21, i32 0
%23 = trunc i64 %14 to i32		%23 = trunc i64 %14 to i32
%24 = insertelement <3 x i32> %22, i32 %23, i32 1		%24 = insertelement <3 x i32> %22, i32 %23, i32 1
%25 = insertelement <3 x i32> %24, i32 %20, i32 2		%25 = insertelement <3 x i32> %24, i32 %20, i32 2
ret <3 x i32> %25		ret <3 x i32> %25
}		}
		;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
		; GFX10: {{.*}}
		; GFX11: {{.*}}
		; GFX9: {{.*}}
		; VI: {{.*}}

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add DAG Combine for right-shift carry add to uaddo
AbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 475723

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

llvm/test/CodeGen/AMDGPU/add_shr_carry.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add DAG Combine for right-shift carry add to uaddo (Abandoned, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 475723

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

llvm/test/CodeGen/AMDGPU/add_shr_carry.ll

[AMDGPU] Add DAG Combine for right-shift carry add to uaddo
AbandonedPublic