Diff 186879

lib/Target/AMDGPU/SIISelLowering.h

Show First 20 Lines • Show All 149 Lines • ▼ Show 20 Lines	private:
SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,		SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1, bool Signed) const;		SDValue Op0, SDValue Op1, bool Signed) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;

		SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,		unsigned getFusedOpcode(const SelectionDAG &DAG,
const SDNode N0, const SDNode N1) const;		const SDNode N0, const SDNode N1) const;
SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;		SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
▲ Show 20 Lines • Show All 194 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 8,472 Lines • ▼ Show 20 Lines	if ((Options.AllowFPOpFusion == FPOpFusion::Fast \|\| Options.UnsafeFPMath \|\|
N1->getFlags().hasAllowContract())) &&		N1->getFlags().hasAllowContract())) &&
isFMAFasterThanFMulAndFAdd(VT)) {		isFMAFasterThanFMulAndFAdd(VT)) {
return ISD::FMA;		return ISD::FMA;
}		}

return 0;		return 0;
}		}

		// For a reassociatable opcode perform:
		// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
		//
		// N is expected to be a two-operand node. The rewrite only fires when
		// exactly one operand of N is divergent, that divergent operand is a
		// single-use node with the same opcode as N, and exactly one of its two
		// operands is divergent as well. The two non-divergent values are then
		// combined first, so only the outer operation has a divergent input.
		//
		// Returns the rebuilt node, or an empty SDValue when the pattern does not
		// match. The value type is not inspected here beyond being reused for the
		// new nodes; any restriction on VT is left to the caller.
		SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
		SelectionDAG &DAG) const {
		unsigned Opc = N->getOpcode();
		SDValue Op0 = N->getOperand(0);
		SDValue Op1 = N->getOperand(1);

		// Bail out unless exactly one of the two operands is divergent.
		if (!(Op0->isDivergent() ^ Op1->isDivergent()))
		return SDValue();

		// Canonicalize: Op0 is the non-divergent operand, Op1 the divergent one.
		if (Op0->isDivergent())
		std::swap(Op0, Op1);

		// The divergent operand must be the same operation and must not have
		// other users, since it is replaced by a differently associated node.
		if (Op1.getOpcode() != Opc \|\| !Op1.hasOneUse())
		return SDValue();

		// Look through the inner node. From here on Op1 and Op2 are the inner
		// node's operands, and again exactly one of them must be divergent.
		SDValue Op2 = Op1.getOperand(1);
		Op1 = Op1.getOperand(0);
		if (!(Op1->isDivergent() ^ Op2->isDivergent()))
		return SDValue();

		// Canonicalize: Op1 is the non-divergent inner operand, Op2 the divergent one.
		if (Op1->isDivergent())
		std::swap(Op1, Op2);

		// If either operand is constant this will conflict with
		// DAGCombiner::ReassociateOps().
		if (isa<ConstantSDNode>(Op0) \|\| isa<ConstantSDNode>(Op1))
		return SDValue();

		EVT VT = N->getValueType(0);
		SDLoc SL(N);

		// Build op (op Op0, Op1), Op2: the two non-divergent values first, then the
		// divergent one. No node flags are passed to either getNode() call. The
		// local is called Add1 although Opc is not restricted to ISD::ADD here.
		SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
		return DAG.getNode(Opc, SL, VT, Add1, Op2);
		}

static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,		static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL,
EVT VT,		EVT VT,
SDValue N0, SDValue N1, SDValue N2,		SDValue N0, SDValue N1, SDValue N2,
bool Signed) {		bool Signed) {
unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;		unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);		SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);		SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);		return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
Show All 32 Lines	if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);		MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);		AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);		return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, true);
}		}

return SDValue();		return SDValue();
}		}

		if (VT != MVT::i32 && VT != MVT::i64)
		return SDValue();

		if (SDValue V = reassociateScalarOps(N, DAG)) {
		return V;
		}
		arsenm (inline comment, done): Maybe move this to a helper function with the opcode as a parameter, like ReassociateOps. We probably want to do this for the other reassociatable opcodes next.

if (VT != MVT::i32 \|\| !DCI.isAfterLegalizeDAG())		if (VT != MVT::i32 \|\| !DCI.isAfterLegalizeDAG())
return SDValue();		return SDValue();

// add x, zext (setcc) => addcarry x, 0, setcc		// add x, zext (setcc) => addcarry x, 0, setcc
// add x, sext (setcc) => subcarry x, 0, setcc		// add x, sext (setcc) => subcarry x, 0, setcc
unsigned Opc = LHS.getOpcode();		unsigned Opc = LHS.getOpcode();
if (Opc == ISD::ZERO_EXTEND \|\| Opc == ISD::SIGN_EXTEND \|\|		if (Opc == ISD::ZERO_EXTEND \|\| Opc == ISD::SIGN_EXTEND \|\|
Opc == ISD::ANY_EXTEND \|\| Opc == ISD::ADDCARRY)		Opc == ISD::ANY_EXTEND \|\| Opc == ISD::ADDCARRY)
std::swap(RHS, LHS);		std::swap(RHS, LHS);

Opc = RHS.getOpcode();		Opc = RHS.getOpcode();
switch (Opc) {		switch (Opc) {
		arsenm (inline comment, done): Is preserving the flags really correct? ReassociateOps does not seem to do it.
		rampitec (author, inline reply, done): I think the final add has the same behavior as the initial one. Otherwise, how could analysis tell us it was something like nuw or nsw in the first place? But if you think this is questionable I can remove it. We do not use these flags in the case of full dword adds anyway.
default: break;		default: break;
case ISD::ZERO_EXTEND:		case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:		case ISD::SIGN_EXTEND:
case ISD::ANY_EXTEND: {		case ISD::ANY_EXTEND: {
auto Cond = RHS.getOperand(0);		auto Cond = RHS.getOperand(0);
if (!isBoolSGPR(Cond))		if (!isBoolSGPR(Cond))
break;		break;
SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);		SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
▲ Show 20 Lines • Show All 1,271 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/add3.ll

Show All 19 Lines	; GFX9-NEXT: ; return to shader part epilog
%x = add i32 %a, %b		%x = add i32 %a, %b
%result = add i32 %x, %c		%result = add i32 %x, %c
%bc = bitcast i32 %result to float		%bc = bitcast i32 %result to float
ret float %bc		ret float %bc
}		}

; V_MAD_U32_U24 is given higher priority.		; V_MAD_U32_U24 is given higher priority.
define amdgpu_ps float @mad_no_add3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {		define amdgpu_ps float @mad_no_add3(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
		; VI-LABEL: mad_no_add3:
		; VI: ; %bb.0:
		; VI-NEXT: v_mad_u32_u24 v0, v0, v1, v4
		; VI-NEXT: v_mad_u32_u24 v0, v2, v3, v0
		; VI-NEXT: ; return to shader part epilog
		;
; GFX9-LABEL: mad_no_add3:		; GFX9-LABEL: mad_no_add3:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v4		; GFX9-NEXT: v_mad_u32_u24 v0, v0, v1, v4
; GFX9-NEXT: v_mad_u32_u24 v0, v2, v3, v0		; GFX9-NEXT: v_mad_u32_u24 v0, v2, v3, v0
; GFX9-NEXT: ; return to shader part epilog		; GFX9-NEXT: ; return to shader part epilog
%a0 = shl i32 %a, 8		%a0 = shl i32 %a, 8
%a1 = lshr i32 %a0, 8		%a1 = lshr i32 %a0, 8
%b0 = shl i32 %b, 8		%b0 = shl i32 %b, 8
Show All 13 Lines	; GFX9-NEXT: ; return to shader part epilog
ret float %bc		ret float %bc
}		}

; ThreeOp instruction variant not used due to Constant Bus Limitations		; ThreeOp instruction variant not used due to Constant Bus Limitations
; TODO: with reassociation it is possible to replace a v_add_u32_e32 with a s_add_i32		; TODO: with reassociation it is possible to replace a v_add_u32_e32 with a s_add_i32
define amdgpu_ps float @add3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {		define amdgpu_ps float @add3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) {
; VI-LABEL: add3_vgpr_b:		; VI-LABEL: add3_vgpr_b:
; VI: ; %bb.0:		; VI: ; %bb.0:
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0		; VI-NEXT: s_add_i32 s3, s3, s2
; VI-NEXT: v_add_u32_e32 v0, vcc, s3, v0		; VI-NEXT: v_add_u32_e32 v0, vcc, s3, v0
; VI-NEXT: ; return to shader part epilog		; VI-NEXT: ; return to shader part epilog
;		;
; GFX9-LABEL: add3_vgpr_b:		; GFX9-LABEL: add3_vgpr_b:
; GFX9: ; %bb.0:		; GFX9: ; %bb.0:
; GFX9-NEXT: v_add_u32_e32 v0, s2, v0		; GFX9-NEXT: s_add_i32 s3, s3, s2
; GFX9-NEXT: v_add_u32_e32 v0, s3, v0		; GFX9-NEXT: v_add_u32_e32 v0, s3, v0
; GFX9-NEXT: ; return to shader part epilog		; GFX9-NEXT: ; return to shader part epilog
%x = add i32 %a, %b		%x = add i32 %a, %b
%result = add i32 %x, %c		%result = add i32 %x, %c
%bc = bitcast i32 %result to float		%bc = bitcast i32 %result to float
ret float %bc		ret float %bc
}		}

▲ Show 20 Lines • Show All 126 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/reassoc-scalar.ll

This file was added.

				; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GFX8 %s
				; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s \| FileCheck -check-prefixes=GCN,GFX9 %s
				arsenm (inline comment, done): Should have a gfx9 run line also, because of add3.
				rampitec (author, inline reply, done): Added. However, v_add3 was not generated for these tests.

				; Basic i32 case: (%x + %tid) + %y, where %tid is the workitem id and %x, %y
				; are kernel arguments. The checks expect one scalar add (s_add_i32) whose
				; result is then consumed by a single vector add.
				; GCN-LABEL: reassoc_i32:
				; GCN: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
				; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}}
				; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}}
				define amdgpu_kernel void @reassoc_i32(i32 addrspace(1)* %arg, i32 %x, i32 %y) {
				bb:
				%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
				%add1 = add i32 %x, %tid
				%add2 = add i32 %add1, %y
				store i32 %add2, i32 addrspace(1)* %arg, align 4
				ret void
				}

				; Same computation as the basic i32 case, but with the operands of both adds
				; commuted: %tid + %x, then %y + %add1. The expected instructions are the
				; same scalar add followed by one vector add.
				; GCN-LABEL: reassoc_i32_swap_arg_order:
				; GCN: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
				; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}}
				; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}}
				define amdgpu_kernel void @reassoc_i32_swap_arg_order(i32 addrspace(1)* %arg, i32 %x, i32 %y) {
				bb:
				%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
				%add1 = add i32 %tid, %x
				%add2 = add i32 %y, %add1
				store i32 %add2, i32 addrspace(1)* %arg, align 4
				ret void
				}

				; 64-bit variant: the workitem id is zero-extended to i64 before being added
				; to the two i64 kernel arguments. The checks expect a scalar add / add-with-
				; carry pair, a vector add of the low half, and a vector add-with-carry that
				; adds 0 to the copied scalar high half.
				; GCN-LABEL: reassoc_i64:
				; GCN: s_add_u32 [[ADD1L:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
				; GCN: s_addc_u32 [[ADD1H:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
				; GFX8-DAG: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1L]], v{{[0-9]+}}
				; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, [[ADD1L]], v{{[0-9]+}}
				; GCN-DAG: v_mov_b32_e32 [[VADD1H:v[0-9]+]], [[ADD1H]]
				; GFX8: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, [[VADD1H]], vcc
				; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, [[VADD1H]], vcc
				define amdgpu_kernel void @reassoc_i64(i64 addrspace(1)* %arg, i64 %x, i64 %y) {
				bb:
				%tid32 = tail call i32 @llvm.amdgcn.workitem.id.x()
				%tid = zext i32 %tid32 to i64
				%add1 = add i64 %x, %tid
				%add2 = add i64 %add1, %y
				store i64 %add2, i64 addrspace(1)* %arg, align 8
				ret void
				}

				; Like the basic i32 case, except that the outer add carries the nuw flag.
				; The expected instructions are unchanged: one scalar add feeding one vector
				; add.
				; GCN-LABEL: reassoc_i32_nuw:
				; GCN: s_add_i32 [[ADD1:s[0-9]+]], s{{[0-9]+}}, s{{[0-9]+}}
				; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, [[ADD1]], v{{[0-9]+}}
				; GFX9: v_add_u32_e32 v{{[0-9]+}}, [[ADD1]], v{{[0-9]+}}
				define amdgpu_kernel void @reassoc_i32_nuw(i32 addrspace(1)* %arg, i32 %x, i32 %y) {
				bb:
				%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
				%add1 = add i32 %x, %tid
				%add2 = add nuw i32 %add1, %y
				store i32 %add2, i32 addrspace(1)* %arg, align 4
				ret void
				}

				; Negative case: the inner add %add1 is stored as well as being used by
				; %add2, so it has more than one use. The checks expect two chained vector
				; adds, with the second one consuming the result of the first.
				; GCN-LABEL: reassoc_i32_multiuse:
				; GFX8: v_add_u32_e32 [[ADD1:v[0-9]+]], vcc, s{{[0-9]+}}, v{{[0-9]+}}
				; GFX9: v_add_u32_e32 [[ADD1:v[0-9]+]], s{{[0-9]+}}, v{{[0-9]+}}
				; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, [[ADD1]]
				; GFX9: v_add_u32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[ADD1]]
				define amdgpu_kernel void @reassoc_i32_multiuse(i32 addrspace(1)* %arg, i32 %x, i32 %y) {
				bb:
				%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
				%add1 = add i32 %x, %tid
				%add2 = add i32 %add1, %y
				store volatile i32 %add1, i32 addrspace(1)* %arg, align 4
				store volatile i32 %add2, i32 addrspace(1)* %arg, align 4
				ret void
				}

				; Constant-operand case: (%tid + 42) + %x. The checks expect two chained
				; vector adds, i.e. no reassociation.
				; TODO: This should be reassociated as well, however it is disabled to avoid endless
				; loop since DAGCombiner::ReassociateOps() reverts the reassociation.
				; NOTE(review): %add1 is also stored below, which gives it a second use just
				; like in reassoc_i32_multiuse. Confirm that this test really isolates the
				; constant-operand bail-out rather than the single-use requirement.
				; GCN-LABEL: reassoc_i32_const:
				; GFX8: v_add_u32_e32 [[ADD1:v[0-9]+]], vcc, 42, v{{[0-9]+}}
				; GFX9: v_add_u32_e32 [[ADD1:v[0-9]+]], 42, v{{[0-9]+}}
				; GFX8: v_add_u32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, [[ADD1]]
				; GFX9: v_add_u32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[ADD1]]
				define amdgpu_kernel void @reassoc_i32_const(i32 addrspace(1)* %arg, i32 %x) {
				bb:
				%tid = tail call i32 @llvm.amdgcn.workitem.id.x()
				%add1 = add i32 %tid, 42
				%add2 = add i32 %add1, %x
				store volatile i32 %add1, i32 addrspace(1)* %arg, align 4
				store volatile i32 %add2, i32 addrspace(1)* %arg, align 4
				ret void
				}

				declare i32 @llvm.amdgcn.workitem.id.x()

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Reassociate 'add (add x, y), z' to use SALU
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 186879

lib/Target/AMDGPU/SIISelLowering.h

lib/Target/AMDGPU/SIISelLowering.cpp

test/CodeGen/AMDGPU/add3.ll

test/CodeGen/AMDGPU/reassoc-scalar.ll

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Reassociate 'add (add x, y), z' to use SALU (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 186879

lib/Target/AMDGPU/SIISelLowering.h

lib/Target/AMDGPU/SIISelLowering.cpp

test/CodeGen/AMDGPU/add3.ll

test/CodeGen/AMDGPU/reassoc-scalar.ll

[AMDGPU] Reassociate 'add (add x, y), z' to use SALU
ClosedPublic