This is an archive of the discontinued LLVM Phabricator instance.

Differential D10936

AMDGPU: Avoid using 64-bit shift for i64 (shl x, 32)
ClosedPublic

Authored by arsenm on Jul 3 2015, 4:32 PM.

Download Raw Diff

Details

Reviewers

• tstellarAMD

Summary

This shift can be implemented using only moves, which should
in theory optimize better in later passes.

Although this transform increases the instruction count,
it should be code size / cycle count neutral in the worst
VALU case. It also seems to slightly improve a couple
of testcases due to other DAG combines this exposes.

This is probably slightly worse for the SALU case, so
it might be better to handle this during moveToVALU,
although then you lose some simplifications, such as
the load-width reduction in the simple testcase.

Diff Detail

Event Timeline

arsenm updated this revision to Diff 29038. Jul 3 2015, 4:32 PM

arsenm retitled this revision from to AMDGPU: Avoid using 64-bit shift for i64 (shl x, 32).

arsenm updated this object.

arsenm added a subscriber: llvm-commits.

Use truncate instead of extract_element x, 0
Add tests for lshr x, 32 case

ping

LGTM

This revision is now accepted and ready to land. Jul 14 2015, 7:09 AM

r242177

Revision Contents

Path

Size

lib/

Target/

AMDGPU/

AMDGPUISelLowering.h

1 line

AMDGPUISelLowering.cpp

34 lines

test/

CodeGen/

AMDGPU/

array-ptr-calc-i64.ll

5 lines

mul_uint24.ll

20 lines

shl.ll

38 lines

srl.ll

35 lines

Diff 29049

lib/Target/AMDGPU/AMDGPUISelLowering.h

Context not available.
	SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;	SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;

	SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;	SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
		SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const;
	SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;	SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const;

	protected:	protected:
Context not available.

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Context not available.
	setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);	setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
	setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);	setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

		setTargetDAGCombine(ISD::SHL);
	setTargetDAGCombine(ISD::MUL);	setTargetDAGCombine(ISD::MUL);
	setTargetDAGCombine(ISD::SELECT);	setTargetDAGCombine(ISD::SELECT);
	setTargetDAGCombine(ISD::SELECT_CC);	setTargetDAGCombine(ISD::SELECT_CC);
Context not available.
	SN->getBasePtr(), SN->getMemOperand());	SN->getBasePtr(), SN->getMemOperand());
	}	}

		/// Target-specific DAG combine for a shift-left node.
		///
		/// Rewrites an i64 `shl x, 32` into a BUILD_PAIR of a zero i32 and the
		/// truncated (low 32 bits of) \p x, so that no 64-bit shift is emitted.
		///
		/// \param N   The shift node being combined; operand 0 is the shifted value
		///            and operand 1 is the shift amount.
		/// \param DCI Combiner state; only its SelectionDAG is used here.
		/// \returns The replacement BUILD_PAIR node, or an empty SDValue when the
		///          result type is not i64 or the shift amount is not the constant 32.
		SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
		DAGCombinerInfo &DCI) const {
		// Only 64-bit results are handled; everything else is left untouched.
		if (N->getValueType(0) != MVT::i64)
		return SDValue();

		// i64 (shl x, 32) -> (build_pair 0, x)

		// Doing this with moves theoretically helps MI optimizations that understand
		// copies. 2 v_mov_b32_e32 will have the same code size / cycle count as
		// v_lshl_b64. In the SALU case, I think this is slightly worse since it
		// doubles the code size and I'm unsure about cycle count.
		const ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
		// Bail out unless the shift amount is a constant equal to exactly 32.
		if (!RHS \|\| RHS->getZExtValue() != 32)
		return SDValue();

		SDValue LHS = N->getOperand(0);

		SDLoc SL(N);
		SelectionDAG &DAG = DCI.DAG;

		// Extract low 32-bits.
		SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);

		// The pair is built as (Zero, Lo): the zero becomes the low half of the
		// i64 result and the original low 32 bits become the high half.
		const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
		return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Zero, Lo);
		}

	SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,	SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
	DAGCombinerInfo &DCI) const {	DAGCombinerInfo &DCI) const {
	EVT VT = N->getValueType(0);	EVT VT = N->getValueType(0);
Context not available.
	switch(N->getOpcode()) {	switch(N->getOpcode()) {
	default:	default:
	break;	break;
		case ISD::SHL: {
		if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
		break;

		return performShlCombine(N, DCI);
		}
	case ISD::MUL:	case ISD::MUL:
	return performMulCombine(N, DCI);	return performMulCombine(N, DCI);
	case AMDGPUISD::MUL_I24:	case AMDGPUISD::MUL_I24:
Context not available.

test/CodeGen/AMDGPU/array-ptr-calc-i64.ll

Context not available.
	declare i32 @llvm.SI.tid() readnone	declare i32 @llvm.SI.tid() readnone

	; SI-LABEL: {{^}}test_array_ptr_calc:	; SI-LABEL: {{^}}test_array_ptr_calc:
	; SI: v_mul_lo_i32	; SI-DAG: v_mul_lo_i32
	; SI: v_mul_hi_i32	; SI-DAG: v_mul_hi_i32
		; SI: s_endpgm
	define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {	define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
	%tid = call i32 @llvm.SI.tid() readnone	%tid = call i32 @llvm.SI.tid() readnone
	%a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0	%a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
Context not available.

test/CodeGen/AMDGPU/mul_uint24.ll

Context not available.
	; FUNC_LABEL: {{^}}mul24_i64:	; FUNC_LABEL: {{^}}mul24_i64:
	; EG; MUL_UINT24	; EG; MUL_UINT24
	; EG: MULHI	; EG: MULHI
	; SI: v_mul_u32_u24
	; FIXME: SI support 24-bit mulhi	; FIXME: SI support 24-bit mulhi
	; SI: v_mul_hi_u32
	define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {	; SI-DAG: v_mul_u32_u24
		; SI-DAG: v_mul_hi_u32
		; SI: s_endpgm
		define void @mul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
	entry:	entry:
	%0 = shl i64 %a, 40	%tmp0 = shl i64 %a, 40
	%a_24 = lshr i64 %0, 40	%a_24 = lshr i64 %tmp0, 40
	%1 = shl i64 %b, 40	%tmp1 = shl i64 %b, 40
	%b_24 = lshr i64 %1, 40	%b_24 = lshr i64 %tmp1, 40
	%2 = mul i64 %a_24, %b_24	%tmp2 = mul i64 %a_24, %b_24
	store i64 %2, i64 addrspace(1)* %out	store i64 %tmp2, i64 addrspace(1)* %out
	ret void	ret void
	}	}
Context not available.

test/CodeGen/AMDGPU/shl.ll

	;RUN: llc < %s -march=r600 -mcpu=redwood \| FileCheck --check-prefix=EG %s	; RUN: llc < %s -march=r600 -mcpu=redwood \| FileCheck --check-prefix=EG %s
	;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs \| FileCheck --check-prefix=SI %s	; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs \| FileCheck -check-prefix=GCN -check-prefix=SI %s
	;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck --check-prefix=VI %s	; XUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs \| FileCheck -check-prefix=GCN -check-prefix=VI %s

		declare i32 @llvm.r600.read.tidig.x() #0


	;EG: {{^}}shl_v2i32:	;EG: {{^}}shl_v2i32:
	;EG: LSHL {{\? }}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}	;EG: LSHL {{\? }}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
Context not available.
	store <4 x i64> %result, <4 x i64> addrspace(1)* %out	store <4 x i64> %result, <4 x i64> addrspace(1)* %out
	ret void	ret void
	}	}

		; Make sure load width gets reduced to i32 load.
		; Scalar case: the shifted value is the i64 function argument %a. The checks
		; below expect one s_load_dword of that argument, a zero materialized with
		; s_mov_b32 for the low half of the stored pair, and the loaded dword used as
		; the high half.
		; GCN-LABEL: {{^}}s_shl_32_i64:
		; GCN-DAG: s_load_dword [[LO_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb{{$}}
		; GCN-DAG: s_mov_b32 s[[SLO:[0-9]+]], 0{{$}}
		; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
		; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
		; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
		define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
		%result = shl i64 %a, 32
		store i64 %result, i64 addrspace(1)* %out
		ret void
		}

		; Vector case: the shifted value is loaded from %in at an index taken from
		; @llvm.r600.read.tidig.x. The checks below expect a single buffer_load_dword,
		; a v_mov_b32 of 0 for the low half, and the loaded dword stored as the high
		; half of the result.
		; GCN-LABEL: {{^}}v_shl_32_i64:
		; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]],
		; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
		; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
		define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
		%tid = call i32 @llvm.r600.read.tidig.x() #0
		%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
		%gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
		%a = load i64, i64 addrspace(1)* %gep.in
		%result = shl i64 %a, 32
		store i64 %result, i64 addrspace(1)* %gep.out
		ret void
		}

		attributes #0 = { nounwind readnone }
Context not available.

test/CodeGen/AMDGPU/srl.ll

	; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=FUNC %s	; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s \| FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=VI -check-prefix=FUNC %s	; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
	; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s	; RUN: llc -march=r600 -mcpu=redwood < %s \| FileCheck -check-prefix=EG -check-prefix=FUNC %s

		declare i32 @llvm.r600.read.tidig.x() #0

	; FUNC-LABEL: {{^}}lshr_i32:	; FUNC-LABEL: {{^}}lshr_i32:
	; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}	; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
	; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}	; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
Context not available.
	store <4 x i64> %result, <4 x i64> addrspace(1)* %out	store <4 x i64> %result, <4 x i64> addrspace(1)* %out
	ret void	ret void
	}	}

		; Make sure load width gets reduced to i32 load.
		; Scalar case: the shifted value is the i64 function argument %a. The checks
		; below expect one s_load_dword at offset 0xc, a zero materialized with
		; s_mov_b32 for the high half of the stored pair, and the loaded dword used as
		; the low half.
		; GCN-LABEL: {{^}}s_lshr_32_i64:
		; GCN-DAG: s_load_dword [[HI_A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc{{$}}
		; GCN-DAG: s_mov_b32 s[[SHI:[0-9]+]], 0{{$}}
		; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
		; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
		; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
		define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
		%result = lshr i64 %a, 32
		store i64 %result, i64 addrspace(1)* %out
		ret void
		}

		; Vector case: the shifted value is loaded from %in at an index taken from
		; @llvm.r600.read.tidig.x. The checks below expect a single buffer_load_dword
		; at offset 4, a v_mov_b32 of 0 for the high half, and the loaded dword stored
		; as the low half of the result.
		; GCN-LABEL: {{^}}v_lshr_32_i64:
		; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
		; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
		; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
		define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
		%tid = call i32 @llvm.r600.read.tidig.x() #0
		%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
		%gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
		%a = load i64, i64 addrspace(1)* %gep.in
		%result = lshr i64 %a, 32
		store i64 %result, i64 addrspace(1)* %gep.out
		ret void
		}

		attributes #0 = { nounwind readnone }
Context not available.