Diff 102124

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,567 Lines • ▼ Show 20 Lines	if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
isConstantOrConstantVector(N1, /* No Opaques */ true)) {		isConstantOrConstantVector(N1, /* No Opaques */ true)) {
SDLoc DL(N);		SDLoc DL(N);
SDValue AllBits = DAG.getAllOnesConstant(DL, VT);		SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);		SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);		return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
}		}

// fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)		// fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
		// fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
// Variant of version done on multiply, except mul by a power of 2 is turned		// Variant of version done on multiply, except mul by a power of 2 is turned
// into a shift.		// into a shift.
if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&		if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
		N0.getNode()->hasOneUse() &&
isConstantOrConstantVector(N1, /* No Opaques */ true) &&		isConstantOrConstantVector(N1, /* No Opaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {		isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);		SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);		SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
AddToWorklist(Shl0.getNode());		AddToWorklist(Shl0.getNode());
AddToWorklist(Shl1.getNode());		AddToWorklist(Shl1.getNode());
return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);		return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
}		}

// fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)		// fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&		if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&
isConstantOrConstantVector(N1, /* No Opaques */ true) &&		isConstantOrConstantVector(N1, /* No Opaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {		isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);		SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
if (isConstantOrConstantVector(Shl))		if (isConstantOrConstantVector(Shl))
▲ Show 20 Lines • Show All 11,316 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/fneg-fabs.f16.ll

Show First 20 Lines • Show All 64 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%fabs = call half @llvm.fabs.f16(half %val)		%fabs = call half @llvm.fabs.f16(half %val)
%fsub = fsub half -0.0, %fabs		%fsub = fsub half -0.0, %fabs
store half %fsub, half addrspace(1)* %out, align 2		store half %fsub, half addrspace(1)* %out, align 2
ret void		ret void
}		}

; FIXME: single bit op		; FIXME: single bit op
; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:		; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:
; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}		; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]],
		; CI: v_or_b32_e32 v{{[0-9]+}}, [[SHL]],
		; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000,
		; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]		; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD		; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],		; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
; CIVI: flat_store_dword		; CIVI: flat_store_dword

; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}		; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {		define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)		%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
%fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs		%fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out		store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out
ret void		ret void
}		}

; GCN-LABEL: {{^}}fneg_fabs_v4f16:		; GCN-LABEL: {{^}}fneg_fabs_v4f16:
; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}		; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],		; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]]
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],		; CI: v_or_b32_e32 v{{[0-9]+}}, [[SHL0]],
		; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]]
		; CI: v_or_b32_e32 v{{[0-9]+}}, [[SHL1]],
		tstellar (inline comment): Does this test generate extra shl instructions now, or did it generate those before the patch and there just weren't any check lines for it?
		RKSimon (author, inline comment): To confirm, the shifts were always there but the test didn't check for them.
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],		; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],		; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]		; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD		; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],		; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD		; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],		; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],

▲ Show 20 Lines • Show All 61 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll

	Show All 21 Lines
	; CHECK-LABEL: {{^}}ds_bpermute_imm_index:			; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
	; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64			; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
	define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {			define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
	%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0			%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
	store i32 %bpermute, i32 addrspace(1)* %out, align 4			store i32 %bpermute, i32 addrspace(1)* %out, align 4
	ret void			ret void
	}			}

				; CHECK-LABEL: {{^}}ds_bpermute_add_shl:
				; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
				; CHECK: s_waitcnt lgkmcnt
				define void @ds_bpermute_add_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
				%index = add i32 %base_index, 1
				%byte_index = shl i32 %index, 2
				%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
				store i32 %bpermute, i32 addrspace(1)* %out, align 4
				ret void
				}

				; CHECK-LABEL: {{^}}ds_bpermute_or_shl:
				; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
				; CHECK: s_waitcnt lgkmcnt
				define void @ds_bpermute_or_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
				%masked = and i32 %base_index, 62
				%index = or i32 %masked, 1
				%byte_index = shl i32 %index, 2
				%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
				store i32 %bpermute, i32 addrspace(1)* %out, align 4
				ret void
				}

	attributes #0 = { nounwind readnone convergent }			attributes #0 = { nounwind readnone convergent }

test/CodeGen/AMDGPU/shl.ll

	Show First 20 Lines • Show All 470 Lines • ▼ Show 20 Lines
	; FUNC-LABEL: {{^}}test_mul2:			; FUNC-LABEL: {{^}}test_mul2:
	; GCN: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1			; GCN: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
	define amdgpu_kernel void @test_mul2(i32 %p) {			define amdgpu_kernel void @test_mul2(i32 %p) {
	%i = mul i32 %p, 2			%i = mul i32 %p, 2
	store volatile i32 %i, i32 addrspace(1)* undef			store volatile i32 %i, i32 addrspace(1)* undef
	ret void			ret void
	}			}

				; FUNC-LABEL: {{^}}shl_or_k:
				; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
				; SI: v_or_b32_e32 [[OR:v[0-9]+]], 4, [[SHL]]
				; SI: buffer_store_dword [[OR]]
				define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
				%tmp0 = or i32 %in, 1
				%tmp2 = shl i32 %tmp0, 2
				store i32 %tmp2, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}shl_or_k_two_uses:
				; SI: v_or_b32_e32 [[OR:v[0-9]+]], 1, v{{[0-9]+}}
				; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, [[OR]]
				; SI-DAG: buffer_store_dword [[OR]]
				; SI-DAG: buffer_store_dword [[SHL]]
				define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
				%tmp0 = or i32 %in, 1
				%tmp2 = shl i32 %tmp0, 2
				store i32 %tmp2, i32 addrspace(1)* %out0
				store i32 %tmp0, i32 addrspace(1)* %out1
				ret void
				}

	attributes #0 = { nounwind readnone }			attributes #0 = { nounwind readnone }

test/CodeGen/X86/combine-shl.ll

Show First 20 Lines • Show All 538 Lines • ▼ Show 20 Lines	; AVX-NEXT: retq
%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>		%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
ret <4 x i32> %2		ret <4 x i32> %2
}		}

; FIXME: fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)		; FIXME: fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {		define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_or0:		; SSE-LABEL: combine_vec_shl_or0:
; SSE: # BB#0:		; SSE: # BB#0:
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: pslld $2, %xmm0		; SSE-NEXT: pslld $2, %xmm0
		; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX-LABEL: combine_vec_shl_or0:		; AVX-LABEL: combine_vec_shl_or0:
; AVX: # BB#0:		; AVX: # BB#0:
		; AVX-NEXT: vpslld $2, %xmm0, %xmm0
; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1		; AVX-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0		; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpslld $2, %xmm0, %xmm0
; AVX-NEXT: retq		; AVX-NEXT: retq
%1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>		%1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
%2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>		%2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
ret <4 x i32> %2		ret <4 x i32> %2
}		}

define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {		define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
; SSE-LABEL: combine_vec_shl_or1:		; SSE-LABEL: combine_vec_shl_or1:
; SSE: # BB#0:		; SSE: # BB#0:
; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0		; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
		; SSE-NEXT: por {{.*}}(%rip), %xmm0
; SSE-NEXT: retq		; SSE-NEXT: retq
;		;
; AVX-LABEL: combine_vec_shl_or1:		; AVX-LABEL: combine_vec_shl_or1:
; AVX: # BB#0:		; AVX: # BB#0:
; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0		; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
		; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX-NEXT: retq		; AVX-NEXT: retq
%1 = or <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>		%1 = or <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>		%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
ret <4 x i32> %2		ret <4 x i32> %2
}		}

; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)		; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {		define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
Show All 29 Lines

This is an archive of the discontinued LLVM Phabricator instance.

DAGCombine: (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 102124

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

test/CodeGen/AMDGPU/fneg-fabs.f16.ll

test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll

test/CodeGen/AMDGPU/shl.ll

test/CodeGen/X86/combine-shl.ll

This is an archive of the discontinued LLVM Phabricator instance.

DAGCombine: (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 102124

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

test/CodeGen/AMDGPU/fneg-fabs.f16.ll

test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll

test/CodeGen/AMDGPU/shl.ll

test/CodeGen/X86/combine-shl.ll

DAGCombine: (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
ClosedPublic