Diff 115195

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,589 Lines • ▼ Show 20 Lines	if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
isConstantOrConstantVector(N1, /* No Opaques */ true)) {		isConstantOrConstantVector(N1, /* No Opaques */ true)) {
SDLoc DL(N);		SDLoc DL(N);
SDValue AllBits = DAG.getAllOnesConstant(DL, VT);		SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);		SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);		return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
}		}

// fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)		// fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
		// fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
// Variant of version done on multiply, except mul by a power of 2 is turned		// Variant of version done on multiply, except mul by a power of 2 is turned
// into a shift.		// into a shift.
if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&		if ((N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::OR) &&
		N0.getNode()->hasOneUse() &&
isConstantOrConstantVector(N1, /* No Opaques */ true) &&		isConstantOrConstantVector(N1, /* No Opaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {		isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);		SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);		SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
AddToWorklist(Shl0.getNode());		AddToWorklist(Shl0.getNode());
AddToWorklist(Shl1.getNode());		AddToWorklist(Shl1.getNode());
return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);		return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, Shl0, Shl1);
}		}

// fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)		// fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&		if (N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse() &&
isConstantOrConstantVector(N1, /* No Opaques */ true) &&		isConstantOrConstantVector(N1, /* No Opaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {		isConstantOrConstantVector(N0.getOperand(1), /* No Opaques */ true)) {
SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);		SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
if (isConstantOrConstantVector(Shl))		if (isConstantOrConstantVector(Shl))
▲ Show 20 Lines • Show All 11,857 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll

Show First 20 Lines • Show All 64 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
%fabs = call half @llvm.fabs.f16(half %val)		%fabs = call half @llvm.fabs.f16(half %val)
%fsub = fsub half -0.0, %fabs		%fsub = fsub half -0.0, %fabs
store half %fsub, half addrspace(1)* %out, align 2		store half %fsub, half addrspace(1)* %out, align 2
ret void		ret void
}		}

; FIXME: single bit op		; FIXME: single bit op
; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:		; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:
; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}		; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
; CI: v_or_b32_e32 [[OR:v[0-9]+]], [[MASK]], v{{[0-9]+}}		; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[OR]]		; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SHL]]		; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]		; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD		; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],		; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
; CIVI: flat_store_dword		; CIVI: flat_store_dword

; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}		; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {		define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)		%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
%fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs		%fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out		store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out
ret void		ret void
}		}

; GCN-LABEL: {{^}}fneg_fabs_v4f16:		; GCN-LABEL: {{^}}fneg_fabs_v4f16:
; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}		; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
; CI: v_or_b32_e32 [[OR00:v[0-9]+]], [[MASK]], v{{[0-9]+}}		; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, v{{[0-9]+}}
; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, [[OR00]]		; CI: v_or_b32_e32 [[OR0:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
; CI: v_or_b32_e32 [[OR01:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]		; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, v{{[0-9]+}}
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR01]]		; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
; CI: v_or_b32_e32 [[OR10:v[0-9]+]], [[MASK]], v{{[0-9]+}}		; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, [[OR10]]		; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
; CI: v_or_b32_e32 [[OR11:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]		; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR11]]
; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]		; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD		; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],		; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD		; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],		; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],

; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000		; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}		; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
▲ Show 20 Lines • Show All 59 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll

	Show All 21 Lines
	; CHECK-LABEL: {{^}}ds_bpermute_imm_index:			; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
	; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64			; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
	define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {			define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
	%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0			%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
	store i32 %bpermute, i32 addrspace(1)* %out, align 4			store i32 %bpermute, i32 addrspace(1)* %out, align 4
	ret void			ret void
	}			}

				; CHECK-LABEL: {{^}}ds_bpermute_add_shl:
				; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
				; CHECK: s_waitcnt lgkmcnt
				define void @ds_bpermute_add_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
				%index = add i32 %base_index, 1
				%byte_index = shl i32 %index, 2
				%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
				store i32 %bpermute, i32 addrspace(1)* %out, align 4
				ret void
				}

				; CHECK-LABEL: {{^}}ds_bpermute_or_shl:
				; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
				; CHECK: s_waitcnt lgkmcnt
				define void @ds_bpermute_or_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
				%masked = and i32 %base_index, 62
				%index = or i32 %masked, 1
				%byte_index = shl i32 %index, 2
				%bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
				store i32 %bpermute, i32 addrspace(1)* %out, align 4
				ret void
				}

	attributes #0 = { nounwind readnone convergent }			attributes #0 = { nounwind readnone convergent }

llvm/trunk/test/CodeGen/AMDGPU/shl.ll

	Show First 20 Lines • Show All 470 Lines • ▼ Show 20 Lines
	; FUNC-LABEL: {{^}}test_mul2:			; FUNC-LABEL: {{^}}test_mul2:
	; GCN: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1			; GCN: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
	define amdgpu_kernel void @test_mul2(i32 %p) {			define amdgpu_kernel void @test_mul2(i32 %p) {
	%i = mul i32 %p, 2			%i = mul i32 %p, 2
	store volatile i32 %i, i32 addrspace(1)* undef			store volatile i32 %i, i32 addrspace(1)* undef
	ret void			ret void
	}			}

				; FUNC-LABEL: {{^}}shl_or_k:
				; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
				; SI: v_or_b32_e32 [[OR:v[0-9]+]], 4, [[SHL]]
				; SI: buffer_store_dword [[OR]]
				define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
				%tmp0 = or i32 %in, 1
				%tmp2 = shl i32 %tmp0, 2
				store i32 %tmp2, i32 addrspace(1)* %out
				ret void
				}

				; FUNC-LABEL: {{^}}shl_or_k_two_uses:
				; SI: v_or_b32_e32 [[OR:v[0-9]+]], 1, v{{[0-9]+}}
				; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, [[OR]]
				; SI-DAG: buffer_store_dword [[OR]]
				; SI-DAG: buffer_store_dword [[SHL]]
				define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
				%tmp0 = or i32 %in, 1
				%tmp2 = shl i32 %tmp0, 2
				store i32 %tmp2, i32 addrspace(1)* %out0
				store i32 %tmp0, i32 addrspace(1)* %out1
				ret void
				}

	attributes #0 = { nounwind readnone }			attributes #0 = { nounwind readnone }

llvm/trunk/test/CodeGen/X86/combine-shl.ll

	Show First 20 Lines • Show All 531 Lines • ▼ Show 20 Lines
	; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0			; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
	; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0			; AVX-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%1 = add <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>			%1 = add <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
	%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>			%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
	ret <4 x i32> %2			ret <4 x i32> %2
	}			}

	; FIXME: fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)			; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
	define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {			define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
	; SSE-LABEL: combine_vec_shl_or0:			; SSE-LABEL: combine_vec_shl_or0:
	; SSE: # BB#0:			; SSE: # BB#0:
	; SSE-NEXT: por {{.*}}(%rip), %xmm0
	; SSE-NEXT: pslld $2, %xmm0			; SSE-NEXT: pslld $2, %xmm0
				; SSE-NEXT: por {{.*}}(%rip), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: combine_vec_shl_or0:			; AVX-LABEL: combine_vec_shl_or0:
	; AVX: # BB#0:			; AVX: # BB#0:
	; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5]
	; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vpslld $2, %xmm0, %xmm0			; AVX-NEXT: vpslld $2, %xmm0, %xmm0
				; AVX-NEXT: vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
				; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>			%1 = or <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
	%2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>			%2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
	ret <4 x i32> %2			ret <4 x i32> %2
	}			}

	define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {			define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
	; SSE-LABEL: combine_vec_shl_or1:			; SSE-LABEL: combine_vec_shl_or1:
	; SSE: # BB#0:			; SSE: # BB#0:
	; SSE-NEXT: por {{.*}}(%rip), %xmm0
	; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0			; SSE-NEXT: pmulld {{.*}}(%rip), %xmm0
				; SSE-NEXT: por {{.*}}(%rip), %xmm0
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: combine_vec_shl_or1:			; AVX-LABEL: combine_vec_shl_or1:
	; AVX: # BB#0:			; AVX: # BB#0:
	; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
	; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0			; AVX-NEXT: vpsllvd {{.*}}(%rip), %xmm0, %xmm0
				; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%1 = or <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>			%1 = or <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
	%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>			%2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>
	ret <4 x i32> %2			ret <4 x i32> %2
	}			}

	; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)			; fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
	define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {			define <4 x i32> @combine_vec_shl_mul0(<4 x i32> %x) {
	Show All 29 Lines

This is an archive of the discontinued LLVM Phabricator instance.

DAGCombine: (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 115195

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll

llvm/trunk/test/CodeGen/AMDGPU/shl.ll

llvm/trunk/test/CodeGen/X86/combine-shl.ll

This is an archive of the discontinued LLVM Phabricator instance.

DAGCombine: (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 115195

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

llvm/trunk/test/CodeGen/AMDGPU/fneg-fabs.f16.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll

llvm/trunk/test/CodeGen/AMDGPU/shl.ll

llvm/trunk/test/CodeGen/X86/combine-shl.ll

DAGCombine: (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
ClosedPublic