This is an archive of the discontinued LLVM Phabricator instance.

Differential D20529

[x86, AVX] allow explicit calls to VZERO* to modify state in VZeroUpperInserter pass
ClosedPublic

Authored by spatel on May 23 2016, 10:16 AM.

Download Raw Diff

Details

Reviewers

qcolombet
RKSimon
aaboud

Commits

rG3955360b24c7: [x86, AVX] allow explicit calls to VZERO* to modify state in VZeroUpperInserter…
rL270718: [x86, AVX] allow explicit calls to VZERO* to modify state in…

Summary

Although this fixes the duplicate VZ* instructions in the existing tests, we still have more problems.

For example, why does a VZU call cause this stack spill which then leads to yet another VZU?

define <4 x float> @avx_in_sse_out(<8 x float> %x) nounwind {
; CHECK-LABEL: avx_in_sse_out:
; CHECK:       # BB#0:
; CHECK-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) # 32-byte Spill
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
;
  %xmm = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  call void @llvm.x86.avx.vzeroupper()
  ret <4 x float> %xmm
}

Diff Detail

Repository: rL LLVM

Event Timeline

spatel updated this revision to Diff 58111. · May 23 2016, 10:16 AM

spatel retitled this revision to [x86, AVX] allow explicit calls to VZERO* to modify state in VZeroUpperInserter pass.

spatel updated this object.

spatel added reviewers: RKSimon, aaboud, qcolombet.

spatel added a subscriber: llvm-commits.

Herald added a subscriber: mcrosier. · View Herald Transcript · May 23 2016, 10:16 AM

Your changes LGTM, Sanjay.

For the example you gave, it looks like a problem in coalescing. To eliminate a copy, coalescing extends a VR256 lifetime across the vzeroupper:

# *** IR Dump After Live Interval Analysis ***:
# Machine code for function avx_in_sse_out: Properties: <Post SSA, tracking liveness, HasVRegs>
Function Live Ins: %YMM0 in %vreg0

0B      BB#0: derived from LLVM BB %0
            Live Ins: %YMM0
16B             %vreg0<def> = COPY %YMM0; VR256:%vreg0
32B             %vreg1<def> = COPY %vreg0:sub_xmm; VR128:%vreg1 VR256:%vreg0
48B             VZEROUPPER
64B             %XMM0<def> = COPY %vreg1; VR128:%vreg1
80B             RET 0, %XMM0

# End machine code for function avx_in_sse_out.

# *** IR Dump After Simple Register Coalescing ***:
# Machine code for function avx_in_sse_out: Properties: <Post SSA, tracking liveness, HasVRegs>
Function Live Ins: %YMM0 in %vreg0

0B      BB#0: derived from LLVM BB %0
            Live Ins: %YMM0
16B             %vreg0<def> = COPY %YMM0; VR256:%vreg0
48B             VZEROUPPER
64B             %XMM0<def> = COPY %vreg0:sub_xmm; VR256:%vreg0
80B             RET 0, %XMM0

Not good! In this case, coalescing would be better off doing this:

# *** IR Dump After Simple Register Coalescing ***:
# Machine code for function avx_in_sse_out: Properties: <Post SSA, tracking liveness, HasVRegs>
Function Live Ins: %YMM0 in %vreg0

0B      BB#0: derived from LLVM BB %0
            Live Ins: %YMM0
16B             %vreg1<def> = COPY %YMM0:sub_xmm; VR128:%vreg1
48B             VZEROUPPER
64B             %XMM0<def> = COPY %vreg1; VR128:%vreg1
80B             RET 0, %XMM0

Thanks, Dave! I'll check this part in and then update https://llvm.org/bugs/show_bug.cgi?id=27823 with your analysis.

Closed by commit rL270718: [x86, AVX] allow explicit calls to VZERO* to modify state in… (authored by spatel). · Explain Why · May 25 2016, 9:46 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

X86/

X86VZeroUpper.cpp

13 lines

test/

CodeGen/

X86/

vzero-excess.ll

7 lines

Diff 58444

llvm/trunk/lib/Target/X86/X86VZeroUpper.cpp

Show First 20 Lines • Show All 182 Lines • ▼ Show 20 Lines	void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {

for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {		for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
MachineInstr *MI = I;		MachineInstr *MI = I;
// No need for vzeroupper before iret in interrupt handler function,		// No need for vzeroupper before iret in interrupt handler function,
// epilogue will restore YMM registers if needed.		// epilogue will restore YMM registers if needed.
bool IsReturnFromX86INTR = IsX86INTR && MI->isReturn();		bool IsReturnFromX86INTR = IsX86INTR && MI->isReturn();
bool IsControlFlow = MI->isCall() || MI->isReturn();		bool IsControlFlow = MI->isCall() || MI->isReturn();

// Shortcut: don't need to check regular instructions in dirty state.		// An existing VZERO* instruction resets the state.
if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)		if (MI->getOpcode() == X86::VZEROALL ||
		MI->getOpcode() == X86::VZEROUPPER) {
		CurState = EXITS_CLEAN;
continue;		continue;
		}

// Ignore existing VZERO* instructions.		// Shortcut: don't need to check regular instructions in dirty state.
// FIXME: The existence of these instructions should be used to modify the		if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
// current state and/or used when deciding whether we need to create a VZU.
if (MI->getOpcode() == X86::VZEROALL || MI->getOpcode() == X86::VZEROUPPER)
continue;		continue;

if (hasYmmReg(MI)) {		if (hasYmmReg(MI)) {
// We found a ymm-using instruction; this could be an AVX instruction,		// We found a ymm-using instruction; this could be an AVX instruction,
// or it could be control flow.		// or it could be control flow.
CurState = EXITS_DIRTY;		CurState = EXITS_DIRTY;
continue;		continue;
}		}
▲ Show 20 Lines • Show All 123 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/vzero-excess.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s			; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s

	; FIXME: The vzeroupper added by the VZeroUpperInserter pass is unnecessary in these tests.			; In the following 4 tests, the existing call to VZU/VZA ensures clean state before
				; the call to the unknown, so we don't need to insert a second VZU at that point.

	define <4 x float> @zeroupper_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {			define <4 x float> @zeroupper_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {
	; CHECK-LABEL: zeroupper_v4f32:			; CHECK-LABEL: zeroupper_v4f32:
	; CHECK: # BB#0:			; CHECK: # BB#0:
	; CHECK-NEXT: pushq %rbx			; CHECK-NEXT: pushq %rbx
	; CHECK-NEXT: subq $48, %rsp			; CHECK-NEXT: subq $48, %rsp
	; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill			; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
	; CHECK-NEXT: movq %rdi, %rbx			; CHECK-NEXT: movq %rdi, %rbx
	; CHECK-NEXT: vzeroupper			; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: callq the_unknown			; CHECK-NEXT: callq the_unknown
	; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload			; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
	; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0			; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1			; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0			; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
	; CHECK-NEXT: addq $48, %rsp			; CHECK-NEXT: addq $48, %rsp
	; CHECK-NEXT: popq %rbx			; CHECK-NEXT: popq %rbx
	; CHECK-NEXT: vzeroupper			; CHECK-NEXT: vzeroupper
	Show All 9 Lines
	}			}

	define <8 x float> @zeroupper_v8f32(<8 x float> %x) nounwind {			define <8 x float> @zeroupper_v8f32(<8 x float> %x) nounwind {
	; CHECK-LABEL: zeroupper_v8f32:			; CHECK-LABEL: zeroupper_v8f32:
	; CHECK: # BB#0:			; CHECK: # BB#0:
	; CHECK-NEXT: subq $56, %rsp			; CHECK-NEXT: subq $56, %rsp
	; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill			; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
	; CHECK-NEXT: vzeroupper			; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: callq the_unknown			; CHECK-NEXT: callq the_unknown
	; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload			; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
	; CHECK-NEXT: addq $56, %rsp			; CHECK-NEXT: addq $56, %rsp
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	call void @llvm.x86.avx.vzeroupper()			call void @llvm.x86.avx.vzeroupper()
	call void @the_unknown()			call void @the_unknown()
	ret <8 x float> %x			ret <8 x float> %x
	}			}

	define <4 x float> @zeroall_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {			define <4 x float> @zeroall_v4f32(<8 x float> *%x, <8 x float> %y) nounwind {
	; CHECK-LABEL: zeroall_v4f32:			; CHECK-LABEL: zeroall_v4f32:
	; CHECK: # BB#0:			; CHECK: # BB#0:
	; CHECK-NEXT: pushq %rbx			; CHECK-NEXT: pushq %rbx
	; CHECK-NEXT: subq $48, %rsp			; CHECK-NEXT: subq $48, %rsp
	; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill			; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
	; CHECK-NEXT: movq %rdi, %rbx			; CHECK-NEXT: movq %rdi, %rbx
	; CHECK-NEXT: vzeroall			; CHECK-NEXT: vzeroall
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: callq the_unknown			; CHECK-NEXT: callq the_unknown
	; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload			; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
	; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0			; CHECK-NEXT: vaddps (%rbx), %ymm0, %ymm0
	; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1			; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
	; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0			; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
	; CHECK-NEXT: addq $48, %rsp			; CHECK-NEXT: addq $48, %rsp
	; CHECK-NEXT: popq %rbx			; CHECK-NEXT: popq %rbx
	; CHECK-NEXT: vzeroupper			; CHECK-NEXT: vzeroupper
	Show All 9 Lines
	}			}

	define <8 x float> @zeroall_v8f32(<8 x float> %x) nounwind {			define <8 x float> @zeroall_v8f32(<8 x float> %x) nounwind {
	; CHECK-LABEL: zeroall_v8f32:			; CHECK-LABEL: zeroall_v8f32:
	; CHECK: # BB#0:			; CHECK: # BB#0:
	; CHECK-NEXT: subq $56, %rsp			; CHECK-NEXT: subq $56, %rsp
	; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill			; CHECK-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
	; CHECK-NEXT: vzeroall			; CHECK-NEXT: vzeroall
	; CHECK-NEXT: vzeroupper
	; CHECK-NEXT: callq the_unknown			; CHECK-NEXT: callq the_unknown
	; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload			; CHECK-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload
	; CHECK-NEXT: addq $56, %rsp			; CHECK-NEXT: addq $56, %rsp
	; CHECK-NEXT: retq			; CHECK-NEXT: retq
	call void @llvm.x86.avx.vzeroall()			call void @llvm.x86.avx.vzeroall()
	call void @the_unknown()			call void @the_unknown()
	ret <8 x float> %x			ret <8 x float> %x
	}			}

	declare void @llvm.x86.avx.vzeroupper() nounwind readnone			declare void @llvm.x86.avx.vzeroupper() nounwind readnone
	declare void @llvm.x86.avx.vzeroall() nounwind readnone			declare void @llvm.x86.avx.vzeroall() nounwind readnone
	declare void @the_unknown() nounwind			declare void @the_unknown() nounwind