Although this fixes the duplicate VZ* (vzeroupper/vzeroall) instructions in the existing tests, we still have more problems.
For example, why does an explicit vzeroupper (VZU) call cause the stack spill shown below, which then leads to yet another VZU?
; Takes a 256-bit <8 x float> argument and returns its low four elements as a
; 128-bit <4 x float>, with an explicit call to the vzeroupper intrinsic placed
; between computing the result and returning it.
;
; The assertions below pin the generated code exactly as it is today:
;   1. %ymm0 is spilled (32 bytes) to a stack slot below %rsp,
;   2. vzeroupper is emitted,
;   3. %ymm0 is reloaded (32 bytes) from the stack,
;   4. a second vzeroupper is emitted immediately before retq.
; NOTE(review): the spill/reload pair and the second vzeroupper look like
; redundant codegen rather than intended behavior -- confirm before treating
; this output as the desired result.
define <4 x float> @avx_in_sse_out(<8 x float> %x) nounwind {
; CHECK-LABEL: avx_in_sse_out:
; CHECK: # BB#0:
; CHECK-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) # 32-byte Spill
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: vmovups -{{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
;
; Select elements 0..3 of %x; the second shuffle operand is undef and is never
; referenced by the mask.
%xmm = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; Explicit intrinsic call; %xmm is still live across it because it is the
; value returned below.
call void @llvm.x86.avx.vzeroupper()
ret <4 x float> %xmm
}