Although this fixes the duplicate VZ* instructions in the existing tests, some problems remain.
For example, why does a call to the vzeroupper (VZU) intrinsic cause this stack spill, which then leads to yet another vzeroupper being emitted?
; Takes a 256-bit <8 x float> argument, extracts its low four lanes as a
; <4 x float>, explicitly calls the vzeroupper intrinsic, and returns the
; 128-bit value.
;
; The expected output below records the codegen as it currently stands: the
; incoming %ymm0 is spilled to the stack, a vzeroupper is emitted, the value
; is reloaded into %ymm0, and a second vzeroupper is emitted before the return.
; NOTE(review): the spill/reload pair and the second vzeroupper look redundant;
; the expectations pin existing behavior rather than the desired output.
define <4 x float> @avx_in_sse_out(<8 x float> %x) nounwind {
; CHECK-LABEL: avx_in_sse_out:
; CHECK:       # BB#0:
; CHECK-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp) # 32-byte Spill
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    retq
;
  ; Shuffle indices 0-3 select the low four elements of %x (the second operand is undef and unused).
  %xmm = shufflevector <8 x float> %x, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ; Explicit vzeroupper request; %xmm is defined before this call and used after it.
  call void @llvm.x86.avx.vzeroupper()
  ret <4 x float> %xmm
}