Index: lib/Target/X86/X86InstrVecCompiler.td
===================================================================
--- lib/Target/X86/X86InstrVecCompiler.td
+++ lib/Target/X86/X86InstrVecCompiler.td
@@ -360,3 +360,62 @@
   defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8,
                                  v16i32, loadv4i64, sub_ymm>;
 }
+
+// List of opcodes guaranteed to zero the upper elements of vector regs.
+// TODO: Ideally this would be a blacklist instead of a whitelist. But SHA
+// intrinsics and some MMX->XMM move instructions that aren't VEX encoded make
+// this difficult. So we start with a couple of opcodes used by reduction
+// loops where we explicitly insert zeros.
+class veczeroupper<ValueType vt, RegisterClass RC> :
+  PatLeaf<(vt RC:$src), [{
+    return N->getOpcode() == X86ISD::VPMADDWD;
+  }]>;
+
+def zeroupperv2f64 : veczeroupper<v2f64, VR128>;
+def zeroupperv4f32 : veczeroupper<v4f32, VR128>;
+def zeroupperv2i64 : veczeroupper<v2i64, VR128>;
+def zeroupperv4i32 : veczeroupper<v4i32, VR128>;
+def zeroupperv8i16 : veczeroupper<v8i16, VR128>;
+def zeroupperv16i8 : veczeroupper<v16i8, VR128>;
+
+def zeroupperv4f64 : veczeroupper<v4f64, VR256>;
+def zeroupperv8f32 : veczeroupper<v8f32, VR256>;
+def zeroupperv4i64 : veczeroupper<v4i64, VR256>;
+def zeroupperv8i32 : veczeroupper<v8i32, VR256>;
+def zeroupperv16i16 : veczeroupper<v16i16, VR256>;
+def zeroupperv32i8 : veczeroupper<v32i8, VR256>;
+
+
+// If we can guarantee the upper elements have already been zeroed we can elide
+// an explicit zeroing.
+multiclass subvector_zero_ellision<RegisterClass RC, ValueType DstTy,
+                                   ValueType SrcTy, ValueType ZeroTy,
+                                   SubRegIndex SubIdx, PatLeaf Zeroupper> {
+  def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
+                                     Zeroupper:$src, (iPTR 0))),
+            (SUBREG_TO_REG (i64 0), RC:$src, SubIdx)>;
+}
+
+// 128->256
+defm: subvector_zero_ellision<VR128, v4f64, v2f64, v8i32, sub_xmm, zeroupperv2f64>;
+defm: subvector_zero_ellision<VR128, v8f32, v4f32, v8i32, sub_xmm, zeroupperv4f32>;
+defm: subvector_zero_ellision<VR128, v4i64, v2i64, v8i32, sub_xmm, zeroupperv2i64>;
+defm: subvector_zero_ellision<VR128, v8i32, v4i32, v8i32, sub_xmm, zeroupperv4i32>;
+defm: subvector_zero_ellision<VR128, v16i16, v8i16, v8i32, sub_xmm, zeroupperv8i16>;
+defm: subvector_zero_ellision<VR128, v32i8, v16i8, v8i32, sub_xmm, zeroupperv16i8>;
+
+// 128->512
+defm: subvector_zero_ellision<VR128, v8f64, v2f64, v16i32, sub_xmm, zeroupperv2f64>;
+defm: subvector_zero_ellision<VR128, v16f32, v4f32, v16i32, sub_xmm, zeroupperv4f32>;
+defm: subvector_zero_ellision<VR128, v8i64, v2i64, v16i32, sub_xmm, zeroupperv2i64>;
+defm: subvector_zero_ellision<VR128, v16i32, v4i32, v16i32, sub_xmm, zeroupperv4i32>;
+defm: subvector_zero_ellision<VR128, v32i16, v8i16, v16i32, sub_xmm, zeroupperv8i16>;
+defm: subvector_zero_ellision<VR128, v64i8, v16i8, v16i32, sub_xmm, zeroupperv16i8>;
+
+// 256->512
+defm: subvector_zero_ellision<VR256, v8f64, v4f64, v16i32, sub_ymm, zeroupperv4f64>;
+defm: subvector_zero_ellision<VR256, v16f32, v8f32, v16i32, sub_ymm, zeroupperv8f32>;
+defm: subvector_zero_ellision<VR256, v8i64, v4i64, v16i32, sub_ymm, zeroupperv4i64>;
+defm: subvector_zero_ellision<VR256, v16i32, v8i32, v16i32, sub_ymm, zeroupperv8i32>;
+defm: subvector_zero_ellision<VR256, v32i16, v16i16, v16i32, sub_ymm, zeroupperv16i16>;
+defm: subvector_zero_ellision<VR256, v64i8, v32i8, v16i32, sub_ymm, zeroupperv32i8>;
Index: test/CodeGen/X86/madd.ll
===================================================================
--- test/CodeGen/X86/madd.ll
+++ test/CodeGen/X86/madd.ll
@@ -40,7 +40,6 @@
 ; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX2-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm1
 ; AVX2-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
-; AVX2-NEXT:    vmovdqa %xmm1, %xmm1
 ; AVX2-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    addq $8, %rcx
 ; AVX2-NEXT:    cmpq %rcx, %rax
@@ -65,7 +64,6 @@
 ; AVX512-NEXT:    # =>This Inner Loop Header: Depth=1
 ; AVX512-NEXT:    vmovdqu (%rsi,%rcx,2), %xmm1
 ; AVX512-NEXT:    vpmaddwd (%rdi,%rcx,2), %xmm1, %xmm1
-; AVX512-NEXT:    vmovdqa %xmm1, %xmm1
 ; AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    addq $8, %rcx
 ; AVX512-NEXT:    cmpq %rcx, %rax
@@ -314,7 +312,6 @@
 ; AVX512-NEXT:    vpmovsxbw (%rdi,%rcx), %ymm1
 ; AVX512-NEXT:    vpmovsxbw (%rsi,%rcx), %ymm2
 ; AVX512-NEXT:    vpmaddwd %ymm1, %ymm2, %ymm1
-; AVX512-NEXT:    vmovdqa %ymm1, %ymm1
 ; AVX512-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT:    addq $16, %rcx
 ; AVX512-NEXT:    cmpq %rcx, %rax
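
Background note: the madd.ll loops touched above come from a widening 16-bit
multiply-accumulate reduction. Each iteration produces a 128-bit (or 256-bit)
vpmaddwd result that is inserted into a zeroed wider vector before being added
to the ymm/zmm accumulator. Because the VEX-encoded vpmaddwd already zeroes the
upper lanes of its destination register, the explicit zeroing move (the dropped
vmovdqa %xmm1, %xmm1 / vmovdqa %ymm1, %ymm1) is redundant and the new patterns
let SUBREG_TO_REG stand in for it. A rough C sketch of the kind of source loop
involved, purely as an illustration (not copied from the test file, and the
function name is made up):

    /* Illustrative only: a widening i16 dot product of the sort madd.ll
       exercises; the i16*i16 -> i32 multiply-add maps onto vpmaddwd. */
    int dot_product_s16(const short *a, const short *b, int n) {
      int sum = 0;
      for (int i = 0; i < n; i++)
        sum += a[i] * b[i];   /* i16 * i16 products accumulated into i32 */
      return sum;
    }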