Index: lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- lib/Target/X86/X86ISelDAGToDAG.cpp
+++ lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -181,6 +181,7 @@
     bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
 
     void PreprocessISelDAG() override;
+    void PostprocessISelDAG() override;
 
 // Include the pieces autogenerated from the target description.
 #include "X86GenDAGISel.inc"
@@ -752,6 +753,70 @@
 }
 
 
+void X86DAGToDAGISel::PostprocessISelDAG() {
+  // Skip peepholes at -O0.
+  if (TM.getOptLevel() == CodeGenOpt::None)
+    return;
+
+  // Attempt to remove vector moves that were inserted to zero upper bits.
+
+  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+  ++Position;
+
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    // Skip dead nodes and any non-machine opcodes.
+    if (N->use_empty() || !N->isMachineOpcode())
+      continue;
+
+    if (N->getMachineOpcode() != TargetOpcode::SUBREG_TO_REG)
+      continue;
+
+    unsigned SubRegIdx = N->getConstantOperandVal(2);
+    if (SubRegIdx != X86::sub_xmm && SubRegIdx != X86::sub_ymm)
+      continue;
+
+    SDValue Move = N->getOperand(1);
+    if (!Move.isMachineOpcode())
+      continue;
+
+    // Make sure it's one of the move opcodes we recognize.
+    switch (Move.getMachineOpcode()) {
+    default:
+      continue;
+    case X86::VMOVAPDrr:       case X86::VMOVUPDrr:
+    case X86::VMOVAPSrr:       case X86::VMOVUPSrr:
+    case X86::VMOVDQArr:       case X86::VMOVDQUrr:
+    case X86::VMOVAPDYrr:      case X86::VMOVUPDYrr:
+    case X86::VMOVAPSYrr:      case X86::VMOVUPSYrr:
+    case X86::VMOVDQAYrr:      case X86::VMOVDQUYrr:
+    case X86::VMOVAPDZ128rr:   case X86::VMOVUPDZ128rr:
+    case X86::VMOVAPSZ128rr:   case X86::VMOVUPSZ128rr:
+    case X86::VMOVDQA32Z128rr: case X86::VMOVDQU32Z128rr:
+    case X86::VMOVDQA64Z128rr: case X86::VMOVDQU64Z128rr:
+    case X86::VMOVAPDZ256rr:   case X86::VMOVUPDZ256rr:
+    case X86::VMOVAPSZ256rr:   case X86::VMOVUPSZ256rr:
+    case X86::VMOVDQA32Z256rr: case X86::VMOVDQU32Z256rr:
+    case X86::VMOVDQA64Z256rr: case X86::VMOVDQU64Z256rr:
+      break;
+    }
+
+    SDValue In = Move.getOperand(0);
+    if (!In.isMachineOpcode() ||
+        In.getMachineOpcode() <= TargetOpcode::GENERIC_OP_END)
+      continue;
+
+    // The producing instruction is another vector instruction, so we can
+    // drop the move.
+    CurDAG->UpdateNodeOperands(N, N->getOperand(0), In, N->getOperand(2));
+
+    // If the move is now dead, delete it.
+    if (Move.getNode()->use_empty())
+      CurDAG->RemoveDeadNode(Move.getNode());
+  }
+}
+
+
 /// Emit any code that needs to be executed only in the main function.
 void X86DAGToDAGISel::emitSpecialCodeForMain() {
   if (Subtarget->isTargetCygMing()) {
Index: lib/Target/X86/X86InstrVecCompiler.td
===================================================================
--- lib/Target/X86/X86InstrVecCompiler.td
+++ lib/Target/X86/X86InstrVecCompiler.td
@@ -360,67 +360,6 @@
                               v16i32, loadv4i64, sub_ymm>;
 }
 
-// List of opcodes that guaranteed to zero the upper elements of vector regs.
-// TODO: Ideally this would be a blacklist instead of a whitelist. But SHA
-// intrinsics and some MMX->XMM move instructions that aren't VEX encoded make
-// this difficult. So starting with a couple opcodes used by reduction loops
-// where we explicitly insert zeros.
-class veczeroupper<ValueType vt, RegisterClass RC> :
-  PatLeaf<(vt RC:$src), [{
-    return N->getOpcode() == X86ISD::VPMADDWD ||
-           N->getOpcode() == X86ISD::PSADBW;
-  }]>;
-
-def zeroupperv2f64 : veczeroupper<v2f64, VR128>;
-def zeroupperv4f32 : veczeroupper<v4f32, VR128>;
-def zeroupperv2i64 : veczeroupper<v2i64, VR128>;
-def zeroupperv4i32 : veczeroupper<v4i32, VR128>;
-def zeroupperv8i16 : veczeroupper<v8i16, VR128>;
-def zeroupperv16i8 : veczeroupper<v16i8, VR128>;
-
-def zeroupperv4f64 : veczeroupper<v4f64, VR256>;
-def zeroupperv8f32 : veczeroupper<v8f32, VR256>;
-def zeroupperv4i64 : veczeroupper<v4i64, VR256>;
-def zeroupperv8i32 : veczeroupper<v8i32, VR256>;
-def zeroupperv16i16 : veczeroupper<v16i16, VR256>;
-def zeroupperv32i8 : veczeroupper<v32i8, VR256>;
-
-
-// If we can guarantee the upper elements have already been zeroed we can elide
-// an explicit zeroing.
-multiclass subvector_zero_ellision<RegisterClass RC, ValueType DstTy,
-                                   ValueType ZeroTy, SubRegIndex SubIdx,
-                                   PatLeaf Zeroupper> {
-  def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
-                                     Zeroupper:$src, (iPTR 0))),
-            (SUBREG_TO_REG (i64 0), RC:$src, SubIdx)>;
-}
-
-// 128->256
-defm: subvector_zero_ellision<VR128, v4f64, v8i32, sub_xmm, zeroupperv2f64>;
-defm: subvector_zero_ellision<VR128, v8f32, v8i32, sub_xmm, zeroupperv4f32>;
-defm: subvector_zero_ellision<VR128, v4i64, v8i32, sub_xmm, zeroupperv2i64>;
-defm: subvector_zero_ellision<VR128, v8i32, v8i32, sub_xmm, zeroupperv4i32>;
-defm: subvector_zero_ellision<VR128, v16i16, v8i32, sub_xmm, zeroupperv8i16>;
-defm: subvector_zero_ellision<VR128, v32i8, v8i32, sub_xmm, zeroupperv16i8>;
-
-// 128->512
-defm: subvector_zero_ellision<VR128, v8f64, v16i32, sub_xmm, zeroupperv2f64>;
-defm: subvector_zero_ellision<VR128, v16f32, v16i32, sub_xmm, zeroupperv4f32>;
-defm: subvector_zero_ellision<VR128, v8i64, v16i32, sub_xmm, zeroupperv2i64>;
-defm: subvector_zero_ellision<VR128, v16i32, v16i32, sub_xmm, zeroupperv4i32>;
-defm: subvector_zero_ellision<VR128, v32i16, v16i32, sub_xmm, zeroupperv8i16>;
-defm: subvector_zero_ellision<VR128, v64i8, v16i32, sub_xmm, zeroupperv16i8>;
-
-// 256->512
-defm: subvector_zero_ellision<VR256, v8f64, v16i32, sub_ymm, zeroupperv4f64>;
-defm: subvector_zero_ellision<VR256, v16f32, v16i32, sub_ymm, zeroupperv8f32>;
-defm: subvector_zero_ellision<VR256, v8i64, v16i32, sub_ymm, zeroupperv4i64>;
-defm: subvector_zero_ellision<VR256, v16i32, v16i32, sub_ymm, zeroupperv8i32>;
-defm: subvector_zero_ellision<VR256, v32i16, v16i32, sub_ymm, zeroupperv16i16>;
-defm: subvector_zero_ellision<VR256, v64i8, v16i32, sub_ymm, zeroupperv32i8>;
-
-
 class maskzeroupper<ValueType vt, RegisterClass RC> :
   PatLeaf<(vt RC:$src), [{
     return isMaskZeroExtended(N);
Index: test/CodeGen/X86/avg.ll
===================================================================
--- test/CodeGen/X86/avg.ll
+++ test/CodeGen/X86/avg.ll
@@ -523,7 +523,6 @@
 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
 ; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
-; AVX512BW-NEXT: vmovdqa %ymm0, %ymm0
 ; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vmovdqu %ymm1, (%rax)
Index: test/CodeGen/X86/bitcast-setcc-128.ll
===================================================================
--- test/CodeGen/X86/bitcast-setcc-128.ll
+++ test/CodeGen/X86/bitcast-setcc-128.ll
@@ -658,7 +658,6 @@
 ; AVX2-LABEL: v16i8_widened_with_zeroes:
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa %xmm0, %xmm0
 ; AVX2-NEXT: vpmovmskb %ymm0, %eax
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
Index: test/CodeGen/X86/merge-consecutive-loads-256.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -597,7 +597,6 @@
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX-NEXT: vmovapd %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
 ; X32-AVX-LABEL: merge_4f64_f64_34uz_volatile:
@@ -605,7 +604,6 @@
 ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-AVX-NEXT: vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32-AVX-NEXT: vmovapd %xmm0, %xmm0
 ; X32-AVX-NEXT: retl
   %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
   %ptr1 = getelementptr inbounds double, double* %ptr, i64 4
Index: test/CodeGen/X86/vector-shuffle-variable-256.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -136,7 +136,6 @@
 ; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; ALL-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; ALL-NEXT: vmovaps %xmm0, %xmm0
 ; ALL-NEXT: movq %rbp, %rsp
 ; ALL-NEXT: popq %rbp
 ; ALL-NEXT: retq
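
For illustration only, a rough LLVM IR sketch (not one of the tests touched above; the function name is invented) of the kind of pattern involved, assuming an AVX2 target. A 128-bit VPMADDWD result is concatenated with zeros to form a 256-bit value, which instruction selection turns into a SUBREG_TO_REG fed by a register-to-register move such as "vmovdqa %xmm0, %xmm0". The removed TableGen patterns elided that move only for VPMADDWD/PSADBW; the new PostprocessISelDAG peephole instead drops it after isel whenever the move's input is already a target vector instruction.

; Hypothetical reduced example (names invented for illustration).
define <8 x i32> @pmaddwd_widened_with_zeroes(<8 x i16> %a, <8 x i16> %b) {
  ; 128-bit multiply-add; on AVX targets this selects to a VEX-encoded
  ; instruction, which already zeroes the upper bits of the YMM register.
  %mul = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a, <8 x i16> %b)
  ; Widen to 256 bits by concatenating with zeros; this is the
  ; insert_subvector-into-zero shape that becomes SUBREG_TO_REG.
  %widened = shufflevector <4 x i32> %mul, <4 x i32> zeroinitializer,
                           <8 x i32> <i32 0, i32 1, i32 2, i32 3,
                                      i32 4, i32 5, i32 6, i32 7>
  ret <8 x i32> %widened
}

declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>)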