Index: llvm/lib/Target/X86/X86.td =================================================================== --- llvm/lib/Target/X86/X86.td +++ llvm/lib/Target/X86/X86.td @@ -445,6 +445,10 @@ "HasLZCNTFalseDeps", "true", "LZCNT/TZCNT have a false dependency on dest register">; +def TuningSBBFalseDeps : SubtargetFeature<"false-deps-sbb", + "HasSBBFalseDeps", "true", + "SBB has a false dependency on same src register">; + // On recent X86 (port bound) processors, its preferable to combine to a single shuffle // using a variable mask over multiple fixed shuffles. def TuningFastVariableCrossLaneShuffle @@ -653,6 +657,7 @@ // Nehalem list NHMFeatures = X86_64V2Features; list NHMTuning = [TuningMacroFusion, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; // Westmere @@ -673,6 +678,7 @@ TuningFastSHLDRotate, TuningFast15ByteNOP, TuningPOPCNTFalseDeps, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; list SNBFeatures = !listconcat(WSMFeatures, SNBAdditionalFeatures); @@ -704,6 +710,7 @@ TuningFastVariablePerLaneShuffle, TuningPOPCNTFalseDeps, TuningLZCNTFalseDeps, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; list HSWFeatures = !listconcat(IVBFeatures, HSWAdditionalFeatures); @@ -732,6 +739,7 @@ TuningFastVariableCrossLaneShuffle, TuningFastVariablePerLaneShuffle, TuningPOPCNTFalseDeps, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; list SKLFeatures = !listconcat(BDWFeatures, SKLAdditionalFeatures); @@ -760,6 +768,7 @@ TuningFastVariablePerLaneShuffle, TuningPrefer256Bit, TuningPOPCNTFalseDeps, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; list SKXFeatures = !listconcat(BDWFeatures, SKXAdditionalFeatures); @@ -797,6 +806,7 @@ TuningFastVariableCrossLaneShuffle, TuningFastVariablePerLaneShuffle, TuningPrefer256Bit, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; list CNLFeatures = !listconcat(SKLFeatures, CNLAdditionalFeatures); @@ -822,6 +832,7 @@ TuningFastVariableCrossLaneShuffle, TuningFastVariablePerLaneShuffle, TuningPrefer256Bit, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; 
list ICLFeatures = !listconcat(CNLFeatures, ICLAdditionalFeatures); @@ -886,6 +897,7 @@ TuningSlowTwoMemOps, TuningLEAUsesAG, TuningPadShortFunctions, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; // Silvermont @@ -904,6 +916,7 @@ TuningFast7ByteNOP, TuningFastMOVBE, TuningPOPCNTFalseDeps, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; list SLMFeatures = !listconcat(AtomFeatures, SLMAdditionalFeatures); @@ -924,6 +937,7 @@ TuningSlowIncDec, TuningFastMOVBE, TuningPOPCNTFalseDeps, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; list GLMFeatures = !listconcat(SLMFeatures, GLMAdditionalFeatures); @@ -936,6 +950,7 @@ TuningSlowLEA, TuningSlowIncDec, TuningFastMOVBE, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]; list GLPFeatures = !listconcat(GLMFeatures, GLPAdditionalFeatures); @@ -1011,6 +1026,7 @@ TuningPreferMaskRegisters, TuningFastGather, TuningFastMOVBE, + TuningSBBFalseDeps, TuningSlowPMADDWD]; // TODO Add AVX5124FMAPS/AVX5124VNNIW features list KNMFeatures = @@ -1217,6 +1233,7 @@ TuningSlowDivide64, TuningSlowIncDec, TuningMacroFusion, + TuningSBBFalseDeps, TuningInsertVZEROUPPER]>; def : Proc<"i386", [FeatureX87], @@ -1511,6 +1528,7 @@ TuningSlowDivide64, TuningSlowIncDec, TuningMacroFusion, + TuningSBBFalseDeps, TuningInsertVZEROUPPER ]>; Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -464,8 +464,13 @@ } // Copy flags to the EFLAGS register and glue it to next node. - SDValue EFLAGS = CurDAG->getCopyToReg( - CurDAG->getEntryNode(), dl, X86::EFLAGS, N->getOperand(2), SDValue()); + unsigned Opcode = N->getOpcode(); + assert((Opcode == X86ISD::SBB || Opcode == X86ISD::SETCC_CARRY) && + "Unexpected opcode for SBB materialization"); + unsigned FlagOpIndex = Opcode == X86ISD::SBB ? 
2 : 1; + SDValue EFLAGS = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, + N->getOperand(FlagOpIndex), SDValue()); // Create a 64-bit instruction if the result is 64-bits otherwise use the // 32-bit version. @@ -5801,21 +5806,26 @@ break; case X86ISD::SETCC_CARRY: { - // We have to do this manually because tblgen will put the eflags copy in - // the wrong place if we use an extract_subreg in the pattern. MVT VT = Node->getSimpleValueType(0); + SDValue Result; + if (Subtarget->hasSBBFalseDeps()) { + Result = getSBBZero(Node); + } else { + // We have to do this manually because tblgen will put the eflags copy in + // the wrong place if we use an extract_subreg in the pattern. + // Copy flags to the EFLAGS register and glue it to next node. + SDValue EFLAGS = + CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, + Node->getOperand(1), SDValue()); - // Copy flags to the EFLAGS register and glue it to next node. - SDValue EFLAGS = - CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS, - Node->getOperand(1), SDValue()); - - // Create a 64-bit instruction if the result is 64-bits otherwise use the - // 32-bit version. - unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; - MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; - SDValue Result = SDValue( - CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0); + // Create a 64-bit instruction if the result is 64-bits otherwise use the + // 32-bit version. + unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r; + MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32; + Result = SDValue( + CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), + 0); + } // For less than 32-bits we need to extract from the 32-bit node. 
if (VT == MVT::i8 || VT == MVT::i16) { Index: llvm/lib/Target/X86/X86Subtarget.h =================================================================== --- llvm/lib/Target/X86/X86Subtarget.h +++ llvm/lib/Target/X86/X86Subtarget.h @@ -246,6 +246,9 @@ /// True if LZCNT/TZCNT instructions have a false dependency on the destination register. bool HasLZCNTFalseDeps = false; + /// True if an SBB whose two operands are the same register has a false dependency on that register. + bool HasSBBFalseDeps = false; + /// True if its preferable to combine to a single cross-lane shuffle /// using a variable mask over multiple fixed shuffles. bool HasFastVariableCrossLaneShuffle = false; @@ -719,6 +722,7 @@ bool useLeaForSP() const { return UseLeaForSP; } bool hasPOPCNTFalseDeps() const { return HasPOPCNTFalseDeps; } bool hasLZCNTFalseDeps() const { return HasLZCNTFalseDeps; } + bool hasSBBFalseDeps() const { return HasSBBFalseDeps; } bool hasFastVariableCrossLaneShuffle() const { return HasFastVariableCrossLaneShuffle; } Index: llvm/test/CodeGen/X86/pr32588.ll =================================================================== --- llvm/test/CodeGen/X86/pr32588.ll +++ llvm/test/CodeGen/X86/pr32588.ll @@ -8,6 +8,7 @@ define void @fn1() { ; CHECK-LABEL: fn1: ; CHECK: # %bb.0: +; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: cmpl $1, c(%rip) ; CHECK-NEXT: sbbl %eax, %eax ; CHECK-NEXT: andl $1, %eax Index: llvm/test/CodeGen/X86/sbb-false-dep.ll =================================================================== --- llvm/test/CodeGen/X86/sbb-false-dep.ll +++ llvm/test/CodeGen/X86/sbb-false-dep.ll @@ -24,13 +24,15 @@ ; CHECK-NEXT: callq foo1@PLT ; CHECK-NEXT: movq 8(%rbx), %rax ; CHECK-NEXT: movq (%rax), %rdx +; CHECK-NEXT: xorl %ebp, %ebp ; CHECK-NEXT: movl %r13d, %ecx ; CHECK-NEXT: negl %ecx -; CHECK-NEXT: sbbq %rbp, %rbp -; CHECK-NEXT: orq %rdx, %rbp -; CHECK-NEXT: cmpl $1, %r13d +; CHECK-NEXT: movl $0, %eax ; CHECK-NEXT: sbbq %rax, %rax ; CHECK-NEXT: orq %rdx, %rax +; CHECK-NEXT: cmpl $1, %r13d +; CHECK-NEXT: 
sbbq %rbp, %rbp +; CHECK-NEXT: orq %rdx, %rbp ; CHECK-NEXT: subq $8, %rsp ; CHECK-NEXT: movq %r12, %rdi ; CHECK-NEXT: movl %r15d, %esi @@ -38,8 +40,8 @@ ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: xorl %r8d, %r8d ; CHECK-NEXT: xorl %r9d, %r9d -; CHECK-NEXT: pushq %rax ; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: pushq %rax ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: callq foo2@PLT ; CHECK-NEXT: addq $40, %rsp Index: llvm/test/CodeGen/X86/select.ll =================================================================== --- llvm/test/CodeGen/X86/select.ll +++ llvm/test/CodeGen/X86/select.ll @@ -638,11 +638,10 @@ ; ; ATOM-LABEL: test9: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: cmpq $1, %rdi ; ATOM-NEXT: sbbq %rax, %rax ; ATOM-NEXT: orq %rsi, %rax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test9: @@ -686,11 +685,10 @@ ; ; ATOM-LABEL: test9a: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: cmpq $1, %rdi ; ATOM-NEXT: sbbq %rax, %rax ; ATOM-NEXT: orq %rsi, %rax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test9a: @@ -732,11 +730,10 @@ ; ; ATOM-LABEL: test9b: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: cmpq $1, %rdi ; ATOM-NEXT: sbbq %rax, %rax ; ATOM-NEXT: orq %rsi, %rax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test9b: @@ -779,11 +776,10 @@ ; ; ATOM-LABEL: test10: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: cmpq $1, %rdi ; ATOM-NEXT: sbbq %rax, %rax ; ATOM-NEXT: orq $1, %rax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test10: @@ -823,11 +819,10 @@ ; ; ATOM-LABEL: test11: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: negq %rdi ; ATOM-NEXT: sbbq %rax, %rax ; ATOM-NEXT: orq %rsi, %rax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test11: @@ -870,11 +865,10 @@ ; ; ATOM-LABEL: test11a: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: negq %rdi ; ATOM-NEXT: sbbq 
%rax, %rax ; ATOM-NEXT: orq %rsi, %rax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test11a: @@ -916,11 +910,10 @@ ; ; ATOM-LABEL: eqzero_const_or_all_ones: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: negl %edi ; ATOM-NEXT: sbbl %eax, %eax ; ATOM-NEXT: orl $42, %eax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: eqzero_const_or_all_ones: @@ -952,11 +945,10 @@ ; ; ATOM-LABEL: nezero_const_or_all_ones: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: cmpl $1, %edi ; ATOM-NEXT: sbbl %eax, %eax ; ATOM-NEXT: orl $42, %eax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: nezero_const_or_all_ones: @@ -987,11 +979,10 @@ ; ; ATOM-LABEL: eqzero_all_ones_or_const: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: cmpq $1, %rdi ; ATOM-NEXT: sbbq %rax, %rax ; ATOM-NEXT: orq $42, %rax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: eqzero_all_ones_or_const: @@ -1032,12 +1023,11 @@ ; ; ATOM-LABEL: nezero_all_ones_or_const: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: negb %dil ; ATOM-NEXT: sbbl %eax, %eax ; ATOM-NEXT: orb $42, %al ; ATOM-NEXT: ## kill: def $al killed $al killed $eax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: nezero_all_ones_or_const: @@ -1071,11 +1061,10 @@ ; ; ATOM-LABEL: PR53006: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: negl %edi ; ATOM-NEXT: sbbl %eax, %eax ; ATOM-NEXT: orl $1, %eax -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: PR53006: @@ -1106,12 +1095,11 @@ ; ; ATOM-LABEL: test13: ; ATOM: ## %bb.0: +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: cmpl %esi, %edi ; ATOM-NEXT: sbbl %eax, %eax ; ATOM-NEXT: nop ; ATOM-NEXT: nop -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test13: @@ -1178,12 +1166,11 @@ ; ; ATOM-LABEL: test15: ; ATOM: ## %bb.0: ## %entry +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: negl %edi ; 
ATOM-NEXT: sbbl %eax, %eax ; ATOM-NEXT: nop ; ATOM-NEXT: nop -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test15: @@ -1213,12 +1200,11 @@ ; ; ATOM-LABEL: test16: ; ATOM: ## %bb.0: ## %entry +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: negq %rdi ; ATOM-NEXT: sbbq %rax, %rax ; ATOM-NEXT: nop ; ATOM-NEXT: nop -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test16: @@ -1256,13 +1242,12 @@ ; ; ATOM-LABEL: test17: ; ATOM: ## %bb.0: ## %entry +; ATOM-NEXT: xorl %eax, %eax ; ATOM-NEXT: negw %di ; ATOM-NEXT: sbbl %eax, %eax ; ATOM-NEXT: ## kill: def $ax killed $ax killed $eax ; ATOM-NEXT: nop ; ATOM-NEXT: nop -; ATOM-NEXT: nop -; ATOM-NEXT: nop ; ATOM-NEXT: retq ; ; ATHLON-LABEL: test17: