diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -633,6 +633,10 @@
     : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
                        "Use Goldmont specific floating point div/sqrt costs">;
 
+def TuningAvoidMFENCE
+    : SubtargetFeature<"avoid-mfence", "AvoidMFence", "true",
+                       "Avoid MFENCE for fence seq_cst, and instead use lock or">;
+
 //===----------------------------------------------------------------------===//
 // X86 CPU Families
 // TODO: Remove these - use general tuning features to determine codegen.
@@ -704,7 +708,8 @@
   // Nehalem
   list<SubtargetFeature> NHMFeatures = X86_64V2Features;
   list<SubtargetFeature> NHMTuning = [TuningMacroFusion,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAvoidMFENCE];
 
   // Westmere
   list<SubtargetFeature> WSMAdditionalFeatures = [FeaturePCLMUL];
@@ -724,7 +729,8 @@
                                       TuningFastSHLDRotate,
                                       TuningFast15ByteNOP,
                                       TuningPOPCNTFalseDeps,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAvoidMFENCE];
 
   list<SubtargetFeature> SNBFeatures =
     !listconcat(WSMFeatures, SNBAdditionalFeatures);
@@ -755,7 +761,8 @@
                                       TuningFastVariablePerLaneShuffle,
                                       TuningPOPCNTFalseDeps,
                                       TuningLZCNTFalseDeps,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAvoidMFENCE];
 
   list<SubtargetFeature> HSWFeatures =
     !listconcat(IVBFeatures, HSWAdditionalFeatures);
@@ -783,7 +790,8 @@
                                       TuningFastVariableCrossLaneShuffle,
                                       TuningFastVariablePerLaneShuffle,
                                       TuningPOPCNTFalseDeps,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAvoidMFENCE];
 
   list<SubtargetFeature> SKLFeatures =
     !listconcat(BDWFeatures, SKLAdditionalFeatures);
@@ -811,7 +819,8 @@
                                       TuningFastVariablePerLaneShuffle,
                                       TuningPrefer256Bit,
                                       TuningPOPCNTFalseDeps,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAvoidMFENCE];
 
   list<SubtargetFeature> SKXFeatures =
     !listconcat(BDWFeatures, SKXAdditionalFeatures);
@@ -848,7 +857,8 @@
                                       TuningFastVariableCrossLaneShuffle,
                                       TuningFastVariablePerLaneShuffle,
                                       TuningPrefer256Bit,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAvoidMFENCE];
 
   list<SubtargetFeature> CNLFeatures =
     !listconcat(SKLFeatures, CNLAdditionalFeatures);
@@ -873,7 +883,8 @@
                                       TuningFastVariableCrossLaneShuffle,
                                       TuningFastVariablePerLaneShuffle,
                                       TuningPrefer256Bit,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAvoidMFENCE];
 
   list<SubtargetFeature> ICLFeatures =
     !listconcat(CNLFeatures, ICLAdditionalFeatures);
@@ -999,7 +1010,8 @@
   // Tremont
   list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
                                                   FeatureGFNI];
-  list<SubtargetFeature> TRMTuning = GLPTuning;
+  list<SubtargetFeature> TRMAdditionalTuning = [TuningAvoidMFENCE];
+  list<SubtargetFeature> TRMTuning = !listconcat(GLPTuning, TRMAdditionalTuning);
   list<SubtargetFeature> TRMFeatures =
     !listconcat(GLPFeatures, TRMAdditionalFeatures);
 
@@ -1160,7 +1172,8 @@
                                       TuningFastScalarShiftMasks,
                                       TuningBranchFusion,
                                       TuningSBBDepBreaking,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAvoidMFENCE];
 
   // PileDriver
   list<SubtargetFeature> BdVer2AdditionalFeatures = [FeatureF16C,
@@ -1237,7 +1250,8 @@
                                       TuningFastMOVBE,
                                       TuningSlowSHLD,
                                       TuningSBBDepBreaking,
-                                      TuningInsertVZEROUPPER];
+                                      TuningInsertVZEROUPPER,
+                                      TuningAvoidMFENCE];
   list<SubtargetFeature> ZN2AdditionalFeatures = [FeatureCLWB,
                                                   FeatureRDPID,
                                                   FeatureRDPRU,
@@ -1383,7 +1397,8 @@
 [
   TuningMacroFusion,
   TuningSlowUAMem16,
-  TuningInsertVZEROUPPER
+  TuningInsertVZEROUPPER,
+  TuningAvoidMFENCE
 ]>;
 def : ProcModel<"penryn", SandyBridgeModel, [
   FeatureX87,
@@ -1400,7 +1415,8 @@
 [
   TuningMacroFusion,
   TuningSlowUAMem16,
-  TuningInsertVZEROUPPER
+  TuningInsertVZEROUPPER,
+  TuningAvoidMFENCE
 ]>;
 
 // Atom CPUs.
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31087,7 +31087,7 @@
   // cross-thread fence.
   if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
       FenceSSID == SyncScope::System) {
-    if (Subtarget.hasMFence())
+    if (!Subtarget.avoidMFence() && Subtarget.hasMFence())
       return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
 
     SDValue Chain = Op.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -87,6 +87,7 @@
       X86::TuningInsertVZEROUPPER,
       X86::TuningUseSLMArithCosts,
       X86::TuningUseGLMDivSqrtCosts,
+      X86::TuningAvoidMFENCE,
 
       // Perf-tuning flags.
       X86::TuningFastGather,
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -2330,7 +2330,7 @@
 ; CHECK-LABEL: nofold_fence:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq (%rdi), %rax
-; CHECK-NEXT:    mfence
+; CHECK-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    addq $15, %rax
 ; CHECK-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8
@@ -2418,14 +2418,14 @@
 ; CHECK-O0-LABEL: fold_constant_fence:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movq Constant(%rip), %rax
-; CHECK-O0-NEXT:    mfence
+; CHECK-O0-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O0-NEXT:    addq %rdi, %rax
 ; CHECK-O0-NEXT:    retq
 ;
 ; CHECK-O3-CUR-LABEL: fold_constant_fence:
 ; CHECK-O3-CUR:       # %bb.0:
 ; CHECK-O3-CUR-NEXT:    movq Constant(%rip), %rax
-; CHECK-O3-CUR-NEXT:    mfence
+; CHECK-O3-CUR-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O3-CUR-NEXT:    addq %rdi, %rax
 ; CHECK-O3-CUR-NEXT:    retq
 ;
@@ -2433,7 +2433,7 @@
 ; CHECK-O3-EX:       # %bb.0:
 ; CHECK-O3-EX-NEXT:    movq %rdi, %rax
 ; CHECK-O3-EX-NEXT:    addq Constant(%rip), %rax
-; CHECK-O3-EX-NEXT:    mfence
+; CHECK-O3-EX-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O3-EX-NEXT:    retq
   %v = load atomic i64, i64* @Constant unordered, align 8
   fence seq_cst
@@ -2473,14 +2473,14 @@
 ; CHECK-O0-LABEL: fold_invariant_fence:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movq (%rdi), %rax
-; CHECK-O0-NEXT:    mfence
+; CHECK-O0-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O0-NEXT:    addq %rsi, %rax
 ; CHECK-O0-NEXT:    retq
 ;
 ; CHECK-O3-CUR-LABEL: fold_invariant_fence:
 ; CHECK-O3-CUR:       # %bb.0:
 ; CHECK-O3-CUR-NEXT:    movq (%rdi), %rax
-; CHECK-O3-CUR-NEXT:    mfence
+; CHECK-O3-CUR-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O3-CUR-NEXT:    addq %rsi, %rax
 ; CHECK-O3-CUR-NEXT:    retq
 ;
@@ -2488,7 +2488,7 @@
 ; CHECK-O3-EX:       # %bb.0:
 ; CHECK-O3-EX-NEXT:    movq %rsi, %rax
 ; CHECK-O3-EX-NEXT:    addq (%rdi), %rax
-; CHECK-O3-EX-NEXT:    mfence
+; CHECK-O3-EX-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O3-EX-NEXT:    retq
   %v = load atomic i64, i64* %p unordered, align 8, !invariant.load !{}
   fence seq_cst
@@ -2661,7 +2661,7 @@
 ; CHECK-O0-LABEL: fold_cmp_over_fence:
 ; CHECK-O0:       # %bb.0:
 ; CHECK-O0-NEXT:    movl (%rdi), %eax
-; CHECK-O0-NEXT:    mfence
+; CHECK-O0-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
 ; CHECK-O0-NEXT:    cmpl %eax, %esi
 ; CHECK-O0-NEXT:    jne .LBB116_2
 ; CHECK-O0-NEXT:  # %bb.1: # %taken
@@ -2672,30 +2672,18 @@
 ; CHECK-O0-NEXT:  # kill: def $al killed $al killed $eax
 ; CHECK-O0-NEXT:    retq
 ;
-; CHECK-O3-CUR-LABEL: fold_cmp_over_fence:
-; CHECK-O3-CUR:       # %bb.0:
-; CHECK-O3-CUR-NEXT:    movl (%rdi), %eax
-; CHECK-O3-CUR-NEXT:    mfence
-; CHECK-O3-CUR-NEXT:    cmpl %eax, %esi
-; CHECK-O3-CUR-NEXT:    jne .LBB116_2
-; CHECK-O3-CUR-NEXT:  # %bb.1: # %taken
-; CHECK-O3-CUR-NEXT:    movb $1, %al
-; CHECK-O3-CUR-NEXT:    retq
-; CHECK-O3-CUR-NEXT:  .LBB116_2: # %untaken
-; CHECK-O3-CUR-NEXT:    xorl %eax, %eax
-; CHECK-O3-CUR-NEXT:    retq
-;
-; CHECK-O3-EX-LABEL: fold_cmp_over_fence:
-; CHECK-O3-EX:       # %bb.0:
-; CHECK-O3-EX-NEXT:    cmpl (%rdi), %esi
-; CHECK-O3-EX-NEXT:    mfence
-; CHECK-O3-EX-NEXT:    jne .LBB116_2
-; CHECK-O3-EX-NEXT:  # %bb.1: # %taken
-; CHECK-O3-EX-NEXT:    movb $1, %al
-; CHECK-O3-EX-NEXT:    retq
-; CHECK-O3-EX-NEXT:  .LBB116_2: # %untaken
-; CHECK-O3-EX-NEXT:    xorl %eax, %eax
-; CHECK-O3-EX-NEXT:    retq
+; CHECK-O3-LABEL: fold_cmp_over_fence:
+; CHECK-O3:       # %bb.0:
+; CHECK-O3-NEXT:    movl (%rdi), %eax
+; CHECK-O3-NEXT:    lock orl $0, -{{[0-9]+}}(%rsp)
+; CHECK-O3-NEXT:    cmpl %eax, %esi
+; CHECK-O3-NEXT:    jne .LBB116_2
+; CHECK-O3-NEXT:  # %bb.1: # %taken
+; CHECK-O3-NEXT:    movb $1, %al
+; CHECK-O3-NEXT:    retq
+; CHECK-O3-NEXT:  .LBB116_2: # %untaken
+; CHECK-O3-NEXT:    xorl %eax, %eax
+; CHECK-O3-NEXT:    retq
   %v2 = load atomic i32, i32* %p unordered, align 4
   fence seq_cst
   %cmp = icmp eq i32 %v1, %v2