Index: llvm/lib/Target/AArch64/AArch64.td =================================================================== --- llvm/lib/Target/AArch64/AArch64.td +++ llvm/lib/Target/AArch64/AArch64.td @@ -788,33 +788,38 @@ FeatureFuseAddress, FeatureFuseAdrpAdd, FeatureFuseLiterals, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", "Cortex-A72 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureFuseLiterals, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", "Cortex-A73 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", "Cortex-A75 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", "Cortex-A76 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", "Cortex-A77 ARM processors", [ @@ -822,7 +827,8 @@ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78", "Cortex-A78 ARM processors", [ @@ -831,7 +837,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily", 
"CortexA78C", @@ -841,7 +848,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710", "Cortex-A710 ARM processors", [ @@ -850,7 +858,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715", "Cortex-A715 ARM processors", [ @@ -859,7 +868,8 @@ FeatureCmpBccFusion, FeatureLSLFast, FeatureFuseAdrpAdd, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily", "CortexR82", @@ -873,7 +883,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2", "Cortex-X2 ARM processors", [ @@ -882,7 +893,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", "Cortex-X3 ARM processors", [ @@ -890,7 +902,8 @@ FeatureFuseAdrpAdd, FeatureFuseAES, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", "Fujitsu A64FX processors", [ @@ -1075,7 +1088,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2", "Neoverse N2 ARM 
processors", [ @@ -1083,7 +1097,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB", "Neoverse 512-TVB ARM processors", [ @@ -1091,7 +1106,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1", "Neoverse V1 ARM processors", [ @@ -1099,14 +1115,16 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2", "Neoverse V2 ARM processors", [ FeatureFuseAES, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", "Qualcomm Saphira processors", [ Index: llvm/test/CodeGen/AArch64/convert-highly-predictable-select-to-branch.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/convert-highly-predictable-select-to-branch.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=generic -S < %s | FileCheck %s --check-prefix=CHECK-GENERIC +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-n1 -S < %s | FileCheck %s +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S < %s | FileCheck %s +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 -S < %s | FileCheck %s + +; Test has an unpredictable select (equal branch weights), which 
should not be transformed to a branch +define i32 @test1(i32 %a) { +; CHECK-GENERIC-LABEL: @test1( +; CHECK-GENERIC-NEXT: entry: +; CHECK-GENERIC-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1 +; CHECK-GENERIC-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1 +; CHECK-GENERIC-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 0, i32 [[DEC]], !prof [[PROF0:![0-9]+]] +; CHECK-GENERIC-NEXT: ret i32 [[RES]] +; +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1 +; CHECK-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 0, i32 [[DEC]], !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %cmp = icmp slt i32 %a, 1 + %dec = sub i32 %a, 1 + %res = select i1 %cmp, i32 0, i32 %dec, !prof !0 + ret i32 %res +} + +; Test has a highly predictable select according to profile data (branch weights 1:1000), +; which should be transformed to a branch on cores that enable FeaturePredictableSelectIsExpensive +define i32 @test2(i32 %a) { +; CHECK-GENERIC-LABEL: @test2( +; CHECK-GENERIC-NEXT: entry: +; CHECK-GENERIC-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1 +; CHECK-GENERIC-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1 +; CHECK-GENERIC-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 0, i32 [[DEC]], !prof [[PROF1:![0-9]+]] +; CHECK-GENERIC-NEXT: ret i32 [[RES]] +; +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1 +; CHECK-NEXT: [[RES_FROZEN:%.*]] = freeze i1 [[CMP]] +; CHECK-NEXT: br i1 [[RES_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE_SINK:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: select.false.sink: +; CHECK-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1 +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[DEC]], [[SELECT_FALSE_SINK]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %cmp = icmp slt i32 %a, 1 + %dec = sub i32 %a, 1 + %res = select i1 %cmp, i32 0, i32 %dec, !prof !1 + ret i32 %res +} + +!0 = 
!{!"branch_weights", i32 1, i32 1} +!1 = !{!"branch_weights", i32 1, i32 1000}