diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -792,33 +792,38 @@ FeatureFuseAddress, FeatureFuseAdrpAdd, FeatureFuseLiterals, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", "Cortex-A72 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureFuseLiterals, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA73 : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73", "Cortex-A73 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75", "Cortex-A75 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", "Cortex-A76 ARM processors", [ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77", "Cortex-A77 ARM processors", [ @@ -826,7 +831,8 @@ FeatureFuseAES, FeatureFuseAdrpAdd, FeatureLSLFast, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA78 : SubtargetFeature<"a78", "ARMProcFamily", "CortexA78", "Cortex-A78 ARM processors", [ @@ -835,7 +841,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA78C : SubtargetFeature<"a78c", "ARMProcFamily", "CortexA78C", @@ -845,7 +852,8 
@@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA710 : SubtargetFeature<"a710", "ARMProcFamily", "CortexA710", "Cortex-A710 ARM processors", [ @@ -854,7 +862,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA715 : SubtargetFeature<"a715", "ARMProcFamily", "CortexA715", "Cortex-A715 ARM processors", [ @@ -863,7 +872,8 @@ FeatureCmpBccFusion, FeatureLSLFast, FeatureFuseAdrpAdd, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneR82 : SubtargetFeature<"cortex-r82", "ARMProcFamily", "CortexR82", @@ -877,7 +887,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneX2 : SubtargetFeature<"cortex-x2", "ARMProcFamily", "CortexX2", "Cortex-X2 ARM processors", [ @@ -886,7 +897,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneX3 : SubtargetFeature<"cortex-x3", "ARMProcFamily", "CortexX3", "Cortex-X3 ARM processors", [ @@ -894,7 +906,8 @@ FeatureFuseAdrpAdd, FeatureFuseAES, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX", "Fujitsu A64FX processors", [ @@ -1079,7 +1092,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2", "Neoverse N2 ARM processors", [ @@ -1087,7 +1101,8 @@ 
FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneNeoverse512TVB : SubtargetFeature<"neoverse512tvb", "ARMProcFamily", "Neoverse512TVB", "Neoverse 512-TVB ARM processors", [ @@ -1095,7 +1110,8 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1", "Neoverse V1 ARM processors", [ @@ -1103,14 +1119,16 @@ FeatureFuseAdrpAdd, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2", "Neoverse V2 ARM processors", [ FeatureFuseAES, FeatureLSLFast, FeaturePostRAScheduler, - FeatureEnableSelectOptimize]>; + FeatureEnableSelectOptimize, + FeaturePredictableSelectIsExpensive]>; def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira", "Qualcomm Saphira processors", [ diff --git a/llvm/test/CodeGen/AArch64/convert-highly-predictable-select-to-branch.ll b/llvm/test/CodeGen/AArch64/convert-highly-predictable-select-to-branch.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/convert-highly-predictable-select-to-branch.ll @@ -0,0 +1,60 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=generic -S < %s | FileCheck %s --check-prefix=CHECK-GENERIC +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-n1 -S < %s | FileCheck %s +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S < %s | FileCheck %s +; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 -S < %s | FileCheck %s + +; Test has an unpredictable select, which 
should not be transformed to a branch +define i32 @test1(i32 %a) { +; CHECK-GENERIC-LABEL: @test1( +; CHECK-GENERIC-NEXT: entry: +; CHECK-GENERIC-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1 +; CHECK-GENERIC-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1 +; CHECK-GENERIC-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 0, i32 [[DEC]], !prof [[PROF0:![0-9]+]] +; CHECK-GENERIC-NEXT: ret i32 [[RES]] +; +; CHECK-LABEL: @test1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1 +; CHECK-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1 +; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 0, i32 [[DEC]], !prof [[PROF0:![0-9]+]] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %cmp = icmp slt i32 %a, 1 + %dec = sub i32 %a, 1 + %res = select i1 %cmp, i32 0, i32 %dec, !prof !0 + ret i32 %res +} + +; Test has highly predictable select according to profile data, +; which should be transformed to a branch on cores with FeaturePredictableSelectIsExpensive enabled +define i32 @test2(i32 %a) { +; CHECK-GENERIC-LABEL: @test2( +; CHECK-GENERIC-NEXT: entry: +; CHECK-GENERIC-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1 +; CHECK-GENERIC-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1 +; CHECK-GENERIC-NEXT: [[RES:%.*]] = select i1 [[CMP]], i32 0, i32 [[DEC]], !prof [[PROF1:![0-9]+]] +; CHECK-GENERIC-NEXT: ret i32 [[RES]] +; +; CHECK-LABEL: @test2( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[A:%.*]], 1 +; CHECK-NEXT: [[RES_FROZEN:%.*]] = freeze i1 [[CMP]] +; CHECK-NEXT: br i1 [[RES_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE_SINK:%.*]], !prof [[PROF1:![0-9]+]] +; CHECK: select.false.sink: +; CHECK-NEXT: [[DEC:%.*]] = sub i32 [[A]], 1 +; CHECK-NEXT: br label [[SELECT_END]] +; CHECK: select.end: +; CHECK-NEXT: [[RES:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[DEC]], [[SELECT_FALSE_SINK]] ] +; CHECK-NEXT: ret i32 [[RES]] +; +entry: + %cmp = icmp slt i32 %a, 1 + %dec = sub i32 %a, 1 + %res = select i1 %cmp, i32 0, i32 %dec, !prof !1 + ret i32 %res +} + +!0 = 
!{!"branch_weights", i32 1, i32 1} +!1 = !{!"branch_weights", i32 1, i32 1000}