Index: llvm/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64.td
+++ llvm/lib/Target/AArch64/AArch64.td
@@ -280,6 +280,10 @@
     "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
     "Disable latency scheduling heuristic">;

+def FeatureStorePairSuppress : SubtargetFeature<
+    "store-pair-suppress", "EnableStorePairSuppress", "true",
+    "Enable Store Pair Suppression heuristics">;
+
 def FeatureForce32BitJumpTables
     : SubtargetFeature<"force-32bit-jump-tables", "Force32BitJumpTables", "true",
                        "Force jump table entries to be 32-bits wide except at MinSize">;
@@ -787,6 +791,7 @@
                                      FeatureArithmeticCbzFusion,
                                      FeatureDisableLatencySchedHeuristic,
                                      FeatureFuseAES, FeatureFuseCryptoEOR,
+                                     FeatureStorePairSuppress,
                                      FeatureZCRegMove,
                                      FeatureZCZeroing,
                                      FeatureZCZeroingFPWorkaround]
@@ -800,6 +805,7 @@
                                      FeatureDisableLatencySchedHeuristic,
                                      FeatureFuseAES,
                                      FeatureFuseCryptoEOR,
+                                     FeatureStorePairSuppress,
                                      FeatureZCRegMove,
                                      FeatureZCZeroing]
                                      >;
@@ -812,6 +818,7 @@
                                      FeatureDisableLatencySchedHeuristic,
                                      FeatureFuseAES,
                                      FeatureFuseCryptoEOR,
+                                     FeatureStorePairSuppress,
                                      FeatureZCRegMove,
                                      FeatureZCZeroing]
                                      >;
@@ -824,6 +831,7 @@
                                      FeatureDisableLatencySchedHeuristic,
                                      FeatureFuseAES,
                                      FeatureFuseCryptoEOR,
+                                     FeatureStorePairSuppress,
                                      FeatureZCRegMove,
                                      FeatureZCZeroing]
                                      >;
@@ -836,6 +844,7 @@
                                      FeatureDisableLatencySchedHeuristic,
                                      FeatureFuseAES,
                                      FeatureFuseCryptoEOR,
+                                     FeatureStorePairSuppress,
                                      FeatureZCRegMove,
                                      FeatureZCZeroing]
                                      >;
@@ -854,6 +863,7 @@
                                      FeatureFuseCryptoEOR,
                                      FeatureFuseAdrpAdd,
                                      FeatureFuseLiterals,
+                                     FeatureStorePairSuppress,
                                      FeatureZCRegMove,
                                      FeatureZCZeroing]>;

@@ -869,6 +879,7 @@
                                      FeatureFuseCCSelect,
                                      FeatureFuseCryptoEOR,
                                      FeatureFuseLiterals,
+                                     FeatureStorePairSuppress,
                                      FeatureZCRegMove,
                                      FeatureZCZeroing
                                      ]>;
@@ -885,6 +896,7 @@
                                      FeatureFuseCCSelect,
                                      FeatureFuseCryptoEOR,
                                      FeatureFuseLiterals,
+                                     FeatureStorePairSuppress,
                                      FeatureZCRegMove,
                                      FeatureZCZeroing
                                      ]>;
Index: llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -11,6 +11,7 @@
 // ===---------------------------------------------------------------------===//

 #include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -122,7 +123,10 @@
   if (skipFunction(MF.getFunction()) || MF.getFunction().hasOptSize())
     return false;

-  const TargetSubtargetInfo &ST = MF.getSubtarget();
+  const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
+  if (!ST.enableStorePairSuppress())
+    return false;
+
   TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
   TRI = ST.getRegisterInfo();
   MRI = &MF.getRegInfo();
Index: llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ llvm/test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@@ -1,13 +1,13 @@
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-BALFP --check-prefix CHECK-EVEN
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-BALFP --check-prefix CHECK-ODD
-; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-EVEN
-; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD

 ; The following tests use the balance-fp-ops feature, and should be independent of
 ; the target cpu.

-; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN --check-prefix CHECK-BALFP
-; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD --check-prefix CHECK-BALFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN
+; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD

 ; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
 ; our test strategy is to:
@@ -81,9 +81,7 @@
 ; CHECK: fmsub [[x]]
 ; CHECK: fmadd [[y]]
 ; CHECK: fmadd [[x]]
-; CHECK-BALFP: stp [[x]], [[y]]
-; CHECK-A53-DAG: str [[x]]
-; CHECK-A53-DAG: str [[y]]
+; CHECK: stp [[x]], [[y]]

 define void @f2(double* nocapture readonly %p, double* nocapture %q) #0 {
 entry:
@@ -176,9 +174,7 @@
 ; CHECK: fmsub [[x]]
 ; CHECK: fmadd [[y]]
 ; CHECK: fmadd [[x]]
-; CHECK-BALFP: stp [[x]], [[y]]
-; CHECK-A53-DAG: str [[x]]
-; CHECK-A53-DAG: str [[y]]
+; CHECK: stp [[x]], [[y]]

 define void @f4(float* nocapture readonly %p, float* nocapture %q) #0 {
 entry:
Index: llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
===================================================================
--- llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
+++ llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
@@ -385,12 +385,9 @@
 ; CHECK-NEXT:    bl return_in_block
 ; CHECK-NEXT:    adrp x8, in_block_store
 ; CHECK-NEXT:    add x8, x8, :lo12:in_block_store
-; CHECK-NEXT:    str d0, [x8]
-; CHECK-NEXT:    str d1, [x8, #8]
-; CHECK-NEXT:    str d2, [x8, #16]
-; CHECK-NEXT:    str d3, [x8, #24]
-; CHECK-NEXT:    str d4, [x8, #32]
-; CHECK-NEXT:    str d5, [x8, #40]
+; CHECK-NEXT:    stp d0, d1, [x8]
+; CHECK-NEXT:    stp d2, d3, [x8, #16]
+; CHECK-NEXT:    stp d4, d5, [x8, #32]
 ; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %1 = call %T_IN_BLOCK @return_in_block()
@@ -403,12 +400,9 @@
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    adrp x8, in_block_store
 ; CHECK-NEXT:    add x8, x8, :lo12:in_block_store
-; CHECK-NEXT:    str d5, [x8, #40]
-; CHECK-NEXT:    str d4, [x8, #32]
-; CHECK-NEXT:    str d3, [x8, #24]
-; CHECK-NEXT:    str d2, [x8, #16]
-; CHECK-NEXT:    str d1, [x8, #8]
-; CHECK-NEXT:    str d0, [x8]
+; CHECK-NEXT:    stp d4, d5, [x8, #32]
+; CHECK-NEXT:    stp d2, d3, [x8, #16]
+; CHECK-NEXT:    stp d0, d1, [x8]
 ; CHECK-NEXT:    ret
   store %T_IN_BLOCK %a, %T_IN_BLOCK* @in_block_store
   ret void
Index: llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
+++ llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
@@ -148,8 +148,7 @@
   store %struct.Pod %x, %struct.Pod* @Pod
   ret void
   ; CHECK: bl copy_pod
-  ; CHECK-NEXT: str d0, [{{.*}}]
-  ; CHECK-NEXT: str d1, [{{.*}}]
+  ; CHECK-NEXT: stp d0, d1, [{{.*}}]
 }

 @NotCXX14Aggregate = external global %struct.NotCXX14Aggregate
Index: llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir
===================================================================
--- llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir
+++ llvm/test/CodeGen/AArch64/ldrpre-ldr-merge.mir
@@ -69,8 +69,7 @@
     ; CHECK: liveins: $s0, $s1, $x1
     ; CHECK-NEXT: {{ $}}
    ; CHECK-NEXT: early-clobber $x1, renamable $s0, renamable $s1 = LDPSpre renamable $x1, 3 :: (load (s32))
-    ; CHECK-NEXT: STRSui renamable $s0, renamable $x1, 0 :: (store (s32))
-    ; CHECK-NEXT: STRSui renamable $s1, renamable $x1, 1 :: ("aarch64-suppress-pair" store (s32))
+    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
     renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
@@ -96,8 +95,7 @@
     ; CHECK: liveins: $d0, $d1, $x1
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: early-clobber $x1, renamable $d0, renamable $d1 = LDPDpre renamable $x1, 16 :: (load (s64))
-    ; CHECK-NEXT: STRDui renamable $d0, renamable $x1, 0 :: (store (s64))
-    ; CHECK-NEXT: STRDui renamable $d1, renamable $x1, 1 :: ("aarch64-suppress-pair" store (s64))
+    ; CHECK-NEXT: STPDi renamable $d0, renamable $d1, renamable $x1, 0 :: (store (s64))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $d0 = LDRDpre killed renamable $x1, 128 :: (load (s64))
     renamable $d1 = LDRDui renamable $x1, 1 :: (load (s64))
@@ -286,8 +284,7 @@
     ; CHECK-NEXT: renamable $x0 = LDRXui renamable $x1, 1 :: (load (s64))
     ; CHECK-NEXT: STRXui renamable $x0, renamable $x0, 1 :: (store (s64))
     ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: STRSui renamable $s0, renamable $x1, 0 :: (store (s32))
-    ; CHECK-NEXT: STRSui renamable $s1, renamable $x1, 1 :: ("aarch64-suppress-pair" store (s32))
+    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
     renamable $x0 = LDRXui renamable $x1, 1 :: (load (s64))
@@ -359,8 +356,7 @@
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre renamable $x1, 12, implicit $w1 :: (load (s32))
     ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 2 :: (load (s32))
-    ; CHECK-NEXT: STRSui renamable $s0, renamable $x1, 0 :: (store (s32))
-    ; CHECK-NEXT: STRSui renamable $s1, renamable $x1, 1 :: ("aarch64-suppress-pair" store (s32))
+    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 12 :: (load (s32))
     renamable $s1 = LDRSui renamable $x1, 2 :: (load (s32))
@@ -569,8 +565,7 @@
     ; CHECK: liveins: $s0, $s1, $x1
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: early-clobber $x1, renamable $s0, renamable $s1 = LDPSpre renamable $x1, 63 :: (load (s32))
-    ; CHECK-NEXT: STRSui renamable $s0, renamable $x1, 0 :: (store (s32))
-    ; CHECK-NEXT: STRSui renamable $s1, renamable $x1, 1 :: ("aarch64-suppress-pair" store (s32))
+    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 252 :: (load (s32))
     renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
@@ -597,8 +592,7 @@
     ; CHECK-NEXT: {{ $}}
     ; CHECK-NEXT: early-clobber renamable $x1, renamable $s0 = LDRSpre renamable $x1, 251, implicit $w1 :: (load (s32))
     ; CHECK-NEXT: renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
-    ; CHECK-NEXT: STRSui renamable $s0, renamable $x1, 0 :: (store (s32))
-    ; CHECK-NEXT: STRSui renamable $s1, renamable $x1, 1 :: ("aarch64-suppress-pair" store (s32))
+    ; CHECK-NEXT: STPSi renamable $s0, renamable $s1, renamable $x1, 0 :: (store (s32))
     ; CHECK-NEXT: RET undef $lr
     early-clobber renamable $x1, renamable $s0 = LDRSpre killed renamable $x1, 251 :: (load (s32))
     renamable $s1 = LDRSui renamable $x1, 1 :: (load (s32))
Index: llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll
===================================================================
--- llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll
+++ llvm/test/CodeGen/AArch64/storepairsuppress_minsize.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64-none-eabi -mcpu=cortex-a55 -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-none-eabi -mcpu=cortex-a55 -mattr=+store-pair-suppress -o - %s | FileCheck %s

 ; Check that stp are not suppressed at minsize.
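
Note (not part of the patch): since the pass is now gated on a plain subtarget
attribute, it can be toggled from the llc command line on any CPU, independent
of the tune lists edited above. A minimal sketch, where `t.ll` is a
hypothetical input containing adjacent scalar FP stores:

  # Opt in on a CPU whose tune list lacks the feature (mirrors the RUN line
  # added to storepairsuppress_minsize.ll); where the heuristic fires, adjacent
  # strs may now stay separate instead of merging into an stp:
  llc -mtriple=aarch64 -mcpu=cortex-a55 -mattr=+store-pair-suppress t.ll -o -

  # Opt out on a CPU whose tune list includes FeatureStorePairSuppress:
  llc -mtriple=aarch64 -mcpu=<cpu-with-feature> -mattr=-store-pair-suppress t.ll -o -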