Index: llvm/lib/Target/ARM/ARM.td =================================================================== --- llvm/lib/Target/ARM/ARM.td +++ llvm/lib/Target/ARM/ARM.td @@ -194,6 +194,10 @@ "SlowLoadDSubregister", "true", "Loading into D subregs is slow">; +def FeatureUseWideStrideVFP : SubtargetFeature<"wide-stride-vfp", + "UseWideStrideVFP", "true", + "Use a wide stride when allocating VFP registers">; + // Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD. def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs", "DontWidenVMOVS", "true", @@ -865,6 +869,7 @@ FeatureHasRetAddrStack, FeatureNEONForFP, FeatureVFP4, + FeatureUseWideStrideVFP, FeatureMP, FeatureHWDivThumb, FeatureHWDivARM, @@ -1018,24 +1023,28 @@ FeatureNoPostRASched]>; def : ProcNoItin<"exynos-m1", [ARMv8a, ProcExynosM1, + FeatureUseWideStrideVFP, FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; def : ProcNoItin<"exynos-m2", [ARMv8a, ProcExynosM1, + FeatureUseWideStrideVFP, FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; def : ProcNoItin<"exynos-m3", [ARMv8a, ProcExynosM1, + FeatureUseWideStrideVFP, FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, FeatureCRC]>; def : ProcNoItin<"exynos-m4", [ARMv8a, ProcExynosM1, + FeatureUseWideStrideVFP, FeatureHWDivThumb, FeatureHWDivARM, FeatureCrypto, Index: llvm/lib/Target/ARM/ARMSubtarget.h =================================================================== --- llvm/lib/Target/ARM/ARMSubtarget.h +++ llvm/lib/Target/ARM/ARMSubtarget.h @@ -353,6 +353,9 @@ /// If true, loading into a D subregister will be penalized. bool SlowLoadDSubregister = false; + /// If true, use a wider stride when allocating VFP registers. + bool UseWideStrideVFP = false; + /// If true, the AGU and NEON/FPU units are multiplexed. bool HasMuxedUnits = false; @@ -596,6 +599,7 @@ bool hasVMLxHazards() const { return HasVMLxHazards; } bool hasSlowOddRegister() const { return SlowOddRegister; } bool hasSlowLoadDSubregister() const { return SlowLoadDSubregister; } + bool useWideStrideVFP() const { return UseWideStrideVFP; } bool hasMuxedUnits() const { return HasMuxedUnits; } bool dontWidenVMOVS() const { return DontWidenVMOVS; } bool useSplatVFPToNeon() const { return SplatVFPToNeon; } Index: llvm/lib/Target/ARM/ARMSubtarget.cpp =================================================================== --- llvm/lib/Target/ARM/ARMSubtarget.cpp +++ llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -370,7 +370,8 @@ // For general targets, the prologue can grow when VFPs are allocated with // stride 4 (more vpush instructions). But WatchOS uses a compact unwind // format which it's more important to get right. - return isTargetWatchABI() || (isSwift() && !MF.getFunction().optForMinSize()); + return isTargetWatchABI() || + (useWideStrideVFP() && !MF.getFunction().optForMinSize()); } bool ARMSubtarget::useMovt(const MachineFunction &MF) const { Index: llvm/test/CodeGen/ARM/vfp-reg-stride.ll =================================================================== --- llvm/test/CodeGen/ARM/vfp-reg-stride.ll +++ llvm/test/CodeGen/ARM/vfp-reg-stride.ll @@ -1,42 +1,45 @@ -; RUN: llc -mcpu=swift -mtriple=thumbv7s-apple-ios -o - %s | FileCheck %s --check-prefix=CHECK-STRIDE4 -; RUN: llc -mcpu=swift -mtriple=thumbv7k-apple-watchos -o - %s | FileCheck %s --check-prefix=CHECK-STRIDE4-WATCH -; RUN: llc -mcpu=cortex-a57 -mtriple=thumbv7-linux-gnueabihf -o - %s | FileCheck %s --check-prefix=CHECK-GENERIC +; RUN: llc -mcpu=swift -mtriple=thumbv7s-apple-ios -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-STRIDE4 +; RUN: llc -mcpu=swift -mtriple=thumbv7k-apple-watchos -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-STRIDE4-WATCH +; RUN: llc -mcpu=cortex-a57 -mtriple=thumbv7-linux-gnueabihf -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-GENERIC +; RUN: llc -mattr=wide-stride-vfp -mtriple=thumbv7-linux-gnueabihf -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-GENERIC4 +; CHECK-LABEL: test_reg_stride: define void @test_reg_stride(double %a, double %b) { -; CHECK-STRIDE4-LABEL: test_reg_stride: ; CHECK-STRIDE4-DAG: vmov d16, r ; CHECK-STRIDE4-DAG: vmov d18, r -; CHECK-STRIDE4-WATCH-LABEL: test_reg_stride: ; CHECK-STRIDE4-WATCH-DAG: vmov.f64 d16, d ; CHECK-STRIDE4-WATCH-DAG: vmov.f64 d18, d -; CHECK-GENERIC-LABEL: test_reg_stride: ; CHECK-GENERIC-DAG: vmov.f64 d16, {{d[01]}} ; CHECK-GENERIC-DAG: vmov.f64 d17, {{d[01]}} +; CHECK-GENERIC4-DAG: vmov.f64 d16, {{d[01]}} +; CHECK-GENERIC4-DAG: vmov.f64 d18, {{d[01]}} + call void asm "", "~{r0},~{r1},~{d0},~{d1}"() call arm_aapcs_vfpcc void @eat_doubles(double %a, double %b) ret void } +; CHECK-LABEL: test_stride_minsize: define void @test_stride_minsize(float %a, float %b) minsize { -; CHECK-STRIDE4-LABEL: test_stride_minsize: ; CHECK-STRIDE4: vmov d2, {{r[01]}} ; CHECK-STRIDE4: vmov d3, {{r[01]}} -; CHECK-STRIDE4-WATCH-LABEL: test_stride_minsize: ; CHECK-STRIDE4-WATCH-DAG: vmov.f32 s4, {{s[01]}} ; CHECK-STRIDE4-WATCH-DAG: vmov.f32 s8, {{s[01]}} -; CHECK-GENERIC-LABEL: test_stride_minsize: ; CHECK-GENERIC-DAG: vmov.f32 s4, {{s[01]}} ; CHECK-GENERIC-DAG: vmov.f32 s6, {{s[01]}} + +; CHECK-GENERIC4-DAG: vmov.f32 s4, {{s[01]}} +; CHECK-GENERIC4-DAG: vmov.f32 s6, {{s[01]}} + call void asm "", "~{r0},~{r1},~{s0},~{s1},~{d0},~{d1}"() call arm_aapcs_vfpcc void @eat_floats(float %a, float %b) ret void } - declare arm_aapcs_vfpcc void @eat_doubles(double, double) declare arm_aapcs_vfpcc void @eat_floats(float, float)