Index: llvm/lib/Target/AArch64/AArch64.td =================================================================== --- llvm/lib/Target/AArch64/AArch64.td +++ llvm/lib/Target/AArch64/AArch64.td @@ -91,9 +91,6 @@ def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store", "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">; -def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128", - "Paired128IsSlow", "true", "Paired 128 bit loads and stores are slow">; - def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow", "true", "STR of Q register with register offset is slow">; @@ -294,8 +291,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", "Samsung Exynos-M1 processors", - [FeatureSlowPaired128, - FeatureCRC, + [FeatureCRC, FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, @@ -309,8 +305,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1", "Samsung Exynos-M2/M3 processors", - [FeatureSlowPaired128, - FeatureCRC, + [FeatureCRC, FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1894,19 +1894,6 @@ if (isLdStPairSuppressed(MI)) return false; - // On some CPUs quad load/store pairs are slower than two single load/stores. - if (Subtarget.isPaired128Slow()) { - switch (MI.getOpcode()) { - default: - break; - case AArch64::LDURQi: - case AArch64::STURQi: - case AArch64::LDRQui: - case AArch64::STRQui: - return false; - } - } - return true; } Index: llvm/lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -102,7 +102,6 @@ bool CustomAsCheapAsMove = false; bool UsePostRAScheduler = false; bool Misaligned128StoreIsSlow = false; - bool Paired128IsSlow = false; bool STRQroIsSlow = false; bool UseAlternateSExtLoadCVTF32Pattern = false; bool HasArithmeticBccFusion = false; @@ -220,7 +219,6 @@ } bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; } bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; } - bool isPaired128Slow() const { return Paired128IsSlow; } bool isSTRQroSlow() const { return STRQroIsSlow; } bool useAlternateSExtLoadCVTF32Pattern() const { return UseAlternateSExtLoadCVTF32Pattern; Index: llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll +++ llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll @@ -1,6 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; Test ldr clustering. ; CHECK: ********** MI Scheduling ********** @@ -8,11 +7,6 @@ ; CHECK: Cluster ld/st SU(1) - SU(2) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldr_int:BB#0 -; EXYNOS: Cluster ld/st SU(1) - SU(2) -; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDRWui -; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDRWui define i32 @ldr_int(i32* %a) nounwind { %p1 = getelementptr inbounds i32, i32* %a, i32 1 %tmp1 = load i32, i32* %p1, align 2 @@ -28,11 +22,6 @@ ; CHECK: Cluster ld/st SU(1) - SU(2) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRSWui ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRSWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldp_sext_int:BB#0 -; EXYNOS: Cluster ld/st SU(1) - SU(2) -; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDRSWui -; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDRSWui define i64 @ldp_sext_int(i32* %p) nounwind { %tmp = load i32, i32* %p, align 4 %add.ptr = getelementptr inbounds i32, i32* %p, i64 1 @@ -49,11 +38,6 @@ ; CHECK: Cluster ld/st SU(2) - SU(1) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDURWi ; CHECK: SU(2): %vreg{{[0-9]+}} = LDURWi -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldur_int:BB#0 -; EXYNOS: Cluster ld/st SU(2) - SU(1) -; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDURWi -; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDURWi define i32 @ldur_int(i32* %a) nounwind { %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 @@ -69,11 +53,6 @@ ; CHECK: Cluster ld/st SU(3) - SU(4) ; CHECK: SU(3): %vreg{{[0-9]+}} = LDRSWui ; CHECK: SU(4): %vreg{{[0-9]+}}:sub_32 = LDRWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldp_half_sext_zext_int:BB#0 -; EXYNOS: Cluster ld/st SU(3) - SU(4) -; EXYNOS: SU(3): %vreg{{[0-9]+}} = LDRSWui -; EXYNOS: SU(4): %vreg{{[0-9]+}}:sub_32 = LDRWui define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind { %tmp0 = load i64, i64* %q, align 4 %tmp = load i32, i32* %p, align 4 @@ -92,11 +71,6 @@ ; CHECK: Cluster ld/st SU(3) - SU(4) ; CHECK: SU(3): %vreg{{[0-9]+}}:sub_32 = LDRWui ; CHECK: SU(4): %vreg{{[0-9]+}} = LDRSWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldp_half_zext_sext_int:BB#0 -; EXYNOS: Cluster ld/st SU(3) - SU(4) -; EXYNOS: SU(3): %vreg{{[0-9]+}}:sub_32 = LDRWui -; EXYNOS: SU(4): %vreg{{[0-9]+}} = LDRSWui define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind { %tmp0 = load i64, i64* %q, align 4 %tmp = load i32, i32* %p, align 4 @@ -115,11 +89,6 @@ ; CHECK-NOT: Cluster ld/st ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldr_int_volatile:BB#0 -; EXYNOS-NOT: Cluster ld/st -; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDRWui -; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDRWui define i32 @ldr_int_volatile(i32* %a) nounwind { %p1 = getelementptr inbounds i32, i32* %a, i32 1 %tmp1 = load volatile i32, i32* %p1, align 2 @@ -129,15 +98,12 @@ ret i32 %tmp3 } -; Test ldq clustering (no clustering for Exynos). +; Test ldq clustering. ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: ldq_cluster:BB#0 ; CHECK: Cluster ld/st SU(1) - SU(3) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRQui ; CHECK: SU(3): %vreg{{[0-9]+}} = LDRQui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldq_cluster:BB#0 -; EXYNOS-NOT: Cluster ld/st define <2 x i64> @ldq_cluster(i64* %p) { %a1 = bitcast i64* %p to <2 x i64>* %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8 Index: llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll =================================================================== --- llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+slow-paired-128 -verify-machineinstrs -asm-verbose=false | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-eabi -mcpu=exynos-m1 -verify-machineinstrs -asm-verbose=false | FileCheck %s - -; CHECK-LABEL: test_nopair_st -; CHECK: str -; CHECK: stur -; CHECK-NOT: stp -define void @test_nopair_st(double* %ptr, <2 x double> %v1, <2 x double> %v2) { - %tmp1 = bitcast double* %ptr to <2 x double>* - store <2 x double> %v2, <2 x double>* %tmp1, align 16 - %add.ptr = getelementptr inbounds double, double* %ptr, i64 -2 - %tmp = bitcast double* %add.ptr to <2 x double>* - store <2 x double> %v1, <2 x double>* %tmp, align 16 - ret void -} - -; CHECK-LABEL: test_nopair_ld -; CHECK: ldr -; CHECK: ldr -; CHECK-NOT: ldp -define <2 x i64> @test_nopair_ld(i64* %p) { - %a1 = bitcast i64* %p to <2 x i64>* - %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8 - %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 2 - %a2 = bitcast i64* %add.ptr2 to <2 x i64>* - %tmp2 = load <2 x i64>, <2 x i64>* %a2, align 8 - %add = add nsw <2 x i64> %tmp1, %tmp2 - ret <2 x i64> %add -}