Index: llvm/lib/Target/AArch64/AArch64.td =================================================================== --- llvm/lib/Target/AArch64/AArch64.td +++ llvm/lib/Target/AArch64/AArch64.td @@ -91,9 +91,6 @@ def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store", "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">; -def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128", - "Paired128IsSlow", "true", "Paired 128 bit loads and stores are slow">; - def FeatureSlowSTRQro : SubtargetFeature<"slow-strqro-store", "STRQroIsSlow", "true", "STR of Q register with register offset is slow">; @@ -294,8 +291,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", "Samsung Exynos-M1 processors", - [FeatureSlowPaired128, - FeatureCRC, + [FeatureCRC, FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, @@ -309,8 +305,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1", "Samsung Exynos-M2/M3 processors", - [FeatureSlowPaired128, - FeatureCRC, + [FeatureCRC, FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1894,19 +1894,6 @@ if (isLdStPairSuppressed(MI)) return false; - // On some CPUs quad load/store pairs are slower than two single load/stores. - if (Subtarget.isPaired128Slow()) { - switch (MI.getOpcode()) { - default: - break; - case AArch64::LDURQi: - case AArch64::STURQi: - case AArch64::LDRQui: - case AArch64::STRQui: - return false; - } - } - return true; } Index: llvm/lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -102,7 +102,6 @@ bool CustomAsCheapAsMove = false; bool UsePostRAScheduler = false; bool Misaligned128StoreIsSlow = false; - bool Paired128IsSlow = false; bool STRQroIsSlow = false; bool UseAlternateSExtLoadCVTF32Pattern = false; bool HasArithmeticBccFusion = false; @@ -220,7 +219,6 @@ } bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; } bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; } - bool isPaired128Slow() const { return Paired128IsSlow; } bool isSTRQroSlow() const { return STRQroIsSlow; } bool useAlternateSExtLoadCVTF32Pattern() const { return UseAlternateSExtLoadCVTF32Pattern; Index: llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll =================================================================== --- llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll +++ llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefixes=CHECK,CORTEX +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefixes=CHECK,EXYNOS ; Test ldr clustering. ; CHECK: ********** MI Scheduling ********** @@ -8,11 +8,6 @@ ; CHECK: Cluster ld/st SU(1) - SU(2) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldr_int:BB#0 -; EXYNOS: Cluster ld/st SU(1) - SU(2) -; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDRWui -; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDRWui define i32 @ldr_int(i32* %a) nounwind { %p1 = getelementptr inbounds i32, i32* %a, i32 1 %tmp1 = load i32, i32* %p1, align 2 @@ -28,11 +23,6 @@ ; CHECK: Cluster ld/st SU(1) - SU(2) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRSWui ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRSWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldp_sext_int:BB#0 -; EXYNOS: Cluster ld/st SU(1) - SU(2) -; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDRSWui -; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDRSWui define i64 @ldp_sext_int(i32* %p) nounwind { %tmp = load i32, i32* %p, align 4 %add.ptr = getelementptr inbounds i32, i32* %p, i64 1 @@ -49,11 +39,6 @@ ; CHECK: Cluster ld/st SU(2) - SU(1) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDURWi ; CHECK: SU(2): %vreg{{[0-9]+}} = LDURWi -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldur_int:BB#0 -; EXYNOS: Cluster ld/st SU(2) - SU(1) -; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDURWi -; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDURWi define i32 @ldur_int(i32* %a) nounwind { %p1 = getelementptr inbounds i32, i32* %a, i32 -1 %tmp1 = load i32, i32* %p1, align 2 @@ -69,11 +54,6 @@ ; CHECK: Cluster ld/st SU(3) - SU(4) ; CHECK: SU(3): %vreg{{[0-9]+}} = LDRSWui ; CHECK: SU(4): %vreg{{[0-9]+}}:sub_32 = LDRWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldp_half_sext_zext_int:BB#0 -; EXYNOS: Cluster ld/st SU(3) - SU(4) -; EXYNOS: SU(3): %vreg{{[0-9]+}} = LDRSWui -; EXYNOS: SU(4): %vreg{{[0-9]+}}:sub_32 = LDRWui define i64 @ldp_half_sext_zext_int(i64* %q, i32* %p) nounwind { %tmp0 = load i64, i64* %q, align 4 %tmp = load i32, i32* %p, align 4 @@ -92,11 +72,6 @@ ; CHECK: Cluster ld/st SU(3) - SU(4) ; CHECK: SU(3): %vreg{{[0-9]+}}:sub_32 = LDRWui ; CHECK: SU(4): %vreg{{[0-9]+}} = LDRSWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldp_half_zext_sext_int:BB#0 -; EXYNOS: Cluster ld/st SU(3) - SU(4) -; EXYNOS: SU(3): %vreg{{[0-9]+}}:sub_32 = LDRWui -; EXYNOS: SU(4): %vreg{{[0-9]+}} = LDRSWui define i64 @ldp_half_zext_sext_int(i64* %q, i32* %p) nounwind { %tmp0 = load i64, i64* %q, align 4 %tmp = load i32, i32* %p, align 4 @@ -115,11 +90,6 @@ ; CHECK-NOT: Cluster ld/st ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRWui ; CHECK: SU(2): %vreg{{[0-9]+}} = LDRWui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldr_int_volatile:BB#0 -; EXYNOS-NOT: Cluster ld/st -; EXYNOS: SU(1): %vreg{{[0-9]+}} = LDRWui -; EXYNOS: SU(2): %vreg{{[0-9]+}} = LDRWui define i32 @ldr_int_volatile(i32* %a) nounwind { %p1 = getelementptr inbounds i32, i32* %a, i32 1 %tmp1 = load volatile i32, i32* %p1, align 2 @@ -129,15 +99,12 @@ ret i32 %tmp3 } -; Test ldq clustering (no clustering for Exynos). +; Test ldq clustering. ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: ldq_cluster:BB#0 ; CHECK: Cluster ld/st SU(1) - SU(3) ; CHECK: SU(1): %vreg{{[0-9]+}} = LDRQui ; CHECK: SU(3): %vreg{{[0-9]+}} = LDRQui -; EXYNOS: ********** MI Scheduling ********** -; EXYNOS-LABEL: ldq_cluster:BB#0 -; EXYNOS-NOT: Cluster ld/st define <2 x i64> @ldq_cluster(i64* %p) { %a1 = bitcast i64* %p to <2 x i64>* %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8 Index: llvm/test/CodeGen/AArch64/ldst-opt.ll =================================================================== --- llvm/test/CodeGen/AArch64/ldst-opt.ll +++ llvm/test/CodeGen/AArch64/ldst-opt.ll @@ -1,6 +1,6 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,NOSTRICTALIGN ; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mattr=+strict-align -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,STRICTALIGN -; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mcpu=exynos-m1 -o - %s | FileCheck %s --check-prefixes=CHECK,EXYNOS +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mcpu=exynos-m1 -o - %s | FileCheck %s --check-prefixes=CHECK,EXYNOS,NOSTRICTALIGN ; This file contains tests for the AArch64 load/store optimizer. @@ -1141,8 +1141,8 @@ define void @store-pair-post-indexed-quadword() nounwind { ; CHECK-LABEL: store-pair-post-indexed-quadword ; GENERIC: stp q{{[0-9]+}}, q{{[0-9]+}}, [sp], #64 -; EXYNOS: str q{{[0-9]+}}, [sp] -; EXYNOS-NEXT: str q{{[0-9]+}}, [sp, #16] +; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [sp] +; EXYNOS-NEXT: add sp, sp, #64 %src = alloca { fp128, fp128 }, align 8 %dst = alloca { fp128, fp128 }, align 8 @@ -1552,7 +1552,6 @@ define void @merge_zr64(i64* %p) { ; CHECK-LABEL: merge_zr64: ; CHECK: stp xzr, xzr, [x{{[0-9]+}}] -; CHECK-NEXT: ret entry: store i64 0, i64* %p %p1 = getelementptr i64, i64* %p, i64 1 @@ -1627,9 +1626,7 @@ define void @merge_zr64_4vecd(<4 x double>* %p) { ; CHECK-LABEL: merge_zr64_4vecd: ; CHECK: movi v[[REG:[0-9]]].2d, #0000000000000000 -; GENERIC-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] -; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16] -; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}] +; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] entry: store <4 x double> zeroinitializer, <4 x double>* %p ret void Index: llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll =================================================================== --- llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+slow-paired-128 -verify-machineinstrs -asm-verbose=false | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-eabi -mcpu=exynos-m1 -verify-machineinstrs -asm-verbose=false | FileCheck %s - -; CHECK-LABEL: test_nopair_st -; CHECK: str -; CHECK: stur -; CHECK-NOT: stp -define void @test_nopair_st(double* %ptr, <2 x double> %v1, <2 x double> %v2) { - %tmp1 = bitcast double* %ptr to <2 x double>* - store <2 x double> %v2, <2 x double>* %tmp1, align 16 - %add.ptr = getelementptr inbounds double, double* %ptr, i64 -2 - %tmp = bitcast double* %add.ptr to <2 x double>* - store <2 x double> %v1, <2 x double>* %tmp, align 16 - ret void -} - -; CHECK-LABEL: test_nopair_ld -; CHECK: ldr -; CHECK: ldr -; CHECK-NOT: ldp -define <2 x i64> @test_nopair_ld(i64* %p) { - %a1 = bitcast i64* %p to <2 x i64>* - %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8 - %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 2 - %a2 = bitcast i64* %add.ptr2 to <2 x i64>* - %tmp2 = load <2 x i64>, <2 x i64>* %a2, align 8 - %add = add nsw <2 x i64> %tmp1, %tmp2 - ret <2 x i64> %add -} Index: llvm/test/CodeGen/AArch64/quad-ldp-stp.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/quad-ldp-stp.ll @@ -0,0 +1,72 @@ +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=generic | FileCheck %s --check-prefixes=CHECK,GENERIC +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 | FileCheck %s --check-prefixes=CHECK,GENERIC +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 | FileCheck %s --check-prefixes=CHECK,EXYNOS + +; CHECK-LABEL: test_pre_pair_ld +; CHECK: ldr q{{[0-9]+}}, [x0, #-32]! +; CHECK: ldr q{{[0-9]+}}, [x0, #16] +define <2 x i64>* @test_pre_pair_ld(<2 x i64>* %p, <2 x i64>* %q) { + %p1 = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i64 -1 + %tmp0 = load <2 x i64>, <2 x i64>* %p1, align 16 + %p2 = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i64 -2 + %tmp1 = load <2 x i64>, <2 x i64>* %p2, align 16 + %add = add nsw <2 x i64> %tmp0, %tmp1 + store <2 x i64> %add, <2 x i64>* %q, align 16 + ret <2 x i64>* %p2 +} + +; CHECK-LABEL: test_pair_ld +; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x{{[0-9]+}}] +define <2 x i64> @test_pair_ld(i64* %p) { + %a1 = bitcast i64* %p to <2 x i64>* + %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8 + %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 2 + %a2 = bitcast i64* %add.ptr2 to <2 x i64>* + %tmp2 = load <2 x i64>, <2 x i64>* %a2, align 8 + %add = add nsw <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %add +} + +; CHECK-LABEL: test_post_pair_ld +; GENERIC: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x0], #32 +; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x0]{{$}} +define <2 x i64>* @test_post_pair_ld(<2 x i64>* %p, <2 x i64>* %q) { + %tmp0 = load <2 x i64>, <2 x i64>* %p, align 16 + %p1 = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i64 1 + %tmp1 = load <2 x i64>, <2 x i64>* %p1, align 16 + %add = add nsw <2 x i64> %tmp0, %tmp1 + store <2 x i64> %add, <2 x i64>* %q, align 16 + %p2 = getelementptr inbounds <2 x i64>, <2 x i64>* %p, i64 2 + ret <2 x i64>* %p2 +} + +; CHECK-LABEL: test_pre_pair_st +; CHECK: str q{{[0-9]+}}, [x0, #-32]! +; CHECK: str q{{[0-9]+}}, [x0, #16] +define <2 x double>* @test_pre_pair_st(<2 x double>* %ptr, <2 x double> %v1, <2 x double> %v2) { + %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 -1 + store <2 x double> %v1, <2 x double>* %ptr1, align 16 + %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 -2 + store <2 x double> %v2, <2 x double>* %ptr2, align 16 + ret <2 x double>* %ptr2 +} + +; CHECK-LABEL: test_pair_st +; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x0, #-16] +define void @test_pair_st(<2 x double> * %ptr, <2 x double> %v1, <2 x double> %v2) { + store <2 x double> %v2, <2 x double>* %ptr, align 16 + %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 -1 + store <2 x double> %v1, <2 x double>* %ptr1, align 16 + ret void +} + +; CHECK-LABEL: test_post_pair_st +; GENERIC: stp q{{[0-9]+}}, q{{[0-9]+}}, [x0] +; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [x0]{{$}} +define <2 x double>* @test_post_pair_st(<2 x double>* %ptr, <2 x double> %v1, <2 x double> %v2) { + store <2 x double> %v1, <2 x double>* %ptr, align 16 + %ptr1 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 1 + store <2 x double> %v2, <2 x double>* %ptr1, align 16 + %ptr2 = getelementptr inbounds <2 x double>, <2 x double>* %ptr, i64 2 + ret <2 x double>* %ptr2 +}