AArch64 CodeGen test updates.

This patch makes the listed tests exercise subtarget behaviour through -mattr
feature flags in addition to (or instead of) specific -mcpu values:

  * aarch64-a57-fp-load-balancing.ll: CHECK-A57 prefix renamed to CHECK-BALFP;
    new RUN lines use -mattr=+balance-fp-ops without -mcpu.
  * aarch64-gep-opt.ll: new RUN lines use -mattr=-use-aa / -mattr=+use-aa.
  * exynos-quad-ldp-stp.ll: deleted; its content moves to no-quad-ldp-stp.ll,
    which adds a RUN line using -mattr=+no-quad-ldst-pairs.
  * merge-store.ll: the PR26827 test moves to merge-store-dependency.ll; the
    remaining test is run with -mcpu=cyclone and
    -mattr=-slow-misaligned-128store.
  * misched-fusion.ll: new RUN line uses
    -mattr=+macroop-fusion,+use-postra-scheduler; the mov checks accept either
    x or w registers.
  * remat.ll: new RUN line uses -mattr=+custom-cheap-as-move.
  * sqrt-fastmath.ll: new RUN lines use -mattr=neon,-use-reverse-square-root and
    -mattr=neon,+use-reverse-square-root.

Index: test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
+++ test/CodeGen/AArch64/aarch64-a57-fp-load-balancing.ll
@@ -1,8 +1,14 @@
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A57 --check-prefix CHECK-EVEN
-; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A57 --check-prefix CHECK-ODD
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-BALFP --check-prefix CHECK-EVEN
+; RUN: llc < %s -mcpu=cortex-a57 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-BALFP --check-prefix CHECK-ODD
 ; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-EVEN
 ; RUN: llc < %s -mcpu=cortex-a53 -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-A53 --check-prefix CHECK-ODD
 
+; The following tests use the balance-fp-ops feature, and should be independent of
+; the target cpu.
+
+; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=1 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-EVEN --check-prefix CHECK-BALFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnueabi -mattr=+balance-fp-ops -aarch64-a57-fp-load-balancing-override=2 -aarch64-a57-fp-load-balancing-force-all -enable-misched=false -enable-post-misched=false | FileCheck %s --check-prefix CHECK --check-prefix CHECK-ODD --check-prefix CHECK-BALFP
+
 ; Test the AArch64A57FPLoadBalancing pass. This pass relies heavily on register allocation, so
 ; our test strategy is to:
 ;   * Force the pass to always perform register swapping even if the dest register is of the
@@ -75,7 +81,7 @@
 ; CHECK: fmsub [[x]]
 ; CHECK: fmadd [[y]]
 ; CHECK: fmadd [[x]]
-; CHECK-A57: stp [[x]], [[y]]
+; CHECK-BALFP: stp [[x]], [[y]]
 ; CHECK-A53-DAG: str [[x]]
 ; CHECK-A53-DAG: str [[y]]
 
@@ -170,7 +176,7 @@
 ; CHECK: fmsub [[x]]
 ; CHECK: fmadd [[y]]
 ; CHECK: fmadd [[x]]
-; CHECK-A57: stp [[x]], [[y]]
+; CHECK-BALFP: stp [[x]], [[y]]
 ; CHECK-A53-DAG: str [[x]]
 ; CHECK-A53-DAG: str [[y]]
 
Index: test/CodeGen/AArch64/aarch64-gep-opt.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-gep-opt.ll
+++ test/CodeGen/AArch64/aarch64-gep-opt.ll
@@ -1,6 +1,9 @@
 ; RUN: llc -O3 -aarch64-gep-opt=true -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -O3 -aarch64-gep-opt=true -mattr=-use-aa -print-after=codegenprepare < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-NoAA <%t %s
+; RUN: llc -O3 -aarch64-gep-opt=true -mattr=+use-aa -print-after=codegenprepare < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-UseAA <%t %s
 ; RUN: llc -O3 -aarch64-gep-opt=true -print-after=codegenprepare -mcpu=cyclone < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-NoAA <%t %s
 ; RUN: llc -O3 -aarch64-gep-opt=true -print-after=codegenprepare -mcpu=cortex-a53 < %s >%t 2>&1 && FileCheck --check-prefix=CHECK-UseAA <%t %s
+
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-linux-gnueabi"
 
Index: test/CodeGen/AArch64/exynos-quad-ldp-stp.ll
===================================================================
--- test/CodeGen/AArch64/exynos-quad-ldp-stp.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc < %s -march=aarch64 -mcpu=exynos-m1 -verify-machineinstrs -asm-verbose=false | FileCheck %s
-
-; CHECK-LABEL: test_exynos_nopair_st
-; CHECK: str
-; CHECK: stur
-; CHECK-NOT: stp
-define void @test_exynos_nopair_st(double* %ptr, <2 x double> %v1, <2 x double> %v2) {
-  %tmp1 = bitcast double* %ptr to <2 x double>*
-  store <2 x double> %v2, <2 x double>* %tmp1, align 16
-  %add.ptr = getelementptr inbounds double, double* %ptr, i64 -2
-  %tmp = bitcast double* %add.ptr to <2 x double>*
-  store <2 x double> %v1, <2 x double>* %tmp, align 16
-  ret void
-}
-
-; CHECK-LABEL: test_exynos_nopair_ld
-; CHECK: ldr
-; CHECK: ldr
-; CHECK-NOT: ldp
-define <2 x i64> @test_exynos_nopair_ld(i64* %p) {
-  %a1 = bitcast i64* %p to <2 x i64>*
-  %tmp1 = load <2 x i64>, < 2 x i64>* %a1, align 8
-  %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 2
-  %a2 = bitcast i64* %add.ptr2 to <2 x i64>*
-  %tmp2 = load <2 x i64>, <2 x i64>* %a2, align 8
-  %add = add nsw <2 x i64> %tmp1, %tmp2
-  ret <2 x i64> %add
-}
Index: test/CodeGen/AArch64/merge-store-dependency.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/merge-store-dependency.ll
@@ -0,0 +1,63 @@
+; RUN: llc -mcpu cortex-a53 -march aarch64 %s -o - | FileCheck %s --check-prefix=A53
+
+; PR26827 - Merge stores causes wrong dependency.
+%struct1 = type { %struct1*, %struct1*, i32, i32, i16, i16, void (i32, i32, i8*)*, i8* }
+@gv0 = internal unnamed_addr global i32 0, align 4
+@gv1 = internal unnamed_addr global %struct1** null, align 8
+
+define void @test(%struct1* %fde, i32 %fd, void (i32, i32, i8*)* %func, i8* %arg) {
+; A53-LABEL: test:
+entry:
+; A53: mov [[DATA:w[0-9]+]], w1
+; A53: str q{{[0-9]+}}, {{.*}}
+; A53: str q{{[0-9]+}}, {{.*}}
+; A53: str [[DATA]], {{.*}}
+
+  %0 = bitcast %struct1* %fde to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 40, i32 8, i1 false)
+  %state = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 4
+  store i16 256, i16* %state, align 8
+  %fd1 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 2
+  store i32 %fd, i32* %fd1, align 8
+  %force_eof = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 3
+  store i32 0, i32* %force_eof, align 4
+  %func2 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 6
+  store void (i32, i32, i8*)* %func, void (i32, i32, i8*)** %func2, align 8
+  %arg3 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 7
+  store i8* %arg, i8** %arg3, align 8
+  %call = tail call i32 (i32, i32, ...) @fcntl(i32 %fd, i32 4, i8* %0) #6
+  %1 = load i32, i32* %fd1, align 8
+  %cmp.i = icmp slt i32 %1, 0
+  br i1 %cmp.i, label %if.then.i, label %while.body.i.preheader
+if.then.i:
+  unreachable
+
+while.body.i.preheader:
+  %2 = load i32, i32* @gv0, align 4
+  %3 = icmp eq i32* %fd1, @gv0
+  br i1 %3, label %while.body.i.split, label %while.body.i.split.ver.us.preheader
+
+while.body.i.split.ver.us.preheader:
+  br label %while.body.i.split.ver.us
+
+while.body.i.split.ver.us:
+  %.reg2mem21.0 = phi i32 [ %mul.i.ver.us, %while.body.i.split.ver.us ], [ %2, %while.body.i.split.ver.us.preheader ]
+  %mul.i.ver.us = shl nsw i32 %.reg2mem21.0, 1
+  %4 = icmp sgt i32 %mul.i.ver.us, %1
+  br i1 %4, label %while.end.i, label %while.body.i.split.ver.us
+
+while.body.i.split:
+  br label %while.body.i.split
+
+while.end.i:
+  %call.i = tail call i8* @foo()
+  store i8* %call.i, i8** bitcast (%struct1*** @gv1 to i8**), align 8
+  br label %exit
+
+exit:
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+declare i32 @fcntl(i32, i32, ...)
+declare noalias i8* @foo()
Index: test/CodeGen/AArch64/merge-store.ll
===================================================================
--- test/CodeGen/AArch64/merge-store.ll
+++ test/CodeGen/AArch64/merge-store.ll
@@ -1,6 +1,5 @@
-; RUN: llc -march aarch64 %s -o - | FileCheck %s
-; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=cyclone | FileCheck %s --check-prefix=CYCLONE
-; RUN: llc -mcpu cortex-a53 -march aarch64 %s -o - | FileCheck %s --check-prefix=A53
+; RUN: llc -march aarch64 %s -mcpu=cyclone -o - | FileCheck %s --check-prefix=CYCLONE --check-prefix=CHECK
+; RUN: llc -march aarch64 %s -mattr=-slow-misaligned-128store -o - | FileCheck %s --check-prefix=MISALIGNED --check-prefix=CHECK
 
 @g0 = external global <3 x float>, align 16
 @g1 = external global <3 x float>, align 4
@@ -39,9 +38,12 @@
   store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
   ret void
 
-; CHECK-LABEL: merge_vec_extract_stores
-; CHECK: stur q0, [x0, #24]
-; CHECK-NEXT: ret
+; MISALIGNED-LABEL: merge_vec_extract_stores
+; MISALIGNED: stur q0, [x0, #24]
+; MISALIGNED-NEXT: ret
+
+; FIXME: Ideally we would like to use a generic target for this test, but this relies
+; on suppressing store pairs.
 
 ; CYCLONE-LABEL: merge_vec_extract_stores
 ; CYCLONE: ext v1.16b, v0.16b, v0.16b, #8
@@ -49,66 +51,3 @@
 ; CYCLONE-NEXT: str d1, [x0, #32]
 ; CYCLONE-NEXT: ret
 }
-
-
-; PR26827 - Merge stores causes wrong dependency.
-%struct1 = type { %struct1*, %struct1*, i32, i32, i16, i16, void (i32, i32, i8*)*, i8* }
-@gv0 = internal unnamed_addr global i32 0, align 4
-@gv1 = internal unnamed_addr global %struct1** null, align 8
-
-define void @test(%struct1* %fde, i32 %fd, void (i32, i32, i8*)* %func, i8* %arg) {
-;CHECK-LABEL: test
-entry:
-;A53: mov [[DATA:w[0-9]+]], w1
-;A53: str q{{[0-9]+}}, {{.*}}
-;A53: str q{{[0-9]+}}, {{.*}}
-;A53: str [[DATA]], {{.*}}
-
-  %0 = bitcast %struct1* %fde to i8*
-  tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 40, i32 8, i1 false)
-  %state = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 4
-  store i16 256, i16* %state, align 8
-  %fd1 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 2
-  store i32 %fd, i32* %fd1, align 8
-  %force_eof = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 3
-  store i32 0, i32* %force_eof, align 4
-  %func2 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 6
-  store void (i32, i32, i8*)* %func, void (i32, i32, i8*)** %func2, align 8
-  %arg3 = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 7
-  store i8* %arg, i8** %arg3, align 8
-  %call = tail call i32 (i32, i32, ...) @fcntl(i32 %fd, i32 4, i8* %0) #6
-  %1 = load i32, i32* %fd1, align 8
-  %cmp.i = icmp slt i32 %1, 0
-  br i1 %cmp.i, label %if.then.i, label %while.body.i.preheader
-if.then.i:
-  unreachable
-
-while.body.i.preheader:
-  %2 = load i32, i32* @gv0, align 4
-  %3 = icmp eq i32* %fd1, @gv0
-  br i1 %3, label %while.body.i.split, label %while.body.i.split.ver.us.preheader
-
-while.body.i.split.ver.us.preheader:
-  br label %while.body.i.split.ver.us
-
-while.body.i.split.ver.us:
-  %.reg2mem21.0 = phi i32 [ %mul.i.ver.us, %while.body.i.split.ver.us ], [ %2, %while.body.i.split.ver.us.preheader ]
-  %mul.i.ver.us = shl nsw i32 %.reg2mem21.0, 1
-  %4 = icmp sgt i32 %mul.i.ver.us, %1
-  br i1 %4, label %while.end.i, label %while.body.i.split.ver.us
-
-while.body.i.split:
-  br label %while.body.i.split
-
-while.end.i:
-  %call.i = tail call i8* @foo()
-  store i8* %call.i, i8** bitcast (%struct1*** @gv1 to i8**), align 8
-  br label %exit
-
-exit:
-  ret void
-}
-
-declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
-declare i32 @fcntl(i32, i32, ...)
-declare noalias i8* @foo()
Index: test/CodeGen/AArch64/misched-fusion.ll
===================================================================
--- test/CodeGen/AArch64/misched-fusion.ll
+++ test/CodeGen/AArch64/misched-fusion.ll
@@ -1,4 +1,6 @@
+; RUN: llc -o - %s -mattr=+macroop-fusion,+use-postra-scheduler | FileCheck %s
 ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
+
 target triple = "arm64-apple-ios"
 
 declare void @foobar(i32 %v0, i32 %v1)
@@ -8,12 +10,12 @@
 ; CHECK: add w[[ADDRES:[0-9]+]], w1, #7
 ; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13
 ; CHECK-NEXT: cbnz w[[SUBRES]], [[SKIPBLOCK:LBB[0-9_]+]]
-; CHECK: mov x0, x[[ADDRES]]
-; CHECK: mov x1, x[[SUBRES]]
+; CHECK: mov [[REGTY:[xw]]]0, [[REGTY]][[ADDRES]]
+; CHECK: mov [[REGTY]]1, [[REGTY]][[SUBRES]]
 ; CHECK: bl _foobar
 ; CHECK: [[SKIPBLOCK]]:
-; CHECK: mov x0, x[[SUBRES]]
-; CHECK: mov x1, x[[ADDRES]]
+; CHECK: mov [[REGTY]]0, [[REGTY]][[SUBRES]]
+; CHECK: mov [[REGTY]]1, [[REGTY]][[ADDRES]]
 ; CHECK: bl _foobar
 define void @test_sub_cbz(i32 %a0, i32 %a1) {
 entry:
Index: test/CodeGen/AArch64/no-quad-ldp-stp.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/no-quad-ldp-stp.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -march=aarch64 -mattr=+no-quad-ldst-pairs -verify-machineinstrs -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=aarch64 -mcpu=exynos-m1 -verify-machineinstrs -asm-verbose=false | FileCheck %s
+
+; CHECK-LABEL: test_nopair_st
+; CHECK: str
+; CHECK: stur
+; CHECK-NOT: stp
+define void @test_nopair_st(double* %ptr, <2 x double> %v1, <2 x double> %v2) {
+  %tmp1 = bitcast double* %ptr to <2 x double>*
+  store <2 x double> %v2, <2 x double>* %tmp1, align 16
+  %add.ptr = getelementptr inbounds double, double* %ptr, i64 -2
+  %tmp = bitcast double* %add.ptr to <2 x double>*
+  store <2 x double> %v1, <2 x double>* %tmp, align 16
+  ret void
+}
+
+; CHECK-LABEL: test_nopair_ld
+; CHECK: ldr
+; CHECK: ldr
+; CHECK-NOT: ldp
+define <2 x i64> @test_nopair_ld(i64* %p) {
+  %a1 = bitcast i64* %p to <2 x i64>*
+  %tmp1 = load <2 x i64>, <2 x i64>* %a1, align 8
+  %add.ptr2 = getelementptr inbounds i64, i64* %p, i64 2
+  %a2 = bitcast i64* %add.ptr2 to <2 x i64>*
+  %tmp2 = load <2 x i64>, <2 x i64>* %a2, align 8
+  %add = add nsw <2 x i64> %tmp1, %tmp2
+  ret <2 x i64> %add
+}
Index: test/CodeGen/AArch64/remat.ll
===================================================================
--- test/CodeGen/AArch64/remat.ll
+++ test/CodeGen/AArch64/remat.ll
@@ -6,6 +6,7 @@
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m1 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=vulcan -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mattr=+custom-cheap-as-move -o - %s | FileCheck %s
 
 %X = type { i64, i64, i64 }
 declare void @f(%X*)
Index: test/CodeGen/AArch64/sqrt-fastmath.ll
===================================================================
--- test/CodeGen/AArch64/sqrt-fastmath.ll
+++ test/CodeGen/AArch64/sqrt-fastmath.ll
@@ -1,5 +1,7 @@
 ; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=!sqrt,!vec-sqrt | FileCheck %s --check-prefix=FAULT
 ; RUN: llc < %s -mtriple=aarch64 -mattr=neon -recip=sqrt,vec-sqrt | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon,-use-reverse-square-root | FileCheck %s --check-prefix=FAULT
+; RUN: llc < %s -mtriple=aarch64 -mattr=neon,+use-reverse-square-root | FileCheck %s
 
 declare float @llvm.sqrt.f32(float) #1
 declare double @llvm.sqrt.f64(double) #1