Index: llvm/trunk/lib/Target/AArch64/AArch64.td
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.td
+++ llvm/trunk/lib/Target/AArch64/AArch64.td
@@ -80,13 +80,17 @@
 def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
   "Enable Scalable Vector Extension (SVE) instructions">;

-/// Cyclone has register move instructions which are "free".
 def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
                                         "Has zero-cycle register moves">;

+def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
+                                          "Has zero-cycle zeroing instructions for generic registers">;
+
+def FeatureZCZeroingFP : SubtargetFeature<"zcz-fp", "HasZeroCycleZeroingFP", "true",
+                                          "Has zero-cycle zeroing instructions for FP registers">;
-/// Cyclone has instructions which zero registers for "free".
 def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
-                                        "Has zero-cycle zeroing instructions">;
+                                        "Has zero-cycle zeroing instructions",
+                                        [FeatureZCZeroingGP, FeatureZCZeroingFP]>;

 /// ... but the floating-point version doesn't quite work in rare cases on older
 /// CPUs.
@@ -404,7 +408,7 @@
                                       FeaturePostRAScheduler,
                                       FeatureSlowMisaligned128Store,
                                       FeatureUseRSqrt,
-                                      FeatureZCZeroing]>;
+                                      FeatureZCZeroingFP]>;

 def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
                                     "Samsung Exynos-M2 processors",
@@ -418,7 +422,7 @@
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
                                      FeatureSlowMisaligned128Store,
-                                     FeatureZCZeroing]>;
+                                     FeatureZCZeroingFP]>;

 def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
                                     "Samsung Exynos-M3 processors",
@@ -435,7 +439,7 @@
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
                                      FeaturePredictableSelectIsExpensive,
-                                     FeatureZCZeroing]>;
+                                     FeatureZCZeroingFP]>;

 def ProcKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                 "Qualcomm Kryo processors", [
Index: llvm/trunk/lib/Target/AArch64/AArch64AsmPrinter.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -503,7 +503,7 @@

 void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
   unsigned DestReg = MI.getOperand(0).getReg();
-  if (STI->hasZeroCycleZeroing() && !STI->hasZeroCycleZeroingFPWorkaround()) {
+  if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround()) {
     // Convert H/S/D register to corresponding Q register
     if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
       DestReg = AArch64::Q0 + (DestReg - AArch64::H0);
Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -729,9 +729,9 @@
   case AArch64::FMOVH0:
   case AArch64::FMOVS0:
   case AArch64::FMOVD0:
-    return Subtarget.hasZeroCycleZeroing();
+    return Subtarget.hasZeroCycleZeroingFP();
   case TargetOpcode::COPY:
-    return (Subtarget.hasZeroCycleZeroing() &&
+    return (Subtarget.hasZeroCycleZeroingGP() &&
            (MI.getOperand(1).getReg() == AArch64::WZR ||
             MI.getOperand(1).getReg() == AArch64::XZR));
   }
@@ -2481,7 +2481,7 @@
           .addImm(0)
           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
     }
-  } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) {
+  } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
     BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
         .addImm(0)
         .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
@@ -2518,7 +2518,7 @@
         .addReg(SrcReg, getKillRegState(KillSrc))
         .addImm(0)
         .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
-  } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) {
+  } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
     BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
         .addImm(0)
         .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
Index: llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
+++ llvm/trunk/lib/Target/AArch64/AArch64Subtarget.h
@@ -109,6 +109,8 @@

   // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
   bool HasZeroCycleZeroing = false;
+  bool HasZeroCycleZeroingGP = false;
+  bool HasZeroCycleZeroingFP = false;
   bool HasZeroCycleZeroingFPWorkaround = false;

   // StrictAlign - Disallow unaligned memory accesses.
@@ -228,7 +230,9 @@

   bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }

-  bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
+  bool hasZeroCycleZeroingGP() const { return HasZeroCycleZeroingGP; }
+
+  bool hasZeroCycleZeroingFP() const { return HasZeroCycleZeroingFP; }

   bool hasZeroCycleZeroingFPWorkaround() const {
     return HasZeroCycleZeroingFPWorkaround;
Index: llvm/trunk/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-zero-cycle-regmov.ll
@@ -1,16 +1,20 @@
-; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s
-; rdar://12254953
+; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=-zcm | FileCheck %s -check-prefixes=CHECK,NOT
+; RUN: llc < %s -mtriple=arm64-apple-ios -mattr=+zcm | FileCheck %s -check-prefixes=CHECK,YES
+; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=CHECK,YES
+; rdar://12254953

 define i32 @t(i32 %a, i32 %b, i32 %c, i32 %d) nounwind ssp {
 entry:
 ; CHECK-LABEL: t:
-; CHECK: mov [[REG2:x[0-9]+]], x3
-; CHECK: mov [[REG1:x[0-9]+]], x2
-; CHECK: mov x0, x2
-; CHECK: mov x1, x3
+; NOT: mov [[REG2:w[0-9]+]], w3
+; NOT: mov [[REG1:w[0-9]+]], w2
+; YES: mov [[REG2:x[0-9]+]], x3
+; YES: mov [[REG1:x[0-9]+]], x2
 ; CHECK: bl _foo
-; CHECK: mov x0, [[REG1]]
-; CHECK: mov x1, [[REG2]]
+; NOT: mov w0, [[REG1]]
+; NOT: mov w1, [[REG2]]
+; YES: mov x0, [[REG1]]
+; YES: mov x1, [[REG2]]
   %call = call i32 @foo(i32 %c, i32 %d) nounwind
   %call1 = call i32 @foo(i32 %c, i32 %d) nounwind
   unreachable
Index: llvm/trunk/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll
@@ -1,9 +1,14 @@
-; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone < %s | FileCheck %s -check-prefixes=ALL,CYCLONE
-; RUN: llc -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 < %s | FileCheck %s -check-prefixes=CYCLONE-FULLFP16
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=exynos-m1 < %s | FileCheck %s -check-prefixes=ALL,OTHERS
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=exynos-m3 < %s | FileCheck %s -check-prefixes=ALL,OTHERS
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s -check-prefixes=ALL,OTHERS
-; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=falkor < %s | FileCheck %s -check-prefixes=ALL,OTHERS
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=-zcz | FileCheck %s -check-prefixes=ALL,NONEGP,NONEFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,ZERO16
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-gp | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+zcz-fp | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
+; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,ZEROGP,NONEFP
+; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZEROGP,NONE16
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,NONEGP,ZEROFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZEROGP,ZEROFP

 declare void @bar(half, float, double, <2 x double>)
 declare void @bari(i32, i32)
@@ -14,17 +19,22 @@
 entry:
 ; ALL-LABEL: t1:
 ; ALL-NOT: fmov
-; ALL: ldr h0,{{.*}}
-; CYCLONE: fmov s1, wzr
-; CYCLONE: fmov d2, xzr
-; CYCLONE: movi.16b v3, #0
-; CYCLONE-FULLFP16: fmov h0, wzr
-; CYCLONE-FULLFP16: fmov s1, wzr
-; CYCLONE-FULLFP16: fmov d2, xzr
-; CYCLONE-FULLFP16: movi.16b v3, #0
-; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
-; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
-; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
+; NONEFP: ldr h0,{{.*}}
+; NONEFP: fmov s1, wzr
+; NONEFP: fmov d2, xzr
+; NONEFP: movi{{(.16b)?}} v3{{(.2d)?}}, #0
+; NONE16: fmov h0, wzr
+; NONE16: fmov s1, wzr
+; NONE16: fmov d2, xzr
+; NONE16: movi{{(.16b)?}} v3{{(.2d)?}}, #0
+; ZEROFP: ldr h0,{{.*}}
+; ZEROFP: movi v{{[0-3]+}}.2d, #0
+; ZEROFP: movi v{{[0-3]+}}.2d, #0
+; ZEROFP: movi v{{[0-3]+}}.2d, #0
+; ZERO16: movi v{{[0-3]+}}.2d, #0
+; ZERO16: movi v{{[0-3]+}}.2d, #0
+; ZERO16: movi v{{[0-3]+}}.2d, #0
+; ZERO16: movi v{{[0-3]+}}.2d, #0
   tail call void @bar(half 0.000000e+00, float 0.000000e+00, double 0.000000e+00, <2 x double> zeroinitializer) nounwind
   ret void
 }
@@ -32,9 +42,10 @@
 define void @t2() nounwind ssp {
 entry:
 ; ALL-LABEL: t2:
-; ALL-NOT: mov w0, wzr
-; ALL: mov w{{[0-3]+}}, #0
-; ALL: mov w{{[0-3]+}}, #0
+; NONEGP: mov w0, wzr
+; NONEGP: mov w1, wzr
+; ZEROGP: mov w0, #0
+; ZEROGP: mov w1, #0
   tail call void @bari(i32 0, i32 0) nounwind
   ret void
 }
@@ -42,26 +53,26 @@
 define void @t3() nounwind ssp {
 entry:
 ; ALL-LABEL: t3:
-; ALL-NOT: mov x0, xzr
-; ALL: mov x{{[0-3]+}}, #0
-; ALL: mov x{{[0-3]+}}, #0
+; NONEGP: mov x0, xzr
+; NONEGP: mov x1, xzr
+; ZEROGP: mov x0, #0
+; ZEROGP: mov x1, #0
   tail call void @barl(i64 0, i64 0) nounwind
   ret void
 }

 define void @t4() nounwind ssp {
 ; ALL-LABEL: t4:
-; ALL-NOT: fmov
-; CYCLONE: fmov s{{[0-3]+}}, wzr
-; CYCLONE: fmov s{{[0-3]+}}, wzr
-; CYCLONE-FULLFP16: fmov s{{[0-3]+}}, wzr
-; CYCLONE-FULLFP16: fmov s{{[0-3]+}}, wzr
-; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
-; OTHERS: movi v{{[0-3]+}}.2d, #0000000000000000
+; NONEFP: fmov s{{[0-3]+}}, wzr
+; NONEFP: fmov s{{[0-3]+}}, wzr
+; ZEROFP: movi v{{[0-3]+}}.2d, #0
+; ZEROFP: movi v{{[0-3]+}}.2d, #0
   tail call void @barf(float 0.000000e+00, float 0.000000e+00) nounwind
   ret void
 }

+declare double @sin(double)
+
 ; We used to produce spills+reloads for a Q register with zero cycle zeroing
 ; enabled.
 ; ALL-LABEL: foo:
@@ -88,10 +99,133 @@
 }

 define <2 x i64> @t6() {
 ; ALL-LABEL: t6:
-; CYCLONE: movi.16b v0, #0
-; OTHERS: movi v0.2d, #0000000000000000
- ret <2 x i64> zeroinitializer
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <2 x i64> zeroinitializer
 }

+define i1 @ti1() {
+entry:
+; ALL-LABEL: ti1:
+; NONEGP: mov w0, wzr
+; ZEROGP: mov w0, #0
+  ret i1 false
+}
+
+define i8 @ti8() {
+entry:
+; ALL-LABEL: ti8:
+; NONEGP: mov w0, wzr
+; ZEROGP: mov w0, #0
+  ret i8 0
+}
+
+define i16 @ti16() {
+entry:
+; ALL-LABEL: ti16:
+; NONEGP: mov w0, wzr
+; ZEROGP: mov w0, #0
+  ret i16 0
+}
+
+define i32 @ti32() {
+entry:
+; ALL-LABEL: ti32:
+; NONEGP: mov w0, wzr
+; ZEROGP: mov w0, #0
+  ret i32 0
+}
+
+define i64 @ti64() {
+entry:
+; ALL-LABEL: ti64:
+; NONEGP: mov x0, xzr
+; ZEROGP: mov x0, #0
+  ret i64 0
+}
+
+define float @tf32() {
+entry:
+; ALL-LABEL: tf32:
+; NONEFP: mov s0, wzr
+; ZEROFP: movi v0.2d, #0
+  ret float 0.0
+}
+
+define double @td64() {
+entry:
+; ALL-LABEL: td64:
+; NONEFP: mov d0, xzr
+; ZEROFP: movi v0.2d, #0
+  ret double 0.0
+}
+
+define <8 x i8> @tv8i8() {
+entry:
+; ALL-LABEL: tv8i8:
+; ALL: movi d0, #0
+  ret <8 x i8> zeroinitializer
+}
+
+define <4 x i16> @tv4i16() {
+entry:
+; ALL-LABEL: tv4i16:
+; ALL: movi d0, #0
+  ret <4 x i16> zeroinitializer
+}
+
+define <2 x i32> @tv2i32() {
+entry:
+; ALL-LABEL: tv2i32:
+; ALL: movi d0, #0
+  ret <2 x i32> zeroinitializer
+}
+
+define <2 x float> @tv2f32() {
+entry:
+; ALL-LABEL: tv2f32:
+; ALL: movi d0, #0
+  ret <2 x float> zeroinitializer
+}
+
+define <16 x i8> @tv16i8() {
+entry:
+; ALL-LABEL: tv16i8:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <16 x i8> zeroinitializer
+}
+
+define <8 x i16> @tv8i16() {
+entry:
+; ALL-LABEL: tv8i16:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <8 x i16> zeroinitializer
+}
+
+define <4 x i32> @tv4i32() {
+entry:
+; ALL-LABEL: tv4i32:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <4 x i32> zeroinitializer
+}
+
+define <2 x i64> @tv2i64() {
+entry:
+; ALL-LABEL: tv2i64:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <2 x i64> zeroinitializer
+}
+
+define <4 x float> @tv4f32() {
+entry:
+; ALL-LABEL: tv4f32:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <4 x float> zeroinitializer
+}
+
+define <2 x double> @tv2d64() {
+entry:
+; ALL-LABEL: tv2d64:
+; ALL: movi{{(.16b)?}} v0{{(.2d)?}}, #0
+  ret <2 x double> zeroinitializer
+}
-declare double @sin(double)
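Note (not part of the patch): since FeatureZCZeroing now implies both FeatureZCZeroingGP and FeatureZCZeroingFP, existing users of +zcz keep their behavior, while a CPU definition can opt into only one half, as the Exynos entries above do for the FP side. As a purely illustrative sketch, a processor wanting only the GP half could list the new feature directly; the processor name and its feature list below are made up and do not correspond to anything in the tree:

  // Illustrative only: a hypothetical CPU that claims zero-cycle zeroing of
  // general-purpose registers but not of FP/SIMD registers.
  def ProcExampleGPZ : SubtargetFeature<"example-gpz", "ARMProcFamily", "Others",
                                        "Hypothetical example processor", [
                                        FeatureFPARMv8,
                                        FeatureNEON,
                                        FeaturePerfMon,
                                        FeatureZCZeroingGP]>;

From the command line, the split features are toggled with -mattr=+zcz-gp and -mattr=+zcz-fp, as the updated RUN lines in arm64-zero-cycle-zeroing.ll exercise.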