Index: lib/Target/AArch64/AArch64.td
===================================================================
--- lib/Target/AArch64/AArch64.td
+++ lib/Target/AArch64/AArch64.td
@@ -49,6 +49,14 @@
 def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
                                         "Has zero-cycle zeroing instructions">;
 
+/// Similar to Cyclone, Kryo favors using immediate #0 to zero out registers.
+/// However, Cyclone's approach to zeroing a D or S register (movi v.2d +
+/// extract subreg) performs poorly on Kryo; Kryo prefers using movi D #0 to
+/// zero out the D register directly.
+def FeatureImmeZeroing
+    : SubtargetFeature<"immez", "HasImmeZeroing", "true",
+                       "Use immediate #0 to zero a register">;
+
 def FeatureStrictAlign : SubtargetFeature<"strict-align", "StrictAlign",
                                           "true",
                                           "Disallow all unaligned memory "
@@ -139,7 +147,8 @@
                                    FeatureNEON,
                                    FeatureCrypto,
                                    FeatureCRC,
-                                   FeaturePerfMon]>;
+                                   FeaturePerfMon,
+                                   FeatureImmeZeroing]>;
 
 def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
                                                FeatureNEON,
Index: lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.cpp
+++ lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1928,7 +1928,8 @@
           .addImm(0)
           .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
     }
-  } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) {
+  } else if (SrcReg == AArch64::WZR && (Subtarget.hasZeroCycleZeroing() ||
+                                        Subtarget.hasImmeZeroing())) {
     BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm(
         AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
   } else {
@@ -1964,7 +1965,8 @@
         .addReg(SrcReg, getKillRegState(KillSrc))
         .addImm(0)
         .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
-  } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) {
+  } else if (SrcReg == AArch64::XZR && (Subtarget.hasZeroCycleZeroing() ||
+                                        Subtarget.hasImmeZeroing())) {
     BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm(
         AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
   } else {
Index: lib/Target/AArch64/AArch64Subtarget.h
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.h
+++ lib/Target/AArch64/AArch64Subtarget.h
@@ -64,6 +64,9 @@
   // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
   bool HasZeroCycleZeroing;
 
+  // HasImmeZeroing - Use immediate #0 to zero a register.
+  bool HasImmeZeroing;
+
   // StrictAlign - Disallow unaligned memory accesses.
   bool StrictAlign;
 
@@ -133,6 +136,8 @@
 
   bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
 
+  bool hasImmeZeroing() const { return HasImmeZeroing; }
+
   bool requiresStrictAlign() const { return StrictAlign; }
 
   bool isX18Reserved() const { return ReserveX18; }
Index: lib/Target/AArch64/AArch64Subtarget.cpp
===================================================================
--- lib/Target/AArch64/AArch64Subtarget.cpp
+++ lib/Target/AArch64/AArch64Subtarget.cpp
@@ -53,8 +53,8 @@
       HasV8_1aOps(false), HasV8_2aOps(false), HasFPARMv8(false), HasNEON(false),
       HasCrypto(false), HasCRC(false), HasPerfMon(false), HasFullFP16(false),
       HasZeroCycleRegMove(false), HasZeroCycleZeroing(false),
-      StrictAlign(false), ReserveX18(TT.isOSDarwin()), IsLittle(LittleEndian),
-      CPUString(CPU), TargetTriple(TT), FrameLowering(),
+      HasImmeZeroing(false), StrictAlign(false), ReserveX18(TT.isOSDarwin()),
+      IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(),
       InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
       TLInfo(TM, *this), GISel() {}
Index: test/CodeGen/AArch64/imme_zeroing.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/imme_zeroing.ll
@@ -0,0 +1,45 @@
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s
+
+; Verify that immediate #0 is used when zeroing a register.
+
+define i32 @test_int32(i32* nocapture readonly %p) {
+; CHECK-LABEL: test_int32:
+; CHECK: movz x9, #0
+; CHECK: movz w8, #0
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %sum.06 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, %sum.06
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 10
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+define i64 @test_int64(i64* nocapture readonly %p) {
+; CHECK-LABEL: test_int64:
+; CHECK: movz x9, #0
+; CHECK: movz x8, #0
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i64 %add
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.07 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %sum.06 = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds i64, i64* %p, i64 %i.07
+  %0 = load i64, i64* %arrayidx, align 8
+  %add = add nsw i64 %0, %sum.06
+  %inc = add nuw nsw i64 %i.07, 1
+  %exitcond = icmp eq i64 %inc, 10
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
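
Note for trying the patch locally: because the feature string is registered as
SubtargetFeature<"immez", ...>, it can be enabled directly with -mattr=+immez,
independent of -mcpu=kryo. A minimal sketch of such a check follows; it is
illustrative only and not part of the patch (the function name is made up, and
the CHECK line assumes the same movz spelling the test above expects):

  ; RUN: llc -mtriple=aarch64-gnu-linux -mattr=+immez < %s | FileCheck %s

  ; Returning a zero constant copies out of WZR, so with +immez the
  ; copyPhysReg change above should emit "movz w0, #0" instead of
  ; "mov w0, wzr".
  define i32 @ret_zero() {
  ; CHECK-LABEL: ret_zero:
  ; CHECK: movz w0, #0
  entry:
    ret i32 0
  }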