Index: lib/Target/AArch64/AArch64.td
===================================================================
--- lib/Target/AArch64/AArch64.td
+++ lib/Target/AArch64/AArch64.td
@@ -139,7 +139,8 @@
                                    FeatureNEON,
                                    FeatureCrypto,
                                    FeatureCRC,
-                                   FeaturePerfMon]>;
+                                   FeaturePerfMon,
+                                   FeatureZCZeroing]>;
 
 def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
                                                FeatureNEON,
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -35,6 +35,7 @@
 def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
 def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
 def IsCyclone        : Predicate<"Subtarget->isCyclone()">;
+def IsKryo           : Predicate<"Subtarget->isKryo()">;
 
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.
@@ -4431,16 +4432,27 @@
                       "movi", ".2d",
                       [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
-
-// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing.
+// Use movi.2d to materialize 0.0 for Cyclone if the HW does zero-cycle zeroing.
 // Complexity is added to break a tie with a plain MOVI.
 let AddedComplexity = 1 in {
 def : Pat<(f32 fpimm0), (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>,
-          Requires<[HasZCZ]>;
+          Requires<[HasZCZ, IsCyclone]>;
 def : Pat<(f64 fpimm0), (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>,
-          Requires<[HasZCZ]>;
+          Requires<[HasZCZ, IsCyclone]>;
+}
+
+// On Kryo, use movi to zero D registers and still use wzr to zero S registers.
+let isReMaterializable = 1, isCodeGenOnly = 1 in {
+def MOVIS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>,
+             PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>,
+             Sched<[WriteF]>,
+             Requires<[HasZCZ, IsKryo]>;
+def MOVID0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>,
+             PseudoInstExpansion<(MOVID FPR64:$Rd, 0)>,
+             Sched<[WriteF]>,
+             Requires<[HasZCZ, IsKryo]>;
 }
 
 def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
Index: test/CodeGen/AArch64/kryo_zeroing.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/kryo_zeroing.ll
@@ -0,0 +1,87 @@
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=kryo < %s | FileCheck %s
+
+; Verify that immediate #0 is used when zeroing a D, X, or W register, and that wzr is used when zeroing an S register.
+ +define i32 @test_int32(i32* nocapture readonly %p) { +; CHECK-LABEL: test_int32: +; CHECK: movz x9, #0 +; CHECK: movz w8, #0 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 %add + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum.06 = phi i32 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %p, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %add = add nsw i32 %0, %sum.06 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 10 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define i64 @test_int64(i64* nocapture readonly %p) { +; CHECK-LABEL: test_int64: +; CHECK: movz x9, #0 +; CHECK: movz x8, #0 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i64 %add + +for.body: ; preds = %for.body, %entry + %i.07 = phi i64 [ 0, %entry ], [ %inc, %for.body ] + %sum.06 = phi i64 [ 0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds i64, i64* %p, i64 %i.07 + %0 = load i64, i64* %arrayidx, align 8 + %add = add nsw i64 %0, %sum.06 + %inc = add nuw nsw i64 %i.07, 1 + %exitcond = icmp eq i64 %inc, 10 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define double @test_double(double* nocapture readonly %p) { +; CHECK-LABEL: test_double: +; CHECK: movz x8, #0 +; CHECK: movi d0, #0000000000000000 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret double %add + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum.06 = phi double [ 0.0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds double, double* %p, i64 %indvars.iv + %0 = load double, double* %arrayidx, align 4 + %add = fadd double %0, %sum.06 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 10 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +} + +define float @test_float(float* nocapture readonly %p) { +; CHECK-LABEL: test_float: +; CHECK: fmov s0, wzr +; CHECK: movz x8, #0 +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret float %add + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %sum.06 = phi float [ 0.0, %entry ], [ %add, %for.body ] + %arrayidx = getelementptr inbounds float, float* %p, i64 %indvars.iv + %0 = load float, float* %arrayidx, align 4 + %add = fadd float %0, %sum.06 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 10 + br i1 %exitcond, label %for.cond.cleanup, label %for.body +}