Index: llvm/trunk/lib/Target/AArch64/AArch64AsmPrinter.cpp =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ llvm/trunk/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -49,6 +49,7 @@ class AArch64AsmPrinter : public AsmPrinter { AArch64MCInstLower MCInstLowering; StackMaps SM; + const AArch64Subtarget *STI; public: AArch64AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer) @@ -83,6 +84,7 @@ bool runOnMachineFunction(MachineFunction &F) override { AArch64FI = F.getInfo<AArch64FunctionInfo>(); + STI = static_cast<const AArch64Subtarget*>(&F.getSubtarget()); return AsmPrinter::runOnMachineFunction(F); } @@ -111,6 +113,9 @@ /// \brief Emit the LOHs contained in AArch64FI. void EmitLOHs(); + /// Emit instruction to set float register to zero. + void EmitFMov0(const MachineInstr &MI); + typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol; MInstToMCSymbol LOHInstToLabel; }; @@ -224,8 +229,7 @@ const TargetRegisterClass *RC, bool isVector, raw_ostream &O) { assert(MO.isReg() && "Should only get here with a register!"); - const AArch64RegisterInfo *RI = - MF->getSubtarget<AArch64Subtarget>().getRegisterInfo(); + const TargetRegisterInfo *RI = STI->getRegisterInfo(); unsigned Reg = MO.getReg(); unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg)); assert(RI->regsOverlap(RegToPrint, Reg)); @@ -416,6 +420,40 @@ EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); } +void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) { + unsigned DestReg = MI.getOperand(0).getReg(); + if (STI->hasZeroCycleZeroing()) { + // Convert S/D register to corresponding Q register + if (AArch64::S0 <= DestReg && DestReg <= AArch64::S31) { + DestReg = AArch64::Q0 + (DestReg - AArch64::S0); + } else { + assert(AArch64::D0 <= DestReg && DestReg <= AArch64::D31); + DestReg = AArch64::Q0 + (DestReg - AArch64::D0); + } + MCInst MOVI; + MOVI.setOpcode(AArch64::MOVIv2d_ns); + MOVI.addOperand(MCOperand::createReg(DestReg)); + MOVI.addOperand(MCOperand::createImm(0)); + 
EmitToStreamer(*OutStreamer, MOVI); + } else { + MCInst FMov; + switch (MI.getOpcode()) { + default: llvm_unreachable("Unexpected opcode"); + case AArch64::FMOVS0: + FMov.setOpcode(AArch64::FMOVWSr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::WZR)); + break; + case AArch64::FMOVD0: + FMov.setOpcode(AArch64::FMOVXDr); + FMov.addOperand(MCOperand::createReg(DestReg)); + FMov.addOperand(MCOperand::createReg(AArch64::XZR)); + break; + } + EmitToStreamer(*OutStreamer, FMov); + } +} + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. #include "AArch64GenMCPseudoLowering.inc" @@ -521,6 +559,11 @@ return; } + case AArch64::FMOVS0: + case AArch64::FMOVD0: + EmitFMov0(*MI); + return; + case TargetOpcode::STACKMAP: return LowerSTACKMAP(*OutStreamer, SM, *MI); Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.td @@ -301,9 +301,6 @@ //===----------------------------------------------------------------------===// // AArch64 Instruction Predicate Definitions. 
-// -def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">; -def NoZCZ : Predicate<"!Subtarget->hasZeroCycleZeroing()">; def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; def ForCodeSize : Predicate<"ForCodeSize">; @@ -2565,15 +2562,11 @@ defm FMOV : UnscaledConversion<"fmov">; // Add pseudo ops for FMOV 0 so we can mark them as isReMaterializable -let isReMaterializable = 1, isCodeGenOnly = 1 in { +let isReMaterializable = 1, isCodeGenOnly = 1, isAsCheapAsAMove = 1 in { def FMOVS0 : Pseudo<(outs FPR32:$Rd), (ins), [(set f32:$Rd, (fpimm0))]>, - PseudoInstExpansion<(FMOVWSr FPR32:$Rd, WZR)>, - Sched<[WriteF]>, - Requires<[NoZCZ]>; + Sched<[WriteF]>; def FMOVD0 : Pseudo<(outs FPR64:$Rd), (ins), [(set f64:$Rd, (fpimm0))]>, - PseudoInstExpansion<(FMOVXDr FPR64:$Rd, XZR)>, - Sched<[WriteF]>, - Requires<[NoZCZ]>; + Sched<[WriteF]>; } //===----------------------------------------------------------------------===// @@ -4435,18 +4428,6 @@ "movi", ".2d", [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>; - -// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing. -// Complexity is added to break a tie with a plain MOVI. 
-let AddedComplexity = 1 in { -def : Pat<(f32 fpimm0), - (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>, - Requires<[HasZCZ]>; -def : Pat<(f64 fpimm0), - (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>, - Requires<[HasZCZ]>; -} - def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>; def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>; Index: llvm/trunk/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll +++ llvm/trunk/test/CodeGen/AArch64/arm64-zero-cycle-zeroing.ll @@ -47,3 +47,29 @@ declare void @bari(i32, i32) declare void @barl(i64, i64) declare void @barf(float, float) + +; We used to produce spills+reloads for a Q register with zero cycle zeroing +; enabled. +; CHECK-LABEL: foo: +; CHECK-NOT: str {{q[0-9]+}} +; CHECK-NOT: ldr {{q[0-9]+}} +define double @foo(i32 %n) { +entry: + br label %for.body + +for.body: + %phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ] + %i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %conv21 = sitofp i32 %i.076 to double + %call = tail call fast double @sin(double %conv21) + %cmp.i = fcmp fast olt double %phi0, %call + %v0 = select i1 %cmp.i, double %call, double %phi0 + %inc = add nuw nsw i32 %i.076, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: + ret double %v0 +} + +declare double @sin(double) Index: llvm/trunk/test/CodeGen/AArch64/fp-cond-sel.ll =================================================================== --- llvm/trunk/test/CodeGen/AArch64/fp-cond-sel.ll +++ llvm/trunk/test/CodeGen/AArch64/fp-cond-sel.ll @@ -12,8 +12,8 @@ %tst1 = icmp ugt i32 %lhs32, %rhs32 %val1 = select i1 %tst1, float 0.0, float 1.0 store float %val1, float* @varfloat -; CHECK: movi v[[FLT0:[0-9]+]].2d, #0 -; CHECK: fmov s[[FLT1:[0-9]+]], #1.0 +; CHECK-DAG: movi v[[FLT0:[0-9]+]].2d, #0 +; 
CHECK-DAG: fmov s[[FLT1:[0-9]+]], #1.0 ; CHECK: fcsel {{s[0-9]+}}, s[[FLT0]], s[[FLT1]], hi %rhs64 = sext i32 %rhs32 to i64