Index: llvm/lib/Target/AMDGPU/AMDGPU.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPU.td +++ llvm/lib/Target/AMDGPU/AMDGPU.td @@ -488,30 +488,6 @@ // Subtarget Features (options and debugging) //===------------------------------------------------------------===// -// Denormal handling for fp64 and fp16 is controlled by the same -// config register when fp16 supported. -// TODO: Do we need a separate f16 setting when not legal? -def FeatureFP64FP16Denormals : SubtargetFeature<"fp64-fp16-denormals", - "FP64FP16Denormals", - "true", - "Enable double and half precision denormal handling", - [FeatureFP64] ->; - -def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals", - "FP64FP16Denormals", - "true", - "Enable double and half precision denormal handling", - [FeatureFP64, FeatureFP64FP16Denormals] ->; - -def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals", - "FP64FP16Denormals", - "true", - "Enable half precision denormal handling", - [FeatureFP64FP16Denormals] ->; - def FeatureFPExceptions : SubtargetFeature<"fp-exceptions", "FPExceptions", "true", Index: llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "AMDGPUTargetMachine.h" +#include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LegacyDivergenceAnalysis.h" @@ -1034,7 +1035,9 @@ AC = &getAnalysis().getAssumptionCache(F); DA = &getAnalysis(); HasUnsafeFPMath = hasUnsafeFPMath(F); - HasFP32Denormals = ST->hasFP32Denormals(F); + + AMDGPU::SIModeRegisterDefaults Mode(F); + HasFP32Denormals = Mode.allFP32Denormals(); bool MadeChange = false; Index: llvm/lib/Target/AMDGPU/AMDGPUFeatures.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUFeatures.td +++ llvm/lib/Target/AMDGPU/AMDGPUFeatures.td @@ -18,15 +18,6 @@ "Enable single precision FMA (not as fast as mul+add, but fused)" >; -// Some instructions do not support denormals despite this flag. Using -// fp32 denormals also causes instructions to run at the double -// precision rate for the device. -def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals", - "FP32Denormals", - "true", - "Enable single precision denormal handling" ->; - class SubtargetFeatureLocalMemorySize : SubtargetFeature< "localmemorysize"#Value, "LocalMemorySize", Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -397,7 +397,7 @@ } #endif Subtarget = &MF.getSubtarget(); - Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget); + Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction()); return SelectionDAGISel::runOnMachineFunction(MF); } Index: llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1585,9 +1585,10 @@ const AMDGPUMachineFunction *MFI = MF.getInfo(); // float fr = mad(fqneg, fb, fa); - unsigned OpCode = MFI->getMode().allFP32Denormals() ? - (unsigned)AMDGPUISD::FMAD_FTZ : - (unsigned)ISD::FMAD; + unsigned OpCode = !MFI->getMode().allFP32Denormals() ? + (unsigned)ISD::FMAD : + (unsigned)AMDGPUISD::FMAD_FTZ; + SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa); // int iq = (int)fq; @@ -1670,9 +1671,10 @@ const SIMachineFunctionInfo *MFI = MF.getInfo(); // Compute denominator reciprocal. - unsigned FMAD = MFI->getMode().allFP32Denormals() ? - (unsigned)AMDGPUISD::FMAD_FTZ : - (unsigned)ISD::FMAD; + unsigned FMAD = !MFI->getMode().allFP32Denormals() ? + (unsigned)ISD::FMAD : + (unsigned)AMDGPUISD::FMAD_FTZ; + SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo); SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi); Index: llvm/lib/Target/AMDGPU/AMDGPUInstructions.td =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -900,3 +900,8 @@ [(fmaxnum_ieee_oneuse node:$src0, node:$src1), (fmaxnum_oneuse node:$src0, node:$src1)] >; + +def any_fmad : PatFrags<(ops node:$src0, node:$src1, node:$src2), + [(fmad node:$src0, node:$src1, node:$src2), + (AMDGPUfmad_ftz node:$src0, node:$src1, node:$src2)] +>; Index: llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -18,7 +18,7 @@ LocalMemoryObjects(), ExplicitKernArgSize(0), LDSSize(0), - Mode(MF.getFunction(), MF.getSubtarget()), + Mode(MF.getFunction()), IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())), NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath), MemoryBound(false), Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -65,7 +65,6 @@ protected: bool Has16BitInsts; bool HasMadMixInsts; - bool FP32Denormals; bool FPExceptions; bool HasSDWA; bool HasVOP3PInsts; @@ -148,15 +147,6 @@ return HasMadMixInsts; } - bool hasFP32Denormals(const Function &F) const { - // FIXME: This should not be a property of the subtarget. This should be a - // property with a default set by the calling convention which can be - // overridden by attributes. For now, use the subtarget feature as a - // placeholder attribute. The function arguments only purpose is to - // discourage use without a function context until this is removed. - return FP32Denormals; - } - bool hasFPExceptions() const { return FPExceptions; } @@ -295,7 +285,6 @@ bool HalfRate64Ops; // Dynamially set bits that enable features. - bool FP64FP16Denormals; bool FlatForGlobal; bool AutoWaitcntBeforeBarrier; bool CodeObjectV3; @@ -617,20 +606,6 @@ unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const; - /// Alias for hasFP64FP16Denormals - bool hasFP16Denormals(const Function &F) const { - return FP64FP16Denormals; - } - - /// Alias for hasFP64FP16Denormals - bool hasFP64Denormals(const Function &F) const { - return FP64FP16Denormals; - } - - bool hasFP64FP16Denormals(const Function &F) const { - return FP64FP16Denormals; - } - bool supportsMinMaxDenormModes() const { return getGeneration() >= AMDGPUSubtarget::GFX9; } Index: llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -54,13 +54,6 @@ FullFS += FS; ParseSubtargetFeatures(GPU, FullFS); - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere - // if someone tries to enable these? - if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { - FP32Denormals = false; - } - HasMulU24 = getGeneration() >= EVERGREEN; HasMulI24 = hasCaymanISA(); @@ -71,9 +64,6 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS) { // Determine default and user-specified characteristics - // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be - // enabled, but some instructions do not respect them and they run at the - // double precision rate, so don't enable by default. // // We want to be able to turn these off, but making this a subtarget feature // for SI has the unhelpful behavior that it unsets everything else if you @@ -88,15 +78,6 @@ if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA. FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,"; - // FIXME: I don't think think Evergreen has any useful support for - // denormals, but should be checked. Should we issue a warning somewhere - // if someone tries to enable these? - if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - FullFS += "+fp64-fp16-denormals,+fp32-denormals,"; - } else { - FullFS += "-fp32-denormals,"; - } - FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS // Disable mutually exclusive bits. @@ -165,7 +146,6 @@ TargetTriple(TT), Has16BitInsts(false), HasMadMixInsts(false), - FP32Denormals(false), FPExceptions(false), HasSDWA(false), HasVOP3PInsts(false), @@ -193,7 +173,6 @@ FastFMAF32(false), HalfRate64Ops(false), - FP64FP16Denormals(false), FlatForGlobal(false), AutoWaitcntBeforeBarrier(false), CodeObjectV3(false), Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -133,7 +133,7 @@ TLI(ST->getTargetLowering()), CommonTTI(TM, F), IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())), - HasFP32Denormals(ST->hasFP32Denormals(F)) { } + HasFP32Denormals(AMDGPU::SIModeRegisterDefaults(F).allFP32Denormals()) {} bool hasBranchDivergence() { return true; } Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -690,8 +690,8 @@ // FIXME: dx10_clamp can just take the caller setting, but there seems to be // no way to support merge for backend defined attributes. - AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST); - AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST); + AMDGPU::SIModeRegisterDefaults CallerMode(*Caller); + AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee); return CallerMode.isInlineCompatible(CalleeMode); } Index: llvm/lib/Target/AMDGPU/R600Instructions.td =================================================================== --- llvm/lib/Target/AMDGPU/R600Instructions.td +++ llvm/lib/Target/AMDGPU/R600Instructions.td @@ -991,7 +991,7 @@ class MULADD_IEEE_Common inst> : R600_3OP < inst, "MULADD_IEEE", - [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))] + [(set f32:$dst, (any_fmad f32:$src0, f32:$src1, f32:$src2))] >; class FMA_Common inst> : R600_3OP < Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -693,8 +693,7 @@ FP64FP16InputDenormals(true), FP64FP16OutputDenormals(true) {} - // FIXME: Should not depend on the subtarget - SIModeRegisterDefaults(const Function &F, const GCNSubtarget &ST); + SIModeRegisterDefaults(const Function &F); static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) { const bool IsCompute = AMDGPU::isCompute(CC); @@ -702,8 +701,8 @@ SIModeRegisterDefaults Mode; Mode.DX10Clamp = true; Mode.IEEE = IsCompute; - Mode.FP32InputDenormals = true; - Mode.FP32OutputDenormals = true; + Mode.FP32InputDenormals = IsCompute; + Mode.FP32OutputDenormals = IsCompute; Mode.FP64FP16InputDenormals = true; Mode.FP64FP16OutputDenormals = true; return Mode; Index: llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -1303,8 +1303,7 @@ return true; } -SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F, - const GCNSubtarget &ST) { +SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) { *this = getDefaultForCallingConv(F.getCallingConv()); StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString(); @@ -1316,11 +1315,25 @@ if (!DX10ClampAttr.empty()) DX10Clamp = DX10ClampAttr == "true"; - // FIXME: Split this when denormal-fp-math is used - FP32InputDenormals = ST.hasFP32Denormals(F); - FP32OutputDenormals = FP32InputDenormals; - FP64FP16InputDenormals = ST.hasFP64FP16Denormals(F); - FP64FP16OutputDenormals = FP64FP16InputDenormals; + StringRef DenormF32Attr = F.getFnAttribute("denormal-fp-math-f32").getValueAsString(); + if (!DenormF32Attr.empty()) { + DenormalMode DenormMode = parseDenormalFPAttribute(DenormF32Attr); + FP32InputDenormals = DenormMode.Input == DenormalMode::IEEE; + FP32OutputDenormals = DenormMode.Output == DenormalMode::IEEE; + } + + StringRef DenormAttr = F.getFnAttribute("denormal-fp-math").getValueAsString(); + if (!DenormAttr.empty()) { + DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr); + + if (DenormF32Attr.empty()) { + FP32InputDenormals = DenormMode.Input == DenormalMode::IEEE; + FP32OutputDenormals = DenormMode.Output == DenormalMode::IEEE; + } + + FP64FP16InputDenormals = DenormMode.Input == DenormalMode::IEEE; + FP64FP16OutputDenormals = DenormMode.Output == DenormalMode::IEEE; + } } namespace { Index: llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll =================================================================== --- llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll +++ llvm/test/Analysis/CostModel/AMDGPU/fdiv.ll @@ -278,5 +278,5 @@ ret void } -attributes #0 = { nounwind "target-features"="+fp32-denormals" } -attributes #1 = { nounwind "target-features"="-fp32-denormals" } +attributes #0 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll +++ llvm/test/CodeGen/AMDGPU/amdgcn-ieee.ll @@ -150,12 +150,11 @@ ret void } -; FIXME: Should have denormals off by default. ; GCN-LABEL: {{^}}ps_ieee_mode_on: ; GCN: {{buffer|global|flat}}_load_dword [[VAL0:v[0-9]+]] ; GCN-NEXT: {{buffer|global|flat}}_load_dword [[VAL1:v[0-9]+]] -; GCN-DAG: v_max_f32_e32 [[QUIET0:v[0-9]+]], [[VAL0]], [[VAL0]] -; GCN-DAG: v_max_f32_e32 [[QUIET1:v[0-9]+]], [[VAL1]], [[VAL1]] +; GCN-DAG: v_mul_f32_e32 [[QUIET0:v[0-9]+]], 1.0, [[VAL0]] +; GCN-DAG: v_mul_f32_e32 [[QUIET1:v[0-9]+]], 1.0, [[VAL1]] ; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], [[QUIET0]], [[QUIET1]] ; GCN-NOT: v_mul_f32 define amdgpu_ps void @ps_ieee_mode_on() #1 { Index: llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll +++ llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll @@ -214,8 +214,8 @@ } attributes #0 = { nounwind optnone noinline } -attributes #1 = { nounwind "target-features"="-fp32-denormals" } -attributes #2 = { nounwind "target-features"="+fp32-denormals" } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } ; CHECK: !0 = !{float 2.500000e+00} ; CHECK: !1 = !{float 5.000000e-01} Index: llvm/test/CodeGen/AMDGPU/clamp-modifier.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/clamp-modifier.ll +++ llvm/test/CodeGen/AMDGPU/clamp-modifier.ll @@ -389,10 +389,10 @@ declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 -attributes #0 = { nounwind "target-features"="-fp32-denormals" } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "target-features"="+fp32-denormals" } -attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" } +attributes #2 = { nounwind "denormal-fp-math-f32"="ieee.ieee" } +attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "denormal-fp-math"="preserve-sign,preserve-sign" } !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2, !3} Index: llvm/test/CodeGen/AMDGPU/clamp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/clamp.ll +++ llvm/test/CodeGen/AMDGPU/clamp.ll @@ -767,8 +767,8 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half>, <2 x half>) #1 declare <2 x half> @llvm.maxnum.v2f16(<2 x half>, <2 x half>) #1 -attributes #0 = { nounwind "target-features"="-fp32-denormals" } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="-fp32-denormals,-fp-exceptions" "no-nans-fp-math"="false" } -attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "target-features"="-fp32-denormals,+fp-exceptions" "no-nans-fp-math"="false" } -attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="-fp32-denormals,+fp-exceptions" "no-nans-fp-math"="false" } +attributes #2 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="-fp-exceptions" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } +attributes #3 = { nounwind "amdgpu-dx10-clamp"="true" "target-features"="+fp-exceptions" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } +attributes #4 = { nounwind "amdgpu-dx10-clamp"="false" "target-features"="+fp-exceptions" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-nans-fp-math"="false" } Index: llvm/test/CodeGen/AMDGPU/default-fp-mode.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/default-fp-mode.ll +++ llvm/test/CodeGen/AMDGPU/default-fp-mode.ll @@ -82,9 +82,17 @@ ret void } -; FIXME: Denormals should be off by default +; GCN-LABEL: {{^}}test_just_f32_attr_flush +; GCN: FloatMode: 192 +; GCN: IeeeMode: 1 +define amdgpu_kernel void @test_just_f32_attr_flush(float addrspace(1)* %out0, double addrspace(1)* %out1) #9 { + store float 0.0, float addrspace(1)* %out0 + store double 0.0, double addrspace(1)* %out1 + ret void +} + ; GCN-LABEL: {{^}}kill_gs_const: -; GCN: FloatMode: 240 +; GCN: FloatMode: 192 ; GCN: IeeeMode: 0 define amdgpu_gs void @kill_gs_const() { main_body: @@ -96,7 +104,7 @@ } ; GCN-LABEL: {{^}}kill_vcc_implicit_def: -; GCN: FloatMode: 240 +; GCN: FloatMode: 192 ; GCN: IeeeMode: 0 define amdgpu_ps float @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(4)* inreg, [17 x <16 x i8>] addrspace(4)* inreg, [17 x <4 x i32>] addrspace(4)* inreg, [34 x <8 x i32>] addrspace(4)* inreg, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { entry: @@ -110,10 +118,11 @@ attributes #0 = { nounwind "target-cpu"="tahiti" } attributes #1 = { nounwind "target-cpu"="fiji" } -attributes #2 = { nounwind "target-features"="+fp64-denormals" } -attributes #3 = { nounwind "target-features"="+fp32-denormals" } -attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-denormals" } -attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" } -attributes #6 = { nounwind "target-features"="+fp64-fp16-denormals" } -attributes #7 = { nounwind "target-features"="-fp64-fp16-denormals" } -attributes #8 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" } +attributes #2 = { nounwind "denormal-fp-math"="ieee,ieee" } +attributes #3 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } +attributes #4 = { nounwind "denormal-fp-math"="ieee,ieee" } +attributes #5 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #6 = { nounwind "denormal-fp-math"="ieee,ieee" } +attributes #7 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #8 = { nounwind "denormal-fp-math"="ieee,ieee" } +attributes #9 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll +++ llvm/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mattr=+fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s -; RUN: llc -march=amdgcn -mattr=-fast-fmaf,-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -march=amdgcn -mattr=+fast-fmaf -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s +; RUN: llc -march=amdgcn -mattr=-fast-fmaf -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s -; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s -; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s +; RUN: llc -march=amdgcn -mattr=+fast-fmaf -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s +; RUN: llc -march=amdgcn -mattr=-fast-fmaf -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s ; FIXME: This should also fold when fma is actually fast if an FMA ; exists in the original program. Index: llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s -; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,VI-FLUSH,GCN-FLUSH %s -; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM,GCN-NOEXCEPT %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM,GCN-NOEXCEPT %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=+fp-exceptions -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,VI-FLUSH,GCN-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM,GCN-NOEXCEPT %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=ieee < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM,GCN-NOEXCEPT %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s ; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32: ; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} Index: llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll +++ llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll @@ -739,6 +739,6 @@ } attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "target-features"="-fp32-denormals" } -attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" } -attributes #3 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denormals" } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #3 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/fcanonicalize.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fcanonicalize.ll +++ llvm/test/CodeGen/AMDGPU/fcanonicalize.ll @@ -625,9 +625,9 @@ } attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "target-features"="-fp32-denormals" } -attributes #2 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" } -attributes #3 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" } -attributes #4 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" "target-cpu"="tonga" } -attributes #5 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" "target-cpu"="gfx900" } -attributes #6 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" "target-cpu"="gfx900" } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #3 = { nounwind "denormal-fp-math"="ieee,ieee" } +attributes #4 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "target-cpu"="tonga" } +attributes #5 = { nounwind "denormal-fp-math"="ieee,ieee" "target-cpu"="gfx900" } +attributes #6 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "target-cpu"="gfx900" } Index: llvm/test/CodeGen/AMDGPU/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s +; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8_9_10 %s @@ -192,7 +192,7 @@ ; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] ; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 { +define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #1 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -242,10 +242,10 @@ ret void } -declare i32 @llvm.amdgcn.workitem.id.x() #1 -declare half @llvm.sqrt.f16(half) #1 -declare half @llvm.fabs.f16(half) #1 +declare i32 @llvm.amdgcn.workitem.id.x() #2 +declare half @llvm.sqrt.f16(half) #2 +declare half @llvm.fabs.f16(half) #2 -attributes #0 = { nounwind } -attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "unsafe-fp-math"="true" } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind readnone speculatable } Index: llvm/test/CodeGen/AMDGPU/fdiv.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.ll @@ -284,8 +284,8 @@ ret void } -attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,+fp64-fp16-denormals,-flat-for-global" } -attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" } -attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" } +attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-features"="-flat-for-global" } +attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-features"="-flat-for-global" } +attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "denormal-fp-math-f32"="ieee,ieee" "target-features"="-flat-for-global" } !0 = !{float 2.500000e+00} Index: llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s ; GCN-LABEL: {{^}}div_1_by_x_25ulp: ; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000 Index: llvm/test/CodeGen/AMDGPU/fdot2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdot2.ll +++ llvm/test/CodeGen/AMDGPU/fdot2.ll @@ -1,10 +1,10 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900 -; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE -; RUN: llc -march=amdgcn -mcpu=gfx1011 -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT -; RUN: llc -march=amdgcn -mcpu=gfx1012 -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT -; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906 -; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=-fp64-fp16-denormals,-fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT -; RUN: llc -march=amdgcn -mcpu=gfx906 -mattr=+fp64-fp16-denormals,+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT +; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX900 +; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX906-DL-UNSAFE +; RUN: llc -march=amdgcn -mcpu=gfx1011 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT +; RUN: llc -march=amdgcn -mcpu=gfx1012 -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GCN-DL-UNSAFE,GFX10-DL-UNSAFE,GFX10-CONTRACT +; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906 +; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-CONTRACT +; RUN: llc -march=amdgcn -mcpu=gfx906 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX906-DENORM-CONTRACT ; (fadd (fmul S1.x, S2.x), (fadd (fmul (S1.y, S2.y), z))) -> (fdot2 S1, S2, z) ; Tests to make sure fdot2 is not generated when vector elements of dot-product expressions Index: llvm/test/CodeGen/AMDGPU/fma-combine.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fma-combine.ll +++ llvm/test/CodeGen/AMDGPU/fma-combine.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp32-denormals -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -mattr=-fp32-denormals -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=verde -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be ; beneficial even without fp32 denormals, but they do require no-infs-fp-math Index: llvm/test/CodeGen/AMDGPU/fmaxnum.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmaxnum.ll +++ llvm/test/CodeGen/AMDGPU/fmaxnum.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_on: ; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}} @@ -205,7 +205,7 @@ ; GCN: v_max_f32_e32 ; GCN: v_max_f32_e32 ; GCN-NOT: v_max_f32 -define <3 x float> @test_func_fmax_v3f32(<3 x float> %a, <3 x float> %b) nounwind { +define <3 x float> @test_func_fmax_v3f32(<3 x float> %a, <3 x float> %b) #0 { %val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0 ret <3 x float> %val } @@ -218,5 +218,5 @@ declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #1 declare double @llvm.maxnum.f64(double, double) -attributes #0 = { nounwind } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } Index: llvm/test/CodeGen/AMDGPU/fminnum.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fminnum.ll +++ llvm/test/CodeGen/AMDGPU/fminnum.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_on: ; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}} @@ -225,5 +225,5 @@ declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #1 declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #1 -attributes #0 = { nounwind } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } Index: llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll +++ llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll @@ -1,8 +1,8 @@ -; XUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-fp32-denormals,+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VI-DENORM %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-fp32-denormals,-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VI-FLUSH %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-fp32-denormals,+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX8_10,GFX10-DENORM %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=-fp32-denormals,-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX8_10,GFX10-FLUSH %s +; XUN: llc -mtriple=amdgcn-amd-amdhsa -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VI-DENORM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SIVI,VI-FLUSH %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX8_10,GFX10-DENORM %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10,GFX8_10,GFX10-FLUSH %s ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't ; make add an instruction if the fadd has more than one use. Index: llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -1,13 +1,13 @@ -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,VI-FLUSH,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,VI-FLUSH,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,VI-DENORM-STRICT,VI-DENORM,VI %s +; RUN: llc -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,VI-DENORM-CONTRACT,VI-DENORM,VI %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GFX10-FLUSH,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GFX10-FLUSH,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GFX10-FLUSH,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GFX10-FLUSH,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-STRICT,GCN-DENORM,GCN-DENORM-STRICT,GFX10-DENORM-STRICT,GFX10-DENORM,GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-CONTRACT,GCN-DENORM,GCN-DENORM-CONTRACT,GFX10-DENORM-CONTRACT,GFX10-DENORM,GFX10 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare half @llvm.fmuladd.f16(half, half, half) #1 Index: llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll +++ llvm/test/CodeGen/AMDGPU/fmuladd.f32.ll @@ -1,21 +1,21 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-STRICT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-STRICT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=-fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -mattr=+fp32-denormals,+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=-fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -mattr=+fp32-denormals,-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=tahiti -denormal-fp-math-f32=ieee -mattr=+fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=preserve-sign -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-CONTRACT,GCN-FLUSH-MAD,SI-FLUSH,GCN-FLUSH-SLOWFMA,GCN-FLUSH-SLOWFMA-CONTRACT,SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=verde -denormal-fp-math-f32=ieee -mattr=-fast-fmaf -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-CONTRACT,GCN-DENORM,SI-DENORM,GCN-DENORM-SLOWFMA,GCN-DENORM-SLOWFMA-CONTRACT,SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX900 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX900 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-MAD,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX900 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx900 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX900 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX906 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -denormal-fp-math-f32=preserve-sign -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GFX9-FLUSH,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT,GFX906 %s ; FIXME: Should probably test this, but sometimes selecting fmac is painful to match. -; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s +; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s ; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow. Index: llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll +++ llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -1,12 +1,12 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s - -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=preserve-sign -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s + +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -denormal-fp-math=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s declare i32 @llvm.amdgcn.workitem.id.x() #1 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1 Index: llvm/test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -start-after=sink -mattr=-fp32-denormals,+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=-fp32-denormals,+flat-for-global -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=hawaii -start-after=sink -mattr=+flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp32-denormals -start-after=sink --verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -mattr=-fp32-denormals -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=fiji -start-after=sink --verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=VI -check-prefix=FUNC %s +; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=fiji -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=VI -check-prefix=FUNC %s ; -------------------------------------------------------------------------------- ; fadd tests @@ -2522,5 +2522,5 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0 -attributes #0 = { nounwind } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } Index: llvm/test/CodeGen/AMDGPU/fpext-free.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fpext-free.ll +++ llvm/test/CodeGen/AMDGPU/fpext-free.ll @@ -1,7 +1,7 @@ -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9,GFX9-F32FLUSH %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9,GFX9-F32DENORM %s -; RUN: llc -march=amdgcn -mcpu=gfx803 -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI,VI-F32FLUSH %s -; RUN: llc -march=amdgcn -mcpu=gfx803 -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI,VI-F32DENORM %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9,GFX9-F32FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,GFX9,GFX9-F32DENORM %s +; RUN: llc -march=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI,VI-F32FLUSH %s +; RUN: llc -march=amdgcn -mcpu=gfx803 -denormal-fp-math-f32=ieee -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX89,VI,VI-F32DENORM %s ; fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z) Index: llvm/test/CodeGen/AMDGPU/frem.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/frem.ll +++ llvm/test/CodeGen/AMDGPU/frem.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}frem_f32: ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}} @@ -109,5 +109,5 @@ ret void } -attributes #0 = { nounwind "unsafe-fp-math"="false" } -attributes #1 = { nounwind "unsafe-fp-math"="true" } +attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll +++ llvm/test/CodeGen/AMDGPU/hsa-fp-mode.ll @@ -92,10 +92,10 @@ attributes #0 = { nounwind "target-cpu"="kaveri" "target-features"="-code-object-v3" } attributes #1 = { nounwind "target-cpu"="fiji" "target-features"="-code-object-v3" } -attributes #2 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,+fp64-fp16-denormals" } -attributes #3 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,-fp64-fp16-denormals" } -attributes #4 = { nounwind "target-features"="-code-object-v3,+fp32-denormals,+fp64-fp16-denormals" } -attributes #5 = { nounwind "target-features"="-code-object-v3,-fp32-denormals,-fp64-fp16-denormals" } +attributes #2 = { nounwind "target-features"="-code-object-v3" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #3 = { nounwind "target-features"="-code-object-v3" "denormal-fp-math-f32"="ieee,ieee" "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #4 = { nounwind "target-features"="-code-object-v3" "denormal-fp-math"="ieee,ieee" } +attributes #5 = { nounwind "target-features"="-code-object-v3" "denormal-fp-math"="preserve-sign,preserve-sign" } attributes #6 = { nounwind "amdgpu-dx10-clamp"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" } attributes #7 = { nounwind "amdgpu-ieee"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" } attributes #8 = { nounwind "amdgpu-dx10-clamp"="false" "amdgpu-ieee"="false" "target-cpu"="fiji" "target-features"="-code-object-v3" } Index: llvm/test/CodeGen/AMDGPU/known-never-snan.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/known-never-snan.ll +++ llvm/test/CodeGen/AMDGPU/known-never-snan.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s ; Mostly overlaps with fmed3.ll to stress specific cases of ; isKnownNeverSNaN. @@ -667,5 +667,5 @@ declare float @llvm.amdgcn.fract.f32(float) #1 declare float @llvm.amdgcn.cubeid(float, float, float) #0 -attributes #0 = { nounwind } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone speculatable } Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -140,7 +140,7 @@ } attributes #0 = { nounwind readnone } -attributes #1 = { nounwind "unsafe-fp-math"="false" "target-features"="-fp32-denormals" } -attributes #2 = { nounwind "unsafe-fp-math"="true" "target-features"="-fp32-denormals" } -attributes #3 = { nounwind "unsafe-fp-math"="false" "target-features"="+fp32-denormals" } -attributes #4 = { nounwind "unsafe-fp-math"="true" "target-features"="+fp32-denormals" } +attributes #1 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #2 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #3 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="ieee,ieee" } +attributes #4 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="ieee,ieee" } Index: llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -1,9 +1,9 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp32-denormals,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp32-denormals,-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp32-denormals,+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp32-denormals,+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-fp32-denormals,-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -mattr=-fp32-denormals,+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-FLUSH %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX10 -check-prefix=GFX10-DENORM %s declare half @llvm.fmuladd.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) Index: llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SIVI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,SIVI %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -88,7 +88,7 @@ ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, - half addrspace(1)* %b) { + half addrspace(1)* %b) #0 { entry: %a.val = load volatile half, half addrspace(1)* %a %b.val = load volatile half, half addrspace(1)* %b @@ -157,7 +157,7 @@ ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, - half addrspace(1)* %b) { + half addrspace(1)* %b) #0 { entry: %b.val = load half, half addrspace(1)* %b %r.val = call half @llvm.maxnum.f16(half 3.0, half %b.val) @@ -225,7 +225,7 @@ ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, - half addrspace(1)* %a) { + half addrspace(1)* %a) #0 { entry: %a.val = load half, half addrspace(1)* %a %r.val = call half @llvm.maxnum.f16(half %a.val, half 4.0) @@ -308,7 +308,7 @@ ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, - <2 x half> addrspace(1)* %b) { + <2 x half> addrspace(1)* %b) #0 { entry: %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %b.val = load <2 x half>, <2 x half> addrspace(1)* %b @@ -376,7 +376,7 @@ ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, - <2 x half> addrspace(1)* %b) { + <2 x half> addrspace(1)* %b) #0 { entry: %b.val = load <2 x half>, <2 x half> addrspace(1)* %b %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> , <2 x half> %b.val) @@ -443,7 +443,7 @@ ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, - <2 x half> addrspace(1)* %a) { + <2 x half> addrspace(1)* %a) #0 { entry: %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %r.val = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a.val, <2 x half> ) @@ -542,7 +542,7 @@ ; GFX9-NEXT: s_endpgm <3 x half> addrspace(1)* %r, <3 x half> addrspace(1)* %a, - <3 x half> addrspace(1)* %b) { + <3 x half> addrspace(1)* %b) #0 { entry: %a.val = load <3 x half>, <3 x half> addrspace(1)* %a %b.val = load <3 x half>, <3 x half> addrspace(1)* %b @@ -654,7 +654,7 @@ ; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %a, - <4 x half> addrspace(1)* %b) { + <4 x half> addrspace(1)* %b) #0 { entry: %a.val = load <4 x half>, <4 x half> addrspace(1)* %a %b.val = load <4 x half>, <4 x half> addrspace(1)* %b @@ -745,10 +745,12 @@ ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, - <4 x half> addrspace(1)* %b) { + <4 x half> addrspace(1)* %b) #0 { entry: %b.val = load <4 x half>, <4 x half> addrspace(1)* %b %r.val = call <4 x half> @llvm.maxnum.v4f16(<4 x half> , <4 x half> %b.val) store <4 x half> %r.val, <4 x half> addrspace(1)* %r ret void } + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -88,7 +88,7 @@ ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, - half addrspace(1)* %b) { + half addrspace(1)* %b) #0 { entry: %a.val = load volatile half, half addrspace(1)* %a %b.val = load volatile half, half addrspace(1)* %b @@ -97,7 +97,7 @@ ret void } -define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) { +define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 { ; SI-LABEL: minnum_f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 @@ -180,7 +180,7 @@ ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, - half addrspace(1)* %b) { + half addrspace(1)* %b) #0 { entry: %b.val = load half, half addrspace(1)* %b %r.val = call half @llvm.minnum.f16(half 3.0, half %b.val) @@ -248,7 +248,7 @@ ; GFX9-NEXT: buffer_store_short v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm half addrspace(1)* %r, - half addrspace(1)* %a) { + half addrspace(1)* %a) #0 { entry: %a.val = load half, half addrspace(1)* %a %r.val = call half @llvm.minnum.f16(half %a.val, half 4.0) @@ -331,7 +331,7 @@ ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, <2 x half> addrspace(1)* %a, - <2 x half> addrspace(1)* %b) { + <2 x half> addrspace(1)* %b) #0 { entry: %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %b.val = load <2 x half>, <2 x half> addrspace(1)* %b @@ -340,7 +340,7 @@ ret void } -define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) { +define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b) #0 { ; SI-LABEL: minnum_v2f16_no_ieee: ; SI: ; %bb.0: ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 @@ -429,7 +429,7 @@ ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, - <2 x half> addrspace(1)* %b) { + <2 x half> addrspace(1)* %b) #0 { entry: %b.val = load <2 x half>, <2 x half> addrspace(1)* %b %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> , <2 x half> %b.val) @@ -496,7 +496,7 @@ ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <2 x half> addrspace(1)* %r, - <2 x half> addrspace(1)* %a) { + <2 x half> addrspace(1)* %a) #0 { entry: %a.val = load <2 x half>, <2 x half> addrspace(1)* %a %r.val = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a.val, <2 x half> ) @@ -595,7 +595,7 @@ ; GFX9-NEXT: s_endpgm <3 x half> addrspace(1)* %r, <3 x half> addrspace(1)* %a, - <3 x half> addrspace(1)* %b) { + <3 x half> addrspace(1)* %b) #0 { entry: %a.val = load <3 x half>, <3 x half> addrspace(1)* %a %b.val = load <3 x half>, <3 x half> addrspace(1)* %b @@ -707,7 +707,7 @@ ; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, <4 x half> addrspace(1)* %a, - <4 x half> addrspace(1)* %b) { + <4 x half> addrspace(1)* %b) #0 { entry: %a.val = load <4 x half>, <4 x half> addrspace(1)* %a %b.val = load <4 x half>, <4 x half> addrspace(1)* %b @@ -798,10 +798,12 @@ ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm <4 x half> addrspace(1)* %r, - <4 x half> addrspace(1)* %b) { + <4 x half> addrspace(1)* %b) #0 { entry: %b.val = load <4 x half>, <4 x half> addrspace(1)* %b %r.val = call <4 x half> @llvm.minnum.v4f16(<4 x half> , <4 x half> %b.val) store <4 x half> %r.val, <4 x half> addrspace(1)* %r ret void } + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/mad-combine.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mad-combine.ll +++ llvm/test/CodeGen/AMDGPU/mad-combine.ll @@ -1,12 +1,12 @@ ; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma. -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-fp32-denormals -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -fp-contract=fast < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=preserve-sign -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s ; Make sure we don't form mad with denormals -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tahiti -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=verde -denormal-fp-math-f32=ieee -fp-contract=fast -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s declare i32 @llvm.amdgcn.workitem.id.x() #0 declare float @llvm.fabs.f32(float) #0 Index: llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll +++ llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll @@ -143,5 +143,5 @@ declare float @llvm.fmuladd.f32(float, float, float) #1 declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #1 -attributes #0 = { nounwind "target-features"="-fp32-denormals" } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone speculatable } Index: llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll +++ llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll @@ -310,5 +310,5 @@ declare <3 x float> @llvm.fmuladd.v3f32(<3 x float>, <3 x float>, <3 x float>) #1 declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1 -attributes #0 = { nounwind "target-features"="-fp32-denormals" } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone speculatable } Index: llvm/test/CodeGen/AMDGPU/mad-mix.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mad-mix.ll +++ llvm/test/CodeGen/AMDGPU/mad-mix.ll @@ -557,6 +557,6 @@ declare float @llvm.fmuladd.f32(float, float, float) #2 declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) #2 -attributes #0 = { nounwind "target-features"="-fp32-denormals" } -attributes #1 = { nounwind "target-features"="+fp32-denormals" } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "denormal-fp-math-f32"="ieee,ieee" } attributes #2 = { nounwind readnone speculatable } Index: llvm/test/CodeGen/AMDGPU/madak.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/madak.ll +++ llvm/test/CodeGen/AMDGPU/madak.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-fp32-denormals -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp32-denormals -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s -; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-fp32-denormals -verify-machineinstrs -fp-contract=fast -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6_8_9,MAD %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX9,GFX6_8_9,GFX8_9,GFX8_9_10,MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,GFX10-MAD %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -fp-contract=fast -amdgpu-enable-global-sgpr-addr < %s | FileCheck -check-prefixes=GCN,GFX10,GFX8_9_10,FMA %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.fabs.f32(float) nounwind readnone @@ -19,7 +19,7 @@ ; MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 ; FMA: v_fmaak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 -define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -53,7 +53,7 @@ ; MAD-DAG: v_mac_f32_e32 [[VK]], [[VA]], [[VC]] ; FMA-DAG: v_fmac_f32_e32 [[VK]], [[VA]], [[VC]] ; GCN: s_endpgm -define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -82,7 +82,7 @@ ; MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 ; GFX10-MAD: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 ; FMA: v_fmaak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 -define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { +define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -110,7 +110,7 @@ ; MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 ; FMA: v_fma_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0 -define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -134,7 +134,7 @@ ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] ; GFX10-MAD: v_mad_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000 ; FMA: v_fma_f32 v{{[0-9]+}}, [[VA]], [[SB]], 0x41200000 -define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind { +define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -155,7 +155,7 @@ ; GFX6_8_9: v_mac_f32_e32 [[VK]], [[SB]], [[VA]] ; GFX10-MAD: v_madak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 ; FMA: v_fmaak_f32 v{{[0-9]+}}, [[SB]], [[VA]], 0x41200000 -define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -173,7 +173,7 @@ ; GFX8_9: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}} ; GFX10-MAD: v_mac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} ; FMA: v_fmac_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} -define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind { +define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) #0 { %mul = fmul float %a, %b %madak = fadd float %mul, 10.0 store float %madak, float addrspace(1)* %out, align 4 @@ -189,7 +189,7 @@ ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 ; FMA: v_fma_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, 0x41200000 ; GCN: s_endpgm -define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -215,7 +215,7 @@ ; GFX10-MAD: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000 ; FMA: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, 0x41200000 ; GCN: s_endpgm -define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { +define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid @@ -265,4 +265,4 @@ ret void } -attributes #0 = { nounwind} +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/madmk.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/madmk.ll +++ llvm/test/CodeGen/AMDGPU/madmk.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s -; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s +; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; FIXME: None of these trigger madmk emission anymore. It is still ; possible, but requires the correct registers to be used which is @@ -12,7 +12,7 @@ ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]] -define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -35,7 +35,7 @@ ; GCN-DAG: v_mac_f32_e32 [[VB]], [[SK]], [[VA]] ; GCN-DAG: v_mac_f32_e32 [[VC]], [[SK]], [[VA]] ; GCN: s_endpgm -define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -64,7 +64,7 @@ ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]] -define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -83,7 +83,7 @@ ; GCN-NOT: v_madmk_f32 ; GCN: v_mac_f32_e32 ; GCN: s_endpgm -define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, [8 x i32], float %a, [8 x i32], float %b) nounwind { +define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, [8 x i32], float %a, [8 x i32], float %b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -99,7 +99,7 @@ ; GCN: v_mov_b32_e32 [[VREG2:v[0-9]+]], [[SREG]] ; GCN: v_mac_f32_e32 [[VREG2]], 0x41200000, [[VREG1]] ; GCN: s_endpgm -define amdgpu_kernel void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind { +define amdgpu_kernel void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -115,7 +115,7 @@ ; GCN-NOT: v_madmk_f32 ; GCN: v_mac_f32_e32 ; GCN: s_endpgm -define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind { +define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -132,7 +132,7 @@ ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN-DAG: s_mov_b32 [[SK:s[0-9]+]], 0x41200000 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[SK]], [[VB]] -define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -153,7 +153,7 @@ ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{[sv][0-9]+}}, |{{v[0-9]+}}| -define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1 @@ -174,7 +174,7 @@ ; GCN: buffer_load_dword [[A:v[0-9]+]] ; GCN: s_mov_b32 [[SK:s[0-9]+]], 0x41200000 ; GCN: v_mad_f32 {{v[0-9]+}}, [[A]], [[SK]], 2.0 -define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid @@ -191,7 +191,7 @@ ; SI: s_or_b64 ; SI: s_xor_b64 ; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}} -define amdgpu_kernel void @kill_madmk_verifier_error() nounwind { +define amdgpu_kernel void @kill_madmk_verifier_error() #0 { bb: br label %bb2 @@ -214,4 +214,5 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } Index: llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -257,5 +257,5 @@ declare void @foo(i32) #0 declare float @llvm.fmuladd.f32(float, float, float) #1 -attributes #0 = { nounwind willreturn } +attributes #0 = { nounwind willreturn "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone speculatable } Index: llvm/test/CodeGen/AMDGPU/omod.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/omod.ll +++ llvm/test/CodeGen/AMDGPU/omod.ll @@ -275,10 +275,10 @@ declare half @llvm.maxnum.f16(half, half) #1 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1 -attributes #0 = { nounwind "target-features"="-fp32-denormals" "no-signed-zeros-fp-math"="true" } +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "target-features"="+fp32-denormals" "no-signed-zeros-fp-math"="true" } -attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" "no-signed-zeros-fp-math"="true" } +attributes #2 = { nounwind "denormal-fp-math-f32"="ieee,ieee" "no-signed-zeros-fp-math"="true" } +attributes #3 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "no-signed-zeros-fp-math"="true" } attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" } !llvm.dbg.cu = !{!0} Index: llvm/test/CodeGen/AMDGPU/operand-folding.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/operand-folding.ll +++ llvm/test/CodeGen/AMDGPU/operand-folding.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}fold_sgpr: ; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s -define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) { +define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) #1 { entry: %tmp0 = icmp ne i32 %fold, 0 br i1 %tmp0, label %if, label %endif @@ -20,7 +20,7 @@ ; CHECK-LABEL: {{^}}fold_imm: ; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5 -define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) { +define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) #1 { entry: %fold = add i32 3, 2 %tmp0 = icmp ne i32 %cmp, 0 @@ -46,7 +46,7 @@ ; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]] ; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}, -define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) { +define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) #1 { entry: %tmp0 = add i64 %val, 1 store i64 %tmp0, i64 addrspace(1)* %out @@ -61,7 +61,7 @@ ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}} -define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) #1 { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp0, 1 @@ -80,7 +80,7 @@ ; CHECK-LABEL: {{^}}imm_one_use: ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}} -define amdgpu_kernel void @imm_one_use(i32 addrspace(1)* %out) { +define amdgpu_kernel void @imm_one_use(i32 addrspace(1)* %out) #1 { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = xor i32 %tmp0, 100 @@ -94,7 +94,7 @@ ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}} -define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) { +define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) #1 { entry: %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = add i32 %tmp0, 1 @@ -114,7 +114,7 @@ ; CHECK: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}} ; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]] ; CHECK: buffer_store_dword v[[LO]] -define amdgpu_kernel void @no_fold_tied_subregister() { +define amdgpu_kernel void @no_fold_tied_subregister() #1 { %tmp1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef %tmp2 = extractelement <2 x float> %tmp1, i32 0 %tmp3 = extractelement <2 x float> %tmp1, i32 1 @@ -128,7 +128,7 @@ ; CHECK-LABEL: {{^}}no_extra_fold_on_same_opnd ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}} ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define void @no_extra_fold_on_same_opnd() { +define void @no_extra_fold_on_same_opnd() #1 { entry: %s0 = load i32, i32 addrspace(5)* undef, align 4 %s0.i64= zext i32 %s0 to i64 @@ -151,3 +151,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0 attributes #0 = { nounwind readnone } +attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/rcp-pattern.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s @@ -150,8 +150,8 @@ declare float @llvm.fabs.f32(float) #1 declare float @llvm.sqrt.f32(float) #1 -attributes #0 = { nounwind "unsafe-fp-math"="false" } +attributes #0 = { nounwind "unsafe-fp-math"="false" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } attributes #1 = { nounwind readnone } -attributes #2 = { nounwind "unsafe-fp-math"="true" } +attributes #2 = { nounwind "unsafe-fp-math"="true" "denormal-fp-math-f32"="preserve-sign,preserve-sign" } !0 = !{float 2.500000e+00} Index: llvm/test/CodeGen/AMDGPU/rcp_iflag.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rcp_iflag.ll +++ llvm/test/CodeGen/AMDGPU/rcp_iflag.ll @@ -1,8 +1,8 @@ -; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=GCN %s ; GCN-LABEL: {{^}}rcp_uint: ; GCN: v_rcp_iflag_f32_e32 -define amdgpu_kernel void @rcp_uint(i32 addrspace(1)* %in, float addrspace(1)* %out) { +define amdgpu_kernel void @rcp_uint(i32 addrspace(1)* %in, float addrspace(1)* %out) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %cvt = uitofp i32 %load to float %div = fdiv float 1.000000e+00, %cvt @@ -12,10 +12,33 @@ ; GCN-LABEL: {{^}}rcp_sint: ; GCN: v_rcp_iflag_f32_e32 -define amdgpu_kernel void @rcp_sint(i32 addrspace(1)* %in, float addrspace(1)* %out) { +define amdgpu_kernel void @rcp_sint(i32 addrspace(1)* %in, float addrspace(1)* %out) #0 { %load = load i32, i32 addrspace(1)* %in, align 4 %cvt = sitofp i32 %load to float %div = fdiv float 1.000000e+00, %cvt store float %div, float addrspace(1)* %out, align 4 ret void } + +; GCN-LABEL: {{^}}rcp_uint_denorm: +; GCN-NOT: v_rcp_iflag_f32 +define amdgpu_kernel void @rcp_uint_denorm(i32 addrspace(1)* %in, float addrspace(1)* %out) #1 { + %load = load i32, i32 addrspace(1)* %in, align 4 + %cvt = uitofp i32 %load to float + %div = fdiv float 1.000000e+00, %cvt + store float %div, float addrspace(1)* %out, align 4 + ret void +} + +; GCN-LABEL: {{^}}rcp_sint_denorm: +; GCN-NOT: v_rcp_iflag_f32 +define amdgpu_kernel void @rcp_sint_denorm(i32 addrspace(1)* %in, float addrspace(1)* %out) #1 { + %load = load i32, i32 addrspace(1)* %in, align 4 + %cvt = sitofp i32 %load to float + %div = fdiv float 1.000000e+00, %cvt + store float %div, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { "denormal-fp-math-f32"="ieee,ieee" } Index: llvm/test/CodeGen/AMDGPU/rsq.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/rsq.ll +++ llvm/test/CodeGen/AMDGPU/rsq.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone declare float @llvm.sqrt.f32(float) nounwind readnone @@ -8,7 +8,7 @@ ; SI-LABEL: {{^}}rsq_f32: ; SI: v_rsq_f32_e32 ; SI: s_endpgm -define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone %div = fdiv float 1.0, %sqrt @@ -20,7 +20,7 @@ ; SI-UNSAFE: v_rsq_f64_e32 ; SI-SAFE: v_sqrt_f64_e32 ; SI: s_endpgm -define amdgpu_kernel void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 { %val = load double, double addrspace(1)* %in, align 4 %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone %div = fdiv double 1.0, %sqrt @@ -31,7 +31,7 @@ ; SI-LABEL: {{^}}rsq_f32_sgpr: ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} ; SI: s_endpgm -define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind { +define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) #0 { %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone %div = fdiv float 1.0, %sqrt store float %div, float addrspace(1)* %out, align 4 @@ -55,7 +55,7 @@ ; SI-SAFE-NOT: v_rsq_f32 ; SI: s_endpgm -define amdgpu_kernel void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) { +define amdgpu_kernel void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid @@ -81,7 +81,7 @@ ; SI-UNSAFE: v_rsq_f32_e32 [[RSQ:v[0-9]+]], v{{[0-9]+}} ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] -define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %val = load float, float addrspace(1)* %in, align 4 %sqrt = call float @llvm.sqrt.f32(float %val) %div = fdiv float -1.0, %sqrt @@ -96,7 +96,7 @@ ; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}} ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]] ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]] -define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 { %val = load double, double addrspace(1)* %in, align 4 %sqrt = call double @llvm.sqrt.f64(double %val) %div = fdiv double -1.0, %sqrt @@ -112,7 +112,7 @@ ; SI-UNSAFE: v_rsq_f32_e64 [[RSQ:v[0-9]+]], -v{{[0-9]+}} ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]] ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]] -define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #0 { %val = load float, float addrspace(1)* %in, align 4 %val.fneg = fsub float -0.0, %val %sqrt = call float @llvm.sqrt.f32(float %val.fneg) @@ -128,7 +128,7 @@ ; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}} ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]] ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]] -define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind { +define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #0 { %val = load double, double addrspace(1)* %in, align 4 %val.fneg = fsub double -0.0, %val %sqrt = call double @llvm.sqrt.f64(double %val.fneg) @@ -136,3 +136,5 @@ store double %div, double addrspace(1)* %out, align 4 ret void } + +attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/sdivrem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdivrem64.ll +++ llvm/test/CodeGen/AMDGPU/sdivrem64.ll @@ -1,5 +1,5 @@ -;RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s ;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s ;FUNC-LABEL: {{^}}s_test_sdiv: @@ -43,7 +43,7 @@ ;GCN: v_trunc_f32_e32 ;GCN: v_mac_f32_e32 v{{[0-9]+}}, 0xcf800000 ;GCN: s_endpgm -define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %result = sdiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -90,7 +90,7 @@ ;GCN: v_trunc_f32_e32 ;GCN: v_mac_f32_e32 v{{[0-9]+}}, 0xcf800000 ;GCN: s_endpgm -define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -105,7 +105,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define amdgpu_kernel void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = sdiv i64 %1, %2 @@ -122,7 +122,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define amdgpu_kernel void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %1 = ashr i64 %x, 33 %2 = ashr i64 %y, 33 %result = srem i64 %1, %2 @@ -142,7 +142,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define amdgpu_kernel void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = sdiv i64 %1, %2 @@ -162,10 +162,12 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define amdgpu_kernel void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %1 = ashr i64 %x, 40 %2 = ashr i64 %y, 40 %result = srem i64 %1, %2 store i64 %result, i64 addrspace(1)* %out ret void } + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll +++ llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll @@ -1,7 +1,7 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,SDWA,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_10,SDWA,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX9_10,SDWA,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=NOSDWA,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI,GFX89,SDWA,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_10,SDWA,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx1010 -amdgpu-sdwa-peephole -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10,GFX9_10,SDWA,GCN %s ; GCN-LABEL: {{^}}add_shr_i32: ; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}} @@ -12,7 +12,7 @@ ; GFX9: v_add_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX10: v_add_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %a = load i32, i32 addrspace(1)* %in, align 4 %shr = lshr i32 %a, 16 %add = add i32 %a, %shr @@ -28,7 +28,7 @@ ; VI: v_subrev_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9: v_sub_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10: v_sub_nc_u32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { +define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 { %a = load i32, i32 addrspace(1)* %in, align 4 %shr = lshr i32 %a, 16 %sub = sub i32 %shr, %a @@ -44,7 +44,7 @@ ; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 -define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) { +define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) #0 { %a = load i32, i32 addrspace(1)* %in1, align 4 %b = load i32, i32 addrspace(1)* %in2, align 4 %shra = lshr i32 %a, 16 @@ -61,7 +61,7 @@ ; GFX10: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA-NOT: v_mul_u32_u24_sdwa -define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) { +define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) #0 { entry: %a = load i16, i16 addrspace(1)* %ina, align 4 %b = load i16, i16 addrspace(1)* %inb, align 4 @@ -84,7 +84,7 @@ ; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 @@ -111,7 +111,7 @@ ; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) #0 { entry: %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4 %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4 @@ -146,7 +146,7 @@ ; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9_10-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) #0 { entry: %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4 %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4 @@ -161,7 +161,7 @@ ; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA-NOT: v_mul_f16_sdwa -define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) { +define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) #0 { entry: %a = load half, half addrspace(1)* %ina, align 4 %b = load half, half addrspace(1)* %inb, align 4 @@ -184,7 +184,7 @@ ; GFX9_10: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) #0 { entry: %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4 @@ -209,7 +209,7 @@ ; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) #0 { entry: %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4 %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4 @@ -240,7 +240,7 @@ ; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX9_10-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) #0 { entry: %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4 %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4 @@ -256,7 +256,7 @@ ; GFX10: v_mul_lo_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} ; SDWA-NOT: v_mul_u32_u24_sdwa -define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) { +define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) #0 { entry: %a = load i8, i8 addrspace(1)* %ina, align 4 %b = load i8, i8 addrspace(1)* %inb, align 4 @@ -285,7 +285,7 @@ ; GFX10: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, v ; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) #0 { entry: %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4 %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4 @@ -315,7 +315,7 @@ ; GFX10-DAG: v_mul_lo_u16_e64 ; GFX10-DAG: v_mul_lo_u16_e64 -define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) #0 { entry: %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4 %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4 @@ -355,7 +355,7 @@ ; GFX10-DAG: v_mul_lo_u16_e64 ; GFX10-DAG: v_mul_lo_u16_e64 -define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) { +define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) #0 { entry: %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4 %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4 @@ -376,7 +376,7 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( <2 x half> addrspace(1)* %r, - <2 x i16> addrspace(1)* %a) { + <2 x i16> addrspace(1)* %a) #0 { entry: %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a %r.val = sitofp <2 x i16> %a.val to <2 x half> @@ -399,7 +399,7 @@ ; GFX9_10: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]] ; GFX9_10: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]] -define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) { +define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) #0 { entry: %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4 %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4 @@ -421,7 +421,7 @@ ; GFX10: v_pk_mul_lo_u16 v{{[0-9]+}}, 0x141007b, v{{[0-9]+}} -define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) { +define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4 %mul = mul <2 x i16> %a, @@ -443,7 +443,7 @@ ; GFX9_10: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} ; GFX9_10: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}} -define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 @@ -460,7 +460,7 @@ ; GFX9_10: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} -define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) { +define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) #0 { entry: %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4 %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4 @@ -503,7 +503,7 @@ ; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) { +define amdgpu_kernel void @pulled_out_test(<8 x i8> addrspace(1)* %sourceA, <8 x i8> addrspace(1)* %destValues) #0 { entry: %idxprom = ashr exact i64 15, 32 %arrayidx = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %sourceA, i64 %idxprom @@ -564,3 +564,5 @@ store volatile <2 x i32> %tmp12, <2 x i32> addrspace(1)* undef br label %bb1 } + +attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/udivrem24.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udivrem24.ll +++ llvm/test/CodeGen/AMDGPU/udivrem24.ll @@ -21,6 +21,63 @@ ret void } +; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in_out: +; SI: v_cvt_f32_ubyte +; SI-DAG: v_cvt_f32_ubyte +; SI-DAG: v_rcp_iflag_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define amdgpu_kernel void @udiv24_i8_denorm_flush_in_out(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = udiv i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_in: +; SI: v_cvt_f32_ubyte +; SI-DAG: v_cvt_f32_ubyte +; SI-DAG: v_rcp_iflag_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define amdgpu_kernel void @udiv24_i8_denorm_flush_in(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = udiv i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}udiv24_i8_denorm_flush_out: +; SI: v_cvt_f32_ubyte +; SI-DAG: v_cvt_f32_ubyte +; SI-DAG: v_rcp_iflag_f32 +; SI: v_cvt_u32_f32 + +; EG: UINT_TO_FLT +; EG-DAG: UINT_TO_FLT +; EG-DAG: RECIP_IEEE +; EG: FLT_TO_UINT +define amdgpu_kernel void @udiv24_i8_denorm_flush_out(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #2 { + %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1 + %num = load i8, i8 addrspace(1) * %in + %den = load i8, i8 addrspace(1) * %den_ptr + %result = udiv i8 %num, %den + store i8 %result, i8 addrspace(1)* %out + ret void +} + ; FUNC-LABEL: {{^}}udiv24_i16: ; SI: v_cvt_f32_u32 ; SI: v_cvt_f32_u32 @@ -325,3 +382,7 @@ store i32 %result, i32 addrspace(1)* %out, align 4 ret void } + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } +attributes #1 = { "denormal-fp-math-f32"="ieee,preserve-sign" } +attributes #2 = { "denormal-fp-math-f32"="preserve-sign,ieee" } Index: llvm/test/CodeGen/AMDGPU/udivrem64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/udivrem64.ll +++ llvm/test/CodeGen/AMDGPU/udivrem64.ll @@ -1,6 +1,6 @@ -;RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s -;RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-fp32-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s +;RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s ;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s ;FUNC-LABEL: {{^}}test_udiv: @@ -44,7 +44,7 @@ ;GCN: v_trunc_f32_e32 ;GCN: v_mac_f32_e32 v{{[0-9]+}}, 0xcf800000 ;GCN: s_endpgm -define amdgpu_kernel void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %result = udiv i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -91,7 +91,7 @@ ;GCN: v_trunc_f32_e32 ;GCN: v_mac_f32_e32 v{{[0-9]+}}, 0xcf800000 ;GCN: s_endpgm -define amdgpu_kernel void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out ret void @@ -106,7 +106,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define amdgpu_kernel void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = udiv i64 %1, %2 @@ -123,7 +123,7 @@ ;SI-NOT: v_lshr_b64 ;VI-NOT: v_lshrrev_b64 ;GCN: s_endpgm -define amdgpu_kernel void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %1 = lshr i64 %x, 33 %2 = lshr i64 %y, 33 %result = urem i64 %1, %2 @@ -142,7 +142,7 @@ ;VI-NOT: v_lshrrev_b64 ;GCN: v_mad_f32 ;GCN: s_endpgm -define amdgpu_kernel void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 41 %result = udiv i64 %1, %2 @@ -161,7 +161,7 @@ ;VI-NOT: v_lshrrev_b64 ;GCN: v_mad_f32 ;GCN: s_endpgm -define amdgpu_kernel void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) { +define amdgpu_kernel void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) #0 { %1 = lshr i64 %x, 41 %2 = lshr i64 %y, 41 %result = urem i64 %1, %2 @@ -177,8 +177,10 @@ ;GCN: v_addc ;GCN: v_addc ;GCN: s_endpgm -define amdgpu_kernel void @test_udiv_k(i64 addrspace(1)* %out, i64 %x) { +define amdgpu_kernel void @test_udiv_k(i64 addrspace(1)* %out, i64 %x) #0 { %result = udiv i64 24, %x store i64 %result, i64 addrspace(1)* %out ret void } + +attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" } Index: llvm/test/CodeGen/AMDGPU/v_mac.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/v_mac.ll +++ llvm/test/CodeGen/AMDGPU/v_mac.ll @@ -1,6 +1,6 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals,+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s ; GCN-LABEL: {{^}}mac_vvv: ; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}} Index: llvm/test/CodeGen/AMDGPU/v_mac_f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/v_mac_f16.ll +++ llvm/test/CodeGen/AMDGPU/v_mac_f16.ll @@ -1,5 +1,5 @@ -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -mattr=-fp32-denormals,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-fp32-denormals,-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}mac_f16: ; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]] @@ -677,6 +677,6 @@ declare void @llvm.amdgcn.s.barrier() #2 -attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" } -attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" } +attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" "denormal-fp-math"="preserve-sign,preserve-sign" } +attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" "denormal-fp-math"="preserve-sign,preserve-sign" } attributes #2 = { nounwind convergent } Index: llvm/test/CodeGen/AMDGPU/v_madak_f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -mattr=-fp32-denormals,-fp64-fp16-denormals -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI -; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-fp32-denormals,-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,SI +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,VI define amdgpu_kernel void @madak_f16( ; SI-LABEL: madak_f16: @@ -52,7 +52,7 @@ ; VI-NEXT: s_endpgm half addrspace(1)* %r, half addrspace(1)* %a, - half addrspace(1)* %b) { + half addrspace(1)* %b) #0 { entry: %a.val = load half, half addrspace(1)* %a %b.val = load half, half addrspace(1)* %b @@ -136,7 +136,7 @@ half addrspace(1)* %r1, half addrspace(1)* %a, half addrspace(1)* %b, - half addrspace(1)* %c) { + half addrspace(1)* %c) #0 { entry: %a.val = load volatile half, half addrspace(1)* %a %b.val = load volatile half, half addrspace(1)* %b @@ -151,3 +151,5 @@ store half %r1.val, half addrspace(1)* %r1 ret void } + +attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" }