Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -324,7 +324,6 @@
 bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
   Type *Ty = FDiv.getType();
 
-  // TODO: Handle half
   if (!Ty->getScalarType()->isFloatTy())
     return false;
 
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -2987,16 +2987,18 @@
   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath;
 
   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
-    if ((Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()))) {
-
+    if (Unsafe || (VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
+        VT == MVT::f16) {
       if (CLHS->isExactlyValue(1.0)) {
         // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
         // the CI documentation have a worst case error of 1 ulp.
         // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
         // use it as long as we aren't trying to use denormals.
+        //
+        // v_rcp_f16 and v_rsq_f16 DO support denormals.
 
         // 1.0 / sqrt(x) -> rsq(x)
-        //
+
         // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
         // error seems really high at 2^29 ULP.
         if (RHS.getOpcode() == ISD::FSQRT)
@@ -3071,6 +3073,9 @@
 }
 
 SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
+  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
+    return FastLowered;
+
   SDLoc SL(Op);
   SDValue Src0 = Op.getOperand(0);
   SDValue Src1 = Op.getOperand(1);
Index: test/CodeGen/AMDGPU/fdiv.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fdiv.f16.ll
+++ test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -1,9 +1,10 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; Make sure fdiv is promoted to f32.
 
-; GCN-LABEL: {{^}}fdiv_f16
+; GCN-LABEL: {{^}}v_fdiv_f16
 ; SI: v_cvt_f32_f16
 ; SI: v_cvt_f32_f16
 ; SI: v_div_scale_f32
@@ -19,8 +20,8 @@
 ; SI: v_div_fixup_f32
 ; SI: v_cvt_f16_f32
 
-; VI: buffer_load_ushort [[LHS:v[0-9]+]]
-; VI: buffer_load_ushort [[RHS:v[0-9]+]]
+; VI: flat_load_ushort [[LHS:v[0-9]+]]
+; VI: flat_load_ushort [[RHS:v[0-9]+]]
 
 ; VI-DAG: v_cvt_f32_f16_e32 [[CVT_LHS:v[0-9]+]], [[LHS]]
 ; VI-DAG: v_cvt_f32_f16_e32 [[CVT_RHS:v[0-9]+]], [[RHS]]
@@ -29,15 +30,184 @@
 ; VI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[RCP_RHS]], [[CVT_LHS]]
 ; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
 ; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
-; VI: buffer_store_short [[RESULT]]
-define void @fdiv_f16(
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @v_fdiv_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
-    half addrspace(1)* %b) {
+    half addrspace(1)* %b) #0 {
 entry:
-  %a.val = load volatile half, half addrspace(1)* %a
-  %b.val = load volatile half, half addrspace(1)* %b
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %a.val = load volatile half, half addrspace(1)* %gep.a
+  %b.val = load volatile half, half addrspace(1)* %gep.b
   %r.val = fdiv half %a.val, %b.val
-  store half %r.val, half addrspace(1)* %r
+  store half %r.val, half addrspace(1)* %gep.r
   ret void
 }
+
+; GCN-LABEL: {{^}}v_rcp_f16:
+; VI: flat_load_ushort [[VAL:v[0-9]+]]
+; VI-NOT: [[VAL]]
+; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; VI-NOT: [[RESULT]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %r.val = fdiv half 1.0, %b.val
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_rcp_f16_abs:
+; VI: flat_load_ushort [[VAL:v[0-9]+]]
+; VI-NOT: [[VAL]]
+; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
+; VI-NOT: [[RESULT]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %b.abs = call half @llvm.fabs.f16(half %b.val)
+  %r.val = fdiv half 1.0, %b.abs
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_rcp_f16_arcp:
+; VI: flat_load_ushort [[VAL:v[0-9]+]]
+; VI-NOT: [[VAL]]
+; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; VI-NOT: [[RESULT]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %r.val = fdiv arcp half 1.0, %b.val
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_rcp_f16_neg:
+; VI: flat_load_ushort [[VAL:v[0-9]+]]
+; VI-NOT: [[VAL]]
+; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
+; VI-NOT: [[RESULT]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %r.val = fdiv half -1.0, %b.val
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_rsq_f16:
+; VI: flat_load_ushort [[VAL:v[0-9]+]]
+; VI-NOT: [[VAL]]
+; VI: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; VI-NOT: [[RESULT]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
+  %r.val = fdiv half 1.0, %b.sqrt
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_rsq_f16_neg:
+; VI: flat_load_ushort [[VAL:v[0-9]+]]
+; VI-NOT: [[VAL]]
+; VI: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]]
+; VI-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
+; VI-NOT: [[RESULT]]
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %b.sqrt = call half @llvm.sqrt.f16(half %b.val)
+  %r.val = fdiv half -1.0, %b.sqrt
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fdiv_f16_arcp:
+; VI: flat_load_ushort [[LHS:v[0-9]+]]
+; VI: flat_load_ushort [[RHS:v[0-9]+]]
+
+; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
+; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
+
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %a.val = load volatile half, half addrspace(1)* %gep.a
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %r.val = fdiv arcp half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fdiv_f16_unsafe:
+; VI: flat_load_ushort [[LHS:v[0-9]+]]
+; VI: flat_load_ushort [[RHS:v[0-9]+]]
+
+; VI: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]]
+; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
+
+; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %gep.a = getelementptr inbounds half, half addrspace(1)* %a, i64 %tid.ext
+  %gep.b = getelementptr inbounds half, half addrspace(1)* %b, i64 %tid.ext
+  %gep.r = getelementptr inbounds half, half addrspace(1)* %r, i64 %tid.ext
+  %a.val = load volatile half, half addrspace(1)* %gep.a
+  %b.val = load volatile half, half addrspace(1)* %gep.b
+  %r.val = fdiv half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %gep.r
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare half @llvm.sqrt.f16(half) #1
+declare half @llvm.fabs.f16(half) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "unsafe-fp-math"="true" }
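
For reference, a minimal sketch of the two IR shapes this change affects, not
part of the diff itself; the function names below are illustrative only. Per
the checks above, a 1.0 / x fdiv on f16 now selects v_rcp_f16 on VI under
either fp16-denormals setting, since v_rcp_f16 supports denormals, while a
general a / b fdiv still needs arcp (or "unsafe-fp-math"="true") before it
lowers to a reciprocal followed by a multiply.

  ; Expected on VI: v_rcp_f16_e32, with or without fp16 denormals.
  define half @rcp_sketch(half %x) {
    %r = fdiv half 1.0, %x
    ret half %r
  }

  ; Expected on VI: v_rcp_f16_e32 then v_mul_f16_e32, enabled by arcp.
  define half @div_sketch(half %a, half %b) {
    %r = fdiv arcp half %a, %b
    ret half %r
  }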