Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -114,6 +114,11 @@ [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] >; +def int_amdgcn_rsq_legacy : GCCBuiltin<"__builtin_amdgcn_rsq_legacy">, + Intrinsic< + [llvm_float_ty], [llvm_float_ty], [IntrNoMem] +>; + def int_amdgcn_rsq_clamp : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1478,9 +1478,18 @@ DAG.getValueType(VT)); } -static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, EVT VT) { +static SDValue emitNonHSAIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), - "non-hsa intrinsic with hsa target"); + "non-hsa intrinsic with hsa target", + DL.getDebugLoc()); + DAG.getContext()->diagnose(BadIntrin); + return DAG.getUNDEF(VT); +} + +static SDValue emitRemovedIntrinsicError(SelectionDAG& DAG, SDLoc DL, EVT VT) { + DiagnosticInfoUnsupported BadIntrin(*DAG.getMachineFunction().getFunction(), + "intrinsic not supported on subtarget", + DL.getDebugLoc()); DAG.getContext()->diagnose(BadIntrin); return DAG.getUNDEF(VT); } @@ -1524,6 +1533,12 @@ case Intrinsic::amdgcn_rsq: case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_rsq_legacy: { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return emitRemovedIntrinsicError(DAG, DL, VT); + + return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + } case Intrinsic::amdgcn_rsq_clamp: case AMDGPUIntrinsic::AMDGPU_rsq_clamped: { // Legacy name if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) @@ -1541,55 +1556,55 @@ } case Intrinsic::r600_read_ngroups_x: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_X, false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Y, false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::NGROUPS_Z, false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_X, false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Y, false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), SI::KernelInputOffsets::GLOBAL_SIZE_Z, false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_X); case Intrinsic::r600_read_local_size_y: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Y); case Intrinsic::r600_read_local_size_z: if (Subtarget->isAmdHsaOS()) - return emitNonHSAIntrinsicError(DAG, VT); + return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); @@ -2547,6 +2562,9 @@ return DAG.getConstant(0, SDLoc(N), MVT::i1); } + if (N->getOperand(0).isUndef()) + return DAG.getUNDEF(MVT::i1); + return SDValue(); } @@ -2948,6 +2966,17 @@ return performClassCombine(N, DCI); case ISD::FCANONICALIZE: return performFCanonicalizeCombine(N, DCI); + case AMDGPUISD::FRACT: + case AMDGPUISD::RCP: + case AMDGPUISD::RSQ: + case AMDGPUISD::RSQ_LEGACY: + case AMDGPUISD::RSQ_CLAMP: + case AMDGPUISD::LDEXP: { + SDValue Src = N->getOperand(0); + if (Src.isUndef()) + return Src; + break; + } } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } Index: test/CodeGen/AMDGPU/llvm.amdgcn.class.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare i1 @llvm.amdgcn.class.f32(float, i32) #1 declare i1 @llvm.amdgcn.class.f64(double, i32) #1 @@ -495,5 +495,17 @@ ret void } +; FIXME: Why is the extension still here? +; SI-LABEL: {{^}}test_class_undef_f32: +; SI-NOT: v_cmp_class +; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, +; SI: buffer_store_dword +define void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 { + %result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1 + %sext = sext i1 %result to i32 + store i32 %sext, i32 addrspace(1)* %out, align 4 + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll @@ -20,5 +20,15 @@ ret void } +; GCN-LABEL: {{^}}v_fract_undef_f32: +; GCN-NOT: v_fract_f32 +; GCN-NOT: v0 +; GCN: buffer_store_dword v0 +define void @v_fract_undef_f32(float addrspace(1)* %out) #1 { + %fract = call float @llvm.amdgcn.fract.f32(float undef) + store float %fract, float addrspace(1)* %out + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare float @llvm.amdgcn.ldexp.f32(float, i32) nounwind readnone @@ -42,3 +42,11 @@ store double %result, double addrspace(1)* %out, align 8 ret void } + +; SI-LABEL: {{^}}test_ldexp_undef_f32: +; SI-NOT: v_ldexp_f32 +define void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind { + %result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone + store float %result, float addrspace(1)* %out, align 4 + ret void +} Index: test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll @@ -1,6 +1,6 @@ -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=SI -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s -; XUN: llc -march=amdgcn -mcpu=SI -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s +; XUN: llc -march=amdgcn -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp32-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE-SPDENORM -check-prefix=SI -check-prefix=FUNC %s @@ -69,5 +69,13 @@ ret void } +; FUNC-LABEL: {{^}}rcp_undef_f32: +; SI-NOT: v_rcp_f32 +define void @rcp_undef_f32(float addrspace(1)* %out) #1 { + %rcp = call float @llvm.amdgcn.rcp.f32(float undef) #0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll @@ -38,5 +38,13 @@ ret void } +; FUNC-LABEL: {{^}}rsq_clamp_undef_f32: +; SI-NOT: v_rsq_clamp_f32 +define void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 { + %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef) + store float %rsq_clamp, float addrspace(1)* %out + ret void +} + attributes #0 = { nounwind } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll @@ -0,0 +1,39 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.amdgcn.rsq.legacy(float) #0 + +; FUNC-LABEL: {{^}}rsq_legacy_f32: +; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} +define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { + %rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; TODO: Really these should be constant folded +; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0 +; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0 +define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 { + %rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0 +; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000 +define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 { + %rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rsq_legacy_undef_f32: +; SI-NOT: v_rsq_legacy_f32 +define void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 { + %rsq = call float @llvm.amdgcn.rsq.legacy(float undef) + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s declare float @llvm.amdgcn.rsq.f32(float) #0 @@ -56,5 +56,13 @@ ret void } +; FUNC-LABEL: {{^}}rsq_undef_f32: +; SI-NOT: v_rsq_f32 +define void @rsq_undef_f32(float addrspace(1)* %out) #1 { + %rsq = call float @llvm.amdgcn.rsq.f32(float undef) + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + attributes #0 = { nounwind readnone } attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/vi-removed-intrinsics.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/vi-removed-intrinsics.ll @@ -0,0 +1,24 @@ +; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck -check-prefix=ERROR %s + +; ERROR: error: :1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget + +declare float @llvm.amdgcn.rsq.legacy(float) #0 + +define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 { + %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4 + store float %rsq, float addrspace(1)* %out, align 4 + ret void +} + +attributes #0 = { nounwind readnone } +attributes #1 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3} + +!0 = distinct !DICompileUnit(language: DW_LANG_OpenCL, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug) +!1 = !DIFile(filename: "foo.cl", directory: "/dev/null") +!2 = !{i32 2, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocation(line: 1, column: 42, scope: !5) +!5 = distinct !DISubprogram(name: "rsq_legacy_f32", scope: null, line: 1, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0)