Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -53,6 +53,7 @@
   SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) const;
 
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -13,6 +13,10 @@
 //
 //===----------------------------------------------------------------------===//
 
+#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f
+#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f
+#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f
+
 #include "AMDGPUISelLowering.h"
 #include "AMDGPU.h"
 #include "AMDGPUCallLowering.h"
@@ -260,6 +264,14 @@
   setOperationAction(ISD::FROUND, MVT::f32, Custom);
   setOperationAction(ISD::FROUND, MVT::f64, Custom);
 
+  setOperationAction(ISD::FLOG, MVT::f32, Custom);
+  setOperationAction(ISD::FLOG10, MVT::f32, Custom);
+
+  if (Subtarget->has16BitInsts()) {
+    setOperationAction(ISD::FLOG, MVT::f16, Custom);
+    setOperationAction(ISD::FLOG10, MVT::f16, Custom);
+  }
+
   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);
 
@@ -429,6 +441,8 @@
     setOperationAction(ISD::FEXP2, VT, Expand);
     setOperationAction(ISD::FLOG2, VT, Expand);
     setOperationAction(ISD::FREM, VT, Expand);
+    setOperationAction(ISD::FLOG, VT, Expand);
+    setOperationAction(ISD::FLOG10, VT, Expand);
     setOperationAction(ISD::FPOW, VT, Expand);
     setOperationAction(ISD::FFLOOR, VT, Expand);
     setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -961,6 +975,9 @@
   case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
   case ISD::FROUND: return LowerFROUND(Op, DAG);
   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
+  case ISD::FLOG:
+  case ISD::FLOG10:
+    return LowerFLOG(Op, DAG);
   case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
   case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
   case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
@@ -1875,6 +1892,39 @@
   return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
 }
 
+/// Custom lowering for ISD::FLOG and ISD::FLOG10.
+///
+/// Builds FLOG2(x) and divides it by log2 of the requested base, which is
+/// emitted as an FP constant of the result type: AMDGPU_LOG2E_F for FLOG and
+/// AMDGPU_LN10_F / AMDGPU_LN2_F (log2(10)) for FLOG10.  Any other opcode is
+/// unreachable.
+///
+/// NOTE(review): the divisor is a compile-time constant, so an FMUL by its
+/// reciprocal would presumably select fewer instructions than this FDIV.  The
+/// accompanying tests check for the v_div_scale_f32 sequence, so switching
+/// requires regenerating their CHECK lines.
+SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+
+  SDLoc SL(Op);
+  SDValue Operand = Op.getOperand(0);
+
+  SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
+  SDValue Log2Base;
+  switch (Op.getOpcode()) {
+  case ISD::FLOG:
+    Log2Base = DAG.getConstantFP(AMDGPU_LOG2E_F, SL, VT);
+    break;
+  case ISD::FLOG10:
+    Log2Base = DAG.getConstantFP(AMDGPU_LN10_F / AMDGPU_LN2_F, SL, VT);
+    break;
+  default:
+    llvm_unreachable("Wrong log opcode");
+  }
+
+  return DAG.getNode(ISD::FDIV, SL, VT, Log2Operand, Log2Base);
+}
+
 SDValue AMDGPUTargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
Index: test/CodeGen/AMDGPU/llvm.log.f16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.log.f16.ll
@@ -0,0 +1,70 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI -check-prefix=VIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VIGFX9 %s
+
+declare half @llvm.log.f16(half %a)
+declare <2 x half> @llvm.log.v2f16(<2 x half> %a)
+
+; GCN-LABEL: {{^}}log_f16
+; GCN: buffer_load_ushort v[[A_F16_0:[0-9]+]]
+; SI: v_mov_b32_e32 v[[A_F32_1:[0-9]+]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
+; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
+; SI: v_div_scale_f32 v[[A_F32_2:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32_1]], v[[A_F32_1]], v[[A_F32_0]]
+; SI: v_rcp_f32_e32 v[[A_F32_3:[0-9]+]], v[[A_F32_2:[0-9]+]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; VIGFX9: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
+; VIGFX9: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
+; VIGFX9: v_mul_f32_e32 v[[A_F32_1]], 0x3f317bed, v[[A_F32_1]]
+; VIGFX9: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
+; GCN: buffer_store_short v[[R_F16_0]]
+; GCN: s_endpgm
+define void @log_f16(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %r.val = call half @llvm.log.f16(half %a.val)
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}log_v2f16
+; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]]
+; SI: v_mov_b32_e32 v[[A_F32_2:[0-9]+]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
+; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
+; SI: v_div_scale_f32 v[[A_F32_3:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32_2]], v[[A_F32_2]], v[[R_F32_1]]
+; SI: v_rcp_f32_e32 v[[R_F32_4:[0-9]+]], v[[A_F32_3]]
+; SI: v_div_scale_f32 v[[A_F32_5:[0-9]+]], vcc, v[[A_F32_1]], v[[A_F32_2]], v[[R_F32_1]]
+; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; VIGFX9: v_mov_b32_e32 v[[A_F32_4:[0-9]+]]
+; VI: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9: v_log_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_0]]
+; VIGFX9: v_cvt_f32_f16_e32 v[[A_F32_3:[0-9]+]], v[[R_F16_1]]
+; VIGFX9: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
+; VIGFX9: v_cvt_f32_f16_e32 v[[A_F32_2:[0-9]+]], v[[R_F16_0]]
+; VIGFX9: v_mul_f32_e32 v[[A_F32_3]], v[[A_F32_4]], v[[A_F32_3]]
+; VIGFX9: v_cvt_f16_f32_e32 v[[R_F16_3:[0-9]+]], v[[A_F32_3]]
+; VIGFX9: v_mul_f32_e32 v[[A_F32_2]], v[[A_F32_4]], v[[A_F32_2]]
+; VIGFX9: v_cvt_f16_f32_e32 v[[A_F32_2]], v[[A_F32_2]]
+; NO: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_1]]
+; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_lshl_or_b32 v[[R_F32_0:[0-9]+]], v[[R_F32_0]], 16, v[[R_F16_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI: v_or_b32_e32 v[[R_F32_0]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_sdwa v[[R_F32_0:[0-9]+]], v[[R_F16_HI]], v[[R_F32_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GCN: buffer_store_dword v[[R_F32_0]]
+; GCN: s_endpgm
+define void @log_v2f16(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a) {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %r.val = call <2 x half> @llvm.log.v2f16(<2 x half> %a.val)
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
Index: test/CodeGen/AMDGPU/llvm.log.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.log.ll
@@ -0,0 +1,94 @@
+; RUN: llc < %s -march=amdgcn | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+
+; FUNC-LABEL: {{^}}test:
+; EG: LOG_IEEE
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+
+define void @test(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = call float @llvm.log.f32(float %in)
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}testv2:
+; EG: LOG_IEEE
+; EG: LOG_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+
+define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+  %0 = call <2 x float> @llvm.log.v2f32(<2 x float> %in)
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}testv4:
+; EG: LOG_IEEE
+; EG: LOG_IEEE
+; EG: LOG_IEEE
+; EG: LOG_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+  %0 = call <4 x float> @llvm.log.v4f32(<4 x float> %in)
+  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.log.f32(float) readnone
+declare <2 x float> @llvm.log.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.log.v4f32(<4 x float>) readnone
Index: test/CodeGen/AMDGPU/llvm.log10.f16.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.log10.f16.ll
@@ -0,0 +1,70 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI -check-prefix=VIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VIGFX9 %s
+
+declare half @llvm.log10.f16(half %a)
+declare <2 x half> @llvm.log10.v2f16(<2 x half> %a)
+
+; GCN-LABEL: {{^}}log10_f16
+; GCN: buffer_load_ushort v[[A_F16_0:[0-9]+]]
+; SI: v_mov_b32_e32 v[[A_F32_1:[0-9]+]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
+; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
+; SI: v_div_scale_f32 v[[A_F32_2:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32_1]], v[[A_F32_1]], v[[A_F32_0]]
+; SI: v_rcp_f32_e32 v[[A_F32_3:[0-9]+]], v[[A_F32_2:[0-9]+]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; VIGFX9: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
+; VIGFX9: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_0]]
+; VIGFX9: v_mul_f32_e32 v[[A_F32_1]], 0x3e9a1c98, v[[A_F32_1]]
+; VIGFX9: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
+; GCN: buffer_store_short v[[R_F16_0]]
+; GCN: s_endpgm
+define void @log10_f16(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %r.val = call half @llvm.log10.f16(half %a.val)
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}log10_v2f16
+; GCN: buffer_load_dword v[[A_F16_0:[0-9]+]]
+; SI: v_mov_b32_e32 v[[A_F32_2:[0-9]+]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_F16_0]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_F16_0]]
+; SI: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
+; SI: v_div_scale_f32 v[[A_F32_3:[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, v[[A_F32_2]], v[[A_F32_2]], v[[R_F32_1]]
+; SI: v_rcp_f32_e32 v[[R_F32_4:[0-9]+]], v[[A_F32_3]]
+; SI: v_div_scale_f32 v[[A_F32_5:[0-9]+]], vcc, v[[A_F32_1]], v[[A_F32_2]], v[[R_F32_1]]
+; SI: v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; VIGFX9: v_mov_b32_e32 v[[A_F32_4:[0-9]+]]
+; VI: v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_F16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9: v_log_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_0]]
+; VIGFX9: v_cvt_f32_f16_e32 v[[A_F32_3:[0-9]+]], v[[R_F16_1]]
+; VIGFX9: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_F16_0]]
+; VIGFX9: v_cvt_f32_f16_e32 v[[A_F32_2:[0-9]+]], v[[R_F16_0]]
+; VIGFX9: v_mul_f32_e32 v[[A_F32_3]], v[[A_F32_4]], v[[A_F32_3]]
+; VIGFX9: v_cvt_f16_f32_e32 v[[R_F16_3:[0-9]+]], v[[A_F32_3]]
+; VIGFX9: v_mul_f32_e32 v[[A_F32_2]], v[[A_F32_4]], v[[A_F32_2]]
+; VIGFX9: v_cvt_f16_f32_e32 v[[A_F32_2]], v[[A_F32_2]]
+; NO: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_1]]
+; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GFX9: v_lshl_or_b32 v[[R_F32_0:[0-9]+]], v[[R_F32_0]], 16, v[[R_F16_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI: v_or_b32_e32 v[[R_F32_0]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_sdwa v[[R_F32_0:[0-9]+]], v[[R_F16_HI]], v[[R_F32_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GCN: buffer_store_dword v[[R_F32_0]]
+; GCN: s_endpgm
+define void @log10_v2f16(
+    <2 x half> addrspace(1)* %r,
+    <2 x half> addrspace(1)* %a) {
+entry:
+  %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
+  %r.val = call <2 x half> @llvm.log10.v2f16(<2 x half> %a.val)
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
Index: test/CodeGen/AMDGPU/llvm.log10.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.log10.ll
@@ -0,0 +1,94 @@
+; RUN: llc < %s -march=amdgcn | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
+; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
+
+; FUNC-LABEL: {{^}}test:
+; EG: LOG_IEEE
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+
+define void @test(float addrspace(1)* %out, float %in) {
+entry:
+  %0 = call float @llvm.log10.f32(float %in)
+  store float %0, float addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}testv2:
+; EG: LOG_IEEE
+; EG: LOG_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+
+define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+entry:
+  %0 = call <2 x float> @llvm.log10.v2f32(<2 x float> %in)
+  store <2 x float> %0, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}testv4:
+; EG: LOG_IEEE
+; EG: LOG_IEEE
+; EG: LOG_IEEE
+; EG: LOG_IEEE
+; FIXME: We should be able to merge these packets together on Cayman so we
+; have a maximum of 4 instructions.
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}} (MASKED)
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+; SI: v_log_f32
+; SI: v_div_scale_f32
+; SI: v_div_scale_f32
+define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+entry:
+  %0 = call <4 x float> @llvm.log10.v4f32(<4 x float> %in)
+  store <4 x float> %0, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.log10.f32(float) readnone
+declare <2 x float> @llvm.log10.v2f32(<2 x float>) readnone
+declare <4 x float> @llvm.log10.v4f32(<4 x float>) readnone