diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -968,6 +968,58 @@
     break;
   }
+  case Intrinsic::amdgcn_trig_preop: {
+    // V_TRIG_PREOP_F64 returns 53 significant bits of 2/PI, selected by the
+    // source's exponent and the segment operand, for use in Payne-Hanek style
+    // argument reduction. Fold it when both operands are constants.
+    static const uint64_t TwoByPi[] = {
+        0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0,
+        0xdb629599, 0x3c439041, 0xfe5163ab, 0xdebbc561,
+        0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
+        0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484,
+        0xe99c7026, 0xb45f7e41, 0x3991d639, 0x835339f4,
+        0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
+        0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7,
+        0x4f463f66, 0x9e5fea2d, 0x7527bac7, 0xebe5f17b,
+        0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
+        0x56033046};
+
+    Value *Src = II.getArgOperand(0);
+    Value *Segment = II.getArgOperand(1);
+    const auto *Csrc = dyn_cast<ConstantFP>(Src);
+    const auto *Cseg = dyn_cast<ConstantInt>(Segment);
+    if (!Csrc || !Cseg)
+      break;
+
+    double Dsrc = Csrc->getValueAPF().convertToDouble();
+    int Iseg = Cseg->getSExtValue();
+
+    // The selected window starts 53 * segment bits into the table; sources
+    // with a biased exponent above Eclamp push it further along.
+    const int Eclamp = 1077;
+    int E = (DoubleToBits(Dsrc) >> 52) & 0x7ff;
+    int Shift = (E > Eclamp ? E - Eclamp : 0) + 53 * Iseg;
+    int I = Shift >> 5;
+    int Bshift = Shift & 0x1f;
+    // Leave selects that would read outside the table to the backend.
+    if (Shift < 0 || I + 2 >= (int)(sizeof(TwoByPi) / sizeof(TwoByPi[0])))
+      break;
+
+    // Splice 64 bits of 2/PI out of the three 32-bit chunks the window
+    // overlaps, then keep the top 53.
+    uint64_t Thi = (TwoByPi[I] << 32) | TwoByPi[I + 1];
+    uint64_t Tlo = TwoByPi[I + 2] << 32;
+    if (Bshift > 0)
+      Thi = (Thi << Bshift) | (Tlo >> (64 - Bshift));
+    Thi >>= 11;
+
+    double Res = (double)Thi;
+    int Scale = -53 - Shift;
+    if (E >= 0x7b0) // very large sources have the result pre-scaled by 2^128
+      Scale += 128;
+    Res = ldexp(Res, Scale);
+    return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Res));
+  }
   case Intrinsic::amdgcn_fmul_legacy: {
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);
 
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx900 -instcombine < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1010 -instcombine < %s | FileCheck -check-prefix=GCN %s
 
 declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
 
@@ -28,3 +30,11 @@
   store double %result, double addrspace(1)* %out, align 8
   ret void
 }
+
+define protected amdgpu_kernel void @trig_preop_constfold(double addrspace(1)* nocapture %0, double addrspace(1)* nocapture readnone %1, i32 %2) {
+; GCN: store double 0x2F42371D2126E970, double addrspace(1)* %0, align 8
+; GCN-NEXT: ret void
+  %4 = tail call contract double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5)
+  store double %4, double addrspace(1)* %0, align 8
+  ret void
+}
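
For reviewers: below is a minimal standalone harness, not part of the patch, that mirrors the fold above and prints the folded bit pattern for the inputs used in the new test. It assumes the host double is IEEE-754 binary64, which is the same assumption the fold makes when it calls convertToDouble(); the file name and helper name are hypothetical.

// trig_preop_fold_check.cpp -- sketch of the fold as a plain C++ program.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Local stand-in for llvm::DoubleToBits: a memcpy-based bit reinterpretation,
// which is well-defined, unlike a union-based pun.
static uint64_t doubleToBits(double D) {
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  return Bits;
}

int main() {
  static const uint64_t TwoByPi[] = {
      0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
      0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
      0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
      0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
      0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
      0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
      0x56033046};

  const double Src = 345.435; // the constant from the new test
  const int Iseg = 5;         // the segment from the new test

  const int Eclamp = 1077;
  int E = (doubleToBits(Src) >> 52) & 0x7ff; // biased exponent (1031 here)
  int Shift = (E > Eclamp ? E - Eclamp : 0) + 53 * Iseg;
  int I = Shift >> 5;        // index of the first 32-bit chunk of the window
  int Bshift = Shift & 0x1f; // bit offset within that chunk

  // Splice 64 bits of 2/PI out of three chunks, then keep the top 53.
  uint64_t Thi = (TwoByPi[I] << 32) | TwoByPi[I + 1];
  uint64_t Tlo = TwoByPi[I + 2] << 32;
  if (Bshift > 0)
    Thi = (Thi << Bshift) | (Tlo >> (64 - Bshift));
  Thi >>= 11;

  int Scale = -53 - Shift;
  if (E >= 0x7b0) // very large sources have the result pre-scaled by 2^128
    Scale += 128;
  double Res = std::ldexp((double)Thi, Scale);

  // Should print 0x2F42371D2126E970, the constant the GCN check lines expect.
  std::printf("0x%016llX\n", (unsigned long long)doubleToBits(Res));
  return 0;
}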