diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1005,6 +1005,75 @@
     break;
   }
+  case Intrinsic::amdgcn_trig_preop: {
+    // Bits of 2/pi, matching the table the V_TRIG_PREOP_F64 instruction
+    // reads for Payne-Hanek style argument reduction.
+    const uint32_t TwoByPi[] = {
+        0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
+        0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
+        0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
+        0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
+        0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
+        0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
+        0x56033046};
+
+    Value *Src = II.getArgOperand(0);
+    Value *Segment = II.getArgOperand(1);
+    const ConstantFP *Csrc = dyn_cast<ConstantFP>(Src);
+    const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
+
+    // Fold only when both the source and the segment select are constants.
+    if (!(Csrc && Cseg))
+      break;
+
+    const APFloat &Fsrc = Csrc->getValueAPF();
+
+    const APInt &SegVal = Cseg->getUniqueInteger();
+    bool Ovflow;
+    unsigned Numbits = 32;
+    bool Signed = true;
+
+    // Bit offset into the table: 53 * segment, plus (exponent - 1077) when
+    // the biased exponent exceeds 1077.
+    APInt EClamp(Numbits, 1077, Signed);
+    APInt E = (Fsrc.bitcastToAPInt()).ashr(52);
+    E &= 0x7ff;
+    E = E.trunc(Numbits);
+    APInt Shift =
+        (E.sgt(EClamp) ? E.ssub_ov(EClamp, Ovflow) : APInt(Numbits, 0, Signed))
+            .sadd_ov(APInt(Numbits, 53, Signed).smul_ov(SegVal, Ovflow),
+                     Ovflow);
+    int32_t I = (Shift.ashr(5)).getSExtValue();
+    APInt Bshift = Shift & 0x1f;
+    // Read 96 bits of the table starting at 32-bit word I, then left-align
+    // the selected bits in Thi.
+    Numbits = 64;
+    Signed = false;
+    APInt Thi = APInt(Numbits,
+                      (((uint64_t)TwoByPi[I] << 32) | (uint64_t)TwoByPi[I + 1]),
+                      Signed);
+    APInt Tlo = APInt(Numbits, ((uint64_t)TwoByPi[I + 2] << 32), Signed);
+
+    if (Bshift.sgt(0)) {
+      Numbits = 32;
+      Signed = true;
+      Thi = (Thi.shl(Bshift)) |
+            (Tlo.lshr(APInt(Numbits, 64, Signed).ssub_ov(Bshift, Ovflow)));
+    }
+
+    // Keep the top 53 bits and scale the result the way the hardware does.
+    Thi = Thi.lshr(11);
+    APFloat Res = APFloat(Thi.roundToDouble());
+    int32_t Scale = -53 - Shift.getSExtValue();
+
+    // The hardware scales the result by 2^128 for large-magnitude inputs.
+    if (E.sge(0x7b0))
+      Scale += 128;
+
+    Res = scalbn(Res, Scale, RoundingMode::NearestTiesToEven);
+    double Resd = Res.convertToDouble();
+    return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Resd));
+  }
   case Intrinsic::amdgcn_fmul_legacy: {
     Value *Op0 = II.getArgOperand(0);
     Value *Op1 = II.getArgOperand(1);

diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx900 -instcombine < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx1010 -instcombine < %s | FileCheck -check-prefix=GCN %s

 declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone

@@ -28,3 +30,11 @@
   store double %result, double addrspace(1)* %out, align 8
   ret void
 }
+
+define protected amdgpu_kernel void @trig_preop_constfold(double addrspace(1)* nocapture %0, double addrspace(1)* nocapture readnone %1, i32 %2) {
+; GCN: store double 0x2F42371D2126E970, double addrspace(1)* %0, align 8
+; GCN-NEXT: ret void
+  %4 = tail call contract double @llvm.amdgcn.trig.preop.f64(double 3.454350e+02, i32 5)
+  store double %4, double addrspace(1)* %0, align 8
+  ret void
+}