Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -56,10 +56,13 @@ Intrinsic<[], [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i32_ty], []>, GCCBuiltin<"__builtin_r600_rat_store_typed">; -def int_r600_rsq : Intrinsic< +def int_r600_recipsqrt_ieee : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] >; +def int_r600_recipsqrt_clamped : Intrinsic< + [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] +>; } // End TargetPrefix = "r600" Index: lib/Target/AMDGPU/AMDGPUIntrinsics.td =================================================================== --- lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -29,10 +29,6 @@ def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_rsq_clamped : Intrinsic< - [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] - >; - // Deprecated in favor of llvm.amdgcn.rsq def int_AMDGPU_rsq : Intrinsic< [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem] Index: lib/Target/AMDGPU/R600ISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/R600ISelLowering.cpp +++ lib/Target/AMDGPU/R600ISelLowering.cpp @@ -831,15 +831,13 @@ return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, AMDGPU::T0_Z, VT); - // FIXME: Should be renamed to r600 prefix - case AMDGPUIntrinsic::AMDGPU_rsq_clamped: - return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); + case Intrinsic::r600_recipsqrt_ieee: + return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1)); - case Intrinsic::r600_rsq: - case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name - // XXX - I'm assuming SI's RSQ_LEGACY matches R600's behavior. 
- return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + case Intrinsic::r600_recipsqrt_clamped: + return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); } + // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) break; } Index: lib/Target/AMDGPU/R600Instructions.td =================================================================== --- lib/Target/AMDGPU/R600Instructions.td +++ lib/Target/AMDGPU/R600Instructions.td @@ -1140,8 +1140,7 @@ } class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP_Helper < - inst, "RECIPSQRT_IEEE", AMDGPUrsq_legacy -> { + inst, "RECIPSQRT_IEEE", AMDGPUrsq> { let Itinerary = TransALU; } Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1696,8 +1696,7 @@ return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); } - case Intrinsic::amdgcn_rsq_clamp: - case AMDGPUIntrinsic::AMDGPU_rsq_clamped: { // Legacy name + case Intrinsic::amdgcn_rsq_clamp: { if (Subtarget->getGeneration() < SISubtarget::VOLCANIC_ISLANDS) return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1)); Index: test/CodeGen/AMDGPU/big_alu.ll =================================================================== --- test/CodeGen/AMDGPU/big_alu.ll +++ test/CodeGen/AMDGPU/big_alu.ll @@ -99,7 +99,7 @@ %tmp88 = insertelement <4 x float> %tmp87, float %tmp32, i32 2 %tmp89 = insertelement <4 x float> %tmp88, float 0.000000e+00, i32 3 %tmp90 = call float @llvm.r600.dot4(<4 x float> %tmp85, <4 x float> %tmp89) - %tmp91 = call float @llvm.AMDGPU.rsq.clamped.f32(float %tmp90) + %tmp91 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp90) %tmp92 = fmul float %tmp30, %tmp91 %tmp93 = fmul float %tmp31, %tmp91 %tmp94 = fmul float %tmp32, %tmp91 @@ -198,7 +198,7 @@ %tmp181 = fadd float %tmp180, %tmp28 %tmp182 = fdiv float 1.000000e+00, %tmp33 %tmp183 = fmul float %tmp32, %tmp182 - %tmp184 = 
call float @fabs(float %tmp183) + %tmp184 = call float @llvm.fabs.f32(float %tmp183) %tmp185 = fmul float %tmp176, 0x3FD99999A0000000 %tmp186 = fadd float %tmp185, 0x3FAEB851E0000000 %tmp187 = fmul float %tmp177, 0x3FE3333340000000 @@ -350,7 +350,7 @@ %tmp329 = insertelement <4 x float> %tmp328, float %tmp322, i32 2 %tmp330 = insertelement <4 x float> %tmp329, float 0.000000e+00, i32 3 %tmp331 = call float @llvm.r600.dot4(<4 x float> %tmp326, <4 x float> %tmp330) - %tmp332 = call float @llvm.AMDGPU.rsq.clamped.f32(float %tmp331) + %tmp332 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp331) %tmp333 = fmul float %tmp318, %tmp332 %tmp334 = fmul float %tmp320, %tmp332 %tmp335 = fmul float %tmp322, %tmp332 @@ -383,9 +383,9 @@ %tmp362 = insertelement <4 x float> %tmp361, float %tmp45, i32 2 %tmp363 = insertelement <4 x float> %tmp362, float 0.000000e+00, i32 3 %tmp364 = call float @llvm.r600.dot4(<4 x float> %tmp359, <4 x float> %tmp363) - %tmp365 = call float @llvm.AMDGPU.rsq.clamped.f32(float %tmp364) + %tmp365 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp364) %tmp366 = fmul float %tmp45, %tmp365 - %tmp367 = call float @fabs(float %tmp366) + %tmp367 = call float @llvm.fabs.f32(float %tmp366) %tmp368 = fmul float %tmp178, 0x3FECCCCCC0000000 %tmp369 = fadd float %tmp368, %tmp367 %tmp370 = fadd float %tmp369, 0xBFEFAE1480000000 @@ -409,9 +409,9 @@ %tmp388 = insertelement <4 x float> %tmp387, float %tmp45, i32 2 %tmp389 = insertelement <4 x float> %tmp388, float 0.000000e+00, i32 3 %tmp390 = call float @llvm.r600.dot4(<4 x float> %tmp385, <4 x float> %tmp389) - %tmp391 = call float @llvm.AMDGPU.rsq.clamped.f32(float %tmp390) + %tmp391 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp390) %tmp392 = fmul float %tmp45, %tmp391 - %tmp393 = call float @fabs(float %tmp392) + %tmp393 = call float @llvm.fabs.f32(float %tmp392) %tmp394 = fmul float %tmp178, 0x3FF51EB860000000 %tmp395 = fadd float %tmp394, %tmp393 %tmp396 = fadd float %tmp395, 
0xBFEFAE1480000000 @@ -1150,9 +1150,9 @@ %tmp875 = insertelement <4 x float> %tmp874, float %tmp45, i32 2 %tmp876 = insertelement <4 x float> %tmp875, float 0.000000e+00, i32 3 %tmp877 = call float @llvm.r600.dot4(<4 x float> %tmp872, <4 x float> %tmp876) - %tmp878 = call float @llvm.AMDGPU.rsq.clamped.f32(float %tmp877) + %tmp878 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp877) %tmp879 = fmul float %tmp45, %tmp878 - %tmp880 = call float @fabs(float %tmp879) + %tmp880 = call float @llvm.fabs.f32(float %tmp879) %tmp881 = fmul float %tmp178, 0x3FECCCCCC0000000 %tmp882 = fadd float %tmp881, %tmp880 %tmp883 = fadd float %tmp882, 0xBFEFAE1480000000 @@ -1292,10 +1292,10 @@ declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #0 ; Function Attrs: nounwind readnone -declare float @llvm.AMDGPU.rsq.clamped.f32(float) #0 +declare float @llvm.r600.recipsqrt.clamped.f32(float) #0 ; Function Attrs: nounwind readonly -declare float @fabs(float) #1 +declare float @llvm.fabs.f32(float) #1 ; Function Attrs: nounwind readnone declare float @llvm.exp2.f32(float) #0 Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.f64.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s - -declare double @llvm.AMDGPU.rsq.clamped.f64(double) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_clamped_f64: -; SI: v_rsq_clamp_f64_e32 - -; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}] -; TODO: this constant should be folded: -; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1 -; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff -; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]] -; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff -; VI-DAG: 
v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]] - -define void @rsq_clamped_f64(double addrspace(1)* %out, double %src) nounwind { - %rsq_clamped = call double @llvm.AMDGPU.rsq.clamped.f64(double %src) nounwind readnone - store double %rsq_clamped, double addrspace(1)* %out, align 8 - ret void -} Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.clamped.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -; FIXME: Uses of this should be moved to llvm.amdgcn.rsq.clamped, and -; an r600 variant added. - -declare float @llvm.AMDGPU.rsq.clamped.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_clamped_f32: -; SI: v_rsq_clamp_f32_e32 - -; VI-DAG: v_rsq_f32_e32 [[RSQ:v[0-9]+]], {{s[0-9]+}} -; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]] -; TODO: this constant should be folded: -; VI-DAG: v_mov_b32_e32 [[MINFLT:v[0-9]+]], 0xff7fffff -; VI: v_max_f32_e32 {{v[0-9]+}}, [[MIN]], [[MINFLT]] - -; EG: RECIPSQRT_CLAMPED - -define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq_clamped = call float @llvm.AMDGPU.rsq.clamped.f32(float %src) nounwind readnone - store float %rsq_clamped, float addrspace(1)* %out, align 4 - ret void -} Index: test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.rsq.ll +++ /dev/null @@ -1,33 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | 
FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s - -declare float @llvm.AMDGPU.rsq.f32(float) nounwind readnone - -; FUNC-LABEL: {{^}}rsq_f32: -; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}} -; EG: RECIPSQRT_IEEE -define void @rsq_f32(float addrspace(1)* %out, float %src) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float %src) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} - -; TODO: Really these should be constant folded -; FUNC-LABEL: {{^}}rsq_f32_constant_4.0 -; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0 -; EG: RECIPSQRT_IEEE -define void @rsq_f32_constant_4.0(float addrspace(1)* %out) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float 4.0) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} - -; FUNC-LABEL: {{^}}rsq_f32_constant_100.0 -; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000 -; EG: RECIPSQRT_IEEE -define void @rsq_f32_constant_100.0(float addrspace(1)* %out) nounwind { - %rsq = call float @llvm.AMDGPU.rsq.f32(float 100.0) nounwind readnone - store float %rsq, float addrspace(1)* %out, align 4 - ret void -} Index: test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s + +declare float @llvm.r600.recipsqrt.clamped.f32(float) nounwind readnone + +; EG-LABEL: {{^}}rsq_clamped_f32: +; EG: RECIPSQRT_CLAMPED +define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind { + %rsq_clamped = call float @llvm.r600.recipsqrt.clamped.f32(float %src) + store float %rsq_clamped, float addrspace(1)* %out, align 4 + ret void +} Index: test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll 
=================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll @@ -0,0 +1,28 @@ +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG %s + +declare float @llvm.r600.recipsqrt.ieee.f32(float) nounwind readnone + +; EG-LABEL: {{^}}recipsqrt.ieee_f32: +; EG: RECIPSQRT_IEEE +define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind { + %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float %src) nounwind readnone + store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 + ret void +} + +; TODO: Really these should be constant folded +; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_4.0 +; EG: RECIPSQRT_IEEE +define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind { + %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 4.0) nounwind readnone + store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 + ret void +} + +; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_100.0 +; EG: RECIPSQRT_IEEE +define void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind { + %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 100.0) nounwind readnone + store float %recipsqrt.ieee, float addrspace(1)* %out, align 4 + ret void +} Index: test/CodeGen/AMDGPU/pv.ll =================================================================== --- test/CodeGen/AMDGPU/pv.ll +++ test/CodeGen/AMDGPU/pv.ll @@ -102,8 +102,8 @@ %94 = insertelement <4 x float> %93, float %6, i32 2 %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3 %96 = call float @llvm.r600.dot4(<4 x float> %91, <4 x float> %95) - %97 = call float @fabs(float %96) - %98 = call float @llvm.AMDGPU.rsq.clamped.f32(float %97) + %97 = call float @llvm.fabs.f32(float %96) + %98 = call float @llvm.r600.recipsqrt.clamped.f32(float %97) %99 = fmul float %4, %98 %100 = fmul float %5, %98 %101 = fmul float %6, %98 @@ -222,19 
+222,19 @@ declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1 ; Function Attrs: readonly -declare float @fabs(float) #2 +declare float @llvm.fabs.f32(float) #1 ; Function Attrs: readnone -declare float @llvm.AMDGPU.rsq.clamped.f32(float) #1 +declare float @llvm.r600.recipsqrt.clamped.f32(float) #1 ; Function Attrs: readnone declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1 ; Function Attrs: nounwind readonly -declare float @llvm.pow.f32(float, float) #3 +declare float @llvm.pow.f32(float, float) #2 -declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) #3 -attributes #1 = { readnone } -attributes #2 = { readonly } -attributes #3 = { nounwind readonly } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind readonly } +attributes #3 = { nounwind }