Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -14746,6 +14746,32 @@ Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() }); return Builder.CreateCall(F, { Src0, Builder.getFalse() }); } + case AMDGPU::BI__builtin_amdgcn_ds_faddf: + case AMDGPU::BI__builtin_amdgcn_ds_fminf: + case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: { + Intrinsic::ID Intrin; + switch (BuiltinID) { + case AMDGPU::BI__builtin_amdgcn_ds_faddf: + Intrin = Intrinsic::amdgcn_ds_fadd; + break; + case AMDGPU::BI__builtin_amdgcn_ds_fminf: + Intrin = Intrinsic::amdgcn_ds_fmin; + break; + case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: + Intrin = Intrinsic::amdgcn_ds_fmax; + break; + } + llvm::Value *Src0 = EmitScalarExpr(E->getArg(0)); + llvm::Value *Src1 = EmitScalarExpr(E->getArg(1)); + llvm::Value *Src2 = EmitScalarExpr(E->getArg(2)); + llvm::Value *Src3 = EmitScalarExpr(E->getArg(3)); + llvm::Value *Src4 = EmitScalarExpr(E->getArg(4)); + llvm::Function *F = CGM.getIntrinsic(Intrin, { Src1->getType() }); + llvm::FunctionType *FTy = F->getFunctionType(); + llvm::Type *PTy = FTy->getParamType(0); + Src0 = Builder.CreatePointerBitCastOrAddrSpaceCast(Src0, PTy); + return Builder.CreateCall(F, { Src0, Src1, Src2, Src3, Src4 }); + } case AMDGPU::BI__builtin_amdgcn_read_exec: { CallInst *CI = cast( EmitSpecialRegisterBuiltin(*this, E, Int64Ty, Int64Ty, NormalRead, "exec")); Index: clang/test/CodeGenCUDA/builtins-amdgcn.cu =================================================================== --- clang/test/CodeGenCUDA/builtins-amdgcn.cu +++ clang/test/CodeGenCUDA/builtins-amdgcn.cu @@ -10,7 +10,7 @@ } // CHECK-LABEL: @_Z12test_ds_fmaxf( -// CHECK: call contract float @llvm.amdgcn.ds.fmax(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false) +// CHECK: call contract float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* @_ZZ12test_ds_fmaxfE6shared, float %{{[^,]*}}, i32 0, i32 0, i1 false) __global__ void test_ds_fmax(float src) { __shared__ float shared; Index: clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl =================================================================== --- clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl +++ clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl @@ -114,19 +114,19 @@ } // CHECK-LABEL: @test_ds_fadd -// CHECK: call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) +// CHECK: call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) void test_ds_faddf(local float *out, float src) { *out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false); } // CHECK-LABEL: @test_ds_fmin -// CHECK: call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) +// CHECK: call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) void test_ds_fminf(local float *out, float src) { *out = __builtin_amdgcn_ds_fminf(out, src, 0, 0, false); } // CHECK-LABEL: @test_ds_fmax -// CHECK: call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) +// CHECK: call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %out, float %src, i32 0, i32 0, i1 false) void test_ds_fmaxf(local float *out, float src) { *out = __builtin_amdgcn_ds_fmaxf(out, src, 0, 0, false); } Index: llvm/include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -397,11 +397,10 @@ def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin; def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin; -class AMDGPULDSF32Intrin : - GCCBuiltin, - Intrinsic<[llvm_float_ty], - [LLVMQualPointerType, - llvm_float_ty, +class AMDGPULDSIntrin : + Intrinsic<[llvm_any_ty], + [LLVMQualPointerType, 3>, + LLVMMatchType<0>, llvm_i32_ty, // ordering llvm_i32_ty, // scope llvm_i1_ty], // isVolatile @@ -446,9 +445,9 @@ def int_amdgcn_ds_append : AMDGPUDSAppendConsumedIntrinsic; def int_amdgcn_ds_consume : AMDGPUDSAppendConsumedIntrinsic; -def int_amdgcn_ds_fadd : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_faddf">; -def int_amdgcn_ds_fmin : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fminf">; -def int_amdgcn_ds_fmax : AMDGPULDSF32Intrin<"__builtin_amdgcn_ds_fmaxf">; +def int_amdgcn_ds_fadd : AMDGPULDSIntrin; +def int_amdgcn_ds_fmin : AMDGPULDSIntrin; +def int_amdgcn_ds_fmax : AMDGPULDSIntrin; } // TargetPrefix = "amdgcn" Index: llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll +++ llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll @@ -1,9 +1,9 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s -declare float @llvm.amdgcn.ds.fadd(float addrspace(3)* nocapture, float, i32, i32, i1) -declare float @llvm.amdgcn.ds.fmin(float addrspace(3)* nocapture, float, i32, i32, i1) -declare float @llvm.amdgcn.ds.fmax(float addrspace(3)* nocapture, float, i32, i32, i1) +declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1) +declare float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1) +declare float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1) ; GCN-LABEL: {{^}}lds_ds_fadd: ; VI-DAG: s_mov_b32 m0 @@ -19,9 +19,9 @@ %shl1 = shl i32 %idx.add, 4 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + %a1 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) store float %a3, float addrspace(1)* %out ret void } @@ -40,9 +40,9 @@ %shl1 = shl i32 %idx.add, 4 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + %a1 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) store float %a3, float addrspace(1)* %out ret void } @@ -61,9 +61,9 @@ %shl1 = shl i32 %idx.add, 4 %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + %a1 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) store float %a3, float addrspace(1)* %out ret void }