diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -958,6 +958,9 @@
   case Intrinsic::amdgcn_ds_fmax:
   case Intrinsic::amdgcn_is_shared:
   case Intrinsic::amdgcn_is_private:
+  case Intrinsic::amdgcn_flat_atomic_fadd:
+  case Intrinsic::amdgcn_flat_atomic_fmax:
+  case Intrinsic::amdgcn_flat_atomic_fmin:
     OpIndexes.push_back(0);
     return true;
   default:
@@ -1032,6 +1035,18 @@
     return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
                              {NewV, MaskOp});
   }
+  case Intrinsic::amdgcn_flat_atomic_fadd:
+  case Intrinsic::amdgcn_flat_atomic_fmax:
+  case Intrinsic::amdgcn_flat_atomic_fmin: {
+    Module *M = II->getParent()->getParent()->getParent();
+    Type *DestTy = II->getType();
+    Type *SrcTy = NewV->getType();
+    Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
+                                                  {DestTy, SrcTy, DestTy});
+    II->setArgOperand(0, NewV);
+    II->setCalledFunction(NewDecl);
+    return II;
+  }
   default:
     return nullptr;
   }
diff --git a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
deleted file mode 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-const-address-space.ll
+++ /dev/null
@@ -1,30 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -march=amdgcn -mcpu=gfx90a -O3 < %s | FileCheck %s
-
-declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* nocapture, double) #8
-
-define protected amdgpu_kernel void @IllegalGEPConst(i32 %a, double addrspace(1)* %b, double %c) {
-; CHECK-LABEL: IllegalGEPConst:
-; CHECK:       ; %bb.0: ; %entry
-; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
-; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
-; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
-; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
-; CHECK-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
-; CHECK-NEXT:    s_add_u32 s0, s4, s0
-; CHECK-NEXT:    s_addc_u32 s1, s5, s1
-; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:    v_mov_b32_e32 v1, s7
-; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
-; CHECK-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1] offset:65528
-; CHECK-NEXT:    s_endpgm
-entry:
-  %i = add nsw i32 %a, -1
-  %i.2 = sext i32 %i to i64
-  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
-  %i.4 = addrspacecast double addrspace(1)* %i.3 to double*
-  %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.4, double %c) #8
-  ret void
-}
-
-attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" }
diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll
@@ -0,0 +1,183 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx90a -O3 < %s | FileCheck %s
+
+declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* nocapture, double) #8
+declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* nocapture, double) #8
+declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* nocapture, double) #8
+
+define protected amdgpu_kernel void @InferNothing(i32 %a, double* %b, double %c) {
+; CHECK-LABEL: InferNothing:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    s_add_i32 s0, s2, -1
+; CHECK-NEXT:    s_ashr_i32 s1, s0, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[0:1], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT:    flat_atomic_add_f64 v[2:3], v[0:1]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double* %b, i64 %i.2
+  %i.4 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.3, double %c) #8
+  ret void
+}
+
+
+define protected amdgpu_kernel void @InferFadd(i32 %a, double addrspace(1)* %b, double %c) {
+; CHECK-LABEL: InferFadd:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1] offset:-8
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = addrspacecast double addrspace(1)* %i.3 to double*
+  %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.4, double %c) #8
+  ret void
+}
+
+define protected amdgpu_kernel void @InferFmax(i32 %a, double addrspace(1)* %b, double %c) {
+; CHECK-LABEL: InferFmax:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1] offset:-8
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = addrspacecast double addrspace(1)* %i.3 to double*
+  %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmax.f64.p0f64.f64(double* %i.4, double %c) #8
+  ret void
+}
+
+define protected amdgpu_kernel void @InferFmin(i32 %a, double addrspace(1)* %b, double %c) {
+; CHECK-LABEL: InferFmin:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:    v_mov_b32_e32 v1, s7
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1] offset:-8
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = addrspacecast double addrspace(1)* %i.3 to double*
+  %i.5 = tail call contract double @llvm.amdgcn.flat.atomic.fmin.f64.p0f64.f64(double* %i.4, double %c) #8
+  ret void
+}
+
+define protected amdgpu_kernel void @InferMixed(i32 %a, double addrspace(1)* %b, double %c, double* %d) {
+; CHECK-LABEL: InferMixed:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x3c
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
+; CHECK-NEXT:    v_mov_b32_e32 v0, s8
+; CHECK-NEXT:    v_mov_b32_e32 v1, s9
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    v_pk_mov_b32 v[2:3], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    flat_atomic_add_f64 v[0:1], v[2:3]
+; CHECK-NEXT:    global_atomic_add_f64 v4, v[2:3], s[0:1] offset:-7
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  br label %bb1
+
+bb1:
+  %i.7 = ptrtoint double addrspace(1)* %i.3 to i64
+  %i.8 = add nsw i64 %i.7, 1
+  %i.9 = inttoptr i64 %i.8 to double addrspace(1)*
+  %i.10 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %d, double %c) #8
+  %i.11 = addrspacecast double addrspace(1)* %i.9 to double*
+  %i.12 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.11, double %c) #8
+  ret void
+}
+
+define protected amdgpu_kernel void @InferPHI(i32 %a, double addrspace(1)* %b, double %c) {
+; CHECK-LABEL: InferPHI:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    s_load_dword s2, s[0:1], 0x24
+; CHECK-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2c
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_ashr_i32 s3, s2, 31
+; CHECK-NEXT:    s_lshl_b64 s[0:1], s[2:3], 3
+; CHECK-NEXT:    s_add_u32 s0, s4, s0
+; CHECK-NEXT:    s_addc_u32 s1, s5, s1
+; CHECK-NEXT:    s_add_u32 s0, s0, -8
+; CHECK-NEXT:    s_addc_u32 s1, s1, -1
+; CHECK-NEXT:  .LBB5_1: ; %bb0
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    s_cmp_lg_u64 s[0:1], 1
+; CHECK-NEXT:    s_cbranch_scc1 .LBB5_1
+; CHECK-NEXT:  ; %bb.2: ; %bb1
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0
+; CHECK-NEXT:    v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
+; CHECK-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
+; CHECK-NEXT:    s_endpgm
+entry:
+  %i = add nsw i32 %a, -1
+  %i.2 = sext i32 %i to i64
+  %i.3 = getelementptr inbounds double, double addrspace(1)* %b, i64 %i.2
+  %i.4 = ptrtoint double addrspace(1)* %i.3 to i64
+  br label %bb0
+
+bb0:
+  %phi = phi double addrspace(1)* [ %i.3, %entry ], [ %i.9, %bb0 ]
+  %i.7 = ptrtoint double addrspace(1)* %phi to i64
+  %i.8 = sub nsw i64 %i.7, 1
+  %cmp2 = icmp eq i64 %i.8, 0
+  %i.9 = inttoptr i64 %i.7 to double addrspace(1)*
+  br i1 %cmp2, label %bb1, label %bb0
+
+bb1:
+  %i.10 = addrspacecast double addrspace(1)* %i.9 to double*
+  %i.11 = tail call contract double @llvm.amdgcn.flat.atomic.fadd.f64.p0f64.f64(double* %i.10, double %c) #8
+  ret void
+}
+
+
+attributes #8 = { argmemonly mustprogress nounwind willreturn "target-cpu"="gfx90a" }
+