diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1351,7 +1351,7 @@ def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">, AssemblerPredicate<(all_of FeatureGFX9Insts)>; -def HasLDSFPAtomics : Predicate<"Subtarget->hasLDSFPAtomics()">, +def HasLDSFPAtomicAdd : Predicate<"Subtarget->hasLDSFPAtomicAdd()">, AssemblerPredicate<(all_of FeatureGFX8Insts)>; def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">, diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1316,7 +1316,7 @@ } auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD); - if (ST.hasLDSFPAtomics()) { + if (ST.hasLDSFPAtomicAdd()) { Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); if (ST.hasGFX90AInsts()) Atomic.legalFor({{S64, LocalPtr}}); diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -428,11 +428,10 @@ defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">; defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">; -let SubtargetPredicate = HasLDSFPAtomics in { +let SubtargetPredicate = HasLDSFPAtomicAdd in { defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">; } -// FIXME: Are these really present pre-gfx8? defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">; defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">; @@ -493,7 +492,7 @@ defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32, "ds_add_u32">; -let SubtargetPredicate = HasLDSFPAtomics in { +let SubtargetPredicate = HasLDSFPAtomicAdd in { defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32, "ds_add_f32">; } defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">; @@ -665,7 +664,7 @@ } // let SubtargetPredicate = isGFX8Plus -let SubtargetPredicate = HasLDSFPAtomics, OtherPredicates = [HasDsSrc2Insts] in { +let SubtargetPredicate = HasLDSFPAtomicAdd, OtherPredicates = [HasDsSrc2Insts] in { def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; } @@ -933,11 +932,11 @@ defm : DSAtomicRetPat_mc; defm : DSAtomicRetPat_mc; defm : DSAtomicRetPat_mc; -defm : DSAtomicCmpXChg_mc; - -let SubtargetPredicate = HasLDSFPAtomics in { defm : DSAtomicRetPat_mc; defm : DSAtomicRetPat_mc; +defm : DSAtomicCmpXChg_mc; + +let SubtargetPredicate = HasLDSFPAtomicAdd in { defm : DSAtomicRetPat_mc; } @@ -954,6 +953,8 @@ defm : DSAtomicRetPat_mc; defm : DSAtomicRetPat_mc; defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; +defm : DSAtomicRetPat_mc; defm : DSAtomicCmpXChg_mc; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -811,9 +811,7 @@ return HasScalarAtomics; } - bool hasLDSFPAtomics() const { - return GFX8Insts; - } + bool hasLDSFPAtomicAdd() const { return GFX8Insts; } /// \returns true if the subtarget has the v_permlanex16_b32 instruction. bool hasPermLaneX16() const { return getGeneration() >= GFX10; } diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -12209,7 +12209,7 @@ // DS FP atomics do repect the denormal mode, but the rounding mode is fixed // to round-to-nearest-even. // The only exception is DS_ADD_F64 which never flushes regardless of mode. - if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) { + if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) { if (!Ty->isDoubleTy()) return AtomicExpansionKind::None; diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fadd.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s + +declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1) + +; GCN-LABEL: {{^}}lds_ds_fadd: +; VI-DAG: s_mov_b32 m0 +; GFX9-NOT: m0 +; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 +; GCN: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 +; GCN: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 +; GCN: s_waitcnt lgkmcnt(1) +; GCN: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] +define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { + %idx.add = add nuw i32 %idx, 4 + %shl0 = shl i32 %idx.add, 3 + %shl1 = shl i32 %idx.add, 4 + %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* + %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* + %a1 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + store float %a3, float addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll @@ -0,0 +1,1282 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7 %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX10 %s + +; RUN: llc -global-isel -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_SI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_GFX7 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=G_VI %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=G_GFX9 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=G_GFX10 %s + +declare float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1) +declare float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1) +declare double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* nocapture, double, i32, i32, i1) +declare double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* nocapture, double, i32, i32, i1) + + +define amdgpu_kernel void @lds_ds_fmin(float addrspace(5)* %out, float addrspace(3)* %ptrf, i32 %idx) { +; SI-LABEL: lds_ds_fmin: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s7, 0xe8f000 +; SI-NEXT: s_add_u32 s4, s4, s3 +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dword s3, s[0:1], 0xa +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s1, s0, 4 +; SI-NEXT: s_lshl_b32 s0, s0, 3 +; SI-NEXT: s_add_i32 s0, s0, 32 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: ds_min_rtn_f32 v1, v1, v0 +; SI-NEXT: s_add_i32 s1, s1, 64 +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: ds_min_f32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: ds_min_rtn_f32 v0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; SI-NEXT: s_endpgm +; +; GFX7-LABEL: lds_ds_fmin: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s3 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s1, s0, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_min_f32 v2, v0 offset:64 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX7-NEXT: s_endpgm +; +; VI-LABEL: lds_ds_fmin: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NEXT: s_add_u32 s88, s88, s3 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_addc_u32 s89, s89, 0 +; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s1, s0, 4 +; VI-NEXT: s_lshl_b32 s0, s0, 3 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: ds_min_f32 v2, v0 offset:64 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_waitcnt lgkmcnt(1) +; VI-NEXT: ds_min_rtn_f32 v0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_ds_fmin: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s1, s4, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 +; GFX9-NEXT: s_lshl_b32 s0, s4, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_min_f32 v2, v0 offset:64 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_ds_fmin: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s3 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s0, s4, 3 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_lshl_b32 s0, s4, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: ds_min_rtn_f32 v1, v1, v0 offset:32 +; GFX10-NEXT: ds_min_f32 v2, v0 offset:64 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: ds_min_rtn_f32 v0, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX10-NEXT: s_endpgm +; +; G_SI-LABEL: lds_ds_fmin: +; G_SI: ; %bb.0: +; G_SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; G_SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; G_SI-NEXT: s_mov_b32 s6, -1 +; G_SI-NEXT: s_mov_b32 s7, 0xe8f000 +; G_SI-NEXT: s_add_u32 s4, s4, s3 +; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb +; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa +; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; G_SI-NEXT: s_addc_u32 s5, s5, 0 +; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: s_add_i32 s2, s2, 4 +; G_SI-NEXT: s_lshl_b32 s1, s2, 3 +; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: s_mov_b32 m0, -1 +; G_SI-NEXT: ds_min_rtn_f32 v1, v1, v0 +; G_SI-NEXT: s_lshl_b32 s2, s2, 4 +; G_SI-NEXT: v_mov_b32_e32 v2, s2 +; G_SI-NEXT: ds_min_rtn_f32 v0, v2, v0 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: v_mov_b32_e32 v0, s3 +; G_SI-NEXT: ds_min_rtn_f32 v0, v0, v1 +; G_SI-NEXT: v_mov_b32_e32 v1, s0 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; G_SI-NEXT: s_endpgm +; +; G_GFX7-LABEL: lds_ds_fmin: +; G_GFX7: ; %bb.0: +; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb +; G_GFX7-NEXT: s_mov_b32 s10, -1 +; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; G_GFX7-NEXT: s_add_u32 s8, s8, s3 +; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: s_add_i32 s0, s2, 4 +; G_GFX7-NEXT: s_lshl_b32 s1, s0, 3 +; G_GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX7-NEXT: s_mov_b32 m0, -1 +; G_GFX7-NEXT: ds_min_rtn_f32 v1, v1, v0 +; G_GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 +; G_GFX7-NEXT: ds_min_rtn_f32 v0, v2, v0 +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: v_mov_b32_e32 v0, s7 +; G_GFX7-NEXT: ds_min_rtn_f32 v0, v0, v1 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s6 +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; G_GFX7-NEXT: s_endpgm +; +; G_VI-LABEL: lds_ds_fmin: +; G_VI: ; %bb.0: +; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; G_VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_VI-NEXT: s_mov_b32 s90, -1 +; G_VI-NEXT: s_mov_b32 s91, 0xe80000 +; G_VI-NEXT: s_add_u32 s88, s88, s3 +; G_VI-NEXT: s_addc_u32 s89, s89, 0 +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: s_add_i32 s0, s2, 4 +; G_VI-NEXT: s_lshl_b32 s1, s0, 3 +; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; G_VI-NEXT: v_mov_b32_e32 v1, s1 +; G_VI-NEXT: s_mov_b32 m0, -1 +; G_VI-NEXT: ds_min_rtn_f32 v1, v1, v0 +; G_VI-NEXT: s_lshl_b32 s0, s0, 4 +; G_VI-NEXT: v_mov_b32_e32 v2, s0 +; G_VI-NEXT: ds_min_rtn_f32 v0, v2, v0 +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: v_mov_b32_e32 v0, s7 +; G_VI-NEXT: ds_min_rtn_f32 v0, v0, v1 +; G_VI-NEXT: v_mov_b32_e32 v1, s6 +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; G_VI-NEXT: s_endpgm +; +; G_GFX9-LABEL: lds_ds_fmin: +; G_GFX9: ; %bb.0: +; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX9-NEXT: s_mov_b32 s10, -1 +; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; G_GFX9-NEXT: s_add_u32 s8, s8, s3 +; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: s_add_i32 s0, s2, 4 +; G_GFX9-NEXT: s_lshl_b32 s1, s0, 3 +; G_GFX9-NEXT: v_mov_b32_e32 v0, s1 +; G_GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 +; G_GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1 +; G_GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; G_GFX9-NEXT: v_mov_b32_e32 v2, s0 +; G_GFX9-NEXT: ds_min_rtn_f32 v1, v2, v1 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 +; G_GFX9-NEXT: v_mov_b32_e32 v1, s6 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; G_GFX9-NEXT: s_endpgm +; +; G_GFX10-LABEL: lds_ds_fmin: +; G_GFX10: ; %bb.0: +; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; G_GFX10-NEXT: s_mov_b32 s6, -1 +; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; G_GFX10-NEXT: s_add_u32 s4, s4, s3 +; G_GFX10-NEXT: s_addc_u32 s5, s5, 0 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: s_add_i32 s2, s2, 4 +; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3 +; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v3, s1 +; G_GFX10-NEXT: ds_min_rtn_f32 v0, v0, v1 +; G_GFX10-NEXT: ds_min_rtn_f32 v1, v2, v1 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) +; G_GFX10-NEXT: ds_min_rtn_f32 v0, v3, v0 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) +; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; G_GFX10-NEXT: s_endpgm + %idx.add = add nuw i32 %idx, 4 + %shl0 = shl i32 %idx.add, 3 + %shl1 = shl i32 %idx.add, 4 + %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* + %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* + %a1 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + store float %a3, float addrspace(5)* %out + ret void +} + +define amdgpu_kernel void @lds_ds_fmax(float addrspace(5)* %out, float addrspace(3)* %ptrf, i32 %idx) { +; SI-LABEL: lds_ds_fmax: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s7, 0xe8f000 +; SI-NEXT: s_add_u32 s4, s4, s3 +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dword s3, s[0:1], 0xa +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_addc_u32 s5, s5, 0 +; SI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s1, s0, 4 +; SI-NEXT: s_lshl_b32 s0, s0, 3 +; SI-NEXT: s_add_i32 s0, s0, 32 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: ds_max_rtn_f32 v1, v1, v0 +; SI-NEXT: s_add_i32 s1, s1, 64 +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: ds_max_f32 v2, v0 +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: ds_max_rtn_f32 v0, v0, v1 +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; SI-NEXT: s_endpgm +; +; GFX7-LABEL: lds_ds_fmax: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_mov_b32 s6, -1 +; GFX7-NEXT: s_mov_b32 s7, 0xe8f000 +; GFX7-NEXT: s_add_u32 s4, s4, s3 +; GFX7-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7-NEXT: s_addc_u32 s5, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s1, s0, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_mov_b32_e32 v1, s0 +; GFX7-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 +; GFX7-NEXT: v_mov_b32_e32 v2, s1 +; GFX7-NEXT: ds_max_f32 v2, v0 offset:64 +; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, s2 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; GFX7-NEXT: s_endpgm +; +; VI-LABEL: lds_ds_fmax: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NEXT: s_add_u32 s88, s88, s3 +; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_addc_u32 s89, s89, 0 +; VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s1, s0, 4 +; VI-NEXT: s_lshl_b32 s0, s0, 3 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: ds_max_f32 v2, v0 offset:64 +; VI-NEXT: v_mov_b32_e32 v0, s3 +; VI-NEXT: s_waitcnt lgkmcnt(1) +; VI-NEXT: ds_max_rtn_f32 v0, v0, v1 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_ds_fmax: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s1, s4, 3 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 +; GFX9-NEXT: s_lshl_b32 s0, s4, 4 +; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: ds_max_f32 v2, v0 offset:64 +; GFX9-NEXT: v_mov_b32_e32 v0, s3 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_ds_fmax: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s3 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0x42280000 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s0, s4, 3 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 +; GFX10-NEXT: v_mov_b32_e32 v1, s0 +; GFX10-NEXT: s_lshl_b32 s0, s4, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: ds_max_rtn_f32 v1, v1, v0 offset:32 +; GFX10-NEXT: ds_max_f32 v2, v0 offset:64 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: ds_max_rtn_f32 v0, v3, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, s2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; GFX10-NEXT: s_endpgm +; +; G_SI-LABEL: lds_ds_fmax: +; G_SI: ; %bb.0: +; G_SI-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; G_SI-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; G_SI-NEXT: s_mov_b32 s6, -1 +; G_SI-NEXT: s_mov_b32 s7, 0xe8f000 +; G_SI-NEXT: s_add_u32 s4, s4, s3 +; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb +; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa +; G_SI-NEXT: s_load_dword s0, s[0:1], 0x9 +; G_SI-NEXT: s_addc_u32 s5, s5, 0 +; G_SI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: s_add_i32 s2, s2, 4 +; G_SI-NEXT: s_lshl_b32 s1, s2, 3 +; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: s_mov_b32 m0, -1 +; G_SI-NEXT: ds_max_rtn_f32 v1, v1, v0 +; G_SI-NEXT: s_lshl_b32 s2, s2, 4 +; G_SI-NEXT: v_mov_b32_e32 v2, s2 +; G_SI-NEXT: ds_max_rtn_f32 v0, v2, v0 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: v_mov_b32_e32 v0, s3 +; G_SI-NEXT: ds_max_rtn_f32 v0, v0, v1 +; G_SI-NEXT: v_mov_b32_e32 v1, s0 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; G_SI-NEXT: s_endpgm +; +; G_GFX7-LABEL: lds_ds_fmax: +; G_GFX7: ; %bb.0: +; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb +; G_GFX7-NEXT: s_mov_b32 s10, -1 +; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; G_GFX7-NEXT: s_add_u32 s8, s8, s3 +; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: s_add_i32 s0, s2, 4 +; G_GFX7-NEXT: s_lshl_b32 s1, s0, 3 +; G_GFX7-NEXT: v_mov_b32_e32 v0, 0x42280000 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX7-NEXT: s_mov_b32 m0, -1 +; G_GFX7-NEXT: ds_max_rtn_f32 v1, v1, v0 +; G_GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s0 +; G_GFX7-NEXT: ds_max_rtn_f32 v0, v2, v0 +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: v_mov_b32_e32 v0, s7 +; G_GFX7-NEXT: ds_max_rtn_f32 v0, v0, v1 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s6 +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; G_GFX7-NEXT: s_endpgm +; +; G_VI-LABEL: lds_ds_fmax: +; G_VI: ; %bb.0: +; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; G_VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_VI-NEXT: s_mov_b32 s90, -1 +; G_VI-NEXT: s_mov_b32 s91, 0xe80000 +; G_VI-NEXT: s_add_u32 s88, s88, s3 +; G_VI-NEXT: s_addc_u32 s89, s89, 0 +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: s_add_i32 s0, s2, 4 +; G_VI-NEXT: s_lshl_b32 s1, s0, 3 +; G_VI-NEXT: v_mov_b32_e32 v0, 0x42280000 +; G_VI-NEXT: v_mov_b32_e32 v1, s1 +; G_VI-NEXT: s_mov_b32 m0, -1 +; G_VI-NEXT: ds_max_rtn_f32 v1, v1, v0 +; G_VI-NEXT: s_lshl_b32 s0, s0, 4 +; G_VI-NEXT: v_mov_b32_e32 v2, s0 +; G_VI-NEXT: ds_max_rtn_f32 v0, v2, v0 +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: v_mov_b32_e32 v0, s7 +; G_VI-NEXT: ds_max_rtn_f32 v0, v0, v1 +; G_VI-NEXT: v_mov_b32_e32 v1, s6 +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: buffer_store_dword v0, v1, s[88:91], 0 offen +; G_VI-NEXT: s_endpgm +; +; G_GFX9-LABEL: lds_ds_fmax: +; G_GFX9: ; %bb.0: +; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX9-NEXT: s_mov_b32 s10, -1 +; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; G_GFX9-NEXT: s_add_u32 s8, s8, s3 +; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: s_add_i32 s0, s2, 4 +; G_GFX9-NEXT: s_lshl_b32 s1, s0, 3 +; G_GFX9-NEXT: v_mov_b32_e32 v0, s1 +; G_GFX9-NEXT: v_mov_b32_e32 v1, 0x42280000 +; G_GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1 +; G_GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; G_GFX9-NEXT: v_mov_b32_e32 v2, s0 +; G_GFX9-NEXT: ds_max_rtn_f32 v1, v2, v1 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: v_mov_b32_e32 v1, s7 +; G_GFX9-NEXT: ds_max_rtn_f32 v0, v1, v0 +; G_GFX9-NEXT: v_mov_b32_e32 v1, s6 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; G_GFX9-NEXT: s_endpgm +; +; G_GFX10-LABEL: lds_ds_fmax: +; G_GFX10: ; %bb.0: +; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX10-NEXT: s_mov_b32 s4, SCRATCH_RSRC_DWORD0 +; G_GFX10-NEXT: s_mov_b32 s5, SCRATCH_RSRC_DWORD1 +; G_GFX10-NEXT: s_mov_b32 s6, -1 +; G_GFX10-NEXT: s_mov_b32 s7, 0x31c16000 +; G_GFX10-NEXT: s_add_u32 s4, s4, s3 +; G_GFX10-NEXT: s_addc_u32 s5, s5, 0 +; G_GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; G_GFX10-NEXT: v_mov_b32_e32 v1, 0x42280000 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: s_add_i32 s2, s2, 4 +; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3 +; G_GFX10-NEXT: s_lshl_b32 s2, s2, 4 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 +; G_GFX10-NEXT: v_mov_b32_e32 v3, s1 +; G_GFX10-NEXT: ds_max_rtn_f32 v0, v0, v1 +; G_GFX10-NEXT: ds_max_rtn_f32 v1, v2, v1 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) +; G_GFX10-NEXT: ds_max_rtn_f32 v0, v3, v0 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(1) +; G_GFX10-NEXT: v_mov_b32_e32 v1, s0 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen +; G_GFX10-NEXT: s_endpgm + %idx.add = add nuw i32 %idx, 4 + %shl0 = shl i32 %idx.add, 3 + %shl1 = shl i32 %idx.add, 4 + %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* + %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* + %a1 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) + store float %a3, float addrspace(5)* %out + ret void +} + +define amdgpu_kernel void @lds_ds_fmin_f64(double addrspace(5)* %out, double addrspace(3)* %ptrf, i32 %idx) { +; SI-LABEL: lds_ds_fmin_f64: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s3 +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dword s3, s[0:1], 0xa +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s1, s0, 4 +; SI-NEXT: s_lshl_b32 s0, s0, 3 +; SI-NEXT: s_add_i32 s4, s0, 32 +; SI-NEXT: s_add_i32 s5, s1, 64 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, 0x40450000 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: ds_min_f64 v4, v[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_add_i32 s0, s2, 4 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; SI-NEXT: s_endpgm +; +; GFX7-LABEL: lds_ds_fmin_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s3 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0x40450000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s1, s0, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: ds_min_f64 v4, v[0:1] offset:64 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_add_i32 s0, s4, 4 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen +; GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; GFX7-NEXT: s_endpgm +; +; VI-LABEL: lds_ds_fmin_f64: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NEXT: s_add_u32 s88, s88, s3 +; VI-NEXT: s_addc_u32 s89, s89, 0 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_mov_b32 s3, 0x40450000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s1, s0, 4 +; VI-NEXT: s_lshl_b32 s0, s0, 3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: ds_min_f64 v4, v[0:1] offset:64 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_add_i32 s0, s4, 4 +; VI-NEXT: s_waitcnt lgkmcnt(1) +; VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_ds_fmin_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40450000 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s5, s4, 4 +; GFX9-NEXT: s_lshl_b32 s4, s4, 3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: ds_min_f64 v5, v[0:1] offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_ds_fmin_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s3 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s1, 0x40450000 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s5, s4, 3 +; GFX10-NEXT: s_lshl_b32 s0, s4, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 +; GFX10-NEXT: ds_min_f64 v4, v[0:1] offset:64 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 +; GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; GFX10-NEXT: s_endpgm +; +; G_SI-LABEL: lds_ds_fmin_f64: +; G_SI: ; %bb.0: +; G_SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_SI-NEXT: s_mov_b32 s10, -1 +; G_SI-NEXT: s_mov_b32 s11, 0xe8f000 +; G_SI-NEXT: s_add_u32 s8, s8, s3 +; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb +; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa +; G_SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; G_SI-NEXT: s_addc_u32 s9, s9, 0 +; G_SI-NEXT: s_mov_b32 s0, 0 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: s_add_i32 s2, s2, 4 +; G_SI-NEXT: s_lshl_b32 s5, s2, 3 +; G_SI-NEXT: s_mov_b32 s1, 0x40450000 +; G_SI-NEXT: v_mov_b32_e32 v0, s0 +; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: v_mov_b32_e32 v2, s5 +; G_SI-NEXT: s_mov_b32 m0, -1 +; G_SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] +; G_SI-NEXT: s_lshl_b32 s2, s2, 4 +; G_SI-NEXT: v_mov_b32_e32 v4, s2 +; G_SI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[0:1] +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: v_mov_b32_e32 v0, s3 +; G_SI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; G_SI-NEXT: v_mov_b32_e32 v2, s4 +; G_SI-NEXT: s_add_u32 s0, s4, 4 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; G_SI-NEXT: s_waitcnt expcnt(0) +; G_SI-NEXT: v_mov_b32_e32 v0, s0 +; G_SI-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen +; G_SI-NEXT: s_endpgm +; +; G_GFX7-LABEL: lds_ds_fmin_f64: +; G_GFX7: ; %bb.0: +; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb +; G_GFX7-NEXT: s_mov_b32 s10, -1 +; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; G_GFX7-NEXT: s_add_u32 s8, s8, s3 +; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX7-NEXT: s_mov_b32 s4, 0 +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: s_add_i32 s0, s2, 4 +; G_GFX7-NEXT: s_mov_b32 s5, 0x40450000 +; G_GFX7-NEXT: s_lshl_b32 s1, s0, 3 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: s_mov_b32 m0, -1 +; G_GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] +; G_GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v4, s0 +; G_GFX7-NEXT: ds_min_rtn_f64 v[0:1], v4, v[0:1] +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: v_mov_b32_e32 v0, s7 +; G_GFX7-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; G_GFX7-NEXT: s_add_u32 s0, s6, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v3, s0 +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; G_GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen +; G_GFX7-NEXT: s_endpgm +; +; G_VI-LABEL: lds_ds_fmin_f64: +; G_VI: ; %bb.0: +; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; G_VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_VI-NEXT: s_mov_b32 s90, -1 +; G_VI-NEXT: s_mov_b32 s91, 0xe80000 +; G_VI-NEXT: s_add_u32 s88, s88, s3 +; G_VI-NEXT: s_addc_u32 s89, s89, 0 +; G_VI-NEXT: s_mov_b32 s4, 0 +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: s_add_i32 s0, s2, 4 +; G_VI-NEXT: s_mov_b32 s5, 0x40450000 +; G_VI-NEXT: s_lshl_b32 s1, s0, 3 +; G_VI-NEXT: v_mov_b32_e32 v0, s4 +; G_VI-NEXT: v_mov_b32_e32 v2, s1 +; G_VI-NEXT: v_mov_b32_e32 v1, s5 +; G_VI-NEXT: s_mov_b32 m0, -1 +; G_VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] +; G_VI-NEXT: s_lshl_b32 s0, s0, 4 +; G_VI-NEXT: v_mov_b32_e32 v4, s0 +; G_VI-NEXT: ds_min_rtn_f64 v[0:1], v4, v[0:1] +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: v_mov_b32_e32 v0, s7 +; G_VI-NEXT: ds_min_rtn_f64 v[0:1], v0, v[2:3] +; G_VI-NEXT: s_add_u32 s0, s6, 4 +; G_VI-NEXT: v_mov_b32_e32 v2, s6 +; G_VI-NEXT: v_mov_b32_e32 v3, s0 +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; G_VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; G_VI-NEXT: s_endpgm +; +; G_GFX9-LABEL: lds_ds_fmin_f64: +; G_GFX9: ; %bb.0: +; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_mov_b32 s10, -1 +; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; G_GFX9-NEXT: s_add_u32 s8, s8, s3 +; G_GFX9-NEXT: s_mov_b32 s0, 0 +; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000 +; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: s_add_i32 s0, s2, 4 +; G_GFX9-NEXT: s_lshl_b32 s1, s0, 3 +; G_GFX9-NEXT: v_mov_b32_e32 v2, s1 +; G_GFX9-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] +; G_GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; G_GFX9-NEXT: v_mov_b32_e32 v5, s0 +; G_GFX9-NEXT: ds_min_rtn_f64 v[0:1], v5, v[0:1] +; G_GFX9-NEXT: v_mov_b32_e32 v4, s7 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: ds_min_rtn_f64 v[0:1], v4, v[2:3] +; G_GFX9-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; G_GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 +; G_GFX9-NEXT: s_endpgm +; +; G_GFX10-LABEL: lds_ds_fmin_f64: +; G_GFX10: ; %bb.0: +; G_GFX10-NEXT: s_clause 0x1 +; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX10-NEXT: s_mov_b32 s10, -1 +; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; G_GFX10-NEXT: s_add_u32 s8, s8, s3 +; G_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX10-NEXT: s_mov_b32 s0, 0 +; G_GFX10-NEXT: s_mov_b32 s1, 0x40450000 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: s_add_i32 s2, s2, 4 +; G_GFX10-NEXT: v_mov_b32_e32 v5, s7 +; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3 +; G_GFX10-NEXT: s_lshl_b32 s0, s2, 4 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v4, s0 +; G_GFX10-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] +; G_GFX10-NEXT: ds_min_rtn_f64 v[0:1], v4, v[0:1] +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: ds_min_rtn_f64 v[0:1], v5, v[2:3] +; G_GFX10-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; G_GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 +; G_GFX10-NEXT: s_endpgm + %idx.add = add nuw i32 %idx, 4 + %shl0 = shl i32 %idx.add, 3 + %shl1 = shl i32 %idx.add, 4 + %ptr0 = inttoptr i32 %shl0 to double addrspace(3)* + %ptr1 = inttoptr i32 %shl1 to double addrspace(3)* + %a1 = call double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* %ptr0, double 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* %ptr1, double 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call double @llvm.amdgcn.ds.fmin.f64(double addrspace(3)* %ptrf, double %a1, i32 0, i32 0, i1 false) + store double %a3, double addrspace(5)* %out + ret void +} + +define amdgpu_kernel void @lds_ds_fmax_f64(double addrspace(5)* %out, double addrspace(3)* %ptrf, i32 %idx) { +; SI-LABEL: lds_ds_fmax_f64: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s11, 0xe8f000 +; SI-NEXT: s_add_u32 s8, s8, s3 +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 +; SI-NEXT: s_load_dword s3, s[0:1], 0xa +; SI-NEXT: s_load_dword s0, s[0:1], 0xb +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_lshl_b32 s1, s0, 4 +; SI-NEXT: s_lshl_b32 s0, s0, 3 +; SI-NEXT: s_add_i32 s4, s0, 32 +; SI-NEXT: s_add_i32 s5, s1, 64 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: s_mov_b32 s1, 0x40450000 +; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] +; SI-NEXT: v_mov_b32_e32 v4, s5 +; SI-NEXT: ds_max_f64 v4, v[0:1] +; SI-NEXT: v_mov_b32_e32 v0, s3 +; SI-NEXT: s_add_i32 s0, s2, 4 +; SI-NEXT: s_waitcnt lgkmcnt(1) +; SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; SI-NEXT: v_mov_b32_e32 v2, s0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s2 +; SI-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen +; SI-NEXT: s_endpgm +; +; GFX7-LABEL: lds_ds_fmax_f64: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 +; GFX7-NEXT: s_load_dword s0, s[0:1], 0xb +; GFX7-NEXT: s_mov_b32 s10, -1 +; GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; GFX7-NEXT: s_add_u32 s8, s8, s3 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 +; GFX7-NEXT: s_mov_b32 s2, 0 +; GFX7-NEXT: s_mov_b32 s3, 0x40450000 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b32 s1, s0, 4 +; GFX7-NEXT: s_lshl_b32 s0, s0, 3 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: s_mov_b32 m0, -1 +; GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 +; GFX7-NEXT: ds_max_f64 v4, v[0:1] offset:64 +; GFX7-NEXT: v_mov_b32_e32 v0, s5 +; GFX7-NEXT: s_add_i32 s0, s4, 4 +; GFX7-NEXT: s_waitcnt lgkmcnt(1) +; GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen +; GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; GFX7-NEXT: s_endpgm +; +; VI-LABEL: lds_ds_fmax_f64: +; VI: ; %bb.0: +; VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 +; VI-NEXT: s_load_dword s0, s[0:1], 0x2c +; VI-NEXT: s_mov_b32 s90, -1 +; VI-NEXT: s_mov_b32 s91, 0xe80000 +; VI-NEXT: s_add_u32 s88, s88, s3 +; VI-NEXT: s_addc_u32 s89, s89, 0 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_mov_b32 s3, 0x40450000 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: s_lshl_b32 s1, s0, 4 +; VI-NEXT: s_lshl_b32 s0, s0, 3 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: s_mov_b32 m0, -1 +; VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: ds_max_f64 v4, v[0:1] offset:64 +; VI-NEXT: v_mov_b32_e32 v0, s5 +; VI-NEXT: s_add_i32 s0, s4, 4 +; VI-NEXT: s_waitcnt lgkmcnt(1) +; VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; VI-NEXT: s_endpgm +; +; GFX9-LABEL: lds_ds_fmax_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s3 +; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_mov_b32 s0, 0 +; GFX9-NEXT: s_mov_b32 s1, 0x40450000 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b32 s5, s4, 4 +; GFX9-NEXT: s_lshl_b32 s4, s4, 3 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: ds_max_f64 v5, v[0:1] offset:64 +; GFX9-NEXT: s_waitcnt lgkmcnt(1) +; GFX9-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 +; GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: lds_ds_fmax_f64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX10-NEXT: s_mov_b32 s10, -1 +; GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; GFX10-NEXT: s_add_u32 s8, s8, s3 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_addc_u32 s9, s9, 0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_mov_b32 s1, 0x40450000 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_lshl_b32 s5, s4, 3 +; GFX10-NEXT: s_lshl_b32 s0, s4, 4 +; GFX10-NEXT: v_mov_b32_e32 v2, s5 +; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s3 +; GFX10-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 +; GFX10-NEXT: ds_max_f64 v4, v[0:1] offset:64 +; GFX10-NEXT: s_waitcnt lgkmcnt(1) +; GFX10-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3] +; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 +; GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; GFX10-NEXT: s_endpgm +; +; G_SI-LABEL: lds_ds_fmax_f64: +; G_SI: ; %bb.0: +; G_SI-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_SI-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_SI-NEXT: s_mov_b32 s10, -1 +; G_SI-NEXT: s_mov_b32 s11, 0xe8f000 +; G_SI-NEXT: s_add_u32 s8, s8, s3 +; G_SI-NEXT: s_load_dword s2, s[0:1], 0xb +; G_SI-NEXT: s_load_dword s3, s[0:1], 0xa +; G_SI-NEXT: s_load_dword s4, s[0:1], 0x9 +; G_SI-NEXT: s_addc_u32 s9, s9, 0 +; G_SI-NEXT: s_mov_b32 s0, 0 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: s_add_i32 s2, s2, 4 +; G_SI-NEXT: s_lshl_b32 s5, s2, 3 +; G_SI-NEXT: s_mov_b32 s1, 0x40450000 +; G_SI-NEXT: v_mov_b32_e32 v0, s0 +; G_SI-NEXT: v_mov_b32_e32 v1, s1 +; G_SI-NEXT: v_mov_b32_e32 v2, s5 +; G_SI-NEXT: s_mov_b32 m0, -1 +; G_SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] +; G_SI-NEXT: s_lshl_b32 s2, s2, 4 +; G_SI-NEXT: v_mov_b32_e32 v4, s2 +; G_SI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[0:1] +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: v_mov_b32_e32 v0, s3 +; G_SI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; G_SI-NEXT: v_mov_b32_e32 v2, s4 +; G_SI-NEXT: s_add_u32 s0, s4, 4 +; G_SI-NEXT: s_waitcnt lgkmcnt(0) +; G_SI-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; G_SI-NEXT: s_waitcnt expcnt(0) +; G_SI-NEXT: v_mov_b32_e32 v0, s0 +; G_SI-NEXT: buffer_store_dword v1, v0, s[8:11], 0 offen +; G_SI-NEXT: s_endpgm +; +; G_GFX7-LABEL: lds_ds_fmax_f64: +; G_GFX7: ; %bb.0: +; G_GFX7-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX7-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX7-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x9 +; G_GFX7-NEXT: s_load_dword s2, s[0:1], 0xb +; G_GFX7-NEXT: s_mov_b32 s10, -1 +; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 +; G_GFX7-NEXT: s_add_u32 s8, s8, s3 +; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX7-NEXT: s_mov_b32 s4, 0 +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: s_add_i32 s0, s2, 4 +; G_GFX7-NEXT: s_mov_b32 s5, 0x40450000 +; G_GFX7-NEXT: s_lshl_b32 s1, s0, 3 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 +; G_GFX7-NEXT: s_mov_b32 m0, -1 +; G_GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] +; G_GFX7-NEXT: s_lshl_b32 s0, s0, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v4, s0 +; G_GFX7-NEXT: ds_max_rtn_f64 v[0:1], v4, v[0:1] +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: v_mov_b32_e32 v0, s7 +; G_GFX7-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; G_GFX7-NEXT: s_add_u32 s0, s6, 4 +; G_GFX7-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX7-NEXT: v_mov_b32_e32 v3, s0 +; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX7-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; G_GFX7-NEXT: buffer_store_dword v1, v3, s[8:11], 0 offen +; G_GFX7-NEXT: s_endpgm +; +; G_VI-LABEL: lds_ds_fmax_f64: +; G_VI: ; %bb.0: +; G_VI-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; G_VI-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; G_VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; G_VI-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_VI-NEXT: s_mov_b32 s90, -1 +; G_VI-NEXT: s_mov_b32 s91, 0xe80000 +; G_VI-NEXT: s_add_u32 s88, s88, s3 +; G_VI-NEXT: s_addc_u32 s89, s89, 0 +; G_VI-NEXT: s_mov_b32 s4, 0 +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: s_add_i32 s0, s2, 4 +; G_VI-NEXT: s_mov_b32 s5, 0x40450000 +; G_VI-NEXT: s_lshl_b32 s1, s0, 3 +; G_VI-NEXT: v_mov_b32_e32 v0, s4 +; G_VI-NEXT: v_mov_b32_e32 v2, s1 +; G_VI-NEXT: v_mov_b32_e32 v1, s5 +; G_VI-NEXT: s_mov_b32 m0, -1 +; G_VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] +; G_VI-NEXT: s_lshl_b32 s0, s0, 4 +; G_VI-NEXT: v_mov_b32_e32 v4, s0 +; G_VI-NEXT: ds_max_rtn_f64 v[0:1], v4, v[0:1] +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: v_mov_b32_e32 v0, s7 +; G_VI-NEXT: ds_max_rtn_f64 v[0:1], v0, v[2:3] +; G_VI-NEXT: s_add_u32 s0, s6, 4 +; G_VI-NEXT: v_mov_b32_e32 v2, s6 +; G_VI-NEXT: v_mov_b32_e32 v3, s0 +; G_VI-NEXT: s_waitcnt lgkmcnt(0) +; G_VI-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen +; G_VI-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; G_VI-NEXT: s_endpgm +; +; G_GFX9-LABEL: lds_ds_fmax_f64: +; G_GFX9: ; %bb.0: +; G_GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; G_GFX9-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX9-NEXT: s_mov_b32 s10, -1 +; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; G_GFX9-NEXT: s_add_u32 s8, s8, s3 +; G_GFX9-NEXT: s_mov_b32 s0, 0 +; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000 +; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: s_add_i32 s0, s2, 4 +; G_GFX9-NEXT: s_lshl_b32 s1, s0, 3 +; G_GFX9-NEXT: v_mov_b32_e32 v2, s1 +; G_GFX9-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] +; G_GFX9-NEXT: s_lshl_b32 s0, s0, 4 +; G_GFX9-NEXT: v_mov_b32_e32 v5, s0 +; G_GFX9-NEXT: ds_max_rtn_f64 v[0:1], v5, v[0:1] +; G_GFX9-NEXT: v_mov_b32_e32 v4, s7 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: ds_max_rtn_f64 v[0:1], v4, v[2:3] +; G_GFX9-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX9-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; G_GFX9-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 +; G_GFX9-NEXT: s_endpgm +; +; G_GFX10-LABEL: lds_ds_fmax_f64: +; G_GFX10: ; %bb.0: +; G_GFX10-NEXT: s_clause 0x1 +; G_GFX10-NEXT: s_load_dword s2, s[0:1], 0x2c +; G_GFX10-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x24 +; G_GFX10-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; G_GFX10-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; G_GFX10-NEXT: s_mov_b32 s10, -1 +; G_GFX10-NEXT: s_mov_b32 s11, 0x31c16000 +; G_GFX10-NEXT: s_add_u32 s8, s8, s3 +; G_GFX10-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX10-NEXT: s_mov_b32 s0, 0 +; G_GFX10-NEXT: s_mov_b32 s1, 0x40450000 +; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 +; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: s_add_i32 s2, s2, 4 +; G_GFX10-NEXT: v_mov_b32_e32 v5, s7 +; G_GFX10-NEXT: s_lshl_b32 s3, s2, 3 +; G_GFX10-NEXT: s_lshl_b32 s0, s2, 4 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s3 +; G_GFX10-NEXT: v_mov_b32_e32 v4, s0 +; G_GFX10-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] +; G_GFX10-NEXT: ds_max_rtn_f64 v[0:1], v4, v[0:1] +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: ds_max_rtn_f64 v[0:1], v5, v[2:3] +; G_GFX10-NEXT: v_mov_b32_e32 v2, s6 +; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) +; G_GFX10-NEXT: buffer_store_dword v0, v2, s[8:11], 0 offen +; G_GFX10-NEXT: buffer_store_dword v1, v2, s[8:11], 0 offen offset:4 +; G_GFX10-NEXT: s_endpgm + %idx.add = add nuw i32 %idx, 4 + %shl0 = shl i32 %idx.add, 3 + %shl1 = shl i32 %idx.add, 4 + %ptr0 = inttoptr i32 %shl0 to double addrspace(3)* + %ptr1 = inttoptr i32 %shl1 to double addrspace(3)* + %a1 = call double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* %ptr0, double 4.2e+1, i32 0, i32 0, i1 false) + %a2 = call double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* %ptr1, double 4.2e+1, i32 0, i32 0, i1 false) + %a3 = call double @llvm.amdgcn.ds.fmax.f64(double addrspace(3)* %ptrf, double %a1, i32 0, i32 0, i1 false) + store double %a3, double addrspace(5)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll b/llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/lds_atomic_f32.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s - -declare float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* nocapture, float, i32, i32, i1) -declare float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* nocapture, float, i32, i32, i1) -declare float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* nocapture, float, i32, i32, i1) - -; GCN-LABEL: {{^}}lds_ds_fadd: -; VI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 -; GCN: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 -; GCN: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; GCN: s_waitcnt lgkmcnt(1) -; GCN: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] -define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { - %idx.add = add nuw i32 %idx, 4 - %shl0 = shl i32 %idx.add, 3 - %shl1 = shl i32 %idx.add, 4 - %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* - %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fadd.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) - store float %a3, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}lds_ds_fmin: -; VI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 -; GCN: ds_min_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 -; GCN: ds_min_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; GCN: s_waitcnt lgkmcnt(1) -; GCN: ds_min_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] -define amdgpu_kernel void @lds_ds_fmin(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { - %idx.add = add nuw i32 %idx, 4 - %shl0 = shl i32 %idx.add, 3 - %shl1 = shl i32 %idx.add, 4 - %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* - %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fmin.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) - store float %a3, float addrspace(1)* %out - ret void -} - -; GCN-LABEL: {{^}}lds_ds_fmax: -; VI-DAG: s_mov_b32 m0 -; GFX9-NOT: m0 -; GCN-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 -; GCN: ds_max_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 -; GCN: ds_max_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; GCN: s_waitcnt lgkmcnt(1) -; GCN: ds_max_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] -define amdgpu_kernel void @lds_ds_fmax(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { - %idx.add = add nuw i32 %idx, 4 - %shl0 = shl i32 %idx.add, 3 - %shl1 = shl i32 %idx.add, 4 - %ptr0 = inttoptr i32 %shl0 to float addrspace(3)* - %ptr1 = inttoptr i32 %shl1 to float addrspace(3)* - %a1 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr0, float 4.2e+1, i32 0, i32 0, i1 false) - %a2 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptr1, float 4.2e+1, i32 0, i32 0, i1 false) - %a3 = call float @llvm.amdgcn.ds.fmax.f32(float addrspace(3)* %ptrf, float %a1, i32 0, i32 0, i1 false) - store float %a3, float addrspace(1)* %out - ret void -}