Index: llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+; --------------------------------------------------------------------------------
+; amdgcn atomic csub
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @global_csub_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GCN-LABEL: global_csub_saddr_i32_rtn:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GCN-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GCN-NEXT: global_atomic_csub v0, v[2:3], v1, off glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data)
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_csub_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GCN-LABEL: global_csub_saddr_i32_rtn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GCN-NEXT: ; implicit-def: $vcc_hi
+; GCN-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GCN-NEXT: global_atomic_csub v0, v[2:3], v1, off offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data)
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps void @global_csub_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GCN-LABEL: global_csub_saddr_i32_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GCN-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GCN-NEXT: global_atomic_csub v0, v[2:3], v1, off glc
+; GCN-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data)
+ ret void
+}
+
+define amdgpu_ps void @global_csub_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GCN-LABEL: global_csub_saddr_i32_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GCN-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GCN-NEXT: global_atomic_csub v0, v[2:3], v1, off offset:-128 glc
+; GCN-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = call i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.global.atomic.csub.p1i32(i32 addrspace(1)* nocapture, i32) #0
+
+attributes #0 = { argmemonly nounwind willreturn }
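All of these tests share one addressing idiom: a uniform (`inreg`, SGPR) base pointer plus a 32-bit VGPR offset that is zero-extended and folded into the address with an `inbounds` GEP. As a minimal standalone sketch (distilled from the tests above, not part of the patch; the function name @saddr_pattern is hypothetical), the pattern reduces to:

  ; hypothetical reduced example of the base + zero-extended VGPR offset idiom
  define amdgpu_ps void @saddr_pattern(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
    ; zero-extend the 32-bit VGPR offset and add it onto the uniform base
    %zext.offset = zext i32 %voffset to i64
    %gep = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
    %cast = bitcast i8 addrspace(1)* %gep to i32 addrspace(1)*
    %unused = atomicrmw xchg i32 addrspace(1)* %cast, i32 %data seq_cst
    ret void
  }

The saddr form of the global_* instructions keeps the base in an SGPR pair and takes only the offset from a VGPR; the checks here record the current codegen (a full 64-bit VGPR address add), presumably as a baseline so that a later change to select the saddr form shows up in the generated assertions.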
Index: llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx908 < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+
+; Test using saddr addressing mode of global_* flat atomic instructions.
+
+; --------------------------------------------------------------------------------
+; amdgcn global atomic fadd
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps void @global_fadd_saddr_f32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) {
+; GCN-LABEL: global_fadd_saddr_f32_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: global_atomic_add_f32 v[2:3], v1, off
+; GCN-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
+ call void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep0, float %data)
+ ret void
+}
+
+define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) {
+; GCN-LABEL: global_fadd_saddr_f32_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: global_atomic_add_f32 v[2:3], v1, off offset:-128
+; GCN-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
+ call void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* %cast.gep1, float %data)
+ ret void
+}
+
+define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x half> %data) {
+; GCN-LABEL: global_fadd_saddr_v2f16_nortn:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: global_atomic_pk_add_f16 v[2:3], v1, off
+; GCN-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)*
+ call void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep0, <2 x half> %data)
+ ret void
+}
+
+define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x half> %data) {
+; GCN-LABEL: global_fadd_saddr_v2f16_nortn_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_mov_b32_e32 v3, s3
+; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT: global_atomic_pk_add_f16 v[2:3], v1, off offset:-128
+; GCN-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)*
+ call void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* %cast.gep1, <2 x half> %data)
+ ret void
+}
+
+declare void @llvm.amdgcn.global.atomic.fadd.f32.p1f32(float addrspace(1)* nocapture, float) #0
+declare void @llvm.amdgcn.global.atomic.fadd.v2f16.p1v2f16(<2 x half> addrspace(1)* nocapture, <2 x half>) #0
+
+attributes #0 = { argmemonly nounwind willreturn }
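A second knob the tests turn is a constant byte offset applied on top of the VGPR offset (the _neg128, _offset_2047, _2048 variants). When the constant fits in the instruction's signed immediate offset field, it folds into the `offset:` operand; otherwise it must be materialized with an extra add. A minimal sketch of the folding case (again a hypothetical reduced example, not part of the patch):

  define amdgpu_ps void @saddr_pattern_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
    %zext.offset = zext i32 %voffset to i64
    %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
    ; -128 fits the immediate field, so it becomes "off offset:-128" in the checks
    %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
    %cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
    %unused = atomicrmw xchg i32 addrspace(1)* %cast, i32 %data seq_cst
    ret void
  }

The non-folding case is visible in @global_xchg_saddr_i32_rtn_2048 in the next file: on gfx10 (per the tests' own comments, 2047/-2048 are the immediate limits there) the 0x800 constant is out of range and is added with a separate v_add_co_u32 before the atomic.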
Index: llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll
@@ -0,0 +1,3536 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
+
+; Test using saddr addressing mode of global_* flat atomic instructions.
+
+define amdgpu_ps void @global_xchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xchg_saddr_i32_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xchg_saddr_i32_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap v[2:3], v1, off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ ret void
+}
+
+; Maximum positive offset on gfx10
+define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap v[2:3], v1, off offset:2047
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap v[2:3], v1, off offset:2047
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+; Maximum negative offset on gfx10
+define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap v[2:3], v1, off offset:-2048
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap v[2:3], v1, off offset:-2048
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps float @global_xchg_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xchg_saddr_i32_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xchg_saddr_i32_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xchg_saddr_i32_rtn_2048:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:2048 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xchg_saddr_i32_rtn_2048:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, 0x800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xchg_saddr_i32_rtn_neg2048:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:-2048 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xchg_saddr_i32_rtn_neg2048:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:-2048 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+; --------------------------------------------------------------------------------
+; Uniformity edge cases
+; --------------------------------------------------------------------------------
+
+@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef
+
+; Base pointer is uniform, but also in VGPRs
+define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: ds_read_b64 v[2:3], v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ds_read_b64 v[2:3], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+; Base pointer is uniform, but also in VGPRs, with imm offset
+define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: ds_read_b64 v[2:3], v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:42 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ds_read_b64 v[2:3], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:42 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+; Base pointer is uniform, but also in VGPRs
+define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: ds_read_b64 v[2:3], v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: ds_read_b64 v[2:3], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap v[2:3], v1, off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ ret void
+}
+
+; Base pointer is uniform, but also in VGPRs, with imm offset
+define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: ds_read_b64 v[2:3], v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap v[2:3], v1, off offset:42
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: ds_read_b64 v[2:3], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap v[2:3], v1, off offset:42
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; All atomicrmw ops
+; --------------------------------------------------------------------------------
+
+; --------------------------------------------------------------------------------
+; atomicrmw xchg
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_xchg_saddr_i64_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xchg_saddr_i64_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %rtn = atomicrmw xchg i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_xchg_saddr_i64_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xchg_saddr_i64_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %rtn = atomicrmw xchg i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps void @global_xchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_xchg_saddr_i64_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xchg_saddr_i64_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %unused = atomicrmw xchg i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_xchg_saddr_i64_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xchg_saddr_i64_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %unused = atomicrmw xchg i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; atomicrmw add
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @global_add_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_add_saddr_i32_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_add v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_add_saddr_i32_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_add v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw add i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_add_saddr_i32_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_add v0, v[2:3], v1, off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_add_saddr_i32_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_add v0, v[2:3], v1, off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw add i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps void @global_add_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_add_saddr_i32_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_add v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_add_saddr_i32_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_add v[2:3], v1, off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = atomicrmw add i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_add_saddr_i32_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_add v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_add_saddr_i32_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_add v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw add i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_add_saddr_i64_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_add_saddr_i64_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %rtn = atomicrmw add i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_add_saddr_i64_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_add_saddr_i64_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %rtn = atomicrmw add i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps void @global_add_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_add_saddr_i64_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_add_saddr_i64_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %unused = atomicrmw add i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_add_saddr_i64_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_add_saddr_i64_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %unused = atomicrmw add i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; atomicrmw sub
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @global_sub_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_sub_saddr_i32_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_sub v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_sub_saddr_i32_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_sub v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw sub i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_sub_saddr_i32_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_sub v0, v[2:3], v1, off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_sub_saddr_i32_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_sub v0, v[2:3], v1, off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw sub i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps void @global_sub_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_sub_saddr_i32_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_sub v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_sub_saddr_i32_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_sub v[2:3], v1, off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = atomicrmw sub i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_sub_saddr_i32_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_sub v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_sub_saddr_i32_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_sub v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw sub i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_sub_saddr_i64_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_sub_saddr_i64_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %rtn = atomicrmw sub i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_sub_saddr_i64_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_sub_saddr_i64_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %rtn = atomicrmw sub i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps void @global_sub_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_sub_saddr_i64_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_sub_saddr_i64_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %unused = atomicrmw sub i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_sub_saddr_i64_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_sub_saddr_i64_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %unused = atomicrmw sub i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; atomicrmw and
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @global_and_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_and_saddr_i32_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_and v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_and_saddr_i32_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_and v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw and i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_and_saddr_i32_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_and v0, v[2:3], v1, off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_and_saddr_i32_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_and v0, v[2:3], v1, off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw and i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps void @global_and_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_and_saddr_i32_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_and v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_and_saddr_i32_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_and v[2:3], v1, off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = atomicrmw and i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_and_saddr_i32_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_and v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_and_saddr_i32_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_and v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw and i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_and_saddr_i64_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_and_saddr_i64_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %rtn = atomicrmw and i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_and_saddr_i64_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_and_saddr_i64_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %rtn = atomicrmw and i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps void @global_and_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_and_saddr_i64_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_and_saddr_i64_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %unused = atomicrmw and i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_and_saddr_i64_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_and_saddr_i64_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %unused = atomicrmw and i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; atomicrmw or
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @global_or_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_or_saddr_i32_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_or v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_or_saddr_i32_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_or v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw or i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_or_saddr_i32_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_or v0, v[2:3], v1, off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_or_saddr_i32_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_or v0, v[2:3], v1, off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw or i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps void @global_or_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_or_saddr_i32_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_or v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_or_saddr_i32_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_or v[2:3], v1, off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = atomicrmw or i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_or_saddr_i32_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_or v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_or_saddr_i32_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_or v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw or i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_or_saddr_i64_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_or_saddr_i64_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %rtn = atomicrmw or i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_or_saddr_i64_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_or_saddr_i64_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %rtn = atomicrmw or i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps void @global_or_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_or_saddr_i64_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_or_saddr_i64_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %unused = atomicrmw or i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_or_saddr_i64_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_or_saddr_i64_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %unused = atomicrmw or i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; atomicrmw xor
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @global_xor_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xor_saddr_i32_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_xor v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xor_saddr_i32_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_xor v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw xor i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xor_saddr_i32_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_xor v0, v[2:3], v1, off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xor_saddr_i32_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_xor v0, v[2:3], v1, off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw xor i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps void @global_xor_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xor_saddr_i32_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_xor v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xor_saddr_i32_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_xor v[2:3], v1, off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = atomicrmw xor i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_xor_saddr_i32_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_xor v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xor_saddr_i32_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_xor v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw xor i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_xor_saddr_i64_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xor_saddr_i64_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %rtn = atomicrmw xor i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_xor_saddr_i64_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_xor_saddr_i64_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %rtn = atomicrmw xor i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps void @global_xor_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_xor_saddr_i64_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xor_saddr_i64_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %unused = atomicrmw xor i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_xor_saddr_i64_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_xor_saddr_i64_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %unused = atomicrmw xor i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; atomicrmw max
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @global_max_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_max_saddr_i32_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smax v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_max_saddr_i32_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smax v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw max i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smax v0, v[2:3], v1, off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smax v0, v[2:3], v1, off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw max i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps void @global_max_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_max_saddr_i32_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smax v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_max_saddr_i32_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smax v[2:3], v1, off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = atomicrmw max i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smax v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smax v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw max i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_max_saddr_i64_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_max_saddr_i64_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %rtn = atomicrmw max i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %rtn = atomicrmw max i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps void @global_max_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_max_saddr_i64_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_max_saddr_i64_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %unused = atomicrmw max i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %unused = atomicrmw max i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; atomicrmw min
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @global_min_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_min_saddr_i32_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smin v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_min_saddr_i32_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smin v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw min i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smin v0, v[2:3], v1, off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smin v0, v[2:3], v1, off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw min i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps void @global_min_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_min_saddr_i32_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smin v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_min_saddr_i32_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smin v[2:3], v1, off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = atomicrmw min i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smin v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smin v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw min i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_min_saddr_i64_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_min_saddr_i64_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %rtn = atomicrmw min i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %rtn = atomicrmw min i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps void @global_min_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_min_saddr_i64_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_min_saddr_i64_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %unused = atomicrmw min i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %unused = atomicrmw min i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; atomicrmw umax
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @global_umax_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_umax_saddr_i32_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_umax v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_umax_saddr_i32_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_umax v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw umax i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_umax v0, v[2:3], v1, off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_umax v0, v[2:3], v1, off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %rtn = atomicrmw umax i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps void @global_umax_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_umax_saddr_i32_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_umax v[2:3], v1, off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_umax_saddr_i32_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_umax v[2:3], v1, off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %unused = atomicrmw umax i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_umax v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_umax v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ %unused = atomicrmw umax i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_umax_saddr_i64_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_umax_saddr_i64_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %rtn = atomicrmw umax i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %rtn = atomicrmw umax i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ %cast.rtn = bitcast i64 %rtn to <2 x float>
+ ret <2 x float> %cast.rtn
+}
+
+define amdgpu_ps void @global_umax_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_umax_saddr_i64_nortn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_umax_x2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_umax_saddr_i64_nortn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_umax_x2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ %unused = atomicrmw umax i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
+ ret void
+}
+
+define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_umax_x2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_umax_x2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ %unused = atomicrmw umax i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; atomicrmw umin
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps float @global_umin_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_umin_saddr_i32_rtn:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_umin v0, v[2:3], v1, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_umin_saddr_i32_rtn:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_umin v0, v[2:3], v1, off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ %rtn = atomicrmw umin i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
+ %cast.rtn = bitcast i32 %rtn to float
+ ret float %cast.rtn
+}
+
+define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_umin v0, v[2:3], v1, off offset:-128 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1
+; GFX9-NEXT: ; return to shader part epilog
+;
+; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_umin v0, v[2:3], v1, off offset:-128 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: ; return to shader part epilog
+ %zext.offset = zext i32 %voffset to i64
%voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %rtn = atomicrmw umin i32 addrspace(1)* %cast.gep1, i32 %data seq_cst + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @global_umin_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { +; GFX9-LABEL: global_umin_saddr_i32_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_umin v[2:3], v1, off +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_umin_saddr_i32_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_umin v[2:3], v1, off +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* + %unused = atomicrmw umin i32 addrspace(1)* %cast.gep0, i32 %data seq_cst + ret void +} + +define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { +; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_umin v[2:3], v1, off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_umin v[2:3], v1, off offset:-128 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %unused = atomicrmw umin i32 addrspace(1)* %cast.gep1, i32 %data seq_cst + ret void +} + +define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_umin_saddr_i64_rtn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_umin_saddr_i64_rtn: +; 
GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* + %rtn = atomicrmw umin i64 addrspace(1)* %cast.gep0, i64 %data seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* + %rtn = atomicrmw umin i64 addrspace(1)* %cast.gep1, i64 %data seq_cst + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @global_umin_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_umin_saddr_i64_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_umin_x2 v[3:4], v[1:2], off +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_umin_saddr_i64_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_umin_x2 v[3:4], v[1:2], off +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* + %unused = atomicrmw umin i64 addrspace(1)* %cast.gep0, i64 %data seq_cst + ret void +} + 
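+; Note (hand-written, not part of the generated checks): in the no-return
+; cases above, gfx10 waits with s_waitcnt_vscnt before invalidating caches,
+; since an atomic with no returned value counts against the store counter,
+; while gfx9 waits on vmcnt in both forms; compare the rtn variants, which
+; wait on vmcnt(0) on both targets. Every test in this section exercises the
+; same address shape (a sketch repeated from the bodies below, using only
+; names that already appear in them):
+;   %zext.offset = zext i32 %voffset to i64
+;   %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+; i.e. a uniform SGPR base plus a zero-extended divergent 32-bit offset,
+; which is the shape the global saddr addressing mode is meant to fold.
+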
+define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_umin_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_umin_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* + %unused = atomicrmw umin i64 addrspace(1)* %cast.gep1, i64 %data seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; cmpxchg +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { +; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_cmpxchg_saddr_i32_rtn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* + %cmpxchg = cmpxchg i32 addrspace(1)* %cast.gep0, i32 %cmp, i32 %data seq_cst seq_cst + %rtn = extractvalue { i32, i1 } %cmpxchg, 0 + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { +; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:-128 glc +; 
GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_cmpxchg_saddr_i32_rtn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:-128 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %cmpxchg = cmpxchg i32 addrspace(1)* %cast.gep1, i32 %cmp, i32 %data seq_cst seq_cst + %rtn = extractvalue { i32, i1 } %cmpxchg, 0 + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { +; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_cmpxchg_saddr_i32_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* + %unused = cmpxchg i32 addrspace(1)* %cast.gep0, i32 %cmp, i32 %data seq_cst seq_cst + ret void +} + +define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) { +; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_cmpxchg_saddr_i32_nortn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:-128 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; 
GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %unused = cmpxchg i32 addrspace(1)* %cast.gep1, i32 %cmp, i32 %data seq_cst seq_cst + ret void +} + +define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { +; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_cmpxchg_saddr_i64_rtn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* + %cmpxchg = cmpxchg i64 addrspace(1)* %cast.gep0, i64 %cmp, i64 %data seq_cst seq_cst + %rtn = extractvalue { i64, i1 } %cmpxchg, 0 + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { +; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off offset:-128 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_cmpxchg_saddr_i64_rtn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off offset:-128 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* + 
%cmpxchg = cmpxchg i64 addrspace(1)* %cast.gep1, i64 %cmp, i64 %data seq_cst seq_cst + %rtn = extractvalue { i64, i1 } %cmpxchg, 0 + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { +; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v[3:6], off +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_cmpxchg_saddr_i64_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v[3:6], off +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* + %unused = cmpxchg i64 addrspace(1)* %cast.gep0, i64 %cmp, i64 %data seq_cst seq_cst + ret void +} + +define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { +; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v5, v1 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v6, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v[3:6], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_cmpxchg_saddr_i64_nortn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 +; GFX10-NEXT: v_mov_b32_e32 v6, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v[3:6], off offset:-128 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* + %unused = cmpxchg i64 addrspace(1)* %cast.gep1, i64 %cmp, i64 %data seq_cst seq_cst + ret void +} + +; -------------------------------------------------------------------------------- +; amdgcn atomic inc +; -------------------------------------------------------------------------------- + +declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 
addrspace(1)* nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 + +define amdgpu_ps float @global_inc_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { +; GFX9-LABEL: global_inc_saddr_i32_rtn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_atomic_inc v0, v[2:3], v1, off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_inc_saddr_i32_rtn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_inc v0, v[2:3], v1, off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* + %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false) + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { +; GFX9-LABEL: global_inc_saddr_i32_rtn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_atomic_inc v0, v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_inc_saddr_i32_rtn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_inc v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false) + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @global_inc_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { +; GFX9-LABEL: global_inc_saddr_i32_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_atomic_inc v[2:3], v1, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_inc_saddr_i32_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_inc v[2:3], v1, off +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* + %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { +; GFX9-LABEL: 
global_inc_saddr_i32_nortn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_atomic_inc v[2:3], v1, off offset:-128 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_inc_saddr_i32_nortn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_inc v[2:3], v1, off offset:-128 +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_inc_saddr_i64_rtn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_inc_saddr_i64_rtn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* + %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false) + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_inc_saddr_i64_rtn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_inc_saddr_i64_rtn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* + %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false) + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @global_inc_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: 
global_inc_saddr_i64_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: global_atomic_inc_x2 v[3:4], v[1:2], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_inc_saddr_i64_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_inc_x2 v[3:4], v[1:2], off +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* + %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_inc_saddr_i64_nortn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: global_atomic_inc_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_inc_saddr_i64_nortn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_inc_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* + %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false) + ret void +} + +; -------------------------------------------------------------------------------- +; amdgcn atomic dec +; -------------------------------------------------------------------------------- + +declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0 +declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 + +define amdgpu_ps float @global_dec_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { +; GFX9-LABEL: global_dec_saddr_i32_rtn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_atomic_dec v0, v[2:3], v1, off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_dec_saddr_i32_rtn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_dec v0, v[2:3], v1, off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* + %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false) + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg 
%sbase, i32 %voffset, i32 %data) { +; GFX9-LABEL: global_dec_saddr_i32_rtn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_atomic_dec v0, v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_dec_saddr_i32_rtn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_dec v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false) + %cast.rtn = bitcast i32 %rtn to float + ret float %cast.rtn +} + +define amdgpu_ps void @global_dec_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { +; GFX9-LABEL: global_dec_saddr_i32_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_atomic_dec v[2:3], v1, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_dec_saddr_i32_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_dec v[2:3], v1, off +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* + %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { +; GFX9-LABEL: global_dec_saddr_i32_nortn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_atomic_dec v[2:3], v1, off offset:-128 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_dec_saddr_i32_nortn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_dec v[2:3], v1, off offset:-128 +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_dec_saddr_i64_rtn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off glc +; 
GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_dec_saddr_i64_rtn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* + %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false) + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_dec_saddr_i64_rtn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_dec_saddr_i64_rtn_neg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* + %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false) + %cast.rtn = bitcast i64 %rtn to <2 x float> + ret <2 x float> %cast.rtn +} + +define amdgpu_ps void @global_dec_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_dec_saddr_i64_nortn: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: global_atomic_dec_x2 v[3:4], v[1:2], off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_dec_saddr_i64_nortn: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_dec_x2 v[3:4], v[1:2], off +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* + %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false) + ret void +} + +define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { +; GFX9-LABEL: global_dec_saddr_i64_nortn_neg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: global_atomic_dec_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_dec_saddr_i64_nortn_neg128: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 +; GFX10-NEXT: global_atomic_dec_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: s_endpgm + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* + %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false) + ret void +} + +attributes #0 = { argmemonly nounwind willreturn } Index: llvm/test/CodeGen/AMDGPU/global-saddr-load.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -0,0 +1,2983 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +; Test using saddr addressing mode of global_*load_* flat instructions. + +; -------------------------------------------------------------------------------- +; Basic addressing patterns +; -------------------------------------------------------------------------------- + +; Basic pattern, no immediate offset. +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum positive offset on gfx9 +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset 
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum positive offset on gfx9 + 1 +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4096 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum negative offset on gfx9 +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4096 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum negative offset on gfx9 - 1 +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4097(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff000, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: 
global_load_saddr_i8_zext_vgpr_offset_neg4097: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -4097 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum positive offset on gfx10 +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum positive offset on gfx10 + 1 +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum negative offset on gfx10 +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 
s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum negative offset on gfx10 - 1 +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2049 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Maximum positive offset on gfx9, and immediate needs to be moved lower. 
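+; The 4095 sits on the base GEP ahead of the variable offset, so the address
+; computation has to be reassociated before the immediate can fold; the
+; checks below then match the plain offset_4095 case above.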
+define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 4095 + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 %zext.offset + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; pointer addressing done in integers +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 + %add = add i64 %sbase.as.int, %zext.offset + %dirty.gep = inttoptr i64 %add to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; zext forced to LHS of addressing expression +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, v0, s2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s3, s0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %dirty.gep = inttoptr i64 %add to i8 addrspace(1)* + 
%load = load i8, i8 addrspace(1)* %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; zext forced to LHS of addressing expression, with immediate offset +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, v0, s2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s3, s0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %add.immoffset = add i64 %add, 128 + %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; zext forced to LHS of addressing expression, with immediate offset in non-canonical position +define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 + %add.immoffset = add i64 %sbase.as.int, 128 + %add = add i64 %zext.offset, %add.immoffset + %dirty.gep = inttoptr i64 %add to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Uniformity edge cases +; -------------------------------------------------------------------------------- + +@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef + +; Base pointer is uniform, but also in VGPRs +define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { +; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_read_b64 v[1:2], v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: 
global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ds_read_b64 v[1:2], v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Base pointer is uniform, but also in VGPRs, with imm offset +define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) { +; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_read_b64 v[1:2], v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:42 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: ds_read_b64 v[1:2], v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v1, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:42 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both 64-bit base and 32-bit offset are scalar +define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { +; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s2, s4 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: global_load_ubyte v0, v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both 64-bit base and 32-bit offset are scalar, with immediate offset. 
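+; Note: with a fully uniform address, the 64-bit add is expected to stay on
+; the scalar ALU (s_add_u32/s_addc_u32), with only the final address copied
+; into a VGPR pair for the load; the byte displacement from the second GEP
+; still folds into the instruction's immediate offset field, as in the
+; checks below (offset:-24).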
+define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { +; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s2, s4 +; GCN-NEXT: s_addc_u32 s1, s3, 0 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: global_load_ubyte v0, v[0:1], off offset:-24 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -24 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both components uniform, zext forced to LHS of addressing expression +define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { +; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s4, s2 +; GCN-NEXT: s_addc_u32 s1, 0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: global_load_ubyte v0, v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %dirty.gep = inttoptr i64 %add to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; Both components uniform, zext forced to LHS of addressing expression, with immediate offset +define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { +; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: +; GCN: ; %bb.0: +; GCN-NEXT: s_add_u32 s0, s4, s2 +; GCN-NEXT: s_addc_u32 s1, 0, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: ; implicit-def: $vcc_hi +; GCN-NEXT: global_load_ubyte v0, v[0:1], off offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 + %add = add i64 %zext.offset, %sbase.as.int + %add.immoffset = add i64 %add, 128 + %dirty.gep = inttoptr i64 %add.immoffset to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %dirty.gep + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; divergent 64-bit base, 32-bit scalar offset. 
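+; Note: with a divergent base the SADDR form is not usable, so the uniform
+; 32-bit offset is folded into the VALU add instead (v_add_co_u32 ... v0, s2).
+; In the offset_4095 variant below, GFX9 folds the full 4095 into the
+; instruction immediate, while GFX10 splits it as 0x800 + 2047, which
+; suggests a smaller signed immediate range on GFX10 (2047 appears to be
+; the largest positive offset it accepts here).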
+define amdgpu_ps float @global_load_i8_vgpr64_sgpr32(i8 addrspace(1)* %vbase, i32 inreg %soffset) { +; GFX9-LABEL: global_load_i8_vgpr64_sgpr32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_i8_vgpr64_sgpr32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, s2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset + %load = load i8, i8 addrspace(1)* %gep0 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; divergent 64-bit base, 32-bit scalar offset, with imm offset +define amdgpu_ps float @global_load_i8_vgpr64_sgpr32_offset_4095(i8 addrspace(1)* %vbase, i32 inreg %soffset) { +; GFX9-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, s2 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %soffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %vbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 4095 + %load = load i8, i8 addrspace(1)* %gep1 + %zext = zext i8 %load to i32 + %to.vgpr = bitcast i32 %zext to float + ret float %to.vgpr +} + +; -------------------------------------------------------------------------------- +; Natural addressing shifts with restricted range +; -------------------------------------------------------------------------------- + +; Cannot push the shift into 32-bits, and cannot match. 
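+; Note: scaling the zero-extended offset by 4 for the f32 GEP may produce a
+; value that no longer fits in 32 bits (up to (2^32 - 1) * 4), so without
+; range information the shift must be done as a 64-bit v_lshlrev_b64, and
+; the result is no longer a zext of a 32-bit value, which is presumably why
+; the SADDR pattern cannot be matched below.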
+define amdgpu_ps float @global_load_saddr_f32_natural_addressing(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { +; GFX9-LABEL: global_load_saddr_f32_natural_addressing: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_f32_natural_addressing: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %voffset = load i32, i32 addrspace(1)* %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset + %load = load float, float addrspace(1)* %gep + ret float %load +} + +; Cannot push the shift into 32-bits, with an immediate offset. +define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { +; GFX9-LABEL: global_load_saddr_f32_natural_addressing_immoffset: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_f32_natural_addressing_immoffset: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %voffset = load i32, i32 addrspace(1)* %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)* + %load = load float, float addrspace(1)* %gep1.cast + ret float %load +} + +; Range is sufficiently restricted to push the shift into 32-bits. 
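+; Note: the !0 range metadata is defined at the end of the file and not
+; shown here; for the 32-bit shift below to be safe it must bound %voffset
+; below 2^30, since a value v < 2^30 guarantees v * 4 < 2^32, and the scaled
+; offset then remains a zext of a 32-bit value that the SADDR pattern can use.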
+define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { +; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0 + %zext.offset = zext i32 %voffset to i64 + %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset + %load = load float, float addrspace(1)* %gep + ret float %load +} + +; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset +define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { +; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:400 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:400 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0 + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds float, float addrspace(1)* %gep0, i64 100 + %load = load float, float addrspace(1)* %gep1 + ret float %load +} + +; Range is 1 beyond the limit where we can move the shift into 32-bits. 
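+; Note: assuming !1 is one past the bound described above, the worst case
+; 2^30 * 4 = 2^32 no longer fits in 32 bits, so codegen must fall back to
+; the 64-bit v_lshlrev_b64 sequence seen below and the SADDR form is not
+; matched.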
+define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_too_large(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { +; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_too_large: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !1 + %zext.offset = zext i32 %voffset to i64 + %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset + %load = load float, float addrspace(1)* %gep + ret float %load +} + +; -------------------------------------------------------------------------------- +; Stress various type loads +; -------------------------------------------------------------------------------- + +define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep0.cast + %cast.load = bitcast i16 %load to half + ret half %cast.load +} + +define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i16_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds 
i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep1.cast + %cast.load = bitcast i16 %load to half + ret half %cast.load +} + +define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)* + %load = load half, half addrspace(1)* %gep0.cast + ret half %load +} + +define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_f16_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_f16_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to half addrspace(1)* + %load = load half, half addrspace(1)* %gep1.cast + ret half %load +} + +define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* + %load = load i32, i32 addrspace(1)* %gep0.cast + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg 
%sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i32_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i32_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %load = load i32, i32 addrspace(1)* %gep1.cast + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)* + %load = load float, float addrspace(1)* %gep0.cast + ret float %load +} + +define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_f32_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_f32_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)* + %load = load float, float addrspace(1)* %gep1.cast + ret float %load +} + +define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, 
v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i16> addrspace(1)* + %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep0.cast + %cast.load = bitcast <2 x i16> %load to <2 x half> + ret <2 x half> %cast.load +} + +define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2i16_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2i16_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i16> addrspace(1)* + %load = load <2 x i16>, <2 x i16> addrspace(1)* %gep1.cast + %cast.load = bitcast <2 x i16> %load to <2 x half> + ret <2 x half> %cast.load +} + +define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)* + %load = load <2 x half>, <2 x half> addrspace(1)* %gep0.cast + ret <2 x half> %load +} + +define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2f16_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2f16_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: 
v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x half> addrspace(1)* + %load = load <2 x half>, <2 x half> addrspace(1)* %gep1.cast + ret <2 x half> %load +} + +define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_p3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_p3: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)* + %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep0.cast + %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32 + %cast.load1 = bitcast i32 %cast.load0 to <2 x half> + ret <2 x half> %cast.load1 +} + +define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_p3_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_p3_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(3)* addrspace(1)* + %load = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %gep1.cast + %cast.load0 = ptrtoint i8 addrspace(3)* %load to i32 + %cast.load1 = bitcast i32 %cast.load0 to <2 x half> + ret <2 x half> %cast.load1 +} + +define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_f64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_f64: +; GFX10: ; %bb.0: +; 
GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)* + %load = load double, double addrspace(1)* %gep0.cast + %cast.load = bitcast double %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_f64_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_f64_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)* + %load = load double, double addrspace(1)* %gep1.cast + %cast.load = bitcast double %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* + %load = load i64, i64 addrspace(1)* %gep0.cast + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i64_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i64_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* + %load = load i64, i64 addrspace(1)* %gep1.cast + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)* + %load = load <2 x float>, <2 x float> addrspace(1)* %gep0.cast + ret <2 x float> %load +} + +define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2f32_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2f32_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)* + %load = load <2 x float>, <2 x float> addrspace(1)* %gep1.cast + ret <2 x float> %load +} + +define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 
%voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)* + %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep0.cast + %cast.load = bitcast <2 x i32> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2i32_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2i32_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)* + %load = load <2 x i32>, <2 x i32> addrspace(1)* %gep1.cast + %cast.load = bitcast <2 x i32> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v4i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v4i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)* + %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep0.cast + %cast.load = bitcast <4 x i16> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v4i16_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v4i16_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 
addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)* + %load = load <4 x i16>, <4 x i16> addrspace(1)* %gep1.cast + %cast.load = bitcast <4 x i16> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v4f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v4f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)* + %load = load <4 x half>, <4 x half> addrspace(1)* %gep0.cast + %cast.load = bitcast <4 x half> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v4f16_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v4f16_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)* + %load = load <4 x half>, <4 x half> addrspace(1)* %gep1.cast + %cast.load = bitcast <4 x half> %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_p1: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_p1: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + 
%gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)* + %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep0.cast + %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64 + %cast.load1 = bitcast i64 %cast.load0 to <2 x float> + ret <2 x float> %cast.load1 +} + +define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_p1_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_p1_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)* + %load = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %gep1.cast + %cast.load0 = ptrtoint i8 addrspace(1)* %load to i64 + %cast.load1 = bitcast i64 %cast.load0 to <2 x float> + ret <2 x float> %cast.load1 +} + +define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v3f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v3f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)* + %load = load <3 x float>, <3 x float> addrspace(1)* %gep0.cast + ret <3 x float> %load +} + +define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v3f32_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v3f32_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 
addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)* + %load = load <3 x float>, <3 x float> addrspace(1)* %gep1.cast + ret <3 x float> %load +} + +define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v3i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v3i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)* + %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep0.cast + %cast.load = bitcast <3 x i32> %load to <3 x float> + ret <3 x float> %cast.load +} + +define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v3i32_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v3i32_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)* + %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep1.cast + %cast.load = bitcast <3 x i32> %load to <3 x float> + ret <3 x float> %cast.load +} + +define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v6f16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v6f16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x 
half> addrspace(1)* + %load = load <6 x half>, <6 x half> addrspace(1)* %gep0.cast + ret <6 x half> %load +} + +define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v6f16_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v6f16_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)* + %load = load <6 x half>, <6 x half> addrspace(1)* %gep1.cast + ret <6 x half> %load +} + +define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v4f32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v4f32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)* + %load = load <4 x float>, <4 x float> addrspace(1)* %gep0.cast + ret <4 x float> %load +} + +define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v4f32_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v4f32_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)* + %load = load <4 x float>, <4 x float> addrspace(1)* %gep1.cast + ret <4 x float> %load +} + +define amdgpu_ps <4 x float> 
@global_load_saddr_v4i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v4i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v4i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)* + %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep0.cast + %cast.load = bitcast <4 x i32> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v4i32_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v4i32_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)* + %load = load <4 x i32>, <4 x i32> addrspace(1)* %gep1.cast + %cast.load = bitcast <4 x i32> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)* + %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep0.cast + %cast.load = bitcast <2 x i64> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: 
global_load_saddr_v2i64_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2i64_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)* + %load = load <2 x i64>, <2 x i64> addrspace(1)* %gep1.cast + %cast.load = bitcast <2 x i64> %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @global_load_saddr_i128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)* + %load = load i128, i128 addrspace(1)* %gep0.cast + %cast.load = bitcast i128 %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i128_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i128_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i128 addrspace(1)* + %load = load i128, i128 addrspace(1)* %gep1.cast + %cast.load = bitcast i128 %load to <4 x float> + ret <4 x float> %cast.load +} + +define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2p1: +; GFX9: ; %bb.0: +; 
GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2p1: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)* + %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast + %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64> + %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v2p1_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v2p1_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)* + %load = load <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast + %cast.load0 = ptrtoint <2 x i8 addrspace(1)*> %load to <2 x i64> + %cast.load1 = bitcast <2 x i64> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v4p3: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v4p3: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)* + %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast + %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32> + %cast.load1 = bitcast <4 x i32> 
%cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_v4p3_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_v4p3_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)* + %load = load <4 x i8 addrspace(3)*>, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast + %cast.load0 = ptrtoint <4 x i8 addrspace(3)*> %load to <4 x i32> + %cast.load1 = bitcast <4 x i32> %cast.load0 to <4 x float> + ret <4 x float> %cast.load1 +} + +; -------------------------------------------------------------------------------- +; Extending loads +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_sextload_saddr_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_sbyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_sextload_saddr_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_sbyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %load = load i8, i8 addrspace(1)* %gep0 + %sextload = sext i8 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_sextload_saddr_i8_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_sbyte v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_sextload_saddr_i8_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_sbyte v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = 
getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %load = load i8, i8 addrspace(1)* %gep1 + %sextload = sext i8 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_sextload_saddr_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_sshort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_sextload_saddr_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_sshort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep0.cast + %sextload = sext i16 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_sextload_saddr_i16_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_sshort v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_sextload_saddr_i16_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_sshort v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep1.cast + %sextload = sext i16 %load to i32 + %cast.load = bitcast i32 %sextload to float + ret float %cast.load +} + +define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_zextload_saddr_i8: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_zextload_saddr_i8: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %load = load i8, i8 addrspace(1)* %gep0 + %zextload = zext i8 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + 
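+; Note (editorial, mirroring the checks above and below): as with the sign-
+; and zero-extending cases already shown, the *_immneg128 variant that
+; follows should fold the -128 byte displacement into the load instruction's
+; immediate offset field (off offset:-128) rather than emitting an extra
+; address add beyond the base-plus-voffset computation.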
+define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_zextload_saddr_i8_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_zextload_saddr_i8_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %load = load i8, i8 addrspace(1)* %gep1 + %zextload = zext i8 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + +define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_zextload_saddr_i16: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_zextload_saddr_i16: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep0.cast + %zextload = zext i16 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + +define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_zextload_saddr_i16_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_zextload_saddr_i16_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep1.cast + %zextload = zext i16 %load to i32 + %cast.load = bitcast i32 %zextload to float + ret float %cast.load +} + +; -------------------------------------------------------------------------------- +; 
Atomic load +; -------------------------------------------------------------------------------- + +define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: atomic_global_load_saddr_i32: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: atomic_global_load_saddr_i32: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* + %load = load atomic i32, i32 addrspace(1)* %gep0.cast seq_cst, align 4 + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)* + %load = load atomic i32, i32 addrspace(1)* %gep1.cast seq_cst, align 4 + %cast.load = bitcast i32 %load to float + ret float %cast.load +} + +define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: atomic_global_load_saddr_i64: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: atomic_global_load_saddr_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: 
v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* + %load = load atomic i64, i64 addrspace(1)* %gep0.cast seq_cst, align 8 + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: buffer_wbinvl1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 glc dlc +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: buffer_gl0_inv +; GFX10-NEXT: buffer_gl1_inv +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)* + %load = load atomic i64, i64 addrspace(1)* %gep1.cast seq_cst, align 8 + %cast.load = bitcast i64 %load to <2 x float> + ret <2 x float> %cast.load +} + +; -------------------------------------------------------------------------------- +; D16 load (low 16) +; -------------------------------------------------------------------------------- + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i16_d16lo_undef_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_short_d16 v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16lo_undef_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_short_d16 v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep0.cast + %build = insertelement <2 x i16> undef, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> 
@global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_short_d16 v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_short_d16 v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep1.cast + %build = insertelement <2 x i16> undef, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i16_d16lo_zero_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_short_d16 v1, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16lo_zero_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: global_load_short_d16 v1, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep0.cast + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_short_d16 v1, v[2:3], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: global_load_short_d16 v1, v[2:3], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to 
shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep1.cast + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16lo_reg_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_short_d16 v1, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16lo_reg_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_short_d16 v1, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep0.cast + %build = insertelement <2 x i16> %reg, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_short_d16 v1, v[2:3], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_short_d16 v1, v[2:3], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep1.cast + %build = insertelement <2 x i16> %reg, i16 %load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_ubyte_d16 v1, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 
v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte_d16 v1, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %gep0.cast + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_ubyte_d16 v1, v[2:3], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte_d16 v1, v[2:3], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %gep1.cast + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_sbyte_d16 v1, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_sbyte_d16 v1, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %gep0.cast + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 + %cast = bitcast <2 x i16> %build to 
<2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_sbyte_d16 v1, v[2:3], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_sbyte_d16 v1, v[2:3], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %gep1.cast + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 0 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +; -------------------------------------------------------------------------------- +; D16 hi load (hi16) +; -------------------------------------------------------------------------------- + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i16_d16hi_undef_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_short_d16_hi v0, v[0:1], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16hi_undef_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep0.cast + %build = insertelement <2 x i16> undef, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_short_d16_hi v0, v[0:1], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; 
GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep1.cast + %build = insertelement <2 x i16> undef, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i16_d16hi_zero_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_short_d16_hi v0, v[1:2], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16hi_zero_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v1, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s3, 0, s0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep0.cast + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GFX9-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_short_d16_hi v0, v[1:2], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v1, s0, s2, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s3, 0, s0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep1.cast + %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16hi_reg_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; 
GFX9-NEXT: global_load_short_d16_hi v1, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16hi_reg_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_short_d16_hi v1, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep0.cast + %build = insertelement <2 x i16> %reg, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_short_d16_hi v1, v[2:3], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_short_d16_hi v1, v[2:3], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)* + %load = load i16, i16 addrspace(1)* %gep1.cast + %build = insertelement <2 x i16> %reg, i16 %load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_ubyte_d16_hi v1, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte_d16_hi v1, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %gep0.cast + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 + %cast = 
bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_ubyte_d16_hi v1, v[2:3], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_ubyte_d16_hi v1, v[2:3], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %gep1.cast + %zext.load = zext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %zext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_sbyte_d16_hi v1, v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_sbyte_d16_hi v1, v[2:3], off +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %gep0.cast + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { +; GFX9-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX9: ; %bb.0: +; GFX9-NEXT: v_mov_b32_e32 v3, s3 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_sbyte_d16_hi v1, v[2:3], off offset:-128 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: ; return to shader part epilog +; +; GFX10-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GFX10-NEXT: ; implicit-def: $vcc_hi +; 
GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 +; GFX10-NEXT: global_load_sbyte_d16_hi v1, v[2:3], off offset:-128 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v1 +; GFX10-NEXT: ; return to shader part epilog + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 + %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* + %load = load i8, i8 addrspace(1)* %gep1.cast + %sext.load = sext i8 %load to i16 + %build = insertelement <2 x i16> %reg, i16 %sext.load, i32 1 + %cast = bitcast <2 x i16> %build to <2 x half> + ret <2 x half> %cast +} + +!0 = !{i32 0, i32 1073741824} ; (1 << 30) +!1 = !{i32 0, i32 1073741825} ; (1 << 30) + 1 Index: llvm/test/CodeGen/AMDGPU/global-saddr-store.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/global-saddr-store.ll @@ -0,0 +1,1507 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s + +; Test using saddr addressing mode of global_*store_* flat instructions. + +define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr, i8 %data) { +; GFX9-LABEL: global_store_saddr_i8_zext_vgpr: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_store_byte v[0:1], v2, off +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_store_saddr_i8_zext_vgpr: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_store_byte v[0:1], v2, off +; GFX10-NEXT: s_endpgm + %voffset = load i32, i32 addrspace(1)* %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset + store i8 %data, i8 addrspace(1)* %gep0 + ret void +} + +; Maximum positive offset on gfx10 +define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr, i8 %data) { +; GFX9-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047: +; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_store_byte v[0:1], v2, off offset:2047 +; GFX9-NEXT: s_endpgm +; +; GFX10-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047: +; GFX10: ; %bb.0: +; GFX10-NEXT: global_load_dword v0, v[0:1], off +; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: global_store_byte v[0:1], v2, off offset:2047 +; GFX10-NEXT: s_endpgm + %voffset = load i32, i32 addrspace(1)* %voffset.ptr + %zext.offset = zext i32 %voffset to i64 + %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 
%zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
+ store i8 %data, i8 addrspace(1)* %gep1
+ ret void
+}
+
+; Maximum negative offset on gfx10
+define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr, i8 %data) {
+; GFX9-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: global_load_dword v0, v[0:1], off
+; GFX9-NEXT: v_mov_b32_e32 v1, s3
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT: global_store_byte v[0:1], v2, off offset:-2048
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: global_load_dword v0, v[0:1], off
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
+; GFX10-NEXT: global_store_byte v[0:1], v2, off offset:-2048
+; GFX10-NEXT: s_endpgm
+ %voffset = load i32, i32 addrspace(1)* %voffset.ptr
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
+ store i8 %data, i8 addrspace(1)* %gep1
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; Uniformity edge cases
+; --------------------------------------------------------------------------------
+
+@ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef
+
+; Base pointer is uniform, but also in VGPRs
+define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) {
+; GFX9-LABEL: global_store_saddr_uniform_ptr_in_vgprs:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: ds_read_b64 v[2:3], v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_byte v[2:3], v1, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_uniform_ptr_in_vgprs:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ds_read_b64 v[2:3], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: global_store_byte v[2:3], v1, off
+; GFX10-NEXT: s_endpgm
+ %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ store i8 %data, i8 addrspace(1)* %gep0
+ ret void
+}
+
+; Base pointer is uniform, but also in VGPRs, with imm offset
+define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset, i8 %data) {
+; GFX9-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: ds_read_b64 v[2:3], v2
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_byte v[2:3], v1, off offset:-120
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: ds_read_b64 v[2:3], v2
+; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT: global_store_byte v[2:3], v1, off offset:-120
+; GFX10-NEXT: s_endpgm
+ %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -120
+ store i8 %data, i8 addrspace(1)* %gep1
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; Stress various type stores
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps void @global_store_saddr_i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i16 %data) {
+; GFX9-LABEL: global_store_saddr_i16_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_short v[2:3], v1, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i16_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_short v[2:3], v1, off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
+ store i16 %data, i16 addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i16 %data) {
+; GFX9-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_short v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_short v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
+ store i16 %data, i16 addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, half %data) {
+; GFX9-LABEL: global_store_saddr_f16_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_short v[2:3], v1, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_f16_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_short v[2:3], v1, off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)*
+ store half %data, half addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, half %data) {
+; GFX9-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_short v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_short v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to half addrspace(1)*
+ store half %data, half addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_store_saddr_i32_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i32_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dword v[2:3], v1, off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ store i32 %data, i32 addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_dword v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dword v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ store i32 %data, i32 addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) {
+; GFX9-LABEL: global_store_saddr_f32_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_f32_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dword v[2:3], v1, off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)*
+ store float %data, float addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) {
+; GFX9-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_dword v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dword v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to float addrspace(1)*
+ store float %data, float addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_p3_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(3)* %data) {
+; GFX9-LABEL: global_store_saddr_p3_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_p3_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dword v[2:3], v1, off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)*
+ store i8 addrspace(3)* %data, i8 addrspace(3)* addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_p3_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(3)* %data) {
+; GFX9-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_dword v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dword v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(3)* addrspace(1)*
+ store i8 addrspace(3)* %data, i8 addrspace(3)* addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_i64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_store_saddr_i64_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i64_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ store i64 %data, i64 addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_i64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ store i64 %data, i64 addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_f64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, double %data) {
+; GFX9-LABEL: global_store_saddr_f64_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_f64_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)*
+ store double %data, double addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_f64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, double %data) {
+; GFX9-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to double addrspace(1)*
+ store double %data, double addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i32> %data) {
+; GFX9-LABEL: global_store_saddr_v2i32_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v2i32_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)*
+ store <2 x i32> %data, <2 x i32> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i32> %data) {
+; GFX9-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i32> addrspace(1)*
+ store <2 x i32> %data, <2 x i32> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x float> %data) {
+; GFX9-LABEL: global_store_saddr_v2f32_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v2f32_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)*
+ store <2 x float> %data, <2 x float> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x float> %data) {
+; GFX9-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x float> addrspace(1)*
+ store <2 x float> %data, <2 x float> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i16> %data) {
+; GFX9-LABEL: global_store_saddr_v4i16_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v4i16_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)*
+ store <4 x i16> %data, <4 x i16> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i16> %data) {
+; GFX9-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i16> addrspace(1)*
+ store <4 x i16> %data, <4 x i16> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x half> %data) {
+; GFX9-LABEL: global_store_saddr_v4f16_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v4f16_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)*
+ store <4 x half> %data, <4 x half> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x half> %data) {
+; GFX9-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x half> addrspace(1)*
+ store <4 x half> %data, <4 x half> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_p1_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(1)* %data) {
+; GFX9-LABEL: global_store_saddr_p1_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_p1_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)*
+ store i8 addrspace(1)* %data, i8 addrspace(1)* addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_p1_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(1)* %data) {
+; GFX9-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i8 addrspace(1)* addrspace(1)*
+ store i8 addrspace(1)* %data, i8 addrspace(1)* addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x i32> %data) {
+; GFX9-LABEL: global_store_saddr_v3i32_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v3i32_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)*
+ store <3 x i32> %data, <3 x i32> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x i32> %data) {
+; GFX9-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)*
+ store <3 x i32> %data, <3 x i32> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x float> %data) {
+; GFX9-LABEL: global_store_saddr_v3f32_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v3f32_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)*
+ store <3 x float> %data, <3 x float> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x float> %data) {
+; GFX9-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x float> addrspace(1)*
+ store <3 x float> %data, <3 x float> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x i16> %data) {
+; GFX9-LABEL: global_store_saddr_v6i16_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v6i16_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x i16> addrspace(1)*
+ store <6 x i16> %data, <6 x i16> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x i16> %data) {
+; GFX9-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x i16> addrspace(1)*
+ store <6 x i16> %data, <6 x i16> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x half> %data) {
+; GFX9-LABEL: global_store_saddr_v6f16_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v6f16_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)*
+ store <6 x half> %data, <6 x half> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x half> %data) {
+; GFX9-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v5, s3
+; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <6 x half> addrspace(1)*
+ store <6 x half> %data, <6 x half> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i32> %data) {
+; GFX9-LABEL: global_store_saddr_v4i32_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v4i32_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)*
+ store <4 x i32> %data, <4 x i32> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i32> %data) {
+; GFX9-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i32> addrspace(1)*
+ store <4 x i32> %data, <4 x i32> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x float> %data) {
+; GFX9-LABEL: global_store_saddr_v4f32_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v4f32_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)*
+ store <4 x float> %data, <4 x float> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x float> %data) {
+; GFX9-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x float> addrspace(1)*
+ store <4 x float> %data, <4 x float> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i64> %data) {
+; GFX9-LABEL: global_store_saddr_v2i64_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v2i64_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)*
+ store <2 x i64> %data, <2 x i64> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i64> %data) {
+; GFX9-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i64> addrspace(1)*
+ store <2 x i64> %data, <2 x i64> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x double> %data) {
+; GFX9-LABEL: global_store_saddr_v2f64_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v2f64_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x double> addrspace(1)*
+ store <2 x double> %data, <2 x double> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x double> %data) {
+; GFX9-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x double> addrspace(1)*
+ store <2 x double> %data, <2 x double> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x i16> %data) {
+; GFX9-LABEL: global_store_saddr_v8i16_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v8i16_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <8 x i16> addrspace(1)*
+ store <8 x i16> %data, <8 x i16> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x i16> %data) {
+; GFX9-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <8 x i16> addrspace(1)*
+ store <8 x i16> %data, <8 x i16> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x half> %data) {
+; GFX9-LABEL: global_store_saddr_v8f16_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v8f16_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <8 x half> addrspace(1)*
+ store <8 x half> %data, <8 x half> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x half> %data) {
+; GFX9-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <8 x half> addrspace(1)*
+ store <8 x half> %data, <8 x half> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i8 addrspace(1)*> %data) {
+; GFX9-LABEL: global_store_saddr_v2p1_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v2p1_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)*
+ store <2 x i8 addrspace(1)*> %data, <2 x i8 addrspace(1)*> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i8 addrspace(1)*> %data) {
+; GFX9-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <2 x i8 addrspace(1)*> addrspace(1)*
+ store <2 x i8 addrspace(1)*> %data, <2 x i8 addrspace(1)*> addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i8 addrspace(3)*> %data) {
+; GFX9-LABEL: global_store_saddr_v4p3_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v4p3_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)*
+ store <4 x i8 addrspace(3)*> %data, <4 x i8 addrspace(3)*> addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i8 addrspace(3)*> %data) {
+; GFX9-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v6, s3
+; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0
+; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <4 x i8 addrspace(3)*> addrspace(1)*
+ store <4 x i8 addrspace(3)*> %data, <4 x i8 addrspace(3)*> addrspace(1)* %gep1.cast
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; Atomic store
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: atomic_global_store_saddr_i32_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: atomic_global_store_saddr_i32_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_store_dword v[2:3], v1, off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
+ store atomic i32 %data, i32 addrspace(1)* %gep0.cast seq_cst, align 4
+ ret void
+}
+
+define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
+; GFX9-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v2, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, 0xffffff80, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_store_dword v[2:3], v1, off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
+ store atomic i32 %data, i32 addrspace(1)* %gep1.cast seq_cst, align 4
+ ret void
+}
+
+define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: atomic_global_store_saddr_i64_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v4, s3
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: atomic_global_store_saddr_i64_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
+ store atomic i64 %data, i64 addrspace(1)* %gep0.cast seq_cst, align 8
+ ret void
+}
+
+define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
+; GFX9-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v3, vcc
+; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0xffffff80, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v4, vcc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
+; GFX10-NEXT: v_add_co_u32_e64 v3, vcc_lo, 0xffffff80, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v4, vcc_lo
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
+ store atomic i64 %data, i64 addrspace(1)* %gep1.cast seq_cst, align 8
+ ret void
+}
+
+; --------------------------------------------------------------------------------
+; D16 HI store (hi 16)
+; --------------------------------------------------------------------------------
+
+define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
+; GFX9-LABEL: global_store_saddr_i16_d16hi_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_short_d16_hi v[2:3], v1, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i16_d16hi_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_short_d16_hi v[2:3], v1, off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
+ %data.hi = extractelement <2 x i16> %data, i32 1
+ store i16 %data.hi, i16 addrspace(1)* %gep0.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
+; GFX9-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_short_d16_hi v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_short_d16_hi v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %gep1.cast = bitcast i8 addrspace(1)* %gep1 to i16 addrspace(1)*
+ %data.hi = extractelement <2 x i16> %data, i32 1
+ store i16 %data.hi, i16 addrspace(1)* %gep1.cast
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
+; GFX9-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_byte_d16_hi v[2:3], v1, off
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_byte_d16_hi v[2:3], v1, off
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %data.hi = extractelement <2 x i16> %data, i32 1
+ %data.hi.trunc = trunc i16 %data.hi to i8
+ store i8 %data.hi.trunc, i8 addrspace(1)* %gep0
+ ret void
+}
+
+define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
+; GFX9-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mov_b32_e32 v3, s3
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX9-NEXT: global_store_byte_d16_hi v[2:3], v1, off offset:-128
+; GFX9-NEXT: s_endpgm
+;
+; GFX10-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
+; GFX10-NEXT: ; implicit-def: $vcc_hi
+; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
+; GFX10-NEXT: global_store_byte_d16_hi v[2:3], v1, off offset:-128
+; GFX10-NEXT: s_endpgm
+ %zext.offset = zext i32 %voffset to i64
+ %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
+ %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
+ %data.hi = extractelement <2 x i16> %data, i32 1
+ %data.hi.trunc = trunc i16 %data.hi to i8
+ store i8 %data.hi.trunc, i8 addrspace(1)* %gep1
+ ret void
+}