Index: llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3488,7 +3488,7 @@
 std::pair<Register, int64_t>
 AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
   Register Root, const MachineRegisterInfo &MRI) const {
-  MachineInstr *RootI = MRI.getVRegDef(Root);
+  MachineInstr *RootI = getDefIgnoringCopies(Root, MRI);
   if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
     return {Root, 0};
 
@@ -3679,6 +3679,11 @@
 bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
   MachineOperand &Root, Register &RSrcReg, Register &SOffset,
   int64_t &Offset) const {
+
+  // FIXME: Pattern should not reach here.
+  if (STI.useFlatForGlobal())
+    return false;
+
   MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
   if (shouldUseAddr64(AddrData))
     return false;
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -9,19 +9,18 @@
 ; CHECK-LABEL: use_lds_globals:
 ; CHECK: ; %bb.0: ; %entry
 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; CHECK-NEXT: s_add_u32 s2, 4, 4
-; CHECK-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-NEXT: v_mov_b32_e32 v0, 4
 ; CHECK-NEXT: s_mov_b32 m0, -1
-; CHECK-NEXT: ds_read_b32 v2, v0
+; CHECK-NEXT: ds_read_b32 v3, v0 offset:4
+; CHECK-NEXT: v_mov_b32_e32 v2, 9
 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT: s_add_u32 s0, s0, 4
 ; CHECK-NEXT: s_addc_u32 s1, s1, 0
 ; CHECK-NEXT: v_mov_b32_e32 v0, s0
 ; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: flat_store_dword v[0:1], v2
-; CHECK-NEXT: v_mov_b32_e32 v0, 9
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: ds_write_b32 v1, v0
+; CHECK-NEXT: flat_store_dword v[0:1], v3
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
+; CHECK-NEXT: ds_write_b32 v0, v2
 ; CHECK-NEXT: s_endpgm
 entry:
   %tmp0 = getelementptr [128 x i32], [128 x i32] addrspace(3)* @lds_512_4, i32 0, i32 1
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll
@@ -72,9 +72,8 @@
 ; CI-NEXT: v_mov_b32_e32 v0, 42
 ; CI-NEXT: s_mov_b32 m0, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 16
 ; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
+; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT: v_mov_b32_e32 v0, s0
 ; CI-NEXT: v_mov_b32_e32 v1, s1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -88,9 +87,8 @@
 ; VI-NEXT: v_mov_b32_e32 v0, 42
 ; VI-NEXT: s_mov_b32 m0, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 16
 ; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
+; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -164,9 +162,8 @@
 ; CI-NEXT: v_mov_b32_e32 v0, 42
 ; CI-NEXT: s_mov_b32 m0, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 16
 ; CI-NEXT: v_mov_b32_e32 v1, s0
-; CI-NEXT: ds_dec_rtn_u32 v0, v1, v0
+; CI-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16
 ; CI-NEXT: s_endpgm
 ;
 ; VI-LABEL: lds_atomic_dec_noret_i32_offset:
@@ -175,9 +172,8 @@
 ; VI-NEXT: v_mov_b32_e32 v0, 42
 ; VI-NEXT: s_mov_b32 m0, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
 ; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: ds_dec_rtn_u32 v0, v1, v0
+; VI-NEXT: ds_dec_rtn_u32 v0, v1, v0 offset:16
 ; VI-NEXT: s_endpgm
 ; GFX9-LABEL: lds_atomic_dec_noret_i32_offset:
 ; GFX9: ; %bb.0:
@@ -1256,9 +1252,8 @@
 ; CI-NEXT: v_mov_b32_e32 v1, 0
 ; CI-NEXT: s_mov_b32 m0, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 32
 ; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT: v_mov_b32_e32 v3, s1
 ; CI-NEXT: v_mov_b32_e32 v2, s0
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,9 +1268,8 @@
 ; VI-NEXT: v_mov_b32_e32 v1, 0
 ; VI-NEXT: s_mov_b32 m0, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 32
 ; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT: v_mov_b32_e32 v3, s1
 ; VI-NEXT: v_mov_b32_e32 v2, s0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -1345,9 +1339,8 @@
 ; CI-NEXT: v_mov_b32_e32 v1, 0
 ; CI-NEXT: s_mov_b32 m0, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 32
 ; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT: s_endpgm
 ;
 ; VI-LABEL: lds_atomic_dec_noret_i64_offset:
@@ -1357,9 +1350,8 @@
 ; VI-NEXT: v_mov_b32_e32 v1, 0
 ; VI-NEXT: s_mov_b32 m0, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 32
 ; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT: s_endpgm
 ; GFX9-LABEL: lds_atomic_dec_noret_i64_offset:
 ; GFX9: ; %bb.0:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll
@@ -76,9 +76,8 @@
 ; CI-NEXT: v_mov_b32_e32 v0, 42
 ; CI-NEXT: s_mov_b32 m0, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 16
 ; CI-NEXT: v_mov_b32_e32 v1, s2
-; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
+; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; CI-NEXT: v_mov_b32_e32 v0, s0
 ; CI-NEXT: v_mov_b32_e32 v1, s1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -92,9 +91,8 @@
 ; VI-NEXT: v_mov_b32_e32 v0, 42
 ; VI-NEXT: s_mov_b32 m0, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 16
 ; VI-NEXT: v_mov_b32_e32 v1, s2
-; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
+; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; VI-NEXT: v_mov_b32_e32 v0, s0
 ; VI-NEXT: v_mov_b32_e32 v1, s1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -105,11 +103,10 @@
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x8
-; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s2, 16
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -161,9 +158,8 @@
 ; CI-NEXT: v_mov_b32_e32 v0, 42
 ; CI-NEXT: s_mov_b32 m0, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 16
 ; CI-NEXT: v_mov_b32_e32 v1, s0
-; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0
+; CI-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
 ; CI-NEXT: s_endpgm
 ;
 ; VI-LABEL: lds_atomic_inc_noret_i32_offset:
@@ -172,19 +168,17 @@
 ; VI-NEXT: v_mov_b32_e32 v0, 42
 ; VI-NEXT: s_mov_b32 m0, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 16
 ; VI-NEXT: v_mov_b32_e32 v1, s0
-; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0
+; VI-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
 ; VI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: lds_atomic_inc_noret_i32_offset:
 ; GFX9: ; %bb.0:
 ; GFX9-NEXT: s_load_dword s0, s[4:5], 0x0
-; GFX9-NEXT: v_mov_b32_e32 v1, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 16
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: ds_inc_rtn_u32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: ds_inc_rtn_u32 v0, v1, v0 offset:16
 ; GFX9-NEXT: s_endpgm
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
@@ -622,9 +616,8 @@
 ; CI-NEXT: v_mov_b32_e32 v1, 0
 ; CI-NEXT: s_mov_b32 m0, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s2, s2, 32
 ; CI-NEXT: v_mov_b32_e32 v2, s2
-; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT: v_mov_b32_e32 v3, s1
 ; CI-NEXT: v_mov_b32_e32 v2, s0
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -639,9 +632,8 @@
 ; VI-NEXT: v_mov_b32_e32 v1, 0
 ; VI-NEXT: s_mov_b32 m0, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s2, s2, 32
 ; VI-NEXT: v_mov_b32_e32 v2, s2
-; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT: v_mov_b32_e32 v3, s1
 ; VI-NEXT: v_mov_b32_e32 v2, s0
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -655,9 +647,8 @@
 ; GFX9-NEXT: v_mov_b32_e32 v0, 42
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s2, s2, 32
 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
-; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -713,9 +704,8 @@
 ; CI-NEXT: v_mov_b32_e32 v1, 0
 ; CI-NEXT: s_mov_b32 m0, -1
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_add_u32 s0, s0, 32
 ; CI-NEXT: v_mov_b32_e32 v2, s0
-; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; CI-NEXT: s_endpgm
 ;
 ; VI-LABEL: lds_atomic_inc_noret_i64_offset:
@@ -725,9 +715,8 @@
 ; VI-NEXT: v_mov_b32_e32 v1, 0
 ; VI-NEXT: s_mov_b32 m0, -1
 ; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_add_u32 s0, s0, 32
 ; VI-NEXT: v_mov_b32_e32 v2, s0
-; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; VI-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: lds_atomic_inc_noret_i64_offset:
@@ -736,9 +725,8 @@
 ; GFX9-NEXT: v_mov_b32_e32 v0, 42
 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: s_add_u32 s0, s0, 32
 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
-; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
+; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
 ; GFX9-NEXT: s_endpgm
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fadd.ll
@@ -26,20 +26,18 @@
 define amdgpu_ps float @ds_fadd_f32_ss_offset(float addrspace(3)* inreg %ptr, float inreg %val) {
 ; GFX8-LABEL: ds_fadd_f32_ss_offset:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
 ; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: ds_fadd_f32_ss_offset:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
   %gep = getelementptr float, float addrspace(3)* %ptr, i32 128
@@ -69,19 +67,17 @@
 define amdgpu_ps void @ds_fadd_f32_ss_offset_nortn(float addrspace(3)* inreg %ptr, float inreg %val) {
 ; GFX8-LABEL: ds_fadd_f32_ss_offset_nortn:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
 ; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_add_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: ds_fadd_f32_ss_offset_nortn:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_add_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_add_rtn_f32 v0, v1, v0 offset:512
 ; GFX9-NEXT: s_endpgm
   %gep = getelementptr float, float addrspace(3)* %ptr, i32 128
   %unused = call float @llvm.amdgcn.ds.fadd(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmax.ll
@@ -53,20 +53,18 @@
 define amdgpu_ps float @ds_fmax_f32_ss_offset(float addrspace(3)* inreg %ptr, float inreg %val) {
 ; GFX8-LABEL: ds_fmax_f32_ss_offset:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
 ; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: ds_fmax_f32_ss_offset:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
   ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_offset
@@ -74,12 +72,10 @@
   ; GFX8-MIR: liveins: $sgpr2, $sgpr3
   ; GFX8-MIR: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8-MIR: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX8-MIR: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 512
-  ; GFX8-MIR: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
-  ; GFX8-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
-  ; GFX8-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+  ; GFX8-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
   ; GFX8-MIR: $m0 = S_MOV_B32 -1
-  ; GFX8-MIR: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY2]], [[COPY3]], 0, 0, implicit $m0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
+  ; GFX8-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX8-MIR: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY3]], [[COPY2]], 512, 0, implicit $m0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
   ; GFX8-MIR: $vgpr0 = COPY [[DS_MAX_RTN_F32_]]
   ; GFX8-MIR: SI_RETURN_TO_EPILOG implicit $vgpr0
   ; GFX9-MIR-LABEL: name: ds_fmax_f32_ss_offset
@@ -87,11 +83,9 @@
   ; GFX9-MIR: liveins: $sgpr2, $sgpr3
   ; GFX9-MIR: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX9-MIR: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX9-MIR: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 512
-  ; GFX9-MIR: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
-  ; GFX9-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
-  ; GFX9-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; GFX9-MIR: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY2]], [[COPY3]], 0, 0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
+  ; GFX9-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+  ; GFX9-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX9-MIR: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY3]], [[COPY2]], 512, 0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
   ; GFX9-MIR: $vgpr0 = COPY [[DS_MAX_RTN_F32_gfx9_]]
   ; GFX9-MIR: SI_RETURN_TO_EPILOG implicit $vgpr0
   %gep = getelementptr float, float addrspace(3)* %ptr, i32 128
@@ -140,42 +134,36 @@
 define amdgpu_ps void @ds_fmax_f32_ss_offset_nortn(float addrspace(3)* inreg %ptr, float inreg %val) {
 ; GFX8-LABEL: ds_fmax_f32_ss_offset_nortn:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
 ; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_max_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: ds_fmax_f32_ss_offset_nortn:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_max_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_max_rtn_f32 v0, v1, v0 offset:512
 ; GFX9-NEXT: s_endpgm
   ; GFX8-MIR-LABEL: name: ds_fmax_f32_ss_offset_nortn
   ; GFX8-MIR: bb.1 (%ir-block.0):
   ; GFX8-MIR: liveins: $sgpr2, $sgpr3
   ; GFX8-MIR: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX8-MIR: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX8-MIR: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 512
-  ; GFX8-MIR: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
-  ; GFX8-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
-  ; GFX8-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+  ; GFX8-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
   ; GFX8-MIR: $m0 = S_MOV_B32 -1
-  ; GFX8-MIR: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY2]], [[COPY3]], 0, 0, implicit $m0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
+  ; GFX8-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX8-MIR: [[DS_MAX_RTN_F32_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32 [[COPY3]], [[COPY2]], 512, 0, implicit $m0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
   ; GFX8-MIR: S_ENDPGM 0
   ; GFX9-MIR-LABEL: name: ds_fmax_f32_ss_offset_nortn
   ; GFX9-MIR: bb.1 (%ir-block.0):
   ; GFX9-MIR: liveins: $sgpr2, $sgpr3
   ; GFX9-MIR: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr2
   ; GFX9-MIR: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr3
-  ; GFX9-MIR: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 512
-  ; GFX9-MIR: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc
-  ; GFX9-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[S_ADD_U32_]]
-  ; GFX9-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
-  ; GFX9-MIR: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY2]], [[COPY3]], 0, 0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
+  ; GFX9-MIR: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+  ; GFX9-MIR: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+  ; GFX9-MIR: [[DS_MAX_RTN_F32_gfx9_:%[0-9]+]]:vgpr_32 = DS_MAX_RTN_F32_gfx9 [[COPY3]], [[COPY2]], 512, 0, implicit $exec :: (load store 4 on %ir.gep, addrspace 3)
   ; GFX9-MIR: S_ENDPGM 0
   %gep = getelementptr float, float addrspace(3)* %ptr, i32 128
   %unused = call float @llvm.amdgcn.ds.fmax(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false)
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.ds.fmin.ll
@@ -26,20 +26,18 @@
 define amdgpu_ps float @ds_fmin_f32_ss_offset(float addrspace(3)* inreg %ptr, float inreg %val) {
 ; GFX8-LABEL: ds_fmin_f32_ss_offset:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
 ; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512
 ; GFX8-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT: ; return to shader part epilog
 ;
 ; GFX9-LABEL: ds_fmin_f32_ss_offset:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: ; return to shader part epilog
   %gep = getelementptr float, float addrspace(3)* %ptr, i32 128
@@ -69,19 +67,17 @@
 define amdgpu_ps void @ds_fmin_f32_ss_offset_nortn(float addrspace(3)* inreg %ptr, float inreg %val) {
 ; GFX8-LABEL: ds_fmin_f32_ss_offset_nortn:
 ; GFX8: ; %bb.0:
-; GFX8-NEXT: s_add_u32 s0, s2, 0x200
-; GFX8-NEXT: v_mov_b32_e32 v0, s0
-; GFX8-NEXT: v_mov_b32_e32 v1, s3
+; GFX8-NEXT: v_mov_b32_e32 v0, s3
+; GFX8-NEXT: v_mov_b32_e32 v1, s2
 ; GFX8-NEXT: s_mov_b32 m0, -1
-; GFX8-NEXT: ds_min_rtn_f32 v0, v0, v1
+; GFX8-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512
 ; GFX8-NEXT: s_endpgm
 ;
 ; GFX9-LABEL: ds_fmin_f32_ss_offset_nortn:
 ; GFX9: ; %bb.0:
-; GFX9-NEXT: s_add_u32 s0, s2, 0x200
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: ds_min_rtn_f32 v0, v0, v1
+; GFX9-NEXT: v_mov_b32_e32 v0, s3
+; GFX9-NEXT: v_mov_b32_e32 v1, s2
+; GFX9-NEXT: ds_min_rtn_f32 v0, v1, v0 offset:512
 ; GFX9-NEXT: s_endpgm
   %gep = getelementptr float, float addrspace(3)* %ptr, i32 128
   %unused = call float @llvm.amdgcn.ds.fmin(float addrspace(3)* %gep, float %val, i32 0, i32 0, i1 false)
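For reference, a minimal LLVM IR sketch of the pattern this change targets: a constant getelementptr on an LDS (addrspace(3)) pointer whose base only reaches the selector through a COPY. By walking through copies with getDefIgnoringCopies, the selector still finds the G_PTR_ADD and can fold the constant into the DS instruction's immediate offset field, as the updated checks above show (offset:4, offset:16, offset:32, offset:512), instead of materializing it with a separate s_add_u32. This reproducer is hypothetical and not part of the patch; the function name, RUN line, and expected assembly are illustrative only.

; Hypothetical reduced test (illustration, not included in this patch).
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs < %s
define amdgpu_kernel void @fold_lds_offset(i32 addrspace(3)* %ptr) {
  ; 4 x i32 = 16-byte constant offset from the incoming LDS pointer.
  %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
  ; Expected after this patch: a single ds_write_b32 with offset:16 rather than
  ; an s_add_u32 of the base followed by a ds_write_b32 with a zero offset.
  store i32 9, i32 addrspace(3)* %gep
  ret void
}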