diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -178,6 +178,10 @@ bool matchPtrAddImmedChain(MachineInstr &MI, PtrAddChain &MatchInfo); bool applyPtrAddImmedChain(MachineInstr &MI, PtrAddChain &MatchInfo); + /// Transform a multiply by a power-of-2 value to a left shift. + bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); + bool applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); + /// Try to transform \p MI by using all of the above /// combine functions. Returns true if changed. bool tryCombine(MachineInstr &MI); diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -106,7 +106,6 @@ (match (COPY $d, $s):$mi, [{ return Helper.matchCombineCopy(*${mi}); }]), (apply [{ Helper.applyCombineCopy(*${mi}); }])>; -def trivial_combines : GICombineGroup<[copy_prop]>; def extending_loads : GICombineRule< (defs root:$root, extending_load_matchdata:$matchinfo), @@ -136,5 +135,14 @@ [{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]), (apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>; +def mul_to_shl_matchdata : GIDefMatchData<"unsigned">; +def mul_to_shl : GICombineRule< + (defs root:$d, mul_to_shl_matchdata:$matchinfo), + (match (G_MUL $d, $op1, $op2):$mi, + [{ return Helper.matchCombineMulToShl(*${mi}, ${matchinfo}); }]), + (apply [{ Helper.applyCombineMulToShl(*${mi}, ${matchinfo}); }])>; + +def trivial_combines : GICombineGroup<[copy_prop, mul_to_shl]>; def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain, combines_for_extload, combine_indexed_load_store]>; + diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -17,6 +17,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "gi-combiner" @@ -1385,6 +1386,30 @@ return true; } +bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI, + unsigned &ShiftVal) { + assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL"); + auto MaybeImmVal = + getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + if (!MaybeImmVal || !isPowerOf2_64(MaybeImmVal->Value)) + return false; + ShiftVal = Log2_64(MaybeImmVal->Value); + return true; +} + +bool CombinerHelper::applyCombineMulToShl(MachineInstr &MI, + unsigned &ShiftVal) { + assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL"); + MachineIRBuilder MIB(MI); + LLT ShiftTy = MRI.getType(MI.getOperand(0).getReg()); + auto ShiftCst = MIB.buildConstant(ShiftTy, ShiftVal); + Observer.changingInstr(MI); + MI.setDesc(MIB.getTII().get(TargetOpcode::G_SHL)); + MI.getOperand(2).setReg(ShiftCst.getReg(0)); + Observer.changedInstr(MI); + return true; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-mul-to-shl.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-mul-to-shl.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-mul-to-shl.mir @@ -0,0 +1,98 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs -mtriple aarch64-unknown-unknown %s -o - | FileCheck %s +... +--- +name: mul_to_shl +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: mul_to_shl + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2 + ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) + ; CHECK: $x0 = COPY [[SHL]](s64) + ; CHECK: RET_ReallyLR implicit-def $x0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 4 + %2:_(s64) = G_MUL %0, %1(s64) + $x0 = COPY %2(s64) + RET_ReallyLR implicit-def $x0 +... +--- +name: mul_to_shl_16 +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: mul_to_shl_16 + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 + ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s64) + ; CHECK: $x0 = COPY [[SHL]](s64) + ; CHECK: RET_ReallyLR implicit-def $x0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 16 + %2:_(s64) = G_MUL %0, %1(s64) + $x0 = COPY %2(s64) + RET_ReallyLR implicit-def $x0 +... +--- +name: mul_to_shl_vector_16 +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $q0 + ; Currently not implemented. + ; CHECK-LABEL: name: mul_to_shl_vector_16 + ; CHECK: liveins: $q0 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16 + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32) + ; CHECK: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[COPY]], [[BUILD_VECTOR]] + ; CHECK: $q0 = COPY [[MUL]](<4 x s32>) + ; CHECK: RET_ReallyLR implicit-def $q0 + %0:_(<4 x s32>) = COPY $q0 + %1:_(s32) = G_CONSTANT i32 16 + %2:_(<4 x s32>) = G_BUILD_VECTOR %1(s32), %1(s32), %1(s32), %1(s32) + %3:_(<4 x s32>) = G_MUL %0, %2(<4 x s32>) + $q0 = COPY %3(<4 x s32>) + RET_ReallyLR implicit-def $q0 +... +--- +name: mul_to_shl_non_pow2 +alignment: 4 +tracksRegLiveness: true +frameInfo: + maxAlignment: 1 +machineFunctionInfo: {} +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: mul_to_shl_non_pow2 + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 10 + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[COPY]], [[C]] + ; CHECK: $x0 = COPY [[MUL]](s64) + ; CHECK: RET_ReallyLR implicit-def $x0 + %0:_(s64) = COPY $x0 + %1:_(s64) = G_CONSTANT i64 10 + %2:_(s64) = G_MUL %0, %1(s64) + $x0 = COPY %2(s64) + RET_ReallyLR implicit-def $x0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-relocs.ll @@ -5,8 +5,8 @@ @lds.defined = unnamed_addr addrspace(3) global [8 x i32] undef, align 8 ; GCN-LABEL: {{^}}test_basic: -; GCN: s_add_u32 s0, lds.defined@abs32@lo, s2 ; encoding: [0xff,0x02,0x00,0x80,A,A,A,A] -; GCN: v_add_u32_e32 v0, lds.external@abs32@lo, v0 ; encoding: [0xff,0x00,0x00,0x68,A,A,A,A] +; GCN: s_add_u32 s0, lds.defined@abs32@lo, s0 ; encoding: [0xff,0x00,0x00,0x80,A,A,A,A] +; GCN: v_mov_b32_e32 v2, s0 ; encoding: [0x00,0x02,0x04,0x7e] ; GCN: .globl lds.external ; GCN: .amdgpu_lds lds.external, 0, 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -368,52 +368,42 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; CI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 4 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 4 -; CI-NEXT: v_mul_lo_u32 v4, v0, 4 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v2, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 4 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 4 -; VI-NEXT: v_mul_lo_u32 v4, v0, 4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v2, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: @@ -451,19 +441,14 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { ; CI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 4 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 4 -; CI-NEXT: v_mul_lo_u32 v0, v0, 4 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc @@ -471,19 +456,14 @@ ; ; VI-LABEL: global_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 4 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 4 -; VI-NEXT: v_mul_lo_u32 v0, v0, 4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc @@ -689,52 +669,42 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32* %out, i32* %ptr) #0 { ; CI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 4 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 4 -; CI-NEXT: v_mul_lo_u32 v4, v0, 4 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v2, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 4 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 4 -; VI-NEXT: v_mul_lo_u32 v4, v0, 4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: flat_atomic_dec v0, v[0:1], v4 glc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v2, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: flat_atomic_dec v2, v[2:3], v4 glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_addr64: ; GFX9: ; %bb.0: @@ -772,19 +742,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32* %ptr) #0 { ; CI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 4 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 4 -; CI-NEXT: v_mul_lo_u32 v0, v0, 4 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc @@ -792,19 +757,14 @@ ; ; VI-LABEL: flat_atomic_dec_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 4 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 4 -; VI-NEXT: v_mul_lo_u32 v0, v0, 4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc @@ -1022,54 +982,44 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 { ; CI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 8 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 8 -; CI-NEXT: v_mul_lo_u32 v4, v0, 8 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: v_mov_b32_e32 v5, 0 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[4:5] glc +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_add_i32_e32 v5, vcc, s2, v0 +; CI-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v5 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 8 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 8 -; VI-NEXT: v_mul_lo_u32 v4, v0, 8 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[4:5] glc +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v5 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: @@ -1108,43 +1058,33 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64* %ptr) #0 { ; CI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 8 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 8 -; CI-NEXT: v_mul_lo_u32 v0, v0, 8 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 8 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 8 -; VI-NEXT: v_mul_lo_u32 v0, v0, 8 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_addr64: @@ -1179,40 +1119,40 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_dec_shl_base_lds_0: ; CI: ; %bb.0: -; CI-NEXT: v_mul_lo_u32 v5, v0, 4 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v6, 9 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 8, v0 +; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: v_add_i32_e32 v5, vcc, 0, v5 -; CI-NEXT: v_add_i32_e32 v5, vcc, 8, v5 -; CI-NEXT: ds_dec_rtn_u32 v5, v5, v6 -; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 +; CI-NEXT: ds_dec_rtn_u32 v3, v0, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_store_dword v[0:1], v4 -; CI-NEXT: flat_store_dword v[2:3], v5 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_shl_base_lds_0: ; VI: ; %bb.0: -; VI-NEXT: v_mul_lo_u32 v5, v0, 4 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0, v5 -; VI-NEXT: v_add_u32_e32 v5, vcc, 8, v5 -; VI-NEXT: ds_dec_rtn_u32 v5, v5, v6 -; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: ds_dec_rtn_u32 v3, v0, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dword v[0:1], v4 -; VI-NEXT: flat_store_dword v[2:3], v5 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; GFX9-LABEL: atomic_dec_shl_base_lds_0: ; GFX9: ; %bb.0: @@ -1607,54 +1547,44 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { ; CI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 8 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 8 -; CI-NEXT: v_mul_lo_u32 v4, v0, 8 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: v_mov_b32_e32 v5, 0 -; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[4:5] glc +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_add_i32_e32 v5, vcc, s2, v0 +; CI-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v5 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 8 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 8 -; VI-NEXT: v_mul_lo_u32 v4, v0, 8 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[4:5] glc +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v5 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_ret_i64_offset_addr64: ; GFX9: ; %bb.0: @@ -1693,43 +1623,33 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { ; CI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 8 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 8 -; CI-NEXT: v_mul_lo_u32 v0, v0, 8 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_dec_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 8 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 8 -; VI-NEXT: v_mul_lo_u32 v0, v0, 8 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_endpgm ; GFX9-LABEL: global_atomic_dec_noret_i64_offset_addr64: @@ -1764,42 +1684,42 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_dec_shl_base_lds_0_i64: ; CI: ; %bb.0: -; CI-NEXT: v_mul_lo_u32 v7, v0, 8 +; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_add_i32_e32 v6, vcc, 2, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; CI-NEXT: v_mov_b32_e32 v0, 9 -; CI-NEXT: v_add_i32_e32 v7, vcc, 0, v7 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_add_i32_e32 v7, vcc, 16, v7 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_dec_rtn_u64 v[0:1], v7, v[0:1] +; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: flat_store_dword v[2:3], v6 -; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; CI-NEXT: flat_store_dword v[2:3], v4 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_dec_shl_base_lds_0_i64: ; VI: ; %bb.0: -; VI-NEXT: v_mul_lo_u32 v7, v0, 8 +; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: v_mov_b32_e32 v0, 9 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0, v7 ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 16, v7 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_dec_rtn_u64 v[0:1], v7, v[0:1] +; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dword v[2:3], v6 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: flat_store_dword v[2:3], v4 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; GFX9-LABEL: atomic_dec_shl_base_lds_0_i64: ; GFX9: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -370,76 +370,62 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 { ; CI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 4 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 4 -; CI-NEXT: v_mul_lo_u32 v4, v0, 4 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v2, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 4 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 4 -; VI-NEXT: v_mul_lo_u32 v4, v0, 4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v2, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 4 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, 0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, 4 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, 4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add3_u32 v3, v1, v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: global_atomic_inc v0, v[0:1], v4, off glc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: global_atomic_inc v2, v[2:3], v4, off glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id @@ -453,19 +439,14 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 { ; CI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 4 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 4 -; CI-NEXT: v_mul_lo_u32 v0, v0, 4 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc @@ -473,19 +454,14 @@ ; ; VI-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 4 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 4 -; VI-NEXT: v_mul_lo_u32 v0, v0, 4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc @@ -493,20 +469,16 @@ ; ; GFX9-LABEL: global_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 4 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, 0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, 4 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, 4 -; GFX9-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: global_atomic_inc v0, v[0:1], v2, off glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -521,58 +493,58 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i32: ; CI: ; %bb.0: -; CI-NEXT: v_mul_lo_u32 v5, v0, 4 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mov_b32_e32 v6, 9 +; CI-NEXT: v_add_i32_e32 v2, vcc, 2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 8, v0 +; CI-NEXT: v_mov_b32_e32 v1, 9 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: v_add_i32_e32 v5, vcc, 0, v5 -; CI-NEXT: v_add_i32_e32 v5, vcc, 8, v5 -; CI-NEXT: ds_inc_rtn_u32 v5, v5, v6 -; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 +; CI-NEXT: ds_inc_rtn_u32 v3, v0, v1 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: v_mov_b32_e32 v2, s0 -; CI-NEXT: flat_store_dword v[0:1], v4 -; CI-NEXT: flat_store_dword v[2:3], v5 +; CI-NEXT: flat_store_dword v[0:1], v2 +; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: flat_store_dword v[0:1], v3 ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i32: ; VI: ; %bb.0: -; VI-NEXT: v_mul_lo_u32 v5, v0, 4 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mov_b32_e32 v6, 9 +; VI-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; VI-NEXT: v_mov_b32_e32 v1, 9 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: v_add_u32_e32 v5, vcc, 0, v5 -; VI-NEXT: v_add_u32_e32 v5, vcc, 8, v5 -; VI-NEXT: ds_inc_rtn_u32 v5, v5, v6 -; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: ds_inc_rtn_u32 v3, v0, v1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s0 -; VI-NEXT: flat_store_dword v[0:1], v4 -; VI-NEXT: flat_store_dword v[2:3], v5 +; VI-NEXT: flat_store_dword v[0:1], v2 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mul_lo_u32 v1, v0, 4 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_add_u32_e32 v3, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 9 -; GFX9-NEXT: v_add_u32_e32 v0, 0, v1 +; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_add_u32_e32 v0, 0, v0 ; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 -; GFX9-NEXT: ds_inc_rtn_u32 v2, v0, v2 +; GFX9-NEXT: v_mov_b32_e32 v1, 9 +; GFX9-NEXT: ds_inc_rtn_u32 v3, v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %idx.0 = add nsw i32 %tid.x, 2 @@ -958,79 +930,65 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 { ; CI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 8 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 8 -; CI-NEXT: v_mul_lo_u32 v4, v0, 8 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: v_mov_b32_e32 v5, 0 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_add_i32_e32 v5, vcc, s2, v0 +; CI-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v5 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 8 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 8 -; VI-NEXT: v_mul_lo_u32 v4, v0, 8 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v5 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 8 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, 0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, 8 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, 8 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add3_u32 v3, v1, v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[4:5], off glc +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 40, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: global_atomic_inc_x2 v[2:3], v[4:5], v[2:3], off glc +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id @@ -1044,63 +1002,49 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 { ; CI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 8 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 8 -; CI-NEXT: v_mul_lo_u32 v0, v0, 8 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 8 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 8 -; VI-NEXT: v_mul_lo_u32 v0, v0, 8 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: global_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 8 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, 0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, 8 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, 8 -; GFX9-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[0:1], v[2:3], off glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1185,76 +1129,62 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32* %out, i32* %ptr) #0 { ; CI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 4 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 4 -; CI-NEXT: v_mul_lo_u32 v4, v0, 4 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v2, s3 +; CI-NEXT: v_add_i32_e32 v3, vcc, s2, v0 +; CI-NEXT: v_addc_u32_e32 v5, vcc, v2, v1, vcc +; CI-NEXT: v_add_i32_e32 v2, vcc, 20, v3 +; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; CI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dword v[2:3], v0 +; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 4 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 4 -; VI-NEXT: v_mul_lo_u32 v4, v0, 4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: flat_atomic_inc v0, v[0:1], v4 glc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s3 +; VI-NEXT: v_add_u32_e32 v3, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v5, vcc, v2, v1, vcc +; VI-NEXT: v_add_u32_e32 v2, vcc, 20, v3 +; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc +; VI-NEXT: flat_atomic_inc v2, v[2:3], v4 glc +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dword v[2:3], v0 +; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 4 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, 0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, 4 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, 4 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add3_u32 v3, v1, v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 glc -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v2, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 20, v3 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v5, vcc +; GFX9-NEXT: flat_atomic_inc v2, v[2:3], v4 glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_dword v[2:3], v0 +; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i32, i32* %ptr, i32 %id @@ -1268,19 +1198,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32* %ptr) #0 { ; CI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 4 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 4 -; CI-NEXT: v_mul_lo_u32 v0, v0, 4 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; CI-NEXT: v_mov_b32_e32 v2, 42 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; CI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc @@ -1288,19 +1213,14 @@ ; ; VI-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 4 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 4 -; VI-NEXT: v_mul_lo_u32 v0, v0, 4 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, 42 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; VI-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc @@ -1308,20 +1228,16 @@ ; ; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 4 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, 0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, 4 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, 4 -; GFX9-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 20, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v2 glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() @@ -1336,54 +1252,54 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 { ; CI-LABEL: atomic_inc_shl_base_lds_0_i64: ; CI: ; %bb.0: -; CI-NEXT: v_mul_lo_u32 v7, v0, 8 +; CI-NEXT: v_add_i32_e32 v4, vcc, 2, v0 +; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_add_i32_e32 v6, vcc, 2, v0 +; CI-NEXT: v_add_i32_e32 v0, vcc, 0, v0 +; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0 ; CI-NEXT: v_mov_b32_e32 v0, 9 -; CI-NEXT: v_add_i32_e32 v7, vcc, 0, v7 ; CI-NEXT: v_mov_b32_e32 v1, 0 -; CI-NEXT: v_add_i32_e32 v7, vcc, 16, v7 ; CI-NEXT: s_mov_b32 m0, -1 -; CI-NEXT: ds_inc_rtn_u64 v[0:1], v7, v[0:1] +; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: v_mov_b32_e32 v5, s1 ; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: v_mov_b32_e32 v4, s0 -; CI-NEXT: flat_store_dword v[2:3], v6 -; CI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; CI-NEXT: flat_store_dword v[2:3], v4 +; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v2, s0 +; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm ; ; VI-LABEL: atomic_inc_shl_base_lds_0_i64: ; VI: ; %bb.0: -; VI-NEXT: v_mul_lo_u32 v7, v0, 8 +; VI-NEXT: v_add_u32_e32 v4, vcc, 2, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_add_u32_e32 v6, vcc, 2, v0 +; VI-NEXT: v_add_u32_e32 v0, vcc, 0, v0 +; VI-NEXT: v_add_u32_e32 v2, vcc, 16, v0 ; VI-NEXT: v_mov_b32_e32 v0, 9 -; VI-NEXT: v_add_u32_e32 v7, vcc, 0, v7 ; VI-NEXT: v_mov_b32_e32 v1, 0 -; VI-NEXT: v_add_u32_e32 v7, vcc, 16, v7 ; VI-NEXT: s_mov_b32 m0, -1 -; VI-NEXT: ds_inc_rtn_u64 v[0:1], v7, v[0:1] +; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: flat_store_dword v[2:3], v6 -; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] +; VI-NEXT: flat_store_dword v[2:3], v4 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mul_lo_u32 v3, v0, 8 -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v1, 9 ; GFX9-NEXT: v_add_u32_e32 v4, 2, v0 -; GFX9-NEXT: v_add_u32_e32 v0, 0, v3 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_add_u32_e32 v0, 16, v0 -; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v0, v[1:2] +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_add_u32_e32 v0, 0, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 16, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 9 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -1480,79 +1396,65 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64* %out, i64* %ptr) #0 { ; CI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 8 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 8 -; CI-NEXT: v_mul_lo_u32 v4, v0, 8 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s3 -; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v4 -; CI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v1 -; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; CI-NEXT: v_mov_b32_e32 v5, s1 -; CI-NEXT: v_add_i32_e32 v2, vcc, s0, v4 -; CI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; CI-NEXT: v_mov_b32_e32 v4, 42 -; CI-NEXT: v_mov_b32_e32 v5, 0 -; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc +; CI-NEXT: v_mov_b32_e32 v4, s3 +; CI-NEXT: v_add_i32_e32 v5, vcc, s2, v0 +; CI-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v5 +; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc +; CI-NEXT: v_mov_b32_e32 v4, s1 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; CI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 8 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 8 -; VI-NEXT: v_mul_lo_u32 v4, v0, 8 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v3, vcc, v1, v3 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s3 -; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v4 -; VI-NEXT: v_addc_u32_e32 v2, vcc, v0, v3, vcc -; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v1 -; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v2, vcc -; VI-NEXT: v_mov_b32_e32 v5, s1 -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v4 -; VI-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc -; VI-NEXT: v_mov_b32_e32 v4, 42 -; VI-NEXT: v_mov_b32_e32 v5, 0 -; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc +; VI-NEXT: v_mov_b32_e32 v4, s3 +; VI-NEXT: v_add_u32_e32 v5, vcc, s2, v0 +; VI-NEXT: v_addc_u32_e32 v6, vcc, v4, v1, vcc +; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v5 +; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc +; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc +; VI-NEXT: v_mov_b32_e32 v4, s1 +; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 8 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, 0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, 8 -; GFX9-NEXT: v_mul_lo_u32 v4, v0, 8 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s3 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add3_u32 v3, v1, v2, v3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 42 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] glc +; GFX9-NEXT: v_mov_b32_e32 v4, s3 +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, v4, v1, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, 40, v5 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v6, vcc +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc +; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.tid = getelementptr i64, i64* %ptr, i32 %id @@ -1566,63 +1468,49 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64* %ptr) #0 { ; CI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 8 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 ; CI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; CI-NEXT: v_mul_hi_u32 v3, v0, 8 -; CI-NEXT: v_mul_lo_u32 v0, v0, 8 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v3 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 +; CI-NEXT: v_mov_b32_e32 v2, 42 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v4, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; CI-NEXT: v_add_i32_e32 v0, vcc, 40, v0 -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; CI-NEXT: s_endpgm ; ; VI-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; VI: ; %bb.0: -; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; VI-NEXT: v_mul_lo_u32 v1, v1, 8 -; VI-NEXT: v_mul_lo_u32 v2, v0, 0 ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; VI-NEXT: v_mul_hi_u32 v3, v0, 8 -; VI-NEXT: v_mul_lo_u32 v0, v0, 8 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; VI-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; VI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VI-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, 42 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; VI-NEXT: v_add_u32_e32 v0, vcc, 40, v0 -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; VI-NEXT: s_endpgm ; ; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_addr64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 8 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, 0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, 8 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, 8 -; GFX9-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 42 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, 40, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, 42 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc ; GFX9-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll @@ -7,16 +7,11 @@ define amdgpu_kernel void @is_private_vgpr(i8* addrspace(1)* %ptr.ptr) { ; CI-LABEL: is_private_vgpr: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 8 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 -; CI-NEXT: v_mul_lo_u32 v3, v0, 8 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: v_mul_hi_u32 v0, v0, 8 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: v_mov_b32_e32 v2, s1 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] @@ -29,13 +24,9 @@ ; ; GFX9-LABEL: is_private_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 8 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, 0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, 8 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, 8 -; GFX9-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll @@ -7,16 +7,11 @@ define amdgpu_kernel void @is_local_vgpr(i8* addrspace(1)* %ptr.ptr) { ; CI-LABEL: is_local_vgpr: ; CI: ; %bb.0: -; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; CI-NEXT: v_mul_lo_u32 v1, v1, 8 -; CI-NEXT: v_mul_lo_u32 v2, v0, 0 -; CI-NEXT: v_mul_lo_u32 v3, v0, 8 ; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; CI-NEXT: v_mul_hi_u32 v0, v0, 8 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v2 -; CI-NEXT: v_add_i32_e32 v1, vcc, v1, v0 +; CI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; CI-NEXT: v_lshl_b64 v[0:1], v[0:1], 3 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v3 +; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: v_mov_b32_e32 v2, s1 ; CI-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] @@ -29,13 +24,9 @@ ; ; GFX9-LABEL: is_local_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0 -; GFX9-NEXT: v_mul_lo_u32 v1, v1, 8 -; GFX9-NEXT: v_mul_lo_u32 v2, v0, 0 -; GFX9-NEXT: v_mul_hi_u32 v3, v0, 8 -; GFX9-NEXT: v_mul_lo_u32 v0, v0, 8 -; GFX9-NEXT: v_add3_u32 v1, v1, v2, v3 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX9-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -2,6 +2,7 @@ ; RUN: llc -global-isel -march=amdgcn -mcpu=tonga -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; XFAIL: * ; FIXME: Merge with DAG test define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in1, i32 %in2) { @@ -18,7 +19,6 @@ ; GFX8-NEXT: v_mov_b32_dpp v2, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm -; ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 @@ -39,18 +39,13 @@ define amdgpu_kernel void @update_dpp64_test(i64 addrspace(1)* %arg, i64 %in1, i64 %in2) { ; GFX8-LABEL: update_dpp64_test: ; GFX8: ; %bb.0: -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX8-NEXT: v_mul_lo_u32 v1, v1, 8 -; GFX8-NEXT: v_mul_lo_u32 v2, v0, 0 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_mul_hi_u32 v3, v0, 8 -; GFX8-NEXT: v_mul_lo_u32 v0, v0, 8 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v2 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX8-NEXT: v_lshlrev_b64 v[0:1], 3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 @@ -60,7 +55,6 @@ ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_endpgm -; ; GFX10-LABEL: update_dpp64_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -253,36 +253,28 @@ ; GFX6-LABEL: mubuf_store_sgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x200000 -; GFX6-NEXT: v_mul_hi_u32 v0, s4, 4 -; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: s_mul_i32 s3, s4, 4 -; GFX6-NEXT: s_mul_i32 s5, s5, 4 -; GFX6-NEXT: s_mul_i32 s4, s4, 0 -; GFX6-NEXT: s_add_i32 s5, s5, s4 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, s5, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s3 +; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: mubuf_store_sgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x200000 -; GFX7-NEXT: v_mul_hi_u32 v0, s4, 4 -; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: s_mul_i32 s3, s4, 4 -; GFX7-NEXT: s_mul_i32 s5, s5, 4 -; GFX7-NEXT: s_mul_i32 s4, s4, 0 -; GFX7-NEXT: s_add_i32 s5, s5, s4 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 2 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, s5, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s3 +; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 %soffset @@ -294,36 +286,20 @@ ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: v_mul_hi_u32 v2, s0, 4 -; GFX6-NEXT: s_mul_i32 s2, s0, 4 -; GFX6-NEXT: s_mul_i32 s1, s1, 4 -; GFX6-NEXT: s_mul_i32 s0, s0, 0 -; GFX6-NEXT: s_add_i32 s1, s1, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s1, v2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[0:1], 2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: v_mul_hi_u32 v2, s0, 4 -; GFX7-NEXT: s_mul_i32 s2, s0, 4 -; GFX7-NEXT: s_mul_i32 s1, s1, 4 -; GFX7-NEXT: s_mul_i32 s0, s0, 0 -; GFX7-NEXT: s_add_i32 s1, s1, s0 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s1, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[0:1], 2 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 %soffset @@ -335,36 +311,20 @@ ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: v_mul_hi_u32 v2, s0, 4 -; GFX6-NEXT: s_mul_i32 s2, s0, 4 -; GFX6-NEXT: s_mul_i32 s1, s1, 4 -; GFX6-NEXT: s_mul_i32 s0, s0, 0 -; GFX6-NEXT: s_add_i32 s1, s1, s0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s1, v2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX6-NEXT: s_mov_b32 s6, 0 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[0:1], 2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: s_mov_b64 s[4:5], 0 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:1024 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset_offset256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: v_mul_hi_u32 v2, s0, 4 -; GFX7-NEXT: s_mul_i32 s2, s0, 4 -; GFX7-NEXT: s_mul_i32 s1, s1, 4 -; GFX7-NEXT: s_mul_i32 s0, s0, 0 -; GFX7-NEXT: s_add_i32 s1, s1, s0 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s1, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc ; GFX7-NEXT: s_mov_b32 s6, 0 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[0:1], 2 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: s_mov_b64 s[4:5], 0 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:1024 ; GFX7-NEXT: s_endpgm %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i32 %soffset @@ -376,49 +336,33 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_mov_b32 s1, 0 -; GFX6-NEXT: s_movk_i32 s0, 0x400 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 -; GFX6-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x200000 -; GFX6-NEXT: v_mov_b32_e32 v2, s0 +; GFX6-NEXT: s_movk_i32 s4, 0x400 +; GFX6-NEXT: s_mov_b32 s5, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX6-NEXT: v_mul_hi_u32 v2, s2, 4 -; GFX6-NEXT: s_mul_i32 s0, s2, 4 -; GFX6-NEXT: s_mul_i32 s3, s3, 4 -; GFX6-NEXT: s_mul_i32 s2, s2, 0 -; GFX6-NEXT: s_add_i32 s3, s3, s2 +; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s3, v2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX6-NEXT: s_mov_b32 s2, s1 +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b64 s[0:1], 0 +; GFX6-NEXT: s_mov_b32 s2, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_mov_b32 s1, 0 -; GFX7-NEXT: s_movk_i32 s0, 0x400 -; GFX7-NEXT: v_mov_b32_e32 v3, s1 -; GFX7-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x200000 -; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_movk_i32 s4, 0x400 +; GFX7-NEXT: s_mov_b32 s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 -; GFX7-NEXT: v_mul_hi_u32 v2, s2, 4 -; GFX7-NEXT: s_mul_i32 s0, s2, 4 -; GFX7-NEXT: s_mul_i32 s3, s3, 4 -; GFX7-NEXT: s_mul_i32 s2, s2, 0 -; GFX7-NEXT: s_add_i32 s3, s3, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s3, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc -; GFX7-NEXT: s_mov_b32 s2, s1 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: s_mov_b32 s2, s5 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i32 256 @@ -431,35 +375,25 @@ ; GFX6-LABEL: mubuf_store_sgpr_ptr_vgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, 4 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: s_mov_b32 s2, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: mubuf_store_sgpr_ptr_vgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX7-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX7-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX7-NEXT: v_mul_hi_u32 v0, v0, 4 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 %voffset store i32 0, i32 addrspace(1)* %gep @@ -470,37 +404,27 @@ ; GFX6-LABEL: mubuf_store_sgpr_ptr_vgpr_offset_offset4095: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, 4 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: s_mov_b32 s2, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc -; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[0:3], s4 addr64 +; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: mubuf_store_sgpr_ptr_vgpr_offset_offset4095: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX7-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX7-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX7-NEXT: v_mul_hi_u32 v0, v0, 4 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc -; GFX7-NEXT: buffer_store_dword v0, v[1:2], s[0:3], s4 addr64 +; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], s4 addr64 ; GFX7-NEXT: s_endpgm %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i32 %voffset %gep1 = getelementptr i32, i32 addrspace(1)* %gep0, i32 4095 @@ -511,35 +435,25 @@ ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4095_vgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, 4 ; GFX6-NEXT: s_add_u32 s4, s2, 0x3ffc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s6, 0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: s_addc_u32 s5, s3, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s7, 0xf000 -; GFX6-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: mubuf_store_sgpr_ptr_offset4095_vgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX7-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX7-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX7-NEXT: v_mul_hi_u32 v0, v0, 4 ; GFX7-NEXT: s_add_u32 s4, s2, 0x3ffc -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s6, 0 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX7-NEXT: s_addc_u32 s5, s3, 0 -; GFX7-NEXT: v_mov_b32_e32 v0, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s7, 0xf000 -; GFX7-NEXT: buffer_store_dword v0, v[1:2], s[4:7], 0 addr64 +; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_endpgm %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i32 4095 %gep1 = getelementptr i32, i32 addrspace(1)* %gep0, i32 %voffset @@ -797,15 +711,11 @@ ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 -; GFX6-NEXT: v_mul_hi_u32 v0, s2, 4 -; GFX6-NEXT: s_mul_i32 s4, s2, 0 -; GFX6-NEXT: s_mul_i32 s3, s3, 4 -; GFX6-NEXT: s_add_i32 s3, s3, s4 -; GFX6-NEXT: s_mul_i32 s2, s2, 4 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, s3, v0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 +; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog @@ -815,15 +725,11 @@ ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 -; GFX7-NEXT: v_mul_hi_u32 v0, s2, 4 -; GFX7-NEXT: s_mul_i32 s4, s2, 0 -; GFX7-NEXT: s_mul_i32 s3, s3, 4 -; GFX7-NEXT: s_add_i32 s3, s3, s4 -; GFX7-NEXT: s_mul_i32 s2, s2, 4 -; GFX7-NEXT: v_add_i32_e32 v1, vcc, s3, v0 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog @@ -836,17 +742,9 @@ ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: v_mul_hi_u32 v2, s0, 4 -; GFX6-NEXT: s_mul_i32 s2, s0, 0 -; GFX6-NEXT: s_mul_i32 s1, s1, 4 -; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_mul_i32 s0, s0, 4 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s1, v2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog @@ -854,17 +752,9 @@ ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: v_mul_hi_u32 v2, s0, 4 -; GFX7-NEXT: s_mul_i32 s2, s0, 0 -; GFX7-NEXT: s_mul_i32 s1, s1, 4 -; GFX7-NEXT: s_add_i32 s1, s1, s2 -; GFX7-NEXT: s_mul_i32 s0, s0, 4 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s1, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog @@ -877,17 +767,9 @@ ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: v_mul_hi_u32 v2, s0, 4 -; GFX6-NEXT: s_mul_i32 s2, s0, 0 -; GFX6-NEXT: s_mul_i32 s1, s1, 4 -; GFX6-NEXT: s_add_i32 s1, s1, s2 -; GFX6-NEXT: s_mul_i32 s0, s0, 4 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s1, v2 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog @@ -895,17 +777,9 @@ ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset_offset256: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: v_mul_hi_u32 v2, s0, 4 -; GFX7-NEXT: s_mul_i32 s2, s0, 0 -; GFX7-NEXT: s_mul_i32 s1, s1, 4 -; GFX7-NEXT: s_add_i32 s1, s1, s2 -; GFX7-NEXT: s_mul_i32 s0, s0, 4 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s1, v2 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog @@ -918,48 +792,32 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) { ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset: ; GFX6: ; %bb.0: -; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX6-NEXT: v_mul_hi_u32 v4, s0, 4 ; GFX6-NEXT: s_movk_i32 s4, 0x400 ; GFX6-NEXT: s_mov_b32 s5, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, s4 -; GFX6-NEXT: s_mul_i32 s2, s0, 0 -; GFX6-NEXT: s_mul_i32 s1, s1, 4 -; GFX6-NEXT: s_add_i32 s1, s1, s2 +; GFX6-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX6-NEXT: s_mul_i32 s0, s0, 4 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s1, v4 -; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX6-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s5 -; GFX6-NEXT: s_mov_b64 s[0:1], 0 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset: ; GFX7: ; %bb.0: -; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 -; GFX7-NEXT: v_mul_hi_u32 v4, s0, 4 ; GFX7-NEXT: s_movk_i32 s4, 0x400 ; GFX7-NEXT: s_mov_b32 s5, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: s_mul_i32 s2, s0, 0 -; GFX7-NEXT: s_mul_i32 s1, s1, 4 -; GFX7-NEXT: s_add_i32 s1, s1, s2 +; GFX7-NEXT: s_bfe_i64 s[0:1], s[2:3], 0x200000 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc -; GFX7-NEXT: s_mul_i32 s0, s0, 4 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, s1, v4 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s5 -; GFX7-NEXT: s_mov_b64 s[0:1], 0 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog @@ -973,34 +831,24 @@ ; GFX6-LABEL: mubuf_load_sgpr_ptr_vgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, 4 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_vgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX7-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX7-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX7-NEXT: v_mul_hi_u32 v0, v0, 4 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep = getelementptr float, float addrspace(1)* %ptr, i32 %voffset @@ -1012,36 +860,26 @@ ; GFX6-LABEL: mubuf_load_sgpr_ptr_vgpr_offset_offset4095: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, 4 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_mov_b32 s0, s2 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX6-NEXT: s_mov_b32 s1, s3 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_movk_i32 s4, 0x3ffc -; GFX6-NEXT: buffer_load_dword v0, v[1:2], s[0:3], s4 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_vgpr_offset_offset4095: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX7-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX7-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX7-NEXT: v_mul_hi_u32 v0, v0, 4 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_mov_b32 s0, s2 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3 ; GFX7-NEXT: s_mov_b32 s1, s3 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_movk_i32 s4, 0x3ffc -; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[0:3], s4 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i32 %voffset @@ -1053,34 +891,24 @@ ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset: ; GFX6: ; %bb.0: ; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX6-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX6-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX6-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX6-NEXT: v_mul_hi_u32 v0, v0, 4 ; GFX6-NEXT: s_add_u32 s0, s2, 0x3ffc -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX6-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX6-NEXT: s_addc_u32 s1, s3, 0 ; GFX6-NEXT: s_mov_b32 s2, 0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 +; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX7-NEXT: v_mul_lo_u32 v2, v1, 4 -; GFX7-NEXT: v_mul_lo_u32 v3, v0, 0 -; GFX7-NEXT: v_mul_lo_u32 v1, v0, 4 -; GFX7-NEXT: v_mul_hi_u32 v0, v0, 4 ; GFX7-NEXT: s_add_u32 s0, s2, 0x3ffc -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v3 +; GFX7-NEXT: v_lshl_b64 v[0:1], v[0:1], 2 ; GFX7-NEXT: s_addc_u32 s1, s3, 0 ; GFX7-NEXT: s_mov_b32 s2, 0 -; GFX7-NEXT: v_add_i32_e32 v2, vcc, v2, v0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 -; GFX7-NEXT: buffer_load_dword v0, v[1:2], s[0:3], 0 addr64 +; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: ; return to shader part epilog %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095