Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -174,9 +174,13 @@
 }
 
 static void addDefsToList(const MachineInstr &MI, DenseSet<unsigned> &Defs) {
-  // XXX: Should this be looking for implicit defs?
-  for (const MachineOperand &Def : MI.defs())
-    Defs.insert(Def.getReg());
+  // Scan every operand rather than MI.defs(): defs() only spans the explicit
+  // defs declared by the instruction description, so it skips implicit defs
+  // and the register defs of variadic instructions such as INLINEASM.
+  for (const MachineOperand &Op : MI.operands()) {
+    if (Op.isReg() && Op.isDef())
+      Defs.insert(Op.getReg());
+  }
 }
 
 static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
Index: test/CodeGen/AMDGPU/merge-load-store.mir
===================================================================
--- test/CodeGen/AMDGPU/merge-load-store.mir
+++ test/CodeGen/AMDGPU/merge-load-store.mir
@@ -24,6 +24,41 @@
     store i32 %4, i32 addrspace(3)* %ptr.0
     ret void
   }
+
+  @lds0 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+  @lds1 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+  @lds2 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+  @lds3 = external dso_local unnamed_addr addrspace(3) global [256 x i32], align 4
+
+  define void @asm_defines_address() #0 {
+  bb:
+    %tmp1 = load i32, i32 addrspace(3)* getelementptr inbounds ([256 x i32], [256 x i32] addrspace(3)* @lds0, i32 0, i32 0), align 4
+    %0 = and i32 %tmp1, 255
+    %tmp3 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 undef), align 4
+    %tmp6 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds3, i32 0, i32 undef), align 4
+    %tmp7 = tail call i32 asm "v_or_b32 $0, 0, $1", "=v,v"(i32 %tmp6) #1
+    %tmp10 = lshr i32 %tmp7, 16
+    %tmp11 = and i32 %tmp10, 255
+    %tmp12 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 %tmp11
+    %tmp13 = load i32, i32 addrspace(3)* %tmp12, align 4
+    %tmp14 = xor i32 %tmp3, %tmp13
+    %tmp15 = lshr i32 %tmp14, 8
+    %tmp16 = and i32 %tmp15, 16711680
+    %tmp19 = lshr i32 %tmp16, 16
+    %tmp20 = and i32 %tmp19, 255
+    %tmp21 = getelementptr inbounds [256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 %tmp20
+    %tmp22 = load i32, i32 addrspace(3)* %tmp21, align 4
+    %tmp24 = load i32, i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds2, i32 0, i32 undef), align 4
+    %tmp25 = xor i32 %tmp22, %tmp24
+    %tmp26 = and i32 %tmp25, -16777216
+    %tmp28 = or i32 %0, %tmp26
+    store volatile i32 %tmp28, i32 addrspace(1)* undef
+    ret void
+  }
+
+  attributes #0 = { convergent nounwind }
+  attributes #1 = { convergent nounwind readnone }
+
 ...
 ---
 name: mem_dependency
@@ -68,3 +103,29 @@
     S_ENDPGM
 
 ...
+---
+# Make sure the asm def isn't moved after the point where it's used for
+# the address.
+# CHECK-LABEL: name: asm_defines_address
+# CHECK: DS_READ2ST64_B32
+# CHECK: DS_READ2ST64_B32
+# CHECK: INLINEASM
+# CHECK: DS_READ_B32
+# CHECK: DS_READ_B32
+name: asm_defines_address
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '' }
+body: |
+  bb.0:
+    %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %2:vgpr_32 = DS_READ_B32 %1, 3072, 0, implicit $m0, implicit $exec :: (dereferenceable load 4 from `i32 addrspace(3)* getelementptr inbounds ([256 x i32], [256 x i32] addrspace(3)* @lds0, i32 0, i32 0)`, addrspace 3)
+    %3:vgpr_32 = DS_READ_B32 %1, 2048, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds1, i32 0, i32 undef)`, addrspace 3)
+    %4:vgpr_32 = DS_READ_B32 %1, 1024, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds3, i32 0, i32 undef)`, addrspace 3)
+    INLINEASM &"v_or_b32 $0, 0, $1", 32, 327690, def %0, 327689, %4
+    %5:vgpr_32 = DS_READ_B32 %0, 2048, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp12, addrspace 3)
+    %6:vgpr_32 = DS_READ_B32 %5, 2048, 0, implicit $m0, implicit $exec :: (load 4 from %ir.tmp21, addrspace 3)
+    %7:vgpr_32 = DS_READ_B32 %1, 0, 0, implicit $m0, implicit $exec :: (load 4 from `i32 addrspace(3)* getelementptr ([256 x i32], [256 x i32] addrspace(3)* @lds2, i32 0, i32 undef)`, addrspace 3)
+    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit %6, implicit %7
+
+...