Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -263,11 +263,26 @@
   return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
 }
 
+// Get the adjacent instruction which defines the physical Reg used by this MI.
+static MachineInstr *getPhysRegAdjacentInstr(MachineInstr &MI, unsigned Reg) {
+  if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+    return nullptr;
+  // Only look at the previous instruction for the defining instr.
+  MachineBasicBlock::reverse_iterator I = MI;
+  I++;
+  if (I->definesRegister(Reg))
+    return &*I;
+  return nullptr;
+}
+
+// Used to extend addToListsIfDependent to express bailing.
+enum AddToStat { AddToTrue, AddToFalse, AddToBail };
 // Add MI and its defs to the lists if MI reads one of the defs that are
 // already in the list. Returns true in that case.
-static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
-                                  DenseSet<unsigned> &PhysRegUses,
-                                  SmallVectorImpl<MachineInstr *> &Insts) {
+static AddToStat addToListsIfDependent(MachineInstr &MI,
+                                       DenseSet<unsigned> &RegDefs,
+                                       DenseSet<unsigned> &PhysRegUses,
+                                       SmallVectorImpl<MachineInstr *> &Insts) {
   for (MachineOperand &Use : MI.operands()) {
     // If one of the defs is read, then there is a use of Def between I and the
     // instruction that I will potentially be merged with. We will need to move
@@ -281,13 +296,23 @@
         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
          (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
           PhysRegUses.count(Use.getReg())))) {
+      // If this MI depends on a physReg such as SCC, find and add the defining
+      // instr. If it is not found, bail on this optimization.
+      if (Use.isImplicit() &&
+          TargetRegisterInfo::isPhysicalRegister(Use.getReg())) {
+        MachineInstr *Prev = getPhysRegAdjacentInstr(MI, Use.getReg());
+        if (Prev)
+          Insts.push_back(&*Prev);
+        else
+          return AddToBail;
+      }
       Insts.push_back(&MI);
       addDefsUsesToList(MI, RegDefs, PhysRegUses);
-      return true;
+      return AddToTrue;
     }
   }
 
-  return false;
+  return AddToFalse;
 }
 
 static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
@@ -577,8 +602,11 @@
       // When we match I with another DS instruction we will be moving I down
       // to the location of the matched instruction any uses of I will need to
      // be moved down as well.
-      addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
-                            CI.InstsToMove);
+      AddToStat AStat = addToListsIfDependent(*MBBI, RegDefsToMove,
+                                              PhysRegUsesToMove,
+                                              CI.InstsToMove);
+      if (AStat == AddToBail)
+        return false;
       continue;
     }
 
@@ -592,9 +620,13 @@
     //   DS_WRITE_B32 addr, f(w), idx1
     // where the DS_READ_B32 ends up in InstsToMove and therefore prevents
     // merging of the two writes.
-    if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
-                              CI.InstsToMove))
+    AddToStat AStat = addToListsIfDependent(*MBBI, RegDefsToMove,
+                                            PhysRegUsesToMove,
+                                            CI.InstsToMove);
+    if (AStat == AddToTrue)
       continue;
+    if (AStat == AddToBail)
+      return false;
 
     bool Match = true;
     for (unsigned i = 0; i < NumAddresses; i++) {
Index: test/CodeGen/AMDGPU/scc-add-lshl-addc.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scc-add-lshl-addc.ll
@@ -0,0 +1,64 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -o - | FileCheck -check-prefix=CHECK %s
+
+; CHECK: s_add_u32
+; CHECK: s_addc_u32
+; CHECK: s_add_u32
+; CHECK: s_addc_u32
+; CHECK: s_add_u32
+; CHECK-NOT: s_lshl_b32
+; CHECK: s_addc_u32
+; CHECK: global_load_dword
+
+%0 = type { [32 x %1], [32 x %1*], i32, [32 x i32], i32, [8 x i8] }
+%1 = type { %2, [1024 x %3], [1024 x %3*], %10, [1024 x i32], [1024 x i64], [1024 x i64], [1024 x i64], [1024 x i64] }
+%2 = type { %3, %6, i64, [8 x i8], [64 x %7], [1 x %9] }
+%3 = type { %4, %5, %3* }
+%4 = type { i64, i64, i64, i64, i32 }
+%5 = type { i8, i8, i16, i16, i16, i16, i64 }
+%6 = type { %3 }
+%7 = type { %8*, %8*, i8*, i8*, [16384 x i8] }
+%8 = type { %8*, %8*, i8*, i8*, [0 x i8] }
+%9 = type { %8*, %8*, i8*, i8*, [256 x i8] }
+%10 = type { [1024 x i16] }
+%11 = type <{ [20 x i8*], i8**, i32, [4 x i8] }>
+
+@omptarget_nvptx_device_State = external addrspace(1) externally_initialized global [64 x %0], align 16
+@usedSlotIdx = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+@execution_param = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+@omptarget_nvptx_globalArgs = external addrspace(3) externally_initialized global %11, align 8
+
+define amdgpu_kernel void @__omp_offloading_802_d9e513_main_l28([992 x i32] addrspace(1)* %arg) local_unnamed_addr {
+bb:
+  %tmp = tail call i64 @__ockl_get_local_size()
+  %tmp1 = trunc i64 %tmp to i32
+  br i1 undef, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  ret void
+
+bb3:                                              ; preds = %bb
+  %tmp4 = load i32, i32 addrspace(3)* @execution_param, align 4
+  %tmp5 = and i32 %tmp4, 1
+  %tmp6 = icmp eq i32 %tmp5, 0
+  %tmp7 = select i1 %tmp6, i32 0, i32 %tmp1
+  %tmp8 = trunc i32 %tmp7 to i16
+  store i16 %tmp8, i16* undef, align 2
+  %tmp9 = getelementptr inbounds %1, %1* null, i64 0, i32 0, i32 4, i64 0, i32 3
+  store i8* undef, i8** %tmp9, align 8
+  store i8** getelementptr (%11, %11* addrspacecast (%11 addrspace(3)* @omptarget_nvptx_globalArgs to %11*), i64 0, i32 0, i64 0), i8** addrspace(3)* getelementptr inbounds (%11, %11 addrspace(3)* @omptarget_nvptx_globalArgs, i32 0, i32 1), align 8
+  %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = getelementptr inbounds [992 x i32], [992 x i32] addrspace(1)* %arg, i64 0, i64 %tmp11
+  %tmp13 = load i32, i32 addrspace(1)* %tmp12, align 4
+  %tmp14 = add nsw i32 %tmp13, %tmp10
+  store i32 %tmp14, i32 addrspace(1)* %tmp12, align 4
+  %tmp15 = load i32, i32 addrspace(3)* @usedSlotIdx, align 4
+  %tmp16 = sext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds [64 x %0], [64 x %0] addrspace(1)* @omptarget_nvptx_device_State, i64 0, i64 %tmp16, i32 3, i64 undef
+  %tmp18 = addrspacecast i32 addrspace(1)* %tmp17 to i32*
+  %tmp19 = atomicrmw volatile add i32* %tmp18, i32 0 seq_cst
+  unreachable
+}
+
+declare i64 @__ockl_get_local_size() local_unnamed_addr
+declare i32 @llvm.amdgcn.workgroup.id.x()
Index: test/CodeGen/AMDGPU/scc-missing-add.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scc-missing-add.mir
@@ -0,0 +1,159 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass=si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass=si-load-store-opt -o - %s | FileCheck -check-prefix=GFX9 %s
+
+# This test presents a sequence of DS_READ instructions that could be combined
+# into a single DS_READ provided all the dependent instructions are correctly
+# identified and moved. In this situation an S_ADDC depends on an S_ADD;
+# however, the S_ADD is more than 10 instructions away and will not be found.
+# The SILoadStoreOptimizer pass needs to detect that the S_ADD was not found
+# and abandon the transformation.
+
+# GFX9-LABEL: name: __omp_offloading_802_d9e513_main_l28
+# GFX9: DS_READ
+# GFX9: DS_WRITE
+# GFX9: S_ADD
+# GFX9: S_ADDC
+# GFX9: GLOBAL_LOAD_DWORD
+# GFX9: GLOBAL_STORE_DWORD
+# GFX9: DS_READ
+
+--- |
+
+  %0 = type { [32 x %1], [32 x %1*], i32, [32 x i32], i32, [8 x i8] }
+  %1 = type { %2, [1024 x %3], [1024 x %3*], %10, [1024 x i32], [1024 x i64], [1024 x i64], [1024 x i64], [1024 x i64] }
+  %2 = type { %3, %6, i64, [8 x i8], [64 x %7], [1 x %9] }
+  %3 = type { %4, %5, %3* }
+  %4 = type { i64, i64, i64, i64, i32 }
+  %5 = type { i8, i8, i16, i16, i16, i16, i64 }
+  %6 = type { %3 }
+  %7 = type { %8*, %8*, i8*, i8*, [16384 x i8] }
+  %8 = type { %8*, %8*, i8*, i8*, [0 x i8] }
+  %9 = type { %8*, %8*, i8*, i8*, [256 x i8] }
+  %10 = type { [1024 x i16] }
+  %11 = type <{ [20 x i8*], i8**, i32, [4 x i8] }>
+
+  @omptarget_nvptx_device_State = external addrspace(1) externally_initialized global [64 x %0], align 16
+  @usedSlotIdx = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+  @execution_param = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+  @omptarget_nvptx_globalArgs = external addrspace(3) externally_initialized global %11, align 8
+
+  define amdgpu_kernel void @__omp_offloading_802_d9e513_main_l28([992 x i32] addrspace(1)* %arg) local_unnamed_addr #0 {
+  bb:
+    %tmp = tail call i64 @__ockl_get_local_size()
+    br i1 undef, label %bb2, label %bb3, !amdgpu.uniform !0
+
+  bb2:                                              ; preds = %bb
+    ret void
+
+  bb3:                                              ; preds = %bb
+    %__omp_offloading_802_d9e513_main_l28.kernarg.segment = call nonnull align 16 dereferenceable(44) i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+    %arg.kernarg.offset = getelementptr inbounds i8, i8 addrspace(4)* %__omp_offloading_802_d9e513_main_l28.kernarg.segment, i64 36
+    %arg.kernarg.offset.cast = bitcast i8 addrspace(4)* %arg.kernarg.offset to [992 x i32] addrspace(1)* addrspace(4)*, !amdgpu.uniform !0, !amdgpu.noclobber !0
+    %arg.load = load [992 x i32] addrspace(1)*, [992 x i32] addrspace(1)* addrspace(4)* %arg.kernarg.offset.cast, align 4, !invariant.load !0
+    %tmp1 = trunc i64 %tmp to i32
+    %tmp4 = load i32, i32 addrspace(3)* @execution_param, align 4
+    %tmp5 = and i32 %tmp4, 1
+    %tmp6 = icmp eq i32 %tmp5, 0
+    %tmp7 = select i1 %tmp6, i32 0, i32 %tmp1
+    %tmp8 = trunc i32 %tmp7 to i16
+    store i16 %tmp8, i16* undef, align 2
+    store i8* undef, i8** inttoptr (i64 184 to i8**), align 8
+    store i8** getelementptr (%11, %11* addrspacecast (%11 addrspace(3)* @omptarget_nvptx_globalArgs to %11*), i64 0, i32 0, i64 0), i8** addrspace(3)* getelementptr inbounds (%11, %11 addrspace(3)* @omptarget_nvptx_globalArgs, i32 0, i32 1), align 8
+    %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+    %tmp11 = sext i32 %tmp10 to i64
+    %tmp12 = getelementptr inbounds [992 x i32], [992 x i32] addrspace(1)* %arg.load, i64 0, i64 %tmp11, !amdgpu.uniform !0
+    %tmp13 = load i32, i32 addrspace(1)* %tmp12, align 4
+    %tmp14 = add nsw i32 %tmp13, %tmp10
+    store i32 %tmp14, i32 addrspace(1)* %tmp12, align 4
+    %tmp15 = load i32, i32 addrspace(3)* @usedSlotIdx, align 4
+    %tmp16 = sext i32 %tmp15 to i64
+    %tmp17 = getelementptr inbounds [64 x %0], [64 x %0] addrspace(1)* @omptarget_nvptx_device_State, i64 0, i64 %tmp16, i32 3, i64 undef
+    %0 = addrspacecast i32 addrspace(1)* %tmp17 to i32*
+    %tmp19 = atomicrmw volatile add i32* %0, i32 0 seq_cst
+    unreachable
+  }
+
+  declare i64 @__ockl_get_local_size() local_unnamed_addr
+  declare i32 @llvm.amdgcn.workgroup.id.x()
+  declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr()
+
+  !0 = !{}
+
+...
+---
+name: __omp_offloading_802_d9e513_main_l28
+body: |
+  bb.0.bb:
+    successors: %bb.1(0x7fffffff), %bb.2(0x00000001)
+    liveins: $sgpr0_sgpr1, $sgpr2
+
+    %3:sreg_32_xm0 = COPY $sgpr2
+    %2:sgpr_64 = COPY $sgpr0_sgpr1
+    ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr101
+    %5:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @__ockl_get_local_size + 4, target-flags(amdgpu-gotprel32-hi) @__ockl_get_local_size + 4, implicit-def dead $scc
+    %6:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed %5, 0, 0 :: (dereferenceable invariant load 8 from got, addrspace 4)
+    %7:sreg_128 = COPY $sgpr96_sgpr97_sgpr98_sgpr99
+    %8:sreg_32_xm0 = COPY $sgpr101
+    $sgpr0_sgpr1_sgpr2_sgpr3 = COPY %7
+    $sgpr4 = COPY %8
+    $sgpr30_sgpr31 = SI_CALL killed %6, @__ockl_get_local_size, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit-def $vgpr0_vgpr1
+    ADJCALLSTACKDOWN 0, 4, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr101
+    %53:vreg_64 = COPY $vgpr0_vgpr1
+    S_CBRANCH_SCC1 %bb.2, implicit undef $scc
+    S_BRANCH %bb.1
+
+  bb.1.bb2:
+    S_ENDPGM 0
+
+  bb.2.bb3:
+    %10:sreg_64_xexec = S_LOAD_DWORDX2_IMM %2, 36, 0 :: (dereferenceable invariant load 8 from %ir.arg.kernarg.offset.cast, align 4, addrspace 4)
+    %12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    %13:vgpr_32 = DS_READ_B32_gfx9 %12, 184, 0, implicit $exec :: (dereferenceable load 4 from @execution_param, addrspace 3)
+    %55:vgpr_32 = V_BFE_I32 %13, 0, 1, implicit $exec
+    %16:vgpr_32 = V_AND_B32_e32 killed %55, %53.sub0, implicit $exec
+    %18:sreg_64 = IMPLICIT_DEF
+    %19:vreg_64 = COPY %18
+    FLAT_STORE_SHORT killed %19, killed %16, 0, 0, 0, implicit $exec, implicit $flat_scr :: (store 2 into `i16* undef`)
+    %20:sreg_32_xm0 = S_GETREG_B32 31759
+    %21:sreg_32_xm0 = S_LSHL_B32 killed %20, 16, implicit-def dead $scc
+    %56:vgpr_32 = V_MOV_B32_e32 8, implicit $exec
+    %57:vgpr_32 = COPY killed %21
+    %24:vreg_64 = REG_SEQUENCE killed %56, %subreg.sub0, killed %57, %subreg.sub1
+    DS_WRITE_B64_gfx9 %12, killed %24, 168, 0, implicit $exec :: (store 8 into `i8** addrspace(3)* getelementptr inbounds (%11, %11 addrspace(3)* @omptarget_nvptx_globalArgs, i32 0, i32 1)`, addrspace 3)
+    %25:sreg_32_xm0 = S_ASHR_I32 %3, 31, implicit-def dead $scc
+    %27:sreg_64 = REG_SEQUENCE %3, %subreg.sub0, %25, %subreg.sub1
+    %29:sreg_64 = S_LSHL_B64 killed %27, 2, implicit-def dead $scc
+    %69:sreg_32_xm0 = S_ADD_U32 %10.sub0, %29.sub0, implicit-def $scc
+    %150:vgpr_32 = COPY killed %21
+    %151:vgpr_32 = COPY killed %21
+    %152:vgpr_32 = COPY killed %21
+    %153:vgpr_32 = COPY killed %21
+    %154:vgpr_32 = COPY killed %21
+    %155:vgpr_32 = COPY killed %21
+    %156:vgpr_32 = COPY killed %21
+    %157:vgpr_32 = COPY killed %21
+    %158:vgpr_32 = COPY killed %21
+    %159:vgpr_32 = COPY killed %21
+    %160:vgpr_32 = COPY killed %21
+    %70:sreg_32_xm0 = S_ADDC_U32 %10.sub1, %29.sub1, implicit-def $scc, implicit $scc
+    %30:sreg_64 = REG_SEQUENCE %69, %subreg.sub0, %70, %subreg.sub1
+    %130:sreg_64 = REG_SEQUENCE %160, %157
+    %131:sreg_64 = REG_SEQUENCE %158, %159
+    %32:vreg_64 = COPY %30
+    %31:vgpr_32 = GLOBAL_LOAD_DWORD %32, 0, 0, 0, implicit $exec :: (load 4 from %ir.tmp12, addrspace 1)
+    %58:vgpr_32 = nsw V_ADD_U32_e64 %31, %3, 0, implicit $exec
+    GLOBAL_STORE_DWORD %32, %58, 0, 0, 0, implicit $exec :: (store 4 into %ir.tmp12, addrspace 1)
+    %37:vgpr_32 = DS_READ_B32_gfx9 %12, 0, 0, implicit $exec :: (dereferenceable load 4 from @usedSlotIdx, addrspace 3)
+    %38:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @omptarget_nvptx_device_State + 4, target-flags(amdgpu-gotprel32-hi) @omptarget_nvptx_device_State + 4, implicit-def dead $scc
+    %39:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed %38, 0, 0 :: (dereferenceable invariant load 8 from got, addrspace 4)
+    %40:sreg_32_xm0 = S_MOV_B32 37501328
+    %43:vreg_64 = COPY killed %39
+    %41:vreg_64, %42:sreg_64 = V_MAD_I64_I32 killed %37, killed %40, %43, 0, implicit $exec
+    %65:sgpr_32 = S_MOV_B32 37501188
+    %60:vgpr_32 = V_ADD_I32_e32 %65, %41.sub0, implicit-def $vcc, implicit $exec
+    %62:sreg_64_xexec = COPY killed $vcc
+    %61:vgpr_32, dead %63:sreg_64_xexec = V_ADDC_U32_e64 %41.sub1, 0, killed %62, 0, implicit $exec
+    %59:vreg_64 = REG_SEQUENCE %60, %subreg.sub0, %61, %subreg.sub1
+    %52:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+    FLAT_ATOMIC_ADD %59, %52, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load store seq_cst 4 on %ir.0)
+...
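
Note (not part of the patch): the sketch below is a minimal, standalone model of the three-state result that addToListsIfDependent now returns and of how its callers in findMatchingInst are expected to react to it. ToyInstr, canMergeAcross, and the boolean fields are invented stand-ins for MachineInstr state; only the AddToStat enum and the bail/continue control flow mirror the patch.

// Standalone toy model (not LLVM code) of the three-state dependency scan.
#include <cstdio>
#include <vector>

enum AddToStat { AddToTrue, AddToFalse, AddToBail };

struct ToyInstr {
  bool DependsOnTrackedDef; // reads a register already recorded as a def
  bool UsesImplicitPhysReg; // e.g. reads SCC produced by an earlier S_ADD_U32
  bool DefinerIsAdjacent;   // the SCC definer is the immediately preceding instr
};

// Shaped like addToListsIfDependent after the patch: AddToBail means the
// physical-register definer could not be located, so the merge must stop.
static AddToStat addToListsIfDependent(const ToyInstr &MI,
                                       std::vector<const ToyInstr *> &Insts) {
  if (!MI.DependsOnTrackedDef)
    return AddToFalse;
  if (MI.UsesImplicitPhysReg && !MI.DefinerIsAdjacent)
    return AddToBail;   // defining instr not found: abandon the optimization
  Insts.push_back(&MI); // record the instr so it is moved with the memory op
  return AddToTrue;
}

// Mirrors the caller pattern: bail aborts the whole merge, true keeps
// scanning, false falls through to the remaining checks.
static bool canMergeAcross(const std::vector<ToyInstr> &Between) {
  std::vector<const ToyInstr *> InstsToMove;
  for (const ToyInstr &MI : Between) {
    AddToStat AStat = addToListsIfDependent(MI, InstsToMove);
    if (AStat == AddToBail)
      return false;
    if (AStat == AddToTrue)
      continue;
    // ... address matching and alias checks would go here ...
  }
  return true;
}

int main() {
  // An S_ADDC-like instr whose SCC definer is not adjacent: the merge fails.
  std::vector<ToyInstr> Between = {{true, true, false}};
  std::printf("merge allowed: %s\n", canMergeAcross(Between) ? "yes" : "no");
  return 0;
}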