Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -263,6 +263,20 @@
   return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true);
 }
 
+// Find the associated instruction which sets SCC for an MI.
+static MachineInstr *addSCCDependInstr(MachineInstr &MI) {
+  if (!MI.hasRegisterImplicitUseOperand(AMDGPU::SCC))
+    return nullptr;
+
+  MachineBasicBlock::reverse_iterator I = MI, E = MI.getParent()->rend();
+  I++;
+  for (; I != E; ++I)
+    if (I->definesRegister(AMDGPU::SCC))
+      return &*I;
+  assert(0 && "Failed to find carry instr");
+  return nullptr;
+}
+
 // Add MI and its defs to the lists if MI reads one of the defs that are
 // already in the list. Returns true in that case.
 static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
@@ -281,6 +295,10 @@
         ((Use.readsReg() && RegDefs.count(Use.getReg())) ||
          (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
           PhysRegUses.count(Use.getReg())))) {
+      // If this MI depends on SCC, find and add defining instr.
+      MachineInstr *Prev = addSCCDependInstr(MI);
+      if (Prev)
+        Insts.push_back(&*Prev);
       Insts.push_back(&MI);
       addDefsUsesToList(MI, RegDefs, PhysRegUses);
       return true;
Index: test/CodeGen/AMDGPU/scc-add-lshl-addc.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/scc-add-lshl-addc.ll
@@ -0,0 +1,64 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 %s -o - | FileCheck -check-prefix=CHECK %s
+
+; CHECK: s_add_u32
+; CHECK: s_addc_u32
+; CHECK: s_add_u32
+; CHECK: s_addc_u32
+; CHECK: s_add_u32
+; CHECK-NOT: s_lshl_b32
+; CHECK: s_addc_u32
+; CHECK: global_load_dword
+
+%0 = type { [32 x %1], [32 x %1*], i32, [32 x i32], i32, [8 x i8] }
+%1 = type { %2, [1024 x %3], [1024 x %3*], %10, [1024 x i32], [1024 x i64], [1024 x i64], [1024 x i64], [1024 x i64] }
+%2 = type { %3, %6, i64, [8 x i8], [64 x %7], [1 x %9] }
+%3 = type { %4, %5, %3* }
+%4 = type { i64, i64, i64, i64, i32 }
+%5 = type { i8, i8, i16, i16, i16, i16, i64 }
+%6 = type { %3 }
+%7 = type { %8*, %8*, i8*, i8*, [16384 x i8] }
+%8 = type { %8*, %8*, i8*, i8*, [0 x i8] }
+%9 = type { %8*, %8*, i8*, i8*, [256 x i8] }
+%10 = type { [1024 x i16] }
+%11 = type <{ [20 x i8*], i8**, i32, [4 x i8] }>
+
+@omptarget_nvptx_device_State = external addrspace(1) externally_initialized global [64 x %0], align 16
+@usedSlotIdx = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+@execution_param = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4
+@omptarget_nvptx_globalArgs = external addrspace(3) externally_initialized global %11, align 8
+
+define amdgpu_kernel void @__omp_offloading_802_d9e513_main_l28([992 x i32] addrspace(1)* %arg) local_unnamed_addr {
+bb:
+  %tmp = tail call i64 @__ockl_get_local_size()
+  %tmp1 = trunc i64 %tmp to i32
+  br i1 undef, label %bb2, label %bb3
+
+bb2:                                              ; preds = %bb
+  ret void
+
+bb3:                                              ; preds = %bb
+  %tmp4 = load i32, i32 addrspace(3)* @execution_param, align 4
+  %tmp5 = and i32 %tmp4, 1
+  %tmp6 = icmp eq i32 %tmp5, 0
+  %tmp7 = select i1 %tmp6, i32 0, i32 %tmp1
+  %tmp8 = trunc i32 %tmp7 to i16
+  store i16 %tmp8, i16* undef, align 2
+  %tmp9 = getelementptr inbounds %1, %1* null, i64 0, i32 0, i32 4, i64 0, i32 3
+  store i8* undef, i8** %tmp9, align 8
+  store i8** getelementptr (%11, %11* addrspacecast (%11 addrspace(3)* @omptarget_nvptx_globalArgs to %11*), i64 0, i32 0, i64 0), i8** addrspace(3)* getelementptr inbounds (%11, %11 addrspace(3)* @omptarget_nvptx_globalArgs, i32 0, i32 1), align 8
+  %tmp10 = tail call i32 @llvm.amdgcn.workgroup.id.x()
+  %tmp11 = sext i32 %tmp10 to i64
+  %tmp12 = getelementptr inbounds [992 x i32], [992 x i32] addrspace(1)* %arg, i64 0, i64 %tmp11
+  %tmp13 = load i32, i32 addrspace(1)* %tmp12, align 4
+  %tmp14 = add nsw i32 %tmp13, %tmp10
+  store i32 %tmp14, i32 addrspace(1)* %tmp12, align 4
+  %tmp15 = load i32, i32 addrspace(3)* @usedSlotIdx, align 4
+  %tmp16 = sext i32 %tmp15 to i64
+  %tmp17 = getelementptr inbounds [64 x %0], [64 x %0] addrspace(1)* @omptarget_nvptx_device_State, i64 0, i64 %tmp16, i32 3, i64 undef
+  %tmp18 = addrspacecast i32 addrspace(1)* %tmp17 to i32*
+  %tmp19 = atomicrmw volatile add i32* %tmp18, i32 0 seq_cst
+  unreachable
+}
+
+declare i64 @__ockl_get_local_size() local_unnamed_addr
+declare i32 @llvm.amdgcn.workgroup.id.x()