Index: lib/Target/AMDGPU/SILoadStoreOptimizer.cpp =================================================================== --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -263,6 +263,23 @@ return !(A->mayStore() || B->mayStore()) || !A->mayAlias(AA, *B, true); } +// Find the instruction that defines SCC for MI. Returns nullptr if MI has no +// implicit use of SCC; otherwise returns the nearest instruction before MI in +// its basic block that defines SCC. +static MachineInstr *addSCCDependInstr(MachineInstr &MI) { + if (!MI.hasRegisterImplicitUseOperand(AMDGPU::SCC)) + return nullptr; + + // Scan backwards, starting at the instruction just before MI. + MachineBasicBlock::reverse_iterator I = MI, E = MI.getParent()->rend(); + for (++I; I != E; ++I) + if (I->definesRegister(AMDGPU::SCC)) + return &*I; + // An implicit SCC use with no SCC def earlier in the block is not expected. + llvm_unreachable("Failed to find carry instr"); +} + // Add MI and its defs to the lists if MI reads one of the defs that are // already in the list. Returns true in that case. static bool addToListsIfDependent(MachineInstr &MI, DenseSet &RegDefs, @@ -281,6 +298,10 @@ ((Use.readsReg() && RegDefs.count(Use.getReg())) || (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) && PhysRegUses.count(Use.getReg())))) { + // If this MI depends on SCC, find and add defining instr. 
+ MachineInstr *Prev = addSCCDependInstr(MI); + if (Prev) + Insts.push_back(Prev); Insts.push_back(&MI); addDefsUsesToList(MI, RegDefs, PhysRegUses); return true; Index: test/CodeGen/AMDGPU/scc-add-lshl-addc.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/scc-add-lshl-addc.ll @@ -0,0 +1,100 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-code-object-v3 -disable-promote-alloca-to-lds -mcpu=gfx900 -amdgpu-dump-hsa-metadata %s -o - | FileCheck -check-prefix=CHECK %s + +; Verify that no s_lshl_b32 appears between the third s_add_u32 and the next s_addc_u32. +; CHECK: s_add_u32 +; CHECK: s_addc_u32 +; CHECK: s_add_u32 +; CHECK: s_addc_u32 +; CHECK: s_add_u32 +; CHECK-NOT: s_lshl_b32 +; CHECK: s_addc_u32 +; CHECK: global_load_dword + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +target triple = "amdgcn-amd-amdhsa" + +%class.omptarget_nvptx_Queue.11.45.62.79.96.113.147.181.198.215.232.759.827.1031.1098 = type { [32 x %class.omptarget_nvptx_ThreadPrivateContext.10.44.61.78.95.112.146.180.197.214.231.758.826.1030.1097], [32 x %class.omptarget_nvptx_ThreadPrivateContext.10.44.61.78.95.112.146.180.197.214.231.758.826.1030.1097*], i32, [32 x i32], i32, [8 x i8] } +%class.omptarget_nvptx_ThreadPrivateContext.10.44.61.78.95.112.146.180.197.214.231.758.826.1030.1097 = type { %class.omptarget_nvptx_TeamDescr.8.42.59.76.93.110.144.178.195.212.229.756.824.1028.1095, [1024 x %class.omptarget_nvptx_TaskDescr.3.37.54.71.88.105.139.173.190.207.224.751.819.1023.1090], [1024 x %class.omptarget_nvptx_TaskDescr.3.37.54.71.88.105.139.173.190.207.224.751.819.1023.1090*], %union.anon.9.43.60.77.94.111.145.179.196.213.230.757.825.1029.1096, [1024 x i32], [1024 x i64], [1024 x i64], [1024 x i64], [1024 x i64] } +%class.omptarget_nvptx_TeamDescr.8.42.59.76.93.110.144.178.195.212.229.756.824.1028.1095 = type { 
%class.omptarget_nvptx_TaskDescr.3.37.54.71.88.105.139.173.190.207.224.751.819.1023.1090, %class.omptarget_nvptx_WorkDescr.4.38.55.72.89.106.140.174.191.208.225.752.820.1024.1091, i64, [8 x i8], [64 x %struct.__kmpc_data_sharing_worker_slot_static.6.40.57.74.91.108.142.176.193.210.227.754.822.1026.1093], [1 x %struct.__kmpc_data_sharing_master_slot_static.7.41.58.75.92.109.143.177.194.211.228.755.823.1027.1094] } +%class.omptarget_nvptx_TaskDescr.3.37.54.71.88.105.139.173.190.207.224.751.819.1023.1090 = type { %"struct.omptarget_nvptx_TaskDescr::SavedLoopDescr_items.1.35.52.69.86.103.137.171.188.205.222.749.817.1021.1088", %"struct.omptarget_nvptx_TaskDescr::TaskDescr_items.2.36.53.70.87.104.138.172.189.206.223.750.818.1022.1089", %class.omptarget_nvptx_TaskDescr.3.37.54.71.88.105.139.173.190.207.224.751.819.1023.1090* } +%"struct.omptarget_nvptx_TaskDescr::SavedLoopDescr_items.1.35.52.69.86.103.137.171.188.205.222.749.817.1021.1088" = type { i64, i64, i64, i64, i32 } +%"struct.omptarget_nvptx_TaskDescr::TaskDescr_items.2.36.53.70.87.104.138.172.189.206.223.750.818.1022.1089" = type { i8, i8, i16, i16, i16, i16, i64 } +%class.omptarget_nvptx_WorkDescr.4.38.55.72.89.106.140.174.191.208.225.752.820.1024.1091 = type { %class.omptarget_nvptx_TaskDescr.3.37.54.71.88.105.139.173.190.207.224.751.819.1023.1090 } +%struct.__kmpc_data_sharing_worker_slot_static.6.40.57.74.91.108.142.176.193.210.227.754.822.1026.1093 = type { %struct.__kmpc_data_sharing_slot.5.39.56.73.90.107.141.175.192.209.226.753.821.1025.1092*, %struct.__kmpc_data_sharing_slot.5.39.56.73.90.107.141.175.192.209.226.753.821.1025.1092*, i8*, i8*, [16384 x i8] } +%struct.__kmpc_data_sharing_slot.5.39.56.73.90.107.141.175.192.209.226.753.821.1025.1092 = type { %struct.__kmpc_data_sharing_slot.5.39.56.73.90.107.141.175.192.209.226.753.821.1025.1092*, %struct.__kmpc_data_sharing_slot.5.39.56.73.90.107.141.175.192.209.226.753.821.1025.1092*, i8*, i8*, [0 x i8] } 
+%struct.__kmpc_data_sharing_master_slot_static.7.41.58.75.92.109.143.177.194.211.228.755.823.1027.1094 = type { %struct.__kmpc_data_sharing_slot.5.39.56.73.90.107.141.175.192.209.226.753.821.1025.1092*, %struct.__kmpc_data_sharing_slot.5.39.56.73.90.107.141.175.192.209.226.753.821.1025.1092*, i8*, i8*, [256 x i8] } +%union.anon.9.43.60.77.94.111.145.179.196.213.230.757.825.1029.1096 = type { [1024 x i16] } +%class.omptarget_nvptx_SharedArgs.16.50.67.84.101.118.152.186.203.220.237.764.832.1036.1099 = type <{ [20 x i8*], i8**, i32, [4 x i8] }> + +@omptarget_nvptx_device_State = external addrspace(1) externally_initialized global [64 x %class.omptarget_nvptx_Queue.11.45.62.79.96.113.147.181.198.215.232.759.827.1031.1098], align 16 +@usedSlotIdx = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4 +@execution_param = external local_unnamed_addr addrspace(3) externally_initialized global i32, align 4 +@omptarget_nvptx_globalArgs = external addrspace(3) externally_initialized global %class.omptarget_nvptx_SharedArgs.16.50.67.84.101.118.152.186.203.220.237.764.832.1036.1099, align 8 + +define amdgpu_kernel void @__omp_offloading_802_d9e513_main_l28([992 x i32] addrspace(1)* %A) local_unnamed_addr #0 { +entry: + %0 = tail call i64 @__ockl_get_local_size() #3 + %1 = trunc i64 %0 to i32 + br i1 undef, label %.worker, label %.mastercheck + +.worker: ; preds = %entry + ret void + +.mastercheck: ; preds = %entry + %2 = load i32, i32 addrspace(3)* @execution_param, align 4, !tbaa !1 + %and.i.i.i.i.i.i = and i32 %2, 1 + %cmp.i.i.i.i.i.i = icmp eq i32 %and.i.i.i.i.i.i, 0 + %storemerge.i.i.i.i.i = select i1 %cmp.i.i.i.i.i.i, i32 0, i32 %1 + %conv.i.i.i = trunc i32 %storemerge.i.i.i.i.i to i16 + store i16 %conv.i.i.i, i16* undef, align 2, !tbaa !5 + %DataEnd.i.i.i = getelementptr inbounds %class.omptarget_nvptx_ThreadPrivateContext.10.44.61.78.95.112.146.180.197.214.231.758.826.1030.1097, 
%class.omptarget_nvptx_ThreadPrivateContext.10.44.61.78.95.112.146.180.197.214.231.758.826.1030.1097* null, i64 0, i32 0, i32 4, i64 0, i32 3 + store i8* undef, i8** %DataEnd.i.i.i, align 8, !tbaa !13 + store i8** getelementptr (%class.omptarget_nvptx_SharedArgs.16.50.67.84.101.118.152.186.203.220.237.764.832.1036.1099, %class.omptarget_nvptx_SharedArgs.16.50.67.84.101.118.152.186.203.220.237.764.832.1036.1099* addrspacecast (%class.omptarget_nvptx_SharedArgs.16.50.67.84.101.118.152.186.203.220.237.764.832.1036.1099 addrspace(3)* @omptarget_nvptx_globalArgs to %class.omptarget_nvptx_SharedArgs.16.50.67.84.101.118.152.186.203.220.237.764.832.1036.1099*), i64 0, i32 0, i64 0), i8** addrspace(3)* getelementptr inbounds (%class.omptarget_nvptx_SharedArgs.16.50.67.84.101.118.152.186.203.220.237.764.832.1036.1099, %class.omptarget_nvptx_SharedArgs.16.50.67.84.101.118.152.186.203.220.237.764.832.1036.1099 addrspace(3)* @omptarget_nvptx_globalArgs, i32 0, i32 1), align 8, !tbaa !15 + %3 = tail call i32 @llvm.amdgcn.workgroup.id.x() #4 + %idxprom.i1 = sext i32 %3 to i64 + %arrayidx.i22 = getelementptr inbounds [992 x i32], [992 x i32] addrspace(1)* %A, i64 0, i64 %idxprom.i1 + %4 = load i32, i32 addrspace(1)* %arrayidx.i22, align 4, !tbaa !17 + %add.i = add nsw i32 %4, %3 + store i32 %add.i, i32 addrspace(1)* %arrayidx.i22, align 4, !tbaa !17 + %5 = load i32, i32 addrspace(3)* @usedSlotIdx, align 4, !tbaa !1 + %idxprom.i = sext i32 %5 to i64 + %arrayidx.i6.i.i = getelementptr inbounds [64 x %class.omptarget_nvptx_Queue.11.45.62.79.96.113.147.181.198.215.232.759.827.1031.1098], [64 x %class.omptarget_nvptx_Queue.11.45.62.79.96.113.147.181.198.215.232.759.827.1031.1098] addrspace(1)* @omptarget_nvptx_device_State, i64 0, i64 %idxprom.i, i32 3, i64 undef + %6 = addrspacecast i32 addrspace(1)* %arrayidx.i6.i.i to i32* + %7 = atomicrmw volatile add i32* %6, i32 0 seq_cst + unreachable +} + +declare i64 @__ockl_get_local_size() local_unnamed_addr #1 + +; Function Attrs: nounwind 
readnone speculatable +declare i32 @llvm.amdgcn.workgroup.id.x() #2 + +attributes #0 = { "use-soft-float"="false" } +attributes #1 = { "target-cpu"="gfx900" } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { alwaysinline nounwind readnone } +attributes #4 = { nounwind } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 8.0.0 "} +!1 = !{!2, !2, i64 0} +!2 = !{!"int", !3, i64 0} +!3 = !{!"omnipotent char", !4, i64 0} +!4 = !{!"Simple C++ TBAA"} +!5 = !{!6, !11, i64 42} +!6 = !{!"_ZTS25omptarget_nvptx_TaskDescr", !7, i64 0, !10, i64 40, !12, i64 64} +!7 = !{!"_ZTSN25omptarget_nvptx_TaskDescr20SavedLoopDescr_itemsE", !8, i64 0, !8, i64 8, !8, i64 16, !8, i64 24, !9, i64 32} +!8 = !{!"long", !3, i64 0} +!9 = !{!"_ZTS11kmp_sched_t", !3, i64 0} +!10 = !{!"_ZTSN25omptarget_nvptx_TaskDescr15TaskDescr_itemsE", !3, i64 0, !3, i64 1, !11, i64 2, !11, i64 4, !11, i64 6, !11, i64 8, !8, i64 16} +!11 = !{!"short", !3, i64 0} +!12 = !{!"any pointer", !3, i64 0} +!13 = !{!14, !12, i64 24} +!14 = !{!"_ZTS38__kmpc_data_sharing_worker_slot_static", !12, i64 0, !12, i64 8, !12, i64 16, !12, i64 24, !3, i64 32} +!15 = !{!16, !12, i64 160} +!16 = !{!"_ZTS26omptarget_nvptx_SharedArgs", !3, i64 0, !12, i64 160, !2, i64 168} +!17 = !{!18, !18, i64 0} +!18 = !{!"int", !19, i64 0} +!19 = !{!"omnipotent char", !20, i64 0} +!20 = !{!"Simple C/C++ TBAA"}