diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
--- a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -2357,11 +2357,13 @@
   // Temporarily match the pattern generated by clang for teams regions.
   // TODO: Remove this once the new runtime is in place.
-  ConstantInt *One, *NegOne;
+  ConstantInt *One, *NegOne, *NegSixtyFour;
+  Value *NumThreads;
   CmpInst::Predicate Pred;
   auto &&m_ThreadID = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_tid_x>();
   auto &&m_WarpSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_warpsize>();
   auto &&m_BlockSize = m_Intrinsic<Intrinsic::nvvm_read_ptx_sreg_ntid_x>();
+  auto &&m_AMDThreadID = m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>();
   if (match(Cmp, m_Cmp(Pred, m_ThreadID,
                        m_And(m_Sub(m_BlockSize, m_ConstantInt(One)),
                              m_Xor(m_Sub(m_WarpSize, m_ConstantInt(One)),
                                    m_ConstantInt(NegOne))))))
@@ -2370,6 +2372,17 @@
         Pred == CmpInst::Predicate::ICMP_EQ)
       return true;
 
+  // Match the same pattern for AMDGPU.
+  if (match(Cmp, m_Cmp(Pred, m_AMDThreadID,
+                       m_And(m_Sub(m_Value(NumThreads), m_ConstantInt(One)),
+                             m_ConstantInt(NegSixtyFour)))))
+    if (One->isOne() && (NegSixtyFour->getSExtValue() == -64) &&
+        Pred == CmpInst::Predicate::ICMP_EQ)
+      if (isa<CallBase>(NumThreads) &&
+          dyn_cast<CallBase>(NumThreads)->getCalledFunction()->getName() ==
+              "__kmpc_amdgcn_gpu_num_threads")
+        return true;
+
   ConstantInt *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
   if (!C || !C->isZero())
     return false;
diff --git a/llvm/test/Transforms/OpenMP/replace_globalization.ll b/llvm/test/Transforms/OpenMP/replace_globalization.ll
--- a/llvm/test/Transforms/OpenMP/replace_globalization.ll
+++ b/llvm/test/Transforms/OpenMP/replace_globalization.ll
@@ -7,8 +7,10 @@
 ; CHECK-REMARKS: remark: replace_globalization.c:5:7: Replaced globalized variable with 16 bytes of shared memory
 ; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
+; CHECK-REMARKS: remark: replace_globalization.c:5:14: Replaced globalized variable with 4 bytes of shared memory
 ; CHECK: [[SHARED_X:@.+]] = internal addrspace(3) global [16 x i8] undef
 ; CHECK: [[SHARED_Y:@.+]] = internal addrspace(3) global [4 x i8] undef
+; CHECK: [[SHARED_Z:@.+]] = internal addrspace(3) global [4 x i8] undef
 ; CHECK: %{{.*}} = call i8* @__kmpc_alloc_shared({{.*}})
 ; CHECK: call void @__kmpc_free_shared({{.*}})
 
@@ -24,7 +26,8 @@
 
 define void @bar() {
   call void @baz()
-  call void @qux()
+  call void @nvidia()
+  call void @amd()
   ret void
 }
 
@@ -46,7 +49,7 @@
 }
 
 ; CHECK: %{{.*}} = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* [[SHARED_Y]], i32 0, i32 0) to i8*) to [4 x i32]*
-define internal void @qux() {
+define internal void @nvidia() {
 entry:
   %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   %ntid = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
@@ -68,6 +71,25 @@
   ret void
 }
 
+; CHECK: %{{.*}} = bitcast i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([4 x i8], [4 x i8] addrspace(3)* [[SHARED_Z]], i32 0, i32 0) to i8*) to [4 x i32]*
+define internal void @amd() {
+entry:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %ntid = call i32 @__kmpc_amdgcn_gpu_num_threads()
+  %0 = sub nuw i32 %ntid, 1
+  %master_tid = and i32 %0, -64
+  %1 = icmp eq i32 %tid, %master_tid
+  br i1 %1, label %master, label %exit
+master:
+  %z = call i8* @__kmpc_alloc_shared(i64 4), !dbg !12
+  %z_on_stack = bitcast i8* %z to [4 x i32]*
+  %2 = bitcast [4 x i32]* %z_on_stack to i8*
+  call void @use(i8* %2)
+  call void @__kmpc_free_shared(i8* %z)
+  br label %exit
+exit:
+  ret void
+}
 
 define void @use(i8* %x) {
 entry:
@@ -85,6 +107,9 @@
 
 declare i32 @llvm.nvvm.read.ptx.sreg.warpsize()
 
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+declare i32 @__kmpc_amdgcn_gpu_num_threads()
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5, !6}
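A note on the pattern being matched (reviewer sketch, not part of the patch): on AMDGPU the "master" thread of a teams region is the first lane of the last wavefront, computed as (ntid - 1) & -64 since the wavefront size is 64; the NVPTX pattern above is the same computation with the warp size left symbolic, (ntid - 1) & ~(warpsize - 1). A minimal standalone C++ sketch of that arithmetic, with the hypothetical helper name masterThreadId:

// Sketch only: reproduces the "master thread" arithmetic the matcher
// recognizes; masterThreadId is a hypothetical helper, not an LLVM API.
#include <cstdint>
#include <cstdio>

static uint32_t masterThreadId(uint32_t NumThreads, uint32_t WaveSize) {
  // (NumThreads - 1) & -WaveSize: two's-complement -64 is the mask ~63,
  // so this rounds the last thread id down to a wavefront boundary.
  return (NumThreads - 1) & ~(WaveSize - 1);
}

int main() {
  printf("%u\n", masterThreadId(256, 64)); // AMDGPU: thread 192 is master
  printf("%u\n", masterThreadId(256, 32)); // NVPTX:  thread 224 is master
  return 0;
}

This is the same computation the @amd test function exercises in IR: %0 = sub nuw i32 %ntid, 1 followed by %master_tid = and i32 %0, -64, compared for equality against the workitem id.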