Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -70,7 +70,7 @@
   friend BaseT;
 
   const GCNSubtarget *ST;
-  const AMDGPUTargetLowering *TLI;
+  const SITargetLowering *TLI;
   AMDGPUTTIImpl CommonTTI;
   bool IsGraphicsShader;
   bool HasFP32Denormals;
@@ -183,6 +183,9 @@
 
   unsigned getCFInstrCost(unsigned Opcode);
 
+  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
+                                     ArrayRef<unsigned> Indices = {}) const;
+
   int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
   bool isSourceOfDivergence(const Value *V) const;
   bool isAlwaysUniform(const Value *V) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -578,8 +578,6 @@
   }
 }
 
-
-
 static bool isArgPassedInSGPR(const Argument *A) {
   const Function *F = A->getParent();
 
@@ -606,6 +604,54 @@
   }
 }
 
+/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
+/// this is analyzing the collective result of all output registers. Otherwise,
+/// this is only querying a specific result index if this returns multiple
+/// registers in a struct.
+bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
+  const CallInst *CI, ArrayRef<unsigned> Indices) const {
+  // TODO: Handle complex extract indices
+  if (Indices.size() > 1)
+    return true;
+
+  const DataLayout &DL = CI->getModule()->getDataLayout();
+  const SIRegisterInfo *TRI = ST->getRegisterInfo();
+  ImmutableCallSite CS(CI);
+  TargetLowering::AsmOperandInfoVector TargetConstraints
+    = TLI->ParseConstraints(DL, ST->getRegisterInfo(), CS);
+
+  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
+
+  int OutputIdx = 0;
+  for (auto &TC : TargetConstraints) {
+    if (TC.Type != InlineAsm::isOutput)
+      continue;
+
+    // Skip outputs we don't care about.
+    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
+      continue;
+
+    TLI->ComputeConstraintToUse(TC, SDValue());
+
+    Register AssignedReg;
+    const TargetRegisterClass *RC;
+    std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
+      TRI, TC.ConstraintCode, TC.ConstraintVT);
+    if (AssignedReg) {
+      // FIXME: This is a workaround for getRegForInlineAsmConstraint
+      // returning VS_32
+      RC = TRI->getPhysRegClass(AssignedReg);
+    }
+
+    // For AGPR constraints null is returned on subtargets without AGPRs, so
+    // assume divergent for null.
+    if (!RC || !TRI->isSGPRClass(RC))
+      return true;
+  }
+
+  return false;
+}
+
 /// \returns true if the new GPU divergence analysis is enabled.
 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
   return !UseLegacyDA;
@@ -638,7 +684,23 @@
     return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
 
   // Assume all function calls are a source of divergence.
-  if (isa<CallInst>(V) || isa<InvokeInst>(V))
+  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+    if (isa<InlineAsm>(CI->getCalledValue()))
+      return isInlineAsmSourceOfDivergence(CI);
+    return true;
+  }
+
+  if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+    Value *ExtSrc = ExtValue->getOperand(0);
+    if (const CallInst *CI = dyn_cast<CallInst>(ExtSrc)) {
+      if (isa<InlineAsm>(CI->getCalledValue()))
+        return isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
+      return true;
+    }
+  }
+
+  // Assume all function calls are a source of divergence.
+  if (isa<InvokeInst>(V))
     return true;
 
   return false;
@@ -656,6 +718,19 @@
       return true;
     }
   }
+
+  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
+  if (!ExtValue)
+    return false;
+
+  if (const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0))) {
+    // If we have inline asm returning mixed SGPR and VGPR results, we inferred
+    // divergent for the overall struct return. We need to override it in the
+    // case we're extracting an SGPR component here.
+    if (isa<InlineAsm>(CI->getCalledValue()))
+      return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
+  }
+
   return false;
 }
 
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10585,6 +10585,8 @@
       return std::make_pair(RC->getRegister(Idx), RC);
     }
   }
+
+  // FIXME: Returns VS_32 for physical SGPR constraints
   return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
 }
 
Index: llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/DivergenceAnalysis/AMDGPU/inline-asm.ll
@@ -0,0 +1,83 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=tahiti -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx908 -analyze -divergence -use-gpu-divergence-analysis %s | FileCheck %s
+; Make sure nothing crashes on targets with or without AGPRs
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_sgpr_virtreg_output':
+; CHECK-NOT: DIVERGENT
+define i32 @inline_asm_1_sgpr_virtreg_output() {
+  %sgpr = call i32 asm "s_mov_b32 $0, 0", "=s"()
+  ret i32 %sgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_sgpr_physreg_output':
+; CHECK-NOT: DIVERGENT
+define i32 @inline_asm_1_sgpr_physreg_output() {
+  %sgpr = call i32 asm "s_mov_b32 s0, 0", "={s0}"()
+  ret i32 %sgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_vgpr_virtreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
+define i32 @inline_asm_1_vgpr_virtreg_output() {
+  %vgpr = call i32 asm "v_mov_b32 $0, 0", "=v"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_vgpr_physreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
+define i32 @inline_asm_1_vgpr_physreg_output() {
+  %vgpr = call i32 asm "v_mov_b32 v0, 0", "={v0}"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_agpr_virtreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "; def $0", "=a"()
+define i32 @inline_asm_1_agpr_virtreg_output() {
+  %vgpr = call i32 asm "; def $0", "=a"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_1_agpr_physreg_output':
+; CHECK: DIVERGENT: %vgpr = call i32 asm "; def a0", "={a0}"()
+define i32 @inline_asm_1_agpr_physreg_output() {
+  %vgpr = call i32 asm "; def a0", "={a0}"()
+  ret i32 %vgpr
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_2_sgpr_virtreg_output':
+; CHECK-NOT: DIVERGENT
+define void @inline_asm_2_sgpr_virtreg_output() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=s,=s"()
+  %sgpr0 = extractvalue { i32, i32 } %asm, 0
+  %sgpr1 = extractvalue { i32, i32 } %asm, 1
+  store i32 %sgpr0, i32 addrspace(1)* undef
+  store i32 %sgpr1, i32 addrspace(1)* undef
+  ret void
+}
+
+; One output is SGPR, one is VGPR. Infer divergent for the aggregate, but uniform on the SGPR extract
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_sgpr_vgpr_virtreg_output':
+; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
+; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 0
+; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 1
+define void @inline_asm_sgpr_vgpr_virtreg_output() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=s,=v"()
+  %sgpr = extractvalue { i32, i32 } %asm, 0
+  %vgpr = extractvalue { i32, i32 } %asm, 1
+  store i32 %sgpr, i32 addrspace(1)* undef
+  store i32 %vgpr, i32 addrspace(1)* undef
+  ret void
+}
+
+; CHECK: Printing analysis 'Legacy Divergence Analysis' for function 'inline_asm_vgpr_sgpr_virtreg_output':
+; CHECK: DIVERGENT: %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
+; CHECK-NEXT: DIVERGENT: %vgpr = extractvalue { i32, i32 } %asm, 0
+; CHECK-NEXT: {{^[ \t]+}}%sgpr = extractvalue { i32, i32 } %asm, 1
+define void @inline_asm_vgpr_sgpr_virtreg_output() {
+  %asm = call { i32, i32 } asm "; def $0, $1", "=v,=s"()
+  %vgpr = extractvalue { i32, i32 } %asm, 0
+  %sgpr = extractvalue { i32, i32 } %asm, 1
+  store i32 %vgpr, i32 addrspace(1)* undef
+  store i32 %sgpr, i32 addrspace(1)* undef
+  ret void
+}
Index: llvm/test/CodeGen/AMDGPU/inline-asm.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/inline-asm.ll
+++ llvm/test/CodeGen/AMDGPU/inline-asm.ll
@@ -21,11 +21,30 @@
 }
 
-; CHECK: {{^}}branch_on_asm:
-; Make sure inline assembly is treted as divergent.
-; CHECK: s_mov_b32 s{{[0-9]+}}, 0
+; CHECK-LABEL: {{^}}branch_on_asm_vgpr:
+; Make sure VGPR inline assembly is treated as divergent.
+; CHECK: v_mov_b32 v{{[0-9]+}}, 0
+; CHECK: v_cmp_eq_u32
 ; CHECK: s_and_saveexec_b64
-define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @branch_on_asm_vgpr(i32 addrspace(1)* %out) {
+  %zero = call i32 asm "v_mov_b32 $0, 0", "=v"()
+  %cmp = icmp eq i32 %zero, 0
+  br i1 %cmp, label %if, label %endif
+
+if:
+  store i32 0, i32 addrspace(1)* %out
+  br label %endif
+
+endif:
+  ret void
+}
+
+; CHECK-LABEL: {{^}}branch_on_asm_sgpr:
+; Make sure SGPR inline assembly is treated as uniform
+; CHECK: s_mov_b32 s{{[0-9]+}}, 0
+; CHECK: s_cmp_lg_u32
+; CHECK: s_cbranch_scc0
+define amdgpu_kernel void @branch_on_asm_sgpr(i32 addrspace(1)* %out) {
   %zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
   %cmp = icmp eq i32 %zero, 0
   br i1 %cmp, label %if, label %endif