Index: include/llvm/IR/IntrinsicsAMDGPU.td =================================================================== --- include/llvm/IR/IntrinsicsAMDGPU.td +++ include/llvm/IR/IntrinsicsAMDGPU.td @@ -131,4 +131,19 @@ GCCBuiltin<"__builtin_amdgcn_dispatch_ptr">, Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [], [IntrNoMem]>; +// __builtin_amdgcn_v_interp_p1 <i>, <attr_chan>, <attr>, <m0> +def int_amdgcn_v_interp_p1 : + GCCBuiltin<"__builtin_amdgcn_v_interp_p1">, + Intrinsic<[llvm_float_ty], + [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; // This intrinsic reads from lds, but the memory + // values are constant, so it behaves like IntrNoMem. + +// __builtin_amdgcn_v_interp_p2 <p1>, <j>, <attr_chan>, <attr>, <m0> +def int_amdgcn_v_interp_p2 : + GCCBuiltin<"__builtin_amdgcn_v_interp_p2">, + Intrinsic<[llvm_float_ty], + [llvm_float_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], + [IntrNoMem]>; // See int_amdgcn_v_interp_p1 for why this is + // IntrNoMem. } Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -53,6 +53,10 @@ void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; +FunctionPass *createAMDGPUIntrinsicExpanderPass(); +void initializeAMDGPUIntrinsicExpanderPass(PassRegistry &); +extern char &AMDGPUIntrinsicExpanderID; + void initializeSIFoldOperandsPass(PassRegistry &); extern char &SIFoldOperandsID; Index: lib/Target/AMDGPU/AMDGPUIntrinsicExpander.cpp =================================================================== --- /dev/null +++ lib/Target/AMDGPU/AMDGPUIntrinsicExpander.cpp @@ -0,0 +1,111 @@ +//===-- AMDGPUIntrinsicExpander.cpp ---------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file This pass expands high level intrinsics into lower-level operations. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "amdgpu-intrinsic-expander" + +using namespace llvm; + +namespace { + +class AMDGPUIntrinsicExpander : public FunctionPass, + public InstVisitor<AMDGPUIntrinsicExpander> { +private: + Module *Mod; + + void expandSIFsInterp(IntrinsicInst &I); + +public: + static char ID; + + AMDGPUIntrinsicExpander() : FunctionPass(ID) { } + + bool doInitialization(Module &M) override; + bool runOnFunction(Function &F) override; + const char *getPassName() const override { + return "AMDGPU Expand Intrinsics"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesAll(); + FunctionPass::getAnalysisUsage(AU); + } + + void visitCallInst(CallInst &I); +}; + +} + +char AMDGPUIntrinsicExpander::ID = 0; + +char &llvm::AMDGPUIntrinsicExpanderID = AMDGPUIntrinsicExpander::ID; + + +INITIALIZE_PASS_BEGIN(AMDGPUIntrinsicExpander, DEBUG_TYPE, + "AMDGPU expand intrinsics", false, false) +INITIALIZE_PASS_END(AMDGPUIntrinsicExpander, DEBUG_TYPE, + "AMDGPU expand intrinsics", false, false) + +bool AMDGPUIntrinsicExpander::doInitialization(Module &M) { + Mod = &M; + return false; +} + +void AMDGPUIntrinsicExpander::expandSIFsInterp(IntrinsicInst &I) { + IRBuilder<> Builder(&I); + + Function *InterpP1 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_v_interp_p1); + Function *InterpP2 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_v_interp_p2); + + Value *AttrChan = I.getOperand(0); + Value *Attr = I.getOperand(1); + Value *M0 = I.getOperand(2); + Value *IJCoords = I.getOperand(3); + + Value *ICoord = Builder.CreateExtractElement(IJCoords, (uint64_t)0, ""); + Value 
*JCoord = Builder.CreateExtractElement(IJCoords, 1, ""); + + Value *InterpP1Args[] = {ICoord, AttrChan, Attr, M0}; + Value *P1 = Builder.CreateCall(InterpP1, InterpP1Args); + + Value *InterpP2Args[] = {P1, JCoord, AttrChan, Attr, M0}; + Value *P2 = Builder.CreateCall(InterpP2, InterpP2Args); + + I.replaceAllUsesWith(P2); + I.eraseFromParent(); +} + +void AMDGPUIntrinsicExpander::visitCallInst(CallInst &I) { + IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(&I); + if (!Intr) + return; + + if (Intr->getCalledFunction()->getName() == "llvm.SI.fs.interp") + expandSIFsInterp(*Intr); +} + +bool AMDGPUIntrinsicExpander::runOnFunction(Function &F) { + bool Changed = false; + visit(F); + + return Changed; +} + +FunctionPass *llvm::createAMDGPUIntrinsicExpanderPass() { + return new AMDGPUIntrinsicExpander(); +} Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -51,6 +51,7 @@ initializeSIFixControlFlowLiveIntervalsPass(*PR); initializeSILoadStoreOptimizerPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); + initializeAMDGPUIntrinsicExpanderPass(*PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -272,6 +273,8 @@ bool GCNPassConfig::addPreISel() { AMDGPUPassConfig::addPreISel(); + addPass(createAMDGPUIntrinsicExpanderPass()); + // FIXME: We need to run a pass to propagate the attributes when calls are // supported. 
addPass(&AMDGPUAnnotateKernelFeaturesID); Index: lib/Target/AMDGPU/CMakeLists.txt =================================================================== --- lib/Target/AMDGPU/CMakeLists.txt +++ lib/Target/AMDGPU/CMakeLists.txt @@ -20,6 +20,7 @@ AMDGPUDiagnosticInfoUnsupported.cpp AMDGPUFrameLowering.cpp AMDGPUHSATargetObjectFile.cpp + AMDGPUIntrinsicExpander.cpp AMDGPUIntrinsicInfo.cpp AMDGPUISelDAGToDAG.cpp AMDGPUMCInstLower.cpp Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -1264,20 +1264,18 @@ if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef()) return DAG.getUNDEF(MVT::i32); return Op; - case AMDGPUIntrinsic::SI_fs_interp: { - SDValue IJ = Op.getOperand(4); - SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(0, DL, MVT::i32)); - SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ, - DAG.getConstant(1, DL, MVT::i32)); - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3)); + case Intrinsic::amdgcn_v_interp_p1: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); SDValue Glue = M0.getValue(1); - SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL, - DAG.getVTList(MVT::f32, MVT::Glue), - I, Op.getOperand(1), Op.getOperand(2), Glue); - Glue = SDValue(P1.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J, - Op.getOperand(1), Op.getOperand(2), Glue); + return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Glue); + } + case Intrinsic::amdgcn_v_interp_p2: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = SDValue(M0.getNode(), 1); + return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), + Glue); } default: return 
AMDGPUTargetLowering::LowerOperation(Op, DAG); Index: lib/Target/AMDGPU/SIIntrinsics.td =================================================================== --- lib/Target/AMDGPU/SIIntrinsics.td +++ lib/Target/AMDGPU/SIIntrinsics.td @@ -185,7 +185,6 @@ /* Interpolation Intrinsics */ def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; - def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>; /* Control flow Intrinsics */ Index: test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll +++ test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll @@ -18,6 +18,25 @@ ret void } +;GCN-LABEL: {{^}}v_interp: +;GCN-NOT: s_wqm +;GCN: s_mov_b32 m0, s{{[0-9]+}} +;GCN-NEXT: v_interp_mov_f32 +;GCN: v_interp_p1_f32 +;GCN: v_interp_p2_f32 +define void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { +main_body: + %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) + %i = extractelement <2 x i32> %4, i32 0 + %j = extractelement <2 x i32> %4, i32 1 + %p0_0 = call float @llvm.amdgcn.v.interp.p1(i32 %i, i32 0, i32 0, i32 %3) + %p1_0 = call float @llvm.amdgcn.v.interp.p2(float %p0_0, i32 %j, i32 0, i32 0, i32 %3) + %p0_1 = call float @llvm.amdgcn.v.interp.p1(i32 %i, i32 1, i32 0, i32 %3) + %p1_1 = call float @llvm.amdgcn.v.interp.p2(float %p0_1, i32 %j, i32 1, i32 0, i32 %3) + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %p0_0, float %p1_1, float %p1_1) + ret void +} + ; Test that v_interp_p1 uses different source and destination registers ; on 16 bank LDS chips. 
@@ -52,6 +71,12 @@ ; Function Attrs: nounwind readnone declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 +; Function Attrs: nounwind readnone +declare float @llvm.amdgcn.v.interp.p1(i32, i32, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.amdgcn.v.interp.p2(float, i32, i32, i32, i32) #1 + declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { "ShaderType"="0" }