Index: include/llvm/IR/CallingConv.h =================================================================== --- include/llvm/IR/CallingConv.h +++ include/llvm/IR/CallingConv.h @@ -178,6 +178,18 @@ /// which have an "optimized" convention to preserve registers. AVR_BUILTIN = 86, + /// Calling convention used for Mesa vertex shaders. + AMDGPU_VS = 87, + + /// Calling convention used for Mesa geometry shaders. + AMDGPU_GS = 88, + + /// Calling convention used for Mesa pixel shaders. + AMDGPU_PS = 89, + + /// Calling convention used for Mesa compute shaders. + AMDGPU_CS = 90, + /// The highest possible calling convention ID. Must be some 2^k - 1. MaxID = 1023 }; Index: lib/AsmParser/LLLexer.cpp =================================================================== --- lib/AsmParser/LLLexer.cpp +++ lib/AsmParser/LLLexer.cpp @@ -597,6 +597,10 @@ KEYWORD(hhvmcc); KEYWORD(hhvm_ccc); KEYWORD(cxx_fast_tlscc); + KEYWORD(amdgpu_vs); + KEYWORD(amdgpu_gs); + KEYWORD(amdgpu_ps); + KEYWORD(amdgpu_cs); KEYWORD(cc); KEYWORD(c); Index: lib/AsmParser/LLParser.cpp =================================================================== --- lib/AsmParser/LLParser.cpp +++ lib/AsmParser/LLParser.cpp @@ -1573,6 +1573,10 @@ /// ::= 'hhvmcc' /// ::= 'hhvm_ccc' /// ::= 'cxx_fast_tlscc' +/// ::= 'amdgpu_vs' +/// ::= 'amdgpu_gs' +/// ::= 'amdgpu_ps' +/// ::= 'amdgpu_cs' /// ::= 'cc' UINT /// bool LLParser::ParseOptionalCallingConv(unsigned &CC) { @@ -1607,6 +1611,10 @@ case lltok::kw_hhvmcc: CC = CallingConv::HHVM; break; case lltok::kw_hhvm_ccc: CC = CallingConv::HHVM_C; break; case lltok::kw_cxx_fast_tlscc: CC = CallingConv::CXX_FAST_TLS; break; + case lltok::kw_amdgpu_vs: CC = CallingConv::AMDGPU_VS; break; + case lltok::kw_amdgpu_gs: CC = CallingConv::AMDGPU_GS; break; + case lltok::kw_amdgpu_ps: CC = CallingConv::AMDGPU_PS; break; + case lltok::kw_amdgpu_cs: CC = CallingConv::AMDGPU_CS; break; case lltok::kw_cc: { Lex.Lex(); return ParseUInt32(CC); Index: 
lib/AsmParser/LLToken.h =================================================================== --- lib/AsmParser/LLToken.h +++ lib/AsmParser/LLToken.h @@ -104,6 +104,10 @@ kw_x86_intrcc, kw_hhvmcc, kw_hhvm_ccc, kw_cxx_fast_tlscc, + kw_amdgpu_vs, + kw_amdgpu_gs, + kw_amdgpu_ps, + kw_amdgpu_cs, // Attributes: kw_attributes, Index: lib/IR/AsmWriter.cpp =================================================================== --- lib/IR/AsmWriter.cpp +++ lib/IR/AsmWriter.cpp @@ -318,6 +318,10 @@ case CallingConv::X86_INTR: Out << "x86_intrcc"; break; case CallingConv::HHVM: Out << "hhvmcc"; break; case CallingConv::HHVM_C: Out << "hhvm_ccc"; break; + case CallingConv::AMDGPU_VS: Out << "amdgpu_vs"; break; + case CallingConv::AMDGPU_GS: Out << "amdgpu_gs"; break; + case CallingConv::AMDGPU_PS: Out << "amdgpu_ps"; break; + case CallingConv::AMDGPU_CS: Out << "amdgpu_cs"; break; } } Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -123,15 +123,6 @@ } // End namespace llvm -namespace ShaderType { - enum Type { - PIXEL = 0, - VERTEX = 1, - GEOMETRY = 2, - COMPUTE = 3 - }; -} - /// OpenCL uses address spaces to differentiate between /// various memory regions on the hardware. 
On the CPU /// all of the address spaces point to the same memory, Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -301,21 +301,21 @@ unsigned RsrcReg; if (STM.getGeneration() >= AMDGPUSubtarget::EVERGREEN) { // Evergreen / Northern Islands - switch (MFI->getShaderType()) { + switch (MF.getFunction()->getCallingConv()) { default: // Fall through - case ShaderType::COMPUTE: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; - case ShaderType::GEOMETRY: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; - case ShaderType::PIXEL: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; - case ShaderType::VERTEX: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; + case CallingConv::AMDGPU_CS: RsrcReg = R_0288D4_SQ_PGM_RESOURCES_LS; break; + case CallingConv::AMDGPU_GS: RsrcReg = R_028878_SQ_PGM_RESOURCES_GS; break; + case CallingConv::AMDGPU_PS: RsrcReg = R_028844_SQ_PGM_RESOURCES_PS; break; + case CallingConv::AMDGPU_VS: RsrcReg = R_028860_SQ_PGM_RESOURCES_VS; break; } } else { // R600 / R700 - switch (MFI->getShaderType()) { + switch (MF.getFunction()->getCallingConv()) { default: // Fall through - case ShaderType::GEOMETRY: // Fall through - case ShaderType::COMPUTE: // Fall through - case ShaderType::VERTEX: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; - case ShaderType::PIXEL: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; + case CallingConv::AMDGPU_GS: // Fall through + case CallingConv::AMDGPU_CS: // Fall through + case CallingConv::AMDGPU_VS: RsrcReg = R_028868_SQ_PGM_RESOURCES_VS; break; + case CallingConv::AMDGPU_PS: RsrcReg = R_028850_SQ_PGM_RESOURCES_PS; break; } } @@ -325,7 +325,7 @@ OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4); OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4); - if (MFI->getShaderType() == ShaderType::COMPUTE) { + if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { 
OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4); OutStreamer->EmitIntValue(alignTo(MFI->LDSSize, 4) >> 2, 4); } @@ -544,13 +544,13 @@ S_00B84C_EXCP_EN(0); } -static unsigned getRsrcReg(unsigned ShaderType) { - switch (ShaderType) { +static unsigned getRsrcReg(CallingConv::ID CallConv) { + switch (CallConv) { default: // Fall through - case ShaderType::COMPUTE: return R_00B848_COMPUTE_PGM_RSRC1; - case ShaderType::GEOMETRY: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; - case ShaderType::PIXEL: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; - case ShaderType::VERTEX: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; + case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; + case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; + case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; + case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; } } @@ -558,9 +558,9 @@ const SIProgramInfo &KernelInfo) { const AMDGPUSubtarget &STM = MF.getSubtarget(); const SIMachineFunctionInfo *MFI = MF.getInfo(); - unsigned RsrcReg = getRsrcReg(MFI->getShaderType()); + unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv()); - if (MFI->getShaderType() == ShaderType::COMPUTE) { + if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); @@ -577,13 +577,13 @@ OutStreamer->EmitIntValue(RsrcReg, 4); OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); - if (STM.isVGPRSpillingEnabled(MFI)) { + if (STM.isVGPRSpillingEnabled(*MF.getFunction())) { OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); } } - if (MFI->getShaderType() == ShaderType::PIXEL) { + if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) { OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); 
OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); Index: lib/Target/AMDGPU/AMDGPUCallingConv.td =================================================================== --- lib/Target/AMDGPU/AMDGPUCallingConv.td +++ lib/Target/AMDGPU/AMDGPUCallingConv.td @@ -117,14 +117,12 @@ CCIf<"static_cast" "(State.getMachineFunction().getSubtarget()).getGeneration() >=" "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo()" - "->getShaderType() == ShaderType::COMPUTE", + "!AMDGPU::isShader(State.getCallingConv())", CCDelegateTo>, CCIf<"static_cast" "(State.getMachineFunction().getSubtarget()).getGeneration() < " "AMDGPUSubtarget::SOUTHERN_ISLANDS && " - "State.getMachineFunction().getInfo()" - "->getShaderType() == ShaderType::COMPUTE", + "!AMDGPU::isShader(State.getCallingConv())", CCDelegateTo>, CCIf<"static_cast" "(State.getMachineFunction().getSubtarget()).getGeneration() >= " Index: lib/Target/AMDGPU/AMDGPUMachineFunction.h =================================================================== --- lib/Target/AMDGPU/AMDGPUMachineFunction.h +++ lib/Target/AMDGPU/AMDGPUMachineFunction.h @@ -17,7 +17,6 @@ class AMDGPUMachineFunction : public MachineFunctionInfo { virtual void anchor(); - unsigned ShaderType; public: AMDGPUMachineFunction(const MachineFunction &MF); @@ -30,10 +29,6 @@ /// Start of implicit kernel args unsigned ABIArgOffset; - unsigned getShaderType() const { - return ShaderType; - } - bool isKernel() const { // FIXME: Assume everything is a kernel until function calls are supported. 
return true; Index: lib/Target/AMDGPU/AMDGPUMachineFunction.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -10,11 +10,8 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) : MachineFunctionInfo(), - ShaderType(ShaderType::COMPUTE), LDSSize(0), ABIArgOffset(0), ScratchSize(0), IsKernel(true) { - - ShaderType = AMDGPU::getShaderType(*MF.getFunction()); } Index: lib/Target/AMDGPU/AMDGPUSubtarget.h =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.h +++ lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -305,7 +305,7 @@ bool isAmdHsaOS() const { return TargetTriple.getOS() == Triple::AMDHSA; } - bool isVGPRSpillingEnabled(const SIMachineFunctionInfo *MFI) const; + bool isVGPRSpillingEnabled(const Function& F) const; bool isXNACKEnabled() const { return EnableXNACK; Index: lib/Target/AMDGPU/AMDGPUSubtarget.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -136,9 +136,8 @@ return AMDGPU::getIsaVersion(getFeatureBits()); } -bool AMDGPUSubtarget::isVGPRSpillingEnabled( - const SIMachineFunctionInfo *MFI) const { - return MFI->getShaderType() == ShaderType::COMPUTE || EnableVGPRSpilling; +bool AMDGPUSubtarget::isVGPRSpillingEnabled(const Function& F) const { + return !AMDGPU::isShader(F.getCallingConv()) || EnableVGPRSpilling; } void AMDGPUSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -265,10 +265,9 @@ static bool isArgPassedInSGPR(const Argument *A) { const Function *F = A->getParent(); - unsigned ShaderType = AMDGPU::getShaderType(*F); // 
Arguments to compute shaders are never a source of divergence. - if (ShaderType == ShaderType::COMPUTE) + if (!AMDGPU::isShader(F->getCallingConv())) return true; // For non-compute shaders, SGPR inputs are marked with either inreg or byval. Index: lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp =================================================================== --- lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -46,9 +46,9 @@ unsigned CurrentEntries; unsigned CurrentSubEntries; - CFStack(const AMDGPUSubtarget *st, unsigned ShaderType) : ST(st), + CFStack(const AMDGPUSubtarget *st, CallingConv::ID cc) : ST(st), // We need to reserve a stack entry for CALL_FS in vertex shaders. - MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), + MaxStackSize(cc == CallingConv::AMDGPU_VS ? 1 : 0), CurrentEntries(0), CurrentSubEntries(0) { } unsigned getLoopDepth(); @@ -478,14 +478,14 @@ TRI = static_cast(ST->getRegisterInfo()); R600MachineFunctionInfo *MFI = MF.getInfo(); - CFStack CFStack(ST, MFI->getShaderType()); + CFStack CFStack(ST, MF.getFunction()->getCallingConv()); for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; unsigned CfCount = 0; std::vector > > LoopStack; std::vector IfThenElseStack; - if (MFI->getShaderType() == ShaderType::VERTEX) { + if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_VS) { BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), getHWInstrDesc(CF_CALL_FS)); CfCount++; Index: lib/Target/AMDGPU/R600ISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/R600ISelLowering.cpp +++ lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1759,7 +1759,7 @@ MemVT = MemVT.getVectorElementType(); } - if (MFI->getShaderType() != ShaderType::COMPUTE) { + if (AMDGPU::isShader(CallConv)) { unsigned Reg = MF.addLiveIn(VA.getLocReg(), &AMDGPU::R600_Reg128RegClass); SDValue Register = 
DAG.getCopyFromReg(Chain, DL, Reg, VT); InVals.push_back(Register); Index: lib/Target/AMDGPU/R600InstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/R600InstrInfo.cpp +++ lib/Target/AMDGPU/R600InstrInfo.cpp @@ -204,8 +204,7 @@ bool R600InstrInfo::usesVertexCache(const MachineInstr *MI) const { const MachineFunction *MF = MI->getParent()->getParent(); - const R600MachineFunctionInfo *MFI = MF->getInfo(); - return MFI->getShaderType() != ShaderType::COMPUTE && + return !AMDGPU::isCompute(MF->getFunction()->getCallingConv()) && usesVertexCache(MI->getOpcode()); } @@ -215,8 +214,7 @@ bool R600InstrInfo::usesTextureCache(const MachineInstr *MI) const { const MachineFunction *MF = MI->getParent()->getParent(); - const R600MachineFunctionInfo *MFI = MF->getInfo(); - return (MFI->getShaderType() == ShaderType::COMPUTE && + return (AMDGPU::isCompute(MF->getFunction()->getCallingConv()) && usesVertexCache(MI->getOpcode())) || usesTextureCache(MI->getOpcode()); } Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -596,7 +596,7 @@ SIMachineFunctionInfo *Info = MF.getInfo(); const AMDGPUSubtarget &ST = MF.getSubtarget(); - if (Subtarget->isAmdHsaOS() && Info->getShaderType() != ShaderType::COMPUTE) { + if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) { const Function *Fn = MF.getFunction(); DiagnosticInfoUnsupported NoGraphicsHSA( *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()); @@ -604,8 +604,6 @@ return SDValue(); } - // FIXME: We currently assume all calling conventions are kernels. 
- SmallVector Splits; BitVector Skipped(Ins.size()); @@ -613,7 +611,7 @@ const ISD::InputArg &Arg = Ins[i]; // First check if it's a PS input addr - if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() && + if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() && !Arg.Flags.isByVal() && PSInputNum <= 15) { if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) { @@ -631,7 +629,8 @@ } // Second split vertices into their elements - if (Info->getShaderType() != ShaderType::COMPUTE && Arg.VT.isVector()) { + if (AMDGPU::isShader(CallConv) && + Arg.VT.isVector()) { ISD::InputArg NewArg = Arg; NewArg.Flags.setSplit(); NewArg.VT = Arg.VT.getVectorElementType(); @@ -647,7 +646,7 @@ NewArg.PartOffset += NewArg.VT.getStoreSize(); } - } else if (Info->getShaderType() != ShaderType::COMPUTE) { + } else if (AMDGPU::isShader(CallConv)) { Splits.push_back(Arg); } } @@ -668,7 +667,7 @@ // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled. // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be // enabled too. 
- if (Info->getShaderType() == ShaderType::PIXEL && + if (CallConv == CallingConv::AMDGPU_PS && ((Info->getPSInputAddr() & 0x7F) == 0 || ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) { @@ -678,7 +677,7 @@ Info->PSInputEna |= 1; } - if (Info->getShaderType() == ShaderType::COMPUTE) { + if (!AMDGPU::isShader(CallConv)) { getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins, Splits); } @@ -922,7 +921,7 @@ MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo(); - if (Info->getShaderType() == ShaderType::COMPUTE) + if (!AMDGPU::isShader(CallConv)) return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs, OutVals, DL, DAG); Index: lib/Target/AMDGPU/SIInstrInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIInstrInfo.cpp +++ lib/Target/AMDGPU/SIInstrInfo.cpp @@ -596,7 +596,7 @@ return; } - if (!ST.isVGPRSpillingEnabled(MFI)) { + if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" " spill register"); @@ -682,7 +682,7 @@ return; } - if (!ST.isVGPRSpillingEnabled(MFI)) { + if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) { LLVMContext &Ctx = MF->getFunction()->getContext(); Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" " restore register"); @@ -728,7 +728,7 @@ return TIDReg; - if (MFI->getShaderType() == ShaderType::COMPUTE && + if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) && WorkGroupSize > WavefrontSize) { unsigned TIDIGXReg Index: lib/Target/AMDGPU/SILowerControlFlow.cpp =================================================================== --- lib/Target/AMDGPU/SILowerControlFlow.cpp +++ lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -169,8 +169,7 @@ MachineBasicBlock &MBB = *MI.getParent(); DebugLoc DL = MI.getDebugLoc(); - if (MBB.getParent()->getInfo()->getShaderType() 
!= - ShaderType::PIXEL || + if (MBB.getParent()->getFunction()->getCallingConv() != CallingConv::AMDGPU_PS || !shouldSkip(&MBB, &MBB.getParent()->back())) return; @@ -328,11 +327,10 @@ const MachineOperand &Op = MI.getOperand(0); #ifndef NDEBUG - const SIMachineFunctionInfo *MFI - = MBB.getParent()->getInfo(); + CallingConv::ID CallConv = MBB.getParent()->getFunction()->getCallingConv(); // Kill is only allowed in pixel / geometry shaders. - assert(MFI->getShaderType() == ShaderType::PIXEL || - MFI->getShaderType() == ShaderType::GEOMETRY); + assert(CallConv == CallingConv::AMDGPU_PS || + CallConv == CallingConv::AMDGPU_GS); #endif // Clear this thread from the exec mask if the operand is negative Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp =================================================================== --- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -80,7 +80,7 @@ const MachineFrameInfo *FrameInfo = MF.getFrameInfo(); - if (getShaderType() == ShaderType::COMPUTE) + if (!AMDGPU::isShader(F->getCallingConv())) KernargSegmentPtr = true; if (F->hasFnAttribute("amdgpu-work-group-id-y")) @@ -100,7 +100,7 @@ if (WorkItemIDZ) WorkItemIDY = true; - bool MaySpill = ST.isVGPRSpillingEnabled(this); + bool MaySpill = ST.isVGPRSpillingEnabled(*F); bool HasStackObjects = FrameInfo->hasStackObjects(); if (HasStackObjects || MaySpill) @@ -202,5 +202,7 @@ const AMDGPUSubtarget &ST = MF.getSubtarget(); // FIXME: We should get this information from kernel attributes if it // is available. - return getShaderType() == ShaderType::COMPUTE ? 
256 : ST.getWavefrontSize(); + if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) + return 256; + return ST.getWavefrontSize(); } Index: lib/Target/AMDGPU/SITypeRewriter.cpp =================================================================== --- lib/Target/AMDGPU/SITypeRewriter.cpp +++ lib/Target/AMDGPU/SITypeRewriter.cpp @@ -62,7 +62,7 @@ } bool SITypeRewriter::runOnFunction(Function &F) { - if (AMDGPU::getShaderType(F) == ShaderType::COMPUTE) + if (!AMDGPU::isShader(F.getCallingConv())) return false; visit(F); Index: lib/Target/AMDGPU/SIWholeQuadMode.cpp =================================================================== --- lib/Target/AMDGPU/SIWholeQuadMode.cpp +++ lib/Target/AMDGPU/SIWholeQuadMode.cpp @@ -425,9 +425,7 @@ } bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { - SIMachineFunctionInfo *MFI = MF.getInfo(); - - if (MFI->getShaderType() != ShaderType::PIXEL) + if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS) return false; Instructions.clear(); Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -11,6 +11,7 @@ #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H #include "AMDKernelCodeT.h" +#include "llvm/IR/CallingConv.h" namespace llvm { @@ -44,9 +45,10 @@ bool isGlobalSegment(const GlobalValue *GV); bool isReadOnlySegment(const GlobalValue *GV); -unsigned getShaderType(const Function &F); unsigned getInitialPSInputAddr(const Function &F); +bool isShader(CallingConv::ID cc); +bool isCompute(CallingConv::ID cc); bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -124,14 +124,26 @@ return Result; } -unsigned 
getShaderType(const Function &F) { - return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE); -} - unsigned getInitialPSInputAddr(const Function &F) { return getIntegerAttribute(F, "InitialPSInputAddr", 0); } +bool isShader(CallingConv::ID cc) { + switch(cc) { + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + return true; + default: + return false; + } +} + +bool isCompute(CallingConv::ID cc) { + return !isShader(cc) || cc == CallingConv::AMDGPU_CS; +} + bool isSI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands]; } Index: test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll =================================================================== --- test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll +++ test/Analysis/DivergenceAnalysis/AMDGPU/kernel-args.ll @@ -9,7 +9,7 @@ ; CHECK: DIVERGENT: float %arg5 ; CHECK: DIVERGENT: i32 %arg6 -define void @main([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { +define cc 87 void @main([4 x <16 x i8>] addrspace(2)* byval %arg0, float inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <3 x i32> %arg4, float %arg5, i32 %arg6) #0 { ret void } Index: test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll @@ -0,0 +1,21 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + + +; GCN-LABEL: {{^}}shader_cc: +; GCN: v_add_i32_e32 v0, vcc, s8, v0 +define amdgpu_cs float @shader_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) { + %vi = bitcast float %v to i32 + %x = add i32 %vi, %w + %xf = bitcast i32 %x to float + ret float 
%xf +} + +; GCN-LABEL: {{^}}kernel_cc: +; GCN: s_endpgm +define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) { + %vi = bitcast float %v to i32 + %x = add i32 %vi, %w + %xf = bitcast i32 %x to float + ret float %xf +} Index: test/CodeGen/AMDGPU/big_alu.ll =================================================================== --- test/CodeGen/AMDGPU/big_alu.ll +++ test/CodeGen/AMDGPU/big_alu.ll @@ -3,7 +3,7 @@ ; This test ensures that R600 backend can handle ifcvt properly ; and do not generate ALU clauses with more than 128 instructions. -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #1 { +define amdgpu_ps void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) { main_body: %0 = extractelement <4 x float> %reg0, i32 0 %1 = extractelement <4 x float> %reg0, i32 1 @@ -1297,7 +1297,6 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) attributes #0 = { alwaysinline nounwind readnone } -attributes #1 = { "ShaderType"="0" } attributes #2 = { readnone } attributes #3 = { nounwind readnone } attributes #4 = { readonly } Index: test/CodeGen/AMDGPU/bitcast.ll =================================================================== --- test/CodeGen/AMDGPU/bitcast.ll +++ test/CodeGen/AMDGPU/bitcast.ll @@ -7,7 +7,7 @@ ; FUNC-LABEL: {{^}}v32i8_to_v8i32: ; SI: s_endpgm -define void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { +define amdgpu_ps void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 { entry: %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0 %2 = bitcast <32 x i8> %1 to <8 x i32> @@ -75,5 +75,3 @@ store <2 x 
i32> %bc, <2 x i32> addrspace(1)* %out, align 8 ret void } - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/call_fs.ll =================================================================== --- test/CodeGen/AMDGPU/call_fs.ll +++ test/CodeGen/AMDGPU/call_fs.ll @@ -10,8 +10,6 @@ ; R600:CALL_FS ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x80,0x89] -define void @call_fs() #0 { +define amdgpu_vs void @call_fs() { ret void } - -attributes #0 = { "ShaderType"="1" } ; Vertex Shader Index: test/CodeGen/AMDGPU/cayman-loop-bug.ll =================================================================== --- test/CodeGen/AMDGPU/cayman-loop-bug.ll +++ test/CodeGen/AMDGPU/cayman-loop-bug.ll @@ -8,7 +8,7 @@ ; CHECK-NOT: ALU_PUSH_BEFORE ; CHECK: END_LOOP ; CHECK: END_LOOP -define void @main (<4 x float> inreg %reg0) #0 { +define amdgpu_ps void @main (<4 x float> inreg %reg0) { entry: br label %outer_loop outer_loop: @@ -28,5 +28,3 @@ exit: ret void } - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file Index: test/CodeGen/AMDGPU/commute-shifts.ll =================================================================== --- test/CodeGen/AMDGPU/commute-shifts.ll +++ test/CodeGen/AMDGPU/commute-shifts.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}main: ; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}} ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1 -define void @main() #0 { +define amdgpu_ps void @main() #0 { bb: %tmp = fptosi float undef to i32 %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -25,5 +25,5 @@ declare i32 @llvm.SI.packf16(float, float) #1 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" } +attributes #0 = { "enable-no-nans-fp-math"="true" } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/complex-folding.ll 
=================================================================== --- test/CodeGen/AMDGPU/complex-folding.ll +++ test/CodeGen/AMDGPU/complex-folding.ll @@ -2,7 +2,7 @@ ; CHECK: {{^}}main: ; CHECK-NOT: MOV -define void @main(<4 x float> inreg %reg0) #0 { +define amdgpu_ps void @main(<4 x float> inreg %reg0) { entry: %0 = extractelement <4 x float> %reg0, i32 0 %1 = call float @fabs(float %0) @@ -15,5 +15,3 @@ declare float @fabs(float ) readnone declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file Index: test/CodeGen/AMDGPU/elf.ll =================================================================== --- test/CodeGen/AMDGPU/elf.ll +++ test/CodeGen/AMDGPU/elf.ll @@ -24,7 +24,7 @@ ; TONGA-NEXT: .long 576 ; CONFIG: .p2align 8 ; CONFIG: test: -define void @test(i32 %p) #0 { +define amdgpu_ps void @test(i32 %p) { %i = add i32 %p, 2 %r = bitcast i32 %i to float call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r) @@ -32,5 +32,3 @@ } declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } ; Pixel Shader Index: test/CodeGen/AMDGPU/fetch-limits.r600.ll =================================================================== --- test/CodeGen/AMDGPU/fetch-limits.r600.ll +++ test/CodeGen/AMDGPU/fetch-limits.r600.ll @@ -7,7 +7,7 @@ ; CHECK: Fetch clause ; CHECK: Fetch clause -define void @fetch_limits_r600() #0 { +define amdgpu_ps void @fetch_limits_r600() { entry: %0 = load <4 x float>, <4 x float> addrspace(8)* null %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) @@ -42,7 +42,5 @@ ret void } -attributes #0 = { "ShaderType"="0" } ; Pixel Shader - declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) Index: 
test/CodeGen/AMDGPU/fetch-limits.r700+.ll =================================================================== --- test/CodeGen/AMDGPU/fetch-limits.r700+.ll +++ test/CodeGen/AMDGPU/fetch-limits.r700+.ll @@ -16,7 +16,7 @@ ; CHECK: Fetch clause ; CHECK: Fetch clause -define void @fetch_limits_r700() #0 { +define amdgpu_ps void @fetch_limits_r700() { entry: %0 = load <4 x float>, <4 x float> addrspace(8)* null %1 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) @@ -75,7 +75,5 @@ ret void } -attributes #0 = { "ShaderType"="0" } ; Pixel Shader - declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) Index: test/CodeGen/AMDGPU/floor.ll =================================================================== --- test/CodeGen/AMDGPU/floor.ll +++ test/CodeGen/AMDGPU/floor.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s ; CHECK: FLOOR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_ps void @test(<4 x float> inreg %reg0) { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = call float @floor(float %r0) %vec = insertelement <4 x float> undef, float %r1, i32 0 @@ -12,4 +12,3 @@ declare float @floor(float) readonly declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/fmad.ll =================================================================== --- test/CodeGen/AMDGPU/fmad.ll +++ test/CodeGen/AMDGPU/fmad.ll @@ -2,7 +2,7 @@ ;CHECK: MULADD_IEEE * {{T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_ps void @test(<4 x float> inreg %reg0) { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 %r2 = extractelement <4 x float> %reg0, i32 2 @@ -15,5 +15,3 
@@ declare float @fabs(float ) readnone declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file Index: test/CodeGen/AMDGPU/fmax.ll =================================================================== --- test/CodeGen/AMDGPU/fmax.ll +++ test/CodeGen/AMDGPU/fmax.ll @@ -2,7 +2,7 @@ ;CHECK: MAX * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_ps void @test(<4 x float> inreg %reg0) { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 %r2 = fcmp oge float %r0, %r1 @@ -13,5 +13,3 @@ } declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file Index: test/CodeGen/AMDGPU/fmin.ll =================================================================== --- test/CodeGen/AMDGPU/fmin.ll +++ test/CodeGen/AMDGPU/fmin.ll @@ -2,7 +2,7 @@ ;CHECK: MIN * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_ps void @test(<4 x float> inreg %reg0) { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 %r2 = fcmp uge float %r0, %r1 @@ -13,5 +13,3 @@ } declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } \ No newline at end of file Index: test/CodeGen/AMDGPU/inline-asm.ll =================================================================== --- test/CodeGen/AMDGPU/inline-asm.ll +++ test/CodeGen/AMDGPU/inline-asm.ll @@ -14,14 +14,12 @@ ; CHECK: {{^}}inline_asm_shader: ; CHECK: s_endpgm ; CHECK: s_endpgm -define void @inline_asm_shader() #0 { +define amdgpu_ps void @inline_asm_shader() { entry: call void asm sideeffect "s_endpgm", ""() ret void } -attributes #0 = { "ShaderType"="0" } - ; CHECK: {{^}}branch_on_asm: ; Make sure inline assembly is treted as divergent. 
Index: test/CodeGen/AMDGPU/input-mods.ll =================================================================== --- test/CodeGen/AMDGPU/input-mods.ll +++ test/CodeGen/AMDGPU/input-mods.ll @@ -9,7 +9,7 @@ ;CM: EXP_IEEE T{{[0-9]+}}.Z (MASKED), -|T{{[0-9]+}}.X| ;CM: EXP_IEEE * T{{[0-9]+}}.W (MASKED), -|T{{[0-9]+}}.X| -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_ps void @test(<4 x float> inreg %reg0) { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = call float @llvm.fabs.f32(float %r0) %r2 = fsub float -0.000000e+00, %r1 @@ -22,5 +22,3 @@ declare float @llvm.exp2.f32(float) readnone declare float @llvm.fabs.f32(float) readnone declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/jump-address.ll =================================================================== --- test/CodeGen/AMDGPU/jump-address.ll +++ test/CodeGen/AMDGPU/jump-address.ll @@ -4,7 +4,7 @@ ; CHECK: EXPORT ; CHECK-NOT: EXPORT -define void @main() #0 { +define amdgpu_ps void @main() { main_body: %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1) %1 = extractelement <4 x float> %0, i32 0 @@ -48,5 +48,3 @@ } declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/large-alloca-graphics.ll =================================================================== --- test/CodeGen/AMDGPU/large-alloca-graphics.ll +++ test/CodeGen/AMDGPU/large-alloca-graphics.ll @@ -12,7 +12,7 @@ ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; ALL: ; ScratchSize: 32772 -define void @large_alloca_pixel_shader(i32 %x, i32 %y) #1 { +define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 { %large = alloca [8192 x i32], align 4 %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 store volatile i32 %x, i32* %gep @@ -33,7 +33,7 @@ ; 
GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s1 offen ; ALL: ; ScratchSize: 32772 -define void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #1 { +define amdgpu_ps void @large_alloca_pixel_shader_inreg(i32 inreg %x, i32 inreg %y) #0 { %large = alloca [8192 x i32], align 4 %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191 store volatile i32 %x, i32* %gep @@ -44,4 +44,3 @@ } attributes #0 = { nounwind } -attributes #1 = { nounwind "ShaderType"="0" } Index: test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll @@ -5,7 +5,7 @@ ; CHECK: CUBE T{{[0-9]}}.Y ; CHECK: CUBE T{{[0-9]}}.Z ; CHECK: CUBE * T{{[0-9]}}.W -define void @cube() #0 { +define amdgpu_ps void @cube() { main_body: %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9) %1 = extractelement <4 x float> %0, i32 3 @@ -43,16 +43,15 @@ } ; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1 +declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0 ; Function Attrs: readnone -declare float @fabs(float) #1 +declare float @fabs(float) #0 ; Function Attrs: readnone -declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1 +declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #0 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="0" } -attributes #1 = { readnone } +attributes #0 = { readnone } Index: test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll @@ -5,7 +5,7 @@ ; SI-NOT: v_cmpx_le_f32 ; SI: s_mov_b64 exec, 0 -define void @kill_gs_const() #0 { +define amdgpu_gs void @kill_gs_const() { main_body: %0 = icmp ule 
i32 0, 3 %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 @@ -21,7 +21,7 @@ ; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}} ; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}} ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]] -define void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #1 { +define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { entry: %tmp0 = fcmp olt float %13, 0.0 call void @llvm.AMDGPU.kill(float %14) @@ -33,7 +33,4 @@ declare void @llvm.AMDGPU.kill(float) declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="2" } -attributes #1 = { "ShaderType"="0" } - !0 = !{!"const", null, i32 1} Index: test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll +++ test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll @@ -10,7 +10,7 @@ ;GCN: v_interp_p1_f32 ;GCN: v_interp_p2_f32 -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { +define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) { main_body: %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4) @@ -25,7 +25,7 @@ 
; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug: ; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]] -define void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) #0 { +define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) { main_body: %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7) %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) @@ -42,19 +42,18 @@ } ; Function Attrs: readnone -declare float @fabs(float) #2 +declare float @fabs(float) #1 ; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 +declare i32 @llvm.SI.packf16(float, float) #0 ; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.constant(i32, i32, i32) #1 +declare float @llvm.SI.fs.constant(i32, i32, i32) #0 ; Function Attrs: nounwind readnone -declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1 +declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #0 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } -attributes #2 = { readnone } +attributes #0 = { nounwind readnone } +attributes #1 = { readnone } Index: test/CodeGen/AMDGPU/llvm.SI.gather4.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.gather4.ll +++ test/CodeGen/AMDGPU/llvm.SI.gather4.ll @@ -3,7 +3,7 @@ 
;CHECK-LABEL: {{^}}gather4_v2: ;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_v2() #0 { +define amdgpu_ps void @gather4_v2() { main_body: %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -16,7 +16,7 @@ ;CHECK-LABEL: {{^}}gather4: ;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4() #0 { +define amdgpu_ps void @gather4() { main_body: %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -29,7 +29,7 @@ ;CHECK-LABEL: {{^}}gather4_cl: ;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl() #0 { +define amdgpu_ps void @gather4_cl() { main_body: %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -42,7 +42,7 @@ ;CHECK-LABEL: {{^}}gather4_l: ;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l() #0 { +define amdgpu_ps void @gather4_l() { main_body: %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -55,7 +55,7 @@ ;CHECK-LABEL: {{^}}gather4_b: ;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void 
@gather4_b() #0 { +define amdgpu_ps void @gather4_b() { main_body: %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -68,7 +68,7 @@ ;CHECK-LABEL: {{^}}gather4_b_cl: ;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl() #0 { +define amdgpu_ps void @gather4_b_cl() { main_body: %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -81,7 +81,7 @@ ;CHECK-LABEL: {{^}}gather4_b_cl_v8: ;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl_v8() #0 { +define amdgpu_ps void @gather4_b_cl_v8() { main_body: %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -94,7 +94,7 @@ ;CHECK-LABEL: {{^}}gather4_lz_v2: ;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz_v2() #0 { +define amdgpu_ps void @gather4_lz_v2() { main_body: %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -107,7 +107,7 @@ ;CHECK-LABEL: {{^}}gather4_lz: ;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz() #0 { +define amdgpu_ps void @gather4_lz() { main_body: %r = call <4 x float> 
@llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -122,7 +122,7 @@ ;CHECK-LABEL: {{^}}gather4_o: ;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_o() #0 { +define amdgpu_ps void @gather4_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -135,7 +135,7 @@ ;CHECK-LABEL: {{^}}gather4_cl_o: ;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl_o() #0 { +define amdgpu_ps void @gather4_cl_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -148,7 +148,7 @@ ;CHECK-LABEL: {{^}}gather4_cl_o_v8: ;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_cl_o_v8() #0 { +define amdgpu_ps void @gather4_cl_o_v8() { main_body: %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -161,7 +161,7 @@ ;CHECK-LABEL: {{^}}gather4_l_o: ;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l_o() #0 { +define amdgpu_ps void @gather4_l_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, 
i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -174,7 +174,7 @@ ;CHECK-LABEL: {{^}}gather4_l_o_v8: ;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_l_o_v8() #0 { +define amdgpu_ps void @gather4_l_o_v8() { main_body: %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -187,7 +187,7 @@ ;CHECK-LABEL: {{^}}gather4_b_o: ;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_o() #0 { +define amdgpu_ps void @gather4_b_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -200,7 +200,7 @@ ;CHECK-LABEL: {{^}}gather4_b_o_v8: ;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_o_v8() #0 { +define amdgpu_ps void @gather4_b_o_v8() { main_body: %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -213,7 +213,7 @@ ;CHECK-LABEL: {{^}}gather4_b_cl_o: ;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_b_cl_o() #0 { +define amdgpu_ps void @gather4_b_cl_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -226,7 +226,7 @@ 
;CHECK-LABEL: {{^}}gather4_lz_o: ;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_lz_o() #0 { +define amdgpu_ps void @gather4_lz_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -241,7 +241,7 @@ ;CHECK-LABEL: {{^}}gather4_c: ;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c() #0 { +define amdgpu_ps void @gather4_c() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -254,7 +254,7 @@ ;CHECK-LABEL: {{^}}gather4_c_cl: ;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl() #0 { +define amdgpu_ps void @gather4_c_cl() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -267,7 +267,7 @@ ;CHECK-LABEL: {{^}}gather4_c_cl_v8: ;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl_v8() #0 { +define amdgpu_ps void @gather4_c_cl_v8() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -280,7 +280,7 @@ ;CHECK-LABEL: {{^}}gather4_c_l: ;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l() #0 { +define amdgpu_ps void @gather4_c_l() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -293,7 +293,7 @@ ;CHECK-LABEL: {{^}}gather4_c_l_v8: ;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l_v8() #0 { +define amdgpu_ps void @gather4_c_l_v8() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -306,7 +306,7 @@ ;CHECK-LABEL: {{^}}gather4_c_b: ;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b() #0 { +define amdgpu_ps void @gather4_c_b() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -319,7 +319,7 @@ ;CHECK-LABEL: {{^}}gather4_c_b_v8: ;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_v8() #0 { +define amdgpu_ps void @gather4_c_b_v8() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -332,7 +332,7 @@ ;CHECK-LABEL: {{^}}gather4_c_b_cl: ;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_cl() #0 { 
+define amdgpu_ps void @gather4_c_b_cl() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -345,7 +345,7 @@ ;CHECK-LABEL: {{^}}gather4_c_lz: ;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz() #0 { +define amdgpu_ps void @gather4_c_lz() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -360,7 +360,7 @@ ;CHECK-LABEL: {{^}}gather4_c_o: ;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_o() #0 { +define amdgpu_ps void @gather4_c_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -373,7 +373,7 @@ ;CHECK-LABEL: {{^}}gather4_c_o_v8: ;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_o_v8() #0 { +define amdgpu_ps void @gather4_c_o_v8() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -386,7 +386,7 @@ ;CHECK-LABEL: {{^}}gather4_c_cl_o: ;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_cl_o() #0 { +define amdgpu_ps void @gather4_c_cl_o() { main_body: %r = call <4 x float> 
@llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -399,7 +399,7 @@ ;CHECK-LABEL: {{^}}gather4_c_l_o: ;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_l_o() #0 { +define amdgpu_ps void @gather4_c_l_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -412,7 +412,7 @@ ;CHECK-LABEL: {{^}}gather4_c_b_o: ;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_o() #0 { +define amdgpu_ps void @gather4_c_b_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -425,7 +425,7 @@ ;CHECK-LABEL: {{^}}gather4_c_b_cl_o: ;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_b_cl_o() #0 { +define amdgpu_ps void @gather4_c_b_cl_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -438,7 +438,7 @@ ;CHECK-LABEL: {{^}}gather4_c_lz_o: ;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz_o() #0 { +define amdgpu_ps void @gather4_c_lz_o() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, 
<4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -451,7 +451,7 @@ ;CHECK-LABEL: {{^}}gather4_c_lz_o_v8: ;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da -define void @gather4_c_lz_o_v8() #0 { +define amdgpu_ps void @gather4_c_lz_o_v8() { main_body: %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -464,46 +464,45 @@ -declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> 
@llvm.SI.gather4.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> 
@llvm.SI.gather4.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> 
@llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> 
@llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.SI.getlod.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.getlod.ll +++ test/CodeGen/AMDGPU/llvm.SI.getlod.ll @@ -3,7 +3,7 @@ ;CHECK-LABEL: {{^}}getlod: ;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da -define void @getlod() #0 { +define amdgpu_ps void @getlod() { 
main_body: %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -14,7 +14,7 @@ ;CHECK-LABEL: {{^}}getlod_v2: ;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da -define void @getlod_v2() #0 { +define amdgpu_ps void @getlod_v2() { main_body: %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -25,7 +25,7 @@ ;CHECK-LABEL: {{^}}getlod_v4: ;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da -define void @getlod_v4() #0 { +define amdgpu_ps void @getlod_v4() { main_body: %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -35,11 +35,10 @@ } -declare <4 x float> @llvm.SI.getlod.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.getlod.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } +attributes #0 = 
{ nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.SI.image.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.image.ll +++ test/CodeGen/AMDGPU/llvm.SI.image.ll @@ -3,7 +3,7 @@ ;CHECK-LABEL: {{^}}image_load: ;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @image_load() #0 { +define amdgpu_ps void @image_load() { main_body: %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -16,7 +16,7 @@ ;CHECK-LABEL: {{^}}image_load_mip: ;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @image_load_mip() #0 { +define amdgpu_ps void @image_load_mip() { main_body: %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -29,7 +29,7 @@ ;CHECK-LABEL: {{^}}getresinfo: ;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @getresinfo() #0 { +define amdgpu_ps void @getresinfo() { main_body: %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -40,11 +40,10 @@ ret void } -declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, 
<8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll +++ test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}v1: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xd -define void @v1(i32 %a1) #0 { +define amdgpu_ps void @v1(i32 %a1) { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -16,7 +16,7 @@ ; CHECK-LABEL: {{^}}v2: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xb -define void @v2(i32 %a1) #0 { +define amdgpu_ps void @v2(i32 %a1) { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -29,7 +29,7 @@ ; CHECK-LABEL: {{^}}v3: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xe -define void @v3(i32 %a1) #0 { +define amdgpu_ps void @v3(i32 %a1) { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -42,7 +42,7 @@ ; CHECK-LABEL: {{^}}v4: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, 
{{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x7 -define void @v4(i32 %a1) #0 { +define amdgpu_ps void @v4(i32 %a1) { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -55,7 +55,7 @@ ; CHECK-LABEL: {{^}}v5: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xa -define void @v5(i32 %a1) #0 { +define amdgpu_ps void @v5(i32 %a1) { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -67,7 +67,7 @@ ; CHECK-LABEL: {{^}}v6: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x6 -define void @v6(i32 %a1) #0 { +define amdgpu_ps void @v6(i32 %a1) { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -79,7 +79,7 @@ ; CHECK-LABEL: {{^}}v7: ; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x9 -define void @v7(i32 %a1) #0 { +define amdgpu_ps void @v7(i32 %a1) { entry: %0 = insertelement <1 x i32> undef, i32 %a1, i32 0 %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -92,5 +92,3 @@ declare <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) readnone declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/llvm.SI.image.sample.ll 
=================================================================== --- test/CodeGen/AMDGPU/llvm.SI.image.sample.ll +++ test/CodeGen/AMDGPU/llvm.SI.image.sample.ll @@ -4,7 +4,7 @@ ;CHECK-LABEL: {{^}}sample: ;CHECK: s_wqm ;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample() #0 { +define amdgpu_ps void @sample() { main_body: %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -18,7 +18,7 @@ ;CHECK-LABEL: {{^}}sample_cl: ;CHECK: s_wqm ;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cl() #0 { +define amdgpu_ps void @sample_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -32,7 +32,7 @@ ;CHECK-LABEL: {{^}}sample_d: ;CHECK-NOT: s_wqm ;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d() #0 { +define amdgpu_ps void @sample_d() { main_body: %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -46,7 +46,7 @@ ;CHECK-LABEL: {{^}}sample_d_cl: ;CHECK-NOT: s_wqm ;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d_cl() #0 { +define amdgpu_ps void @sample_d_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 
0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -60,7 +60,7 @@ ;CHECK-LABEL: {{^}}sample_l: ;CHECK-NOT: s_wqm ;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_l() #0 { +define amdgpu_ps void @sample_l() { main_body: %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -74,7 +74,7 @@ ;CHECK-LABEL: {{^}}sample_b: ;CHECK: s_wqm ;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b() #0 { +define amdgpu_ps void @sample_b() { main_body: %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -88,7 +88,7 @@ ;CHECK-LABEL: {{^}}sample_b_cl: ;CHECK: s_wqm ;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b_cl() #0 { +define amdgpu_ps void @sample_b_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -102,7 +102,7 @@ ;CHECK-LABEL: {{^}}sample_lz: ;CHECK-NOT: s_wqm ;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_lz() #0 { +define amdgpu_ps void @sample_lz() { main_body: %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -116,7 +116,7 @@ 
;CHECK-LABEL: {{^}}sample_cd: ;CHECK-NOT: s_wqm ;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd() #0 { +define amdgpu_ps void @sample_cd() { main_body: %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -130,7 +130,7 @@ ;CHECK-LABEL: {{^}}sample_cd_cl: ;CHECK-NOT: s_wqm ;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd_cl() #0 { +define amdgpu_ps void @sample_cd_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -144,7 +144,7 @@ ;CHECK-LABEL: {{^}}sample_c: ;CHECK: s_wqm ;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c() #0 { +define amdgpu_ps void @sample_c() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -158,7 +158,7 @@ ;CHECK-LABEL: {{^}}sample_c_cl: ;CHECK: s_wqm ;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cl() #0 { +define amdgpu_ps void @sample_c_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -172,7 +172,7 @@ ;CHECK-LABEL: {{^}}sample_c_d: ;CHECK-NOT: s_wqm ;CHECK: 
image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d() #0 { +define amdgpu_ps void @sample_c_d() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -186,7 +186,7 @@ ;CHECK-LABEL: {{^}}sample_c_d_cl: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d_cl() #0 { +define amdgpu_ps void @sample_c_d_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -200,7 +200,7 @@ ;CHECK-LABEL: {{^}}sample_c_l: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_l() #0 { +define amdgpu_ps void @sample_c_l() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -214,7 +214,7 @@ ;CHECK-LABEL: {{^}}sample_c_b: ;CHECK: s_wqm ;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b() #0 { +define amdgpu_ps void @sample_c_b() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -228,7 +228,7 @@ ;CHECK-LABEL: {{^}}sample_c_b_cl: ;CHECK: s_wqm ;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b_cl() #0 { +define amdgpu_ps void @sample_c_b_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -242,7 +242,7 @@ ;CHECK-LABEL: {{^}}sample_c_lz: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_lz() #0 { +define amdgpu_ps void @sample_c_lz() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -256,7 +256,7 @@ ;CHECK-LABEL: {{^}}sample_c_cd: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd() #0 { +define amdgpu_ps void @sample_c_cd() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -270,7 +270,7 @@ ;CHECK-LABEL: {{^}}sample_c_cd_cl: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd_cl() #0 { +define amdgpu_ps void @sample_c_cd_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -282,29 +282,28 @@ } -declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, 
i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x 
i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, 
i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll +++ test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll @@ -4,7 +4,7 @@ ;CHECK-LABEL: {{^}}sample: ;CHECK: s_wqm ;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample() #0 { +define amdgpu_ps void @sample() { main_body: %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -18,7 +18,7 @@ ;CHECK-LABEL: {{^}}sample_cl: ;CHECK: 
s_wqm ;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cl() #0 { +define amdgpu_ps void @sample_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -32,7 +32,7 @@ ;CHECK-LABEL: {{^}}sample_d: ;CHECK-NOT: s_wqm ;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d() #0 { +define amdgpu_ps void @sample_d() { main_body: %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -46,7 +46,7 @@ ;CHECK-LABEL: {{^}}sample_d_cl: ;CHECK-NOT: s_wqm ;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_d_cl() #0 { +define amdgpu_ps void @sample_d_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -60,7 +60,7 @@ ;CHECK-LABEL: {{^}}sample_l: ;CHECK-NOT: s_wqm ;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_l() #0 { +define amdgpu_ps void @sample_l() { main_body: %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -74,7 +74,7 @@ ;CHECK-LABEL: {{^}}sample_b: ;CHECK: s_wqm ;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, 
{{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b() #0 { +define amdgpu_ps void @sample_b() { main_body: %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -88,7 +88,7 @@ ;CHECK-LABEL: {{^}}sample_b_cl: ;CHECK: s_wqm ;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_b_cl() #0 { +define amdgpu_ps void @sample_b_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -102,7 +102,7 @@ ;CHECK-LABEL: {{^}}sample_lz: ;CHECK-NOT: s_wqm ;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_lz() #0 { +define amdgpu_ps void @sample_lz() { main_body: %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -116,7 +116,7 @@ ;CHECK-LABEL: {{^}}sample_cd: ;CHECK-NOT: s_wqm ;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd() #0 { +define amdgpu_ps void @sample_cd() { main_body: %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -130,7 +130,7 @@ ;CHECK-LABEL: {{^}}sample_cd_cl: ;CHECK-NOT: s_wqm ;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 
{{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_cd_cl() #0 { +define amdgpu_ps void @sample_cd_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -144,7 +144,7 @@ ;CHECK-LABEL: {{^}}sample_c: ;CHECK: s_wqm ;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c() #0 { +define amdgpu_ps void @sample_c() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -158,7 +158,7 @@ ;CHECK-LABEL: {{^}}sample_c_cl: ;CHECK: s_wqm ;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cl() #0 { +define amdgpu_ps void @sample_c_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -172,7 +172,7 @@ ;CHECK-LABEL: {{^}}sample_c_d: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_d() #0 { +define amdgpu_ps void @sample_c_d() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -186,7 +186,7 @@ ;CHECK-LABEL: {{^}}sample_c_d_cl: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define 
void @sample_c_d_cl() #0 { +define amdgpu_ps void @sample_c_d_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -200,7 +200,7 @@ ;CHECK-LABEL: {{^}}sample_c_l: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_l() #0 { +define amdgpu_ps void @sample_c_l() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -214,7 +214,7 @@ ;CHECK-LABEL: {{^}}sample_c_b: ;CHECK: s_wqm ;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b() #0 { +define amdgpu_ps void @sample_c_b() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -228,7 +228,7 @@ ;CHECK-LABEL: {{^}}sample_c_b_cl: ;CHECK: s_wqm ;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_b_cl() #0 { +define amdgpu_ps void @sample_c_b_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -242,7 +242,7 @@ ;CHECK-LABEL: {{^}}sample_c_lz: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_lz() 
#0 { +define amdgpu_ps void @sample_c_lz() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -256,7 +256,7 @@ ;CHECK-LABEL: {{^}}sample_c_cd: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd() #0 { +define amdgpu_ps void @sample_c_cd() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -270,7 +270,7 @@ ;CHECK-LABEL: {{^}}sample_c_cd_cl: ;CHECK-NOT: s_wqm ;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf -define void @sample_c_cd_cl() #0 { +define amdgpu_ps void @sample_c_cd_cl() { main_body: %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %r0 = extractelement <4 x float> %r, i32 0 @@ -282,29 +282,28 @@ } -declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x 
i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 -declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, 
i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 -declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, 
i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 +declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.SI.load.dword.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.load.dword.ll +++ test/CodeGen/AMDGPU/llvm.SI.load.dword.ll @@ -14,7 +14,7 @@ ; CHECK: s_movk_i32 [[K:s[0-9]+]], 0x4d2 ; encoding ; CHECK: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, [[K]] idxen offen offset:65535 glc slc -define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) #0 { +define amdgpu_vs void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <32 x i8>] addrspace(2)* byval %arg2, [2 x <16 x i8>] addrspace(2)* byval %arg3, [17 x <16 x i8>] addrspace(2)* inreg %arg4, [17 x <16 x i8>] addrspace(2)* inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9) { main_body: %tmp = getelementptr [2 x <16 x i8>], [2 x <16 x i8>] addrspace(2)* %arg3, i64 0, i32 1 %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -40,14 +40,13 @@ } ; 
Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #1 +declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #0 ; Function Attrs: nounwind readonly -declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #1 +declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="1" } -attributes #1 = { nounwind readonly } +attributes #0 = { nounwind readonly } !0 = !{!"const", null, i32 1} Index: test/CodeGen/AMDGPU/llvm.SI.packf16.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.packf16.ll +++ test/CodeGen/AMDGPU/llvm.SI.packf16.ll @@ -6,7 +6,7 @@ ; GCN: v_cvt_pkrtz_f16_f32 ; GCN-NOT: v_cvt_pkrtz_f16_f32 -define void @main(float %src) #0 { +define amdgpu_ps void @main(float %src) { main_body: %p1 = call i32 @llvm.SI.packf16(float undef, float %src) %p2 = call i32 @llvm.SI.packf16(float %src, float undef) @@ -21,9 +21,8 @@ } ; Function Attrs: nounwind readnone -declare i32 @llvm.SI.packf16(float, float) #1 +declare i32 @llvm.SI.packf16(float, float) #0 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll +++ test/CodeGen/AMDGPU/llvm.SI.sendmsg-m0.ll @@ -7,7 +7,7 @@ ; BOTH-NEXT: s_sendmsg Gs_done(nop) ; BOTH-NEXT: s_endpgm -define void @main(i32 inreg %a) #0 { +define amdgpu_gs void @main(i32 inreg %a) #0 { main_body: call void @llvm.SI.sendmsg(i32 3, i32 %a) ret void @@ -16,5 +16,5 @@ ; 
Function Attrs: nounwind declare void @llvm.SI.sendmsg(i32, i32) #1 -attributes #0 = { "ShaderType"="2" "unsafe-fp-math"="true" } +attributes #0 = { "unsafe-fp-math"="true" } attributes #1 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll +++ test/CodeGen/AMDGPU/llvm.SI.tbuffer.store.ll @@ -3,7 +3,7 @@ ;CHECK-LABEL: {{^}}test1: ;CHECK: tbuffer_store_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, 0x20, -1, 0, -1, 0, 14, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test1(i32 %a1, i32 %vaddr) #0 { +define amdgpu_vs void @test1(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, i32 4, i32 %vaddr, i32 0, i32 32, i32 14, i32 4, i32 1, i32 0, i32 1, @@ -13,7 +13,7 @@ ;CHECK-LABEL: {{^}}test2: ;CHECK: tbuffer_store_format_xyz {{v\[[0-9]+:[0-9]+\]}}, 0x18, -1, 0, -1, 0, 13, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test2(i32 %a1, i32 %vaddr) #0 { +define amdgpu_vs void @test2(i32 %a1, i32 %vaddr) { %vdata = insertelement <4 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v4i32(<16 x i8> undef, <4 x i32> %vdata, i32 3, i32 %vaddr, i32 0, i32 24, i32 13, i32 4, i32 1, i32 0, i32 1, @@ -23,7 +23,7 @@ ;CHECK-LABEL: {{^}}test3: ;CHECK: tbuffer_store_format_xy {{v\[[0-9]+:[0-9]+\]}}, 0x10, -1, 0, -1, 0, 11, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 -define void @test3(i32 %a1, i32 %vaddr) #0 { +define amdgpu_vs void @test3(i32 %a1, i32 %vaddr) { %vdata = insertelement <2 x i32> undef, i32 %a1, i32 0 call void @llvm.SI.tbuffer.store.v2i32(<16 x i8> undef, <2 x i32> %vdata, i32 2, i32 %vaddr, i32 0, i32 16, i32 11, i32 4, i32 1, i32 0, i32 1, @@ -33,7 +33,7 @@ ;CHECK-LABEL: {{^}}test4: ;CHECK: tbuffer_store_format_x {{v[0-9]+}}, 0x8, -1, 0, -1, 0, 4, 4, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, -1, 0, 0 
-define void @test4(i32 %vdata, i32 %vaddr) #0 { +define amdgpu_vs void @test4(i32 %vdata, i32 %vaddr) { call void @llvm.SI.tbuffer.store.i32(<16 x i8> undef, i32 %vdata, i32 1, i32 %vaddr, i32 0, i32 8, i32 4, i32 4, i32 1, i32 0, i32 1, i32 1, i32 0) @@ -43,5 +43,3 @@ declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) declare void @llvm.SI.tbuffer.store.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) - -attributes #0 = { "ShaderType"="1" } Index: test/CodeGen/AMDGPU/llvm.SI.tid.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.SI.tid.ll +++ test/CodeGen/AMDGPU/llvm.SI.tid.ll @@ -5,7 +5,7 @@ ;SI: v_mbcnt_hi_u32_b32_e32 ;VI: v_mbcnt_hi_u32_b32_e64 -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) { main_body: %4 = call i32 @llvm.SI.tid() %5 = bitcast i32 %4 to float Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.atomic.ll @@ -16,7 +16,7 @@ ;CHECK: buffer_atomic_swap v0, s[0:3], [[SOFS]] offset:1 glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_swap v0, s[0:3], 0{{$}} -define float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) #0 { +define amdgpu_ps float @test1(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex, i32 %voffset) { main_body: %o1 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i1 0) %o2 = call i32 @llvm.amdgcn.buffer.atomic.swap(i32 %o1, <4 x 
i32> %rsrc, i32 %vindex, i32 0, i1 0) @@ -48,7 +48,7 @@ ;CHECK: buffer_atomic_or v0, v1, s[0:3], 0 idxen glc ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_xor v0, v1, s[0:3], 0 idxen glc -define float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 { +define amdgpu_ps float @test2(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) { main_body: %t1 = call i32 @llvm.amdgcn.buffer.atomic.add(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) %t2 = call i32 @llvm.amdgcn.buffer.atomic.sub(i32 %t1, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) @@ -80,7 +80,7 @@ ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, v3, s[0:3], 0 offen offset:42 glc ;CHECK-DAG: s_waitcnt vmcnt(0) ;CHECK: buffer_atomic_cmpswap {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[SOFS]] offset:1 glc -define float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) #0 { +define amdgpu_ps float @test3(<4 x i32> inreg %rsrc, i32 %data, i32 %cmp, i32 %vindex, i32 %voffset) { main_body: %o1 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %data, i32 %cmp, <4 x i32> %rsrc, i32 0, i32 0, i1 0) %o2 = call i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32 %o1, i32 %cmp, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) @@ -100,17 +100,16 @@ ret float %out } -declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x 
i32>, i32, i32, i1) #1 -declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.buffer.atomic.swap(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.add(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.sub(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umin(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.smax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.umax(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.and(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.or(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.xor(i32, <4 x i32>, i32, i32, i1) #0 +declare i32 @llvm.amdgcn.buffer.atomic.cmpswap(i32, i32, <4 x i32>, i32, i32, i1) #0 -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind } +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.format.ll @@ -6,7 +6,7 @@ ;CHECK: buffer_load_format_xyzw v[4:7], s[0:3], 0 glc ;CHECK: buffer_load_format_xyzw v[8:11], s[0:3], 0 slc ;CHECK: s_waitcnt -define {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) @@ -20,7 +20,7 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_format_xyzw v[0:3], s[0:3], 0 offset:42 ;CHECK: s_waitcnt -define <4 x 
float> @buffer_load_immoffs(<4 x i32> inreg) #0 { +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret <4 x float> %data @@ -33,7 +33,7 @@ ;CHECK: s_mov_b32 [[OFS2:s[0-9]+]], 0x8fff ;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS2]] offset:1 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4156, i1 0, i1 0) %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 36860, i1 0, i1 0) @@ -49,7 +49,7 @@ ;CHECK-NOT: s_mov ;CHECK: buffer_load_format_xyzw {{v\[[0-9]+:[0-9]+\]}}, s[0:3], [[OFS]] offset:81 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) #0 { +define amdgpu_ps <4 x float> @buffer_load_immoffs_reuse(<4 x i32> inreg) { main_body: %d.0 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4160, i1 0, i1 0) %d.1 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 4176, i1 0, i1 0) @@ -60,7 +60,7 @@ ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 { +define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) ret <4 x float> %data @@ -69,7 +69,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 { +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> 
@llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) ret <4 x float> %data @@ -78,7 +78,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 offen offset:58 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: %ofs = add i32 %1, 58 %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) @@ -88,7 +88,7 @@ ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 { +define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) ret <4 x float> %data @@ -98,7 +98,7 @@ ;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 { +define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) ret <4 x float> %data @@ -107,7 +107,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x: ;CHECK: buffer_load_format_x v0, s[0:3], 0 ;CHECK: s_waitcnt -define float @buffer_load_x(<4 x i32> inreg %rsrc) #0 { +define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) { main_body: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) ret float %data @@ -116,15 +116,14 @@ ;CHECK-LABEL: {{^}}buffer_load_xy: ;CHECK: buffer_load_format_xy v[0:1], s[0:3], 0 ;CHECK: s_waitcnt -define <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) #0 { +define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> 
inreg %rsrc) { main_body: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 0, i32 0, i1 0, i1 0) ret <2 x float> %data } -declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #0 +declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #0 -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readonly } +attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load.ll @@ -6,7 +6,7 @@ ;CHECK: buffer_load_dwordx4 v[4:7], s[0:3], 0 glc ;CHECK: buffer_load_dwordx4 v[8:11], s[0:3], 0 slc ;CHECK: s_waitcnt -define {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) #0 { +define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 0, i1 0) %data_glc = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 0, i1 1, i1 0) @@ -20,7 +20,7 @@ ;CHECK-LABEL: {{^}}buffer_load_immoffs: ;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], 0 offset:42 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_immoffs(<4 x i32> inreg) #0 { +define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret <4 x float> %data @@ -30,7 +30,7 @@ ;CHECK: s_movk_i32 [[OFFSET:s[0-9]+]], 0x1fff 
;CHECK: buffer_load_dwordx4 v[0:3], s[0:3], [[OFFSET]] offset:1 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) #0 { +define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 8192, i1 0, i1 0) ret <4 x float> %data @@ -39,7 +39,7 @@ ;CHECK-LABEL: {{^}}buffer_load_idx: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 idxen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) #0 { +define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 0, i1 0, i1 0) ret <4 x float> %data @@ -48,7 +48,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) #0 { +define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %1, i1 0, i1 0) ret <4 x float> %data @@ -57,7 +57,7 @@ ;CHECK-LABEL: {{^}}buffer_load_ofs_imm: ;CHECK: buffer_load_dwordx4 v[0:3], v0, s[0:3], 0 offen offset:58 ;CHECK: s_waitcnt -define <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) #0 { +define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) { main_body: %ofs = add i32 %1, 58 %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 0, i32 %ofs, i1 0, i1 0) @@ -67,7 +67,7 @@ ;CHECK-LABEL: {{^}}buffer_load_both: ;CHECK: buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) #0 { +define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %1, i32 %2, i1 0, i1 0) ret <4 x float> %data @@ -77,7 +77,7 @@ 
;CHECK: v_mov_b32_e32 v2, v0 ;CHECK: buffer_load_dwordx4 v[0:3], v[1:2], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) #0 { +define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i32) { main_body: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %2, i32 %1, i1 0, i1 0) ret <4 x float> %data @@ -86,7 +86,7 @@ ;CHECK-LABEL: {{^}}buffer_load_x1: ;CHECK: buffer_load_dword v0, v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps float @buffer_load_x1(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { main_body: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) ret float %data @@ -95,15 +95,14 @@ ;CHECK-LABEL: {{^}}buffer_load_x2: ;CHECK: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 idxen offen ;CHECK: s_waitcnt -define <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 { +define amdgpu_ps <2 x float> @buffer_load_x2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) { main_body: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 0, i1 0) ret <2 x float> %data } -declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 +declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #0 -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readonly } +attributes #0 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll 
=================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.format.ll @@ -5,7 +5,7 @@ ;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], 0 ;CHECK: buffer_store_format_xyzw v[4:7], s[0:3], 0 glc ;CHECK: buffer_store_format_xyzw v[8:11], s[0:3], 0 slc -define void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 { +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) @@ -15,7 +15,7 @@ ;CHECK-LABEL: {{^}}buffer_store_immoffs: ;CHECK: buffer_store_format_xyzw v[0:3], s[0:3], 0 offset:42 -define void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 { +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret void @@ -23,7 +23,7 @@ ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 idxen -define void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 { +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) ret void @@ -31,7 +31,7 @@ ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK: buffer_store_format_xyzw v[0:3], v4, s[0:3], 0 offen -define void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 { +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) ret void @@ -39,7 +39,7 @@ ;CHECK-LABEL: {{^}}buffer_store_both: 
;CHECK: buffer_store_format_xyzw v[0:3], v[4:5], s[0:3], 0 idxen offen -define void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 { +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) ret void @@ -48,7 +48,7 @@ ;CHECK-LABEL: {{^}}buffer_store_both_reversed: ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK: buffer_store_format_xyzw v[0:3], v[5:6], s[0:3], 0 idxen offen -define void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 { +define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) ret void @@ -62,7 +62,7 @@ ;CHECK: buffer_load_format_xyzw v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_format_xyzw v[0:3], v6, s[0:3], 0 idxen -define void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { main_body: call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) @@ -72,7 +72,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK: buffer_store_format_x v0, v1, s[0:3], 0 idxen -define void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) #0 { +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { main_body: call void @llvm.amdgcn.buffer.store.format.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void @@ -80,17 +80,16 @@ ;CHECK-LABEL: {{^}}buffer_store_x2: ;CHECK: buffer_store_format_xy v[0:1], v2, s[0:3], 0 idxen -define void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 
{ +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) { main_body: call void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void } -declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #1 -declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #1 -declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2 +declare void @llvm.amdgcn.buffer.store.format.f32(float, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.format.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 +declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1 -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind } -attributes #2 = { nounwind readonly } +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.buffer.store.ll @@ -5,7 +5,7 @@ ;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0 ;CHECK: buffer_store_dwordx4 v[4:7], s[0:3], 0 glc ;CHECK: buffer_store_dwordx4 v[8:11], s[0:3], 0 slc -define void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) #0 { +define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i1 0, i1 0) call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i1 1, i1 0) @@ -15,7 +15,7 @@ ;CHECK-LABEL: 
{{^}}buffer_store_immoffs: ;CHECK: buffer_store_dwordx4 v[0:3], s[0:3], 0 offset:42 -define void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) #0 { +define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 42, i1 0, i1 0) ret void @@ -23,7 +23,7 @@ ;CHECK-LABEL: {{^}}buffer_store_idx: ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 idxen -define void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) #0 { +define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) ret void @@ -31,7 +31,7 @@ ;CHECK-LABEL: {{^}}buffer_store_ofs: ;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -define void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) #0 { +define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 %2, i1 0, i1 0) ret void @@ -39,7 +39,7 @@ ;CHECK-LABEL: {{^}}buffer_store_both: ;CHECK: buffer_store_dwordx4 v[0:3], v[4:5], s[0:3], 0 idxen offen -define void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) #0 { +define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32) { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 %3, i1 0, i1 0) ret void @@ -48,7 +48,7 @@ ;CHECK-LABEL: {{^}}buffer_store_both_reversed: ;CHECK: v_mov_b32_e32 v6, v4 ;CHECK: buffer_store_dwordx4 v[0:3], v[5:6], s[0:3], 0 idxen offen -define void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) #0 { +define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>, i32, i32) { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %3, i32 %2, i1 0, i1 0) ret void @@ -62,7 +62,7 @@ ;CHECK: 
buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 idxen ;CHECK: s_waitcnt vmcnt(0) ;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 idxen -define void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) #0 { +define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32, i32) { main_body: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i1 0, i1 0) %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i1 0, i1 0) @@ -72,7 +72,7 @@ ;CHECK-LABEL: {{^}}buffer_store_x1: ;CHECK: buffer_store_dword v0, v1, s[0:3], 0 idxen -define void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) #0 { +define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %index) { main_body: call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void @@ -80,17 +80,16 @@ ;CHECK-LABEL: {{^}}buffer_store_x2: ;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 idxen -define void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) #0 { +define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data, i32 %index) { main_body: call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0) ret void } -declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1 -declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #1 -declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1 -declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #2 +declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #0 +declare <4 x float> 
@llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1 -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind } -attributes #2 = { nounwind readonly } +attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.ll @@ -5,7 +5,7 @@ ;SI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x04,0x00,0x00] ;VI: image_atomic_swap v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_swap(<8 x i32> inreg, <4 x i32>, i32) #0 { +define amdgpu_ps float @image_atomic_swap(<8 x i32> inreg, <4 x i32>, i32) { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.swap.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -16,7 +16,7 @@ ;SI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x02,0x00,0x00] ;VI: image_atomic_swap v2, v[0:1], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x02,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_swap_v2i32(<8 x i32> inreg, <2 x i32>, i32) #0 { +define amdgpu_ps float @image_atomic_swap_v2i32(<8 x i32> inreg, <2 x i32>, i32) { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.swap.v2i32(i32 %2, <2 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -27,7 +27,7 @@ ;SI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x3c,0xf0,0x00,0x01,0x00,0x00] ;VI: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x40,0xf0,0x00,0x01,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_swap_i32(<8 x i32> inreg, i32, i32) #0 { +define amdgpu_ps 
float @image_atomic_swap_i32(<8 x i32> inreg, i32, i32) { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.swap.i32(i32 %2, i32 %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -39,7 +39,7 @@ ;VI: image_atomic_cmpswap v[4:5], v[0:3], s[0:7] dmask:0x3 unorm glc ; encoding: [0x00,0x33,0x44,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) ;CHECK: v_mov_b32_e32 v0, v4 -define float @image_atomic_cmpswap(<8 x i32> inreg, <4 x i32>, i32, i32) #0 { +define amdgpu_ps float @image_atomic_cmpswap(<8 x i32> inreg, <4 x i32>, i32, i32) { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.cmpswap.v4i32(i32 %2, i32 %3, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -50,7 +50,7 @@ ;SI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x44,0xf0,0x00,0x04,0x00,0x00] ;VI: image_atomic_add v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_add(<8 x i32> inreg, <4 x i32>, i32) #0 { +define amdgpu_ps float @image_atomic_add(<8 x i32> inreg, <4 x i32>, i32) { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.add.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -61,7 +61,7 @@ ;SI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x48,0xf0,0x00,0x04,0x00,0x00] ;VI: image_atomic_sub v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; encoding: [0x00,0x31,0x4c,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_sub(<8 x i32> inreg, <4 x i32>, i32) #0 { +define amdgpu_ps float @image_atomic_sub(<8 x i32> inreg, <4 x i32>, i32) { main_body: %orig = call i32 @llvm.amdgcn.image.atomic.sub.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %orig.f = bitcast i32 %orig to float @@ -87,7 +87,7 @@ ;CHECK: s_waitcnt vmcnt(0) ;CHECK: image_atomic_dec v4, v[0:3], s[0:7] dmask:0x1 unorm glc ; 
encoding: [0x00,0x31,0x70,0xf0,0x00,0x04,0x00,0x00] ;CHECK: s_waitcnt vmcnt(0) -define float @image_atomic_unchanged(<8 x i32> inreg, <4 x i32>, i32) #0 { +define amdgpu_ps float @image_atomic_unchanged(<8 x i32> inreg, <4 x i32>, i32) { main_body: %t0 = call i32 @llvm.amdgcn.image.atomic.smin.v4i32(i32 %2, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) %t1 = call i32 @llvm.amdgcn.image.atomic.umin.v4i32(i32 %t0, <4 x i32> %1, <8 x i32> %0, i1 0, i1 0, i1 0) @@ -102,23 +102,22 @@ ret float %out } -declare i32 @llvm.amdgcn.image.atomic.swap.i32(i32, i32, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.swap.v2i32(i32, <2 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.swap.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 +declare i32 @llvm.amdgcn.image.atomic.swap.i32(i32, i32, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.swap.v2i32(i32, <2 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.swap.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 -declare i32 @llvm.amdgcn.image.atomic.cmpswap.v4i32(i32, i32, <4 x i32>, <8 x i32>,i1, i1, i1) #1 +declare i32 @llvm.amdgcn.image.atomic.cmpswap.v4i32(i32, i32, <4 x i32>, <8 x i32>,i1, i1, i1) #0 -declare i32 @llvm.amdgcn.image.atomic.add.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.sub.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.smin.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.umin.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.smax.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.umax.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.and.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.or.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 
@llvm.amdgcn.image.atomic.xor.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.inc.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 -declare i32 @llvm.amdgcn.image.atomic.dec.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #1 +declare i32 @llvm.amdgcn.image.atomic.add.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.sub.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.smin.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.umin.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.smax.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.umax.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.and.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.or.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.xor.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.inc.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 +declare i32 @llvm.amdgcn.image.atomic.dec.v4i32(i32, <4 x i32>, <8 x i32>, i1, i1, i1) #0 -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind } +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/llvm.amdgcn.image.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.image.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.image.ll @@ -4,7 +4,7 @@ ;CHECK-LABEL: {{^}}image_load_v4i32: ;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm ;CHECK: s_waitcnt vmcnt(0) -define <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 
0) ret <4 x float> %tex @@ -13,7 +13,7 @@ ;CHECK-LABEL: {{^}}image_load_v2i32: ;CHECK: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm ;CHECK: s_waitcnt vmcnt(0) -define <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 { +define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex @@ -22,7 +22,7 @@ ;CHECK-LABEL: {{^}}image_load_i32: ;CHECK: image_load v[0:3], v0, s[0:7] dmask:0xf unorm ;CHECK: s_waitcnt vmcnt(0) -define <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 { +define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex @@ -31,7 +31,7 @@ ;CHECK-LABEL: {{^}}image_load_mip: ;CHECK: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm ;CHECK: s_waitcnt vmcnt(0) -define <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret <4 x float> %tex @@ -40,7 +40,7 @@ ;CHECK-LABEL: {{^}}image_load_1: ;CHECK: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm ;CHECK: s_waitcnt vmcnt(0) -define float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) %elt = extractelement <4 x float> %tex, i32 0 @@ -50,7 +50,7 @@ ;CHECK-LABEL: {{^}}image_store_v4i32: ;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm -define void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> 
%data, <4 x i32> %coords) #0 { +define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { main_body: call void @llvm.amdgcn.image.store.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void @@ -58,7 +58,7 @@ ;CHECK-LABEL: {{^}}image_store_v2i32: ;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm -define void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 { +define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) { main_body: call void @llvm.amdgcn.image.store.v2i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void @@ -66,7 +66,7 @@ ;CHECK-LABEL: {{^}}image_store_i32: ;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm -define void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 { +define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) { main_body: call void @llvm.amdgcn.image.store.i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void @@ -74,7 +74,7 @@ ;CHECK-LABEL: {{^}}image_store_mip: ;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm -define void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 { +define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) { main_body: call void @llvm.amdgcn.image.store.mip.v4i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) ret void @@ -88,7 +88,7 @@ ;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm ;CHECK: s_waitcnt vmcnt(0) ;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm -define void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) #0 { +define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x 
i32> inreg, <4 x float>, i32) { main_body: call void @llvm.amdgcn.image.store.i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0) %data = call <4 x float> @llvm.amdgcn.image.load.i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0) @@ -96,16 +96,15 @@ ret void } -declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare void @llvm.amdgcn.image.store.v2i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -declare void @llvm.amdgcn.image.store.mip.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void @llvm.amdgcn.image.store.v2i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 +declare void @llvm.amdgcn.image.store.mip.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0 -declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #2 -declare <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 -declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 -declare <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 +declare <4 x float> @llvm.amdgcn.image.load.i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.load.v2i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 +declare <4 x float> @llvm.amdgcn.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind } -attributes #2 = { nounwind readonly } 
+attributes #0 = { nounwind } +attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll @@ -6,7 +6,7 @@ ;GCN: s_mov_b32 m0, s{{[0-9]+}} ;GCN: v_interp_p1_f32 ;GCN: v_interp_p2_f32 -define void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) #0 { +define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) { main_body: %i = extractelement <2 x i32> %4, i32 0 %j = extractelement <2 x i32> %4, i32 1 @@ -19,12 +19,11 @@ } ; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p1(i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p1(i32, i32, i32, i32) #0 ; Function Attrs: nounwind readnone -declare float @llvm.amdgcn.interp.p2(float, i32, i32, i32, i32) #1 +declare float @llvm.amdgcn.interp.p2(float, i32, i32, i32, i32) #0 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll +++ test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll @@ -6,7 +6,7 @@ ;SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]] ;VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]] -define void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) { main_body: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1 
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #1 Index: test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll +++ test/CodeGen/AMDGPU/llvm.amdgpu.kilp.ll @@ -3,7 +3,7 @@ ; SI-LABEL: {{^}}kilp_gs_const: ; SI: s_mov_b64 exec, 0 -define void @kilp_gs_const() #0 { +define amdgpu_gs void @kilp_gs_const() { main_body: %0 = icmp ule i32 0, 3 %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00 @@ -16,6 +16,4 @@ declare void @llvm.AMDGPU.kilp(float) -attributes #0 = { "ShaderType"="2" } - !0 = !{!"const", null, i32 1} Index: test/CodeGen/AMDGPU/llvm.cos.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.cos.ll +++ test/CodeGen/AMDGPU/llvm.cos.ll @@ -37,5 +37,3 @@ declare float @llvm.cos.f32(float) readnone declare <4 x float> @llvm.cos.v4f32(<4 x float>) readnone - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/llvm.pow.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.pow.ll +++ test/CodeGen/AMDGPU/llvm.pow.ll @@ -5,7 +5,7 @@ ;CHECK-NEXT: MUL NON-IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, ;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -define void @test1(<4 x float> inreg %reg0) #0 { +define amdgpu_ps void @test1(<4 x float> inreg %reg0) { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 %r2 = call float @llvm.pow.f32( float %r0, float %r1) @@ -27,7 +27,7 @@ ;CHECK-NEXT: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], PS}}, ;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, ;CHECK-NEXT: EXP_IEEE * T{{[0-9]+\.[XYZW], PV\.[XYZW]}}, -define void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +define amdgpu_ps void @test2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { %vec = call <4 x float> @llvm.pow.v4f32( <4 x float> %reg0, <4 x float> %reg1) call void 
@llvm.R600.store.swizzle(<4 x float> %vec, i32 0, i32 0) ret void @@ -36,5 +36,3 @@ declare float @llvm.pow.f32(float ,float ) readonly declare <4 x float> @llvm.pow.v4f32(<4 x float> ,<4 x float> ) readonly declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/llvm.sin.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.sin.ll +++ test/CodeGen/AMDGPU/llvm.sin.ll @@ -88,5 +88,3 @@ declare float @llvm.sin.f32(float) readnone declare <4 x float> @llvm.sin.v4f32(<4 x float>) readnone - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/load-input-fold.ll =================================================================== --- test/CodeGen/AMDGPU/load-input-fold.ll +++ test/CodeGen/AMDGPU/load-input-fold.ll @@ -1,6 +1,6 @@ ;RUN: llc < %s -march=r600 -mcpu=cayman -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -111,7 +111,6 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="1" } attributes #1 = { readnone } attributes #2 = { readonly } attributes #3 = { nounwind readonly } Index: test/CodeGen/AMDGPU/m0-spill.ll =================================================================== --- test/CodeGen/AMDGPU/m0-spill.ll +++ test/CodeGen/AMDGPU/m0-spill.ll @@ -5,7 +5,7 @@ ; CHECK-LABEL: {{^}}main: ; CHECK-NOT: v_readlane_b32 m0 -define void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) "ShaderType"="0" { +define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 
inreg) { main_body: %4 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3) %cmp = fcmp ueq float 0.0, %4 Index: test/CodeGen/AMDGPU/max-literals.ll =================================================================== --- test/CodeGen/AMDGPU/max-literals.ll +++ test/CodeGen/AMDGPU/max-literals.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: ADD * -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -32,7 +32,7 @@ ; CHECK-LABEL: {{^}}main2: ; CHECK-NOT: ADD * -define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) #0 { +define amdgpu_vs void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -63,5 +63,4 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="1" } attributes #1 = { readnone } Index: test/CodeGen/AMDGPU/mubuf.ll =================================================================== --- test/CodeGen/AMDGPU/mubuf.ll +++ test/CodeGen/AMDGPU/mubuf.ll @@ -55,7 +55,7 @@ ; CHECK-LABEL: {{^}}soffset_max_imm: ; CHECK: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 64 offen glc -define void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { +define amdgpu_gs void @soffset_max_imm([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) 
{ main_body: %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 @@ -74,7 +74,7 @@ ; CHECK-LABEL: {{^}}soffset_no_fold: ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x41 ; CHECK: buffer_load_dword v{{[0-9+]}}, v{{[0-9+]}}, s[{{[0-9]+}}:{{[0-9]+}}], [[SOFFSET]] offen glc -define void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) #1 { +define amdgpu_gs void @soffset_no_fold([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, [32 x <8 x i32>] addrspace(2)* byval, i32 inreg, i32 inreg, i32, i32, i32, i32, i32, i32, i32, i32) { main_body: %tmp0 = getelementptr [6 x <16 x i8>], [6 x <16 x i8>] addrspace(2)* %0, i32 0, i32 0 %tmp1 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp0 @@ -179,5 +179,5 @@ declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32) #3 declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -attributes #1 = { "ShaderType"="2" "unsafe-fp-math"="true" } +attributes #1 = { "unsafe-fp-math"="true" } attributes #3 = { nounwind readonly } Index: test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll =================================================================== --- test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll +++ test/CodeGen/AMDGPU/no-hsa-graphics-shaders.ll @@ -1,18 +1,14 @@ ; RUN: not llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa < %s 2>&1 | FileCheck %s ; CHECK: in function pixel_s{{.*}}: unsupported non-compute shaders with HSA -define void @pixel_shader() #0 { +define amdgpu_ps void @pixel_shader() #0 { ret void } -define void @vertex_shader() #1 { +define amdgpu_vs void @vertex_shader() #0 { ret void } -define void 
@geometry_shader() #2 { +define amdgpu_gs void @geometry_shader() #0 { ret void } - -attributes #0 = { nounwind "ShaderType"="0" } -attributes #1 = { nounwind "ShaderType"="1" } -attributes #2 = { nounwind "ShaderType"="2" } Index: test/CodeGen/AMDGPU/predicate-dp4.ll =================================================================== --- test/CodeGen/AMDGPU/predicate-dp4.ll +++ test/CodeGen/AMDGPU/predicate-dp4.ll @@ -3,7 +3,7 @@ ; CHECK-LABEL: {{^}}main: ; CHECK: PRED_SETE_INT * Pred, ; CHECK: DOT4 T{{[0-9]+}}.X, T0.X, T0.X, Pred_sel_one -define void @main(<4 x float> inreg) #0 { +define amdgpu_ps void @main(<4 x float> inreg) { main_body: %1 = extractelement <4 x float> %0, i32 0 %2 = bitcast float %1 to i32 @@ -24,4 +24,3 @@ declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1 declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) attributes #1 = { readnone } -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/pv-packing.ll =================================================================== --- test/CodeGen/AMDGPU/pv-packing.ll +++ test/CodeGen/AMDGPU/pv-packing.ll @@ -3,7 +3,7 @@ ;CHECK: DOT4 T{{[0-9]\.X}} ;CHECK: MULADD_IEEE * T{{[0-9]\.W}} -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) #0 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -41,5 +41,4 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="1" } attributes #1 = { readnone } Index: test/CodeGen/AMDGPU/pv.ll =================================================================== --- test/CodeGen/AMDGPU/pv.ll +++ test/CodeGen/AMDGPU/pv.ll @@ -3,7 +3,7 @@ ; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED) ; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X -define void @main(<4 x float> inreg %reg0, <4 x 
float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -235,7 +235,6 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="1" } attributes #1 = { readnone } attributes #2 = { readonly } attributes #3 = { nounwind readonly } Index: test/CodeGen/AMDGPU/r600-encoding.ll =================================================================== --- test/CodeGen/AMDGPU/r600-encoding.ll +++ test/CodeGen/AMDGPU/r600-encoding.ll @@ -10,7 +10,7 @@ ; R600: {{^}}test: ; R600: MUL_IEEE {{[ *TXYZWPVxyzw.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}] -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_ps void @test(<4 x float> inreg %reg0) { entry: %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = extractelement <4 x float> %reg0, i32 1 @@ -21,5 +21,3 @@ } declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/r600-export-fix.ll =================================================================== --- test/CodeGen/AMDGPU/r600-export-fix.ll +++ test/CodeGen/AMDGPU/r600-export-fix.ll @@ -10,7 +10,7 @@ ;CHECK: EXPORT T{{[0-9]}}.0000 -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -138,5 +138,3 @@ } declare void @llvm.R600.store.swizzle(<4 x float>, 
i32, i32) - -attributes #0 = { "ShaderType"="1" } Index: test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll =================================================================== --- test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll +++ test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll @@ -1,6 +1,6 @@ ;RUN: llc < %s -march=r600 -mcpu=cayman -define void @main(<4 x float> inreg, <4 x float> inreg) #0 { +define amdgpu_ps void @main(<4 x float> inreg, <4 x float> inreg) { main_body: %2 = extractelement <4 x float> %0, i32 0 %3 = extractelement <4 x float> %0, i32 1 @@ -54,5 +54,4 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="0" } attributes #1 = { readnone } Index: test/CodeGen/AMDGPU/r600cfg.ll =================================================================== --- test/CodeGen/AMDGPU/r600cfg.ll +++ test/CodeGen/AMDGPU/r600cfg.ll @@ -1,6 +1,6 @@ ;RUN: llc < %s -march=r600 -mcpu=redwood -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -115,5 +115,3 @@ declare void @llvm.R600.store.stream.output(<4 x float>, i32, i32, i32) declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } Index: test/CodeGen/AMDGPU/reciprocal.ll =================================================================== --- test/CodeGen/AMDGPU/reciprocal.ll +++ test/CodeGen/AMDGPU/reciprocal.ll @@ -2,7 +2,7 @@ ;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_ps void @test(<4 x float> inreg %reg0) { %r0 = extractelement <4 x float> %reg0, i32 0 %r1 = fdiv float 1.0, %r0 %vec = insertelement <4 x float> undef, float %r1, i32 0 @@ -11,5 +11,3 @@ } declare void 
@llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="0" } Index: test/CodeGen/AMDGPU/ret.ll =================================================================== --- test/CodeGen/AMDGPU/ret.ll +++ test/CodeGen/AMDGPU/ret.ll @@ -1,8 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s -attributes #0 = { "ShaderType"="1" } - declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) ; GCN-LABEL: {{^}}vgpr: @@ -11,7 +9,7 @@ ; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1 ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm -define {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 { +define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) %x = fadd float %3, 1.0 %a = insertvalue {float, float} undef, float %x, 0 @@ -28,7 +26,7 @@ ; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4 ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm -define {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 { +define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0} } @@ -45,8 +43,8 @@ ; GCN: v_mov_b32_e32 v3, v4 ; GCN: v_mov_b32_e32 v4, v6 ; GCN-NOT: s_endpgm -attributes #1 = { "ShaderType"="0" "InitialPSInputAddr"="0" } -define {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, 
float, float, float, float, float, float, float, float, float) #1 { +attributes #0 = { "InitialPSInputAddr"="0" } +define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { %i0 = extractelement <2 x i32> %4, i32 0 %i1 = extractelement <2 x i32> %4, i32 1 %i2 = extractelement <2 x i32> %7, i32 0 @@ -71,7 +69,7 @@ ; GCN-LABEL: {{^}}ps_input_ena_no_inputs: ; GCN: v_mov_b32_e32 v0, 1.0 ; GCN-NOT: s_endpgm -define float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 { +define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { ret float 1.0 } @@ -85,7 +83,7 @@ ; GCN-DAG: v_mov_b32_e32 v1, v2 ; GCN: v_mov_b32_e32 v2, v3 ; GCN-NOT: s_endpgm -define {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 { +define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { %f = bitcast <2 x i32> %8 to <2 x float> %s = insertvalue {float, <2 x float>} undef, float %14, 0 %s1 = insertvalue {float, <2 x float>} %s, <2 x float> %f, 1 @@ -104,8 +102,8 @@ ; GCN-DAG: v_mov_b32_e32 v3, v6 ; GCN-DAG: v_mov_b32_e32 v4, v8 ; GCN-NOT: 
s_endpgm -attributes #2 = { "ShaderType"="0" "InitialPSInputAddr"="1" } -define {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 { +attributes #1 = { "InitialPSInputAddr"="1" } +define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 { %i0 = extractelement <2 x i32> %4, i32 0 %i1 = extractelement <2 x i32> %4, i32 1 %i2 = extractelement <2 x i32> %7, i32 0 @@ -134,8 +132,8 @@ ; GCN: v_mov_b32_e32 v3, v8 ; GCN: v_mov_b32_e32 v4, v12 ; GCN-NOT: s_endpgm -attributes #3 = { "ShaderType"="0" "InitialPSInputAddr"="119" } -define {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 { +attributes #2 = { "InitialPSInputAddr"="119" } +define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 { %i0 = extractelement <2 x i32> %4, i32 0 %i1 = extractelement <2 x i32> %4, i32 1 %i2 = extractelement <2 x i32> %7, i32 0 @@ -164,8 +162,8 @@ ; GCN: v_mov_b32_e32 v3, v4 ; GCN: v_mov_b32_e32 v4, v8 ; GCN-NOT: s_endpgm -attributes #4 = { "ShaderType"="0" "InitialPSInputAddr"="418" } -define {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 
x i32>, float, float, float, float, float, float, float, float, float) #4 { +attributes #3 = { "InitialPSInputAddr"="418" } +define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 { %i0 = extractelement <2 x i32> %4, i32 0 %i1 = extractelement <2 x i32> %4, i32 1 %i2 = extractelement <2 x i32> %7, i32 0 @@ -187,7 +185,7 @@ ; GCN: s_add_i32 s0, s3, 2 ; GCN: s_mov_b32 s2, s3 ; GCN-NOT: s_endpgm -define {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 { +define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { %x = add i32 %2, 2 %a = insertvalue {i32, i32, i32} undef, i32 %x, 0 %b = insertvalue {i32, i32, i32} %a, i32 %1, 1 @@ -203,7 +201,7 @@ ; GCN-DAG: s_mov_b32 s2, 7 ; GCN-DAG: s_mov_b32 s3, 8 ; GCN-NOT: s_endpgm -define {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 { +define amdgpu_vs {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { %x = add i32 %2, 2 ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8} } @@ -218,7 +216,7 @@ ; GCN: s_mov_b32 s2, s3 ; GCN: s_waitcnt expcnt(0) ; GCN-NOT: s_endpgm -define {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 { +define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) %v = fadd float %3, 1.0 %s = add i32 %2, 2 @@ -239,7 +237,7 @@ ; GCN-DAG: v_mov_b32_e32 v1, 2.0 ; GCN-DAG: v_mov_b32_e32 v2, 4.0 ; GCN-DAG: exp 15, 0, 1, 1, 1, v3, v3, v3, v3 -define {{float, i32}, {i32, <2 x float>}} 
@structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 { +define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) { call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3) ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> }} } Index: test/CodeGen/AMDGPU/ret_jump.ll =================================================================== --- test/CodeGen/AMDGPU/ret_jump.ll +++ test/CodeGen/AMDGPU/ret_jump.ll @@ -12,7 +12,7 @@ ; ModuleID = 'bugpoint-reduced-simplified.bc' target triple = "amdgcn--" -define <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 { +define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 { main_body: %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7) %p87 = fmul float undef, %p83 @@ -53,5 +53,5 @@ ; Function Attrs: nounwind readnone declare float @llvm.floor.f32(float) #1 -attributes #0 = { "InitialPSInputAddr"="36983" "ShaderType"="0" } +attributes #0 = { "InitialPSInputAddr"="36983" } attributes #1 = { 
nounwind readnone } Index: test/CodeGen/AMDGPU/rv7x0_count3.ll =================================================================== --- test/CodeGen/AMDGPU/rv7x0_count3.ll +++ test/CodeGen/AMDGPU/rv7x0_count3.ll @@ -2,7 +2,7 @@ ; CHECK: TEX 9 @6 ; encoding: [0x06,0x00,0x00,0x00,0x00,0x04,0x88,0x80] -define void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +define amdgpu_vs void @test(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { %1 = extractelement <4 x float> %reg1, i32 0 %2 = extractelement <4 x float> %reg1, i32 1 %3 = extractelement <4 x float> %reg1, i32 2 @@ -37,5 +37,3 @@ declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) readnone declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } Index: test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll +++ test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs ;REQUIRES: asserts -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #1 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -79,4 +79,3 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) attributes #0 = { readnone } -attributes #1 = { "ShaderType"="1" } Index: test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll +++ test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll @@ -7,7 +7,7 @@ ; SI-LABEL: {{^}}main( -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { 
main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 2 @@ -159,5 +159,3 @@ %115 = fadd float %temp4.0, 1.000000e+00 br label %Flow1 } - -attributes #0 = { "ShaderType"="1" } Index: test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll +++ test/CodeGen/AMDGPU/schedule-vs-if-nested-loop.ll @@ -1,7 +1,7 @@ ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched ;REQUIRES: asserts -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -128,5 +128,3 @@ } declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } Index: test/CodeGen/AMDGPU/sgpr-copy.ll =================================================================== --- test/CodeGen/AMDGPU/sgpr-copy.ll +++ test/CodeGen/AMDGPU/sgpr-copy.ll @@ -11,7 +11,7 @@ ; CHECK-LABEL: {{^}}phi1: ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]] -define void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float 
%arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -35,7 +35,7 @@ ; Make sure this program doesn't crash ; CHECK-LABEL: {{^}}phi2: -define void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -156,7 +156,7 @@ ; We just want ot make sure the program doesn't crash ; CHECK-LABEL: {{^}}loop: -define void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x 
i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -227,7 +227,7 @@ ; CHECK: image_sample ; CHECK: exp ; CHECK: s_endpgm -define void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -291,7 +291,7 @@ ; This test is just checking that we don't crash / assertion fail. 
; CHECK-LABEL: {{^}}copy2: ; CHECK: s_endpgm -define void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { entry: br label %LOOP68 @@ -321,7 +321,7 @@ ; CHECK: image_sample ; CHECK: image_sample ; CHECK: s_endpgm -define void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { +define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float 
%arg20, float %arg21) #0 { bb: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0 %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !2 @@ -365,7 +365,7 @@ ; Check the the resource descriptor is stored in an sgpr. ; CHECK-LABEL: {{^}}mimg_srsrc_sgpr: ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 -define void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { +define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0 @@ -380,7 +380,7 @@ ; Check the the sampler is stored in an sgpr. ; CHECK-LABEL: {{^}}mimg_ssamp_sgpr: ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 -define void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 { +define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 { %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0 @@ -394,7 +394,7 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 -attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" } +attributes #0 = { "unsafe-fp-math"="true" } attributes #1 = { nounwind readnone } attributes #2 = { readonly } attributes #3 = { readnone } Index: test/CodeGen/AMDGPU/shared-op-cycle.ll =================================================================== --- test/CodeGen/AMDGPU/shared-op-cycle.ll +++ test/CodeGen/AMDGPU/shared-op-cycle.ll @@ -4,7 +4,7 @@ ; CHECK: MULADD_IEEE * ; CHECK-NOT: MULADD_IEEE * -define void @main(<4 x float> inreg %reg0, <4 x float> inreg 
%reg1, <4 x float> inreg %reg2) #0 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2) { %w0 = extractelement <4 x float> %reg0, i32 3 %w1 = extractelement <4 x float> %reg1, i32 3 %w2 = extractelement <4 x float> %reg2, i32 3 @@ -28,5 +28,4 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="1" } -attributes #1 = { readnone } \ No newline at end of file +attributes #1 = { readnone } Index: test/CodeGen/AMDGPU/si-literal-folding.ll =================================================================== --- test/CodeGen/AMDGPU/si-literal-folding.ll +++ test/CodeGen/AMDGPU/si-literal-folding.ll @@ -4,7 +4,7 @@ ; CHECK-LABEL: {{^}}main: ; CHECK-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0xbf4353f8 -define void @main(float) #0 { +define amdgpu_vs void @main(float) { main_body: %1 = fmul float %0, 0x3FE86A7F00000000 %2 = fmul float %0, 0xBFE86A7F00000000 @@ -13,5 +13,3 @@ } declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) - -attributes #0 = { "ShaderType"="1" } Index: test/CodeGen/AMDGPU/si-lod-bias.ll =================================================================== --- test/CodeGen/AMDGPU/si-lod-bias.ll +++ test/CodeGen/AMDGPU/si-lod-bias.ll @@ -6,7 +6,7 @@ ; CHECK: {{^}}main: ; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf -define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 { +define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> 
%arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -45,7 +45,7 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } + attributes #1 = { nounwind readnone } !0 = !{!1, !1, i64 0, i32 1} Index: test/CodeGen/AMDGPU/si-scheduler.ll =================================================================== --- test/CodeGen/AMDGPU/si-scheduler.ll +++ test/CodeGen/AMDGPU/si-scheduler.ll @@ -11,7 +11,7 @@ ; CHECK: s_waitcnt vmcnt(0) ; CHECK: exp ; CHECK: s_endpgm -define void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { +define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 { main_body: %tmp = bitcast [34 x <8 x i32>] addrspace(2)* %arg3 to <32 x i8> addrspace(2)* %tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0 @@ -49,7 
+49,6 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" } attributes #1 = { nounwind readnone } !0 = !{!1, !1, i64 0, i32 1} Index: test/CodeGen/AMDGPU/si-sgpr-spill.ll =================================================================== --- test/CodeGen/AMDGPU/si-sgpr-spill.ll +++ test/CodeGen/AMDGPU/si-sgpr-spill.ll @@ -22,7 +22,7 @@ ; Writing to M0 from an SMRD instruction will hang the GPU. ; CHECK-NOT: s_buffer_load_dword m0 ; CHECK: s_endpgm -define void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -668,7 +668,7 @@ ; CHECK-LABEL: {{^}}main1: ; CHECK: s_endpgm -define void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float 
%arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 { +define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) { main_body: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0 %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -1610,7 +1610,6 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } attributes #1 = { readnone } attributes #2 = { nounwind readnone } attributes #3 = { nounwind readonly } Index: test/CodeGen/AMDGPU/si-spill-cf.ll =================================================================== --- test/CodeGen/AMDGPU/si-spill-cf.ll +++ test/CodeGen/AMDGPU/si-spill-cf.ll @@ -6,7 +6,7 @@ ; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]] ; SI-NOT: v_readlane_b32 [[SAVED]] -define void @main() #1 { +define amdgpu_ps void @main() { main_body: %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16) %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32) @@ -510,5 +510,5 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { alwaysinline nounwind readnone } -attributes #1 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" } +attributes #1 = { "enable-no-nans-fp-math"="true" } attributes #2 = { nounwind readnone } Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ 
test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -2,7 +2,7 @@ declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -declare void @llvm.amdgcn.s.barrier() #2 +declare void @llvm.amdgcn.s.barrier() #1 @stored_lds_ptr = addrspace(3) global i32 addrspace(3)* undef, align 4 @@ -61,7 +61,7 @@ %tmp1 = load i32, i32 addrspace(3)* %ptr1, align 4 store i32 99, i32 addrspace(1)* %gptr, align 4 - call void @llvm.amdgcn.s.barrier() #2 + call void @llvm.amdgcn.s.barrier() #1 %tmp2 = load i32, i32 addrspace(3)* %ptr2, align 4 %add = add nsw i32 %tmp1, %tmp2 @@ -212,7 +212,7 @@ ; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x4 ; XCI: TBUFFER_STORE_FORMAT ; XCI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}}, 0x8 -; define void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #1 { +; define amdgpu_vs void @reorder_local_load_tbuffer_store_local_load(i32 addrspace(1)* %out, i32 %a1, i32 %vaddr) #0 { ; %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4 ; %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1 @@ -234,5 +234,4 @@ ; } attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #2 = { nounwind convergent } +attributes #1 = { nounwind convergent } Index: test/CodeGen/AMDGPU/smrd.ll 
=================================================================== --- test/CodeGen/AMDGPU/smrd.ll +++ test/CodeGen/AMDGPU/smrd.ll @@ -88,7 +88,7 @@ ; GCN-LABEL: {{^}}smrd_load_const0: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10 -define void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { main_body: %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 @@ -102,7 +102,7 @@ ; GCN-LABEL: {{^}}smrd_load_const1: ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc -define void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { main_body: %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 @@ -118,7 +118,7 @@ ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 
s[[OFFSET]] ; encoding: [0x0[[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400 -define void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { main_body: %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 @@ -133,7 +133,7 @@ ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc -define void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { main_body: %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 @@ -148,7 +148,7 @@ ; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]] ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000 ; GCN: s_endpgm -define void @smrd_load_const4(<16 x i8> 
addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 { +define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) { main_body: %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0 %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20 @@ -158,9 +158,8 @@ } ; Function Attrs: nounwind readnone -declare float @llvm.SI.load.const(<16 x i8>, i32) #1 +declare float @llvm.SI.load.const(<16 x i8>, i32) #0 declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } -attributes #1 = { nounwind readnone } +attributes #0 = { nounwind readnone } Index: test/CodeGen/AMDGPU/split-smrd.ll =================================================================== --- test/CodeGen/AMDGPU/split-smrd.ll +++ test/CodeGen/AMDGPU/split-smrd.ll @@ -6,7 +6,7 @@ ; CHECK-LABEL: {{^}}split_smrd_add_worklist: ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1 -define void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { +define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 { bb: %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96) %tmp1 = bitcast float %tmp to i32 @@ -38,7 +38,7 @@ declare i32 @llvm.SI.packf16(float, float) #1 -attributes #0 = { "ShaderType"="0" "unsafe-fp-math"="true" } +attributes #0 = { "unsafe-fp-math"="true" } attributes #1 = { nounwind readnone } !0 = !{!1, !1, i64 0, i32 1} Index: test/CodeGen/AMDGPU/subreg-coalescer-crash.ll 
=================================================================== --- test/CodeGen/AMDGPU/subreg-coalescer-crash.ll +++ test/CodeGen/AMDGPU/subreg-coalescer-crash.ll @@ -44,7 +44,7 @@ ; SI-LABEL: {{^}}foo: ; SI: s_endpgm -define void @foo() #0 { +define amdgpu_ps void @foo() #0 { bb: br i1 undef, label %bb2, label %bb1 @@ -105,5 +105,5 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" } +attributes #0 = { "enable-no-nans-fp-math"="true" "unsafe-fp-math"="true" } attributes #1 = { nounwind readnone } Index: test/CodeGen/AMDGPU/swizzle-export.ll =================================================================== --- test/CodeGen/AMDGPU/swizzle-export.ll +++ test/CodeGen/AMDGPU/swizzle-export.ll @@ -6,7 +6,7 @@ ;EG: EXPORT T{{[0-9]+}}.XXWX ;EG: EXPORT T{{[0-9]+}}.XXXW -define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -96,7 +96,7 @@ ; EG: T{{[0-9]+}}.XY__ ; EG: T{{[0-9]+}}.ZXY0 -define void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 { +define amdgpu_vs void @main2(<4 x float> inreg %reg0, <4 x float> inreg %reg1) { main_body: %0 = extractelement <4 x float> %reg1, i32 0 %1 = extractelement <4 x float> %reg1, i32 1 @@ -125,5 +125,4 @@ declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) -attributes #0 = { "ShaderType"="1" } attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/tex-clause-antidep.ll =================================================================== --- test/CodeGen/AMDGPU/tex-clause-antidep.ll +++ test/CodeGen/AMDGPU/tex-clause-antidep.ll @@ -3,7 +3,7 @@ ;CHECK: TEX ;CHECK-NEXT: ALU -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_vs void @test(<4 x float> inreg %reg0) { 
%1 = extractelement <4 x float> %reg0, i32 0 %2 = extractelement <4 x float> %reg0, i32 1 %3 = extractelement <4 x float> %reg0, i32 2 @@ -21,5 +21,3 @@ declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } Index: test/CodeGen/AMDGPU/texture-input-merge.ll =================================================================== --- test/CodeGen/AMDGPU/texture-input-merge.ll +++ test/CodeGen/AMDGPU/texture-input-merge.ll @@ -2,7 +2,7 @@ ;CHECK-NOT: MOV -define void @test(<4 x float> inreg %reg0) #0 { +define amdgpu_vs void @test(<4 x float> inreg %reg0) { %1 = extractelement <4 x float> %reg0, i32 0 %2 = extractelement <4 x float> %reg0, i32 1 %3 = extractelement <4 x float> %reg0, i32 2 @@ -27,5 +27,3 @@ declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) readnone declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32) - -attributes #0 = { "ShaderType"="1" } Index: test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll +++ test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll @@ -5,7 +5,7 @@ ;CHECK-LABEL: {{^}}test1: ;CHECK: s_cbranch_execz ;CHECK: %loop_body -define void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) #0 { +define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <2 x i32> %addr.base, i32 %y, i32 %p) { main_body: %cc = icmp eq i32 %p, 0 br i1 %cc, label %out, label %loop_body @@ -58,5 +58,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1 -attributes #0 = { "ShaderType"="0" } attributes #1 = { nounwind readonly } Index: test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll =================================================================== --- test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll +++ 
test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll @@ -24,7 +24,7 @@ ; GCN: NumVgprs: 256 ; GCN: ScratchSize: 1024 -define void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 { +define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) { bb: %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i64 0, i64 0 %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0 @@ -491,7 +491,7 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1 -attributes #0 = { "ShaderType"="1" "enable-no-nans-fp-math"="true" } +attributes #0 = { "enable-no-nans-fp-math"="true" } attributes #1 = { nounwind readnone } !0 = !{!1, !1, i64 0, i32 1} Index: test/CodeGen/AMDGPU/wait.ll =================================================================== --- test/CodeGen/AMDGPU/wait.ll +++ test/CodeGen/AMDGPU/wait.ll @@ -11,7 +11,7 @@ ; DEFAULT: exp ; DEFAULT: s_waitcnt lgkmcnt(0) ; DEFAULT: s_endpgm -define void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 { +define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 
%arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) { main_body: %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0 %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0 @@ -45,8 +45,8 @@ ; ILPMAX: s_waitcnt vmcnt(0) ; ILPMAX: s_endpgm -define void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* -byval, i32 inreg, i32 inreg, i32, i32, i32, i32) #0 { +define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)* +byval, i32 inreg, i32 inreg, i32, i32, i32, i32) { main_body: %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0 %12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0 @@ -78,7 +78,6 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="1" } attributes #1 = { convergent nounwind } attributes #2 = { nounwind readnone } Index: test/CodeGen/AMDGPU/wqm.ll =================================================================== --- test/CodeGen/AMDGPU/wqm.ll +++ test/CodeGen/AMDGPU/wqm.ll @@ -5,7 +5,7 @@ ; ;CHECK-LABEL: {{^}}test1: ;CHECK-NOT: s_wqm -define <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +define amdgpu_ps <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) { main_body: %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) @@ -20,7 +20,7 @@ ;CHECK: image_sample ;CHECK-NOT: exec ;CHECK: _load_dword v0, -define float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) 
#0 { +define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) { main_body: %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %c.2 = bitcast <4 x float> %c.1 to <4 x i32> @@ -40,7 +40,7 @@ ;CHECK: s_and_b64 exec, exec, [[ORIG]] ;CHECK: store ;CHECK-NOT: exec -define <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 { +define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tex.1 = bitcast <4 x float> %tex to <4 x i32> @@ -62,7 +62,7 @@ ;CHECK: store ;CHECK: s_wqm_b64 exec, exec ;CHECK: image_sample v[0:3], [[MUL]], s[0:7], s[8:11] dmask:0xf -define <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) #0 { +define amdgpu_ps <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) { main_body: %c.1 = mul i32 %c, %d %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1 @@ -88,7 +88,7 @@ ;CHECK: s_mov_b64 exec, [[SAVED]] ;CHECK: %IF ;CHECK: image_sample -define float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 { +define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %IF, label %ELSE @@ -124,7 +124,7 @@ ;CHECK-NEXT: %ELSE ;CHECK: store ;CHECK: %END -define float @test_control_flow_1(<8 x 
i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 { +define amdgpu_ps float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) { main_body: %cmp = icmp eq i32 %z, 0 br i1 %cmp, label %ELSE, label %IF @@ -158,7 +158,7 @@ ;CHECK: store ;CHECK: s_wqm_b64 exec, exec ;CHECK: v_cmp -define <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 { +define amdgpu_ps <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) { main_body: %idx.1 = extractelement <3 x i32> %idx, i32 0 %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 @@ -205,7 +205,7 @@ ;CHECK: load ;CHECK: store ;CHECK: v_cmp -define float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 { +define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) %tex.1 = extractelement <4 x float> %tex, i32 0 @@ -253,7 +253,7 @@ ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: %END ;CHECK: image_sample -define <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) #0 { +define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) { main_body: %cond = icmp eq i32 %y, 0 br i1 %cond, label %IF, label %END @@ 
-286,7 +286,7 @@ ;VI: flat_store_dword ;CHECK: s_mov_b64 exec, [[SAVE]] ;CHECK: image_sample -define <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) #0 { +define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -320,7 +320,7 @@ ;VI: flat_store_dword ;CHECK-NOT: wqm ;CHECK: v_cmpx_ -define <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) #0 { +define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) { main_body: %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) @@ -342,7 +342,6 @@ declare void @llvm.AMDGPU.kill(float) declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) -attributes #0 = { "ShaderType"="0" } attributes #1 = { nounwind } attributes #2 = { nounwind readonly } attributes #3 = { nounwind readnone }