Index: lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -563,6 +563,8 @@
     OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
     OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
+    OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
+    OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
     OutStreamer->EmitIntValue(MFI->PSInputAddr, 4);
   }
 }
Index: lib/Target/AMDGPU/AMDGPUCallingConv.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -20,28 +20,83 @@
   CCIfInReg<CCIfType<[f32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
-    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+    SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
   ]>>>,
 
   CCIfInReg<CCIfType<[i32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
-    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+    SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
   ]>>>,
 
+  // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
   CCIfNotInReg<CCIfType<[f32] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
-    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+    VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+    VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+    VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+    VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+    VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+    VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+    VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+    VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+    VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+    VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+    VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+    VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+    VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
   ]>>>,
 
   CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
     [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14 ],
     [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15 ]
   >>>
 
 ]>;
 
+def RetCC_SI : CallingConv<[
+  CCIfType<[i32] , CCAssignToReg<[
+    SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
+    SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
+    SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
+    SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
+  ]>>,
+
+  // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
+  CCIfType<[f32] , CCAssignToReg<[
+    VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
+    VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
+    VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
+    VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31,
+    VGPR32, VGPR33, VGPR34, VGPR35, VGPR36, VGPR37, VGPR38, VGPR39,
+    VGPR40, VGPR41, VGPR42, VGPR43, VGPR44, VGPR45, VGPR46, VGPR47,
+    VGPR48, VGPR49, VGPR50, VGPR51, VGPR52, VGPR53, VGPR54, VGPR55,
+    VGPR56, VGPR57, VGPR58, VGPR59, VGPR60, VGPR61, VGPR62, VGPR63,
+    VGPR64, VGPR65, VGPR66, VGPR67, VGPR68, VGPR69, VGPR70, VGPR71,
+    VGPR72, VGPR73, VGPR74, VGPR75, VGPR76, VGPR77, VGPR78, VGPR79,
+    VGPR80, VGPR81, VGPR82, VGPR83, VGPR84, VGPR85, VGPR86, VGPR87,
+    VGPR88, VGPR89, VGPR90, VGPR91, VGPR92, VGPR93, VGPR94, VGPR95,
+    VGPR96, VGPR97, VGPR98, VGPR99, VGPR100, VGPR101, VGPR102, VGPR103,
+    VGPR104, VGPR105, VGPR106, VGPR107, VGPR108, VGPR109, VGPR110, VGPR111,
+    VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
+    VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
+    VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
+  ]>>
+]>;
+
 // Calling convention for R600
 def CC_R600 : CallingConv<[
   CCIfInReg<CCIfType<[v4f32, v4i32] , CCAssignToReg<[
Index: lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ ... @@
                                SmallVectorImpl<ISD::InputArg> &OrigIns) const;
   void AnalyzeFormalArguments(CCState &State,
                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
+  void AnalyzeReturn(CCState &State,
+                     const SmallVectorImpl<ISD::OutputArg> &Outs) const;
 
 public:
   AMDGPUTargetLowering(TargetMachine &TM, const AMDGPUSubtarget &STI);
Index: lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -565,6 +565,12 @@
   State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
 }
 
+void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs) const {
+
+  State.AnalyzeReturn(Outs, RetCC_SI);
+}
+
 SDValue AMDGPUTargetLowering::LowerReturn(
                                      SDValue Chain,
                                      CallingConv::ID CallConv,
Index: lib/Target/AMDGPU/AMDGPUInstrInfo.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -240,4 +240,4 @@
 // Call/Return DAG Nodes
 //===----------------------------------------------------------------------===//
 def IL_retflag       : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
-                              [SDNPHasChain, SDNPOptInGlue]>;
+                              [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
Index: lib/Target/AMDGPU/SIDefines.h
===================================================================
--- lib/Target/AMDGPU/SIDefines.h
+++ lib/Target/AMDGPU/SIDefines.h
@@ -137,7 +137,7 @@
 #define   C_00B84C_EXCP_EN
 
 #define R_0286CC_SPI_PS_INPUT_ENA                                     0x0286CC
-
+#define R_0286D0_SPI_PS_INPUT_ADDR                                    0x0286D0
 #define R_00B848_COMPUTE_PGM_RSRC1                                    0x00B848
 #define   S_00B848_VGPRS(x)                                           (((x) & 0x3F) << 0)
Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -95,6 +95,13 @@
                                SDLoc DL, SelectionDAG &DAG,
                                SmallVectorImpl<SDValue> &InVals) const override;
 
+  SDValue LowerReturn(SDValue Chain,
+                      CallingConv::ID CallConv,
+                      bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals,
+                      SDLoc DL, SelectionDAG &DAG) const override;
+
   MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
                                                   MachineBasicBlock * BB) const override;
   bool enableAggressiveFMAFusion(EVT VT) const override;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -598,18 +598,18 @@
 
     // First check if it's a PS input addr
     if (Info->getShaderType() == ShaderType::PIXEL && !Arg.Flags.isInReg() &&
-        !Arg.Flags.isByVal()) {
+        !Arg.Flags.isByVal() && PSInputNum <= 15) {
 
-      assert((PSInputNum <= 15) && "Too many PS inputs!");
-
-      if (!Arg.Used) {
+      if (!Arg.Used && !(Info->PSInputAddr & (1 << PSInputNum))) {
         // We can safely skip PS inputs
         Skipped.set(i);
         ++PSInputNum;
         continue;
       }
 
-      Info->PSInputAddr |= 1 << PSInputNum++;
+      Info->PSInputEna |= 1 << PSInputNum;
+      Info->PSInputAddr |= 1 << PSInputNum;
+      ++PSInputNum;
     }
 
     // Second split vertices into their elements
@@ -640,10 +640,14 @@
 
   // At least one interpolation mode must be enabled or else the GPU will hang.
   if (Info->getShaderType() == ShaderType::PIXEL &&
-      (Info->PSInputAddr & 0x7F) == 0) {
-    Info->PSInputAddr |= 1;
-    CCInfo.AllocateReg(AMDGPU::VGPR0);
-    CCInfo.AllocateReg(AMDGPU::VGPR1);
+      (Info->PSInputEna & 0x7F) == 0) {
+    Info->PSInputEna |= 1;
+
+    if (!(Info->PSInputAddr & 0x1)) {
+      Info->PSInputAddr |= 1;
+      CCInfo.AllocateReg(AMDGPU::VGPR0);
+      CCInfo.AllocateReg(AMDGPU::VGPR1);
+    }
   }
 
   if (Info->getShaderType() == ShaderType::COMPUTE) {
@@ -872,6 +876,92 @@
 
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
 
+SDValue SITargetLowering::LowerReturn(SDValue Chain,
+                                      CallingConv::ID CallConv,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                      const SmallVectorImpl<SDValue> &OutVals,
+                                      SDLoc DL, SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  Info->ReturnsVoid = Outs.size() == 0;
+
+  SmallVector<ISD::OutputArg, 48> Splits;
+  SmallVector<SDValue, 48> SplitVals;
+
+  // Split vectors into their elements.
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    const ISD::OutputArg &Out = Outs[i];
+
+    if (Out.VT.isVector()) {
+      MVT VT = Out.VT.getVectorElementType();
+      ISD::OutputArg NewOut = Out;
+      NewOut.Flags.setSplit();
+      NewOut.VT = VT;
+
+      // We want the original number of vector elements here, e.g.
+      // three or five, not four or eight.
+      unsigned NumElements = Out.ArgVT.getVectorNumElements();
+
+      for (unsigned j = 0; j != NumElements; ++j) {
+        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
+                                   DAG.getConstant(j, DL, MVT::i32));
+        SplitVals.push_back(Elem);
+        Splits.push_back(NewOut);
+        NewOut.PartOffset += NewOut.VT.getStoreSize();
+      }
+    } else {
+      SplitVals.push_back(OutVals[i]);
+      Splits.push_back(Out);
+    }
+  }
+
+  // CCValAssign - represent the assignment of the return value to a location.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slots.
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
+
+  // Analyze outgoing return values.
+  AnalyzeReturn(CCInfo, Splits);
+
+  SDValue Flag;
+  SmallVector<SDValue, 48> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0, realRVLocIdx = 0;
+       i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = SplitVals[realRVLocIdx];
+
+    // Copied from other backends.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
+
+  // Update chain and glue.
+  RetOps[0] = Chain;
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, RetOps);
+}
+
 MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const {
@@ -2292,7 +2382,9 @@
 
   SmallVector<SDValue, 8> Ops;
   for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
-    if (!isFrameIndexOp(Node->getOperand(i))) {
+    if (!isFrameIndexOp(Node->getOperand(i)) &&
+        (Node->getOpcode() != ISD::MERGE_VALUES ||
+         !isa<ConstantSDNode>(Node->getOperand(i)))) {
       Ops.push_back(Node->getOperand(i));
       continue;
     }
Index: lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaits.cpp
+++ lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -84,6 +84,9 @@
 
   bool LastInstWritesM0;
 
+  /// \brief Whether the machine function returns void
+  bool ReturnsVoid;
+
   /// \brief Get increment/decrement amount for this instruction.
   Counters getHwCounts(MachineInstr &MI);
 
@@ -322,7 +325,9 @@
                              const Counters &Required) {
 
   // End of program? No need to wait on anything
-  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM)
+  // A function not returning void needs to wait, because other bytecode will
+  // be appended after it and we don't know what it will be.
+  if (I != MBB.end() && I->getOpcode() == AMDGPU::S_ENDPGM && ReturnsVoid)
     return false;
 
   // Figure out if the async instructions execute in order
@@ -465,6 +470,7 @@
   LastIssued = ZeroCounts;
   LastOpcodeType = OTHER;
   LastInstWritesM0 = false;
+  ReturnsVoid = MF.getInfo<SIMachineFunctionInfo>()->ReturnsVoid;
 
   memset(&UsedRegs, 0, sizeof(UsedRegs));
   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
@@ -488,6 +494,15 @@
 
     // Wait for everything at the end of the MBB
     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+
+    // Functions returning something shouldn't contain S_ENDPGM, because other
+    // bytecode will be appended after it.
+    if (!ReturnsVoid) {
+      MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+      assert(I != MBB.end());
+      if (I->getOpcode() == AMDGPU::S_ENDPGM)
+        I->eraseFromParent();
+    }
   }
 
   return Changes;
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -60,11 +60,13 @@
 public:
   // FIXME: Make private
   unsigned LDSWaveSpillSize;
+  unsigned PSInputEna;
   unsigned PSInputAddr;
   std::map<unsigned, unsigned> LaneVGPRs;
   unsigned ScratchOffsetReg;
   unsigned NumUserSGPRs;
   unsigned NumSystemSGPRs;
+  bool ReturnsVoid;
 
 private:
   bool HasSpilledSGPRs;
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -47,9 +47,11 @@
     WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
     PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     LDSWaveSpillSize(0),
+    PSInputEna(0),
     PSInputAddr(0),
     NumUserSGPRs(0),
     NumSystemSGPRs(0),
+    ReturnsVoid(true),
     HasSpilledSGPRs(false),
     HasSpilledVGPRs(false),
     PrivateSegmentBuffer(false),
@@ -72,6 +74,8 @@
   const AMDGPUSubtarget &ST = MF.getSubtarget<AMDGPUSubtarget>();
   const Function *F = MF.getFunction();
 
+  PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+
   const MachineFrameInfo *FrameInfo = MF.getFrameInfo();
 
   if (getShaderType() == ShaderType::COMPUTE)
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -45,6 +45,8 @@
 bool isReadOnlySegment(const GlobalValue *GV);
 
 unsigned getShaderType(const Function &F);
+unsigned getInitialPSInputAddr(const Function &F);
+
 bool isSI(const MCSubtargetInfo &STI);
 bool isCI(const MCSubtargetInfo &STI);
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -106,20 +106,27 @@
   return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
 }
 
-static const char ShaderTypeAttribute[] = "ShaderType";
-
-unsigned getShaderType(const Function &F) {
-  Attribute A = F.getFnAttribute(ShaderTypeAttribute);
-  unsigned ShaderType = ShaderType::COMPUTE;
+static unsigned getIntegerAttribute(const Function &F, const char *Name,
+                                    unsigned Default) {
+  Attribute A = F.getFnAttribute(Name);
+  unsigned Result = Default;
 
   if (A.isStringAttribute()) {
     StringRef Str = A.getValueAsString();
-    if (Str.getAsInteger(0, ShaderType)) {
+    if (Str.getAsInteger(0, Result)) {
       LLVMContext &Ctx = F.getContext();
       Ctx.emitError("can't parse shader type");
     }
   }
-  return ShaderType;
+  return Result;
+}
+
+unsigned getShaderType(const Function &F) {
+  return getIntegerAttribute(F, "ShaderType", ShaderType::COMPUTE);
+}
+
+unsigned getInitialPSInputAddr(const Function &F) {
+  return getIntegerAttribute(F, "InitialPSInputAddr", 0);
 }
 
 bool isSI(const MCSubtargetInfo &STI) {
Index: test/CodeGen/AMDGPU/ret.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/ret.ll
@@ -0,0 +1,200 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+attributes #0 = { "ShaderType"="1" }
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+
+; GCN-LABEL: {{^}}vgpr:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
+; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
+; GCN: s_waitcnt expcnt(0)
+; GCN-NOT: s_endpgm
+define {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+  %x = fadd float %3, 1.0
+  %a = insertvalue {float, float} undef, float %x, 0
+  %b = insertvalue {float, float} %a, float %3, 1
+  ret {float, float} %b
+}
+
+; GCN-LABEL: {{^}}vgpr_literal:
+; GCN: v_mov_b32_e32 v4, v0
+; GCN-DAG: v_mov_b32_e32 v0, 1.0
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN-DAG: v_mov_b32_e32 v2, 4.0
+; GCN-DAG: v_mov_b32_e32 v3, -1.0
+; GCN: exp 15, 0, 1, 1, 1, v4, v4, v4, v4
+; GCN: s_waitcnt expcnt(0)
+; GCN-NOT: s_endpgm
+define {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+  ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0}
+}
+
+
+; GCN-LABEL: {{^}}vgpr_ps_addr0:
+; GCN-NOT: v_mov_b32_e32 v0
+; GCN-NOT: v_mov_b32_e32 v1
+; GCN-NOT: v_mov_b32_e32 v2
+; GCN: v_mov_b32_e32 v3, v4
+; GCN: v_mov_b32_e32 v4, v6
+; GCN-NOT: s_endpgm
+attributes #1 = { "ShaderType"="0" "InitialPSInputAddr"="0" }
+define {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
+  %i0 = extractelement <2 x i32> %4, i32 0
+  %i1 = extractelement <2 x i32> %4, i32 1
+  %i2 = extractelement <2 x i32> %7, i32 0
+  %i3 = extractelement <2 x i32> %8, i32 0
+  %f0 = bitcast i32 %i0 to float
+  %f1 = bitcast i32 %i1 to float
+  %f2 = bitcast i32 %i2 to float
+  %f3 = bitcast i32 %i3 to float
+  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+  ret {float, float, float, float, float} %r4
+}
+
+
+; GCN-LABEL: {{^}}vgpr_ps_addr1:
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3
+; GCN: v_mov_b32_e32 v2, v4
+; GCN-DAG: v_mov_b32_e32 v3, v6
+; GCN-DAG: v_mov_b32_e32 v4, v8
+; GCN-NOT: s_endpgm
+attributes #2 = { "ShaderType"="0" "InitialPSInputAddr"="1" }
+define {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
+  %i0 = extractelement <2 x i32> %4, i32 0
+  %i1 = extractelement <2 x i32> %4, i32 1
+  %i2 = extractelement <2 x i32> %7, i32 0
+  %i3 = extractelement <2 x i32> %8, i32 0
+  %f0 = bitcast i32 %i0 to float
+  %f1 = bitcast i32 %i1 to float
+  %f2 = bitcast i32 %i2 to float
+  %f3 = bitcast i32 %i3 to float
+  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+  ret {float, float, float, float, float} %r4
+}
+
+
+; GCN-LABEL: {{^}}vgpr_ps_addr119:
+; GCN-DAG: v_mov_b32_e32 v0, v2
+; GCN-DAG: v_mov_b32_e32 v1, v3
+; GCN: v_mov_b32_e32 v2, v6
+; GCN: v_mov_b32_e32 v3, v8
+; GCN: v_mov_b32_e32 v4, v12
+; GCN-NOT: s_endpgm
+attributes #3 = { "ShaderType"="0" "InitialPSInputAddr"="119" }
+define {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
+  %i0 = extractelement <2 x i32> %4, i32 0
+  %i1 = extractelement <2 x i32> %4, i32 1
+  %i2 = extractelement <2 x i32> %7, i32 0
+  %i3 = extractelement <2 x i32> %8, i32 0
+  %f0 = bitcast i32 %i0 to float
+  %f1 = bitcast i32 %i1 to float
+  %f2 = bitcast i32 %i2 to float
+  %f3 = bitcast i32 %i3 to float
+  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+  ret {float, float, float, float, float} %r4
+}
+
+
+; GCN-LABEL: {{^}}vgpr_ps_addr418:
+; GCN-NOT: v_mov_b32_e32 v0
+; GCN-NOT: v_mov_b32_e32 v1
+; GCN-NOT: v_mov_b32_e32 v2
+; GCN: v_mov_b32_e32 v3, v4
+; GCN: v_mov_b32_e32 v4, v8
+; GCN-NOT: s_endpgm
+attributes #4 = { "ShaderType"="0" "InitialPSInputAddr"="418" }
+define {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #4 {
+  %i0 = extractelement <2 x i32> %4, i32 0
+  %i1 = extractelement <2 x i32> %4, i32 1
+  %i2 = extractelement <2 x i32> %7, i32 0
+  %i3 = extractelement <2 x i32> %8, i32 0
+  %f0 = bitcast i32 %i0 to float
+  %f1 = bitcast i32 %i1 to float
+  %f2 = bitcast i32 %i2 to float
+  %f3 = bitcast i32 %i3 to float
+  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
+  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
+  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
+  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
+  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
+  ret {float, float, float, float, float} %r4
+}
+
+
+; GCN-LABEL: {{^}}sgpr:
+; GCN: s_add_i32 s0, s3, 2
+; GCN: s_mov_b32 s2, s3
+; GCN-NOT: s_endpgm
+define {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  %x = add i32 %2, 2
+  %a = insertvalue {i32, i32, i32} undef, i32 %x, 0
+  %b = insertvalue {i32, i32, i32} %a, i32 %1, 1
+  %c = insertvalue {i32, i32, i32} %a, i32 %2, 2
+  ret {i32, i32, i32} %c
+}
+
+
+; GCN-LABEL: {{^}}sgpr_literal:
+; GCN: s_mov_b32 s0, 5
+; XGCN-NOT: s_mov_b32 s0, s0
+; GCN-DAG: s_mov_b32 s1, 6
+; GCN-DAG: s_mov_b32 s2, 7
+; GCN-DAG: s_mov_b32 s3, 8
+; GCN-NOT: s_endpgm
+define {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  %x = add i32 %2, 2
+  ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8}
+}
+
+
+; GCN-LABEL: {{^}}both:
+; GCN: v_mov_b32_e32 v1, v0
+; GCN-DAG: exp 15, 0, 1, 1, 1, v1, v1, v1, v1
+; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
+; GCN-DAG: s_add_i32 s0, s3, 2
+; GCN-DAG: s_mov_b32 s1, s2
+; GCN: s_mov_b32 s2, s3
+; GCN: s_waitcnt expcnt(0)
+; GCN-NOT: s_endpgm
+define {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+  %v = fadd float %3, 1.0
+  %s = add i32 %2, 2
+  %a0 = insertvalue {float, i32, float, i32, i32} undef, float %v, 0
+  %a1 = insertvalue {float, i32, float, i32, i32} %a0, i32 %s, 1
+  %a2 = insertvalue {float, i32, float, i32, i32} %a1, float %3, 2
+  %a3 = insertvalue {float, i32, float, i32, i32} %a2, i32 %1, 3
+  %a4 = insertvalue {float, i32, float, i32, i32} %a3, i32 %2, 4
+  ret {float, i32, float, i32, i32} %a4
+}
+
+
+; GCN-LABEL: {{^}}structure_literal:
+; GCN: v_mov_b32_e32 v3, v0
+; GCN-DAG: v_mov_b32_e32 v0, 1.0
+; GCN-DAG: s_mov_b32 s0, 2
+; GCN-DAG: s_mov_b32 s1, 3
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN-DAG: v_mov_b32_e32 v2, 4.0
+; GCN-DAG: exp 15, 0, 1, 1, 1, v3, v3, v3, v3
+define {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) #0 {
+  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
+  ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
+}
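
Usage sketch (illustrative, not part of the patch; the function and attribute-group names below are invented, and only mirror what the tests above already exercise): a driver that links shader parts at runtime compiles a part like the one below, then appends its own epilog behind it. Because the function does not return void, RetCC_SI assigns the i32 result to SGPR0 and the two float results to VGPR0/VGPR1, SITargetLowering::LowerReturn emits an AMDGPUISD::RET_FLAG node carrying those registers, and SIInsertWaits removes the trailing S_ENDPGM since SIMachineFunctionInfo::ReturnsVoid is false.

; Hypothetical vertex-shader part ("ShaderType"="1"); the returned values
; stay live in SGPR0 and VGPR0/VGPR1 for the code appended after it.
define {i32, float, float} @vs_part(i32 inreg %desc, float %in) #5 {
  %y = fadd float %in, 2.0
  %r0 = insertvalue {i32, float, float} undef, i32 %desc, 0
  %r1 = insertvalue {i32, float, float} %r0, float %y, 1
  %r2 = insertvalue {i32, float, float} %r1, float %in, 2
  ret {i32, float, float} %r2
}

attributes #5 = { "ShaderType"="1" }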