Index: docs/AMDGPUUsage.rst
===================================================================
--- docs/AMDGPUUsage.rst
+++ docs/AMDGPUUsage.rst
@@ -285,6 +285,7 @@
      3                  Local (group/LDS) Local (group/LDS) Local (group/LDS) Local (group/LDS)
      4                  Generic (Flat)    Region (GDS)      Region (GDS)      Constant
      5                  Region (GDS)      Private (Scratch) Private (Scratch) Private (Scratch)
+     6                  Constant 32-bit   Constant 32-bit   Constant 32-bit   Constant 32-bit
      ================== ================= ================= ================= =================
 
 Current Default
Index: lib/Target/AMDGPU/AMDGPU.h
===================================================================
--- lib/Target/AMDGPU/AMDGPU.h
+++ lib/Target/AMDGPU/AMDGPU.h
@@ -224,6 +224,9 @@
   GLOBAL_ADDRESS = 1,   ///< Address space for global memory (RAT0, VTX0).
   CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
   LOCAL_ADDRESS = 3,    ///< Address space for local memory.
+
+  CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory
+
   /// Address space for direct addressible parameter memory (CONST0)
   PARAM_D_ADDRESS = 6,
   /// Address space for indirect addressible parameter memory (VTX1)
Index: lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -115,7 +115,8 @@
                                             bool OrLocal) {
   const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
 
-  if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) {
+  if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS ||
+      Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) {
     return true;
   }
Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -466,7 +466,8 @@
 }
 
 bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
-  if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+  if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+       I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
       canWidenScalarExtLoad(I)) {
     IRBuilder<> Builder(&I);
     Builder.SetCurrentDebugLocation(I.getDebugLoc());
Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -162,6 +162,7 @@
   bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                         bool &Imm) const;
+  SDValue Expand32BitAddress(SDValue Addr) const;
   bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset,
                   bool &Imm) const;
   bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const;
@@ -636,7 +637,8 @@
   if (!N->readMem())
     return false;
   if (CbId == -1)
-    return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
+    return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+           N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
 
   return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
 }
@@ -1438,19 +1440,45 @@
   return true;
 }
 
+SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const {
+  if (Addr.getValueType() != MVT::i32)
+    return Addr;
+
+  // Zero-extend a 32-bit address.
+  SDLoc SL(Addr);
+
+  const MachineFunction &MF = CurDAG->getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned AddrHiVal = Info->get32BitAddressHighBits();
+  SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32);
+
+  const SDValue Ops[] = {
+    CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32),
+    Addr,
+    CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32),
+    SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi),
+            0),
+    CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32),
+  };
+
+  return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64,
+                                        Ops), 0);
+}
+
 bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
                                     SDValue &Offset, bool &Imm) const {
   SDLoc SL(Addr);
+
   if (CurDAG->isBaseWithConstantOffset(Addr)) {
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
 
     if (SelectSMRDOffset(N1, Offset, Imm)) {
-      SBase = N0;
+      SBase = Expand32BitAddress(N0);
       return true;
     }
   }
 
-  SBase = Addr;
+  SBase = Expand32BitAddress(Addr);
   Offset = CurDAG->getTargetConstant(0, SL, MVT::i32);
   Imm = true;
   return true;
Index: lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -293,7 +293,8 @@
   if (!I.hasOneMemOperand())
     return false;
 
-  if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS)
+  if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+      (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT)
     return false;
 
   if (!isInstrUniform(I))
Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -272,10 +272,10 @@
   // flat.
   if (TT.getEnvironmentName() == "amdgiz" ||
       TT.getEnvironmentName() == "amdgizcl")
-    return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
+    return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-p6:32:32"
          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
          "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
-  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
        "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
        "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
 }
Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -237,6 +237,7 @@
   AMDGPUAS AS = ST->getAMDGPUAS();
   if (AddrSpace == AS.GLOBAL_ADDRESS ||
       AddrSpace == AS.CONSTANT_ADDRESS ||
+      AddrSpace == AS.CONSTANT_ADDRESS_32BIT ||
       AddrSpace == AS.FLAT_ADDRESS)
     return 128;
   if (AddrSpace == AS.LOCAL_ADDRESS ||
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -900,7 +900,8 @@
   if (AS == AMDGPUASI.GLOBAL_ADDRESS)
     return isLegalGlobalAddressingMode(AM);
 
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
     // FIXME: Can we get the real alignment here?
@@ -1023,7 +1024,8 @@
     // If we have an uniform constant load, it still requires using a slow
     // buffer instruction if unaligned.
     if (IsFast) {
-      *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
+      *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS ||
+                 AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ?
         (Align % 4 == 0) : true;
     }
 
@@ -1066,7 +1068,8 @@
 static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
   return AS == AMDGPUASI.GLOBAL_ADDRESS ||
          AS == AMDGPUASI.FLAT_ADDRESS ||
-         AS == AMDGPUASI.CONSTANT_ADDRESS;
+         AS == AMDGPUASI.CONSTANT_ADDRESS ||
+         AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
 }
 
 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -4008,13 +4011,15 @@
 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
   const Triple &TT = getTargetMachine().getTargetTriple();
-  return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+  return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
          AMDGPU::shouldEmitConstantsToTextSection(TT);
 }
 
 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
   return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
-          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
          !shouldEmitFixup(GV) &&
          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
 }
@@ -4391,7 +4396,8 @@
 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // We can fold offsets for anything that doesn't require a GOT relocation.
   return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
-          GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+          GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+          GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
          !shouldEmitGOTReloc(GA->getGlobal());
 }
@@ -4444,6 +4450,7 @@
   const GlobalValue *GV = GSD->getGlobal();
 
   if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+      GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
       GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
       // FIXME: It isn't correct to rely on the type of the pointer. This should
       //        be removed when address space 0 is 64-bit.
@@ -5378,7 +5385,8 @@
     AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
 
   unsigned NumElements = MemVT.getVectorNumElements();
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
     if (isMemOpUniform(Load))
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
@@ -5386,7 +5394,9 @@
     // loads.
     //
   }
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+      AS == AMDGPUASI.GLOBAL_ADDRESS) {
     if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
         !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load))
       return SDValue();
@@ -5395,7 +5405,9 @@
     // loads.
     //
   }
-  if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS ||
+      AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT ||
+      AS == AMDGPUASI.GLOBAL_ADDRESS ||
       AS == AMDGPUASI.FLAT_ADDRESS) {
     if (NumElements > 4)
       return SplitVectorLoad(Op, DAG);
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.h
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -186,6 +186,8 @@
   // current hardware only allows a 16 bit value.
   unsigned GITPtrHigh;
 
+  unsigned HighBitsOf32BitAddress;
+
   MCPhysReg getNextUserSGPR() const {
     assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
     return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -411,6 +413,10 @@
     return GITPtrHigh;
   }
 
+  unsigned get32BitAddressHighBits() const {
+    return HighBitsOf32BitAddress;
+  }
+
   unsigned getNumUserSGPRs() const {
     return NumUserSGPRs;
   }
Index: lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -47,7 +47,8 @@
     WorkItemIDZ(false),
     ImplicitBufferPtr(false),
     ImplicitArgPtr(false),
-    GITPtrHigh(0xffffffff) {
+    GITPtrHigh(0xffffffff),
+    HighBitsOf32BitAddress(0) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const Function &F = MF.getFunction();
   FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
@@ -164,6 +165,11 @@
   StringRef S = A.getValueAsString();
   if (!S.empty())
     S.consumeInteger(0, GITPtrHigh);
+
+  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
+  S = A.getValueAsString();
+  if (!S.empty())
+    S.consumeInteger(0, HighBitsOf32BitAddress);
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
Index: lib/Target/AMDGPU/SMInstructions.td
===================================================================
--- lib/Target/AMDGPU/SMInstructions.td
+++ lib/Target/AMDGPU/SMInstructions.td
@@ -223,7 +223,8 @@
 def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
   auto Ld = cast<LoadSDNode>(N);
   return Ld->getAlignment() >= 4 &&
-  ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+  (((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+     Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
   static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
   (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
   !Ld->isVolatile() &&
Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
===================================================================
--- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -425,7 +425,8 @@
 }
 
 bool isReadOnlySegment(const GlobalValue *GV) {
-  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+         GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
 }
 
 bool shouldEmitConstantsToTextSection(const Triple &TT) {
Index: test/CodeGen/AMDGPU/constant-address-space-32bit.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -0,0 +1,207 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s
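+
+; A 32-bit addrspace(6) pointer arrives in a single SGPR and must be widened
+; to a 64-bit SMEM address: the s_mov_b32/s_load patterns below check that
+; the high half comes from the function's "amdgpu-32bit-address-high-bits"
+; attribute (zero when absent). The immediate offsets differ per target
+; because SMRD encodes dword offsets on SI/CI (SICI) and byte offsets on
+; VI/GFX9 (VIGFX9).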
+
+; GCN-LABEL: {{^}}load_i32:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
+; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
+; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
+; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
+define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr i32, i32 addrspace(6)* %p1, i64 2
+  %r0 = load i32, i32 addrspace(6)* %p0
+  %r1 = load i32, i32 addrspace(6)* %gep1
+  %r = add i32 %r0, %r1
+  %r2 = bitcast i32 %r to float
+  ret float %r2
+}
+
+; GCN-LABEL: {{^}}load_v2i32:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
+; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
+define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <2 x i32>, <2 x i32> addrspace(6)* %p1, i64 2
+  %r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0
+  %r1 = load <2 x i32>, <2 x i32> addrspace(6)* %gep1
+  %r = add <2 x i32> %r0, %r1
+  %r2 = bitcast <2 x i32> %r to <2 x float>
+  ret <2 x float> %r2
+}
+
+; GCN-LABEL: {{^}}load_v4i32:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8
+; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
+define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x i32> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <4 x i32>, <4 x i32> addrspace(6)* %p1, i64 2
+  %r0 = load <4 x i32>, <4 x i32> addrspace(6)* %p0
+  %r1 = load <4 x i32>, <4 x i32> addrspace(6)* %gep1
+  %r = add <4 x i32> %r0, %r1
+  %r2 = bitcast <4 x i32> %r to <4 x float>
+  ret <4 x float> %r2
+}
+
+; GCN-LABEL: {{^}}load_v8i32:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10
+; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
+define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x i32> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <8 x i32>, <8 x i32> addrspace(6)* %p1, i64 2
+  %r0 = load <8 x i32>, <8 x i32> addrspace(6)* %p0
+  %r1 = load <8 x i32>, <8 x i32> addrspace(6)* %gep1
+  %r = add <8 x i32> %r0, %r1
+  %r2 = bitcast <8 x i32> %r to <8 x float>
+  ret <8 x float> %r2
+}
+
+; GCN-LABEL: {{^}}load_v16i32:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20
+; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
+define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, <16 x i32> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <16 x i32>, <16 x i32> addrspace(6)* %p1, i64 2
+  %r0 = load <16 x i32>, <16 x i32> addrspace(6)* %p0
+  %r1 = load <16 x i32>, <16 x i32> addrspace(6)* %gep1
+  %r = add <16 x i32> %r0, %r1
+  %r2 = bitcast <16 x i32> %r to <16 x float>
+  ret <16 x float> %r2
+}
+
+; GCN-LABEL: {{^}}load_float:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
+; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2
+; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0
+; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8
+define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr float, float addrspace(6)* %p1, i64 2
+  %r0 = load float, float addrspace(6)* %p0
+  %r1 = load float, float addrspace(6)* %gep1
+  %r = fadd float %r0, %r1
+  ret float %r
+}
+
+; GCN-LABEL: {{^}}load_v2float:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4
+; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10
+define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <2 x float>, <2 x float> addrspace(6)* %p1, i64 2
+  %r0 = load <2 x float>, <2 x float> addrspace(6)* %p0
+  %r1 = load <2 x float>, <2 x float> addrspace(6)* %gep1
+  %r = fadd <2 x float> %r0, %r1
+  ret <2 x float> %r
+}
+
+; GCN-LABEL: {{^}}load_v4float:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8
+; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20
+define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0, <4 x float> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <4 x float>, <4 x float> addrspace(6)* %p1, i64 2
+  %r0 = load <4 x float>, <4 x float> addrspace(6)* %p0
+  %r1 = load <4 x float>, <4 x float> addrspace(6)* %gep1
+  %r = fadd <4 x float> %r0, %r1
+  ret <4 x float> %r
+}
+
+; GCN-LABEL: {{^}}load_v8float:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10
+; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40
+define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0, <8 x float> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <8 x float>, <8 x float> addrspace(6)* %p1, i64 2
+  %r0 = load <8 x float>, <8 x float> addrspace(6)* %p0
+  %r1 = load <8 x float>, <8 x float> addrspace(6)* %gep1
+  %r = fadd <8 x float> %r0, %r1
+  ret <8 x float> %r
+}
+
+; GCN-LABEL: {{^}}load_v16float:
+; GCN-DAG: s_mov_b32 s3, 0
+; GCN-DAG: s_mov_b32 s2, s1
+; GCN-DAG: s_mov_b32 s1, s3
+; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
+; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20
+; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0
+; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80
+define amdgpu_vs <16 x float> @load_v16float(<16 x float> addrspace(6)* inreg %p0, <16 x float> addrspace(6)* inreg %p1) #0 {
+  %gep1 = getelementptr <16 x float>, <16 x float> addrspace(6)* %p1, i64 2
+  %r0 = load <16 x float>, <16 x float> addrspace(6)* %p0
+  %r1 = load <16 x float>, <16 x float> addrspace(6)* %gep1
+  %r = fadd <16 x float> %r0, %r1
+  ret <16 x float> %r
+}
+
+; GCN-LABEL: {{^}}load_i32_hi0:
+; GCN: s_mov_b32 s1, 0
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+define amdgpu_vs i32 @load_i32_hi0(i32 addrspace(6)* inreg %p) #1 {
+  %r0 = load i32, i32 addrspace(6)* %p
+  ret i32 %r0
+}
+
+; GCN-LABEL: {{^}}load_i32_hi1:
+; GCN: s_mov_b32 s1, 1
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+define amdgpu_vs i32 @load_i32_hi1(i32 addrspace(6)* inreg %p) #2 {
+  %r0 = load i32, i32 addrspace(6)* %p
+  ret i32 %r0
+}
+
+; GCN-LABEL: {{^}}load_i32_hiffff8000:
+; GCN: s_movk_i32 s1, 0x8000
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+define amdgpu_vs i32 @load_i32_hiffff8000(i32 addrspace(6)* inreg %p) #3 {
+  %r0 = load i32, i32 addrspace(6)* %p
+  ret i32 %r0
+}
+
+; GCN-LABEL: {{^}}load_i32_hifffffff0:
+; GCN: s_mov_b32 s1, -16
+; GCN-NEXT: s_load_dword s0, s[0:1], 0x0
+define amdgpu_vs i32 @load_i32_hifffffff0(i32 addrspace(6)* inreg %p) #4 {
+  %r0 = load i32, i32 addrspace(6)* %p
+  ret i32 %r0
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-32bit-address-high-bits"="0" }
+attributes #2 = { nounwind "amdgpu-32bit-address-high-bits"="1" }
+attributes #3 = { nounwind "amdgpu-32bit-address-high-bits"="0xffff8000" }
+attributes #4 = { nounwind "amdgpu-32bit-address-high-bits"="0xfffffff0" }
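
For readers following the SelectionDAG change above, the net effect of Expand32BitAddress is easiest to see as plain integer arithmetic: the 32-bit addrspace(6) pointer fills sub0 of a 64-bit SGPR pair and the per-function high-bits value fills sub1. The sketch below is illustrative only (the helper name and sample values are invented for this note), mirroring the REG_SEQUENCE built in AMDGPUISelDAGToDAG.cpp:

    #include <cassert>
    #include <cstdint>

    // Sketch of the address arithmetic Expand32BitAddress materializes in
    // SGPRs: the addrspace(6) pointer becomes the low dword (sub0) and the
    // value parsed from the "amdgpu-32bit-address-high-bits" attribute
    // becomes the high dword (sub1). HighBits defaults to 0, i.e. the
    // pointer is plainly zero-extended.
    static uint64_t expand32BitAddress(uint32_t Addr, uint32_t HighBits = 0) {
      return (uint64_t(HighBits) << 32) | Addr;
    }

    int main() {
      // No attribute: zero-extension only (as in load_i32_hi0).
      assert(expand32BitAddress(0x1000) == 0x1000ull);
      // "amdgpu-32bit-address-high-bits"="0xffff8000" (load_i32_hiffff8000).
      assert(expand32BitAddress(0x1000, 0xffff8000u) == 0xffff800000001000ull);
      return 0;
    }

This also explains why the tests only check an s_mov_b32 into the odd register of the pair: the low half is already live in from the argument SGPR, so forming the 64-bit base costs a single scalar move.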