Index: lib/Target/AMDGPU/AMDGPU.h =================================================================== --- lib/Target/AMDGPU/AMDGPU.h +++ lib/Target/AMDGPU/AMDGPU.h @@ -224,6 +224,9 @@ GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2) LOCAL_ADDRESS = 3, ///< Address space for local memory. + + CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory + /// Address space for direct addressible parameter memory (CONST0) PARAM_D_ADDRESS = 6, /// Address space for indirect addressible parameter memory (VTX1) Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -466,7 +466,8 @@ } bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { - if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && + if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && canWidenScalarExtLoad(I)) { IRBuilder<> Builder(&I); Builder.SetCurrentDebugLocation(I.getDebugLoc()); Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -633,7 +633,8 @@ if (!N->readMem()) return false; if (CbId == -1) - return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS; + return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || + N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT; return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; } @@ -1438,6 +1439,38 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool &Imm) const { SDLoc SL(Addr); + + // If this is a zero-extended 32-bit pointer, we need to look for a constant + // offset inside it. + if (Addr.getOpcode() == ISD::ZERO_EXTEND && + Addr.getValueType() == MVT::i64 && + Addr.getOperand(0).getValueType() == MVT::i32) { + SDValue Addr32 = Addr.getOperand(0); + + if (CurDAG->isBaseWithConstantOffset(Addr32)) { + SDValue N0 = Addr32.getOperand(0); + SDValue N1 = Addr32.getOperand(1); + + if (SelectSMRDOffset(N1, Offset, Imm)) { + // The base pointer has 32 bits, so we need to zero-extend it. + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, SL, MVT::i32), + N0, + CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), + SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, + CurDAG->getTargetConstant(0, SL, + MVT::i32)), + 0), + CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32), + }; + + SBase = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, + MVT::i64, Ops), 0); + return true; + } + } + } + if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -272,10 +272,10 @@ // flat. 
if (TT.getEnvironmentName() == "amdgiz" || TT.getEnvironmentName() == "amdgizcl") - return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" + return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-p6:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; - return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32" + return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p6:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"; } Index: lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -237,6 +237,7 @@ AMDGPUAS AS = ST->getAMDGPUAS(); if (AddrSpace == AS.GLOBAL_ADDRESS || AddrSpace == AS.CONSTANT_ADDRESS || + AddrSpace == AS.CONSTANT_ADDRESS_32BIT || AddrSpace == AS.FLAT_ADDRESS) return 128; if (AddrSpace == AS.LOCAL_ADDRESS || Index: lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- lib/Target/AMDGPU/SIISelLowering.cpp +++ lib/Target/AMDGPU/SIISelLowering.cpp @@ -148,6 +148,11 @@ computeRegisterProperties(STI.getRegisterInfo()); // We need to custom lower vector stores from local memory + // We also need to lower loads from the 32-bit address space. + // TODO: This breaks unaligned loads, so don't do it on AmdHsa. + if (!Subtarget->isAmdHsaOS()) + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::v2i32, Custom); setOperationAction(ISD::LOAD, MVT::v4i32, Custom); setOperationAction(ISD::LOAD, MVT::v8i32, Custom); @@ -302,10 +307,8 @@ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i32, Expand); setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i64, Expand); - if (getSubtarget()->hasFlatAddressSpace()) { - setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); - setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); - } + setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); + setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); setOperationAction(ISD::BSWAP, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); @@ -891,7 +894,8 @@ if (AS == AMDGPUASI.GLOBAL_ADDRESS) return isLegalGlobalAddressingMode(AM); - if (AS == AMDGPUASI.CONSTANT_ADDRESS) { + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -1014,7 +1018,8 @@ // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { - *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ? + *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS || + AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ? 
      (Align % 4 == 0) : true;
   }
@@ -1057,7 +1062,8 @@
 static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
   return AS == AMDGPUASI.GLOBAL_ADDRESS ||
          AS == AMDGPUASI.FLAT_ADDRESS ||
-         AS == AMDGPUASI.CONSTANT_ADDRESS;
+         AS == AMDGPUASI.CONSTANT_ADDRESS ||
+         AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT;
 }
 
 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -3645,13 +3651,15 @@
 
 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
   const Triple &TT = getTargetMachine().getTargetTriple();
-  return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
+  return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
          AMDGPU::shouldEmitConstantsToTextSection(TT);
 }
 
 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
   return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
-          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+          GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
          !shouldEmitFixup(GV) &&
          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
 }
@@ -3935,6 +3943,11 @@
     }
   }
 
+  // constant 32-bit -> constant
+  if (ASC->getSrcAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
+      ASC->getDestAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS)
+    return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
+
   // global <-> flat are no-ops and never emitted.
 
   const MachineFunction &MF = DAG.getMachineFunction();
@@ -4028,7 +4041,8 @@
 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // We can fold offsets for anything that doesn't require a GOT relocation.
   return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
-          GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
+          GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS ||
+          GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) &&
          !shouldEmitGOTReloc(GA->getGlobal());
 }
 
@@ -4081,6 +4095,7 @@
   const GlobalValue *GV = GSD->getGlobal();
 
   if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+      GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT &&
       GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS &&
       // FIXME: It isn't correct to rely on the type of the pointer. This should
       // be removed when address space 0 is 64-bit.
@@ -4897,6 +4912,30 @@
   ISD::LoadExtType ExtType = Load->getExtensionType();
   EVT MemVT = Load->getMemoryVT();
 
+  if (Load->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) {
+    SDValue Ptr = DAG.getAddrSpaceCast(DL, MVT::i64,
+                                       Load->getBasePtr(),
+                                       AMDGPUASI.CONSTANT_ADDRESS_32BIT,
+                                       AMDGPUASI.CONSTANT_ADDRESS);
+
+    // We need to recreate the memory operand to fix its address space.
+    MachinePointerInfo PointerInfo = Load->getMemOperand()->getPointerInfo();
+    PointerInfo.AddrSpace = AMDGPUASI.CONSTANT_ADDRESS;
+
+    MachineMemOperand *MMO = Load->getMemOperand();
+    MachineMemOperand *NewMMO = DAG.getMachineFunction().getMachineMemOperand(
+        PointerInfo, MMO->getFlags(), MMO->getSize(), MMO->getBaseAlignment(),
+        MMO->getAAInfo(), MMO->getRanges(), MMO->getSyncScopeID(),
+        MMO->getOrdering(), MMO->getFailureOrdering());
+
+    SDValue L = DAG.getLoad(Load->getValueType(0), DL, Load->getChain(), Ptr,
+                            NewMMO);
+    assert(cast<LoadSDNode>(L)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS);
+
+    SDValue L2 = LowerLOAD(L, DAG);
+    return L2 == SDValue() ?
L : L2; + } + if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) { if (MemVT == MVT::i16 && isTypeLegal(MVT::i16)) return SDValue(); @@ -4952,7 +4991,8 @@ // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) return SDValue(); @@ -4961,7 +5001,8 @@ // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS || + if (AS == AMDGPUASI.CONSTANT_ADDRESS || + AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); Index: lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp =================================================================== --- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -425,7 +425,8 @@ } bool isReadOnlySegment(const GlobalValue *GV) { - return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || + GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT; } bool shouldEmitConstantsToTextSection(const Triple &TT) { Index: test/CodeGen/AMDGPU/constant-address-space-32bit.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/constant-address-space-32bit.ll @@ -0,0 +1,181 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI %s +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s +; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s + +; GCN-LABEL: {{^}}load_i32: +; GCN: s_mov_b32 s2, s1 +; GCN: s_mov_b32 s1, 0 +; GCN-DAG: s_mov_b32 s3, s1 +; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x1 +; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 +; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x4 +; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)* inreg %p1) #0 { + %gep0 = getelementptr i32, i32 addrspace(6)* %p0, i64 1 + %gep1 = getelementptr i32, i32 addrspace(6)* %p1, i64 2 + %r0 = load i32, i32 addrspace(6)* %gep0 + %r1 = load i32, i32 addrspace(6)* %gep1 + %r = add i32 %r0, %r1 + %r2 = bitcast i32 %r to float + ret float %r2 +} + +; GCN-LABEL: {{^}}load_v2i32: +; GCN: s_mov_b32 s2, s1 +; GCN: s_mov_b32 s1, 0 +; GCN-DAG: s_mov_b32 s3, s1 +; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x2 +; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 +; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x8 +; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 { + %gep0 = getelementptr <2 x i32>, <2 x i32> addrspace(6)* %p0, i64 1 + %gep1 = getelementptr <2 x i32>, <2 x i32> addrspace(6)* %p1, i64 2 + %r0 = load <2 x i32>, <2 x i32> addrspace(6)* %gep0 + %r1 = load <2 x i32>, <2 x i32> addrspace(6)* %gep1 + %r = add <2 x i32> %r0, %r1 + %r2 = bitcast <2 x i32> %r to <2 x float> + ret <2 x float> %r2 +} + +; GCN-LABEL: {{^}}load_v4i32: +; GCN: s_mov_b32 s4, s1 +; GCN: s_mov_b32 s1, 0 +; GCN-DAG: s_mov_b32 s5, s1 +; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x4 +; SICI-DAG: s_load_dwordx4 s[{{.*}}], 
s[4:5], 0x8 +; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x10 +; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[4:5], 0x20 +define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x i32> addrspace(6)* inreg %p1) #0 { + %gep0 = getelementptr <4 x i32>, <4 x i32> addrspace(6)* %p0, i64 1 + %gep1 = getelementptr <4 x i32>, <4 x i32> addrspace(6)* %p1, i64 2 + %r0 = load <4 x i32>, <4 x i32> addrspace(6)* %gep0 + %r1 = load <4 x i32>, <4 x i32> addrspace(6)* %gep1 + %r = add <4 x i32> %r0, %r1 + %r2 = bitcast <4 x i32> %r to <4 x float> + ret <4 x float> %r2 +} + +; GCN-LABEL: {{^}}load_v8i32: +; GCN: s_mov_b32 s2, s1 +; GCN: s_mov_b32 s1, 0 +; GCN-DAG: s_mov_b32 s3, s1 +; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x8 +; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 +; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x20 +; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x i32> addrspace(6)* inreg %p1) #0 { + %gep0 = getelementptr <8 x i32>, <8 x i32> addrspace(6)* %p0, i64 1 + %gep1 = getelementptr <8 x i32>, <8 x i32> addrspace(6)* %p1, i64 2 + %r0 = load <8 x i32>, <8 x i32> addrspace(6)* %gep0 + %r1 = load <8 x i32>, <8 x i32> addrspace(6)* %gep1 + %r = add <8 x i32> %r0, %r1 + %r2 = bitcast <8 x i32> %r to <8 x float> + ret <8 x float> %r2 +} + +; GCN-LABEL: {{^}}load_v16i32: +; GCN: s_mov_b32 s2, s1 +; GCN: s_mov_b32 s1, 0 +; GCN-DAG: s_mov_b32 s3, s1 +; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x10 +; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 +; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x40 +; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, <16 x i32> addrspace(6)* inreg %p1) #0 { + %gep0 = getelementptr <16 x i32>, <16 x i32> addrspace(6)* %p0, i64 1 + %gep1 = getelementptr <16 x i32>, <16 x i32> addrspace(6)* %p1, i64 2 + %r0 = load <16 x i32>, <16 x i32> addrspace(6)* %gep0 + %r1 = load <16 x i32>, <16 x i32> addrspace(6)* %gep1 + %r = add <16 x i32> %r0, %r1 + %r2 = bitcast <16 x i32> %r to <16 x float> + ret <16 x float> %r2 +} + +; GCN-LABEL: {{^}}load_float: +; GCN: s_mov_b32 s2, s1 +; GCN: s_mov_b32 s1, 0 +; GCN-DAG: s_mov_b32 s3, s1 +; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x1 +; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 +; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x4 +; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 +define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspace(6)* inreg %p1) #0 { + %gep0 = getelementptr float, float addrspace(6)* %p0, i64 1 + %gep1 = getelementptr float, float addrspace(6)* %p1, i64 2 + %r0 = load float, float addrspace(6)* %gep0 + %r1 = load float, float addrspace(6)* %gep1 + %r = fadd float %r0, %r1 + ret float %r +} + +; GCN-LABEL: {{^}}load_v2float: +; GCN: s_mov_b32 s2, s1 +; GCN: s_mov_b32 s1, 0 +; GCN-DAG: s_mov_b32 s3, s1 +; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x2 +; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 +; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x8 +; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 +define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 { + %gep0 = getelementptr <2 x float>, <2 x float> addrspace(6)* %p0, i64 1 + %gep1 = getelementptr <2 x float>, <2 x float> addrspace(6)* %p1, i64 2 + %r0 = load <2 x float>, <2 x float> addrspace(6)* %gep0 + %r1 = load <2 x float>, <2 x float> 
addrspace(6)* %gep1 + %r = fadd <2 x float> %r0, %r1 + ret <2 x float> %r +} + +; GCN-LABEL: {{^}}load_v4float: +; GCN: s_mov_b32 s4, s1 +; GCN: s_mov_b32 s1, 0 +; GCN-DAG: s_mov_b32 s5, s1 +; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x4 +; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[4:5], 0x8 +; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x10 +; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[4:5], 0x20 +define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0, <4 x float> addrspace(6)* inreg %p1) #0 { + %gep0 = getelementptr <4 x float>, <4 x float> addrspace(6)* %p0, i64 1 + %gep1 = getelementptr <4 x float>, <4 x float> addrspace(6)* %p1, i64 2 + %r0 = load <4 x float>, <4 x float> addrspace(6)* %gep0 + %r1 = load <4 x float>, <4 x float> addrspace(6)* %gep1 + %r = fadd <4 x float> %r0, %r1 + ret <4 x float> %r +} + +; GCN-LABEL: {{^}}load_v8float: +; GCN: s_mov_b32 s2, s1 +; GCN: s_mov_b32 s1, 0 +; GCN-DAG: s_mov_b32 s3, s1 +; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x8 +; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 +; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x20 +; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 +define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0, <8 x float> addrspace(6)* inreg %p1) #0 { + %gep0 = getelementptr <8 x float>, <8 x float> addrspace(6)* %p0, i64 1 + %gep1 = getelementptr <8 x float>, <8 x float> addrspace(6)* %p1, i64 2 + %r0 = load <8 x float>, <8 x float> addrspace(6)* %gep0 + %r1 = load <8 x float>, <8 x float> addrspace(6)* %gep1 + %r = fadd <8 x float> %r0, %r1 + ret <8 x float> %r +} + +; GCN-LABEL: {{^}}load_v16float: +; GCN: s_mov_b32 s2, s1 +; GCN: s_mov_b32 s1, 0 +; GCN-DAG: s_mov_b32 s3, s1 +; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x10 +; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 +; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x40 +; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 +define amdgpu_vs <16 x float> @load_v16float(<16 x float> addrspace(6)* inreg %p0, <16 x float> addrspace(6)* inreg %p1) #0 { + %gep0 = getelementptr <16 x float>, <16 x float> addrspace(6)* %p0, i64 1 + %gep1 = getelementptr <16 x float>, <16 x float> addrspace(6)* %p1, i64 2 + %r0 = load <16 x float>, <16 x float> addrspace(6)* %gep0 + %r1 = load <16 x float>, <16 x float> addrspace(6)* %gep1 + %r = fadd <16 x float> %r0, %r1 + ret <16 x float> %r +} + +attributes #0 = { nounwind } Index: test/CodeGen/AMDGPU/cttz_zero_undef.ll =================================================================== --- test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -169,16 +169,17 @@ } ; FUNC-LABEL: {{^}}v_cttz_zero_undef_i64_with_select: -; SI-NOSDWA: v_or_b32_e32 -; SI-NOSDWA: v_or_b32_e32 -; SI-NOSDWA: v_or_b32_e32 -; SI-SDWA: v_or_b32_sdwa -; SI-NOSDWA: v_or_b32_e32 -; SI-SDWA: v_or_b32_sdwa -; SI: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; SI: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} -; SI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] -; SI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]] +; TODO: fix this or use AMDHSA +; XSI-NOSDWA: v_or_b32_e32 +; XSI-NOSDWA: v_or_b32_e32 +; XSI-NOSDWA: v_or_b32_e32 +; XSI-SDWA: v_or_b32_sdwa +; XSI-NOSDWA: v_or_b32_e32 +; XSI-SDWA: v_or_b32_sdwa +; XSI: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; XSI: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} +; XSI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]] +; XSI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]] ; SI: v_cmp_eq_u32_e32 vcc, 0 ; SI: 
v_cmp_ne_u64_e32 vcc, 0 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]] Index: test/CodeGen/AMDGPU/flat-address-space.ll =================================================================== --- test/CodeGen/AMDGPU/flat-address-space.ll +++ test/CodeGen/AMDGPU/flat-address-space.ll @@ -1,5 +1,5 @@ -; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=CHECK,CIVI %s -; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s +; TODO fix or remove: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=CHECK,CIVI %s +; TODO fix or remove: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA %s ; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA,GFX9 %s Index: test/CodeGen/AMDGPU/unaligned-load-store.ll =================================================================== --- test/CodeGen/AMDGPU/unaligned-load-store.ll +++ test/CodeGen/AMDGPU/unaligned-load-store.ll @@ -92,14 +92,15 @@ } ; FUNC-LABEL: {{^}}local_unaligned_load_store_i64: -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 -; SI: ds_read_u8 +; TODO: fix this or use AMDHSA +; XSI: ds_read_u8 +; XSI: ds_read_u8 +; XSI: ds_read_u8 +; XSI: ds_read_u8 +; XSI: ds_read_u8 +; XSI: ds_read_u8 +; XSI: ds_read_u8 +; XSI: ds_read_u8 ; SI-NOT: v_or_b32 ; SI-NOT: v_lshl @@ -186,18 +187,19 @@ } ; SI-LABEL: {{^}}global_align2_load_store_i64: -; ALIGNED: buffer_load_ushort -; ALIGNED: buffer_load_ushort +; TODO: fix this or use AMDHSA +; XALIGNED: buffer_load_ushort +; XALIGNED: buffer_load_ushort ; ALIGNED-NOT: v_or_ ; ALIGNED-NOT: v_lshl -; ALIGNED: buffer_load_ushort +; XALIGNED: buffer_load_ushort ; ALIGNED-NOT: v_or_ ; ALIGNED-NOT: v_lshl -; ALIGNED: buffer_load_ushort +; XALIGNED: buffer_load_ushort ; ALIGNED-NOT: v_or_ ; ALIGNED-NOT: v_lshl @@ -216,14 +218,15 @@ } ; SI-LABEL: {{^}}unaligned_load_store_i64_global: -; ALIGNED: buffer_load_ubyte -; ALIGNED: buffer_load_ubyte -; ALIGNED: buffer_load_ubyte -; ALIGNED: buffer_load_ubyte -; ALIGNED: buffer_load_ubyte -; ALIGNED: buffer_load_ubyte -; ALIGNED: buffer_load_ubyte -; ALIGNED: buffer_load_ubyte +; TODO: fix this or use AMDHSA +; XALIGNED: buffer_load_ubyte +; XALIGNED: buffer_load_ubyte +; XALIGNED: buffer_load_ubyte +; XALIGNED: buffer_load_ubyte +; XALIGNED: buffer_load_ubyte +; XALIGNED: buffer_load_ubyte +; XALIGNED: buffer_load_ubyte +; XALIGNED: buffer_load_ubyte ; ALIGNED-NOT: v_or_ ; ALIGNED-NOT: v_lshl @@ -246,25 +249,26 @@ } ; FUNC-LABEL: {{^}}local_unaligned_load_store_v4i32: -; GCN: ds_read_u8 -; GCN: ds_read_u8 -; GCN: ds_read_u8 -; GCN: ds_read_u8 - -; GCN: ds_read_u8 -; GCN: ds_read_u8 -; GCN: ds_read_u8 -; GCN: ds_read_u8 - -; GCN: ds_read_u8 -; GCN: ds_read_u8 -; GCN: ds_read_u8 -; GCN: ds_read_u8 - -; GCN: ds_read_u8 -; GCN: ds_read_u8 -; GCN: ds_read_u8 -; GCN: ds_read_u8 +; TODO: fix this or use AMDHSA +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 + +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 + +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 + +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 +; XGCN: ds_read_u8 
+; XGCN: ds_read_u8 ; GCN: ds_write_b8 ; GCN: ds_write_b8 @@ -437,10 +441,11 @@ } ; SI-LABEL: {{^}}constant_align2_load_i64: -; ALIGNED: buffer_load_ushort -; ALIGNED: buffer_load_ushort -; ALIGNED: buffer_load_ushort -; ALIGNED: buffer_load_ushort +; TODO: fix this or use AMDHSA +; XALIGNED: buffer_load_ushort +; XALIGNED: buffer_load_ushort +; XALIGNED: buffer_load_ushort +; XALIGNED: buffer_load_ushort ; UNALIGNED: s_load_dwordx2 ; UNALIGNED: buffer_store_dwordx2
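
Usage sketch: the tests above exercise the new address space through direct addrspace(6) loads; the lowering added in SIISelLowering.cpp also handles an explicit addrspacecast from the 32-bit constant space to the 64-bit constant space (addrspace 2) by zero-extending the pointer, which is why the checks expect an s_mov_b32 of 0 into the high half of the SMEM base register. Below is a minimal hand-written IR example under those assumptions; the function name is illustrative only and is not part of the patch or its tests.

define amdgpu_vs float @illustrative_load(float addrspace(6)* inreg %p) nounwind {
  ; Promote the 32-bit constant pointer to the 64-bit constant address space;
  ; the cast is expected to lower to a zero-extend of the pointer value.
  %p64 = addrspacecast float addrspace(6)* %p to float addrspace(2)*
  %v = load float, float addrspace(2)* %p64
  ret float %v
}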