Index: lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -262,6 +262,12 @@
   SDValue getHi16Elt(SDValue In) const;
 
+  SDValue getRegSequence64(unsigned RCID, const SDLoc &DL,
+                           SDValue Lo, SDValue Hi) const;
+
+  SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
+  SDValue getMaterializedScalarImm64(int64_t Val, const SDLoc &DL) const;
+
   void SelectADD_SUB_I64(SDNode *N);
   void SelectAddcSubb(SDNode *N);
   void SelectUADDO_USUBO(SDNode *N);
@@ -966,6 +972,37 @@
   return true;
 }
 
+SDValue AMDGPUDAGToDAGISel::getRegSequence64(unsigned RCID, const SDLoc &DL,
+                                             SDValue Lo, SDValue Hi) const {
+  SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+  SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+  SDValue RegSequenceArgs[] = {
+    CurDAG->getTargetConstant(RCID, DL, MVT::i32),
+    Lo, Sub0, Hi, Sub1
+  };
+
+  SDNode *RegSequence = CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                               MVT::i64, RegSequenceArgs);
+  return SDValue(RegSequence, 0);
+}
+
+SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
+                                                       const SDLoc &DL) const {
+  SDNode *Mov = CurDAG->getMachineNode(
+    AMDGPU::S_MOV_B32, DL, MVT::i32,
+    CurDAG->getTargetConstant(Val, DL, MVT::i32));
+  return SDValue(Mov, 0);
+}
+
+SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm64(int64_t Val,
+                                                       const SDLoc &DL) const {
+  SDValue Lo = getMaterializedScalarImm32(Lo_32(Val), DL);
+  SDValue Hi = getMaterializedScalarImm32(Hi_32(Val), DL);
+  return getRegSequence64(AMDGPU::SReg_64RegClassID, DL, Lo, Hi);
+}
+
 // FIXME: Should only handle addcarry/subcarry
 void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
   SDLoc DL(N);
@@ -1633,13 +1670,88 @@
       CurDAG->isBaseWithConstantOffset(Addr)) {
     SDValue N0 = Addr.getOperand(0);
     SDValue N1 = Addr.getOperand(1);
-    int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+    uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
 
     const SIInstrInfo *TII = Subtarget->getInstrInfo();
-    if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(),
-                               IsSigned)) {
+    unsigned AS = findMemSDNode(N)->getAddressSpace();
+    if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
       Addr = N0;
       OffsetVal = COffsetVal;
+    } else {
+      // If the offset doesn't fit, put the low bits into the offset field and
+      // add the rest.
+
+      SDLoc DL(N);
+      uint64_t ImmField;
+      const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
+      if (IsSigned) {
+        ImmField = SignExtend64(COffsetVal, NumBits);
+
+        // Don't use a negative offset field if the base offset is positive.
+        // Since the scheduler currently relies on the offset field, doing so
+        // could result in strange scheduling decisions.
+
+        // TODO: Should we not do this in the opposite direction as well?
+        if (static_cast<int64_t>(COffsetVal) > 0) {
+          if (static_cast<int64_t>(ImmField) < 0) {
+            const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
+            ImmField = COffsetVal & OffsetMask;
+          }
+        }
+      } else {
+        // TODO: Should we do this for a negative offset?
+        const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
+        ImmField = COffsetVal & OffsetMask;
+      }
+
+      uint64_t RemainderOffset = COffsetVal - ImmField;
+
+      assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
+      assert(RemainderOffset + ImmField == COffsetVal);
+
+      OffsetVal = ImmField;
+
+#if 0
+      // TODO: Should this try to use a scalar add pseudo?
+      SDValue AddOffset
+        = getMaterializedScalarImm64(RemainderOffset, DL);
+
+      SDNode *Add = CurDAG->getMachineNode(
+        AMDGPU::S_ADD_U64_PSEUDO, DL, MVT::i64, N0, AddOffset);
+      Addr = SDValue(Add, 0);
+#else
+      SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+      SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+      SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                            DL, MVT::i32, N0, Sub0);
+      SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                            DL, MVT::i32, N0, Sub1);
+
+      SDValue AddOffsetLo
+        = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+      SDValue AddOffsetHi
+        = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+      SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+      SDNode *Add = CurDAG->getMachineNode(
+        AMDGPU::V_ADD_I32_e64, DL, VTs,
+        {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+      SDNode *Addc = CurDAG->getMachineNode(
+        AMDGPU::V_ADDC_U32_e64, DL, VTs,
+        {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+      SDValue RegSequenceArgs[] = {
+        CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+        SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
+      };
+
+      Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+                                            MVT::i64, RegSequenceArgs), 0);
+#endif
     }
   }
Index: lib/Target/AMDGPU/SIInstrInfo.h
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.h
+++ lib/Target/AMDGPU/SIInstrInfo.h
@@ -996,6 +996,8 @@
     return isUInt<12>(Imm);
   }
 
+  unsigned getNumFlatOffsetBits(unsigned AddrSpace, bool Signed) const;
+
   /// Returns if \p Offset is legal for the subtarget as the offset to a FLAT
   /// encoded instruction. If \p Signed, this is for an instruction that
   /// interprets the offset as signed.
Index: lib/Target/AMDGPU/SIInstrInfo.cpp
===================================================================
--- lib/Target/AMDGPU/SIInstrInfo.cpp
+++ lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6181,6 +6181,20 @@
   return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
 }
 
+unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace,
+                                           bool Signed) const {
+  if (!ST.hasFlatInstOffsets())
+    return 0;
+
+  if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
+    return 0;
+
+  if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
+    return Signed ? 12 : 11;
+
+  return Signed ? 13 : 12;
+}
+
 bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
                                     bool Signed) const {
   // TODO: Should 0 be special cased?
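
A note on the split logic above (not part of the patch): the standalone C++ sketch below mirrors the ImmField/RemainderOffset computation from the selector change, with maskTrailingOnesU64 and signExtend64 as local stand-ins for llvm::maskTrailingOnes<uint64_t> and llvm::SignExtend64. In the GFX9 signed 13-bit case, a positive base offset of 0xffff sign-extends to a negative immediate, so the unsigned low NumBits-1 bits (offset:4095) go into the field and the remaining 0xf000 is added separately; that is the pattern the updated tests below check for.

  // Standalone model of the FLAT offset split; assumes the GFX9 signed
  // 13-bit offset field. Helper names are illustrative, not LLVM APIs.
  #include <cassert>
  #include <cstdint>

  static uint64_t maskTrailingOnesU64(unsigned N) {
    return N == 0 ? 0 : (~0ULL >> (64 - N));
  }

  static int64_t signExtend64(uint64_t X, unsigned B) {
    return int64_t(X << (64 - B)) >> (64 - B);
  }

  int main() {
    const unsigned NumBits = 13;         // GFX9 signed FLAT offset width
    const uint64_t COffsetVal = 0xffff;  // 65535 does not fit the field

    uint64_t ImmField = signExtend64(COffsetVal, NumBits);
    // Positive base offset but negative sign-extended field: fall back to
    // the unsigned low NumBits-1 bits, as the patch does.
    if (int64_t(COffsetVal) > 0 && int64_t(ImmField) < 0)
      ImmField = COffsetVal & maskTrailingOnesU64(NumBits - 1);

    const uint64_t RemainderOffset = COffsetVal - ImmField;
    assert(ImmField == 4095 && RemainderOffset == 0xf000);
    return 0;
  }
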
Index: test/CodeGen/AMDGPU/cgp-addressing-modes.ll
===================================================================
--- test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -44,7 +44,10 @@
 ; GCN-LABEL: {{^}}test_sink_global_small_max_i32_ds_offset:
 ; GCN: s_and_saveexec_b64
 ; SICIVI: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
-; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off{{$}}
+
+; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xf000,
+; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0,
+; GFX9: global_load_sbyte {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, off offset:4095{{$}}
 ; GCN: {{^}}BB1_2:
 ; GCN: s_or_b64 exec
 define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
Index: test/CodeGen/AMDGPU/flat-address-space.ll
===================================================================
--- test/CodeGen/AMDGPU/flat-address-space.ll
+++ test/CodeGen/AMDGPU/flat-address-space.ll
@@ -1,14 +1,7 @@
-; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=CHECK,CIVI %s
-; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s
-; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA %s
-; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA,GFX9 %s
-
-; Disable optimizations in case there are optimizations added that
-; specialize away generic pointer accesses.
-
-
-; These testcases might become useless when there are optimizations to
-; remove generic pointers.
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck -check-prefixes=CHECK,CIVI %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,CIVI,HSA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA,GFX9 %s
 
 ; CHECK-LABEL: {{^}}store_flat_i32:
 ; CHECK-DAG: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:[[HI_SREG:[0-9]+]]],
@@ -191,7 +184,11 @@
 }
 
 ; CHECK-LABEL: {{^}}store_flat_i8_neg_offset:
-; CHECK: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
+; CIVI: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}{{$}}
+
+; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v
+; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
+; GFX9: flat_store_byte v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:4094{{$}}
 define amdgpu_kernel void @store_flat_i8_neg_offset(i8* %fptr, i8 %x) #0 {
   %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
   store volatile i8 %x, i8* %fptr.offset
@@ -216,7 +213,11 @@
 }
 
 ; CHECK-LABEL: {{^}}load_flat_i8_neg_offset:
-; CHECK: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
+; CIVI: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}{{$}}
+
+; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v
+; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1,
+; GFX9: flat_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}} offset:4094{{$}}
 define amdgpu_kernel void @load_flat_i8_neg_offset(i8* %fptr) #0 {
   %fptr.offset = getelementptr inbounds i8, i8* %fptr, i64 -2
   %val = load volatile i8, i8* %fptr.offset
Index: test/CodeGen/AMDGPU/global-saddr.ll
===================================================================
--- test/CodeGen/AMDGPU/global-saddr.ll
+++ test/CodeGen/AMDGPU/global-saddr.ll
@@ -46,7 +46,7 @@
 ; Test various offset boundaries.
 ; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:4088{{$}}
-; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4088{{$}}
 ; GFX9: global_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+}}:{{[0-9]+}}] offset:2040{{$}}
   %gep11 = getelementptr inbounds i64, i64 addrspace(1)* %gep, i64 511
   %load11 = load i64, i64 addrspace(1)* %gep11
Index: test/CodeGen/AMDGPU/global_atomics.ll
===================================================================
--- test/CodeGen/AMDGPU/global_atomics.ll
+++ test/CodeGen/AMDGPU/global_atomics.ll
@@ -25,7 +25,9 @@
 ; SIVI: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0
 ; SIVI: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
-; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x8000,
+; GFX9-NEXT: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
+; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:3232{{$}}
 define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
@@ -40,7 +42,10 @@
 ; VI: flat_atomic_add
 
-; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off{{$}}
+; GFX9: v_mov_b32_e32 [[HIGH_K:v[0-9]+]], 0xabcd
+; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xd000,
+; GFX9-NEXT: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, [[HIGH_K]], v{{[0-9]+}}, vcc
+; GFX9: global_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off offset:3756{{$}}
 define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
Index: test/CodeGen/AMDGPU/global_atomics_i64.ll
===================================================================
--- test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -991,7 +991,9 @@
 ; CIVI: s_mov_b32 [[SREG:s[0-9]+]], 0x11940
 ; CIVI: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
-; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off{{$}}
+; GFX9: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0x11000,
+; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
+; GFX9: global_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}, off offset:2368{{$}}
 define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000
Index: test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
===================================================================
--- test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -14,14 +14,15 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
@@ -75,15 +76,15 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
@@ -184,7 +185,7 @@
 }
 
 ; using 32bit address.
-define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
+define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
 ; GCN-LABEL: Address32:
 ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
@@ -197,16 +198,16 @@
 ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-4096
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-3072
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
@@ -265,10 +266,10 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
@@ -306,10 +307,10 @@
 ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-1024
-; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
   %conv = and i64 %call, 255
@@ -347,13 +348,13 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
+
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 i8 addrspace(1)* %buffer2) {
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
@@ -403,13 +404,13 @@
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
 ;
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
-; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
+; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
 ; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
 entry:
   %call = tail call i64 @_Z13get_global_idj(i32 0)
Index: test/CodeGen/AMDGPU/store-hi16.ll
===================================================================
--- test/CodeGen/AMDGPU/store-hi16.ll
+++ test/CodeGen/AMDGPU/store-hi16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900 %s
-; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,NO-D16-HI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX900,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX906,GFX9,NO-D16-HI %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX803,NO-D16-HI %s
 
 ; GCN-LABEL: {{^}}store_global_hi_v2i16:
 
@@ -311,16 +311,16 @@
 ; GCN-LABEL: {{^}}store_flat_hi_v2i16_neg_offset:
 ; GCN: s_waitcnt
-; GCN: v_add{{(_co)?}}_{{i|u}}32_e32
-
+; GFX803: v_add{{(_co)?}}_{{i|u}}32_e32
 ; GFX803: v_addc_u32_e32
-; GFX900: v_addc_co_u32_e32
-; GFX906-NEXT: v_lshrrev_b32_e32
-; GFX906-NEXT: v_addc_co_u32_e32
-; GFX906: flat_store_short v[0:1], v2
+; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v
+; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v
-; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2{{$}}
+; GFX906-DAG: v_lshrrev_b32_e32
+; GFX906: flat_store_short v[0:1], v2 offset:2050{{$}}
+
+; GFX900-NEXT: flat_store_short_d16_hi v[0:1], v2 offset:2050{{$}}
 ; GFX803: flat_store_short v[0:1], v2{{$}}
 ; GCN-NEXT: s_waitcnt
 ; GCN-NEXT: s_setpc_b64
@@ -359,17 +359,17 @@
 ; GCN-LABEL: {{^}}store_flat_hi_v2i16_i8_neg_offset:
 ; GCN: s_waitcnt
-; GCN-DAG: v_add{{(_co)?}}_{{i|u}}32_e32
+; GFX803-DAG: v_add_u32_e32
 ; GFX803-DAG: v_addc_u32_e32
-; GFX900-DAG: v_addc_co_u32_e32
-; GFX906-DAG: v_add_co_u32_e32
-; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2{{$}}
+; GFX9-DAG: v_add_co_u32_e32 v{{[0-9]+}}, vcc, 0xfffff000, v
+; GFX9-DAG: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, -1, v{{[0-9]+}}, vcc
-; GFX906-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX906-NEXT: v_addc_co_u32_e32
-; GFX906-NEXT: flat_store_byte v[0:1], v2{{$}}
+; GFX900-NEXT: flat_store_byte_d16_hi v[0:1], v2 offset:1{{$}}
+
+; GFX906-DAG: v_lshrrev_b32_e32 v2, 16, v2
+; GFX906: flat_store_byte v[0:1], v2 offset:1{{$}}
 ; GFX803-DAG: v_lshrrev_b32_e32 v2, 16, v2
 ; GFX803: flat_store_byte v[0:1], v2{{$}}
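
A closing note on the VALU sequence selected in the #else branch (not part of the patch): V_ADD_I32_e64 adds the low 32-bit halves of the base and the remainder and defines a carry-out, V_ADDC_U32_e64 adds the high halves plus that carry, and REG_SEQUENCE reassembles the two halves into the new 64-bit address. The standalone C++ model below, with lo32/hi32 as illustrative helpers, computes the same result.

  #include <cassert>
  #include <cstdint>

  static uint32_t lo32(uint64_t V) { return uint32_t(V); }
  static uint32_t hi32(uint64_t V) { return uint32_t(V >> 32); }

  int main() {
    const uint64_t Base = 0x12345678fffff800ULL;  // 64-bit base address (N0)
    const uint64_t RemainderOffset = 0xf000;      // bits that did not fit

    // v_add_co_u32: low halves, carry-out in VCC.
    const uint32_t AddLo = lo32(Base) + lo32(RemainderOffset);
    const uint32_t Carry = AddLo < lo32(Base);  // unsigned wraparound check
    // v_addc_co_u32: high halves plus the carry-in.
    const uint32_t AddHi = hi32(Base) + hi32(RemainderOffset) + Carry;

    // REG_SEQUENCE: recombine sub0/sub1 into the new VReg_64 address.
    const uint64_t Addr = (uint64_t(AddHi) << 32) | AddLo;
    assert(Addr == Base + RemainderOffset);
    return 0;
  }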