Index: llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -234,6 +234,8 @@
   template <bool IsSigned>
   bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
                         SDValue &Offset) const;
+  bool SelectGlobalSAddr(SDNode *N, SDValue Addr, SDValue &SAddr,
+                         SDValue &VOffset, SDValue &Offset) const;

   bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
                         bool &Imm) const;
@@ -1750,6 +1752,69 @@
   return true;
 }

+// If this matches zero_extend i32:x, return x
+static SDValue matchZExtFromI32(SDValue Op) {
+  if (Op.getOpcode() != ISD::ZERO_EXTEND)
+    return SDValue();
+
+  SDValue ExtSrc = Op.getOperand(0);
+  return (ExtSrc.getValueType() == MVT::i32) ? ExtSrc : SDValue();
+}
+
+// Match (64-bit SGPR base) + (zext vgpr offset) + sext(imm offset)
+bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
+                                           SDValue Addr,
+                                           SDValue &SAddr,
+                                           SDValue &VOffset,
+                                           SDValue &Offset) const {
+  int64_t ImmOffset = 0;
+
+  // Match the immediate offset first, which canonically is moved as low as
+  // possible.
+  if (CurDAG->isBaseWithConstantOffset(Addr)) {
+    SDValue LHS = Addr.getOperand(0);
+    SDValue RHS = Addr.getOperand(1);
+
+    int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
+    const SIInstrInfo *TII = Subtarget->getInstrInfo();
+
+    // TODO: Could split larger constant into VGPR offset.
+    if (TII->isLegalFLATOffset(COffsetVal, AMDGPUAS::GLOBAL_ADDRESS, true)) {
+      Addr = LHS;
+      ImmOffset = COffsetVal;
+    }
+  }
+
+  // Match the variable offset.
+  if (Addr.getOpcode() != ISD::ADD)
+    return false;
+
+  SDValue LHS = Addr.getOperand(0);
+  SDValue RHS = Addr.getOperand(1);
+
+  if (!LHS->isDivergent()) {
+    // add (i64 sgpr), (zero_extend (i32 vgpr))
+    if (SDValue ZextRHS = matchZExtFromI32(RHS)) {
+      SAddr = LHS;
+      VOffset = ZextRHS;
+    }
+  }
+
+  if (!SAddr && !RHS->isDivergent()) {
+    // add (zero_extend (i32 vgpr)), (i64 sgpr)
+    if (SDValue ZextLHS = matchZExtFromI32(LHS)) {
+      SAddr = RHS;
+      VOffset = ZextLHS;
+    }
+  }
+
+  if (!SAddr)
+    return false;
+
+  Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+  return true;
+}
+
 bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
                                           SDValue &Offset,
                                           bool &Imm) const {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
Index: llvm/lib/Target/AMDGPU/FLATInstructions.td
===================================================================
--- llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -9,6 +9,8 @@
 def FLATOffset : ComplexPattern<i64, 2, "SelectFlatOffset<false>", [], [SDNPWantRoot], -10>;
 def FLATOffsetSigned : ComplexPattern<i64, 2, "SelectFlatOffset<true>", [], [SDNPWantRoot], -10>;

+def GlobalSAddr : ComplexPattern<i64, 3, "SelectGlobalSAddr", [], [SDNPWantRoot], -10>;
+
 //===----------------------------------------------------------------------===//
 // FLAT classes
 //===----------------------------------------------------------------------===//
@@ -743,11 +745,45 @@
   (inst $vaddr, $offset, 0, 0, 0, $in)
 >;

+class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$in)),
+  (inst $saddr, $voffset, $offset, 0, 0, 0, $in)
+>;
+
 class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
   (vt (node (FLATOffsetSigned (i64 VReg_64:$vaddr), i16:$offset))),
   (inst $vaddr, $offset)
 >;

+class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset))),
+  (inst $saddr, $voffset, $offset, 0, 0, 0)
+>;
+
+class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
+  (node vt:$data, (GlobalSAddr (i64 
SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset)), + (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset) +>; + +class GlobalAtomicStoreSaddrPat : GCNPat < + (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$data), + (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset) +>; + +class GlobalAtomicSaddrPat : GCNPat < + (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), data_vt:$data)), + (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset) +>; + +class GlobalAtomicNoRtnSaddrPat : GCNPat < + (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$data), + (inst $voffset, getVregSrcForVT.ret:$data, $saddr, $offset) +>; + class FlatStorePat : GCNPat < (node vt:$data, (FLATOffset i64:$vaddr, i16:$offset)), (inst $vaddr, getVregSrcForVT.ret:$data, $offset) @@ -765,11 +801,12 @@ (inst $vaddr, getVregSrcForVT.ret:$data, $offset) >; -class FlatStoreSignedAtomicPat : GCNPat < +class FlatStoreSignedAtomicPat : GCNPat < // atomic store follows atomic binop convention so the address comes // first. - (node (FLATOffset i64:$vaddr, i16:$offset), vt:$data), - (inst $vaddr, getVregSrcForVT.ret:$data, $offset) + (node (FLATOffset i64:$vaddr, i16:$offset), data_vt:$data), + (inst $vaddr, getVregSrcForVT.ret:$data, $offset) >; class FlatAtomicPat ; class FlatSignedAtomicPat : GCNPat < + ValueType data_vt = vt> : GCNPat < (vt (node (FLATOffsetSigned i64:$vaddr, i16:$offset), data_vt:$data)), (inst $vaddr, $data, $offset) >; @@ -886,98 +923,166 @@ } // End OtherPredicates = [HasFlatAddressSpace] -let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in { -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; +multiclass GlobalFLATLoadPats { + def : FlatLoadSignedPat { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + +multiclass GlobalFLATLoadPats_D16 { + def : FlatSignedLoadPat_D16 { + let AddedComplexity = 10; + } + + def : GlobalLoadSaddrPat_D16(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + +multiclass GlobalFLATStorePats { + def : FlatStoreSignedPat { + let AddedComplexity = 10; + } + + def : GlobalStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + +// Deal with swapped operands for atomic_store vs. 
regular store +multiclass GlobalFLATAtomicStorePats { + def : FlatStoreSignedAtomicPat { + let AddedComplexity = 10; + } + + def : GlobalAtomicStoreSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + +multiclass GlobalFLATAtomicPats { + def : FlatSignedAtomicPat (nortn_inst_name#"_RTN"), node, vt, data_vt> { + let AddedComplexity = 10; + } + + def : GlobalAtomicSaddrPat(nortn_inst_name#"_SADDR_RTN"), node, vt, data_vt> { + let AddedComplexity = 11; + } +} + +multiclass GlobalFLATNoRtnAtomicPats { + def : FlatSignedAtomicPatNoRtn { + let AddedComplexity = 10; + } + + def : GlobalAtomicNoRtnSaddrPat(!cast(inst)#"_SADDR"), node, vt> { + let AddedComplexity = 11; + } +} + +let OtherPredicates = [HasFlatGlobalInsts] in { + +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; foreach vt = Reg32Types.types in { -def : FlatLoadSignedPat ; -def : FlatStoreSignedPat ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATStorePats ; } foreach vt = VReg_64.RegTypes in { -def : FlatLoadSignedPat ; -def : FlatStoreSignedPat ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATStorePats ; } -def : FlatLoadSignedPat ; +defm : GlobalFLATLoadPats ; foreach vt = VReg_128.RegTypes in { -def : FlatLoadSignedPat ; -def : FlatStoreSignedPat ; +defm : GlobalFLATLoadPats ; +defm : GlobalFLATStorePats ; } -def : FlatLoadSignedPat ; -def : FlatLoadSignedPat ; +// There is no distinction for atomic load lowering during selection; +// the memory legalizer will set the cache bits and insert the +// appropriate waits. +defm : GlobalFLATLoadPats ; +defm : GlobalFLATLoadPats ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; +defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; let OtherPredicates = [D16PreservesUnusedBits] in { -def : FlatStoreSignedPat ; -def : FlatStoreSignedPat ; - -def : FlatSignedLoadPat_D16 ; -def : FlatSignedLoadPat_D16 ; -def : FlatSignedLoadPat_D16 ; -def : FlatSignedLoadPat_D16 ; -def : FlatSignedLoadPat_D16 ; -def : FlatSignedLoadPat_D16 ; - -def : FlatSignedLoadPat_D16 ; -def : FlatSignedLoadPat_D16 ; -def : FlatSignedLoadPat_D16 ; -def : FlatSignedLoadPat_D16 ; -def : FlatSignedLoadPat_D16 ; -def : FlatSignedLoadPat_D16 ; +defm : GlobalFLATStorePats ; +defm : GlobalFLATStorePats ; + +defm : GlobalFLATLoadPats_D16 ; +defm : GlobalFLATLoadPats_D16 ; +defm : GlobalFLATLoadPats_D16 ; +defm : GlobalFLATLoadPats_D16 ; +defm : GlobalFLATLoadPats_D16 ; +defm : GlobalFLATLoadPats_D16 ; + +defm : GlobalFLATLoadPats_D16 ; +defm : GlobalFLATLoadPats_D16 ; +defm : GlobalFLATLoadPats_D16 ; +defm : GlobalFLATLoadPats_D16 ; +defm : GlobalFLATLoadPats_D16 ; +defm : GlobalFLATLoadPats_D16 ; } -def : FlatStoreSignedAtomicPat ; -def : FlatStoreSignedAtomicPat ; - -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; - -def : 
FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; -def : FlatSignedAtomicPat ; - -def : FlatSignedAtomicPatNoRtn ; -def : FlatSignedAtomicPatNoRtn ; +defm : GlobalFLATAtomicStorePats ; +defm : GlobalFLATAtomicStorePats ; + +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", atomic_load_add_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", atomic_load_sub_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", atomic_inc_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", atomic_dec_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", atomic_load_and_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", atomic_load_max_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", atomic_load_umax_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN", atomic_load_min_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN", atomic_load_umin_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", atomic_load_or_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", atomic_swap_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", AMDGPUatomic_cmp_swap_global_32, i32, v2i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", atomic_load_xor_global_32, i32>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CSUB", int_amdgcn_global_atomic_csub, i32>; + +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", atomic_load_add_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", atomic_load_sub_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", atomic_inc_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", atomic_dec_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", atomic_load_and_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", atomic_load_max_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", atomic_load_umax_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMIN_X2", atomic_load_min_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMIN_X2", atomic_load_umin_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR_X2", atomic_load_or_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", atomic_swap_global_64, i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", AMDGPUatomic_cmp_swap_global_64, i64, v2i64>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", atomic_load_xor_global_64, i64>; + +defm : GlobalFLATNoRtnAtomicPats ; +defm : GlobalFLATNoRtnAtomicPats ; } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 Index: llvm/lib/Target/AMDGPU/SIInstrInfo.h =================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -883,6 +883,7 @@ MachineRegisterInfo &MRI) const; void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const; + void legalizeOperandsFLAT(MachineRegisterInfo &MRI, MachineInstr &MI) const; void legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, Index: llvm/lib/Target/AMDGPU/SIInstrInfo.cpp 
=================================================================== --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4667,6 +4667,22 @@ } } +// FIXME: Remove this when SelectionDAG is obsoleted. +void SIInstrInfo::legalizeOperandsFLAT(MachineRegisterInfo &MRI, + MachineInstr &MI) const { + if (!isSegmentSpecificFLAT(MI)) + return; + + // Fixup SGPR operands in VGPRs. We only select these when the DAG divergence + // thinks they are uniform, so a readfirstlane should be valid. + MachineOperand *SAddr = getNamedOperand(MI, AMDGPU::OpName::saddr); + if (!SAddr || RI.isSGPRClass(MRI.getRegClass(SAddr->getReg()))) + return; + + Register ToSGPR = readlaneVGPRToSGPR(SAddr->getReg(), MI, MRI); + SAddr->setReg(ToSGPR); +} + void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB, MachineBasicBlock::iterator I, const TargetRegisterClass *DstRC, @@ -4931,6 +4947,12 @@ return; } + // Legalize FLAT + if (isFLAT(MI)) { + legalizeOperandsFLAT(MRI, MI); + return; + } + // Legalize REG_SEQUENCE and PHI // The register class of the operands much be the same type as the register // class of the output. Index: llvm/test/CodeGen/AMDGPU/clamp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/clamp.ll +++ llvm/test/CodeGen/AMDGPU/clamp.ll @@ -102,8 +102,8 @@ ; SI: buffer_store_dword [[MED]] ; SI: buffer_store_dword [[MAX]] -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MED]] -; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MAX]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[MED]] +; GFX89: {{flat|global}}_store_dword v{{.+}}, [[MAX]] define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid Index: llvm/test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -15,7 +15,7 @@ ; GCN: s_waitcnt lgkmcnt(0) ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; CI: buffer_store_dword [[RESULT]] -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} ; GCN: s_endpgm define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -39,7 +39,7 @@ ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; CI: buffer_store_dword [[RESULT]] -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -361,7 +361,7 @@ ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; CI: buffer_store_dwordx2 [[RESULT]] -; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* 
@lds.f64, i32 0, i32 %x.i Index: llvm/test/CodeGen/AMDGPU/ds_read2st64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_read2st64.ll +++ llvm/test/CodeGen/AMDGPU/ds_read2st64.ll @@ -13,7 +13,7 @@ ; GCN: s_waitcnt lgkmcnt(0) ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; CI: buffer_store_dword [[RESULT]] -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i @@ -35,7 +35,7 @@ ; GCN: s_waitcnt lgkmcnt(0) ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; CI: buffer_store_dword [[RESULT]] -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 @@ -58,7 +58,7 @@ ; GCN: s_waitcnt lgkmcnt(0) ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[LO_VREG]], v[[HI_VREG]] ; CI: buffer_store_dword [[RESULT]] -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 @@ -143,7 +143,7 @@ ; GCN: s_waitcnt lgkmcnt(0) ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; CI: buffer_store_dwordx2 [[RESULT]] -; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i @@ -166,7 +166,7 @@ ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; CI: buffer_store_dwordx2 [[RESULT]] -; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 64 @@ -213,7 +213,7 @@ ; GCN: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}} ; CI: buffer_store_dwordx2 [[RESULT]] -; GFX9: global_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9: global_store_dwordx2 v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %add.x.0 = add nsw i32 %x.i, 256 Index: llvm/test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -32,8 
+32,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4{{$}} ; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}} ; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]] @@ -196,8 +196,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4{{$}} ; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 2, v{{[0-9]+}} ; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds@abs32@lo, [[VBASE]] @@ -383,8 +383,8 @@ ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[VBASE:v[0-9]+]], 3, v{{[0-9]+}} ; GCN-DAG: v_add_{{[ui]}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}lds.f64@abs32@lo, [[VBASE]] Index: llvm/test/CodeGen/AMDGPU/ds_write2st64.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_write2st64.ll +++ llvm/test/CodeGen/AMDGPU/ds_write2st64.ll @@ -30,8 +30,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off offset:4{{$}} +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4{{$}} ; GCN-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}} @@ -59,8 +59,8 @@ ; CI-DAG: buffer_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dword [[VAL1:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 -; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], 
{{v\[[0-9]+:[0-9]+\]}}, off offset:4 +; GFX9-DAG: global_load_dword [[VAL0:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}{{$}} +; GFX9-DAG: global_load_dword [[VAL1:v[0-9]+]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:4 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}} ; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] @@ -87,8 +87,8 @@ ; CI-DAG: buffer_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; CI-DAG: buffer_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off{{$}} -; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], {{v\[[0-9]+:[0-9]+\]}}, off offset:8 +; GFX9-DAG: global_load_dwordx2 [[VAL0:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]$}} +; GFX9-DAG: global_load_dwordx2 [[VAL1:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} offset:8 ; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 3, v{{[0-9]+}} ; GCN-DAG: v_add_{{i|u}}32_e32 [[VPTR:v[0-9]+]], {{(vcc, )?}}s{{[0-9]+}}, [[SHL]] Index: llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll +++ llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll @@ -17,7 +17,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32: ; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] ; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -33,7 +33,7 @@ ; GCN: v_mul_legacy_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -48,7 +48,7 @@ ; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -63,7 +63,7 @@ ; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -78,7 +78,7 @@ ; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id 
@@ -93,7 +93,7 @@ ; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -108,7 +108,7 @@ ; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -124,7 +124,7 @@ ; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]] ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -140,7 +140,7 @@ ; GCN: v_mac_f32_e32 [[V]], v{{[0-9]+}}, v{{[0-9]+$}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -157,7 +157,7 @@ ; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]] ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] ; GCN-NOT: 1.0 define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -176,7 +176,7 @@ ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -191,7 +191,7 @@ ; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -207,7 +207,7 @@ ; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id @@ -223,7 +223,7 @@ ; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: 
{{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf16(half addrspace(1)* %arg, float addrspace(1)* %out) #2 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id @@ -239,7 +239,7 @@ ; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}] ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id @@ -255,7 +255,7 @@ ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_max ; GCN-NOT: v_mul -; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -271,7 +271,7 @@ ; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_max ; GCN-NOT: v_mul -; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16(float addrspace(1)* %arg, half addrspace(1)* %out) #2 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -292,7 +292,7 @@ ; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]] ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id @@ -321,7 +321,7 @@ ; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -368,7 +368,7 @@ ; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -384,7 +384,7 @@ ; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ 
-399,7 +399,7 @@ ; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -414,7 +414,7 @@ ; GCN: v_sin_f16_e32 [[V0:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V0]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V0]] define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id @@ -429,7 +429,7 @@ ; GCN: v_cos_f16_e32 [[V0:v[0-9]+]], v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V0]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V0]] define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id @@ -444,7 +444,7 @@ ; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000 ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -462,7 +462,7 @@ ; GCN-NOT: v_max ; GCN-NOT: v_mul -; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -482,7 +482,7 @@ ; GCN-DENORM-NOT: v_max ; GCN-DENORM-NOT: v_mul -; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}] +; GFX9: {{flat|global}}_store_dword define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -497,7 +497,7 @@ ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -541,7 +541,7 @@ ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]] define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -564,7 +564,7 @@ ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]] define amdgpu_kernel void 
@test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -579,7 +579,7 @@ ; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}} ; GCN-NOT: v_max ; GCN-NOT: v_mul -; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id @@ -595,7 +595,7 @@ ; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0 ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id @@ -645,7 +645,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32 ; GFX9-DENORM: global_load_dword [[V:v[0-9]+]], -; GFX9-DENORM: global_store_dword v[{{[0-9:]+}}], [[V]] +; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[V]], s{{\[[0-9]+:[0-9]+\]}} ; GFX9-DENORM-NOT: 1.0 ; GFX9-DENORM-NOT: v_max ; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}} @@ -662,7 +662,7 @@ ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64 ; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]], -; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]] +; GCN: {{flat|global}}_store_dwordx2 v{{.+}}, [[V]] ; GCN-NOT: v_mul_ ; GCN-NOT: v_max_ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 { @@ -679,7 +679,7 @@ ; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]], ; GCN-NOT: v_mul ; GCN-NOT: v_max -; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V]] +; GCN: {{flat|global}}_store_short v{{.+}}, [[V]] define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 { %id = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id Index: llvm/test/CodeGen/AMDGPU/fdiv.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv.f16.ll +++ llvm/test/CodeGen/AMDGPU/fdiv.f16.ll @@ -32,7 +32,7 @@ ; GFX8_9_10: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_LHS]], [[RCP_RHS]] ; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]] ; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]] -; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16( half addrspace(1)* %r, half addrspace(1)* %a, @@ -55,7 +55,7 @@ ; GFX8_9_10-NOT: [[VAL]] ; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GFX8_9_10-NOT: [[RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -73,7 +73,7 @@ ; GFX8_9_10-NOT: [[VAL]] ; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]| ; GFX8_9_10-NOT: [RESULT]] -; GFX8_9_10: {{flat|global}}_store_short 
v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -90,12 +90,12 @@ ; We could not do 1/b -> rcp_f16(b) under !fpmath < 1ulp. ; GCN-LABEL: {{^}}reciprocal_f16_rounded: -; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}} +; GFX8_9_10: {{flat|global}}_load_ushort [[VAL16:v[0-9]+]], v{{.+}} ; GFX8_9_10: v_cvt_f32_f16_e32 [[CVT_TO32:v[0-9]+]], [[VAL16]] ; GFX8_9_10: v_rcp_f32_e32 [[RCP32:v[0-9]+]], [[CVT_TO32]] ; GFX8_9_10: v_cvt_f16_f32_e32 [[CVT_BACK16:v[0-9]+]], [[RCP32]] ; GFX8_9_10: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK16]], [[VAL16]], 1.0 -; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @reciprocal_f16_rounded(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -113,7 +113,7 @@ ; GFX8_9_10-NOT: [[VAL]] ; GFX8_9_10: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GFX8_9_10-NOT: [[RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16_afn(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -131,7 +131,7 @@ ; GFX8_9_10-NOT: [[VAL]] ; GFX8_9_10: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]] ; GFX8_9_10-NOT: [RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -149,7 +149,7 @@ ; GFX8_9_10-NOT: [[VAL]] ; GFX8_9_10: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]] ; GFX8_9_10-NOT: [RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -169,7 +169,7 @@ ; GFX8_9_10: v_sqrt_f16_e32 [[SQRT:v[0-9]+]], [[VAL]] ; GFX8_9_10-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]] ; GFX8_9_10-NOT: [RESULT]] -; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -190,7 +190,7 @@ ; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] ; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] -; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16_afn(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -212,7 +212,7 @@ ; GFX8_9_10: v_rcp_f16_e32 [[RCP:v[0-9]+]], [[RHS]] ; GFX8_9_10: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[LHS]], [[RCP]] -; GFX8_9_10: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX8_9_10: {{flat|global}}_store_short v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* 
%a, half addrspace(1)* %b) #2 { entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll +++ llvm/test/CodeGen/AMDGPU/fmuladd.f16.ll @@ -85,8 +85,8 @@ ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]] +; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -115,8 +115,8 @@ ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] -; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]] +; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -148,9 +148,9 @@ ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] -; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in1, @@ -185,9 +185,9 @@ ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in1, @@ -216,8 +216,8 @@ ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -243,10 +243,10 @@ ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; 
GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], 2.0, [[R1]] -; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -274,10 +274,10 @@ ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[MUL2]] -; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] ; GFX10-DENORM: v_fmac_f16_e32 [[R2]], -2.0, [[R1]] -; GFX10-DENORM: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; GFX10-DENORM: global_store_short v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -302,7 +302,7 @@ ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] ; GFX10-FLUSH: v_add_f16_e32 [[MUL2:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[MUL2]], [[R2]] -; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -335,7 +335,7 @@ ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] -; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -370,9 +370,9 @@ ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] -; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] +; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]] define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -406,7 +406,7 @@ ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]| -; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -442,7 +442,7 @@ ; GFX10-FLUSH: 
v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]] -; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -479,9 +479,9 @@ ; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]] ; GFX10-FLUSH: v_add_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]] -; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REGC]] +; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]] define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -518,7 +518,7 @@ ; GFX10-FLUSH: v_mul_f16_e64 [[TMP:v[0-9]+]], [[REGA]], |[[REGB]]| ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]] -; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tid.ext = sext i32 %tid to i64 @@ -554,9 +554,9 @@ ; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]] -; GFX10-FLUSH: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM-STRICT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] -; GFX10-DENORM-CONTRACT: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]] +; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]] +; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[R2]] define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid @@ -588,7 +588,7 @@ ; GFX10-FLUSH: v_add_f16_e32 [[TMP:v[0-9]+]], [[R1]], [[R1]] ; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]] -; GFX10: global_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]] define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid Index: llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll +++ llvm/test/CodeGen/AMDGPU/fmuladd.v2f16.ll @@ -66,10 +66,10 @@ ; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]] ; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] -; GFX9-FLUSH: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9-FLUSH: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} ; GFX9-DENORM: v_pk_fma_f16 
[[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] -; GFX9-DENORM: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid @@ -90,10 +90,10 @@ ; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]] ; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] -; GFX9-FLUSH: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9-FLUSH: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} ; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] -; GFX9-DENORM: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GFX9-DENORM: global_store_dword v{{[0-9]+}}, [[RESULT]], s{{\[[0-9]+:[0-9]+\]}} define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid @@ -118,7 +118,8 @@ ; GFX9-DENORM-STRICT: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]] ; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] + +; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]] define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1, <2 x half> addrspace(1)* %in2) #0 { Index: llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll +++ llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll @@ -8,10 +8,8 @@ define amdgpu_ps float @global_csub_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GCN-LABEL: global_csub_saddr_i32_rtn: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] glc ; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GCN-NEXT: global_atomic_csub v0, v[2:3], v1, off glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -25,10 +23,8 @@ define amdgpu_ps float @global_csub_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GCN-LABEL: global_csub_saddr_i32_rtn_neg128: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 +; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:-128 glc ; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GCN-NEXT: global_atomic_csub v0, v[2:3], v1, off offset:-128 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 @@ -43,9 +39,7 @@ define amdgpu_ps void @global_csub_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GCN-LABEL: global_csub_saddr_i32_nortn: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GCN-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GCN-NEXT: global_atomic_csub v0, v[2:3], v1, off glc +; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] glc ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -57,9 +51,7 @@ define amdgpu_ps 
void @global_csub_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GCN-LABEL: global_csub_saddr_i32_nortn_neg128: ; GCN: ; %bb.0: -; GCN-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GCN-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GCN-NEXT: global_atomic_csub v0, v[2:3], v1, off offset:-128 glc +; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:-128 glc ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset Index: llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll +++ llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx908.ll @@ -10,10 +10,7 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) { ; GCN-LABEL: global_fadd_saddr_f32_nortn: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: global_atomic_add_f32 v[2:3], v1, off +; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -25,10 +22,7 @@ define amdgpu_ps void @global_fadd_saddr_f32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) { ; GCN-LABEL: global_fadd_saddr_f32_nortn_neg128: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: global_atomic_add_f32 v[2:3], v1, off offset:-128 +; GCN-NEXT: global_atomic_add_f32 v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -41,10 +35,7 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x half> %data) { ; GCN-LABEL: global_fadd_saddr_v2f16_nortn: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: global_atomic_pk_add_f16 v[2:3], v1, off +; GCN-NEXT: global_atomic_pk_add_f16 v0, v1, s[2:3] ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -56,10 +47,7 @@ define amdgpu_ps void @global_fadd_saddr_v2f16_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x half> %data) { ; GCN-LABEL: global_fadd_saddr_v2f16_nortn_neg128: ; GCN: ; %bb.0: -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GCN-NEXT: global_atomic_pk_add_f16 v[2:3], v1, off offset:-128 +; GCN-NEXT: global_atomic_pk_add_f16 v0, v1, s[2:3] offset:-128 ; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset Index: llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -1,28 +1,23 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: 
llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; Test using saddr addressing mode of global_* flat atomic instructions. define amdgpu_ps void @global_xchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_xchg_saddr_i32_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v[2:3], v1, off +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_xchg_saddr_i32_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap v[2:3], v1, off +; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -39,22 +34,17 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_2047: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v[2:3], v1, off offset:2047 +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:2047 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_xchg_saddr_i32_nortn_offset_2047: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap v[2:3], v1, off offset:2047 +; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3] offset:2047 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -72,22 +62,17 @@ define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v[2:3], v1, off offset:-2048 +; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:-2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap v[2:3], v1, off offset:-2048 +; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3] offset:-2048 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -104,23 +89,17 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn(i8 
addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_xchg_saddr_i32_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_xchg_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -136,22 +115,18 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_xchg_saddr_i32_rtn_2048: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:2048 glc +; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[2:3] offset:2048 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_xchg_saddr_i32_rtn_2048: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, 0x800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1] +; GFX10-NEXT: v_add_co_u32_e64 v2, vcc, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc @@ -171,23 +146,17 @@ define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_xchg_saddr_i32_rtn_neg2048: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:-2048 glc +; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[2:3] offset:-2048 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_xchg_saddr_i32_rtn_neg2048: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:-2048 glc +; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[2:3] offset:-2048 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -214,10 +183,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX9-NEXT: ds_read_b64 v[2:3], v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v3 +; GFX9-NEXT: s_nop 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog @@ -225,14 +195,14 @@ ; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: ds_read_b64 v[2:3], v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v3 +; GFX10-NEXT: s_nop 4 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -253,10 +223,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read_b64 v[2:3], v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v3 +; GFX9-NEXT: s_nop 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:42 glc +; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog @@ -264,14 +235,14 @@ ; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset: ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: ds_read_b64 v[2:3], v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v3 +; GFX10-NEXT: s_nop 4 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off offset:42 glc +; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -293,10 +264,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read_b64 v[2:3], v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v3 +; GFX9-NEXT: s_nop 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v[2:3], v1, off +; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm @@ -306,11 +278,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ds_read_b64 v[2:3], v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 
s0, v2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v3 +; GFX10-NEXT: s_nop 4 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap v[2:3], v1, off +; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -331,10 +304,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: ds_read_b64 v[2:3], v2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: v_readfirstlane_b32 s0, v2 +; GFX9-NEXT: v_readfirstlane_b32 s1, v3 +; GFX9-NEXT: s_nop 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap v[2:3], v1, off offset:42 +; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm @@ -344,11 +318,12 @@ ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: ds_read_b64 v[2:3], v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v3 +; GFX10-NEXT: s_nop 4 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap v[2:3], v1, off offset:42 +; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -374,23 +349,17 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_xchg_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_xchg_saddr_i64_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -406,23 +375,17 @@ define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_xchg_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_xchg_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi 
-; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -439,22 +402,17 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_xchg_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off +; GFX9-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_xchg_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off +; GFX10-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -470,22 +428,17 @@ define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_xchg_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_xchg_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_swap_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -506,23 +459,17 @@ define amdgpu_ps float @global_add_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_add_saddr_i32_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_add v0, v0, v1, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_add_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_add v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_add v0, v0, v1, s[2:3] glc ; GFX10-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -538,23 +485,17 @@ define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_add_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v0, v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: global_atomic_add v0, v0, v1, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_add_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_add v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: global_atomic_add v0, v0, v1, s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -571,22 +512,17 @@ define amdgpu_ps void @global_add_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_add_saddr_i32_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v[2:3], v1, off +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_add_saddr_i32_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_add v[2:3], v1, off +; GFX10-NEXT: global_atomic_add v0, v1, s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -602,22 +538,17 @@ define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_add_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add v[2:3], v1, off offset:-128 +; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_add_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_add v[2:3], v1, off offset:-128 +; GFX10-NEXT: global_atomic_add v0, v1, s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -634,23 +565,17 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_add_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: 
v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_add_saddr_i64_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -666,23 +591,17 @@ define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_add_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_add_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_add_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -699,22 +618,17 @@ define amdgpu_ps void @global_add_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_add_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off +; GFX9-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_add_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off +; GFX10-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -730,22 +644,17 @@ define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_add_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_add_x2 
v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_add_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_add_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -766,23 +675,17 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_sub_saddr_i32_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_sub_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_sub v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_sub v0, v0, v1, s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -798,23 +701,17 @@ define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_sub_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v0, v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_sub_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_sub v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: global_atomic_sub v0, v0, v1, s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -831,22 +728,17 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_sub_saddr_i32_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v[2:3], v1, off +; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_sub_saddr_i32_nortn: ; GFX10: ; %bb.0: -; 
GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_sub v[2:3], v1, off +; GFX10-NEXT: global_atomic_sub v0, v1, s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -862,22 +754,17 @@ define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_sub_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub v[2:3], v1, off offset:-128 +; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_sub_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_sub v[2:3], v1, off offset:-128 +; GFX10-NEXT: global_atomic_sub v0, v1, s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -894,23 +781,17 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_sub_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_sub_saddr_i64_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -926,23 +807,17 @@ define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_sub_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_sub_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: 
global_atomic_sub_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -959,22 +834,17 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_sub_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off +; GFX9-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_sub_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off +; GFX10-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -990,22 +860,17 @@ define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_sub_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_sub_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_sub_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1026,23 +891,17 @@ define amdgpu_ps float @global_and_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_and_saddr_i32_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_and v0, v0, v1, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_and_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_and v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_and v0, v0, v1, s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1058,23 +917,17 @@ define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(i8 
addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_and_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v0, v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: global_atomic_and v0, v0, v1, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_and_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_and v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: global_atomic_and v0, v0, v1, s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1091,22 +944,17 @@ define amdgpu_ps void @global_and_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_and_saddr_i32_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[2:3], v1, off +; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_and_saddr_i32_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_and v[2:3], v1, off +; GFX10-NEXT: global_atomic_and v0, v1, s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1122,22 +970,17 @@ define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_and_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and v[2:3], v1, off offset:-128 +; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_and_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_and v[2:3], v1, off offset:-128 +; GFX10-NEXT: global_atomic_and v0, v1, s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1154,23 +997,17 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_and_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: 
global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_and_saddr_i64_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1186,23 +1023,17 @@ define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_and_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_and_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_and_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1219,22 +1050,17 @@ define amdgpu_ps void @global_and_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_and_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off +; GFX9-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_and_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off +; GFX10-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1250,22 +1076,17 @@ define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_and_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_and_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_and_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1286,23 +1107,17 @@ define amdgpu_ps float @global_or_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_or_saddr_i32_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_or v0, v0, v1, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_or_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_or v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_or v0, v0, v1, s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1318,23 +1133,17 @@ define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_or_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v0, v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: global_atomic_or v0, v0, v1, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_or_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_or v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: global_atomic_or v0, v0, v1, s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1351,22 +1160,17 @@ define amdgpu_ps void @global_or_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_or_saddr_i32_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v[2:3], v1, off +; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_or_saddr_i32_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_or v[2:3], v1, off +; GFX10-NEXT: global_atomic_or v0, v1, s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1382,22 +1186,17 @@ define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_or_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or v[2:3], v1, off offset:-128 +; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_or_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_or v[2:3], v1, off offset:-128 +; GFX10-NEXT: global_atomic_or v0, v1, s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1414,23 +1213,17 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_or_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_or_saddr_i64_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1446,23 +1239,17 @@ define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_or_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_or_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_or_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1479,22 +1266,17 @@ define amdgpu_ps void @global_or_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_or_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off +; GFX9-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_or_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off +; GFX10-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1510,22 +1292,17 @@ define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_or_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_or_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_or_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1546,23 +1323,17 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_xor_saddr_i32_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_xor_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_xor v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_xor v0, v0, v1, s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1578,23 +1349,17 @@ define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_xor_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 
v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v0, v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_xor_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_xor v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: global_atomic_xor v0, v0, v1, s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1611,22 +1376,17 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_xor_saddr_i32_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[2:3], v1, off +; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_xor_saddr_i32_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_xor v[2:3], v1, off +; GFX10-NEXT: global_atomic_xor v0, v1, s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1642,22 +1402,17 @@ define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_xor_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor v[2:3], v1, off offset:-128 +; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_xor_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_xor v[2:3], v1, off offset:-128 +; GFX10-NEXT: global_atomic_xor v0, v1, s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1674,23 +1429,17 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_xor_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; 
GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_xor_saddr_i64_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1706,23 +1455,17 @@ define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_xor_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_xor_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1739,22 +1482,17 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_xor_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off +; GFX9-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_xor_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off +; GFX10-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1770,22 +1508,17 @@ define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_xor_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_xor_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: 
v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_xor_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1806,23 +1539,17 @@ define amdgpu_ps float @global_max_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_max_saddr_i32_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smax v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_max_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smax v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1838,23 +1565,17 @@ define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_max_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smax v0, v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_max_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smax v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1871,22 +1592,17 @@ define amdgpu_ps void @global_max_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_max_saddr_i32_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smax v[2:3], v1, off +; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_max_saddr_i32_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smax v[2:3], v1, off +; GFX10-NEXT: global_atomic_smax v0, v1, s[2:3] ; GFX10-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1902,22 +1618,17 @@ define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_max_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smax v[2:3], v1, off offset:-128 +; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_max_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smax v[2:3], v1, off offset:-128 +; GFX10-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -1934,23 +1645,17 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_max_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_max_saddr_i64_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1966,23 +1671,17 @@ define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_max_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_max_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -1999,22 +1698,17 @@ define amdgpu_ps 
void @global_max_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_max_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off +; GFX9-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_max_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off +; GFX10-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2030,22 +1724,17 @@ define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_max_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_max_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smax_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2066,23 +1755,17 @@ define amdgpu_ps float @global_min_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_min_saddr_i32_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smin v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_min_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smin v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2098,23 +1781,17 @@ define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_min_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smin v0, v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_min_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smin v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2131,22 +1808,17 @@ define amdgpu_ps void @global_min_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_min_saddr_i32_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smin v[2:3], v1, off +; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_min_saddr_i32_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smin v[2:3], v1, off +; GFX10-NEXT: global_atomic_smin v0, v1, s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2162,22 +1834,17 @@ define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_min_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smin v[2:3], v1, off offset:-128 +; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_min_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smin v[2:3], v1, off offset:-128 +; GFX10-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2194,23 +1861,17 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_min_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_min_saddr_i64_rtn: ; 
GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2226,23 +1887,17 @@ define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_min_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_min_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2259,22 +1914,17 @@ define amdgpu_ps void @global_min_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_min_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off +; GFX9-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_min_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off +; GFX10-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2290,22 +1940,17 @@ define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_min_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_min_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_smin_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2326,23 +1971,17 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_umax_saddr_i32_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umax v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umax v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2358,23 +1997,17 @@ define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umax v0, v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umax v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2391,22 +2024,17 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_umax_saddr_i32_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umax v[2:3], v1, off +; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umax_saddr_i32_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umax v[2:3], v1, off +; GFX10-NEXT: global_atomic_umax v0, v1, s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv 
@@ -2422,22 +2050,17 @@ define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umax v[2:3], v1, off offset:-128 +; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umax v[2:3], v1, off offset:-128 +; GFX10-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2454,23 +2077,17 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_umax_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i64_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2486,23 +2103,17 @@ define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2519,22 +2130,17 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 
%voffset, i64 %data) { ; GFX9-LABEL: global_umax_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umax_x2 v[3:4], v[1:2], off +; GFX9-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umax_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umax_x2 v[3:4], v[1:2], off +; GFX10-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2550,22 +2156,17 @@ define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umax_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umax_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2586,23 +2187,17 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_umin_saddr_i32_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umin v0, v[2:3], v1, off glc +; GFX9-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umin_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umin v0, v[2:3], v1, off glc +; GFX10-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2618,23 +2213,17 @@ define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umin v0, 
v[2:3], v1, off offset:-128 glc +; GFX9-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umin v0, v[2:3], v1, off offset:-128 glc +; GFX10-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2651,22 +2240,17 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_umin_saddr_i32_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umin v[2:3], v1, off +; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umin_saddr_i32_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umin v[2:3], v1, off +; GFX10-NEXT: global_atomic_umin v0, v1, s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2682,22 +2266,17 @@ define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umin v[2:3], v1, off offset:-128 +; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umin v[2:3], v1, off offset:-128 +; GFX10-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2714,23 +2293,17 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_umin_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umin_saddr_i64_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: 
v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off glc +; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2746,23 +2319,17 @@ define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc +; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2779,22 +2346,17 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_umin_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umin_x2 v[3:4], v[1:2], off +; GFX9-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umin_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umin_x2 v[3:4], v[1:2], off +; GFX10-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2810,22 +2372,17 @@ define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { ; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_umin_x2 v[3:4], v[1:2], off offset:-128 +; GFX9-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_umin_x2 v[3:4], v[1:2], off offset:-128 +; GFX10-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2847,24 +2404,18 @@ ; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off glc +; GFX9-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_cmpxchg_saddr_i32_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off glc +; GFX10-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2882,24 +2433,18 @@ ; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn_neg128: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:-128 glc +; GFX9-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_cmpxchg_saddr_i32_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v0, v[0:1], v[2:3], off offset:-128 glc +; GFX10-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2918,23 +2463,18 @@ ; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off +; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_cmpxchg_saddr_i32_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off +; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2951,23 +2491,18 @@ ; 
GFX9-LABEL: global_cmpxchg_saddr_i32_nortn_neg128: ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v3, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:-128 +; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_cmpxchg_saddr_i32_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 ; GFX10-NEXT: v_mov_b32_e32 v3, v1 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap v[0:1], v[2:3], off offset:-128 +; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -2984,27 +2519,21 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_cmpxchg_saddr_i64_rtn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -3021,27 +2550,21 @@ define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off offset:-128 glc +; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_cmpxchg_saddr_i64_rtn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: ; implicit-def: $vcc_hi +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v[0:1], v[3:6], off offset:-128 glc +; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] offset:-128 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -3059,26 +2582,21 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v[3:6], off +; GFX9-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_cmpxchg_saddr_i64_nortn: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v[3:6], off +; GFX10-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -3094,26 +2612,21 @@ define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) { ; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn_neg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v[3:6], off offset:-128 +; GFX9-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3] offset:-128 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: global_cmpxchg_saddr_i64_nortn_neg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: v_mov_b32_e32 v6, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v[3:6], off offset:-128 +; GFX10-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3] offset:-128 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: buffer_gl0_inv @@ -3135,23 +2648,11 @@ declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 define amdgpu_ps float @global_inc_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_inc_saddr_i32_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_atomic_inc v0, v[2:3], v1, off glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; 
GFX10-LABEL: global_inc_saddr_i32_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_inc v0, v[2:3], v1, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_inc_saddr_i32_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_inc v0, v0, v1, s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -3161,23 +2662,11 @@ } define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_inc_saddr_i32_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_atomic_inc v0, v[2:3], v1, off offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_inc_saddr_i32_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_inc v0, v[2:3], v1, off offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_inc_saddr_i32_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_inc v0, v0, v1, s[2:3] offset:-128 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -3188,20 +2677,10 @@ } define amdgpu_ps void @global_inc_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_inc_saddr_i32_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_atomic_inc v[2:3], v1, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_inc_saddr_i32_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_inc v[2:3], v1, off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_inc_saddr_i32_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_inc v0, v1, s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -3210,20 +2689,10 @@ } define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_inc_saddr_i32_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_atomic_inc v[2:3], v1, off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_inc_saddr_i32_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_inc v[2:3], v1, off offset:-128 -; 
GFX10-NEXT: s_endpgm +; GCN-LABEL: global_inc_saddr_i32_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_inc v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -3233,23 +2702,11 @@ } define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_inc_saddr_i64_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_inc_saddr_i64_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_inc_saddr_i64_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -3259,23 +2716,11 @@ } define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_inc_saddr_i64_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_inc_saddr_i64_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_inc_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_inc_saddr_i64_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -3286,20 +2731,10 @@ } define amdgpu_ps void @global_inc_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_inc_saddr_i64_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_atomic_inc_x2 v[3:4], v[1:2], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_inc_saddr_i64_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_inc_x2 v[3:4], v[1:2], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_inc_saddr_i64_nortn: +; GCN: ; %bb.0: +; 
GCN-NEXT: global_atomic_inc_x2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -3308,20 +2743,10 @@ } define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_inc_saddr_i64_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_atomic_inc_x2 v[3:4], v[1:2], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_inc_saddr_i64_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_inc_x2 v[3:4], v[1:2], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_inc_saddr_i64_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_inc_x2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -3338,23 +2763,11 @@ declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0 define amdgpu_ps float @global_dec_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_dec_saddr_i32_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_atomic_dec v0, v[2:3], v1, off glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_dec_saddr_i32_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_dec v0, v[2:3], v1, off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_dec_saddr_i32_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_dec v0, v0, v1, s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -3364,23 +2777,11 @@ } define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_dec_saddr_i32_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_atomic_dec v0, v[2:3], v1, off offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_dec_saddr_i32_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_dec v0, v[2:3], v1, off offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_dec_saddr_i32_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_dec v0, v0, v1, s[2:3] offset:-128 glc +; 
GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -3391,20 +2792,10 @@ } define amdgpu_ps void @global_dec_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_dec_saddr_i32_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_atomic_dec v[2:3], v1, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_dec_saddr_i32_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_dec v[2:3], v1, off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_dec_saddr_i32_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_dec v0, v1, s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -3413,20 +2804,10 @@ } define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_dec_saddr_i32_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_atomic_dec v[2:3], v1, off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_dec_saddr_i32_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_dec v[2:3], v1, off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_dec_saddr_i32_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_dec v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -3436,23 +2817,11 @@ } define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_dec_saddr_i64_rtn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_dec_saddr_i64_rtn: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_dec_saddr_i64_rtn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -3462,23 +2831,11 @@ } define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 
%voffset, i64 %data) { -; GFX9-LABEL: global_dec_saddr_i64_rtn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_dec_saddr_i64_rtn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_dec_x2 v[0:1], v[3:4], v[1:2], off offset:-128 glc -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_dec_saddr_i64_rtn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -3489,20 +2846,10 @@ } define amdgpu_ps void @global_dec_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_dec_saddr_i64_nortn: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_atomic_dec_x2 v[3:4], v[1:2], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_dec_saddr_i64_nortn: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_dec_x2 v[3:4], v[1:2], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_dec_saddr_i64_nortn: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_dec_x2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -3511,20 +2858,10 @@ } define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_dec_saddr_i64_nortn_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_atomic_dec_x2 v[3:4], v[1:2], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_dec_saddr_i64_nortn_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_atomic_dec_x2 v[3:4], v[1:2], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_dec_saddr_i64_nortn_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_atomic_dec_x2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 Index: llvm/test/CodeGen/AMDGPU/global-saddr-load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-saddr-load.ll +++ llvm/test/CodeGen/AMDGPU/global-saddr-load.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck 
-check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; Test using saddr addressing mode of global_*load_* flat instructions. @@ -10,23 +10,11 @@ ; Basic pattern, no immediate offset. define amdgpu_ps float @global_load_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i8_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i8_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i8_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %load = load i8, i8 addrspace(1)* %gep0 @@ -39,20 +27,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095(i8 addrspace(1)* inreg %sbase, i32 %voffset) { ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -80,11 +64,10 @@ ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4096: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x1000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x1000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -101,20 +84,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg4096(i8 addrspace(1)* inreg %sbase, i32 %voffset) { ; 
GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-4096 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-4096 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4096: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0xfffff000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -142,11 +121,10 @@ ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg4097: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff000, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0xfffff000, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -161,23 +139,11 @@ ; Maximum positive offset on gfx10 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_2047: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:2047 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047 @@ -191,20 +157,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) { ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:2048 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] 
offset:2048 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_2048: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -219,23 +181,11 @@ ; Maximum negative offset on gfx10 define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-2048 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2048: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2048 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048 @@ -249,20 +199,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_neg2049(i8 addrspace(1)* inreg %sbase, i32 %voffset) { ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-2049 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-2049 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_neg2049: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0xfffff800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0xfffff800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, -1, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -279,20 +225,16 @@ define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_offset_4095_gep_order(i8 addrspace(1)* inreg %sbase, i32 %voffset) { ; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: ; GFX9: ; %bb.0: -; GFX9-NEXT: 
v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:4095 +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] offset:4095 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_offset_4095_gep_order: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, s[0:1], s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s[0:1], s3, 0, s[0:1] +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -307,23 +249,11 @@ ; pointer addressing done in integers define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 %add = add i64 %sbase.as.int, %zext.offset @@ -336,23 +266,11 @@ ; zext forced to LHS of addressing expression define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, v0, s2 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s3, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 %add = add i64 %zext.offset, %sbase.as.int @@ -365,23 +283,11 @@ ; zext forced to LHS of addressing expression, with immediate offset define amdgpu_ps float 
@global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, v0, s2 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s3, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 %add = add i64 %zext.offset, %sbase.as.int @@ -395,23 +301,11 @@ ; zext forced to LHS of addressing expression, with immediate offset in non-canonical position define amdgpu_ps float @global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %sbase.as.int = ptrtoint i8 addrspace(1)* %sbase to i64 %add.immoffset = add i64 %sbase.as.int, 128 @@ -431,28 +325,17 @@ ; Base pointer is uniform, but also in VGPRs define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) { -; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: ds_read_b64 v[1:2], v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: ds_read_b64 v[1:2], v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v1, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_uniform_ptr_in_vgprs: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: ds_read_b64 v[1:2], v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s0, v1 +; GCN-NEXT: v_readfirstlane_b32 s1, v2 +; GCN-NEXT: s_nop 4 +; GCN-NEXT: global_load_ubyte v0, v0, s[0:1] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -464,28 +347,17 @@ ; Base pointer is uniform, but also in VGPRs, with imm offset define amdgpu_ps float @global_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) { -; GFX9-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: ds_read_b64 v[1:2], v1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v1, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:42 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: ds_read_b64 v[1:2], v1 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v1, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v2, vcc_lo -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:42 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_uniform_ptr_in_vgprs_immoffset: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: ds_read_b64 v[1:2], v1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s0, v1 +; GCN-NEXT: v_readfirstlane_b32 s1, v2 +; GCN-NEXT: s_nop 4 +; GCN-NEXT: global_load_ubyte v0, v0, s[0:1] offset:42 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -500,12 +372,8 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { ; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s2, s4 -; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: global_load_ubyte v0, v[0:1], off +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 @@ -520,12 +388,8 @@ define amdgpu_ps float @global_load_saddr_i8_zext_uniform_offset_immoffset(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { ; GCN-LABEL: global_load_saddr_i8_zext_uniform_offset_immoffset: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s2, s4 -; GCN-NEXT: s_addc_u32 s1, s3, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: global_load_ubyte v0, v[0:1], off offset:-24 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-24 ; GCN-NEXT: s_waitcnt vmcnt(0) 
; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 @@ -541,12 +405,8 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { ; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s4, s2 -; GCN-NEXT: s_addc_u32 s1, 0, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: global_load_ubyte v0, v[0:1], off +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 @@ -563,12 +423,8 @@ define amdgpu_ps float @global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(i8 addrspace(1)* inreg %sbase, i32 inreg %soffset) { ; GCN-LABEL: global_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0: ; GCN: ; %bb.0: -; GCN-NEXT: s_add_u32 s0, s4, s2 -; GCN-NEXT: s_addc_u32 s1, 0, s3 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: ; implicit-def: $vcc_hi -; GCN-NEXT: global_load_ubyte v0, v[0:1], off offset:128 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:128 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %soffset to i64 @@ -594,9 +450,8 @@ ; ; GFX10-LABEL: global_load_i8_vgpr64_sgpr32: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, s2 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, v0, s2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -620,11 +475,10 @@ ; ; GFX10-LABEL: global_load_i8_vgpr64_sgpr32_offset_4095: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, v0, s2 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, 0x800, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, v0, s2 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, 0x800, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, 0, v1, vcc ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -660,11 +514,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -677,27 +530,13 @@ ; Cannot push the shift into 32-bits, with an immediate offset. 
define amdgpu_ps float @global_load_saddr_f32_natural_addressing_immoffset(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { -; GFX9-LABEL: global_load_saddr_f32_natural_addressing_immoffset: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_f32_natural_addressing_immoffset: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_f32_natural_addressing_immoffset: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -709,29 +548,14 @@ ; Range is sufficiently restricted to push the shift into 32-bits. define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { -; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: global_load_dword v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0 %zext.offset = zext i32 %voffset to i64 %gep = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset @@ -741,29 +565,14 @@ ; Range is sufficiently restricted to push the shift into 32-bits, with an imm offset define amdgpu_ps float @global_load_f32_saddr_zext_vgpr_range_imm_offset(float addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr) { -; GFX9-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; 
GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:400 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:400 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_f32_saddr_zext_vgpr_range_imm_offset: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:400 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %voffset = load i32, i32 addrspace(1)* %voffset.ptr, !range !0 %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds float, float addrspace(1)* %sbase, i64 %zext.offset @@ -791,11 +600,10 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] -; GFX10-NEXT: v_add_co_u32_e64 v0, vcc_lo, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX10-NEXT: v_add_co_u32_e64 v0, vcc, s2, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc, s3, v1, vcc ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: ; return to shader part epilog @@ -811,23 +619,11 @@ ; -------------------------------------------------------------------------------- define amdgpu_ps half @global_load_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ushort v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -837,23 +633,11 @@ } define amdgpu_ps half @global_load_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i16_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to 
shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -864,23 +648,11 @@ } define amdgpu_ps half @global_load_saddr_f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_f16: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ushort v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)* @@ -889,23 +661,11 @@ } define amdgpu_ps half @global_load_saddr_f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_f16_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_f16_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_f16_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -915,23 +675,11 @@ } define amdgpu_ps float @global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i32: 
-; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i32: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -941,23 +689,11 @@ } define amdgpu_ps float @global_load_saddr_i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i32_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i32_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i32_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -968,23 +704,11 @@ } define amdgpu_ps float @global_load_saddr_f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_f32: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)* @@ -993,23 +717,11 @@ } define amdgpu_ps float @global_load_saddr_f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_f32_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_f32_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; 
GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_f32_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1019,23 +731,11 @@ } define amdgpu_ps <2 x half> @global_load_saddr_v2i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2i16: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i16> addrspace(1)* @@ -1045,23 +745,11 @@ } define amdgpu_ps <2 x half> @global_load_saddr_v2i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2i16_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2i16_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2i16_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1072,23 +760,11 @@ } define amdgpu_ps <2 x half> @global_load_saddr_v2f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: 
$vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2f16: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x half> addrspace(1)* @@ -1097,23 +773,11 @@ } define amdgpu_ps <2 x half> @global_load_saddr_v2f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2f16_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2f16_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2f16_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1123,23 +787,11 @@ } define amdgpu_ps <2 x half> @global_load_saddr_p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_p3: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_p3: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_p3: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)* @@ -1150,23 +802,11 @@ } define amdgpu_ps <2 x half> @global_load_saddr_p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_p3_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_p3_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: 
v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_p3_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1178,23 +818,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_f64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_f64: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_f64: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_f64: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)* @@ -1204,23 +832,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_f64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_f64_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_f64_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_f64_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1231,23 +847,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: 
v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i64: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -1257,23 +861,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i64_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i64_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i64_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1284,23 +876,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_v2f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2f32: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)* @@ -1309,23 +889,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_v2f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2f32_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2f32_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: 
$vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2f32_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1335,23 +903,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_v2i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)* @@ -1361,23 +917,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_v2i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2i32_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2i32_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2i32_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1388,23 +932,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_v4i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v4i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v4i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; 
GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v4i16: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)* @@ -1414,23 +946,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_v4i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v4i16_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v4i16_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v4i16_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1441,23 +961,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_v4f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v4f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v4f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v4f16: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)* @@ -1467,23 +975,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_v4f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v4f16_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v4f16_immneg128: -; GFX10: ; %bb.0: -; 
GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v4f16_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1494,23 +990,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_p1: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_p1: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_p1: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)* @@ -1521,23 +1005,11 @@ } define amdgpu_ps <2 x float> @global_load_saddr_p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_p1_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_p1_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_p1_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1549,23 +1021,11 @@ } define amdgpu_ps <3 x float> @global_load_saddr_v3f32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v3f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v3f32: -; GFX10: 
; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v3f32: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)* @@ -1574,23 +1034,11 @@ } define amdgpu_ps <3 x float> @global_load_saddr_v3f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v3f32_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v3f32_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v3f32_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1600,23 +1048,11 @@ } define amdgpu_ps <3 x float> @global_load_saddr_v3i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v3i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v3i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v3i32: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)* @@ -1626,50 +1062,26 @@ } define amdgpu_ps <3 x float> @global_load_saddr_v3i32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v3i32_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: 
global_load_saddr_v3i32_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v3i32_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 %gep1.cast = bitcast i8 addrspace(1)* %gep1 to <3 x i32> addrspace(1)* %load = load <3 x i32>, <3 x i32> addrspace(1)* %gep1.cast %cast.load = bitcast <3 x i32> %load to <3 x float> - ret <3 x float> %cast.load -} - -define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v6f16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v6f16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog + ret <3 x float> %cast.load +} + +define amdgpu_ps <6 x half> @global_load_saddr_v6f16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { +; GCN-LABEL: global_load_saddr_v6f16: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)* @@ -1678,23 +1090,11 @@ } define amdgpu_ps <6 x half> @global_load_saddr_v6f16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v6f16_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v6f16_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx3 v[0:2], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v6f16_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx3 v[0:2], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1704,23 +1104,11 @@ } define amdgpu_ps <4 x float> @global_load_saddr_v4f32(i8 addrspace(1)* inreg 
%sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v4f32: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v4f32: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v4f32: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)* @@ -1729,23 +1117,11 @@ } define amdgpu_ps <4 x float> @global_load_saddr_v4f32_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v4f32_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v4f32_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v4f32_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1755,23 +1131,11 @@ } define amdgpu_ps <4 x float> @global_load_saddr_v4i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v4i32: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v4i32: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v4i32: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)* @@ -1781,23 +1145,11 @@ } define amdgpu_ps <4 x float> @global_load_saddr_v4i32_immneg128(i8 
addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v4i32_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v4i32_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v4i32_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1808,23 +1160,11 @@ } define amdgpu_ps <4 x float> @global_load_saddr_v2i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2i64: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2i64: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2i64: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)* @@ -1834,23 +1174,11 @@ } define amdgpu_ps <4 x float> @global_load_saddr_v2i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2i64_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2i64_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2i64_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1861,23 +1189,11 
@@ } define amdgpu_ps <4 x float> @global_load_saddr_i128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i128 addrspace(1)* @@ -1887,23 +1203,11 @@ } define amdgpu_ps <4 x float> @global_load_saddr_i128_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i128_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i128_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i128_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1914,23 +1218,11 @@ } define amdgpu_ps <4 x float> @global_load_saddr_v2p1(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2p1: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2p1: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2p1: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)* @@ -1941,23 +1233,11 @@ } 
define amdgpu_ps <4 x float> @global_load_saddr_v2p1_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v2p1_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v2p1_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v2p1_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1969,23 +1249,11 @@ } define amdgpu_ps <4 x float> @global_load_saddr_v4p3(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v4p3: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v4p3: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v4p3: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)* @@ -1996,23 +1264,11 @@ } define amdgpu_ps <4 x float> @global_load_saddr_v4p3_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_v4p3_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_v4p3_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_v4p3_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dwordx4 v[0:3], v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr 
inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2028,23 +1284,11 @@ ; -------------------------------------------------------------------------------- define amdgpu_ps float @global_sextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_sextload_saddr_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_sbyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_sextload_saddr_i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_sbyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_sextload_saddr_i8: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %load = load i8, i8 addrspace(1)* %gep0 @@ -2054,23 +1298,11 @@ } define amdgpu_ps float @global_sextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_sextload_saddr_i8_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_sbyte v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_sextload_saddr_i8_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_sbyte v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_sextload_saddr_i8_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_sbyte v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2081,23 +1313,11 @@ } define amdgpu_ps float @global_sextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_sextload_saddr_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_sshort v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_sextload_saddr_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_sshort v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_sextload_saddr_i16: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_sshort v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 
addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2108,23 +1328,11 @@ } define amdgpu_ps float @global_sextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_sextload_saddr_i16_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_sshort v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_sextload_saddr_i16_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_sshort v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_sextload_saddr_i16_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_sshort v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2136,23 +1344,11 @@ } define amdgpu_ps float @global_zextload_saddr_i8(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_zextload_saddr_i8: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_zextload_saddr_i8: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_zextload_saddr_i8: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %load = load i8, i8 addrspace(1)* %gep0 @@ -2162,23 +1358,11 @@ } define amdgpu_ps float @global_zextload_saddr_i8_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_zextload_saddr_i8_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ubyte v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_zextload_saddr_i8_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_zextload_saddr_i8_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 
addrspace(1)* %gep0, i64 -128 @@ -2189,23 +1373,11 @@ } define amdgpu_ps float @global_zextload_saddr_i16(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_zextload_saddr_i16: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_zextload_saddr_i16: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ushort v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_zextload_saddr_i16: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ushort v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2216,23 +1388,11 @@ } define amdgpu_ps float @global_zextload_saddr_i16_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_zextload_saddr_i16_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_zextload_saddr_i16_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ushort v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_zextload_saddr_i16_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ushort v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2250,23 +1410,17 @@ define amdgpu_ps float @atomic_global_load_saddr_i32(i8 addrspace(1)* inreg %sbase, i32 %voffset) { ; GFX9-LABEL: atomic_global_load_saddr_i32: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_global_load_saddr_i32: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2282,23 +1436,17 @@ define amdgpu_ps float @atomic_global_load_saddr_i32_immneg128(i8 
addrspace(1)* inreg %sbase, i32 %voffset) { ; GFX9-LABEL: atomic_global_load_saddr_i32_immneg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:-128 glc +; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_global_load_saddr_i32_immneg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dword v0, v[0:1], off offset:-128 glc dlc +; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2315,23 +1463,17 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64(i8 addrspace(1)* inreg %sbase, i32 %voffset) { ; GFX9-LABEL: atomic_global_load_saddr_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_global_load_saddr_i64: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2347,23 +1489,17 @@ define amdgpu_ps <2 x float> @atomic_global_load_saddr_i64_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { ; GFX9-LABEL: atomic_global_load_saddr_i64_immneg128: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 glc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: atomic_global_load_saddr_i64_immneg128: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:-128 glc dlc +; GFX10-NEXT: global_load_dwordx2 v[0:1], v0, s[2:3] offset:-128 glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_gl0_inv ; GFX10-NEXT: buffer_gl1_inv @@ -2382,23 +1518,11 @@ ; -------------------------------------------------------------------------------- define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: 
global_load_saddr_i16_d16lo_undef_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_short_d16 v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16lo_undef_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_short_d16 v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2409,23 +1533,11 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_short_d16 v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_short_d16 v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16lo_undef_hi_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_short_d16 v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2437,27 +1549,13 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i16_d16lo_zero_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_short_d16 v1, v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16lo_zero_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: global_load_short_d16 v1, v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog 
%zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2468,27 +1566,13 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_short_d16 v1, v[2:3], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: global_load_short_d16 v1, v[2:3], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16lo_zero_hi_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2500,25 +1584,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16lo_reg_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_short_d16 v1, v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16lo_reg_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_short_d16 v1, v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2529,25 +1600,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_short_d16 v1, v[2:3], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: 
global_load_saddr_i16_d16lo_reg_hi_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_short_d16 v1, v[2:3], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16lo_reg_hi_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_short_d16 v1, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2559,25 +1617,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ubyte_d16 v1, v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte_d16 v1, v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* @@ -2589,25 +1634,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ubyte_d16 v1, v[2:3], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte_d16 v1, v[2:3], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte_d16 v1, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 
addrspace(1)* %gep0, i64 -128 @@ -2620,25 +1652,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_sbyte_d16 v1, v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_sbyte_d16 v1, v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* @@ -2650,25 +1669,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_sbyte_d16 v1, v[2:3], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_sbyte_d16 v1, v[2:3], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_sbyte_d16 v1, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2685,23 +1691,11 @@ ; -------------------------------------------------------------------------------- define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i16_d16hi_undef_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_short_d16_hi v0, v[0:1], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16hi_undef_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 
0, s0 -; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2712,23 +1706,11 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_undef_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_short_d16_hi v0, v[0:1], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_load_short_d16_hi v0, v[0:1], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16hi_undef_hi_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_short_d16_hi v0, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2740,25 +1722,13 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i16_d16hi_zero_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_short_d16_hi v0, v[1:2], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16hi_zero_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v1, s0, s2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s3, 0, s0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2769,25 +1739,13 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zero_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset) { -; GFX9-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v0 -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; 
GFX9-NEXT: global_load_short_d16_hi v0, v[1:2], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v1, s0, s2, v0 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s0, s3, 0, s0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16hi_zero_hi_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2799,25 +1757,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16hi_reg_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_short_d16_hi v1, v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16hi_reg_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_short_d16_hi v1, v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -2828,25 +1773,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_short_d16_hi v1, v[2:3], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_short_d16_hi v1, v[2:3], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16hi_reg_hi_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_short_d16_hi v1, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part 
epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2858,25 +1790,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ubyte_d16_hi v1, v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte_d16_hi v1, v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* @@ -2888,25 +1807,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ubyte_d16_hi v1, v[2:3], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_ubyte_d16_hi v1, v[2:3], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_ubyte_d16_hi v1, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -2919,25 +1825,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_sbyte_d16_hi v1, v[2:3], off -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: 
global_load_saddr_i16_d16hi_sexti8_reg_hi: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_sbyte_d16_hi v1, v[2:3], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* @@ -2949,25 +1842,12 @@ } define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %reg) { -; GFX9-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_sbyte_d16_hi v1, v[2:3], off offset:-128 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-NEXT: ; return to shader part epilog -; -; GFX10-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_load_sbyte_d16_hi v1, v[2:3], off offset:-128 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, v1 -; GFX10-NEXT: ; return to shader part epilog +; GCN-LABEL: global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_sbyte_d16_hi v1, v0, s[2:3] offset:-128 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, v1 +; GCN-NEXT: ; return to shader part epilog %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 Index: llvm/test/CodeGen/AMDGPU/global-saddr-store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/global-saddr-store.ll +++ llvm/test/CodeGen/AMDGPU/global-saddr-store.ll @@ -1,29 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GCN,GFX10 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; Test using saddr addressing mode of global_*store_* flat instructions. 
define amdgpu_ps void @global_store_saddr_i8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr, i8 %data) { -; GFX9-LABEL: global_store_saddr_i8_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_byte v[0:1], v2, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_i8_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_store_byte v[0:1], v2, off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_i8_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_byte v0, v2, s[2:3] +; GCN-NEXT: s_endpgm %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -33,25 +20,12 @@ ; Maximum positive offset on gfx10 define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_2047(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr, i8 %data) { -; GFX9-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_byte v[0:1], v2, off offset:2047 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_store_byte v[0:1], v2, off offset:2047 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_i8_zext_vgpr_offset_2047: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_byte v0, v2, s[2:3] offset:2047 +; GCN-NEXT: s_endpgm %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -62,25 +36,12 @@ ; Maximum negative offset on gfx10 define amdgpu_ps void @global_store_saddr_i8_zext_vgpr_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 addrspace(1)* %voffset.ptr, i8 %data) { -; GFX9-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048: -; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_store_byte v[0:1], v2, off offset:-2048 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048: -; GFX10: ; %bb.0: -; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0 -; GFX10-NEXT: global_store_byte v[0:1], v2, off offset:-2048 -; 
GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_i8_zext_vgpr_offset_neg2048: +; GCN: ; %bb.0: +; GCN-NEXT: global_load_dword v0, v[0:1], off +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_byte v0, v2, s[2:3] offset:-2048 +; GCN-NEXT: s_endpgm %voffset = load i32, i32 addrspace(1)* %voffset.ptr %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -97,26 +58,16 @@ ; Base pointer is uniform, but also in VGPRs define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) { -; GFX9-LABEL: global_store_saddr_uniform_ptr_in_vgprs: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: ds_read_b64 v[2:3], v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_byte v[2:3], v1, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_uniform_ptr_in_vgprs: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: ds_read_b64 v[2:3], v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: global_store_byte v[2:3], v1, off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_uniform_ptr_in_vgprs: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: ds_read_b64 v[2:3], v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s0, v2 +; GCN-NEXT: v_readfirstlane_b32 s1, v3 +; GCN-NEXT: s_nop 4 +; GCN-NEXT: global_store_byte v0, v1, s[0:1] +; GCN-NEXT: s_endpgm %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -126,26 +77,16 @@ ; Base pointer is uniform, but also in VGPRs, with imm offset define amdgpu_ps void @global_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset, i8 %data) { -; GFX9-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: ds_read_b64 v[2:3], v2 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_byte v[2:3], v1, off offset:-120 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: ds_read_b64 v[2:3], v2 -; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, v2, v0 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX10-NEXT: global_store_byte v[2:3], v1, off offset:-120 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_uniform_ptr_in_vgprs_immoffset: +; GCN: ; %bb.0: +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: ds_read_b64 v[2:3], v2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_readfirstlane_b32 s0, v2 +; GCN-NEXT: v_readfirstlane_b32 s1, v3 +; GCN-NEXT: s_nop 4 +; GCN-NEXT: global_store_byte v0, v1, s[0:1] offset:-120 +; GCN-NEXT: s_endpgm %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset @@ -159,21 +100,10 @@ ; -------------------------------------------------------------------------------- define amdgpu_ps void 
@global_store_saddr_i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i16 %data) { -; GFX9-LABEL: global_store_saddr_i16_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_short v[2:3], v1, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_i16_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_short v[2:3], v1, off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_i16_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_short v0, v1, s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)* @@ -182,21 +112,10 @@ } define amdgpu_ps void @global_store_saddr_i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i16 %data) { -; GFX9-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_short v[2:3], v1, off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_short v[2:3], v1, off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_i16_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_short v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -206,21 +125,10 @@ } define amdgpu_ps void @global_store_saddr_f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, half %data) { -; GFX9-LABEL: global_store_saddr_f16_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_short v[2:3], v1, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_f16_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_short v[2:3], v1, off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_f16_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_short v0, v1, s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to half addrspace(1)* @@ -229,21 +137,10 @@ } define amdgpu_ps void @global_store_saddr_f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, half %data) { -; GFX9-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_short v[2:3], v1, off offset:-128 -; GFX9-NEXT: s_endpgm -; -; 
GFX10-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_short v[2:3], v1, off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_f16_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_short v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -253,21 +150,10 @@ } define amdgpu_ps void @global_store_saddr_i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_store_saddr_i32_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_dword v[2:3], v1, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_i32_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dword v[2:3], v1, off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_i32_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dword v0, v1, s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)* @@ -276,21 +162,10 @@ } define amdgpu_ps void @global_store_saddr_i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { -; GFX9-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_dword v[2:3], v1, off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dword v[2:3], v1, off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_i32_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dword v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -300,21 +175,10 @@ } define amdgpu_ps void @global_store_saddr_f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) { -; GFX9-LABEL: global_store_saddr_f32_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_dword v[2:3], v1, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_f32_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dword v[2:3], v1, off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_f32_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: 
global_store_dword v0, v1, s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to float addrspace(1)* @@ -323,21 +187,10 @@ } define amdgpu_ps void @global_store_saddr_f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, float %data) { -; GFX9-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_dword v[2:3], v1, off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dword v[2:3], v1, off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_f32_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dword v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -347,21 +200,10 @@ } define amdgpu_ps void @global_store_saddr_p3_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(3)* %data) { -; GFX9-LABEL: global_store_saddr_p3_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_dword v[2:3], v1, off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_p3_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dword v[2:3], v1, off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_p3_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dword v0, v1, s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(3)* addrspace(1)* @@ -370,21 +212,10 @@ } define amdgpu_ps void @global_store_saddr_p3_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(3)* %data) { -; GFX9-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_dword v[2:3], v1, off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dword v[2:3], v1, off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_p3_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dword v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -394,21 +225,10 @@ } define amdgpu_ps void 
@global_store_saddr_i64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_store_saddr_i64_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_i64_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_i64_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)* @@ -417,21 +237,10 @@ } define amdgpu_ps void @global_store_saddr_i64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) { -; GFX9-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_i64_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -441,21 +250,10 @@ } define amdgpu_ps void @global_store_saddr_f64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, double %data) { -; GFX9-LABEL: global_store_saddr_f64_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_f64_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_f64_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to double addrspace(1)* @@ -464,21 +262,10 @@ } define amdgpu_ps void @global_store_saddr_f64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, double %data) { -; GFX9-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 
v[3:4], v[1:2], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_f64_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -488,21 +275,10 @@ } define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i32> %data) { -; GFX9-LABEL: global_store_saddr_v2i32_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v2i32_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v2i32_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i32> addrspace(1)* @@ -511,21 +287,10 @@ } define amdgpu_ps void @global_store_saddr_v2i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i32> %data) { -; GFX9-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v2i32_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -535,21 +300,10 @@ } define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x float> %data) { -; GFX9-LABEL: global_store_saddr_v2f32_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v2f32_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 
-; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v2f32_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x float> addrspace(1)* @@ -558,21 +312,10 @@ } define amdgpu_ps void @global_store_saddr_v2f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x float> %data) { -; GFX9-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v2f32_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -582,21 +325,10 @@ } define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i16> %data) { -; GFX9-LABEL: global_store_saddr_v4i16_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v4i16_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v4i16_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i16> addrspace(1)* @@ -605,21 +337,10 @@ } define amdgpu_ps void @global_store_saddr_v4i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i16> %data) { -; GFX9-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v4i16_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: 
s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -629,21 +350,10 @@ } define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x half> %data) { -; GFX9-LABEL: global_store_saddr_v4f16_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v4f16_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v4f16_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x half> addrspace(1)* @@ -652,21 +362,10 @@ } define amdgpu_ps void @global_store_saddr_v4f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x half> %data) { -; GFX9-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v4f16_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -676,21 +375,10 @@ } define amdgpu_ps void @global_store_saddr_p1_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i8 addrspace(1)* %data) { -; GFX9-LABEL: global_store_saddr_p1_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_p1_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_p1_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to i8 addrspace(1)* addrspace(1)* @@ -699,21 +387,10 @@ } define amdgpu_ps void @global_store_saddr_p1_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg 
%sbase, i32 %voffset, i8 addrspace(1)* %data) { -; GFX9-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v4, s3 -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc -; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_p1_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -723,21 +400,10 @@ } define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x i32> %data) { -; GFX9-LABEL: global_store_saddr_v3i32_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v3i32_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v3i32_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x i32> addrspace(1)* @@ -746,21 +412,10 @@ } define amdgpu_ps void @global_store_saddr_v3i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x i32> %data) { -; GFX9-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v3i32_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -770,21 +425,10 @@ } define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x float> %data) { -; GFX9-LABEL: global_store_saddr_v3f32_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, 
vcc -; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v3f32_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v3f32_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <3 x float> addrspace(1)* @@ -793,21 +437,10 @@ } define amdgpu_ps void @global_store_saddr_v3f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <3 x float> %data) { -; GFX9-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v3f32_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -817,21 +450,10 @@ } define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x i16> %data) { -; GFX9-LABEL: global_store_saddr_v6i16_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v6i16_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v6i16_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x i16> addrspace(1)* @@ -840,21 +462,10 @@ } define amdgpu_ps void @global_store_saddr_v6i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x i16> %data) { -; GFX9-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: 
v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v6i16_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -864,21 +475,10 @@ } define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x half> %data) { -; GFX9-LABEL: global_store_saddr_v6f16_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v6f16_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v6f16_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <6 x half> addrspace(1)* @@ -887,21 +487,10 @@ } define amdgpu_ps void @global_store_saddr_v6f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <6 x half> %data) { -; GFX9-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v5, s3 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v4, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v5, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx3 v[4:5], v[1:3], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v6f16_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx3 v0, v[1:3], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -911,21 +500,10 @@ } define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i32> %data) { -; GFX9-LABEL: global_store_saddr_v4i32_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v4i32_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v4i32_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = 
zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i32> addrspace(1)* @@ -934,21 +512,10 @@ } define amdgpu_ps void @global_store_saddr_v4i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i32> %data) { -; GFX9-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v4i32_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -958,21 +525,10 @@ } define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x float> %data) { -; GFX9-LABEL: global_store_saddr_v4f32_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v4f32_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v4f32_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x float> addrspace(1)* @@ -981,21 +537,10 @@ } define amdgpu_ps void @global_store_saddr_v4f32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x float> %data) { -; GFX9-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v4f32_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1005,21 +550,10 @@ } define amdgpu_ps void 
@global_store_saddr_v2i64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i64> %data) { -; GFX9-LABEL: global_store_saddr_v2i64_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v2i64_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v2i64_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i64> addrspace(1)* @@ -1028,21 +562,10 @@ } define amdgpu_ps void @global_store_saddr_v2i64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i64> %data) { -; GFX9-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v2i64_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1052,21 +575,10 @@ } define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x double> %data) { -; GFX9-LABEL: global_store_saddr_v2f64_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v2f64_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v2f64_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x double> addrspace(1)* @@ -1075,21 +587,10 @@ } define amdgpu_ps void @global_store_saddr_v2f64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x double> %data) { -; GFX9-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: 
v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v2f64_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1099,21 +600,10 @@ } define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x i16> %data) { -; GFX9-LABEL: global_store_saddr_v8i16_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v8i16_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v8i16_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <8 x i16> addrspace(1)* @@ -1122,21 +612,10 @@ } define amdgpu_ps void @global_store_saddr_v8i16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x i16> %data) { -; GFX9-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v8i16_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1146,21 +625,10 @@ } define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x half> %data) { -; GFX9-LABEL: global_store_saddr_v8f16_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v8f16_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; 
GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v8f16_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <8 x half> addrspace(1)* @@ -1169,21 +637,10 @@ } define amdgpu_ps void @global_store_saddr_v8f16_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <8 x half> %data) { -; GFX9-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v8f16_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1193,21 +650,10 @@ } define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i8 addrspace(1)*> %data) { -; GFX9-LABEL: global_store_saddr_v2p1_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v2p1_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v2p1_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <2 x i8 addrspace(1)*> addrspace(1)* @@ -1216,21 +662,10 @@ } define amdgpu_ps void @global_store_saddr_v2p1_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i8 addrspace(1)*> %data) { -; GFX9-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v2p1_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: 
global_store_saddr_v2p1_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1240,21 +675,10 @@ } define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i8 addrspace(3)*> %data) { -; GFX9-LABEL: global_store_saddr_v4p3_zext_vgpr: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v4p3_zext_vgpr: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v4p3_zext_vgpr: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep0.cast = bitcast i8 addrspace(1)* %gep0 to <4 x i8 addrspace(3)*> addrspace(1)* @@ -1263,21 +687,10 @@ } define amdgpu_ps void @global_store_saddr_v4p3_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <4 x i8 addrspace(3)*> %data) { -; GFX9-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v6, s3 -; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc -; GFX9-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v5, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s0, s3, 0, s0 -; GFX10-NEXT: global_store_dwordx4 v[5:6], v[1:4], off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_v4p3_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_dwordx4 v0, v[1:4], s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 @@ -1293,21 +706,15 @@ define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) { ; GFX9-LABEL: atomic_global_store_saddr_i32_zext_vgpr: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: atomic_global_store_saddr_i32_zext_vgpr: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-NEXT: global_store_dword v[2:3], v1, off +; GFX10-NEXT: global_store_dword v0, v1, s[2:3] ; GFX10-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr 
inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
@@ -1319,25 +726,15 @@
define amdgpu_ps void @atomic_global_store_saddr_i32_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
; GFX9-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v2, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dword v[2:3], v1, off
+; GFX9-NEXT: global_store_dword v0, v1, s[2:3] offset:-128
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_global_store_saddr_i32_zext_vgpr_offset_neg128:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
-; GFX10-NEXT: v_add_co_u32_e64 v2, vcc_lo, 0xffffff80, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, -1, v3, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_dword v[2:3], v1, off
+; GFX10-NEXT: global_store_dword v0, v1, s[2:3] offset:-128
; GFX10-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
@@ -1350,21 +747,15 @@
define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: atomic_global_store_saddr_i64_zext_vgpr:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v4, s3
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s2, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_global_store_saddr_i64_zext_vgpr:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_co_u32_e64 v3, s0, s2, v0
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3]
; GFX10-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
@@ -1376,25 +767,15 @@
define amdgpu_ps void @atomic_global_store_saddr_i64_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
; GFX9-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128:
; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v3, vcc
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0xffffff80, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v4, vcc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX9-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX9-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: atomic_global_store_saddr_i64_zext_vgpr_offset_neg128:
; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s2, v0
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_add_co_ci_u32_e64 v4, s0, s3, 0, s0
-; GFX10-NEXT: v_add_co_u32_e64 v3, vcc_lo, 0xffffff80, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v4, vcc_lo
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_store_dwordx2 v[3:4], v[1:2], off
+; GFX10-NEXT: global_store_dwordx2 v0, v[1:2], s[2:3] offset:-128
; GFX10-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
@@ -1409,21 +790,10 @@
; --------------------------------------------------------------------------------
define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
-; GFX9-LABEL: global_store_saddr_i16_d16hi_zext_vgpr:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_short_d16_hi v[2:3], v1, off
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_store_saddr_i16_d16hi_zext_vgpr:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
-; GFX10-NEXT: global_store_short_d16_hi v[2:3], v1, off
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_store_saddr_i16_d16hi_zext_vgpr:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_store_short_d16_hi v0, v1, s[2:3]
+; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
%gep0.cast = bitcast i8 addrspace(1)* %gep0 to i16 addrspace(1)*
@@ -1433,21 +803,10 @@
}
define amdgpu_ps void @global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
-; GFX9-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_short_d16_hi v[2:3], v1, off offset:-128
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
-; GFX10-NEXT: global_store_short_d16_hi v[2:3], v1, off offset:-128
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_store_short_d16_hi v0, v1, s[2:3] offset:-128
+; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
@@ -1458,21 +817,10 @@
}
define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) {
-; GFX9-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT: global_store_byte_d16_hi v[2:3], v1, off
-; GFX9-NEXT: s_endpgm
-;
-; GFX10-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0
-; GFX10-NEXT: ; implicit-def: $vcc_hi
-; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0
-; GFX10-NEXT: global_store_byte_d16_hi v[2:3], v1, off
-; GFX10-NEXT: s_endpgm
+; GCN-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr:
+; GCN: ; %bb.0:
+; GCN-NEXT: global_store_byte_d16_hi v0, v1, s[2:3]
+; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 =
getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %data.hi = extractelement <2 x i16> %data, i32 1 @@ -1482,21 +830,10 @@ } define amdgpu_ps void @global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, <2 x i16> %data) { -; GFX9-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: -; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_store_byte_d16_hi v[2:3], v1, off offset:-128 -; GFX9-NEXT: s_endpgm -; -; GFX10-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: -; GFX10: ; %bb.0: -; GFX10-NEXT: v_add_co_u32_e64 v2, s0, s2, v0 -; GFX10-NEXT: ; implicit-def: $vcc_hi -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, s3, 0, s0 -; GFX10-NEXT: global_store_byte_d16_hi v[2:3], v1, off offset:-128 -; GFX10-NEXT: s_endpgm +; GCN-LABEL: global_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128: +; GCN: ; %bb.0: +; GCN-NEXT: global_store_byte_d16_hi v0, v1, s[2:3] offset:-128 +; GCN-NEXT: s_endpgm %zext.offset = zext i32 %voffset to i64 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128 Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -513,20 +513,14 @@ ; GFX9-LABEL: v_insertelement_v2i16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_movk_i32 s0, 0x3e7 -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_movk_i32 s2, 0x3e7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, v1, s0, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_bfi_b32 v1, v2, s2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0: @@ -579,20 +573,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff0000 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff0000 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_lshrrev_b32_e64 v1, 16, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: v_lshrrev_b32_e64 v2, 16, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_or_b32 v0, v0, v4, v1 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_and_or_b32 v1, v1, v3, v2 +; GFX9-NEXT: global_store_dword v0, 
v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_reghi: @@ -650,19 +638,13 @@ ; GFX9-LABEL: v_insertelement_v2i16_0_inlineimm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, v1, 53, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_bfi_b32 v1, v2, 53, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_0_inlineimm: @@ -715,20 +697,14 @@ ; GFX9-LABEL: v_insertelement_v2i16_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_movk_i32 s0, 0x3e7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_movk_i32 s2, 0x3e7 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_1: @@ -780,19 +756,13 @@ ; GFX9-LABEL: v_insertelement_v2i16_1_inlineimm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, -15, 16, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, -15, 16, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_1_inlineimm: @@ -844,20 +814,14 @@ ; GFX9-LABEL: v_insertelement_v2f16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x4500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; 
GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX9-NEXT: v_mov_b32_e32 v3, 0x4500 -; GFX9-NEXT: v_lshl_or_b32 v2, v2, 16, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_0: @@ -909,19 +873,13 @@ ; GFX9-LABEL: v_insertelement_v2f16_0_inlineimm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 53 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v1, 16, 53 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_0_inlineimm: @@ -973,20 +931,14 @@ ; GFX9-LABEL: v_insertelement_v2f16_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_movk_i32 s0, 0x4500 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_movk_i32 s2, 0x4500 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, s0, 16, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, s2, 16, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_1: @@ -1038,19 +990,13 @@ ; GFX9-LABEL: v_insertelement_v2f16_1_inlineimm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX9-NEXT: v_lshl_or_b32 v0, 35, 16, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, 35, 16, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; 
VI-LABEL: v_insertelement_v2f16_1_inlineimm: @@ -1165,21 +1111,15 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0x3e703e7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_lshl_b32 s0, s4, 4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_lshl_b32 s0, 0xffff, s0 -; GFX9-NEXT: v_mov_b32_e32 v1, 0x3e703e7 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_lshl_b32 s2, s4, 4 +; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, s0, v1, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_bfi_b32 v1, s2, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2i16_dynamic_sgpr: @@ -1238,27 +1178,18 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: global_load_dword v3, v[0:1], off +; GFX9-NEXT: global_load_dword v1, v0, s[4:5] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_mov_b32 s0, 0x12341234 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 -; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s2 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 +; GFX9-NEXT: v_lshlrev_b32_e64 v1, v1, s2 +; GFX9-NEXT: s_mov_b32 s2, 0x12341234 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v2, v2, s0, v3 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_bfi_b32 v1, v1, s2, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v2f16_dynamic_vgpr: @@ -1331,18 +1262,12 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v0, v4, s4, v0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_bfi_b32 v0, v3, s4, v0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_0: @@ -1403,17 
+1328,11 @@ ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: v_lshl_or_b32 v0, s4, 16, v0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_1: @@ -1473,18 +1392,12 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x30 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v4, s4, v1 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_2: @@ -1545,17 +1458,11 @@ ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX9-NEXT: v_lshl_or_b32 v1, s4, 16, v1 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_3: @@ -1615,18 +1522,12 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v4, s4, v1 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_bfi_b32 v1, v3, s4, v1 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4i16_2: @@ -1684,28 +1585,22 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* 
%out, <4 x i16> addrspace(1)* %in, i32 %val) #0 { ; GFX9-LABEL: v_insertelement_v4i16_dynamic_vgpr: ; GFX9: ; %bb.0: +; GFX9-NEXT: global_load_dword v2, v[0:1], off ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s4, s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v2, v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: s_pack_ll_b32_b16 s1, s4, s4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], v2, s[2:3] +; GFX9-NEXT: s_pack_ll_b32_b16 s2, s4, s4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, v3, s1, v1 -; GFX9-NEXT: v_bfi_b32 v0, v2, s1, v0 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_bfi_b32 v1, v3, s2, v1 +; GFX9-NEXT: v_bfi_b32 v0, v2, s2, v0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4i16_dynamic_vgpr: @@ -1783,24 +1678,18 @@ ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_pack_ll_b32_b16 s4, s4, s4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: s_mov_b32 s2, 0xffff -; GFX9-NEXT: s_lshl_b32 s1, s5, 4 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], s1 +; GFX9-NEXT: s_lshl_b32 s5, s5, 4 +; GFX9-NEXT: s_lshl_b64 s[2:3], s[2:3], s5 +; GFX9-NEXT: v_mov_b32_e32 v3, s4 ; GFX9-NEXT: v_mov_b32_e32 v4, s4 -; GFX9-NEXT: v_mov_b32_e32 v5, s4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfi_b32 v1, s1, v4, v1 -; GFX9-NEXT: v_bfi_b32 v0, s0, v5, v0 -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: v_bfi_b32 v1, s3, v3, v1 +; GFX9-NEXT: v_bfi_b32 v0, s2, v4, v0 +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_insertelement_v4f16_dynamic_sgpr: Index: llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll +++ llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.subtest-saddr.ll @@ -15,7 +15,7 @@ ; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]] ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]] -; GCN: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]] +; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]] define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll 
=================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -137,22 +137,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1 -; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -203,18 +194,12 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_reg_imm: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, 1.0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, 1.0 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -263,18 +248,12 @@ ; GFX9-LABEL: v_cvt_pkrtz_v2f16_f32_imm_reg: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, 1.0, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, 1.0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -332,22 +311,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; 
GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, v1 -; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -408,22 +378,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, -v1 -; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, v1, -v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -484,22 +445,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -v0, -v1 -; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -v1, -v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -561,22 +513,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; 
GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v0, -|v0|, -v1 -; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: v_cvt_pkrtz_f16_f32 v1, -|v1|, -v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll @@ -51,45 +51,39 @@ ; VARIANT2: ; %bb.0: ; %entry ; VARIANT2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VARIANT2-NEXT: s_load_dword s0, s[0:1], 0x2c -; VARIANT2-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VARIANT2-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT2-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT2-NEXT: v_mov_b32_e32 v2, s3 -; VARIANT2-NEXT: v_xad_u32 v3, v0, -1, s0 -; VARIANT2-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; VARIANT2-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 -; VARIANT2-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] -; VARIANT2-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; VARIANT2-NEXT: global_store_dword v[1:2], v0, off -; VARIANT2-NEXT: v_mov_b32_e32 v0, s3 -; VARIANT2-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3 -; VARIANT2-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc +; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] +; VARIANT2-NEXT: v_xad_u32 v0, v0, -1, s0 +; VARIANT2-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT2-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VARIANT2-NEXT: v_mov_b32_e32 v3, s3 +; VARIANT2-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; VARIANT2-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; VARIANT2-NEXT: s_waitcnt vmcnt(0) ; VARIANT2-NEXT: s_barrier -; VARIANT2-NEXT: global_load_dword v0, v[3:4], off +; VARIANT2-NEXT: global_load_dword v0, v[0:1], off ; VARIANT2-NEXT: s_waitcnt vmcnt(0) -; VARIANT2-NEXT: global_store_dword v[1:2], v0, off +; VARIANT2-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT2-NEXT: s_endpgm ; ; VARIANT3-LABEL: test_barrier: ; VARIANT3: ; %bb.0: ; %entry ; VARIANT3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VARIANT3-NEXT: s_load_dword s0, s[0:1], 0x2c -; VARIANT3-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; VARIANT3-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; VARIANT3-NEXT: s_waitcnt lgkmcnt(0) -; VARIANT3-NEXT: v_mov_b32_e32 v2, s3 -; VARIANT3-NEXT: v_xad_u32 v3, v0, -1, s0 -; VARIANT3-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; VARIANT3-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 -; VARIANT3-NEXT: v_lshlrev_b64 v[3:4], 2, v[3:4] -; VARIANT3-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; VARIANT3-NEXT: global_store_dword v[1:2], v0, off -; VARIANT3-NEXT: v_mov_b32_e32 v0, s3 -; VARIANT3-NEXT: v_add_co_u32_e32 v3, vcc, s2, v3 -; VARIANT3-NEXT: v_addc_co_u32_e32 v4, vcc, v0, v4, vcc +; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] +; VARIANT3-NEXT: v_xad_u32 v0, v0, -1, s0 +; VARIANT3-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; VARIANT3-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; VARIANT3-NEXT: v_mov_b32_e32 v3, s3 +; VARIANT3-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 +; VARIANT3-NEXT: v_addc_co_u32_e32 v1, 
vcc, v3, v1, vcc ; VARIANT3-NEXT: s_barrier -; VARIANT3-NEXT: global_load_dword v0, v[3:4], off +; VARIANT3-NEXT: global_load_dword v0, v[0:1], off ; VARIANT3-NEXT: s_waitcnt vmcnt(0) -; VARIANT3-NEXT: global_store_dword v[1:2], v0, off +; VARIANT3-NEXT: global_store_dword v2, v0, s[2:3] ; VARIANT3-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/lshr.v2i16.ll @@ -69,19 +69,13 @@ ; GFX9-LABEL: v_lshr_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v4 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v2i16: @@ -148,18 +142,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v0, s0, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_lshrrev_b16 v1, s0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_s_v2i16: @@ -223,18 +211,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_s_v_v2i16: @@ -297,18 +279,12 @@ ; GFX9-LABEL: lshr_imm_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: 
v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, 8 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, 8 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_imm_v_v2i16: @@ -365,18 +341,12 @@ ; GFX9-LABEL: lshr_v_imm_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_imm_v2i16: @@ -430,18 +400,12 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshrrev_b16 v1, v1, v3 -; GFX9-NEXT: v_pk_lshrrev_b16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_lshrrev_b16 v1, v3, v1 +; GFX9-NEXT: v_pk_lshrrev_b16 v0, v2, v0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_lshr_v4i16: @@ -520,17 +484,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: lshr_v_imm_v4i16: Index: llvm/test/CodeGen/AMDGPU/madak.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/madak.ll +++ 
llvm/test/CodeGen/AMDGPU/madak.ll @@ -44,9 +44,9 @@ ; GFX6-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}} ; GFX6-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4 ; GFX6-DAG: buffer_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8 -; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} -; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} -; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}} +; GFX8_9_10: {{flat|global}}_load_dword [[VA:v[0-9]+]], +; GFX8_9_10: {{flat|global}}_load_dword [[VB:v[0-9]+]], +; GFX8_9_10: {{flat|global}}_load_dword [[VC:v[0-9]+]], ; GFX6-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX8-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000 ; GFX6_8_9-DAG: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 Index: llvm/test/CodeGen/AMDGPU/max.i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/max.i16.ll +++ llvm/test/CodeGen/AMDGPU/max.i16.ll @@ -30,22 +30,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v0, v0, v1 -; GFX9-NEXT: global_store_short v[4:5], v0, off +; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -89,22 +80,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v0, v0, v1 -; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: v_pk_max_i16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* 
%aptr, i32 %tid @@ -159,29 +141,20 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 -; GFX9-NEXT: v_mov_b32_e32 v6, 0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_short_d16 v6, v[0:1], off offset:4 -; GFX9-NEXT: global_load_dword v7, v[0:1], off -; GFX9-NEXT: global_load_short_d16 v4, v[2:3], off offset:4 -; GFX9-NEXT: global_load_dword v2, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v5 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_pk_max_i16 v3, v6, v4 +; GFX9-NEXT: global_load_short_d16 v2, v0, s[6:7] offset:4 +; GFX9-NEXT: global_load_short_d16 v1, v0, s[0:1] offset:4 +; GFX9-NEXT: global_load_dword v3, v0, s[6:7] +; GFX9-NEXT: global_load_dword v4, v0, s[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(2) +; GFX9-NEXT: v_pk_max_i16 v1, v2, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_i16 v2, v7, v2 -; GFX9-NEXT: global_store_short v[0:1], v3, off offset:4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: v_pk_max_i16 v3, v3, v4 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] offset:4 +; GFX9-NEXT: global_store_dword v0, v3, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid @@ -230,21 +203,12 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_max_i16 v1, v1, v3 ; GFX9-NEXT: v_pk_max_i16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid @@ -286,22 +250,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort 
v0, v[0:1], off -; GFX9-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_i16_e32 v0, v0, v1 -; GFX9-NEXT: global_store_short v[4:5], v0, off +; GFX9-NEXT: v_max_i16_e32 v1, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -343,22 +298,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 -; GFX9-NEXT: global_store_short v[4:5], v0, off +; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -400,22 +346,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 1, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: global_load_ushort v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_ushort v1, v0, s[6:7] +; GFX9-NEXT: global_load_ushort v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_max_u16_e32 v0, v0, v1 -; GFX9-NEXT: global_store_short v[4:5], v0, off +; GFX9-NEXT: v_max_u16_e32 v1, v1, v2 +; GFX9-NEXT: global_store_short v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid @@ -458,22 +395,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v4 -; GFX9-NEXT: 
v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_max_u16 v0, v0, v1 -; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: v_pk_max_u16 v1, v1, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-load.ll @@ -494,8 +494,8 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} -; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off glc slc{{$}} -; GFX10: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], off slc{{$}} +; GFX9: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} +; GFX10: global_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}} ; GFX10: .amdhsa_kernel nontemporal_global_1 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 ; GFX10CU: .amdhsa_workgroup_processor_mode 0 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-store.ll @@ -363,8 +363,8 @@ ; GCN-LABEL: {{^}}nontemporal_global_1: ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}} -; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}} -; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off slc{{$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}} +; GFX10: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}} ; GFX10: .amdhsa_kernel nontemporal_global_1 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0 ; GFX10CU: .amdhsa_workgroup_processor_mode 0 Index: llvm/test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -6,26 +6,21 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GCN-NEXT: v_lshlrev_b32_e32 v18, 4, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 4, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s3 -; GCN-NEXT: v_add_co_u32_e32 v16, vcc, s2, v18 -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v0, vcc -; GCN-NEXT: global_load_dwordx4 v[0:3], v[16:17], off -; GCN-NEXT: global_load_dwordx4 v[4:7], v[16:17], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[16:17], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[16:17], off offset:48 -; GCN-NEXT: v_mov_b32_e32 v17, s5 -; GCN-NEXT: v_add_co_u32_e32 v16, vcc, s4, v18 -; GCN-NEXT: v_addc_co_u32_e32 v17, vcc, 0, v17, vcc +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[2:3] +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:16 +; GCN-NEXT: 
global_load_dwordx4 v[8:11], v16, s[2:3] offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:48 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v[16:17], v[0:3], off +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v[16:17], v[4:7], off offset:16 +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[4:5] offset:16 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v[16:17], v[8:11], off offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[4:5] offset:32 ; GCN-NEXT: s_waitcnt vmcnt(3) -; GCN-NEXT: global_store_dwordx4 v[16:17], v[12:15], off offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[4:5] offset:48 ; GCN-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -195,10 +190,7 @@ ; GCN-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34 ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, s2, v0 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GCN-NEXT: global_load_dwordx2 v[8:9], v[0:1], off +; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] ; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off Index: llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll +++ llvm/test/CodeGen/AMDGPU/sdwa-op64-test.ll @@ -6,7 +6,7 @@ ; GFX9: v_addc_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} ; FIJI: v_add_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; FIJI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} -define amdgpu_kernel void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 { +define void @test_add_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp @@ -26,7 +26,7 @@ ; GFX9: v_subbrev_co_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} ; FIJI: v_sub_u32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; FIJI: v_subbrev_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}} -define amdgpu_kernel void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 { +define void @test_sub_co_sdwa(i64 addrspace(1)* %arg, i32 addrspace(1)* %arg1) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i32 %tmp Index: llvm/test/CodeGen/AMDGPU/sext-in-reg.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sext-in-reg.ll +++ llvm/test/CodeGen/AMDGPU/sext-in-reg.ll @@ -159,7 +159,7 @@ ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GFX89: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -186,7 +186,7 @@ ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; SI: 
buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GFX89: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -213,7 +213,7 @@ ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]] ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}} -; GFX89: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}} +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[HI]]{{\]}} define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -237,7 +237,7 @@ ; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, ; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]] -; GFX89: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}} +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[LO]]:[[SHR]]{{\]}} define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -471,7 +471,7 @@ ; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]] ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -; GFX89: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid @@ -501,7 +501,7 @@ ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[SHR]] ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} -; GFX89: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} +; GFX89: {{flat|global}}_store_dwordx2 v{{.+}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid Index: llvm/test/CodeGen/AMDGPU/shl.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/shl.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/shl.v2i16.ll @@ -68,19 +68,13 @@ ; GFX9-LABEL: v_shl_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: global_load_dword v0, v[0:1], off offset:4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 
v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] offset:4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v4 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v2, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v2i16: @@ -147,18 +141,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v0, s0, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_lshlrev_b16 v1, s0, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_s_v2i16: @@ -222,18 +210,12 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s5 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s4, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, s0 +; GFX9-NEXT: global_store_dword v0, v1, s[4:5] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_s_v_v2i16: @@ -296,18 +278,12 @@ ; GFX9-LABEL: shl_imm_v_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, 8 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, 8 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_imm_v_v2i16: @@ -365,18 +341,12 @@ ; GFX9-LABEL: shl_v_imm_v2i16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: 
v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_imm_v2i16: @@ -431,18 +401,12 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:8 -; GFX9-NEXT: v_mov_b32_e32 v5, s1 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3] offset:8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_lshlrev_b16 v1, v1, v3 -; GFX9-NEXT: v_pk_lshlrev_b16 v0, v0, v2 -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: v_pk_lshlrev_b16 v1, v3, v1 +; GFX9-NEXT: v_pk_lshlrev_b16 v0, v2, v0 +; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: v_shl_v4i16: @@ -521,17 +485,11 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1] ; GFX9-NEXT: v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1] -; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] ; GFX9-NEXT: s_endpgm ; ; VI-LABEL: shl_v_imm_v4i16: Index: llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll +++ llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll @@ -44,34 +44,24 @@ ; GFX9-LABEL: v_test_i32_x_sub_64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; 
implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -128,43 +118,33 @@ ; GFX9-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v3, v[0:1], off -; GFX9-NEXT: global_load_dword v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v3 +; GFX9-NEXT: v_subrev_u32_e32 v1, 64, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u32_e32 v3, 64, v4 -; GFX9-NEXT: global_store_dword v[0:1], v2, off -; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: v_subrev_u32_e32 v2, 64, v2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: global_store_dword v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_64_multi_use: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: global_load_dword v4, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] +; GFX10-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v3 +; GFX10-NEXT: v_subrev_nc_u32_e32 v1, 64, v1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_subrev_nc_u32_e32 v3, 64, v4 -; GFX10-NEXT: global_store_dword v[0:1], v2, off -; GFX10-NEXT: global_store_dword v[0:1], v3, off +; GFX10-NEXT: v_subrev_nc_u32_e32 v2, 64, v2 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] +; GFX10-NEXT: global_store_dword v0, v2, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -216,34 +196,24 @@ ; GFX9-LABEL: v_test_i32_64_sub_x: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: 
v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_sub_u32_e32 v1, 64, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_64_sub_x: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 64, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 64, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -292,34 +262,24 @@ ; GFX9-LABEL: v_test_i32_x_sub_65: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 0xffffffbf, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_add_u32_e32 v1, 0xffffffbf, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_65: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v2, 0xffffffbf, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 0xffffffbf, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -368,34 +328,24 @@ ; GFX9-LABEL: v_test_i32_65_sub_x: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; 
GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 0x41, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_sub_u32_e32 v1, 0x41, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_65_sub_x: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x41, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x41, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -444,34 +394,24 @@ ; GFX9-LABEL: v_test_i32_x_sub_neg16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 16, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_add_u32_e32 v1, 16, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_neg16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v2, 16, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 16, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -520,34 +460,24 @@ ; GFX9-LABEL: v_test_i32_neg16_sub_x: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 
-16, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_sub_u32_e32 v1, -16, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_neg16_sub_x: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e32 v2, -16, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u32_e32 v1, -16, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -596,34 +526,24 @@ ; GFX9-LABEL: v_test_i32_x_sub_neg17: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u32_e32 v0, 17, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_add_u32_e32 v1, 17, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_x_sub_neg17: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_add_nc_u32_e32 v2, 17, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_add_nc_u32_e32 v1, 17, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -672,34 +592,24 @@ ; GFX9-LABEL: v_test_i32_neg17_sub_x: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_sub_u32_e32 v0, 0xffffffef, v0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_sub_u32_e32 v1, 0xffffffef, v1 
+; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i32_neg17_sub_x: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0xffffffef, v3 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0xffffffef, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -794,34 +704,24 @@ ; GFX9-LABEL: v_test_i16_x_sub_64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0 -; GFX9-NEXT: global_store_short v[2:3], v0, off +; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_ushort v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v3, 64 -; GFX10-NEXT: global_store_short v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64 +; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -876,16 +776,10 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s2, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, s0, v0 -; GFX9-NEXT: global_load_ushort v0, v[1:2], off -; GFX9-NEXT: v_mov_b32_e32 v4, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GFX9-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v0, 64, v0 -; GFX9-NEXT: global_store_dword v[3:4], v0, off +; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64_zext_to_i32: @@ -895,15 +789,11 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v1, s2, s2, v1 -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v2, s2, s3, 0, s2 -; GFX10-NEXT: global_load_ushort v1, v[1:2], off +; GFX10-NEXT: global_load_ushort v1, v1, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v1, 64 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 -; GFX10-NEXT: v_and_b32_e32 v2, 0xffff, v2 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64 +; GFX10-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -961,43 +851,33 @@ ; GFX9-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_ushort v3, v[0:1], off -; GFX9-NEXT: global_load_ushort v4, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v3 +; GFX9-NEXT: v_subrev_u16_e32 v1, 64, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_subrev_u16_e32 v3, 64, v4 -; GFX9-NEXT: global_store_short v[0:1], v2, off -; GFX9-NEXT: global_store_short v[0:1], v3, off +; GFX9-NEXT: v_subrev_u16_e32 v2, 64, v2 +; GFX9-NEXT: global_store_short v0, v1, s[0:1] +; GFX9-NEXT: global_store_short v0, v2, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_i16_x_sub_64_multi_use: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 1, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_ushort v3, v[0:1], off -; GFX10-NEXT: global_load_ushort v4, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_ushort v1, v0, s[2:3] +; GFX10-NEXT: global_load_ushort v2, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_sub_nc_u16_e64 v2, v3, 64 +; GFX10-NEXT: v_sub_nc_u16_e64 v1, v1, 64 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_sub_nc_u16_e64 v3, v4, 64 -; GFX10-NEXT: global_store_short v[0:1], v2, off -; GFX10-NEXT: global_store_short v[0:1], v3, off +; GFX10-NEXT: v_sub_nc_u16_e64 v2, v2, 64 +; GFX10-NEXT: global_store_short v0, v1, s[0:1] +; GFX10-NEXT: global_store_short v0, v2, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1056,34 +936,24 @@ ; GFX9-LABEL: v_test_v2i16_x_sub_64_64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc 
-; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, 64 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1139,35 +1009,25 @@ ; GFX9-LABEL: v_test_v2i16_x_sub_7_64: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_mov_b32 s0, 0x400007 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0x400007 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 7 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1223,35 +1083,25 @@ ; GFX9-LABEL: v_test_v2i16_x_sub_64_123: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; 
GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_mov_b32 s0, 0x7b0040 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0x7b0040 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_64_123: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 64 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 64 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1306,34 +1156,24 @@ ; GFX9-LABEL: v_test_v2i16_x_sub_7_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, 7 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_i16 v1, v1, 7 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_7_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 7 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 7 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1385,34 +1225,24 @@ ; GFX9-LABEL: v_test_v2i16_x_sub_0_16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: 
v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1463,35 +1293,25 @@ ; GFX9-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_brev_b32 s0, 35 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_brev_b32 s2, 35 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_1_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 0xc400 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0xc400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1542,35 +1362,25 @@ ; GFX9-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; 
GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_brev_b32 s0, 34 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_brev_b32 s2, 34 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_i16 v1, v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_sub_0_neg1_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_i16 v2, v3, 0x4400 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_i16 v1, v1, 0x4400 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1627,34 +1437,24 @@ ; GFX9-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_neg32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1705,34 +1505,24 @@ ; GFX9-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1786,34 +1576,24 @@ ; GFX9-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1870,34 +1650,24 @@ ; GFX9-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 
0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_neg16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -1948,34 +1718,24 @@ ; GFX9-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_0_neg16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 16 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2029,34 +1789,24 @@ ; GFX9-LABEL: 
v_test_v2i16_x_add_neg16_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, 16 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 16 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg16_0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 16 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 16 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2113,35 +1863,25 @@ ; GFX9-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_mov_b32 s0, 0x3c003c00 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0x3c003c00 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 0x3c00 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0x3c00 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2198,35 +1938,25 @@ ; 
GFX9-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_mov_b32 s0, 0xbc00bc00 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0xbc00bc00 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 0xbc00 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0xbc00 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2283,35 +2013,25 @@ ; GFX9-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_mov_b32 s0, 0xc000c000 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0xc000c000 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_fptwo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 0xc000 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0xc000 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, 
s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2368,35 +2088,25 @@ ; GFX9-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: s_mov_b32 s0, 0x40004000 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] +; GFX9-NEXT: s_mov_b32 s2, 0x40004000 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, s0 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, s2 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg_negfptwo: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 0x4000 op_sel_hi:[1,0] -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 0x4000 op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2447,34 +2157,24 @@ ; GFX9-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_undef_neg32: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 op_sel:[0,1] op_sel_hi:[1,0] -; GFX10-NEXT: 
global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 op_sel:[0,1] op_sel_hi:[1,0] +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -2524,34 +2224,24 @@ ; GFX9-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s3 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s2, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_u16 v0, v0, 32 -; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: v_test_v2i16_x_add_neg32_undef: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: ; implicit-def: $vcc_hi ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_add_co_u32_e64 v0, s2, s2, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s3, 0, s2 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: v_add_co_u32_e64 v0, s0, s0, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: global_load_dword v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_pk_sub_u16 v2, v3, 32 -; GFX10-NEXT: global_store_dword v[0:1], v2, off +; GFX10-NEXT: v_pk_sub_u16 v1, v1, 32 +; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 Index: llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -257,14 +257,14 @@ ; CI: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:36{{$}} ; CI-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:52{{$}} -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:12 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:28 -; GFX9: global_load_dword {{v[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:44 - -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off{{$}} -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:20 -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:36 -; GFX9: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, off offset:52 +; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:12 +; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:28 +; GFX9: global_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:44 + +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]$}} +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:20 +; GFX9: global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:36 +; GFX9: 
global_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} offset:52 define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 { %id = call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -21,7 +21,7 @@ ; GCN: VGPRBlocks: 2 ; GCN: NumVGPRsForWavesPerEU: 10 define amdgpu_kernel void @max_10_vgprs(i32 addrspace(1)* %p) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid = load volatile i32, i32 addrspace(1)* undef %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4 %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8 @@ -73,7 +73,7 @@ ; GFX908: VGPRBlocks: 2 ; GFX908: NumVGPRsForWavesPerEU: 10 define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid = load volatile i32, i32 addrspace(1)* undef call void asm sideeffect "", "a,a,a,a,a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9) %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4 @@ -142,7 +142,7 @@ ; GCN: VGPRBlocks: 2 ; GCN: NumVGPRsForWavesPerEU: 10 define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 { - %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid = load volatile i32, i32 addrspace(1)* undef call void asm sideeffect "", "a"(i32 1) %p1 = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %tid %p2 = getelementptr inbounds i64, i64 addrspace(1)* %p1, i32 8 Index: llvm/test/CodeGen/AMDGPU/sub.v2i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -8,22 +8,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s9 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s8, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off ; GFX9-NEXT: s_mov_b32 s0, s4 +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[8:9] ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -169,10 +163,7 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s4, 0x1c8007b ; GFX9-NEXT: s_mov_b32 s1, s5 @@ -217,10 +208,7 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, 
-1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s4, 0xfc21fcb3 ; GFX9-NEXT: s_mov_b32 s1, s5 @@ -264,10 +252,7 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -310,10 +295,7 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s1, s5 ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -356,10 +338,7 @@ ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off +; GFX9-NEXT: global_load_dword v0, v0, s[6:7] ; GFX9-NEXT: s_mov_b32 s0, s4 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s1, s5 @@ -402,20 +381,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -460,24 +433,18 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v2, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 
-; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 -; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_pk_sub_i16 v2, v2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; @@ -522,20 +489,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v0, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v0, v1, v0 ; GFX9-NEXT: v_ashrrev_i32_e32 v1, 16, v0 ; GFX9-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -582,20 +543,14 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s7 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s6, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: global_load_dword v1, v[2:3], off +; GFX9-NEXT: global_load_dword v1, v0, s[6:7] +; GFX9-NEXT: global_load_dword v0, v0, s[0:1] ; GFX9-NEXT: s_mov_b32 s7, 0xf000 ; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_sub_i16 v1, v0, v1 +; GFX9-NEXT: v_pk_sub_i16 v1, v1, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v1 ; GFX9-NEXT: v_bfe_i32 v0, v1, 0, 16 ; GFX9-NEXT: v_bfe_i32 v2, v2, 0, 16 Index: llvm/test/CodeGen/AMDGPU/v_cndmask.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -49,8 +49,8 @@ ; (select (cmp (sgprX, constant)), constant, sgprZ) ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_sgprZ_f32: -; GCN: s_load_dwordx2 -; GCN: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}} +; GCN: s_load_dwordx2 s{{\[}}[[X:[0-9]+]]:[[Z:[0-9]+]]{{\]}}, s[0:1], {{0x4c|0x13}} + ; SIVI-DAG: v_cmp_nlg_f32_e64 [[CC:vcc]], s[[X]], 0 ; GFX10-DAG: v_cmp_nlg_f32_e64 [[CC:s\[[0-9:]+\]]], s[[X]], 0 ; SIVI-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], s[[Z]] Index: llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -733,26 +733,17 @@ ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 -; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v6, 3, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v5, s5 -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, s4, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[2:3], v[2:3], off -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[4:5], off +; GFX9-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] +; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3] +; GFX9-NEXT: global_load_dwordx2 v[4:5], v6, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_pk_fma_f16 v6, v0, v2, v6 op_sel_hi:[0,1,1] -; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v7 op_sel_hi:[0,1,1] -; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v6 op_sel:[1,0,0] +; GFX9-NEXT: v_pk_fma_f16 v4, v0, v2, v4 op_sel_hi:[0,1,1] +; GFX9-NEXT: v_pk_fma_f16 v2, v1, v2, v5 op_sel_hi:[0,1,1] +; GFX9-NEXT: v_pk_fma_f16 v0, v0, v3, v4 op_sel:[1,0,0] ; GFX9-NEXT: v_pk_fma_f16 v1, v1, v3, v2 op_sel:[1,0,0] -; GFX9-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX9-NEXT: global_store_dwordx2 v6, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm entry: %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x()