diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -33,7 +33,6 @@ LegacyDivergenceAnalysis *DA; MemorySSA *MSSA; AliasAnalysis *AA; - DenseMap noClobberClones; bool isEntryFunc; public: @@ -160,44 +159,17 @@ Value *Ptr = I.getPointerOperand(); if (!DA->isUniform(Ptr)) return; + Instruction *PtrI = dyn_cast(Ptr); + if (PtrI) + setUniformMetadata(PtrI); + // We're tracking up to the Function boundaries, and cannot go beyond because // of FunctionPass restrictions. We can ensure that is memory not clobbered // for memory operations that are live in to entry points only. - Instruction *PtrI = dyn_cast(Ptr); - - if (!isEntryFunc) { - if (PtrI) - setUniformMetadata(PtrI); + if (!isEntryFunc) return; - } - - bool NotClobbered = false; bool GlobalLoad = I.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; - if (PtrI) - NotClobbered = GlobalLoad && !isClobberedInFunction(&I); - else if (isa(Ptr) || isa(Ptr)) { - if (GlobalLoad && !isClobberedInFunction(&I)) { - NotClobbered = true; - // Lookup for the existing GEP - if (noClobberClones.count(Ptr)) { - PtrI = noClobberClones[Ptr]; - } else { - // Create GEP of the Value - Function *F = I.getParent()->getParent(); - Value *Idx = Constant::getIntegerValue( - Type::getInt32Ty(Ptr->getContext()), APInt(64, 0)); - // Insert GEP at the entry to make it dominate all uses - PtrI = GetElementPtrInst::Create(I.getType(), Ptr, - ArrayRef(Idx), Twine(""), - F->getEntryBlock().getFirstNonPHI()); - } - I.replaceUsesOfWith(Ptr, PtrI); - } - } - - if (PtrI) - setUniformMetadata(PtrI); - + bool NotClobbered = GlobalLoad && !isClobberedInFunction(&I); if (NotClobbered) setNoClobberMetadata(&I); } @@ -216,7 +188,6 @@ isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv()); visit(F); - noClobberClones.clear(); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll @@ -1677,8 +1677,7 @@ ; HSA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 ; HSA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) ; HSA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) - ; HSA-VI-NEXT: [[COPY1:%[0-9]+]]:_(p1) = COPY [[ADDRSPACE_CAST]](p1) - ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.1, addrspace 1) + ; HSA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.in.byref, addrspace 1) ; HSA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; HSA-VI-NEXT: S_ENDPGM 0 ; LEGACY-MESA-VI-LABEL: name: byref_global_i32_arg @@ -1692,8 +1691,7 @@ ; LEGACY-MESA-VI-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 44 ; LEGACY-MESA-VI-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) ; LEGACY-MESA-VI-NEXT: [[ADDRSPACE_CAST:%[0-9]+]]:_(p1) = G_ADDRSPACE_CAST [[PTR_ADD1]](p4) - ; LEGACY-MESA-VI-NEXT: [[COPY1:%[0-9]+]]:_(p1) = COPY [[ADDRSPACE_CAST]](p1) - ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[COPY1]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.1, addrspace 1) + ; LEGACY-MESA-VI-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[ADDRSPACE_CAST]](p1) :: (dereferenceable "amdgpu-noclobber" load (s32) from %ir.in.byref, addrspace 1) ; LEGACY-MESA-VI-NEXT: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store (s32) into %ir.out, addrspace 1) ; LEGACY-MESA-VI-NEXT: S_ENDPGM 0 %in = load i32, i32 addrspace(1)* %in.byref diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -386,7 +386,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile load (s1) from `i1 addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s1) from `i1 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i1_signext ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -444,7 +444,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile load (s1) from `i1 addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s1) from `i1 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i1_zeroext ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -559,7 +559,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile load (s8) from `i8 addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s8) from `i8 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i8_signext ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -618,7 +618,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile load (s8) from `i8 addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s8) from `i8 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i8_zeroext ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -732,7 +732,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile load (s16) from `i16 addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s16) from `i16 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16_signext ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -790,7 +790,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile load (s16) from `i16 addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s16) from `i16 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i16_zeroext ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -1000,7 +1000,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: (load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i64 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -1121,7 +1121,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -1181,7 +1181,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48_signext ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -1241,7 +1241,7 @@ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s48) = G_LOAD [[DEF]](p1) :: (volatile "amdgpu-noclobber" load (s48) from `i48 addrspace(1)* undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_i48_zeroext ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -1357,7 +1357,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[C:%[0-9]+]]:_(p1) = G_CONSTANT i64 0 - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p0>) = G_LOAD [[C]](p1) :: (load (<2 x p0>) from `<2 x i8*> addrspace(1)* null`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x p0>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x p0>) from `<2 x i8*> addrspace(1)* null`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2p0 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -1420,7 +1420,7 @@ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C1]](s64), [[DEF]](s64) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: (load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1) ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<3 x s64>) = G_SHUFFLE_VECTOR [[LOAD]](<2 x s64>), [[BUILD_VECTOR]], shufflemask(0, 1, 2) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i64 @@ -1488,7 +1488,7 @@ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8589934593 ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 17179869187 ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[C1]](s64), [[C2]](s64) - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: (load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s64>) = G_LOAD [[C]](p1) :: ("amdgpu-noclobber" load (<2 x s64>) from `<2 x i64> addrspace(1)* null`, addrspace 1) ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s64>) = G_SHUFFLE_VECTOR [[LOAD]](<2 x s64>), [[BUILD_VECTOR]], shufflemask(0, 1, 2, 3) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i64 @@ -2024,7 +2024,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: (load (<2 x s16>) from `<2 x i16> addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s16>) from `<2 x i16> addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i16 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -2080,7 +2080,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: (load (<3 x s16>) from `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<3 x s16>) from `<3 x i16> addrspace(1)* undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3i16 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -2141,7 +2141,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: (load (<3 x s16>) from `<3 x half> addrspace(1)* undef`, align 8, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<3 x s16>) from `<3 x half> addrspace(1)* undef`, align 8, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v3f16 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -2202,7 +2202,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[DEF]](p1) :: (load (<4 x s16>) from `<4 x i16> addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<4 x s16>) from `<4 x i16> addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i16 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -2320,7 +2320,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<5 x s16>) = G_LOAD [[DEF]](p1) :: (load (<5 x s16>) from `<5 x i16> addrspace(1)* undef`, align 16, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<5 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<5 x s16>) from `<5 x i16> addrspace(1)* undef`, align 16, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v5i16 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -2382,7 +2382,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<7 x s16>) = G_LOAD [[DEF]](p1) :: (load (<7 x s16>) from `<7 x i16> addrspace(1)* undef`, align 16, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<7 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<7 x s16>) from `<7 x i16> addrspace(1)* undef`, align 16, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v7i16 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -2445,7 +2445,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<63 x s16>) = G_LOAD [[DEF]](p1) :: (load (<63 x s16>) from `<63 x i16> addrspace(1)* undef`, align 128, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<63 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<63 x s16>) from `<63 x i16> addrspace(1)* undef`, align 128, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v63i16 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -2539,7 +2539,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<65 x s16>) = G_LOAD [[DEF]](p1) :: (load (<65 x s16>) from `<65 x i16> addrspace(1)* undef`, align 256, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<65 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<65 x s16>) from `<65 x i16> addrspace(1)* undef`, align 256, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v65i16 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -2636,7 +2636,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<66 x s16>) = G_LOAD [[DEF]](p1) :: (load (<66 x s16>) from `<66 x i16> addrspace(1)* undef`, align 256, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<66 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<66 x s16>) from `<66 x i16> addrspace(1)* undef`, align 256, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v66i16 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -2730,7 +2730,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: (load (<2 x s16>) from `<2 x half> addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s16>) from `<2 x half> addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2f16 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -2786,7 +2786,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[DEF]](p1) :: (load (<2 x s32>) from `<2 x i32> addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<2 x s32>) from `<2 x i32> addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v2i32 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -3026,7 +3026,7 @@ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 ; CHECK-NEXT: [[COPY9:%[0-9]+]]:_(p4) = COPY $sgpr8_sgpr9 ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[DEF]](p1) :: (load (<4 x s32>) from `<4 x i32> addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[DEF]](p1) :: ("amdgpu-noclobber" load (<4 x s32>) from `<4 x i32> addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v4i32 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -3519,7 +3519,7 @@ ; CHECK-NEXT: [[INT:%[0-9]+]]:_(p4) = G_INTRINSIC intrinsic(@llvm.amdgcn.kernarg.segment.ptr) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `<32 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1) - ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: (load (s32) from `i32 addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s32) from `i32 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_i32 ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -3619,8 +3619,8 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p1) = COPY [[DEF1]](p1) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `<32 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1) - ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[DEF1]](p1) :: (load (s8) from `i8 addrspace(1)* undef`, addrspace 1) - ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[COPY10]](p1) :: (load (s16) from `i16 addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s8) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (s8) from `i8 addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(s16) = G_LOAD [[COPY10]](p1) :: ("amdgpu-noclobber" load (s16) from `i16 addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_i8_i8_i16 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] @@ -3730,8 +3730,8 @@ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p1) = COPY [[DEF1]](p1) ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[DEF]](p4) :: (load (p1) from `<32 x i32> addrspace(1)* addrspace(4)* undef`, addrspace 4) ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(<32 x s32>) = G_LOAD [[LOAD]](p1) :: ("amdgpu-noclobber" load (<32 x s32>) from %ir.ptr0, addrspace 1) - ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: (load (p3) from `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1) - ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[COPY10]](p1) :: (load (p5) from `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(p3) = G_LOAD [[DEF1]](p1) :: ("amdgpu-noclobber" load (p3) from `i8 addrspace(3)* addrspace(1)* undef`, addrspace 1) + ; CHECK-NEXT: [[LOAD3:%[0-9]+]]:_(p5) = G_LOAD [[COPY10]](p1) :: ("amdgpu-noclobber" load (p5) from `i8 addrspace(5)* addrspace(1)* undef`, addrspace 1) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_v32i32_p3_p5 ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY8]] diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll @@ -2345,142 +2345,82 @@ define amdgpu_kernel void @cvt_f32_ubyte0_vector() local_unnamed_addr { ; SI-LABEL: cvt_f32_ubyte0_vector: ; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_mov_b32 s6, 0 -; SI-NEXT: s_mov_b32 s7, s3 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], 0 -; SI-NEXT: s_waitcnt vmcnt(1) -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 offset:3 -; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[4:7], 0 addr64 offset:2 -; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[4:7], 0 addr64 offset:1 -; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3 +; SI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2 +; SI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1 +; SI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 +; SI-NEXT: s_load_dword s0, s[0:1], 0x0 ; SI-NEXT: s_waitcnt vmcnt(3) -; SI-NEXT: v_cvt_f32_ubyte0_e32 v1, v3 -; SI-NEXT: v_fma_f32 v1, v2, v1, 0.5 -; SI-NEXT: v_cvt_u32_f32_e32 v1, v1 +; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_fma_f32 v0, s0, v0, 0.5 +; SI-NEXT: v_cvt_u32_f32_e32 v0, v0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_byte v4, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(2) -; SI-NEXT: buffer_store_byte v5, off, s[0:3], 0 +; SI-NEXT: buffer_store_byte v2, off, s[0:3], 0 ; SI-NEXT: s_waitcnt vmcnt(2) +; SI-NEXT: buffer_store_byte v3, off, s[0:3], 0 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; SI-NEXT: buffer_store_byte v1, off, s[0:3], 0 ; SI-NEXT: .LBB40_1: ; %for.body.i ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_branch .LBB40_1 ; ; VI-LABEL: cvt_f32_ubyte0_vector: ; VI: ; %bb.0: ; %entry +; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; VI-NEXT: s_mov_b32 s3, 0xf000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; VI-NEXT: v_mov_b32_e32 v2, -1 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_mov_b64 s[4:5], exec -; VI-NEXT: .LBB40_1: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: v_readfirstlane_b32 s9, v1 -; VI-NEXT: v_readfirstlane_b32 s10, v2 -; VI-NEXT: v_readfirstlane_b32 s11, v3 -; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; VI-NEXT: s_nop 0 -; VI-NEXT: buffer_load_ubyte v4, off, s[8:11], 0 -; VI-NEXT: s_xor_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB40_1 -; VI-NEXT: ; %bb.2: -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_mov_b64 s[4:5], exec -; VI-NEXT: .LBB40_3: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: v_readfirstlane_b32 s9, v1 -; VI-NEXT: v_readfirstlane_b32 s10, v2 -; VI-NEXT: v_readfirstlane_b32 s11, v3 -; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; VI-NEXT: s_nop 0 -; VI-NEXT: buffer_load_ubyte v5, off, s[8:11], 0 offset:1 -; VI-NEXT: s_xor_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB40_3 -; VI-NEXT: ; %bb.4: -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_mov_b64 s[4:5], exec -; VI-NEXT: .LBB40_5: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: v_readfirstlane_b32 s9, v1 -; VI-NEXT: v_readfirstlane_b32 s10, v2 -; VI-NEXT: v_readfirstlane_b32 s11, v3 -; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; VI-NEXT: s_nop 0 -; VI-NEXT: buffer_load_ubyte v6, off, s[8:11], 0 offset:2 -; VI-NEXT: s_xor_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB40_5 -; VI-NEXT: ; %bb.6: -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: s_mov_b64 s[4:5], exec -; VI-NEXT: .LBB40_7: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: v_readfirstlane_b32 s8, v0 -; VI-NEXT: v_readfirstlane_b32 s9, v1 -; VI-NEXT: v_readfirstlane_b32 s10, v2 -; VI-NEXT: v_readfirstlane_b32 s11, v3 -; VI-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1] -; VI-NEXT: v_cmp_eq_u64_e64 s[0:1], s[10:11], v[2:3] -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: s_and_saveexec_b64 s[0:1], s[0:1] -; VI-NEXT: s_nop 0 -; VI-NEXT: buffer_load_ubyte v7, off, s[8:11], 0 offset:3 -; VI-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3 -; VI-NEXT: s_xor_b64 exec, exec, s[0:1] -; VI-NEXT: s_cbranch_execnz .LBB40_7 -; VI-NEXT: ; %bb.8: -; VI-NEXT: s_mov_b64 exec, s[4:5] -; VI-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cvt_f32_ubyte0_e32 v1, v7 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mul_f32_e32 v0, v0, v1 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:3 +; VI-NEXT: buffer_load_ubyte v1, off, s[0:3], 0 offset:2 +; VI-NEXT: buffer_load_ubyte v2, off, s[0:3], 0 offset:1 +; VI-NEXT: buffer_load_ubyte v3, off, s[0:3], 0 +; VI-NEXT: s_load_dword s0, s[0:1], 0x0 +; VI-NEXT: s_waitcnt vmcnt(3) +; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0 +; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mul_f32_e32 v0, s0, v0 ; VI-NEXT: v_add_f32_e32 v0, 0.5, v0 ; VI-NEXT: v_cvt_i32_f32_e32 v0, v0 -; VI-NEXT: buffer_store_byte v6, off, s[0:3], 0 -; VI-NEXT: buffer_store_byte v5, off, s[0:3], 0 -; VI-NEXT: buffer_store_byte v4, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_byte v1, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_byte v2, off, s[0:3], 0 +; VI-NEXT: s_waitcnt vmcnt(2) +; VI-NEXT: buffer_store_byte v3, off, s[0:3], 0 ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 -; VI-NEXT: .LBB40_9: ; %for.body.i +; VI-NEXT: .LBB40_1: ; %for.body.i ; VI-NEXT: ; =>This Inner Loop Header: Depth=1 -; VI-NEXT: s_branch .LBB40_9 +; VI-NEXT: s_branch .LBB40_1 ; ; GFX10-LABEL: cvt_f32_ubyte0_vector: ; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: global_load_ubyte v2, v[0:1], off offset:3 -; GFX10-NEXT: global_load_dword v3, v[0:1], off -; GFX10-NEXT: global_load_ubyte v4, v[0:1], off offset:2 -; GFX10-NEXT: global_load_ubyte v5, v[0:1], off offset:1 -; GFX10-NEXT: global_load_ubyte v6, v[0:1], off -; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v2 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: s_clause 0x3 +; GFX10-NEXT: global_load_ubyte v1, v0, s[0:1] offset:3 +; GFX10-NEXT: global_load_ubyte v2, v0, s[0:1] offset:2 +; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] offset:1 +; GFX10-NEXT: global_load_ubyte v4, v0, s[0:1] +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-NEXT: s_load_dword s0, s[0:1], 0x0 ; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_fma_f32 v0, v3, v0, 0.5 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v1 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_fma_f32 v0, s0, v0, 0.5 ; GFX10-NEXT: v_cvt_i32_f32_e32 v0, v0 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: global_store_byte v[0:1], v4, off +; GFX10-NEXT: global_store_byte v[0:1], v2, off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: global_store_byte v[0:1], v5, off +; GFX10-NEXT: global_store_byte v[0:1], v3, off ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_store_byte v[0:1], v6, off +; GFX10-NEXT: global_store_byte v[0:1], v4, off ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: .LBB40_1: ; %for.body.i ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll --- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -125,7 +125,7 @@ } ; GCN-LABEL: {{^}}div_fast_2_x_pat_f64: -; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 0.5 +; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0.5 ; GCN: buffer_store_dwordx2 [[MUL]] define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 { %x = load double, double addrspace(1)* undef @@ -135,9 +135,9 @@ } ; GCN-LABEL: {{^}}div_fast_k_x_pat_f64: -; GCN-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x9999999a -; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fb99999 -; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[K_LO:[0-9]+]], 0x9999999a +; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0x3fb99999 +; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; GCN: buffer_store_dwordx2 [[MUL]] define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 { %x = load double, double addrspace(1)* undef @@ -147,9 +147,9 @@ } ; GCN-LABEL: {{^}}div_fast_neg_k_x_pat_f64: -; GCN-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x9999999a -; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfb99999 -; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}} +; GCN-DAG: v_mov_b32_e32 v[[K_LO:[0-9]+]], 0x9999999a +; GCN-DAG: v_mov_b32_e32 v[[K_HI:[0-9]+]], 0xbfb99999 +; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[K_LO]]:[[K_HI]]{{\]}} ; GCN: buffer_store_dwordx2 [[MUL]] define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 { %x = load double, double addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll --- a/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll +++ b/llvm/test/CodeGen/AMDGPU/global-variable-relocs.ll @@ -16,9 +16,7 @@ ; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], private@rel32@lo+8 ; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], private@rel32@hi+16 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]] define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr @@ -30,9 +28,7 @@ ; CHECK: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}} ; CHECK: s_add_u32 s[[ADDR_LO:[0-9]+]], s[[PC_LO]], internal@rel32@lo+8 ; CHECK: s_addc_u32 s[[ADDR_HI:[0-9]+]], s[[PC_HI]], internal@rel32@hi+16 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]] define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr @@ -45,11 +41,7 @@ ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], available_externally@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], available_externally@gotpcrel32@hi+12 ; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 -; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 -; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr @@ -62,11 +54,7 @@ ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], linkonce@gotpcrel32@hi+12 ; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 -; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 -; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr @@ -79,11 +67,7 @@ ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], weak@gotpcrel32@hi+12 ; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 -; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 -; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr @@ -96,11 +80,7 @@ ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], common@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], common@gotpcrel32@hi+12 ; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 -; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 -; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr @@ -113,11 +93,7 @@ ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], extern_weak@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], extern_weak@gotpcrel32@hi+12 ; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 -; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 -; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr @@ -130,11 +106,7 @@ ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], linkonce_odr@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], linkonce_odr@gotpcrel32@hi+12 ; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 -; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 -; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr @@ -147,11 +119,7 @@ ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], weak_odr@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], weak_odr@gotpcrel32@hi+12 ; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 -; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 -; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr @@ -164,11 +132,7 @@ ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], external@gotpcrel32@hi+12 ; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 -; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 -; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr @@ -181,11 +145,7 @@ ; CHECK: s_add_u32 s[[GOTADDR_LO:[0-9]+]], s[[PC_LO]], external_w_init@gotpcrel32@lo+4 ; CHECK: s_addc_u32 s[[GOTADDR_HI:[0-9]+]], s[[PC_HI]], external_w_init@gotpcrel32@hi+12 ; CHECK: s_load_dwordx2 s{{\[}}[[ADDR_LO:[0-9]+]]:[[ADDR_HI:[0-9]+]]{{\]}}, s{{\[}}[[GOTADDR_LO]]:[[GOTADDR_HI]]{{\]}}, 0x0 -; CHECK: s_add_u32 s[[GEP_LO:[0-9]+]], s[[ADDR_LO]], 4 -; CHECK: s_addc_u32 s[[GEP_HI:[0-9]+]], s[[ADDR_HI]], 0 -; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]] -; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]] -; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}} +; CHECK: s_load_dword s{{[0-9]+}}, s[[[ADDR_LO]]:[[ADDR_HI]]], 0x4 define amdgpu_kernel void @external_w_init_test(i32 addrspace(1)* %out) { %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1 %val = load i32, i32 addrspace(1)* %ptr diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll --- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll @@ -148,41 +148,43 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 8, v0 ; GCN-NEXT: s_and_b64 vcc, exec, vcc ; GCN-NEXT: s_cbranch_vccnz .LBB1_6 ; GCN-NEXT: ; %bb.1: ; %bb14.lr.ph -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 ; GCN-NEXT: s_branch .LBB1_3 -; GCN-NEXT: .LBB1_2: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: s_mov_b64 s[0:1], -1 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: s_cbranch_execnz .LBB1_6 +; GCN-NEXT: .LBB1_2: ; %Flow +; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 vcc, vcc +; GCN-NEXT: s_cbranch_vccnz .LBB1_6 ; GCN-NEXT: .LBB1_3: ; %bb14 ; GCN-NEXT: ; =>This Loop Header: Depth=1 ; GCN-NEXT: ; Child Loop BB1_4 Depth 2 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 -; GCN-NEXT: s_and_b64 vcc, exec, vcc -; GCN-NEXT: s_cbranch_vccnz .LBB1_2 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s4, 1 +; GCN-NEXT: s_mov_b64 s[0:1], -1 +; GCN-NEXT: ; implicit-def: $sgpr4 +; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: .LBB1_4: ; %bb18 ; GCN-NEXT: ; Parent Loop BB1_3 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 8, v0 ; GCN-NEXT: s_and_b64 vcc, exec, vcc ; GCN-NEXT: s_cbranch_vccnz .LBB1_4 ; GCN-NEXT: ; %bb.5: ; %bb21 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 +; GCN-NEXT: s_load_dword s4, s[0:1], 0x0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_cmp_lt_i32_e64 s[0:1], 8, v1 -; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] -; GCN-NEXT: s_cbranch_vccz .LBB1_3 +; GCN-NEXT: v_cmp_lt_i32_e64 s[0:1], 8, v0 +; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: .LBB1_6: ; %bb31 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll --- a/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll @@ -20,8 +20,7 @@ define amdgpu_kernel void @simple_barrier(i32 addrspace(1)* %arg) { ; CHECK-LABEL: @simple_barrier( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0 -; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire @@ -59,8 +58,7 @@ define amdgpu_kernel void @memory_phi_no_clobber(i32 addrspace(1)* %arg) { ; CHECK-LABEL: @memory_phi_no_clobber( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0 -; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 ; CHECK: if.then: ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() @@ -106,8 +104,7 @@ define amdgpu_kernel void @memory_phi_clobber1(i32 addrspace(1)* %arg) { ; CHECK-LABEL: @memory_phi_clobber1( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0 -; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 ; CHECK: if.then: ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3 @@ -155,8 +152,7 @@ define amdgpu_kernel void @memory_phi_clobber2(i32 addrspace(1)* %arg) { ; CHECK-LABEL: @memory_phi_clobber2( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0 -; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0 ; CHECK: if.then: ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() @@ -203,8 +199,7 @@ define amdgpu_kernel void @no_clobbering_loop1(i32 addrspace(1)* %arg, i1 %cc) { ; CHECK-LABEL: @no_clobbering_loop1( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0 -; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0 ; CHECK: while.cond: ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0 @@ -242,8 +237,7 @@ define amdgpu_kernel void @no_clobbering_loop2(i32 addrspace(1)* noalias %arg, i32 addrspace(1)* noalias %out, i32 %n) { ; CHECK-LABEL: @no_clobbering_loop2( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0 -; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0 ; CHECK: while.cond: ; CHECK-NEXT: [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ] @@ -286,8 +280,7 @@ define amdgpu_kernel void @clobbering_loop(i32 addrspace(1)* %arg, i32 addrspace(1)* %out, i1 %cc) { ; CHECK-LABEL: @clobbering_loop( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0 -; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0 ; CHECK: while.cond: ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 1, !amdgpu.uniform !0 @@ -325,8 +318,7 @@ define amdgpu_kernel void @clobber_by_atomic_load(i32 addrspace(1)* %arg) { ; CHECK-LABEL: @clobber_by_atomic_load( ; CHECK-NEXT: bb: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[ARG:%.*]], i64 0, !amdgpu.uniform !0 -; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[I:%.*]] = load i32, i32 addrspace(1)* [[ARG:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 2, !amdgpu.uniform !0 ; CHECK-NEXT: [[VAL:%.*]] = load atomic i32, i32 addrspace(1)* [[GEP]] seq_cst, align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[ARG]], i64 3, !amdgpu.uniform !0 @@ -357,12 +349,11 @@ define protected amdgpu_kernel void @no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; CHECK-LABEL: @no_alias_store( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0 ; CHECK-NEXT: store i32 0, i32 addrspace(3)* @LDS, align 4 ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire -; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -411,12 +402,11 @@ define protected amdgpu_kernel void @no_alias_volatile_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; CHECK-LABEL: @no_alias_volatile_store( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0 ; CHECK-NEXT: store volatile i32 0, i32 addrspace(3)* @LDS, align 4 ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire -; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -438,9 +428,8 @@ define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; CHECK-LABEL: @no_alias_atomic_rmw_relaxed( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0 ; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 monotonic, align 4 -; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -459,12 +448,11 @@ define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %swap) { ; CHECK-LABEL: @no_alias_atomic_cmpxchg( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0 ; CHECK-NEXT: [[UNUSED:%.*]] = cmpxchg i32 addrspace(3)* @LDS, i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4 ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire -; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -486,12 +474,11 @@ define protected amdgpu_kernel void @no_alias_atomic_rmw(i32 addrspace(1)* %in, i32 addrspace(1)* %out) { ; CHECK-LABEL: @no_alias_atomic_rmw( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0 ; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4 ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire -; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; @@ -595,13 +582,12 @@ define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 addrspace(1)* noalias %noalias) { ; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0 ; CHECK-NEXT: store i32 2, i32 addrspace(1)* [[NOALIAS:%.*]], align 4 ; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4 ; CHECK-NEXT: fence syncscope("workgroup") release ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: fence syncscope("workgroup") acquire -; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[TMP0]], align 4, !amdgpu.noclobber !0 +; CHECK-NEXT: [[LD:%.*]] = load i32, i32 addrspace(1)* [[IN:%.*]], align 4, !amdgpu.noclobber !0 ; CHECK-NEXT: store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll --- a/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll +++ b/llvm/test/CodeGen/AMDGPU/rcp-pattern.ll @@ -118,7 +118,7 @@ } ; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f32: -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}} +; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], s{{[0-9]+}}, 0.5 ; GCN: buffer_store_dword [[MUL]] define amdgpu_kernel void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 { %x = load float, float addrspace(1)* undef @@ -128,7 +128,8 @@ } ; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f32: -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0x3dcccccd, v{{[0-9]+}} +; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x3dcccccd +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], s{{[0-9]+}}, [[V]] ; GCN: buffer_store_dword [[MUL]] define amdgpu_kernel void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 { %x = load float, float addrspace(1)* undef @@ -138,7 +139,8 @@ } ; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f32: -; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbdcccccd, v{{[0-9]+}} +; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0xbdcccccd +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], s{{[0-9]+}}, [[V]] ; GCN: buffer_store_dword [[MUL]] define amdgpu_kernel void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 { %x = load float, float addrspace(1)* undef diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -164,25 +164,24 @@ define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { ; SI-LABEL: loop_land_info_assert: ; SI: ; %bb.0: ; %entry -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; SI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9 +; SI-NEXT: s_load_dword s6, s[0:1], 0x0 ; SI-NEXT: s_load_dword s14, s[0:1], 0xc -; SI-NEXT: s_brev_b32 s8, 44 +; SI-NEXT: v_bfrev_b32_e32 v0, 44 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lt_i32 s2, 1 -; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_cmp_lt_i32 s3, 4 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 +; SI-NEXT: s_cmp_lt_i32 s3, 4 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; SI-NEXT: s_cmp_gt_i32 s3, 3 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; SI-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; SI-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; SI-NEXT: v_cmp_lt_f32_e64 s[6:7], |s6|, v0 +; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5] ; SI-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s8 -; SI-NEXT: s_and_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_and_b64 s[4:5], exec, s[6:7] +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: v_mov_b32_e32 v0, 3 ; SI-NEXT: s_branch .LBB3_4 ; SI-NEXT: .LBB3_1: ; %Flow6 @@ -240,25 +239,24 @@ ; ; FLAT-LABEL: loop_land_info_assert: ; FLAT: ; %bb.0: ; %entry -; FLAT-NEXT: s_mov_b32 s7, 0xf000 -; FLAT-NEXT: s_mov_b32 s6, -1 -; FLAT-NEXT: buffer_load_dword v0, off, s[4:7], 0 ; FLAT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; FLAT-NEXT: s_load_dword s6, s[0:1], 0x0 ; FLAT-NEXT: s_load_dword s14, s[0:1], 0x30 -; FLAT-NEXT: s_brev_b32 s8, 44 +; FLAT-NEXT: v_bfrev_b32_e32 v0, 44 ; FLAT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT-NEXT: s_cmp_lt_i32 s2, 1 -; FLAT-NEXT: s_cselect_b64 s[4:5], -1, 0 -; FLAT-NEXT: s_cmp_lt_i32 s3, 4 ; FLAT-NEXT: s_cselect_b64 s[0:1], -1, 0 +; FLAT-NEXT: s_cmp_lt_i32 s3, 4 +; FLAT-NEXT: s_cselect_b64 s[4:5], -1, 0 ; FLAT-NEXT: s_cmp_gt_i32 s3, 3 ; FLAT-NEXT: s_cselect_b64 s[2:3], -1, 0 -; FLAT-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] -; FLAT-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; FLAT-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; FLAT-NEXT: v_cmp_lt_f32_e64 s[6:7], |s6|, v0 +; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5] ; FLAT-NEXT: s_and_b64 s[2:3], exec, s[2:3] -; FLAT-NEXT: s_waitcnt vmcnt(0) -; FLAT-NEXT: v_cmp_lt_f32_e64 s[4:5], |v0|, s8 -; FLAT-NEXT: s_and_b64 s[4:5], exec, s[4:5] +; FLAT-NEXT: s_and_b64 s[4:5], exec, s[6:7] +; FLAT-NEXT: s_mov_b32 s7, 0xf000 +; FLAT-NEXT: s_mov_b32 s6, -1 ; FLAT-NEXT: v_mov_b32_e32 v0, 3 ; FLAT-NEXT: s_branch .LBB3_4 ; FLAT-NEXT: .LBB3_1: ; %Flow6 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -481,7 +481,7 @@ ; SI-NEXT: successors: %bb.6(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[V_MOV_B1:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec - ; SI-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: (load (s8) from `i8 addrspace(1)* null`, addrspace 1) + ; SI-NEXT: [[GLOBAL_LOAD_UBYTE1:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[V_MOV_B1]], 0, 0, implicit $exec :: ("amdgpu-noclobber" load (s8) from `i8 addrspace(1)* null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.6 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.5.Flow: