Index: lib/Target/AMDGPU/AMDGPUIntrinsics.td =================================================================== --- lib/Target/AMDGPU/AMDGPUIntrinsics.td +++ lib/Target/AMDGPU/AMDGPUIntrinsics.td @@ -69,8 +69,8 @@ def int_AMDGPU_bfm : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_brev : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; - def int_AMDGPU_barrier_local : Intrinsic<[], [], []>; - def int_AMDGPU_barrier_global : Intrinsic<[], [], []>; + def int_AMDGPU_barrier_local : Intrinsic<[], [], [IntrConvergent]>; + def int_AMDGPU_barrier_global : Intrinsic<[], [], [IntrConvergent]>; } // Legacy names for compatibility. Index: test/CodeGen/AMDGPU/addrspacecast.ll =================================================================== --- test/CodeGen/AMDGPU/addrspacecast.ll +++ test/CodeGen/AMDGPU/addrspacecast.ll @@ -62,5 +62,5 @@ declare i32 @llvm.r600.read.tidig.x() #3 attributes #0 = { nounwind } -attributes #1 = { nounwind noduplicate } +attributes #1 = { nounwind convergent } attributes #3 = { nounwind readnone } Index: test/CodeGen/AMDGPU/array-ptr-calc-i32.ll =================================================================== --- test/CodeGen/AMDGPU/array-ptr-calc-i32.ll +++ test/CodeGen/AMDGPU/array-ptr-calc-i32.ll @@ -2,7 +2,7 @@ ; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=SI -mattr=+promote-alloca < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s declare i32 @llvm.SI.tid() nounwind readnone -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate +declare void @llvm.AMDGPU.barrier.local() nounwind convergent ; The required pointer calculations for the alloca'd actually requires ; an add and won't be folded into the addressing, which fails with a @@ -35,7 +35,7 @@ %alloca_ptr = getelementptr [4 x i32], [4 x i32]* %alloca, i32 1, i32 %b store i32 %result, i32* %alloca_ptr, align 4 ; Dummy call - call void 
@llvm.AMDGPU.barrier.local() nounwind noduplicate + call void @llvm.AMDGPU.barrier.local() nounwind convergent %reload = load i32, i32* %alloca_ptr, align 4 %out_ptr = getelementptr i32, i32 addrspace(1)* %out, i32 %tid store i32 %reload, i32 addrspace(1)* %out_ptr, align 4 Index: test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll =================================================================== --- test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll +++ test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll @@ -49,4 +49,4 @@ attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } Index: test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll =================================================================== --- test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll +++ test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll @@ -66,5 +66,5 @@ } attributes #0 = { nounwind readnone } -attributes #1 = { noduplicate nounwind } +attributes #1 = { convergent nounwind } attributes #2 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/AMDGPU/ds-sub-offset.ll =================================================================== --- test/CodeGen/AMDGPU/ds-sub-offset.ll +++ test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -122,4 +122,4 @@ attributes #0 = { nounwind readnone } attributes #1 = { nounwind } -attributes #2 = { nounwind noduplicate convergent } +attributes #2 = { nounwind convergent } Index: test/CodeGen/AMDGPU/ds_read2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2.ll +++ test/CodeGen/AMDGPU/ds_read2.ll @@ -505,9 +505,9 @@ ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 
-; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } Index: test/CodeGen/AMDGPU/ds_read2_superreg.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2_superreg.ll +++ test/CodeGen/AMDGPU/ds_read2_superreg.ll @@ -229,9 +229,9 @@ ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } Index: test/CodeGen/AMDGPU/ds_read2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_read2st64.ll +++ test/CodeGen/AMDGPU/ds_read2st64.ll @@ -264,9 +264,5 @@ ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind -declare void @llvm.AMDGPU.barrier.local() #2 - attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } Index: 
test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2.ll +++ test/CodeGen/AMDGPU/ds_write2.ll @@ -431,9 +431,9 @@ ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } Index: test/CodeGen/AMDGPU/ds_write2st64.ll =================================================================== --- test/CodeGen/AMDGPU/ds_write2st64.ll +++ test/CodeGen/AMDGPU/ds_write2st64.ll @@ -109,9 +109,9 @@ ; Function Attrs: nounwind readnone declare i32 @llvm.r600.read.tidig.y() #1 -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind readnone } -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } Index: test/CodeGen/AMDGPU/flat-address-space.ll =================================================================== --- test/CodeGen/AMDGPU/flat-address-space.ll +++ test/CodeGen/AMDGPU/flat-address-space.ll @@ -128,5 +128,5 @@ declare i32 @llvm.r600.read.tidig.x() #3 attributes #0 = { nounwind } -attributes #1 = { nounwind noduplicate } +attributes #1 = { nounwind convergent } attributes #3 = { nounwind readnone } Index: 
test/CodeGen/AMDGPU/indirect-private-64.ll =================================================================== --- test/CodeGen/AMDGPU/indirect-private-64.ll +++ test/CodeGen/AMDGPU/indirect-private-64.ll @@ -4,7 +4,7 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s -declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind +declare void @llvm.AMDGPU.barrier.local() convergent nounwind ; SI-LABEL: {{^}}private_access_f64_alloca: @@ -18,7 +18,7 @@ %array = alloca double, i32 16, align 8 %ptr = getelementptr double, double* %array, i32 %b store double %val, double* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load double, double* %ptr, align 8 store double %result, double addrspace(1)* %out, align 8 ret void @@ -38,7 +38,7 @@ %array = alloca <2 x double>, i32 16, align 16 %ptr = getelementptr <2 x double>, <2 x double>* %array, i32 %b store <2 x double> %val, <2 x double>* %ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load <2 x double>, <2 x double>* %ptr, align 16 store <2 x double> %result, <2 x double> addrspace(1)* %out, align 16 ret void @@ -56,7 +56,7 @@ %array = alloca i64, i32 16, align 8 %ptr = getelementptr i64, i64* %array, i32 %b store i64 %val, i64* %ptr, align 8 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() convergent nounwind %result = load i64, i64* %ptr, align 8 store i64 %result, i64 addrspace(1)* %out, align 8 ret void @@ -76,7 +76,7 @@ %array = alloca <2 x i64>, i32 16, align 16 %ptr = getelementptr <2 x i64>, <2 x i64>* %array, i32 %b store <2 x i64> %val, <2 x i64>* %ptr, align 16 - call void @llvm.AMDGPU.barrier.local() noduplicate nounwind + call void @llvm.AMDGPU.barrier.local() 
convergent nounwind %result = load <2 x i64>, <2 x i64>* %ptr, align 16 store <2 x i64> %result, <2 x i64> addrspace(1)* %out, align 16 ret void Index: test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll =================================================================== --- test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll +++ test/CodeGen/AMDGPU/llvm.AMDGPU.div_fmas.ll @@ -4,7 +4,6 @@ ; FIXME: Enable for VI. declare i32 @llvm.r600.read.tidig.x() nounwind readnone -declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate declare float @llvm.AMDGPU.div.fmas.f32(float, float, float, i1) nounwind readnone declare double @llvm.AMDGPU.div.fmas.f64(double, double, double, i1) nounwind readnone Index: test/CodeGen/AMDGPU/merge-stores.ll =================================================================== --- test/CodeGen/AMDGPU/merge-stores.ll +++ test/CodeGen/AMDGPU/merge-stores.ll @@ -708,4 +708,4 @@ declare void @llvm.AMDGPU.barrier.local() #1 attributes #0 = { nounwind } -attributes #1 = { noduplicate nounwind } +attributes #1 = { convergent nounwind } Index: test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll =================================================================== --- test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll +++ test/CodeGen/AMDGPU/schedule-vs-if-nested-loop-failure.ll @@ -3,7 +3,7 @@ ; RUN: llc -O0 -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI ; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck %s -check-prefix=SI -declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate +declare void @llvm.AMDGPU.barrier.local() nounwind convergent ; SI-LABEL: {{^}}main( Index: test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll =================================================================== --- test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -234,4 +234,4 @@ attributes #0 = { nounwind "less-precise-fpmad"="false" 
"no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { "ShaderType"="1" nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" } -attributes #2 = { nounwind noduplicate } +attributes #2 = { nounwind convergent } Index: test/CodeGen/AMDGPU/store-barrier.ll =================================================================== --- test/CodeGen/AMDGPU/store-barrier.ll +++ test/CodeGen/AMDGPU/store-barrier.ll @@ -36,7 +36,7 @@ ret void } -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.local() #2 -attributes #2 = { noduplicate nounwind } +attributes #2 = { convergent nounwind } Index: test/CodeGen/AMDGPU/wait.ll =================================================================== --- test/CodeGen/AMDGPU/wait.ll +++ test/CodeGen/AMDGPU/wait.ll @@ -70,7 +70,7 @@ } -; Function Attrs: noduplicate nounwind +; Function Attrs: convergent nounwind declare void @llvm.AMDGPU.barrier.global() #1 ; Function Attrs: nounwind readnone @@ -79,7 +79,7 @@ declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) attributes #0 = { "ShaderType"="1" } -attributes #1 = { noduplicate nounwind } +attributes #1 = { convergent nounwind } attributes #2 = { nounwind readnone } !0 = !{!1, !1, i64 0, i32 1} Index: test/Transforms/LoopUnroll/AMDGPU/lit.local.cfg =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/AMDGPU/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'AMDGPU' in config.root.targets: + config.unsupported = True + Index: test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll 
=================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll @@ -0,0 +1,33 @@ +; RUN: opt -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -loop-unroll -S < %s | FileCheck %s + +; CHECK-LABEL: @test_unroll_convergent_barrier( +; CHECK: call void @llvm.AMDGPU.barrier.global() +; CHECK: call void @llvm.AMDGPU.barrier.global() +; CHECK: call void @llvm.AMDGPU.barrier.global() +; CHECK: call void @llvm.AMDGPU.barrier.global() +; CHECK-NOT: br +define void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %sum.02 = phi i32 [ %add, %for.body ], [ 0, %entry ] + %arrayidx.in = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %indvars.iv + %arrayidx.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %indvars.iv + %load = load i32, i32 addrspace(1)* %arrayidx.in + call void @llvm.AMDGPU.barrier.global() #1 + %add = add i32 %load, %sum.02 + store i32 %add, i32 addrspace(1)* %arrayidx.out + %indvars.iv.next = add i32 %indvars.iv, 1 + %exitcond = icmp eq i32 %indvars.iv.next, 4 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +declare void @llvm.AMDGPU.barrier.global() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind convergent }