diff --git a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll --- a/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll +++ b/llvm/test/CodeGen/AMDGPU/simplify-libcalls.ll @@ -1,22 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-POSTLINK %s -; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink -amdgpu-enable-ocl-mangling-mismatch-workaround=0 <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-PRELINK %s +; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink -amdgpu-enable-ocl-mangling-mismatch-workaround=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-PRELINK %s ; RUN: opt -S -O1 -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NATIVE %s ; RUN: opt -S -passes='default' -mtriple=amdgcn-- -amdgpu-simplify-libcall < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-POSTLINK %s -; RUN: opt -S -passes='default' -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink -amdgpu-enable-ocl-mangling-mismatch-workaround=0 <%s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-PRELINK %s +; RUN: opt -S -passes='default' -mtriple=amdgcn-- -amdgpu-simplify-libcall -amdgpu-prelink -amdgpu-enable-ocl-mangling-mismatch-workaround=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-PRELINK %s ; RUN: opt -S -passes='default' -mtriple=amdgcn-- -amdgpu-use-native -amdgpu-prelink < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NATIVE %s -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos -; GCN-POSTLINK: call fast float @_Z3sinf( -; GCN-POSTLINK: call fast float @_Z3cosf( -; GCN-PRELINK: call fast float 
@_Z6sincosfPf( -; GCN-NATIVE: call fast float @_Z10native_sinf( -; GCN-NATIVE: call fast float @_Z10native_cosf( define amdgpu_kernel void @test_sincos(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_sincos +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z3sinf(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL2:%.*]] = tail call fast float @_Z3cosf(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: store float [[CALL2]], ptr addrspace(1) [[ARRAYIDX3]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_sincos +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[__SINCOS_CALL:%.*]] = alloca float, align 4 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = call fast float @_Z6sincosfPf(float [[TMPVARVAR]], ptr nonnull [[__SINCOS_CALL]]) +; GCN-PRELINK-NEXT: store float [[TMP0]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = load float, ptr [[__SINCOS_CALL]], align 4 +; GCN-PRELINK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: store float [[TMP1]], ptr addrspace(1) [[ARRAYIDX3]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_sincos +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 
+; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast float @_Z10native_sinf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[CALL2:%.*]] = tail call fast float @_Z10native_cosf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: store float [[CALL2]], ptr addrspace(1) [[ARRAYIDX3]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3sinf(float %tmp) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3sinf(float %tmpvarvar) store float %call, ptr addrspace(1) %a, align 4 - %call2 = call fast float @_Z3cosf(float %tmp) + %call2 = call fast float @_Z3cosf(float %tmpvarvar) %arrayidx3 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 store float %call2, ptr addrspace(1) %arrayidx3, align 4 ret void @@ -26,18 +55,46 @@ declare float @_Z3cosf(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v2 -; GCN-POSTLINK: call fast <2 x float> @_Z3sinDv2_f( -; GCN-POSTLINK: call fast <2 x float> @_Z3cosDv2_f( -; GCN-PRELINK: call fast <2 x float> @_Z6sincosDv2_fPS_( -; GCN-NATIVE: call fast <2 x float> @_Z10native_sinDv2_f( -; GCN-NATIVE: call fast <2 x float> @_Z10native_cosDv2_f( define amdgpu_kernel void @test_sincos_v2(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_sincos_v2 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load <2 x float>, ptr addrspace(1) [[A]], align 8 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast <2 x float> @_Z3sinDv2_f(<2 x float> [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store <2 x float> [[CALL]], ptr addrspace(1) [[A]], align 8 +; GCN-POSTLINK-NEXT: [[CALL2:%.*]] = tail call fast <2 x float> @_Z3cosDv2_f(<2 x float> 
[[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <2 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: store <2 x float> [[CALL2]], ptr addrspace(1) [[ARRAYIDX3]], align 8 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_sincos_v2 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[__SINCOS_CALL:%.*]] = alloca <2 x float>, align 8 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load <2 x float>, ptr addrspace(1) [[A]], align 8 +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = call fast <2 x float> @_Z6sincosDv2_fPS_(<2 x float> [[TMPVARVAR]], ptr nonnull [[__SINCOS_CALL]]) +; GCN-PRELINK-NEXT: store <2 x float> [[TMP0]], ptr addrspace(1) [[A]], align 8 +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[__SINCOS_CALL]], align 8 +; GCN-PRELINK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <2 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: store <2 x float> [[TMP1]], ptr addrspace(1) [[ARRAYIDX3]], align 8 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_sincos_v2 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load <2 x float>, ptr addrspace(1) [[A]], align 8 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast <2 x float> @_Z10native_sinDv2_f(<2 x float> [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store <2 x float> [[CALL]], ptr addrspace(1) [[A]], align 8 +; GCN-NATIVE-NEXT: [[CALL2:%.*]] = tail call fast <2 x float> @_Z10native_cosDv2_f(<2 x float> [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <2 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: store <2 x float> [[CALL2]], ptr addrspace(1) [[ARRAYIDX3]], align 8 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load <2 x float>, ptr addrspace(1) %a, align 8 - %call = call fast <2 
x float> @_Z3sinDv2_f(<2 x float> %tmp) + %tmpvarvar = load <2 x float>, ptr addrspace(1) %a, align 8 + %call = call fast <2 x float> @_Z3sinDv2_f(<2 x float> %tmpvarvar) store <2 x float> %call, ptr addrspace(1) %a, align 8 - %call2 = call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmp) + %call2 = call fast <2 x float> @_Z3cosDv2_f(<2 x float> %tmpvarvar) %arrayidx3 = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i64 1 store <2 x float> %call2, ptr addrspace(1) %arrayidx3, align 8 ret void @@ -47,13 +104,50 @@ declare <2 x float> @_Z3cosDv2_f(<2 x float>) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v3 -; GCN-POSTLINK: call fast <3 x float> @_Z3sinDv3_f( -; GCN-POSTLINK: call fast <3 x float> @_Z3cosDv3_f( -; GCN-PRELINK: call fast <3 x float> @_Z6sincosDv3_fPS_( -; GCN-NATIVE: call fast <3 x float> @_Z10native_sinDv3_f( -; GCN-NATIVE: call fast <3 x float> @_Z10native_cosDv3_f( define amdgpu_kernel void @test_sincos_v3(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_sincos_v3 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[LOADVEC4:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +; GCN-POSTLINK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x float> [[LOADVEC4]], <4 x float> undef, <3 x i32> +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast <3 x float> @_Z3sinDv3_f(<3 x float> [[EXTRACTVEC4]]) +; GCN-POSTLINK-NEXT: [[EXTRACTVEC6:%.*]] = shufflevector <3 x float> [[CALL]], <3 x float> undef, <4 x i32> +; GCN-POSTLINK-NEXT: store <4 x float> [[EXTRACTVEC6]], ptr addrspace(1) [[A]], align 16 +; GCN-POSTLINK-NEXT: [[CALL11:%.*]] = tail call fast <3 x float> @_Z3cosDv3_f(<3 x float> [[EXTRACTVEC4]]) +; GCN-POSTLINK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds <3 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[EXTRACTVEC13:%.*]] = shufflevector <3 x float> [[CALL11]], <3 x float> undef, <4 
x i32> +; GCN-POSTLINK-NEXT: store <4 x float> [[EXTRACTVEC13]], ptr addrspace(1) [[ARRAYIDX12]], align 16 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_sincos_v3 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[__SINCOS_CALL:%.*]] = alloca <3 x float>, align 16 +; GCN-PRELINK-NEXT: [[LOADVEC4:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +; GCN-PRELINK-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x float> [[LOADVEC4]], <4 x float> undef, <3 x i32> +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = call fast <3 x float> @_Z6sincosDv3_fPS_(<3 x float> [[EXTRACTVEC4]], ptr nonnull [[__SINCOS_CALL]]) +; GCN-PRELINK-NEXT: [[EXTRACTVEC6:%.*]] = shufflevector <3 x float> [[TMP0]], <3 x float> undef, <4 x i32> +; GCN-PRELINK-NEXT: store <4 x float> [[EXTRACTVEC6]], ptr addrspace(1) [[A]], align 16 +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[__SINCOS_CALL]], align 16 +; GCN-PRELINK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds <3 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[EXTRACTVEC13:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> undef, <4 x i32> +; GCN-PRELINK-NEXT: store <4 x float> [[EXTRACTVEC13]], ptr addrspace(1) [[ARRAYIDX12]], align 16 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_sincos_v3 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[LOADVEC4:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +; GCN-NATIVE-NEXT: [[EXTRACTVEC4:%.*]] = shufflevector <4 x float> [[LOADVEC4]], <4 x float> undef, <3 x i32> +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast <3 x float> @_Z10native_sinDv3_f(<3 x float> [[EXTRACTVEC4]]) +; GCN-NATIVE-NEXT: [[EXTRACTVEC6:%.*]] = shufflevector <3 x float> [[CALL]], <3 x float> undef, <4 x i32> +; GCN-NATIVE-NEXT: store <4 x float> 
[[EXTRACTVEC6]], ptr addrspace(1) [[A]], align 16 +; GCN-NATIVE-NEXT: [[CALL11:%.*]] = tail call fast <3 x float> @_Z10native_cosDv3_f(<3 x float> [[EXTRACTVEC4]]) +; GCN-NATIVE-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds <3 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[EXTRACTVEC13:%.*]] = shufflevector <3 x float> [[CALL11]], <3 x float> undef, <4 x i32> +; GCN-NATIVE-NEXT: store <4 x float> [[EXTRACTVEC13]], ptr addrspace(1) [[ARRAYIDX12]], align 16 +; GCN-NATIVE-NEXT: ret void +; entry: %loadVec4 = load <4 x float>, ptr addrspace(1) %a, align 16 %extractVec4 = shufflevector <4 x float> %loadVec4, <4 x float> undef, <3 x i32> @@ -71,18 +165,46 @@ declare <3 x float> @_Z3cosDv3_f(<3 x float>) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v4 -; GCN-POSTLINK: call fast <4 x float> @_Z3sinDv4_f( -; GCN-POSTLINK: call fast <4 x float> @_Z3cosDv4_f( -; GCN-PRELINK: call fast <4 x float> @_Z6sincosDv4_fPS_( -; GCN-NATIVE: call fast <4 x float> @_Z10native_sinDv4_f( -; GCN-NATIVE: call fast <4 x float> @_Z10native_cosDv4_f( define amdgpu_kernel void @test_sincos_v4(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_sincos_v4 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast <4 x float> @_Z3sinDv4_f(<4 x float> [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store <4 x float> [[CALL]], ptr addrspace(1) [[A]], align 16 +; GCN-POSTLINK-NEXT: [[CALL2:%.*]] = tail call fast <4 x float> @_Z3cosDv4_f(<4 x float> [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: store <4 x float> [[CALL2]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void 
@test_sincos_v4 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[__SINCOS_CALL:%.*]] = alloca <4 x float>, align 16 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = call fast <4 x float> @_Z6sincosDv4_fPS_(<4 x float> [[TMPVARVAR]], ptr nonnull [[__SINCOS_CALL]]) +; GCN-PRELINK-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[A]], align 16 +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[__SINCOS_CALL]], align 16 +; GCN-PRELINK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: store <4 x float> [[TMP1]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_sincos_v4 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load <4 x float>, ptr addrspace(1) [[A]], align 16 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast <4 x float> @_Z10native_sinDv4_f(<4 x float> [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store <4 x float> [[CALL]], ptr addrspace(1) [[A]], align 16 +; GCN-NATIVE-NEXT: [[CALL2:%.*]] = tail call fast <4 x float> @_Z10native_cosDv4_f(<4 x float> [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <4 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: store <4 x float> [[CALL2]], ptr addrspace(1) [[ARRAYIDX3]], align 16 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load <4 x float>, ptr addrspace(1) %a, align 16 - %call = call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmp) + %tmpvarvar = load <4 x float>, ptr addrspace(1) %a, align 16 + %call = call fast <4 x float> @_Z3sinDv4_f(<4 x float> %tmpvarvar) store <4 x float> %call, ptr addrspace(1) %a, align 16 - %call2 = call fast <4 x float> @_Z3cosDv4_f(<4 x float> 
%tmp) + %call2 = call fast <4 x float> @_Z3cosDv4_f(<4 x float> %tmpvarvar) %arrayidx3 = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i64 1 store <4 x float> %call2, ptr addrspace(1) %arrayidx3, align 16 ret void @@ -92,18 +214,46 @@ declare <4 x float> @_Z3cosDv4_f(<4 x float>) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v8 -; GCN-POSTLINK: call fast <8 x float> @_Z3sinDv8_f( -; GCN-POSTLINK: call fast <8 x float> @_Z3cosDv8_f( -; GCN-PRELINK: call fast <8 x float> @_Z6sincosDv8_fPS_( -; GCN-NATIVE: call fast <8 x float> @_Z10native_sinDv8_f( -; GCN-NATIVE: call fast <8 x float> @_Z10native_cosDv8_f( define amdgpu_kernel void @test_sincos_v8(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_sincos_v8 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load <8 x float>, ptr addrspace(1) [[A]], align 32 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast <8 x float> @_Z3sinDv8_f(<8 x float> [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store <8 x float> [[CALL]], ptr addrspace(1) [[A]], align 32 +; GCN-POSTLINK-NEXT: [[CALL2:%.*]] = tail call fast <8 x float> @_Z3cosDv8_f(<8 x float> [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <8 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: store <8 x float> [[CALL2]], ptr addrspace(1) [[ARRAYIDX3]], align 32 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_sincos_v8 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[__SINCOS_CALL:%.*]] = alloca <8 x float>, align 32 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load <8 x float>, ptr addrspace(1) [[A]], align 32 +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = call fast <8 x float> @_Z6sincosDv8_fPS_(<8 x float> [[TMPVARVAR]], ptr nonnull [[__SINCOS_CALL]]) +; GCN-PRELINK-NEXT: 
store <8 x float> [[TMP0]], ptr addrspace(1) [[A]], align 32 +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[__SINCOS_CALL]], align 32 +; GCN-PRELINK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <8 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: store <8 x float> [[TMP1]], ptr addrspace(1) [[ARRAYIDX3]], align 32 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_sincos_v8 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load <8 x float>, ptr addrspace(1) [[A]], align 32 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast <8 x float> @_Z10native_sinDv8_f(<8 x float> [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store <8 x float> [[CALL]], ptr addrspace(1) [[A]], align 32 +; GCN-NATIVE-NEXT: [[CALL2:%.*]] = tail call fast <8 x float> @_Z10native_cosDv8_f(<8 x float> [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <8 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: store <8 x float> [[CALL2]], ptr addrspace(1) [[ARRAYIDX3]], align 32 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load <8 x float>, ptr addrspace(1) %a, align 32 - %call = call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmp) + %tmpvarvar = load <8 x float>, ptr addrspace(1) %a, align 32 + %call = call fast <8 x float> @_Z3sinDv8_f(<8 x float> %tmpvarvar) store <8 x float> %call, ptr addrspace(1) %a, align 32 - %call2 = call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmp) + %call2 = call fast <8 x float> @_Z3cosDv8_f(<8 x float> %tmpvarvar) %arrayidx3 = getelementptr inbounds <8 x float>, ptr addrspace(1) %a, i64 1 store <8 x float> %call2, ptr addrspace(1) %arrayidx3, align 32 ret void @@ -113,18 +263,46 @@ declare <8 x float> @_Z3cosDv8_f(<8 x float>) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_sincos_v16 -; GCN-POSTLINK: call fast <16 x float> @_Z3sinDv16_f( -; GCN-POSTLINK: call fast <16 x 
float> @_Z3cosDv16_f( -; GCN-PRELINK: call fast <16 x float> @_Z6sincosDv16_fPS_( -; GCN-NATIVE: call fast <16 x float> @_Z10native_sinDv16_f( -; GCN-NATIVE: call fast <16 x float> @_Z10native_cosDv16_f( define amdgpu_kernel void @test_sincos_v16(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_sincos_v16 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load <16 x float>, ptr addrspace(1) [[A]], align 64 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast <16 x float> @_Z3sinDv16_f(<16 x float> [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store <16 x float> [[CALL]], ptr addrspace(1) [[A]], align 64 +; GCN-POSTLINK-NEXT: [[CALL2:%.*]] = tail call fast <16 x float> @_Z3cosDv16_f(<16 x float> [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <16 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: store <16 x float> [[CALL2]], ptr addrspace(1) [[ARRAYIDX3]], align 64 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_sincos_v16 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[__SINCOS_CALL:%.*]] = alloca <16 x float>, align 64 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load <16 x float>, ptr addrspace(1) [[A]], align 64 +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = call fast <16 x float> @_Z6sincosDv16_fPS_(<16 x float> [[TMPVARVAR]], ptr nonnull [[__SINCOS_CALL]]) +; GCN-PRELINK-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[A]], align 64 +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[__SINCOS_CALL]], align 64 +; GCN-PRELINK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <16 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: store <16 x float> [[TMP1]], ptr addrspace(1) [[ARRAYIDX3]], align 64 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define 
amdgpu_kernel void @test_sincos_v16 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load <16 x float>, ptr addrspace(1) [[A]], align 64 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast <16 x float> @_Z10native_sinDv16_f(<16 x float> [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store <16 x float> [[CALL]], ptr addrspace(1) [[A]], align 64 +; GCN-NATIVE-NEXT: [[CALL2:%.*]] = tail call fast <16 x float> @_Z10native_cosDv16_f(<16 x float> [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds <16 x float>, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: store <16 x float> [[CALL2]], ptr addrspace(1) [[ARRAYIDX3]], align 64 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load <16 x float>, ptr addrspace(1) %a, align 64 - %call = call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmp) + %tmpvarvar = load <16 x float>, ptr addrspace(1) %a, align 64 + %call = call fast <16 x float> @_Z3sinDv16_f(<16 x float> %tmpvarvar) store <16 x float> %call, ptr addrspace(1) %a, align 64 - %call2 = call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmp) + %call2 = call fast <16 x float> @_Z3cosDv16_f(<16 x float> %tmpvarvar) %arrayidx3 = getelementptr inbounds <16 x float>, ptr addrspace(1) %a, i64 1 store <16 x float> %call2, ptr addrspace(1) %arrayidx3, align 64 ret void @@ -134,9 +312,26 @@ declare <16 x float> @_Z3cosDv16_f(<16 x float>) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_recip ; GCN: store float 0x3FD5555560000000, ptr addrspace(1) %a define amdgpu_kernel void @test_native_recip(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_native_recip +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: store float 0x3FD5555560000000, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; 
GCN-PRELINK-LABEL: define amdgpu_kernel void @test_native_recip +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: store float 0x3FD5555560000000, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_native_recip +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: store float 0x3FD5555560000000, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %call = call fast float @_Z12native_recipf(float 3.000000e+00) store float %call, ptr addrspace(1) %a, align 4 @@ -145,9 +340,26 @@ declare float @_Z12native_recipf(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_recip ; GCN: store float 0x3FD5555560000000, ptr addrspace(1) %a define amdgpu_kernel void @test_half_recip(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_half_recip +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: store float 0x3FD5555560000000, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_half_recip +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: store float 0x3FD5555560000000, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_half_recip +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: store float 0x3FD5555560000000, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %call = call fast float 
@_Z10half_recipf(float 3.000000e+00) store float %call, ptr addrspace(1) %a, align 4 @@ -156,496 +368,1405 @@ declare float @_Z10half_recipf(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_native_divide -; GCN: fmul fast float %tmp, 0x3FD5555560000000 +; GCN: fmul fast float %tmpvarvar, 0x3FD5555560000000 define amdgpu_kernel void @test_native_divide(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_native_divide +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[__DIV2MUL:%.*]] = fmul fast float [[TMPVARVAR]], 0x3FD5555560000000 +; GCN-POSTLINK-NEXT: store float [[__DIV2MUL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_native_divide +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[__DIV2MUL:%.*]] = fmul fast float [[TMPVARVAR]], 0x3FD5555560000000 +; GCN-PRELINK-NEXT: store float [[__DIV2MUL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_native_divide +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[__DIV2MUL:%.*]] = fmul fast float [[TMPVARVAR]], 0x3FD5555560000000 +; GCN-NATIVE-NEXT: store float [[__DIV2MUL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z13native_divideff(float %tmp, float 3.000000e+00) + %tmpvarvar = 
load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z13native_divideff(float %tmpvarvar, float 3.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z13native_divideff(float, float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_half_divide -; GCN: fmul fast float %tmp, 0x3FD5555560000000 +; GCN: fmul fast float %tmpvarvar, 0x3FD5555560000000 define amdgpu_kernel void @test_half_divide(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_half_divide +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[__DIV2MUL:%.*]] = fmul fast float [[TMPVARVAR]], 0x3FD5555560000000 +; GCN-POSTLINK-NEXT: store float [[__DIV2MUL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_half_divide +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[__DIV2MUL:%.*]] = fmul fast float [[TMPVARVAR]], 0x3FD5555560000000 +; GCN-PRELINK-NEXT: store float [[__DIV2MUL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_half_divide +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[__DIV2MUL:%.*]] = fmul fast float [[TMPVARVAR]], 0x3FD5555560000000 +; GCN-NATIVE-NEXT: store float [[__DIV2MUL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float 
@_Z11half_divideff(float %tmp, float 3.000000e+00) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z11half_divideff(float %tmpvarvar, float 3.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z11half_divideff(float, float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0f ; GCN: store float 1.000000e+00, ptr addrspace(1) %a define amdgpu_kernel void @test_pow_0f(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_0f +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: store float 1.000000e+00, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_0f +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: store float 1.000000e+00, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_0f +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: store float 1.000000e+00, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3powff(float %tmp, float 0.000000e+00) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float 0.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z3powff(float, float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_0i ; GCN: store float 1.000000e+00, ptr addrspace(1) %a define amdgpu_kernel void @test_pow_0i(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_0i +; 
GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: store float 1.000000e+00, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_0i +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: store float 1.000000e+00, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_0i +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: store float 1.000000e+00, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3powff(float %tmp, float 0.000000e+00) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float 0.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1f -; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4 -; GCN: store float %tmp, ptr addrspace(1) %a, align 4 +; GCN: %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 +; GCN: store float %tmpvarvar, ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_pow_1f(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_1f +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: store float [[TMPVARVAR]], ptr addrspace(1) [[A]], align 4 +; 
GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_1f +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-PRELINK-NEXT: store float [[TMPVARVAR]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_1f +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: store float [[TMPVARVAR]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %call = call fast float @_Z3powff(float %tmp, float 1.000000e+00) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float 1.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_1i -; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4 -; GCN: store float %tmp, ptr addrspace(1) %a, align 4 +; GCN: %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 +; GCN: store float %tmpvarvar, ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_pow_1i(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_1i +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = 
getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: store float [[TMPVARVAR]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_1i +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-PRELINK-NEXT: store float [[TMPVARVAR]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_1i +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: store float [[TMPVARVAR]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %call = call fast float @_Z3powff(float %tmp, float 1.000000e+00) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float 1.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2f -; GCN: %tmp = load float, ptr addrspace(1) %a, align 4 -; GCN: %__pow2 = fmul fast float %tmp, %tmp +; GCN: %tmpvarvar = load float, ptr addrspace(1) %a, align 4 +; GCN: %__pow2 = fmul fast float %tmpvarvar, %tmpvarvar define amdgpu_kernel void @test_pow_2f(ptr addrspace(1) nocapture %a) { +; 
GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_2f +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[__POW2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: store float [[__POW2]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_2f +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[__POW2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-PRELINK-NEXT: store float [[__POW2]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_2f +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[__POW2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-NATIVE-NEXT: store float [[__POW2]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3powff(float %tmp, float 2.000000e+00) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float 2.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_2i -; GCN: %tmp = load float, ptr addrspace(1) %a, align 4 -; GCN: %__pow2 = fmul fast float %tmp, %tmp +; GCN: %tmpvarvar = load float, ptr addrspace(1) %a, align 4 +; GCN: %__pow2 = fmul fast float %tmpvarvar, %tmpvarvar define amdgpu_kernel void 
@test_pow_2i(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_2i +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[__POW2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: store float [[__POW2]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_2i +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[__POW2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-PRELINK-NEXT: store float [[__POW2]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_2i +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[__POW2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-NATIVE-NEXT: store float [[__POW2]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3powff(float %tmp, float 2.000000e+00) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float 2.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1f -; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4 -; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp +; GCN: %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 +; GCN: 
%__powrecip = fdiv fast float 1.000000e+00, %tmpvarvar define amdgpu_kernel void @test_pow_m1f(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_m1f +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: [[__POWRECIP:%.*]] = fdiv fast float 1.000000e+00, [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: store float [[__POWRECIP]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_m1f +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-PRELINK-NEXT: [[__POWRECIP:%.*]] = fdiv fast float 1.000000e+00, [[TMPVARVAR]] +; GCN-PRELINK-NEXT: store float [[__POWRECIP]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_m1f +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: [[__POWRECIP:%.*]] = fdiv fast float 1.000000e+00, [[TMPVARVAR]] +; GCN-NATIVE-NEXT: store float [[__POWRECIP]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %call = call 
fast float @_Z3powff(float %tmp, float -1.000000e+00) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float -1.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_m1i -; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4 -; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmp +; GCN: %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 +; GCN: %__powrecip = fdiv fast float 1.000000e+00, %tmpvarvar define amdgpu_kernel void @test_pow_m1i(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_m1i +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: [[__POWRECIP:%.*]] = fdiv fast float 1.000000e+00, [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: store float [[__POWRECIP]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_m1i +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-PRELINK-NEXT: [[__POWRECIP:%.*]] = fdiv fast float 1.000000e+00, [[TMPVARVAR]] +; GCN-PRELINK-NEXT: store float [[__POWRECIP]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_m1i +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: [[__POWRECIP:%.*]] = fdiv fast float 1.000000e+00, [[TMPVARVAR]] +; GCN-NATIVE-NEXT: store float [[__POWRECIP]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %call = call fast float @_Z3powff(float %tmp, float -1.000000e+00) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float -1.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_half -; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 5.000000e-01) -; GCN-PRELINK: %__pow2sqrt = tail call fast float @_Z4sqrtf(float %tmp) define amdgpu_kernel void @test_pow_half(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_half +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z3powff(float [[TMPVARVAR]], float 5.000000e-01) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_half +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 
+; GCN-PRELINK-NEXT: [[__POW2SQRT:%.*]] = tail call fast float @_Z4sqrtf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[__POW2SQRT]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_half +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: [[__POW2SQRT:%.*]] = tail call fast float @_Z4sqrtf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[__POW2SQRT]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %call = call fast float @_Z3powff(float %tmp, float 5.000000e-01) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float 5.000000e-01) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_mhalf -; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float -5.000000e-01) -; GCN-PRELINK: %__pow2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp) define amdgpu_kernel void @test_pow_mhalf(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_mhalf +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z3powff(float [[TMPVARVAR]], float -5.000000e-01) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) 
[[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_mhalf +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-PRELINK-NEXT: [[__POW2RSQRT:%.*]] = tail call fast float @_Z5rsqrtf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[__POW2RSQRT]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_mhalf +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: [[__POW2RSQRT:%.*]] = tail call fast float @_Z5rsqrtf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[__POW2RSQRT]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %call = call fast float @_Z3powff(float %tmp, float -5.000000e-01) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float -5.000000e-01) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow_c -; GCN: %__powx2 = fmul fast float %tmp, %tmp +; GCN: %__powx2 = fmul fast float %tmpvarvar, %tmpvarvar ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2 -; GCN: %__powx22 = fmul fast float %__powx2, %tmp -; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21 -; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22 +; GCN: %__powx22 = fmul 
fast float %__powx2, %tmpvarvar define amdgpu_kernel void @test_pow_c(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow_c +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: [[__POWX2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: [[__POWX21:%.*]] = fmul fast float [[__POWX2]], [[__POWX2]] +; GCN-POSTLINK-NEXT: [[__POWX22:%.*]] = fmul fast float [[__POWX2]], [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: [[TMP0:%.*]] = fmul fast float [[__POWX21]], [[__POWX21]] +; GCN-POSTLINK-NEXT: [[__POWPROD3:%.*]] = fmul fast float [[TMP0]], [[__POWX22]] +; GCN-POSTLINK-NEXT: store float [[__POWPROD3]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow_c +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-PRELINK-NEXT: [[__POWX2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-PRELINK-NEXT: [[__POWX21:%.*]] = fmul fast float [[__POWX2]], [[__POWX2]] +; GCN-PRELINK-NEXT: [[__POWX22:%.*]] = fmul fast float [[__POWX2]], [[TMPVARVAR]] +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = fmul fast float [[__POWX21]], [[__POWX21]] +; GCN-PRELINK-NEXT: [[__POWPROD3:%.*]] = fmul fast float [[TMP0]], [[__POWX22]] +; GCN-PRELINK-NEXT: store float [[__POWPROD3]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow_c +; GCN-NATIVE-SAME: (ptr addrspace(1) 
nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: [[__POWX2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-NATIVE-NEXT: [[__POWX21:%.*]] = fmul fast float [[__POWX2]], [[__POWX2]] +; GCN-NATIVE-NEXT: [[__POWX22:%.*]] = fmul fast float [[__POWX2]], [[TMPVARVAR]] +; GCN-NATIVE-NEXT: [[TMP0:%.*]] = fmul fast float [[__POWX21]], [[__POWX21]] +; GCN-NATIVE-NEXT: [[__POWPROD3:%.*]] = fmul fast float [[TMP0]], [[__POWX22]] +; GCN-NATIVE-NEXT: store float [[__POWPROD3]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %call = call fast float @_Z3powff(float %tmp, float 1.100000e+01) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float 1.100000e+01) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_powr_c -; GCN: %__powx2 = fmul fast float %tmp, %tmp +; GCN: %__powx2 = fmul fast float %tmpvarvar, %tmpvarvar ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2 -; GCN: %__powx22 = fmul fast float %__powx2, %tmp -; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21 -; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22 +; GCN: %__powx22 = fmul fast float %__powx2, %tmpvarvar define amdgpu_kernel void @test_powr_c(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_powr_c +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: 
[[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: [[__POWX2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: [[__POWX21:%.*]] = fmul fast float [[__POWX2]], [[__POWX2]] +; GCN-POSTLINK-NEXT: [[__POWX22:%.*]] = fmul fast float [[__POWX2]], [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: [[TMP0:%.*]] = fmul fast float [[__POWX21]], [[__POWX21]] +; GCN-POSTLINK-NEXT: [[__POWPROD3:%.*]] = fmul fast float [[TMP0]], [[__POWX22]] +; GCN-POSTLINK-NEXT: store float [[__POWPROD3]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_powr_c +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-PRELINK-NEXT: [[__POWX2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-PRELINK-NEXT: [[__POWX21:%.*]] = fmul fast float [[__POWX2]], [[__POWX2]] +; GCN-PRELINK-NEXT: [[__POWX22:%.*]] = fmul fast float [[__POWX2]], [[TMPVARVAR]] +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = fmul fast float [[__POWX21]], [[__POWX21]] +; GCN-PRELINK-NEXT: [[__POWPROD3:%.*]] = fmul fast float [[TMP0]], [[__POWX22]] +; GCN-PRELINK-NEXT: store float [[__POWPROD3]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_powr_c +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: [[__POWX2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-NATIVE-NEXT: [[__POWX21:%.*]] = fmul fast 
float [[__POWX2]], [[__POWX2]] +; GCN-NATIVE-NEXT: [[__POWX22:%.*]] = fmul fast float [[__POWX2]], [[TMPVARVAR]] +; GCN-NATIVE-NEXT: [[TMP0:%.*]] = fmul fast float [[__POWX21]], [[__POWX21]] +; GCN-NATIVE-NEXT: [[__POWPROD3:%.*]] = fmul fast float [[TMP0]], [[__POWX22]] +; GCN-NATIVE-NEXT: store float [[__POWPROD3]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %call = call fast float @_Z4powrff(float %tmp, float 1.100000e+01) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %call = call fast float @_Z4powrff(float %tmpvarvar, float 1.100000e+01) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z4powrff(float, float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown_c -; GCN: %__powx2 = fmul fast float %tmp, %tmp +; GCN: %__powx2 = fmul fast float %tmpvarvar, %tmpvarvar ; GCN: %__powx21 = fmul fast float %__powx2, %__powx2 -; GCN: %__powx22 = fmul fast float %__powx2, %tmp -; GCN: %[[r0:.*]] = fmul fast float %__powx21, %__powx21 -; GCN: %__powprod3 = fmul fast float %[[r0]], %__powx22 +; GCN: %__powx22 = fmul fast float %__powx2, %tmpvarvar define amdgpu_kernel void @test_pown_c(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pown_c +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: [[__POWX2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: [[__POWX21:%.*]] = fmul fast float [[__POWX2]], [[__POWX2]] +; GCN-POSTLINK-NEXT: [[__POWX22:%.*]] = fmul fast float [[__POWX2]], [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: [[TMP0:%.*]] = 
fmul fast float [[__POWX21]], [[__POWX21]] +; GCN-POSTLINK-NEXT: [[__POWPROD3:%.*]] = fmul fast float [[TMP0]], [[__POWX22]] +; GCN-POSTLINK-NEXT: store float [[__POWPROD3]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pown_c +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-PRELINK-NEXT: [[__POWX2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-PRELINK-NEXT: [[__POWX21:%.*]] = fmul fast float [[__POWX2]], [[__POWX2]] +; GCN-PRELINK-NEXT: [[__POWX22:%.*]] = fmul fast float [[__POWX2]], [[TMPVARVAR]] +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = fmul fast float [[__POWX21]], [[__POWX21]] +; GCN-PRELINK-NEXT: [[__POWPROD3:%.*]] = fmul fast float [[TMP0]], [[__POWX22]] +; GCN-PRELINK-NEXT: store float [[__POWPROD3]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pown_c +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: [[__POWX2:%.*]] = fmul fast float [[TMPVARVAR]], [[TMPVARVAR]] +; GCN-NATIVE-NEXT: [[__POWX21:%.*]] = fmul fast float [[__POWX2]], [[__POWX2]] +; GCN-NATIVE-NEXT: [[__POWX22:%.*]] = fmul fast float [[__POWX2]], [[TMPVARVAR]] +; GCN-NATIVE-NEXT: [[TMP0:%.*]] = fmul fast float [[__POWX21]], [[__POWX21]] +; GCN-NATIVE-NEXT: [[__POWPROD3:%.*]] = fmul fast float [[TMP0]], [[__POWX22]] +; GCN-NATIVE-NEXT: store float [[__POWPROD3]], ptr addrspace(1) [[A]], align 4 +; 
GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %call = call fast float @_Z4pownfi(float %tmp, i32 11) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %call = call fast float @_Z4pownfi(float %tmpvarvar, i32 11) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z4pownfi(float, i32) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pow -; GCN-POSTLINK: call fast float @_Z3powff(float %tmp, float 1.013000e+03) -; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp) -; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs) -; GCN-PRELINK: %__ylogx = fmul fast float %__log2, 1.013000e+03 -; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx) -; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32 -; GCN-PRELINK: %__pow_sign = and i32 %[[r0]], -2147483648 -; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32 -; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]] -; GCN-PRELINK: store i32 %[[r2]], ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_pow(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pow +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z3powff(float [[TMPVARVAR]], float 1.013000e+03) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pow +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[__FABS:%.*]] = tail call fast float 
@_Z4fabsf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: [[__LOG2:%.*]] = tail call fast float @_Z4log2f(float [[__FABS]]) +; GCN-PRELINK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], 1.013000e+03 +; GCN-PRELINK-NEXT: [[__EXP2:%.*]] = tail call fast float @_Z4exp2f(float [[__YLOGX]]) +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = bitcast float [[TMPVARVAR]] to i32 +; GCN-PRELINK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[TMP0]], -2147483648 +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32 +; GCN-PRELINK-NEXT: [[TMP2:%.*]] = or i32 [[__POW_SIGN]], [[TMP1]] +; GCN-PRELINK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pow +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[__FABS:%.*]] = tail call fast float @_Z4fabsf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: [[__LOG2:%.*]] = tail call fast float @_Z4log2f(float [[__FABS]]) +; GCN-NATIVE-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], 1.013000e+03 +; GCN-NATIVE-NEXT: [[__EXP2:%.*]] = tail call fast float @_Z4exp2f(float [[__YLOGX]]) +; GCN-NATIVE-NEXT: [[TMP0:%.*]] = bitcast float [[TMPVARVAR]] to i32 +; GCN-NATIVE-NEXT: [[__POW_SIGN:%.*]] = and i32 [[TMP0]], -2147483648 +; GCN-NATIVE-NEXT: [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32 +; GCN-NATIVE-NEXT: [[TMP2:%.*]] = or i32 [[__POW_SIGN]], [[TMP1]] +; GCN-NATIVE-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3powff(float %tmp, float 1.013000e+03) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3powff(float %tmpvarvar, float 1.013000e+03) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void 
@test_powr -; GCN-POSTLINK: call fast float @_Z4powrff(float %tmp, float %tmp1) -; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %tmp) -; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %tmp1 -; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx) -; GCN-PRELINK: store float %__exp2, ptr addrspace(1) %a, align 4 -; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp) -; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1 -; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx) -; GCN-NATIVE: store float %__exp2, ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_powr(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_powr +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX1]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4powrff(float [[TMPVARVAR]], float [[TMPVARVAR1]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_powr +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX1]], align 4 +; GCN-PRELINK-NEXT: [[__LOG2:%.*]] = tail call fast float @_Z4log2f(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[TMPVARVAR1]] +; 
GCN-PRELINK-NEXT: [[__EXP2:%.*]] = tail call fast float @_Z4exp2f(float [[__YLOGX]]) +; GCN-PRELINK-NEXT: store float [[__EXP2]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_powr +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX1]], align 4 +; GCN-NATIVE-NEXT: [[__LOG2:%.*]] = tail call fast float @_Z11native_log2f(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[TMPVARVAR1]] +; GCN-NATIVE-NEXT: [[__EXP2:%.*]] = tail call fast float @_Z11native_exp2f(float [[__YLOGX]]) +; GCN-NATIVE-NEXT: store float [[__EXP2]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4 - %call = call fast float @_Z4powrff(float %tmp, float %tmp1) - store float %call, ptr addrspace(1) %a, align 4 - ret void -} - -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pown -; GCN-POSTLINK: call fast float @_Z4pownfi(float %tmp, i32 %conv) -; GCN-PRELINK: %conv = fptosi float %tmp1 to i32 -; GCN-PRELINK: %__fabs = tail call fast float @_Z4fabsf(float %tmp) -; GCN-PRELINK: %__log2 = tail call fast float @_Z4log2f(float %__fabs) -; GCN-PRELINK: %pownI2F = sitofp i32 %conv to float -; GCN-PRELINK: %__ylogx = fmul fast float %__log2, %pownI2F -; GCN-PRELINK: %__exp2 = tail call fast float @_Z4exp2f(float %__ylogx) -; GCN-PRELINK: %__yeven = shl i32 %conv, 31 -; GCN-PRELINK: %[[r0:.*]] = bitcast float %tmp to i32 
-; GCN-PRELINK: %__pow_sign = and i32 %__yeven, %[[r0]] -; GCN-PRELINK: %[[r1:.*]] = bitcast float %__exp2 to i32 -; GCN-PRELINK: %[[r2:.*]] = or i32 %__pow_sign, %[[r1]] -; GCN-PRELINK: store i32 %[[r2]], ptr addrspace(1) %a, align 4 + %tmpvarvar1 = load float, ptr addrspace(1) %arrayidx1, align 4 + %call = call fast float @_Z4powrff(float %tmpvarvar, float %tmpvarvar1) + store float %call, ptr addrspace(1) %a, align 4 + ret void +} + define amdgpu_kernel void @test_pown(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pown +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX1]], align 4 +; GCN-POSTLINK-NEXT: [[CONV:%.*]] = fptosi float [[TMPVARVAR1]] to i32 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[TMPVARVAR]], i32 [[CONV]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pown +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX1]], align 4 +; GCN-PRELINK-NEXT: [[CONV:%.*]] = fptosi float [[TMPVARVAR1]] to i32 +; GCN-PRELINK-NEXT: [[__FABS:%.*]] = tail call fast float @_Z4fabsf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: [[__LOG2:%.*]] = tail call fast float @_Z4log2f(float [[__FABS]]) +; GCN-PRELINK-NEXT: [[POWNI2F:%.*]] = sitofp i32 
[[CONV]] to float +; GCN-PRELINK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]] +; GCN-PRELINK-NEXT: [[__EXP2:%.*]] = tail call fast float @_Z4exp2f(float [[__YLOGX]]) +; GCN-PRELINK-NEXT: [[__YEVEN:%.*]] = shl i32 [[CONV]], 31 +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = bitcast float [[TMPVARVAR]] to i32 +; GCN-PRELINK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]] +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32 +; GCN-PRELINK-NEXT: [[TMP2:%.*]] = or i32 [[__POW_SIGN]], [[TMP1]] +; GCN-PRELINK-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pown +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX1]], align 4 +; GCN-NATIVE-NEXT: [[CONV:%.*]] = fptosi float [[TMPVARVAR1]] to i32 +; GCN-NATIVE-NEXT: [[__FABS:%.*]] = tail call fast float @_Z4fabsf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: [[__LOG2:%.*]] = tail call fast float @_Z4log2f(float [[__FABS]]) +; GCN-NATIVE-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[CONV]] to float +; GCN-NATIVE-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]] +; GCN-NATIVE-NEXT: [[__EXP2:%.*]] = tail call fast float @_Z4exp2f(float [[__YLOGX]]) +; GCN-NATIVE-NEXT: [[__YEVEN:%.*]] = shl i32 [[CONV]], 31 +; GCN-NATIVE-NEXT: [[TMP0:%.*]] = bitcast float [[TMPVARVAR]] to i32 +; GCN-NATIVE-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]] +; GCN-NATIVE-NEXT: [[TMP1:%.*]] = bitcast float [[__EXP2]] to i32 +; GCN-NATIVE-NEXT: [[TMP2:%.*]] = or i32 [[__POW_SIGN]], [[TMP1]] +; GCN-NATIVE-NEXT: store i32 [[TMP2]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - 
%tmp = load float, ptr addrspace(1) %a, align 4 + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4 - %conv = fptosi float %tmp1 to i32 - %call = call fast float @_Z4pownfi(float %tmp, i32 %conv) + %tmpvarvar1 = load float, ptr addrspace(1) %arrayidx1, align 4 + %conv = fptosi float %tmpvarvar1 to i32 + %call = call fast float @_Z4pownfi(float %tmpvarvar, i32 %conv) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_1 -; GCN: %tmp = load float, ptr addrspace(1) %arrayidx, align 4 -; GCN: store float %tmp, ptr addrspace(1) %a, align 4 +; GCN: %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 +; GCN: store float %tmpvarvar, ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_rootn_1(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_rootn_1 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: store float [[TMPVARVAR]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_rootn_1 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-PRELINK-NEXT: store float [[TMPVARVAR]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_rootn_1 +; 
GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: store float [[TMPVARVAR]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %call = call fast float @_Z5rootnfi(float %tmp, i32 1) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %call = call fast float @_Z5rootnfi(float %tmpvarvar, i32 1) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z5rootnfi(float, i32) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_2 -; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 2) -; GCN-PRELINK: %__rootn2sqrt = tail call fast float @_Z4sqrtf(float %tmp) define amdgpu_kernel void @test_rootn_2(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_rootn_2 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rootnfi(float [[TMPVARVAR]], i32 2) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_rootn_2 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[__ROOTN2SQRT:%.*]] = tail call fast float @_Z4sqrtf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[__ROOTN2SQRT]], ptr addrspace(1) [[A]], 
align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_rootn_2 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[__ROOTN2SQRT:%.*]] = tail call fast float @_Z4sqrtf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[__ROOTN2SQRT]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z5rootnfi(float %tmp, i32 2) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z5rootnfi(float %tmpvarvar, i32 2) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_3 -; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 3) -; GCN-PRELINK: %__rootn2cbrt = tail call fast float @_Z4cbrtf(float %tmp) define amdgpu_kernel void @test_rootn_3(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_rootn_3 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rootnfi(float [[TMPVARVAR]], i32 3) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_rootn_3 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[__ROOTN2CBRT:%.*]] = tail call fast float @_Z4cbrtf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[__ROOTN2CBRT]], ptr addrspace(1) [[A]], align 4 +; 
GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_rootn_3 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[__ROOTN2CBRT:%.*]] = tail call fast float @_Z4cbrtf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[__ROOTN2CBRT]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z5rootnfi(float %tmp, i32 3) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z5rootnfi(float %tmpvarvar, i32 3) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m1 -; GCN: fdiv fast float 1.000000e+00, %tmp +; GCN: fdiv fast float 1.000000e+00, %tmpvarvar define amdgpu_kernel void @test_rootn_m1(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_rootn_m1 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[__ROOTN2DIV:%.*]] = fdiv fast float 1.000000e+00, [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: store float [[__ROOTN2DIV]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_rootn_m1 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[__ROOTN2DIV:%.*]] = fdiv fast float 1.000000e+00, [[TMPVARVAR]] +; GCN-PRELINK-NEXT: store float [[__ROOTN2DIV]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define 
amdgpu_kernel void @test_rootn_m1 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[__ROOTN2DIV:%.*]] = fdiv fast float 1.000000e+00, [[TMPVARVAR]] +; GCN-NATIVE-NEXT: store float [[__ROOTN2DIV]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z5rootnfi(float %tmp, i32 -1) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z5rootnfi(float %tmpvarvar, i32 -1) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_rootn_m2 -; GCN-POSTLINK: call fast float @_Z5rootnfi(float %tmp, i32 -2) -; GCN-PRELINK: %__rootn2rsqrt = tail call fast float @_Z5rsqrtf(float %tmp) define amdgpu_kernel void @test_rootn_m2(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_rootn_m2 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rootnfi(float [[TMPVARVAR]], i32 -2) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_rootn_m2 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call fast float @_Z5rsqrtf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[__ROOTN2RSQRT]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void 
@test_rootn_m2 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[__ROOTN2RSQRT:%.*]] = tail call fast float @_Z5rsqrtf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[__ROOTN2RSQRT]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z5rootnfi(float %tmp, i32 -2) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z5rootnfi(float %tmpvarvar, i32 -2) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_0x ; GCN: store float %y, ptr addrspace(1) %a define amdgpu_kernel void @test_fma_0x(ptr addrspace(1) nocapture %a, float %y) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_fma_0x +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_fma_0x +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_fma_0x +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3fmafff(float 0.000000e+00, float %tmp, float %y) + %tmpvarvar = load float, 
ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3fmafff(float 0.000000e+00, float %tmpvarvar, float %y) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z3fmafff(float, float, float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x0 ; GCN: store float %y, ptr addrspace(1) %a define amdgpu_kernel void @test_fma_x0(ptr addrspace(1) nocapture %a, float %y) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_fma_x0 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_fma_x0 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_fma_x0 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3fmafff(float %tmp, float 0.000000e+00, float %y) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3fmafff(float %tmpvarvar, float 0.000000e+00, float %y) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_0x ; GCN: store float %y, ptr addrspace(1) %a define amdgpu_kernel void @test_mad_0x(ptr addrspace(1) nocapture %a, float %y) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_mad_0x +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float 
[[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_mad_0x +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_mad_0x +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3madfff(float 0.000000e+00, float %tmp, float %y) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3madfff(float 0.000000e+00, float %tmpvarvar, float %y) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z3madfff(float, float, float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_mad_x0 ; GCN: store float %y, ptr addrspace(1) %a define amdgpu_kernel void @test_mad_x0(ptr addrspace(1) nocapture %a, float %y) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_mad_x0 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_mad_x0 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; 
GCN-NATIVE-LABEL: define amdgpu_kernel void @test_mad_x0 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture writeonly [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: store float [[Y]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3madfff(float %tmp, float 0.000000e+00, float %y) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3madfff(float %tmpvarvar, float 0.000000e+00, float %y) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_x1y -; GCN: %fmaadd = fadd fast float %tmp, %y +; GCN: %fmaadd = fadd fast float %tmpvarvar, %y define amdgpu_kernel void @test_fma_x1y(ptr addrspace(1) nocapture %a, float %y) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_fma_x1y +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[FMAADD:%.*]] = fadd fast float [[TMPVARVAR]], [[Y]] +; GCN-POSTLINK-NEXT: store float [[FMAADD]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_fma_x1y +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[FMAADD:%.*]] = fadd fast float [[TMPVARVAR]], [[Y]] +; GCN-PRELINK-NEXT: store float [[FMAADD]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_fma_x1y +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] { +; 
GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[FMAADD:%.*]] = fadd fast float [[TMPVARVAR]], [[Y]] +; GCN-NATIVE-NEXT: store float [[FMAADD]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3fmafff(float %tmp, float 1.000000e+00, float %y) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3fmafff(float %tmpvarvar, float 1.000000e+00, float %y) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_1xy -; GCN: %fmaadd = fadd fast float %tmp, %y +; GCN: %fmaadd = fadd fast float %tmpvarvar, %y define amdgpu_kernel void @test_fma_1xy(ptr addrspace(1) nocapture %a, float %y) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_fma_1xy +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[FMAADD:%.*]] = fadd fast float [[TMPVARVAR]], [[Y]] +; GCN-POSTLINK-NEXT: store float [[FMAADD]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_fma_1xy +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[FMAADD:%.*]] = fadd fast float [[TMPVARVAR]], [[Y]] +; GCN-PRELINK-NEXT: store float [[FMAADD]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_fma_1xy +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]], float [[Y:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: 
entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[FMAADD:%.*]] = fadd fast float [[TMPVARVAR]], [[Y]] +; GCN-NATIVE-NEXT: store float [[FMAADD]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3fmafff(float 1.000000e+00, float %tmp, float %y) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3fmafff(float 1.000000e+00, float %tmpvarvar, float %y) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_fma_xy0 -; GCN: %fmamul = fmul fast float %tmp1, %tmp +; GCN: %fmamul = fmul fast float %tmpvarvar1, %tmpvarvar define amdgpu_kernel void @test_fma_xy0(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_fma_xy0 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-POSTLINK-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[FMAMUL:%.*]] = fmul fast float [[TMPVARVAR1]], [[TMPVARVAR]] +; GCN-POSTLINK-NEXT: store float [[FMAMUL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_fma_xy0 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR1]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-PRELINK-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: 
[[FMAMUL:%.*]] = fmul fast float [[TMPVARVAR1]], [[TMPVARVAR]] +; GCN-PRELINK-NEXT: store float [[FMAMUL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_fma_xy0 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX]], align 4 +; GCN-NATIVE-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[FMAMUL:%.*]] = fmul fast float [[TMPVARVAR1]], [[TMPVARVAR]] +; GCN-NATIVE-NEXT: store float [[FMAMUL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: %arrayidx = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp = load float, ptr addrspace(1) %arrayidx, align 4 - %tmp1 = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3fmafff(float %tmp, float %tmp1, float 0.000000e+00) + %tmpvarvar = load float, ptr addrspace(1) %arrayidx, align 4 + %tmpvarvar1 = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3fmafff(float %tmpvarvar, float %tmpvarvar1, float 0.000000e+00) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp -; GCN-NATIVE: call fast float @_Z10native_expf(float %tmp) define amdgpu_kernel void @test_use_native_exp(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_exp +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z3expf(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; 
GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_use_native_exp +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z3expf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_exp +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast float @_Z10native_expf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3expf(float %tmp) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3expf(float %tmpvarvar) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z3expf(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp2 -; GCN-NATIVE: call fast float @_Z11native_exp2f(float %tmp) define amdgpu_kernel void @test_use_native_exp2(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_exp2 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4exp2f(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void 
@test_use_native_exp2 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4exp2f(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_exp2 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast float @_Z11native_exp2f(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z4exp2f(float %tmp) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z4exp2f(float %tmpvarvar) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z4exp2f(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_exp10 -; GCN-NATIVE: call fast float @_Z12native_exp10f(float %tmp) define amdgpu_kernel void @test_use_native_exp10(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_exp10 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5exp10f(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_use_native_exp10 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture 
[[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5exp10f(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_exp10 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast float @_Z12native_exp10f(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z5exp10f(float %tmp) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z5exp10f(float %tmpvarvar) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z5exp10f(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log -; GCN-NATIVE: call fast float @_Z10native_logf(float %tmp) define amdgpu_kernel void @test_use_native_log(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_log +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z3logf(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_use_native_log +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: 
[[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z3logf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_log +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast float @_Z10native_logf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3logf(float %tmp) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3logf(float %tmpvarvar) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z3logf(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log2 -; GCN-NATIVE: call fast float @_Z11native_log2f(float %tmp) define amdgpu_kernel void @test_use_native_log2(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_log2 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4log2f(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_use_native_log2 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: 
[[CALL:%.*]] = tail call fast float @_Z4log2f(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_log2 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast float @_Z11native_log2f(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z4log2f(float %tmp) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z4log2f(float %tmpvarvar) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z4log2f(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_log10 -; GCN-NATIVE: call fast float @_Z12native_log10f(float %tmp) define amdgpu_kernel void @test_use_native_log10(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_log10 +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5log10f(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_use_native_log10 +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5log10f(float [[TMPVARVAR]]) +; 
GCN-PRELINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_log10 +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast float @_Z12native_log10f(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z5log10f(float %tmp) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z5log10f(float %tmpvarvar) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z5log10f(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_powr -; GCN-NATIVE: %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4 -; GCN-NATIVE: %__log2 = tail call fast float @_Z11native_log2f(float %tmp) -; GCN-NATIVE: %__ylogx = fmul fast float %__log2, %tmp1 -; GCN-NATIVE: %__exp2 = tail call fast float @_Z11native_exp2f(float %__ylogx) -; GCN-NATIVE: store float %__exp2, ptr addrspace(1) %a, align 4 define amdgpu_kernel void @test_use_native_powr(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_powr +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX1]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4powrff(float [[TMPVARVAR]], float [[TMPVARVAR1]]) +; GCN-POSTLINK-NEXT: store float 
[[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_use_native_powr +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX1]], align 4 +; GCN-PRELINK-NEXT: [[__LOG2:%.*]] = tail call fast float @_Z4log2f(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[TMPVARVAR1]] +; GCN-PRELINK-NEXT: [[__EXP2:%.*]] = tail call fast float @_Z4exp2f(float [[__YLOGX]]) +; GCN-PRELINK-NEXT: store float [[__EXP2]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_powr +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[TMPVARVAR1:%.*]] = load float, ptr addrspace(1) [[ARRAYIDX1]], align 4 +; GCN-NATIVE-NEXT: [[__LOG2:%.*]] = tail call fast float @_Z11native_log2f(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[TMPVARVAR1]] +; GCN-NATIVE-NEXT: [[__EXP2:%.*]] = tail call fast float @_Z11native_exp2f(float [[__YLOGX]]) +; GCN-NATIVE-NEXT: store float [[__EXP2]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp1 = load float, ptr addrspace(1) %arrayidx1, align 4 - %call 
= call fast float @_Z4powrff(float %tmp, float %tmp1) + %tmpvarvar1 = load float, ptr addrspace(1) %arrayidx1, align 4 + %call = call fast float @_Z4powrff(float %tmpvarvar, float %tmpvarvar1) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sqrt -; GCN-NATIVE: call fast float @_Z11native_sqrtf(float %tmp) define amdgpu_kernel void @test_use_native_sqrt(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_sqrt +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4sqrtf(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_use_native_sqrt +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[__SQRT:%.*]] = tail call fast float @_Z11native_sqrtf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[__SQRT]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_sqrt +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast float @_Z11native_sqrtf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z4sqrtf(float %tmp) + 
%tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z4sqrtf(float %tmpvarvar) store float %call, ptr addrspace(1) %a, align 4 ret void } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64 -; GCN: call fast double @_Z4sqrtd(double %tmp) +; GCN: call fast double @_Z4sqrtd(double %tmpvarvar) define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64(ptr addrspace(1) nocapture %a) { +; GCN-LABEL: define amdgpu_kernel void @test_dont_use_native_sqrt_fast_f64 +; GCN-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-NEXT: entry: +; GCN-NEXT: [[TMPVARVAR:%.*]] = load double, ptr addrspace(1) [[A]], align 8 +; GCN-NEXT: [[CALL:%.*]] = tail call fast double @_Z4sqrtd(double [[TMPVARVAR]]) +; GCN-NEXT: store double [[CALL]], ptr addrspace(1) [[A]], align 8 +; GCN-NEXT: ret void +; entry: - %tmp = load double, ptr addrspace(1) %a, align 8 - %call = call fast double @_Z4sqrtd(double %tmp) + %tmpvarvar = load double, ptr addrspace(1) %a, align 8 + %call = call fast double @_Z4sqrtd(double %tmpvarvar) store double %call, ptr addrspace(1) %a, align 8 ret void } @@ -653,39 +1774,111 @@ declare float @_Z4sqrtf(float) declare double @_Z4sqrtd(double) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_rsqrt -; GCN-NATIVE: call fast float @_Z12native_rsqrtf(float %tmp) define amdgpu_kernel void @test_use_native_rsqrt(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_rsqrt +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rsqrtf(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_use_native_rsqrt +; 
GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z5rsqrtf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_rsqrt +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast float @_Z12native_rsqrtf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z5rsqrtf(float %tmp) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z5rsqrtf(float %tmpvarvar) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z5rsqrtf(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_tan -; GCN-NATIVE: call fast float @_Z10native_tanf(float %tmp) define amdgpu_kernel void @test_use_native_tan(ptr addrspace(1) nocapture %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_tan +; GCN-POSTLINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z3tanf(float [[TMPVARVAR]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_use_native_tan +; GCN-PRELINK-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr { +; 
GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z3tanf(float [[TMPVARVAR]]) +; GCN-PRELINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_tan +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[CALL:%.*]] = tail call fast float @_Z10native_tanf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 - %call = call fast float @_Z3tanf(float %tmp) + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 + %call = call fast float @_Z3tanf(float %tmpvarvar) store float %call, ptr addrspace(1) %a, align 4 ret void } declare float @_Z3tanf(float) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_use_native_sincos -; GCN-NATIVE: call float @_Z10native_sinf(float %tmp) -; GCN-NATIVE: call float @_Z10native_cosf(float %tmp) define amdgpu_kernel void @test_use_native_sincos(ptr addrspace(1) %a) { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_use_native_sincos +; GCN-POSTLINK-SAME: (ptr addrspace(1) [[A:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-POSTLINK-NEXT: [[TMPVARVAR1:%.*]] = addrspacecast ptr addrspace(1) [[ARRAYIDX1]] to ptr +; GCN-POSTLINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z6sincosfPf(float [[TMPVARVAR]], ptr [[TMPVARVAR1]]) +; GCN-POSTLINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-POSTLINK-NEXT: 
ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_use_native_sincos +; GCN-PRELINK-SAME: (ptr addrspace(1) [[A:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-PRELINK-NEXT: [[TMPVARVAR1:%.*]] = addrspacecast ptr addrspace(1) [[ARRAYIDX1]] to ptr +; GCN-PRELINK-NEXT: [[CALL:%.*]] = tail call fast float @_Z6sincosfPf(float [[TMPVARVAR]], ptr [[TMPVARVAR1]]) +; GCN-PRELINK-NEXT: store float [[CALL]], ptr addrspace(1) [[A]], align 4 +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_use_native_sincos +; GCN-NATIVE-SAME: (ptr addrspace(1) nocapture [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = load float, ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[A]], i64 1 +; GCN-NATIVE-NEXT: [[SPLITSIN:%.*]] = tail call float @_Z10native_sinf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: [[SPLITCOS:%.*]] = tail call float @_Z10native_cosf(float [[TMPVARVAR]]) +; GCN-NATIVE-NEXT: store float [[SPLITCOS]], ptr addrspace(1) [[ARRAYIDX1]], align 4 +; GCN-NATIVE-NEXT: store float [[SPLITSIN]], ptr addrspace(1) [[A]], align 4 +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = load float, ptr addrspace(1) %a, align 4 + %tmpvarvar = load float, ptr addrspace(1) %a, align 4 %arrayidx1 = getelementptr inbounds float, ptr addrspace(1) %a, i64 1 - %tmp1 = addrspacecast ptr addrspace(1) %arrayidx1 to ptr - %call = call fast float @_Z6sincosfPf(float %tmp, ptr %tmp1) + %tmpvarvar1 = addrspacecast ptr addrspace(1) %arrayidx1 to ptr + %call = call fast float @_Z6sincosfPf(float %tmpvarvar, ptr %tmpvarvar1) store float %call, ptr addrspace(1) %a, align 4 ret void } @@ -695,16 +1888,43 @@ %opencl.pipe_t = type opaque 
%opencl.reserve_id_t = type opaque -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_read_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr) -; GCN-PRELINK: call i32 @__read_pipe_2_4(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND:[0-9]+]] -; GCN-PRELINK: call i32 @__read_pipe_4_4(ptr addrspace(1) %{{.*}}, ptr addrspace(5) %{{.*}}, i32 2, ptr %{{.*}}) #[[$NOUNWIND]] define amdgpu_kernel void @test_read_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr) local_unnamed_addr { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_read_pipe +; GCN-POSTLINK-SAME: (ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[PTR:%.*]]) local_unnamed_addr { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR1:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; GCN-POSTLINK-NEXT: [[TMP0:%.*]] = tail call i32 @__read_pipe_2_4(ptr addrspace(1) [[P]], ptr [[TMPVARVAR1]]) #[[ATTR2:[0-9]+]] +; GCN-POSTLINK-NEXT: [[TMPVARVAR3:%.*]] = tail call ptr addrspace(5) @__reserve_read_pipe(ptr addrspace(1) [[P]], i32 2, i32 4, i32 4) +; GCN-POSTLINK-NEXT: [[TMP1:%.*]] = tail call i32 @__read_pipe_4_4(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 2, ptr [[TMPVARVAR1]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: tail call void @__commit_read_pipe(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 4, i32 4) +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_read_pipe +; GCN-PRELINK-SAME: (ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[PTR:%.*]]) local_unnamed_addr { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR1:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = tail call i32 @__read_pipe_2_4(ptr addrspace(1) [[P]], ptr [[TMPVARVAR1]]) #[[ATTR3:[0-9]+]] +; GCN-PRELINK-NEXT: [[TMPVARVAR3:%.*]] = tail call ptr addrspace(5) @__reserve_read_pipe(ptr addrspace(1) [[P]], i32 2, i32 4, i32 4) +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = tail call i32 @__read_pipe_4_4(ptr addrspace(1) [[P]], ptr 
addrspace(5) [[TMPVARVAR3]], i32 2, ptr [[TMPVARVAR1]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: tail call void @__commit_read_pipe(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 4, i32 4) +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_read_pipe +; GCN-NATIVE-SAME: (ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[PTR:%.*]]) local_unnamed_addr { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR1:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; GCN-NATIVE-NEXT: [[TMP0:%.*]] = tail call i32 @__read_pipe_2_4(ptr addrspace(1) [[P]], ptr [[TMPVARVAR1]]) #[[ATTR3:[0-9]+]] +; GCN-NATIVE-NEXT: [[TMPVARVAR3:%.*]] = tail call ptr addrspace(5) @__reserve_read_pipe(ptr addrspace(1) [[P]], i32 2, i32 4, i32 4) +; GCN-NATIVE-NEXT: [[TMP1:%.*]] = tail call i32 @__read_pipe_4_4(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 2, ptr [[TMPVARVAR1]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: tail call void @__commit_read_pipe(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 4, i32 4) +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp1 = addrspacecast ptr addrspace(1) %ptr to ptr - %tmp2 = call i32 @__read_pipe_2(ptr addrspace(1) %p, ptr %tmp1, i32 4, i32 4) #0 - %tmp3 = call ptr addrspace(5) @__reserve_read_pipe(ptr addrspace(1) %p, i32 2, i32 4, i32 4) - %tmp4 = call i32 @__read_pipe_4(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 2, ptr %tmp1, i32 4, i32 4) #0 - call void @__commit_read_pipe(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 4, i32 4) + %tmpvarvar1 = addrspacecast ptr addrspace(1) %ptr to ptr + %tmpvarvar2 = call i32 @__read_pipe_2(ptr addrspace(1) %p, ptr %tmpvarvar1, i32 4, i32 4) #0 + %tmpvarvar3 = call ptr addrspace(5) @__reserve_read_pipe(ptr addrspace(1) %p, i32 2, i32 4, i32 4) + %tmpvarvar4 = call i32 @__read_pipe_4(ptr addrspace(1) %p, ptr addrspace(5) %tmpvarvar3, i32 2, ptr %tmpvarvar1, i32 4, i32 4) #0 + call void @__commit_read_pipe(ptr addrspace(1) %p, ptr addrspace(5) %tmpvarvar3, 
i32 4, i32 4) ret void } @@ -716,16 +1936,43 @@ declare void @__commit_read_pipe(ptr addrspace(1), ptr addrspace(5), i32, i32) -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_write_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr) -; GCN-PRELINK: call i32 @__write_pipe_2_4(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]] -; GCN-PRELINK: call i32 @__write_pipe_4_4(ptr addrspace(1) %{{.*}}, ptr addrspace(5) %{{.*}}, i32 2, ptr %{{.*}}) #[[$NOUNWIND]] define amdgpu_kernel void @test_write_pipe(ptr addrspace(1) %p, ptr addrspace(1) %ptr) local_unnamed_addr { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_write_pipe +; GCN-POSTLINK-SAME: (ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[PTR:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR1:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; GCN-POSTLINK-NEXT: [[TMP0:%.*]] = tail call i32 @__write_pipe_2_4(ptr addrspace(1) [[P]], ptr [[TMPVARVAR1]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: [[TMPVARVAR3:%.*]] = tail call ptr addrspace(5) @__reserve_write_pipe(ptr addrspace(1) [[P]], i32 2, i32 4, i32 4) #[[ATTR2]] +; GCN-POSTLINK-NEXT: [[TMP1:%.*]] = tail call i32 @__write_pipe_4_4(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 2, ptr [[TMPVARVAR1]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: tail call void @__commit_write_pipe(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 4, i32 4) #[[ATTR2]] +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_write_pipe +; GCN-PRELINK-SAME: (ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[PTR:%.*]]) local_unnamed_addr #[[ATTR3]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR1:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = tail call i32 @__write_pipe_2_4(ptr addrspace(1) [[P]], ptr [[TMPVARVAR1]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: [[TMPVARVAR3:%.*]] = tail call ptr addrspace(5) @__reserve_write_pipe(ptr addrspace(1) 
[[P]], i32 2, i32 4, i32 4) #[[ATTR3]] +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = tail call i32 @__write_pipe_4_4(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 2, ptr [[TMPVARVAR1]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: tail call void @__commit_write_pipe(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 4, i32 4) #[[ATTR3]] +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_write_pipe +; GCN-NATIVE-SAME: (ptr addrspace(1) [[P:%.*]], ptr addrspace(1) [[PTR:%.*]]) local_unnamed_addr #[[ATTR3]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR1:%.*]] = addrspacecast ptr addrspace(1) [[PTR]] to ptr +; GCN-NATIVE-NEXT: [[TMP0:%.*]] = tail call i32 @__write_pipe_2_4(ptr addrspace(1) [[P]], ptr [[TMPVARVAR1]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: [[TMPVARVAR3:%.*]] = tail call ptr addrspace(5) @__reserve_write_pipe(ptr addrspace(1) [[P]], i32 2, i32 4, i32 4) #[[ATTR3]] +; GCN-NATIVE-NEXT: [[TMP1:%.*]] = tail call i32 @__write_pipe_4_4(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 2, ptr [[TMPVARVAR1]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: tail call void @__commit_write_pipe(ptr addrspace(1) [[P]], ptr addrspace(5) [[TMPVARVAR3]], i32 4, i32 4) #[[ATTR3]] +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp1 = addrspacecast ptr addrspace(1) %ptr to ptr - %tmp2 = call i32 @__write_pipe_2(ptr addrspace(1) %p, ptr %tmp1, i32 4, i32 4) #0 - %tmp3 = call ptr addrspace(5) @__reserve_write_pipe(ptr addrspace(1) %p, i32 2, i32 4, i32 4) #0 - %tmp4 = call i32 @__write_pipe_4(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 2, ptr %tmp1, i32 4, i32 4) #0 - call void @__commit_write_pipe(ptr addrspace(1) %p, ptr addrspace(5) %tmp3, i32 4, i32 4) #0 + %tmpvarvar1 = addrspacecast ptr addrspace(1) %ptr to ptr + %tmpvarvar2 = call i32 @__write_pipe_2(ptr addrspace(1) %p, ptr %tmpvarvar1, i32 4, i32 4) #0 + %tmpvarvar3 = call ptr addrspace(5) @__reserve_write_pipe(ptr addrspace(1) %p, i32 2, i32 4, i32 4) #0 + %tmpvarvar4 = call 
i32 @__write_pipe_4(ptr addrspace(1) %p, ptr addrspace(5) %tmpvarvar3, i32 2, ptr %tmpvarvar1, i32 4, i32 4) #0 + call void @__commit_write_pipe(ptr addrspace(1) %p, ptr addrspace(5) %tmpvarvar3, i32 4, i32 4) #0 ret void } @@ -739,43 +1986,97 @@ %struct.S = type { [100 x i32] } -; GCN-LABEL: {{^}}define amdgpu_kernel void @test_pipe_size -; GCN-PRELINK: call i32 @__read_pipe_2_1(ptr addrspace(1) %{{.*}} ptr %{{.*}}) #[[$NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_2(ptr addrspace(1) %{{.*}} ptr %{{.*}}) #[[$NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_4(ptr addrspace(1) %{{.*}} ptr %{{.*}}) #[[$NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_8(ptr addrspace(1) %{{.*}} ptr %{{.*}}) #[[$NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_16(ptr addrspace(1) %{{.*}}, ptr %{{.*}}) #[[$NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_32(ptr addrspace(1) %{{.*}}, ptr %{{.*}} #[[$NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_64(ptr addrspace(1) %{{.*}}, ptr %{{.*}} #[[$NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2_128(ptr addrspace(1) %{{.*}}, ptr %{{.*}} #[[$NOUNWIND]] -; GCN-PRELINK: call i32 @__read_pipe_2(ptr addrspace(1) %{{.*}}, ptr %{{.*}} i32 400, i32 4) #[[$NOUNWIND]] define amdgpu_kernel void @test_pipe_size(ptr addrspace(1) %p1, ptr addrspace(1) %ptr1, ptr addrspace(1) %p2, ptr addrspace(1) %ptr2, ptr addrspace(1) %p4, ptr addrspace(1) %ptr4, ptr addrspace(1) %p8, ptr addrspace(1) %ptr8, ptr addrspace(1) %p16, ptr addrspace(1) %ptr16, ptr addrspace(1) %p32, ptr addrspace(1) %ptr32, ptr addrspace(1) %p64, ptr addrspace(1) %ptr64, ptr addrspace(1) %p128, ptr addrspace(1) %ptr128, ptr addrspace(1) %pu, ptr addrspace(1) %ptru) local_unnamed_addr #0 { +; GCN-POSTLINK-LABEL: define amdgpu_kernel void @test_pipe_size +; GCN-POSTLINK-SAME: (ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(1) [[P2:%.*]], ptr addrspace(1) [[PTR2:%.*]], ptr addrspace(1) [[P4:%.*]], ptr addrspace(1) [[PTR4:%.*]], ptr addrspace(1) 
[[P8:%.*]], ptr addrspace(1) [[PTR8:%.*]], ptr addrspace(1) [[P16:%.*]], ptr addrspace(1) [[PTR16:%.*]], ptr addrspace(1) [[P32:%.*]], ptr addrspace(1) [[PTR32:%.*]], ptr addrspace(1) [[P64:%.*]], ptr addrspace(1) [[PTR64:%.*]], ptr addrspace(1) [[P128:%.*]], ptr addrspace(1) [[PTR128:%.*]], ptr addrspace(1) [[PU:%.*]], ptr addrspace(1) [[PTRU:%.*]]) local_unnamed_addr #[[ATTR2]] { +; GCN-POSTLINK-NEXT: entry: +; GCN-POSTLINK-NEXT: [[TMPVARVAR:%.*]] = addrspacecast ptr addrspace(1) [[PTR1]] to ptr +; GCN-POSTLINK-NEXT: [[TMP0:%.*]] = tail call i32 @__read_pipe_2_1(ptr addrspace(1) [[P1]], ptr [[TMPVARVAR]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: [[TMPVARVAR3:%.*]] = addrspacecast ptr addrspace(1) [[PTR2]] to ptr +; GCN-POSTLINK-NEXT: [[TMP1:%.*]] = tail call i32 @__read_pipe_2_2(ptr addrspace(1) [[P2]], ptr [[TMPVARVAR3]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: [[TMPVARVAR6:%.*]] = addrspacecast ptr addrspace(1) [[PTR4]] to ptr +; GCN-POSTLINK-NEXT: [[TMP2:%.*]] = tail call i32 @__read_pipe_2_4(ptr addrspace(1) [[P4]], ptr [[TMPVARVAR6]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: [[TMPVARVAR9:%.*]] = addrspacecast ptr addrspace(1) [[PTR8]] to ptr +; GCN-POSTLINK-NEXT: [[TMP3:%.*]] = tail call i32 @__read_pipe_2_8(ptr addrspace(1) [[P8]], ptr [[TMPVARVAR9]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: [[TMPVARVAR12:%.*]] = addrspacecast ptr addrspace(1) [[PTR16]] to ptr +; GCN-POSTLINK-NEXT: [[TMP4:%.*]] = tail call i32 @__read_pipe_2_16(ptr addrspace(1) [[P16]], ptr [[TMPVARVAR12]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: [[TMPVARVAR15:%.*]] = addrspacecast ptr addrspace(1) [[PTR32]] to ptr +; GCN-POSTLINK-NEXT: [[TMP5:%.*]] = tail call i32 @__read_pipe_2_32(ptr addrspace(1) [[P32]], ptr [[TMPVARVAR15]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: [[TMPVARVAR18:%.*]] = addrspacecast ptr addrspace(1) [[PTR64]] to ptr +; GCN-POSTLINK-NEXT: [[TMP6:%.*]] = tail call i32 @__read_pipe_2_64(ptr addrspace(1) [[P64]], ptr [[TMPVARVAR18]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: [[TMPVARVAR21:%.*]] = addrspacecast ptr addrspace(1) 
[[PTR128]] to ptr +; GCN-POSTLINK-NEXT: [[TMP7:%.*]] = tail call i32 @__read_pipe_2_128(ptr addrspace(1) [[P128]], ptr [[TMPVARVAR21]]) #[[ATTR2]] +; GCN-POSTLINK-NEXT: [[TMPVARVAR24:%.*]] = addrspacecast ptr addrspace(1) [[PTRU]] to ptr +; GCN-POSTLINK-NEXT: [[TMPVARVAR25:%.*]] = tail call i32 @__read_pipe_2(ptr addrspace(1) [[PU]], ptr [[TMPVARVAR24]], i32 400, i32 4) #[[ATTR2]] +; GCN-POSTLINK-NEXT: ret void +; +; GCN-PRELINK-LABEL: define amdgpu_kernel void @test_pipe_size +; GCN-PRELINK-SAME: (ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(1) [[P2:%.*]], ptr addrspace(1) [[PTR2:%.*]], ptr addrspace(1) [[P4:%.*]], ptr addrspace(1) [[PTR4:%.*]], ptr addrspace(1) [[P8:%.*]], ptr addrspace(1) [[PTR8:%.*]], ptr addrspace(1) [[P16:%.*]], ptr addrspace(1) [[PTR16:%.*]], ptr addrspace(1) [[P32:%.*]], ptr addrspace(1) [[PTR32:%.*]], ptr addrspace(1) [[P64:%.*]], ptr addrspace(1) [[PTR64:%.*]], ptr addrspace(1) [[P128:%.*]], ptr addrspace(1) [[PTR128:%.*]], ptr addrspace(1) [[PU:%.*]], ptr addrspace(1) [[PTRU:%.*]]) local_unnamed_addr #[[ATTR3]] { +; GCN-PRELINK-NEXT: entry: +; GCN-PRELINK-NEXT: [[TMPVARVAR:%.*]] = addrspacecast ptr addrspace(1) [[PTR1]] to ptr +; GCN-PRELINK-NEXT: [[TMP0:%.*]] = tail call i32 @__read_pipe_2_1(ptr addrspace(1) [[P1]], ptr [[TMPVARVAR]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: [[TMPVARVAR3:%.*]] = addrspacecast ptr addrspace(1) [[PTR2]] to ptr +; GCN-PRELINK-NEXT: [[TMP1:%.*]] = tail call i32 @__read_pipe_2_2(ptr addrspace(1) [[P2]], ptr [[TMPVARVAR3]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: [[TMPVARVAR6:%.*]] = addrspacecast ptr addrspace(1) [[PTR4]] to ptr +; GCN-PRELINK-NEXT: [[TMP2:%.*]] = tail call i32 @__read_pipe_2_4(ptr addrspace(1) [[P4]], ptr [[TMPVARVAR6]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: [[TMPVARVAR9:%.*]] = addrspacecast ptr addrspace(1) [[PTR8]] to ptr +; GCN-PRELINK-NEXT: [[TMP3:%.*]] = tail call i32 @__read_pipe_2_8(ptr addrspace(1) [[P8]], ptr [[TMPVARVAR9]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: 
[[TMPVARVAR12:%.*]] = addrspacecast ptr addrspace(1) [[PTR16]] to ptr +; GCN-PRELINK-NEXT: [[TMP4:%.*]] = tail call i32 @__read_pipe_2_16(ptr addrspace(1) [[P16]], ptr [[TMPVARVAR12]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: [[TMPVARVAR15:%.*]] = addrspacecast ptr addrspace(1) [[PTR32]] to ptr +; GCN-PRELINK-NEXT: [[TMP5:%.*]] = tail call i32 @__read_pipe_2_32(ptr addrspace(1) [[P32]], ptr [[TMPVARVAR15]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: [[TMPVARVAR18:%.*]] = addrspacecast ptr addrspace(1) [[PTR64]] to ptr +; GCN-PRELINK-NEXT: [[TMP6:%.*]] = tail call i32 @__read_pipe_2_64(ptr addrspace(1) [[P64]], ptr [[TMPVARVAR18]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: [[TMPVARVAR21:%.*]] = addrspacecast ptr addrspace(1) [[PTR128]] to ptr +; GCN-PRELINK-NEXT: [[TMP7:%.*]] = tail call i32 @__read_pipe_2_128(ptr addrspace(1) [[P128]], ptr [[TMPVARVAR21]]) #[[ATTR3]] +; GCN-PRELINK-NEXT: [[TMPVARVAR24:%.*]] = addrspacecast ptr addrspace(1) [[PTRU]] to ptr +; GCN-PRELINK-NEXT: [[TMPVARVAR25:%.*]] = tail call i32 @__read_pipe_2(ptr addrspace(1) [[PU]], ptr [[TMPVARVAR24]], i32 400, i32 4) #[[ATTR3]] +; GCN-PRELINK-NEXT: ret void +; +; GCN-NATIVE-LABEL: define amdgpu_kernel void @test_pipe_size +; GCN-NATIVE-SAME: (ptr addrspace(1) [[P1:%.*]], ptr addrspace(1) [[PTR1:%.*]], ptr addrspace(1) [[P2:%.*]], ptr addrspace(1) [[PTR2:%.*]], ptr addrspace(1) [[P4:%.*]], ptr addrspace(1) [[PTR4:%.*]], ptr addrspace(1) [[P8:%.*]], ptr addrspace(1) [[PTR8:%.*]], ptr addrspace(1) [[P16:%.*]], ptr addrspace(1) [[PTR16:%.*]], ptr addrspace(1) [[P32:%.*]], ptr addrspace(1) [[PTR32:%.*]], ptr addrspace(1) [[P64:%.*]], ptr addrspace(1) [[PTR64:%.*]], ptr addrspace(1) [[P128:%.*]], ptr addrspace(1) [[PTR128:%.*]], ptr addrspace(1) [[PU:%.*]], ptr addrspace(1) [[PTRU:%.*]]) local_unnamed_addr #[[ATTR3]] { +; GCN-NATIVE-NEXT: entry: +; GCN-NATIVE-NEXT: [[TMPVARVAR:%.*]] = addrspacecast ptr addrspace(1) [[PTR1]] to ptr +; GCN-NATIVE-NEXT: [[TMP0:%.*]] = tail call i32 @__read_pipe_2_1(ptr addrspace(1) [[P1]], ptr 
[[TMPVARVAR]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: [[TMPVARVAR3:%.*]] = addrspacecast ptr addrspace(1) [[PTR2]] to ptr +; GCN-NATIVE-NEXT: [[TMP1:%.*]] = tail call i32 @__read_pipe_2_2(ptr addrspace(1) [[P2]], ptr [[TMPVARVAR3]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: [[TMPVARVAR6:%.*]] = addrspacecast ptr addrspace(1) [[PTR4]] to ptr +; GCN-NATIVE-NEXT: [[TMP2:%.*]] = tail call i32 @__read_pipe_2_4(ptr addrspace(1) [[P4]], ptr [[TMPVARVAR6]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: [[TMPVARVAR9:%.*]] = addrspacecast ptr addrspace(1) [[PTR8]] to ptr +; GCN-NATIVE-NEXT: [[TMP3:%.*]] = tail call i32 @__read_pipe_2_8(ptr addrspace(1) [[P8]], ptr [[TMPVARVAR9]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: [[TMPVARVAR12:%.*]] = addrspacecast ptr addrspace(1) [[PTR16]] to ptr +; GCN-NATIVE-NEXT: [[TMP4:%.*]] = tail call i32 @__read_pipe_2_16(ptr addrspace(1) [[P16]], ptr [[TMPVARVAR12]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: [[TMPVARVAR15:%.*]] = addrspacecast ptr addrspace(1) [[PTR32]] to ptr +; GCN-NATIVE-NEXT: [[TMP5:%.*]] = tail call i32 @__read_pipe_2_32(ptr addrspace(1) [[P32]], ptr [[TMPVARVAR15]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: [[TMPVARVAR18:%.*]] = addrspacecast ptr addrspace(1) [[PTR64]] to ptr +; GCN-NATIVE-NEXT: [[TMP6:%.*]] = tail call i32 @__read_pipe_2_64(ptr addrspace(1) [[P64]], ptr [[TMPVARVAR18]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: [[TMPVARVAR21:%.*]] = addrspacecast ptr addrspace(1) [[PTR128]] to ptr +; GCN-NATIVE-NEXT: [[TMP7:%.*]] = tail call i32 @__read_pipe_2_128(ptr addrspace(1) [[P128]], ptr [[TMPVARVAR21]]) #[[ATTR3]] +; GCN-NATIVE-NEXT: [[TMPVARVAR24:%.*]] = addrspacecast ptr addrspace(1) [[PTRU]] to ptr +; GCN-NATIVE-NEXT: [[TMPVARVAR25:%.*]] = tail call i32 @__read_pipe_2(ptr addrspace(1) [[PU]], ptr [[TMPVARVAR24]], i32 400, i32 4) #[[ATTR3]] +; GCN-NATIVE-NEXT: ret void +; entry: - %tmp = addrspacecast ptr addrspace(1) %ptr1 to ptr - %tmp1 = call i32 @__read_pipe_2(ptr addrspace(1) %p1, ptr %tmp, i32 1, i32 1) #0 - %tmp3 = addrspacecast ptr addrspace(1) %ptr2 to ptr - %tmp4 = call i32 
@__read_pipe_2(ptr addrspace(1) %p2, ptr %tmp3, i32 2, i32 2) #0 - %tmp6 = addrspacecast ptr addrspace(1) %ptr4 to ptr - %tmp7 = call i32 @__read_pipe_2(ptr addrspace(1) %p4, ptr %tmp6, i32 4, i32 4) #0 - %tmp9 = addrspacecast ptr addrspace(1) %ptr8 to ptr - %tmp10 = call i32 @__read_pipe_2(ptr addrspace(1) %p8, ptr %tmp9, i32 8, i32 8) #0 - %tmp12 = addrspacecast ptr addrspace(1) %ptr16 to ptr - %tmp13 = call i32 @__read_pipe_2(ptr addrspace(1) %p16, ptr %tmp12, i32 16, i32 16) #0 - %tmp15 = addrspacecast ptr addrspace(1) %ptr32 to ptr - %tmp16 = call i32 @__read_pipe_2(ptr addrspace(1) %p32, ptr %tmp15, i32 32, i32 32) #0 - %tmp18 = addrspacecast ptr addrspace(1) %ptr64 to ptr - %tmp19 = call i32 @__read_pipe_2(ptr addrspace(1) %p64, ptr %tmp18, i32 64, i32 64) #0 - %tmp21 = addrspacecast ptr addrspace(1) %ptr128 to ptr - %tmp22 = call i32 @__read_pipe_2(ptr addrspace(1) %p128, ptr %tmp21, i32 128, i32 128) #0 - %tmp24 = addrspacecast ptr addrspace(1) %ptru to ptr - %tmp25 = call i32 @__read_pipe_2(ptr addrspace(1) %pu, ptr %tmp24, i32 400, i32 4) #0 - ret void -} - -; GCN-PRELINK: declare float @_Z4fabsf(float) local_unnamed_addr #[[$NOUNWIND_READONLY:[0-9]+]] -; GCN-PRELINK: declare float @_Z4cbrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY]] -; GCN-PRELINK: declare float @_Z11native_sqrtf(float) local_unnamed_addr #[[$NOUNWIND_READONLY]] - -; GCN-PRELINK: attributes #[[$NOUNWIND]] = { nounwind } -; GCN-PRELINK: attributes #[[$NOUNWIND_READONLY]] = { nofree nounwind memory(read) } + %tmpvarvar = addrspacecast ptr addrspace(1) %ptr1 to ptr + %tmpvarvar1 = call i32 @__read_pipe_2(ptr addrspace(1) %p1, ptr %tmpvarvar, i32 1, i32 1) #0 + %tmpvarvar3 = addrspacecast ptr addrspace(1) %ptr2 to ptr + %tmpvarvar4 = call i32 @__read_pipe_2(ptr addrspace(1) %p2, ptr %tmpvarvar3, i32 2, i32 2) #0 + %tmpvarvar6 = addrspacecast ptr addrspace(1) %ptr4 to ptr + %tmpvarvar7 = call i32 @__read_pipe_2(ptr addrspace(1) %p4, ptr %tmpvarvar6, i32 4, i32 4) #0 + %tmpvarvar9 = 
addrspacecast ptr addrspace(1) %ptr8 to ptr + %tmpvarvar10 = call i32 @__read_pipe_2(ptr addrspace(1) %p8, ptr %tmpvarvar9, i32 8, i32 8) #0 + %tmpvarvar12 = addrspacecast ptr addrspace(1) %ptr16 to ptr + %tmpvarvar13 = call i32 @__read_pipe_2(ptr addrspace(1) %p16, ptr %tmpvarvar12, i32 16, i32 16) #0 + %tmpvarvar15 = addrspacecast ptr addrspace(1) %ptr32 to ptr + %tmpvarvar16 = call i32 @__read_pipe_2(ptr addrspace(1) %p32, ptr %tmpvarvar15, i32 32, i32 32) #0 + %tmpvarvar18 = addrspacecast ptr addrspace(1) %ptr64 to ptr + %tmpvarvar19 = call i32 @__read_pipe_2(ptr addrspace(1) %p64, ptr %tmpvarvar18, i32 64, i32 64) #0 + %tmpvarvar21 = addrspacecast ptr addrspace(1) %ptr128 to ptr + %tmpvarvar22 = call i32 @__read_pipe_2(ptr addrspace(1) %p128, ptr %tmpvarvar21, i32 128, i32 128) #0 + %tmpvarvar24 = addrspacecast ptr addrspace(1) %ptru to ptr + %tmpvarvar25 = call i32 @__read_pipe_2(ptr addrspace(1) %pu, ptr %tmpvarvar24, i32 400, i32 4) #0 + ret void +} + + attributes #0 = { nounwind } diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -1,352 +1,871 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-ALLOCA %s ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GCN-PROMOTE %s -; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s target datalayout = "A5" -; OPT-LABEL: @vector_read_alloca_bitcast( -; OPT-NOT: alloca -; OPT: %0 = extractelement <4 x i32> , i32 %index -; OPT-NEXT: store i32 %0, ptr addrspace(1) 
%out, align 4 - -; GCN-LABEL: {{^}}vector_read_alloca_bitcast: -; GCN-ALLOCA-COUNT-4: buffer_store_dword -; GCN-ALLOCA: buffer_load_dword - -; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2 -; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1 -; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] -; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 -; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc -; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc -; GCN-PROMOTE: ScratchSize: 0 define amdgpu_kernel void @vector_read_alloca_bitcast(ptr addrspace(1) %out, i32 %index) { +; GCN-ALLOCA-LABEL: vector_read_alloca_bitcast: +; GCN-ALLOCA: ; %bb.0: ; %entry +; GCN-ALLOCA-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-ALLOCA-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN-ALLOCA-NEXT: s_mov_b32 s90, -1 +; GCN-ALLOCA-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-ALLOCA-NEXT: s_add_u32 s88, s88, s3 +; GCN-ALLOCA-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-ALLOCA-NEXT: s_addc_u32 s89, s89, 0 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 0 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:4 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 1 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:8 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 2 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:12 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 3 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:16 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 4 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-ALLOCA-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-ALLOCA-NEXT: 
v_mov_b32_e32 v1, s1 +; GCN-ALLOCA-NEXT: s_waitcnt vmcnt(0) +; GCN-ALLOCA-NEXT: flat_store_dword v[0:1], v2 +; GCN-ALLOCA-NEXT: s_endpgm +; +; GCN-PROMOTE-LABEL: vector_read_alloca_bitcast: +; GCN-PROMOTE: ; %bb.0: ; %entry +; GCN-PROMOTE-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-PROMOTE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-PROMOTE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-PROMOTE-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-PROMOTE-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v0, 2, v0, vcc +; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v2, 3, v0, vcc +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v0, s0 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v1, s1 +; GCN-PROMOTE-NEXT: flat_store_dword v[0:1], v2 +; GCN-PROMOTE-NEXT: s_endpgm entry: - %tmp = alloca [4 x i32], addrspace(5) - %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 - store i32 0, ptr addrspace(5) %tmp + %tmpvar = alloca [4 x i32], addrspace(5) + %y = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmpvar store i32 1, ptr addrspace(5) %y store i32 2, ptr addrspace(5) %z store i32 3, ptr addrspace(5) %w - %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index - %tmp2 = load i32, ptr addrspace(5) %tmp1 - store i32 %tmp2, ptr addrspace(1) %out + %tmpvar1 = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 %index + %tmpvar2 = load i32, ptr addrspace(5) %tmpvar1 + store i32 %tmpvar2, ptr addrspace(1) %out ret void } -; 
OPT-LABEL: @vector_write_alloca_bitcast( -; OPT-NOT: alloca -; OPT: %0 = insertelement <4 x i32> zeroinitializer, i32 1, i32 %w_index -; OPT-NEXT: %1 = extractelement <4 x i32> %0, i32 %r_index -; OPT-NEXT: store i32 %1, ptr addrspace(1) %out, align - -; GCN-LABEL: {{^}}vector_write_alloca_bitcast: -; GCN-ALLOCA-COUNT-5: buffer_store_dword -; GCN-ALLOCA: buffer_load_dword - -; GCN-PROMOTE-COUNT-7: v_cndmask - -; GCN-PROMOTE: ScratchSize: 0 - define amdgpu_kernel void @vector_write_alloca_bitcast(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) { +; GCN-ALLOCA-LABEL: vector_write_alloca_bitcast: +; GCN-ALLOCA: ; %bb.0: ; %entry +; GCN-ALLOCA-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-ALLOCA-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN-ALLOCA-NEXT: s_mov_b32 s90, -1 +; GCN-ALLOCA-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-ALLOCA-NEXT: s_add_u32 s88, s88, s3 +; GCN-ALLOCA-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-ALLOCA-NEXT: s_addc_u32 s89, s89, 0 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 0 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:4 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:8 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:12 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:16 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 4 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v1, vcc, s2, v0 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v2, 1 +; GCN-ALLOCA-NEXT: s_lshl_b32 s2, s3, 2 +; GCN-ALLOCA-NEXT: buffer_store_dword v2, v1, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-ALLOCA-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-ALLOCA-NEXT: s_waitcnt vmcnt(0) +; GCN-ALLOCA-NEXT: flat_store_dword v[0:1], v2 +; GCN-ALLOCA-NEXT: s_endpgm +; +; GCN-PROMOTE-LABEL: vector_write_alloca_bitcast: +; GCN-PROMOTE: ; %bb.0: ; 
%entry +; GCN-PROMOTE-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 +; GCN-PROMOTE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-PROMOTE-NEXT: s_cmp_eq_u32 s2, 3 +; GCN-PROMOTE-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_eq_u32 s2, 2 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-PROMOTE-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_eq_u32 s2, 1 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GCN-PROMOTE-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GCN-PROMOTE-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_eq_u32 s3, 1 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_eq_u32 s3, 2 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_eq_u32 s3, 3 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v0, s0 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v1, s1 +; GCN-PROMOTE-NEXT: flat_store_dword v[0:1], v2 +; GCN-PROMOTE-NEXT: s_endpgm entry: - %tmp = alloca [4 x i32], addrspace(5) - %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 - store i32 0, ptr addrspace(5) %tmp + %tmpvar = alloca [4 x i32], addrspace(5) + %y = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmpvar store i32 0, ptr addrspace(5) %y store i32 0, ptr addrspace(5) %z store i32 0, ptr addrspace(5) %w - %tmp1 = getelementptr [4 x i32], ptr 
addrspace(5) %tmp, i32 0, i32 %w_index - store i32 1, ptr addrspace(5) %tmp1 - %tmp2 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %r_index - %tmp3 = load i32, ptr addrspace(5) %tmp2 - store i32 %tmp3, ptr addrspace(1) %out + %tmpvar1 = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 %w_index + store i32 1, ptr addrspace(5) %tmpvar1 + %tmpvar2 = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 %r_index + %tmpvar3 = load i32, ptr addrspace(5) %tmpvar2 + store i32 %tmpvar3, ptr addrspace(1) %out ret void } -; OPT-LABEL: @vector_write_read_bitcast_to_float( -; OPT-NOT: alloca -; OPT: bb2: -; OPT: %tmp.sroa.0.0 = phi <6 x float> [ undef, %bb ], [ %0, %bb2 ] -; OPT: %0 = insertelement <6 x float> %tmp.sroa.0.0, float %tmp72, i32 %tmp10 -; OPT: .preheader: -; OPT: %bc = bitcast <6 x float> %0 to <6 x i32> -; OPT: %1 = extractelement <6 x i32> %bc, i32 %tmp20 - -; GCN-LABEL: {{^}}vector_write_read_bitcast_to_float: -; GCN-ALLOCA: buffer_store_dword - -; GCN-PROMOTE-COUNT-6: v_cmp_eq_u16 -; GCN-PROMOTE-COUNT-6: v_cndmask - -; GCN: s_cbranch - -; GCN-ALLOCA: buffer_load_dword - -; GCN-PROMOTE: v_cmp_eq_u16 -; GCN-PROMOTE: v_cndmask -; GCN-PROMOTE: v_cmp_eq_u16 -; GCN-PROMOTE: v_cndmask -; GCN-PROMOTE: v_cmp_eq_u16 -; GCN-PROMOTE: v_cndmask -; GCN-PROMOTE: v_cmp_eq_u16 -; GCN-PROMOTE: v_cndmask -; GCN-PROMOTE: v_cmp_eq_u16 -; GCN-PROMOTE: v_cndmask - -; GCN-PROMOTE: ScratchSize: 0 - define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) { +; GCN-ALLOCA-LABEL: vector_write_read_bitcast_to_float: +; GCN-ALLOCA: ; %bb.0: ; %bb +; GCN-ALLOCA-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-ALLOCA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-ALLOCA-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN-ALLOCA-NEXT: s_mov_b32 s90, -1 +; GCN-ALLOCA-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-ALLOCA-NEXT: s_add_u32 s88, s88, s3 +; GCN-ALLOCA-NEXT: s_addc_u32 s89, s89, 0 +; GCN-ALLOCA-NEXT: s_mov_b32 s4, 0 +; 
GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: s_mov_b64 s[2:3], s[0:1] +; GCN-ALLOCA-NEXT: .LBB2_1: ; %bb2 +; GCN-ALLOCA-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-ALLOCA-NEXT: s_and_b32 s6, s4, 0xffff +; GCN-ALLOCA-NEXT: s_mul_i32 s6, s6, 0xaaab +; GCN-ALLOCA-NEXT: s_load_dword s5, s[2:3], 0x0 +; GCN-ALLOCA-NEXT: s_lshr_b32 s6, s6, 18 +; GCN-ALLOCA-NEXT: v_mul_lo_u16_e64 v0, s6, 6 +; GCN-ALLOCA-NEXT: v_sub_u16_e32 v0, s4, v0 +; GCN-ALLOCA-NEXT: s_add_i32 s4, s4, 1 +; GCN-ALLOCA-NEXT: s_add_u32 s2, s2, 4 +; GCN-ALLOCA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-ALLOCA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GCN-ALLOCA-NEXT: s_cmpk_lg_i32 s4, 0x3e8 +; GCN-ALLOCA-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: s_cbranch_scc1 .LBB2_1 +; GCN-ALLOCA-NEXT: ; %bb.2: ; %.preheader.preheader +; GCN-ALLOCA-NEXT: s_mov_b32 s2, 0 +; GCN-ALLOCA-NEXT: .LBB2_3: ; %.preheader +; GCN-ALLOCA-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-ALLOCA-NEXT: s_and_b32 s3, s2, 0xffff +; GCN-ALLOCA-NEXT: s_mul_i32 s3, s3, 0xaaab +; GCN-ALLOCA-NEXT: s_lshr_b32 s3, s3, 18 +; GCN-ALLOCA-NEXT: v_mul_lo_u16_e64 v0, s3, 6 +; GCN-ALLOCA-NEXT: v_sub_u16_e32 v0, s2, v0 +; GCN-ALLOCA-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-ALLOCA-NEXT: v_sub_u32_e32 v0, vcc, 4, v0 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v0, vcc, 20, v0 +; GCN-ALLOCA-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-ALLOCA-NEXT: s_add_i32 s2, s2, 1 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-ALLOCA-NEXT: s_add_u32 s0, s0, 4 +; GCN-ALLOCA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-ALLOCA-NEXT: s_cmpk_eq_i32 s2, 0x3e8 +; GCN-ALLOCA-NEXT: s_waitcnt vmcnt(0) +; GCN-ALLOCA-NEXT: flat_store_dword v[0:1], v2 +; GCN-ALLOCA-NEXT: s_cbranch_scc0 .LBB2_3 +; GCN-ALLOCA-NEXT: ; %bb.4: ; %bb15 +; GCN-ALLOCA-NEXT: s_endpgm +; +; GCN-PROMOTE-LABEL: 
vector_write_read_bitcast_to_float: +; GCN-PROMOTE: ; %bb.0: ; %bb +; GCN-PROMOTE-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24 +; GCN-PROMOTE-NEXT: s_mov_b32 s14, 0 +; GCN-PROMOTE-NEXT: ; implicit-def: $vgpr0 +; GCN-PROMOTE-NEXT: ; implicit-def: $vgpr1 +; GCN-PROMOTE-NEXT: ; implicit-def: $vgpr2 +; GCN-PROMOTE-NEXT: ; implicit-def: $vgpr3 +; GCN-PROMOTE-NEXT: ; implicit-def: $vgpr4 +; GCN-PROMOTE-NEXT: ; implicit-def: $vgpr5 +; GCN-PROMOTE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-PROMOTE-NEXT: s_mov_b64 s[12:13], s[10:11] +; GCN-PROMOTE-NEXT: .LBB2_1: ; %bb2 +; GCN-PROMOTE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-PROMOTE-NEXT: s_and_b32 s1, s14, 0xffff +; GCN-PROMOTE-NEXT: s_load_dword s0, s[12:13], 0x0 +; GCN-PROMOTE-NEXT: s_mul_i32 s1, s1, 0xaaab +; GCN-PROMOTE-NEXT: s_lshr_b32 s1, s1, 18 +; GCN-PROMOTE-NEXT: v_mul_lo_u16_e64 v6, s1, 6 +; GCN-PROMOTE-NEXT: v_sub_u16_e32 v6, s14, v6 +; GCN-PROMOTE-NEXT: s_add_i32 s14, s14, 1 +; GCN-PROMOTE-NEXT: s_add_u32 s12, s12, 4 +; GCN-PROMOTE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v7, s0 +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e32 vcc, 4, v6 +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e64 s[0:1], 3, v6 +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e64 s[2:3], 2, v6 +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e64 s[4:5], 1, v6 +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e64 s[6:7], 0, v6 +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e64 s[8:9], 5, v6 +; GCN-PROMOTE-NEXT: s_addc_u32 s13, s13, 0 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[8:9] +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[0:1] +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v2, v2, v7, s[2:3] +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] +; GCN-PROMOTE-NEXT: s_cmpk_lg_i32 s14, 0x3e8 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[6:7] +; GCN-PROMOTE-NEXT: s_cbranch_scc1 .LBB2_1 +; GCN-PROMOTE-NEXT: ; %bb.2: ; %.preheader.preheader +; GCN-PROMOTE-NEXT: s_mov_b32 s0, 0 +; GCN-PROMOTE-NEXT: .LBB2_3: ; %.preheader +; 
GCN-PROMOTE-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-PROMOTE-NEXT: s_and_b32 s1, s0, 0xffff +; GCN-PROMOTE-NEXT: s_mul_i32 s1, s1, 0xaaab +; GCN-PROMOTE-NEXT: s_lshr_b32 s1, s1, 18 +; GCN-PROMOTE-NEXT: v_mul_lo_u16_e64 v8, s1, 6 +; GCN-PROMOTE-NEXT: v_subrev_u16_e32 v8, s0, v8 +; GCN-PROMOTE-NEXT: v_add_u16_e32 v8, 5, v8 +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e32 vcc, 1, v8 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v9, v0, v1, vcc +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e32 vcc, 2, v8 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v6, s10 +; GCN-PROMOTE-NEXT: s_add_i32 s0, s0, 1 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v9, v9, v2, vcc +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e32 vcc, 3, v8 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v7, s11 +; GCN-PROMOTE-NEXT: s_add_u32 s10, s10, 4 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v9, v9, v3, vcc +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e32 vcc, 4, v8 +; GCN-PROMOTE-NEXT: s_addc_u32 s11, s11, 0 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v9, v9, v4, vcc +; GCN-PROMOTE-NEXT: v_cmp_eq_u16_e32 vcc, 5, v8 +; GCN-PROMOTE-NEXT: s_cmpk_eq_i32 s0, 0x3e8 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v8, v9, v5, vcc +; GCN-PROMOTE-NEXT: flat_store_dword v[6:7], v8 +; GCN-PROMOTE-NEXT: s_cbranch_scc0 .LBB2_3 +; GCN-PROMOTE-NEXT: ; %bb.4: ; %bb15 +; GCN-PROMOTE-NEXT: s_endpgm bb: - %tmp = alloca [6 x float], align 4, addrspace(5) - call void @llvm.lifetime.start.p5(i64 24, ptr addrspace(5) %tmp) #2 + %tmpvar = alloca [6 x float], align 4, addrspace(5) + call void @llvm.lifetime.start.p5(i64 24, ptr addrspace(5) %tmpvar) #2 br label %bb2 bb2: ; preds = %bb2, %bb - %tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ] - %tmp4 = zext i32 %tmp3 to i64 - %tmp5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp4 - %tmp7 = load i32, ptr addrspace(1) %tmp5, align 4 - %tmp8 = trunc i32 %tmp3 to i16 - %tmp9 = urem i16 %tmp8, 6 - %tmp10 = zext i16 %tmp9 to i32 - %tmp11 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp10 - store i32 %tmp7, ptr addrspace(5) %tmp11, align 4 
- %tmp13 = add nuw nsw i32 %tmp3, 1 - %tmp14 = icmp eq i32 %tmp13, 1000 - br i1 %tmp14, label %.preheader, label %bb2 + %tmpvar3 = phi i32 [ 0, %bb ], [ %tmpvar13, %bb2 ] + %tmpvar4 = zext i32 %tmpvar3 to i64 + %tmpvar5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmpvar4 + %tmpvar7 = load i32, ptr addrspace(1) %tmpvar5, align 4 + %tmpvar8 = trunc i32 %tmpvar3 to i16 + %tmpvar9 = urem i16 %tmpvar8, 6 + %tmpvar10 = zext i16 %tmpvar9 to i32 + %tmpvar11 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar10 + store i32 %tmpvar7, ptr addrspace(5) %tmpvar11, align 4 + %tmpvar13 = add nuw nsw i32 %tmpvar3, 1 + %tmpvar14 = icmp eq i32 %tmpvar13, 1000 + br i1 %tmpvar14, label %.preheader, label %bb2 bb15: ; preds = %.preheader - call void @llvm.lifetime.end.p5(i64 24, ptr addrspace(5) %tmp) #2 + call void @llvm.lifetime.end.p5(i64 24, ptr addrspace(5) %tmpvar) #2 ret void .preheader: ; preds = %.preheader, %bb2 - %tmp16 = phi i32 [ %tmp27, %.preheader ], [ 0, %bb2 ] - %tmp17 = trunc i32 %tmp16 to i16 - %tmp18 = urem i16 %tmp17, 6 - %tmp19 = sub nuw nsw i16 5, %tmp18 - %tmp20 = zext i16 %tmp19 to i32 - %tmp21 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmp, i32 0, i32 %tmp20 - %tmp23 = load i32, ptr addrspace(5) %tmp21, align 4 - %tmp24 = zext i32 %tmp16 to i64 - %tmp25 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmp24 - store i32 %tmp23, ptr addrspace(1) %tmp25, align 4 - %tmp27 = add nuw nsw i32 %tmp16, 1 - %tmp28 = icmp eq i32 %tmp27, 1000 - br i1 %tmp28, label %bb15, label %.preheader + %tmpvar16 = phi i32 [ %tmpvar27, %.preheader ], [ 0, %bb2 ] + %tmpvar17 = trunc i32 %tmpvar16 to i16 + %tmpvar18 = urem i16 %tmpvar17, 6 + %tmpvar19 = sub nuw nsw i16 5, %tmpvar18 + %tmpvar20 = zext i16 %tmpvar19 to i32 + %tmpvar21 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar20 + %tmpvar23 = load i32, ptr addrspace(5) %tmpvar21, align 4 + %tmpvar24 = zext i32 %tmpvar16 to i64 + 
%tmpvar25 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmpvar24 + store i32 %tmpvar23, ptr addrspace(1) %tmpvar25, align 4 + %tmpvar27 = add nuw nsw i32 %tmpvar16, 1 + %tmpvar28 = icmp eq i32 %tmpvar27, 1000 + br i1 %tmpvar28, label %bb15, label %.preheader } -; OPT-LABEL: @vector_write_read_bitcast_to_double( -; OPT-NOT: alloca -; OPT: bb2: -; OPT: %tmp.sroa.0.0 = phi <6 x double> [ undef, %bb ], [ %0, %bb2 ] -; OPT: %0 = insertelement <6 x double> %tmp.sroa.0.0, double %tmp72, i32 %tmp10 -; OPT: .preheader: -; OPT: %bc = bitcast <6 x double> %0 to <6 x i64> -; OPT: %1 = extractelement <6 x i64> %bc, i32 %tmp20 - -; GCN-LABEL: {{^}}vector_write_read_bitcast_to_double: - -; GCN-ALLOCA-COUNT-2: buffer_store_dword -; GCN-PROMOTE-COUNT-2: v_movreld_b32_e32 - -; GCN: s_cbranch - -; GCN-ALLOCA-COUNT-2: buffer_load_dword -; GCN-PROMOTE-COUNT-2: v_movrels_b32_e32 - -; GCN-PROMOTE: ScratchSize: 0 - define amdgpu_kernel void @vector_write_read_bitcast_to_double(ptr addrspace(1) %arg) { +; GCN-ALLOCA-LABEL: vector_write_read_bitcast_to_double: +; GCN-ALLOCA: ; %bb.0: ; %bb +; GCN-ALLOCA-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-ALLOCA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-ALLOCA-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN-ALLOCA-NEXT: s_mov_b32 s90, -1 +; GCN-ALLOCA-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-ALLOCA-NEXT: s_add_u32 s88, s88, s3 +; GCN-ALLOCA-NEXT: s_addc_u32 s89, s89, 0 +; GCN-ALLOCA-NEXT: s_mov_b32 s4, 0 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: s_mov_b64 s[2:3], s[0:1] +; GCN-ALLOCA-NEXT: .LBB3_1: ; %bb2 +; GCN-ALLOCA-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-ALLOCA-NEXT: s_and_b32 s5, s4, 0xffff +; GCN-ALLOCA-NEXT: s_mul_i32 s5, s5, 0xaaab +; GCN-ALLOCA-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GCN-ALLOCA-NEXT: s_lshr_b32 s5, s5, 18 +; GCN-ALLOCA-NEXT: v_mul_lo_u16_e64 v0, s5, 6 +; GCN-ALLOCA-NEXT: v_sub_u16_e32 v0, s4, v0 +; GCN-ALLOCA-NEXT: s_add_i32 s4, s4, 1 +; GCN-ALLOCA-NEXT: s_add_u32 s2, s2, 
8 +; GCN-ALLOCA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-ALLOCA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GCN-ALLOCA-NEXT: s_cmpk_lg_i32 s4, 0x3e8 +; GCN-ALLOCA-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen offset:4 +; GCN-ALLOCA-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: s_cbranch_scc1 .LBB3_1 +; GCN-ALLOCA-NEXT: ; %bb.2: ; %.preheader.preheader +; GCN-ALLOCA-NEXT: s_mov_b32 s2, 0 +; GCN-ALLOCA-NEXT: .LBB3_3: ; %.preheader +; GCN-ALLOCA-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-ALLOCA-NEXT: s_and_b32 s3, s2, 0xffff +; GCN-ALLOCA-NEXT: s_mul_i32 s3, s3, 0xaaab +; GCN-ALLOCA-NEXT: s_lshr_b32 s3, s3, 18 +; GCN-ALLOCA-NEXT: v_mul_lo_u16_e64 v0, s3, 6 +; GCN-ALLOCA-NEXT: v_sub_u16_e32 v0, s2, v0 +; GCN-ALLOCA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-ALLOCA-NEXT: v_sub_u32_e32 v0, vcc, 8, v0 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v0, vcc, 44, v0 +; GCN-ALLOCA-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-ALLOCA-NEXT: s_add_i32 s2, s2, 1 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-ALLOCA-NEXT: s_add_u32 s0, s0, 8 +; GCN-ALLOCA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-ALLOCA-NEXT: s_cmpk_eq_i32 s2, 0x3e8 +; GCN-ALLOCA-NEXT: s_waitcnt vmcnt(0) +; GCN-ALLOCA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-ALLOCA-NEXT: s_cbranch_scc0 .LBB3_3 +; GCN-ALLOCA-NEXT: ; %bb.4: ; %bb15 +; GCN-ALLOCA-NEXT: s_endpgm +; +; GCN-PROMOTE-LABEL: vector_write_read_bitcast_to_double: +; GCN-PROMOTE: ; %bb.0: ; %bb +; GCN-PROMOTE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-PROMOTE-NEXT: s_mov_b32 s8, 0 +; GCN-PROMOTE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-PROMOTE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-PROMOTE-NEXT: s_mov_b64 s[2:3], s[0:1] 
+; GCN-PROMOTE-NEXT: .LBB3_1: ; %bb2 +; GCN-PROMOTE-NEXT: ; =>This Loop Header: Depth=1 +; GCN-PROMOTE-NEXT: ; Child Loop BB3_2 Depth 2 +; GCN-PROMOTE-NEXT: ; Child Loop BB3_4 Depth 2 +; GCN-PROMOTE-NEXT: s_and_b32 s4, s8, 0xffff +; GCN-PROMOTE-NEXT: s_mul_i32 s6, s4, 0xaaab +; GCN-PROMOTE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-PROMOTE-NEXT: s_lshr_b32 s6, s6, 18 +; GCN-PROMOTE-NEXT: v_mul_lo_u16_e64 v12, s6, 6 +; GCN-PROMOTE-NEXT: v_sub_u16_e32 v12, s8, v12 +; GCN-PROMOTE-NEXT: v_lshlrev_b32_e32 v16, 1, v12 +; GCN-PROMOTE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v17, s4 +; GCN-PROMOTE-NEXT: s_mov_b64 s[6:7], exec +; GCN-PROMOTE-NEXT: .LBB3_2: ; Parent Loop BB3_1 Depth=1 +; GCN-PROMOTE-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-PROMOTE-NEXT: v_readfirstlane_b32 s4, v16 +; GCN-PROMOTE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 +; GCN-PROMOTE-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-PROMOTE-NEXT: s_mov_b32 m0, s4 +; GCN-PROMOTE-NEXT: v_movreld_b32_e32 v0, v17 +; GCN-PROMOTE-NEXT: s_xor_b64 exec, exec, vcc +; GCN-PROMOTE-NEXT: s_cbranch_execnz .LBB3_2 +; GCN-PROMOTE-NEXT: ; %bb.3: ; in Loop: Header=BB3_1 Depth=1 +; GCN-PROMOTE-NEXT: s_mov_b64 exec, s[6:7] +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v17, s5 +; GCN-PROMOTE-NEXT: s_mov_b64 s[4:5], exec +; GCN-PROMOTE-NEXT: .LBB3_4: ; Parent Loop BB3_1 Depth=1 +; GCN-PROMOTE-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-PROMOTE-NEXT: v_readfirstlane_b32 s6, v16 +; GCN-PROMOTE-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16 +; GCN-PROMOTE-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-PROMOTE-NEXT: s_mov_b32 m0, s6 +; GCN-PROMOTE-NEXT: v_movreld_b32_e32 v0, v17 +; GCN-PROMOTE-NEXT: s_xor_b64 exec, exec, vcc +; GCN-PROMOTE-NEXT: s_cbranch_execnz .LBB3_4 +; GCN-PROMOTE-NEXT: ; %bb.5: ; in Loop: Header=BB3_1 Depth=1 +; GCN-PROMOTE-NEXT: s_mov_b64 exec, s[4:5] +; GCN-PROMOTE-NEXT: s_add_i32 s8, s8, 1 +; GCN-PROMOTE-NEXT: s_add_u32 s2, s2, 8 +; GCN-PROMOTE-NEXT: s_addc_u32 s3, s3, 0 +; GCN-PROMOTE-NEXT: s_cmpk_lg_i32 s8, 
0x3e8 +; GCN-PROMOTE-NEXT: s_cbranch_scc1 .LBB3_1 +; GCN-PROMOTE-NEXT: ; %bb.6: ; %.preheader.preheader +; GCN-PROMOTE-NEXT: s_mov_b32 s4, 0 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v18, 1 +; GCN-PROMOTE-NEXT: .LBB3_7: ; %.preheader +; GCN-PROMOTE-NEXT: ; =>This Loop Header: Depth=1 +; GCN-PROMOTE-NEXT: ; Child Loop BB3_8 Depth 2 +; GCN-PROMOTE-NEXT: ; Child Loop BB3_10 Depth 2 +; GCN-PROMOTE-NEXT: s_and_b32 s2, s4, 0xffff +; GCN-PROMOTE-NEXT: s_mul_i32 s2, s2, 0xaaab +; GCN-PROMOTE-NEXT: s_lshr_b32 s2, s2, 18 +; GCN-PROMOTE-NEXT: v_mul_lo_u16_e64 v16, s2, 6 +; GCN-PROMOTE-NEXT: v_sub_u16_e32 v16, s4, v16 +; GCN-PROMOTE-NEXT: v_sub_u32_e32 v16, vcc, 5, v16 +; GCN-PROMOTE-NEXT: v_lshlrev_b32_sdwa v19, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GCN-PROMOTE-NEXT: s_mov_b64 s[2:3], exec +; GCN-PROMOTE-NEXT: .LBB3_8: ; Parent Loop BB3_7 Depth=1 +; GCN-PROMOTE-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-PROMOTE-NEXT: v_readfirstlane_b32 s5, v19 +; GCN-PROMOTE-NEXT: v_cmp_eq_u32_e32 vcc, s5, v19 +; GCN-PROMOTE-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-PROMOTE-NEXT: s_mov_b32 m0, s5 +; GCN-PROMOTE-NEXT: v_movrels_b32_e32 v17, v1 +; GCN-PROMOTE-NEXT: s_xor_b64 exec, exec, vcc +; GCN-PROMOTE-NEXT: s_cbranch_execnz .LBB3_8 +; GCN-PROMOTE-NEXT: ; %bb.9: ; in Loop: Header=BB3_7 Depth=1 +; GCN-PROMOTE-NEXT: s_mov_b64 exec, s[2:3] +; GCN-PROMOTE-NEXT: s_mov_b64 s[2:3], exec +; GCN-PROMOTE-NEXT: .LBB3_10: ; Parent Loop BB3_7 Depth=1 +; GCN-PROMOTE-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-PROMOTE-NEXT: v_readfirstlane_b32 s5, v19 +; GCN-PROMOTE-NEXT: v_cmp_eq_u32_e32 vcc, s5, v19 +; GCN-PROMOTE-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-PROMOTE-NEXT: s_mov_b32 m0, s5 +; GCN-PROMOTE-NEXT: v_movrels_b32_e32 v16, v0 +; GCN-PROMOTE-NEXT: s_xor_b64 exec, exec, vcc +; GCN-PROMOTE-NEXT: s_cbranch_execnz .LBB3_10 +; GCN-PROMOTE-NEXT: ; %bb.11: ; in Loop: Header=BB3_7 Depth=1 +; GCN-PROMOTE-NEXT: s_mov_b64 exec, s[2:3] +; GCN-PROMOTE-NEXT: 
v_mov_b32_e32 v20, s1 +; GCN-PROMOTE-NEXT: s_add_i32 s4, s4, 1 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v19, s0 +; GCN-PROMOTE-NEXT: s_add_u32 s0, s0, 8 +; GCN-PROMOTE-NEXT: s_addc_u32 s1, s1, 0 +; GCN-PROMOTE-NEXT: s_cmpk_eq_i32 s4, 0x3e8 +; GCN-PROMOTE-NEXT: flat_store_dwordx2 v[19:20], v[16:17] +; GCN-PROMOTE-NEXT: s_cbranch_scc0 .LBB3_7 +; GCN-PROMOTE-NEXT: ; %bb.12: ; %bb15 +; GCN-PROMOTE-NEXT: s_endpgm bb: - %tmp = alloca [6 x double], align 8, addrspace(5) - call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2 + %tmpvar = alloca [6 x double], align 8, addrspace(5) + call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmpvar) #2 br label %bb2 bb2: ; preds = %bb2, %bb - %tmp3 = phi i32 [ 0, %bb ], [ %tmp13, %bb2 ] - %tmp4 = zext i32 %tmp3 to i64 - %tmp5 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp4 - %tmp7 = load i64, ptr addrspace(1) %tmp5, align 8 - %tmp8 = trunc i32 %tmp3 to i16 - %tmp9 = urem i16 %tmp8, 6 - %tmp10 = zext i16 %tmp9 to i32 - %tmp11 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp10 - store i64 %tmp7, ptr addrspace(5) %tmp11, align 8 - %tmp13 = add nuw nsw i32 %tmp3, 1 - %tmp14 = icmp eq i32 %tmp13, 1000 - br i1 %tmp14, label %.preheader, label %bb2 + %tmpvar3 = phi i32 [ 0, %bb ], [ %tmpvar13, %bb2 ] + %tmpvar4 = zext i32 %tmpvar3 to i64 + %tmpvar5 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmpvar4 + %tmpvar7 = load i64, ptr addrspace(1) %tmpvar5, align 8 + %tmpvar8 = trunc i32 %tmpvar3 to i16 + %tmpvar9 = urem i16 %tmpvar8, 6 + %tmpvar10 = zext i16 %tmpvar9 to i32 + %tmpvar11 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar10 + store i64 %tmpvar7, ptr addrspace(5) %tmpvar11, align 8 + %tmpvar13 = add nuw nsw i32 %tmpvar3, 1 + %tmpvar14 = icmp eq i32 %tmpvar13, 1000 + br i1 %tmpvar14, label %.preheader, label %bb2 bb15: ; preds = %.preheader - call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2 + call void 
@llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmpvar) #2 ret void .preheader: ; preds = %.preheader, %bb2 - %tmp16 = phi i32 [ %tmp27, %.preheader ], [ 0, %bb2 ] - %tmp17 = trunc i32 %tmp16 to i16 - %tmp18 = urem i16 %tmp17, 6 - %tmp19 = sub nuw nsw i16 5, %tmp18 - %tmp20 = zext i16 %tmp19 to i32 - %tmp21 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmp, i32 0, i32 %tmp20 - %tmp23 = load i64, ptr addrspace(5) %tmp21, align 8 - %tmp24 = zext i32 %tmp16 to i64 - %tmp25 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmp24 - store i64 %tmp23, ptr addrspace(1) %tmp25, align 8 - %tmp27 = add nuw nsw i32 %tmp16, 1 - %tmp28 = icmp eq i32 %tmp27, 1000 - br i1 %tmp28, label %bb15, label %.preheader + %tmpvar16 = phi i32 [ %tmpvar27, %.preheader ], [ 0, %bb2 ] + %tmpvar17 = trunc i32 %tmpvar16 to i16 + %tmpvar18 = urem i16 %tmpvar17, 6 + %tmpvar19 = sub nuw nsw i16 5, %tmpvar18 + %tmpvar20 = zext i16 %tmpvar19 to i32 + %tmpvar21 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar20 + %tmpvar23 = load i64, ptr addrspace(5) %tmpvar21, align 8 + %tmpvar24 = zext i32 %tmpvar16 to i64 + %tmpvar25 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmpvar24 + store i64 %tmpvar23, ptr addrspace(1) %tmpvar25, align 8 + %tmpvar27 = add nuw nsw i32 %tmpvar16, 1 + %tmpvar28 = icmp eq i32 %tmpvar27, 1000 + br i1 %tmpvar28, label %bb15, label %.preheader } -; OPT-LABEL: @vector_write_read_bitcast_to_i64( -; OPT-NOT: alloca -; OPT: bb2: -; OPT: %tmp.sroa.0.0 = phi <6 x i64> [ undef, %bb ], [ %0, %bb2 ] -; OPT: %0 = insertelement <6 x i64> %tmp.sroa.0.0, i64 %tmp6, i32 %tmp9 -; OPT: .preheader: -; OPT: %1 = extractelement <6 x i64> %0, i32 %tmp18 - -; GCN-LABEL: {{^}}vector_write_read_bitcast_to_i64: - -; GCN-ALLOCA-COUNT-2: buffer_store_dword -; GCN-PROMOTE-COUNT-2: v_movreld_b32_e32 - -; GCN: s_cbranch - -; GCN-ALLOCA-COUNT-2: buffer_load_dword -; GCN-PROMOTE-COUNT-2: v_movrels_b32_e32 - -; GCN-PROMOTE: ScratchSize: 0 - 
define amdgpu_kernel void @vector_write_read_bitcast_to_i64(ptr addrspace(1) %arg) { +; GCN-ALLOCA-LABEL: vector_write_read_bitcast_to_i64: +; GCN-ALLOCA: ; %bb.0: ; %bb +; GCN-ALLOCA-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-ALLOCA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-ALLOCA-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN-ALLOCA-NEXT: s_mov_b32 s90, -1 +; GCN-ALLOCA-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-ALLOCA-NEXT: s_add_u32 s88, s88, s3 +; GCN-ALLOCA-NEXT: s_addc_u32 s89, s89, 0 +; GCN-ALLOCA-NEXT: s_mov_b32 s4, 0 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: s_mov_b64 s[2:3], s[0:1] +; GCN-ALLOCA-NEXT: .LBB4_1: ; %bb2 +; GCN-ALLOCA-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-ALLOCA-NEXT: s_and_b32 s5, s4, 0xffff +; GCN-ALLOCA-NEXT: s_mul_i32 s5, s5, 0xaaab +; GCN-ALLOCA-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GCN-ALLOCA-NEXT: s_lshr_b32 s5, s5, 18 +; GCN-ALLOCA-NEXT: v_mul_lo_u16_e64 v0, s5, 6 +; GCN-ALLOCA-NEXT: v_sub_u16_e32 v0, s4, v0 +; GCN-ALLOCA-NEXT: s_add_i32 s4, s4, 1 +; GCN-ALLOCA-NEXT: s_add_u32 s2, s2, 8 +; GCN-ALLOCA-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GCN-ALLOCA-NEXT: s_addc_u32 s3, s3, 0 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v1, s7 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v2, s6 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v0, vcc, 8, v0 +; GCN-ALLOCA-NEXT: s_cmpk_lg_i32 s4, 0x3e8 +; GCN-ALLOCA-NEXT: buffer_store_dword v1, v0, s[88:91], 0 offen offset:4 +; GCN-ALLOCA-NEXT: buffer_store_dword v2, v0, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: s_cbranch_scc1 .LBB4_1 +; GCN-ALLOCA-NEXT: ; %bb.2: ; %.preheader.preheader +; GCN-ALLOCA-NEXT: s_mov_b32 s2, 0 +; GCN-ALLOCA-NEXT: .LBB4_3: ; %.preheader +; GCN-ALLOCA-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-ALLOCA-NEXT: s_and_b32 s3, s2, 0xffff +; GCN-ALLOCA-NEXT: s_mul_i32 s3, s3, 0xaaab +; GCN-ALLOCA-NEXT: s_lshr_b32 s3, s3, 18 +; GCN-ALLOCA-NEXT: v_mul_lo_u16_e64 v0, s3, 6 +; GCN-ALLOCA-NEXT: v_sub_u16_e32 v0, s2, v0 +; GCN-ALLOCA-NEXT: 
v_lshlrev_b32_e32 v0, 3, v0 +; GCN-ALLOCA-NEXT: v_sub_u32_e32 v0, vcc, 8, v0 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v2, vcc, 40, v0 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v0, vcc, 44, v0 +; GCN-ALLOCA-NEXT: buffer_load_dword v1, v0, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v3, s1 +; GCN-ALLOCA-NEXT: s_add_i32 s2, s2, 1 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v2, s0 +; GCN-ALLOCA-NEXT: s_add_u32 s0, s0, 8 +; GCN-ALLOCA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-ALLOCA-NEXT: s_cmpk_eq_i32 s2, 0x3e8 +; GCN-ALLOCA-NEXT: s_waitcnt vmcnt(0) +; GCN-ALLOCA-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-ALLOCA-NEXT: s_cbranch_scc0 .LBB4_3 +; GCN-ALLOCA-NEXT: ; %bb.4: ; %bb13 +; GCN-ALLOCA-NEXT: s_endpgm +; +; GCN-PROMOTE-LABEL: vector_write_read_bitcast_to_i64: +; GCN-PROMOTE: ; %bb.0: ; %bb +; GCN-PROMOTE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-PROMOTE-NEXT: s_mov_b32 s8, 0 +; GCN-PROMOTE-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GCN-PROMOTE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-PROMOTE-NEXT: s_mov_b64 s[2:3], s[0:1] +; GCN-PROMOTE-NEXT: .LBB4_1: ; %bb2 +; GCN-PROMOTE-NEXT: ; =>This Loop Header: Depth=1 +; GCN-PROMOTE-NEXT: ; Child Loop BB4_2 Depth 2 +; GCN-PROMOTE-NEXT: ; Child Loop BB4_4 Depth 2 +; GCN-PROMOTE-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GCN-PROMOTE-NEXT: s_and_b32 s6, s8, 0xffff +; GCN-PROMOTE-NEXT: s_mul_i32 s6, s6, 0xaaab +; GCN-PROMOTE-NEXT: s_lshr_b32 s6, s6, 18 +; GCN-PROMOTE-NEXT: v_mul_lo_u16_e64 v12, s6, 6 +; GCN-PROMOTE-NEXT: v_sub_u16_e32 v12, s8, v12 +; GCN-PROMOTE-NEXT: v_lshlrev_b32_e32 v16, 1, v12 +; GCN-PROMOTE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v17, s4 +; GCN-PROMOTE-NEXT: s_mov_b64 s[6:7], exec +; GCN-PROMOTE-NEXT: .LBB4_2: ; Parent Loop BB4_1 Depth=1 +; GCN-PROMOTE-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-PROMOTE-NEXT: v_readfirstlane_b32 s4, v16 +; GCN-PROMOTE-NEXT: v_cmp_eq_u32_e32 vcc, s4, v16 +; GCN-PROMOTE-NEXT: s_and_saveexec_b64 vcc, vcc +; 
GCN-PROMOTE-NEXT: s_mov_b32 m0, s4 +; GCN-PROMOTE-NEXT: v_movreld_b32_e32 v0, v17 +; GCN-PROMOTE-NEXT: s_xor_b64 exec, exec, vcc +; GCN-PROMOTE-NEXT: s_cbranch_execnz .LBB4_2 +; GCN-PROMOTE-NEXT: ; %bb.3: ; in Loop: Header=BB4_1 Depth=1 +; GCN-PROMOTE-NEXT: s_mov_b64 exec, s[6:7] +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v17, s5 +; GCN-PROMOTE-NEXT: s_mov_b64 s[4:5], exec +; GCN-PROMOTE-NEXT: .LBB4_4: ; Parent Loop BB4_1 Depth=1 +; GCN-PROMOTE-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-PROMOTE-NEXT: v_readfirstlane_b32 s6, v16 +; GCN-PROMOTE-NEXT: v_cmp_eq_u32_e32 vcc, s6, v16 +; GCN-PROMOTE-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-PROMOTE-NEXT: s_mov_b32 m0, s6 +; GCN-PROMOTE-NEXT: v_movreld_b32_e32 v0, v17 +; GCN-PROMOTE-NEXT: s_xor_b64 exec, exec, vcc +; GCN-PROMOTE-NEXT: s_cbranch_execnz .LBB4_4 +; GCN-PROMOTE-NEXT: ; %bb.5: ; in Loop: Header=BB4_1 Depth=1 +; GCN-PROMOTE-NEXT: s_mov_b64 exec, s[4:5] +; GCN-PROMOTE-NEXT: s_add_i32 s8, s8, 1 +; GCN-PROMOTE-NEXT: s_add_u32 s2, s2, 8 +; GCN-PROMOTE-NEXT: s_addc_u32 s3, s3, 0 +; GCN-PROMOTE-NEXT: s_cmpk_lg_i32 s8, 0x3e8 +; GCN-PROMOTE-NEXT: s_cbranch_scc1 .LBB4_1 +; GCN-PROMOTE-NEXT: ; %bb.6: ; %.preheader.preheader +; GCN-PROMOTE-NEXT: s_mov_b32 s4, 0 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v18, 1 +; GCN-PROMOTE-NEXT: .LBB4_7: ; %.preheader +; GCN-PROMOTE-NEXT: ; =>This Loop Header: Depth=1 +; GCN-PROMOTE-NEXT: ; Child Loop BB4_8 Depth 2 +; GCN-PROMOTE-NEXT: ; Child Loop BB4_10 Depth 2 +; GCN-PROMOTE-NEXT: s_and_b32 s2, s4, 0xffff +; GCN-PROMOTE-NEXT: s_mul_i32 s2, s2, 0xaaab +; GCN-PROMOTE-NEXT: s_lshr_b32 s2, s2, 18 +; GCN-PROMOTE-NEXT: v_mul_lo_u16_e64 v16, s2, 6 +; GCN-PROMOTE-NEXT: v_sub_u16_e32 v16, s4, v16 +; GCN-PROMOTE-NEXT: v_sub_u32_e32 v16, vcc, 5, v16 +; GCN-PROMOTE-NEXT: v_lshlrev_b32_sdwa v19, v18, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GCN-PROMOTE-NEXT: s_mov_b64 s[2:3], exec +; GCN-PROMOTE-NEXT: .LBB4_8: ; Parent Loop BB4_7 Depth=1 +; GCN-PROMOTE-NEXT: ; => This Inner 
Loop Header: Depth=2 +; GCN-PROMOTE-NEXT: v_readfirstlane_b32 s5, v19 +; GCN-PROMOTE-NEXT: v_cmp_eq_u32_e32 vcc, s5, v19 +; GCN-PROMOTE-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-PROMOTE-NEXT: s_mov_b32 m0, s5 +; GCN-PROMOTE-NEXT: v_movrels_b32_e32 v17, v1 +; GCN-PROMOTE-NEXT: s_xor_b64 exec, exec, vcc +; GCN-PROMOTE-NEXT: s_cbranch_execnz .LBB4_8 +; GCN-PROMOTE-NEXT: ; %bb.9: ; in Loop: Header=BB4_7 Depth=1 +; GCN-PROMOTE-NEXT: s_mov_b64 exec, s[2:3] +; GCN-PROMOTE-NEXT: s_mov_b64 s[2:3], exec +; GCN-PROMOTE-NEXT: .LBB4_10: ; Parent Loop BB4_7 Depth=1 +; GCN-PROMOTE-NEXT: ; => This Inner Loop Header: Depth=2 +; GCN-PROMOTE-NEXT: v_readfirstlane_b32 s5, v19 +; GCN-PROMOTE-NEXT: v_cmp_eq_u32_e32 vcc, s5, v19 +; GCN-PROMOTE-NEXT: s_and_saveexec_b64 vcc, vcc +; GCN-PROMOTE-NEXT: s_mov_b32 m0, s5 +; GCN-PROMOTE-NEXT: v_movrels_b32_e32 v16, v0 +; GCN-PROMOTE-NEXT: s_xor_b64 exec, exec, vcc +; GCN-PROMOTE-NEXT: s_cbranch_execnz .LBB4_10 +; GCN-PROMOTE-NEXT: ; %bb.11: ; in Loop: Header=BB4_7 Depth=1 +; GCN-PROMOTE-NEXT: s_mov_b64 exec, s[2:3] +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v20, s1 +; GCN-PROMOTE-NEXT: s_add_i32 s4, s4, 1 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v19, s0 +; GCN-PROMOTE-NEXT: s_add_u32 s0, s0, 8 +; GCN-PROMOTE-NEXT: s_addc_u32 s1, s1, 0 +; GCN-PROMOTE-NEXT: s_cmpk_eq_i32 s4, 0x3e8 +; GCN-PROMOTE-NEXT: flat_store_dwordx2 v[19:20], v[16:17] +; GCN-PROMOTE-NEXT: s_cbranch_scc0 .LBB4_7 +; GCN-PROMOTE-NEXT: ; %bb.12: ; %bb13 +; GCN-PROMOTE-NEXT: s_endpgm bb: - %tmp = alloca [6 x i64], align 8, addrspace(5) - call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmp) #2 + %tmpvar = alloca [6 x i64], align 8, addrspace(5) + call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmpvar) #2 br label %bb2 bb2: ; preds = %bb2, %bb - %tmp3 = phi i32 [ 0, %bb ], [ %tmp11, %bb2 ] - %tmp4 = zext i32 %tmp3 to i64 - %tmp5 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp4 - %tmp6 = load i64, ptr addrspace(1) %tmp5, align 8 - %tmp7 = trunc i32 %tmp3 to i16 
- %tmp8 = urem i16 %tmp7, 6 - %tmp9 = zext i16 %tmp8 to i32 - %tmp10 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp9 - store i64 %tmp6, ptr addrspace(5) %tmp10, align 8 - %tmp11 = add nuw nsw i32 %tmp3, 1 - %tmp12 = icmp eq i32 %tmp11, 1000 - br i1 %tmp12, label %.preheader, label %bb2 + %tmpvar3 = phi i32 [ 0, %bb ], [ %tmpvar11, %bb2 ] + %tmpvar4 = zext i32 %tmpvar3 to i64 + %tmpvar5 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmpvar4 + %tmpvar6 = load i64, ptr addrspace(1) %tmpvar5, align 8 + %tmpvar7 = trunc i32 %tmpvar3 to i16 + %tmpvar8 = urem i16 %tmpvar7, 6 + %tmpvar9 = zext i16 %tmpvar8 to i32 + %tmpvar10 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar9 + store i64 %tmpvar6, ptr addrspace(5) %tmpvar10, align 8 + %tmpvar11 = add nuw nsw i32 %tmpvar3, 1 + %tmpvar12 = icmp eq i32 %tmpvar11, 1000 + br i1 %tmpvar12, label %.preheader, label %bb2 bb13: ; preds = %.preheader - call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmp) #2 + call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmpvar) #2 ret void .preheader: ; preds = %.preheader, %bb2 - %tmp14 = phi i32 [ %tmp23, %.preheader ], [ 0, %bb2 ] - %tmp15 = trunc i32 %tmp14 to i16 - %tmp16 = urem i16 %tmp15, 6 - %tmp17 = sub nuw nsw i16 5, %tmp16 - %tmp18 = zext i16 %tmp17 to i32 - %tmp19 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmp, i32 0, i32 %tmp18 - %tmp20 = load i64, ptr addrspace(5) %tmp19, align 8 - %tmp21 = zext i32 %tmp14 to i64 - %tmp22 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmp21 - store i64 %tmp20, ptr addrspace(1) %tmp22, align 8 - %tmp23 = add nuw nsw i32 %tmp14, 1 - %tmp24 = icmp eq i32 %tmp23, 1000 - br i1 %tmp24, label %bb13, label %.preheader + %tmpvar14 = phi i32 [ %tmpvar23, %.preheader ], [ 0, %bb2 ] + %tmpvar15 = trunc i32 %tmpvar14 to i16 + %tmpvar16 = urem i16 %tmpvar15, 6 + %tmpvar17 = sub nuw nsw i16 5, %tmpvar16 + %tmpvar18 = zext i16 %tmpvar17 to i32 + %tmpvar19 
= getelementptr inbounds [6 x i64], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar18 + %tmpvar20 = load i64, ptr addrspace(5) %tmpvar19, align 8 + %tmpvar21 = zext i32 %tmpvar14 to i64 + %tmpvar22 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmpvar21 + store i64 %tmpvar20, ptr addrspace(1) %tmpvar22, align 8 + %tmpvar23 = add nuw nsw i32 %tmpvar14, 1 + %tmpvar24 = icmp eq i32 %tmpvar23, 1000 + br i1 %tmpvar24, label %bb13, label %.preheader } -; TODO: llvm.assume can be ingored - -; OPT-LABEL: @vector_read_alloca_bitcast_assume( -; OPT: %0 = extractelement <4 x i32> , i32 %index -; OPT: store i32 %0, ptr addrspace(1) %out, align 4 - -; GCN-LABEL: {{^}}vector_read_alloca_bitcast_assume: -; GCN-COUNT-4: buffer_store_dword - define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) { +; GCN-ALLOCA-LABEL: vector_read_alloca_bitcast_assume: +; GCN-ALLOCA: ; %bb.0: ; %entry +; GCN-ALLOCA-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-ALLOCA-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN-ALLOCA-NEXT: s_mov_b32 s90, -1 +; GCN-ALLOCA-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-ALLOCA-NEXT: s_add_u32 s88, s88, s3 +; GCN-ALLOCA-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-ALLOCA-NEXT: s_addc_u32 s89, s89, 0 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 0 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:4 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 1 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:8 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 2 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:12 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 3 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:16 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 4 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-ALLOCA-NEXT: buffer_load_dword v2, v0, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; 
GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-ALLOCA-NEXT: s_waitcnt vmcnt(0) +; GCN-ALLOCA-NEXT: flat_store_dword v[0:1], v2 +; GCN-ALLOCA-NEXT: s_endpgm +; +; GCN-PROMOTE-LABEL: vector_read_alloca_bitcast_assume: +; GCN-PROMOTE: ; %bb.0: ; %entry +; GCN-PROMOTE-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-PROMOTE-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN-PROMOTE-NEXT: s_mov_b32 s90, -1 +; GCN-PROMOTE-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-PROMOTE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-PROMOTE-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-PROMOTE-NEXT: s_add_u32 s88, s88, s3 +; GCN-PROMOTE-NEXT: s_addc_u32 s89, s89, 0 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v1, 3 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v0, 1 +; GCN-PROMOTE-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:16 +; GCN-PROMOTE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-PROMOTE-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v1, 2 +; GCN-PROMOTE-NEXT: buffer_store_dword v1, off, s[88:91], 0 offset:12 +; GCN-PROMOTE-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:8 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v0, 0 +; GCN-PROMOTE-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-PROMOTE-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:4 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v0, 2, v0, vcc +; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v2, 3, v0, vcc +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v0, s0 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v1, s1 +; GCN-PROMOTE-NEXT: flat_store_dword v[0:1], v2 +; GCN-PROMOTE-NEXT: s_endpgm entry: - %tmp = alloca [4 x i32], addrspace(5) - %cmp = icmp ne ptr addrspace(5) %tmp, null + %tmpvar = alloca [4 x i32], addrspace(5) + %cmp = icmp ne ptr addrspace(5) %tmpvar, null call void 
@llvm.assume(i1 %cmp) - %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 - store i32 0, ptr addrspace(5) %tmp + %y = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmpvar store i32 1, ptr addrspace(5) %y store i32 2, ptr addrspace(5) %z store i32 3, ptr addrspace(5) %w - %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index - %tmp2 = load i32, ptr addrspace(5) %tmp1 - store i32 %tmp2, ptr addrspace(1) %out + %tmpvar1 = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 %index + %tmpvar2 = load i32, ptr addrspace(5) %tmpvar1 + store i32 %tmpvar2, ptr addrspace(1) %out ret void } -; OPT-LABEL: @vector_read_alloca_multiuse( -; OPT-NOT: alloca -; OPT: %0 = extractelement <4 x i32> , i32 %index -; OPT-NEXT: %add2 = add nuw nsw i32 %0, 1 -; OPT-NEXT: store i32 %add2, ptr addrspace(1) %out, align 4 - -; GCN-LABEL: {{^}}vector_read_alloca_multiuse: -; GCN-ALLOCA-COUNT-4: buffer_store_dword -; GCN-ALLOCA: buffer_load_dword - -; GCN-PROMOTE: s_cmp_eq_u32 s{{[0-9]+}}, 1 -; GCN-PROMOTE: s_cselect_b64 [[CC1:[^,]+]], -1, 0 -; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 2 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND1:v[0-9]+]], 0, 1, [[CC1]] -; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 -; GCN_PROMOTE: s_cmp_lg_u32 s{{[0-9]+}}, 3 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND2:v[0-9]+]], 2, [[IND1]], vcc -; GCN-PROMOTE: s_cselect_b64 vcc, -1, 0 -; GCN-PROMOTE: v_cndmask_b32_e{{32|64}} [[IND3:v[0-9]+]], 3, [[IND2]], vcc - -; GCN-PROMOTE: ScratchSize: 0 - define amdgpu_kernel void @vector_read_alloca_multiuse(ptr addrspace(1) %out, i32 %index) { +; GCN-ALLOCA-LABEL: vector_read_alloca_multiuse: +; GCN-ALLOCA: ; %bb.0: ; 
%entry +; GCN-ALLOCA-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 +; GCN-ALLOCA-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 +; GCN-ALLOCA-NEXT: s_mov_b32 s90, -1 +; GCN-ALLOCA-NEXT: s_mov_b32 s91, 0xe80000 +; GCN-ALLOCA-NEXT: s_add_u32 s88, s88, s3 +; GCN-ALLOCA-NEXT: s_load_dword s2, s[0:1], 0x2c +; GCN-ALLOCA-NEXT: s_addc_u32 s89, s89, 0 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 0 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:4 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 1 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:8 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 2 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:12 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 3 +; GCN-ALLOCA-NEXT: buffer_store_dword v0, off, s[88:91], 0 offset:16 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: s_lshl_b32 s2, s2, 2 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, 4 +; GCN-ALLOCA-NEXT: v_add_u32_e32 v0, vcc, s2, v0 +; GCN-ALLOCA-NEXT: buffer_load_dword v0, v0, s[88:91], 0 offen +; GCN-ALLOCA-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-ALLOCA-NEXT: s_waitcnt vmcnt(0) +; GCN-ALLOCA-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GCN-ALLOCA-NEXT: s_waitcnt lgkmcnt(0) +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-ALLOCA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-ALLOCA-NEXT: flat_store_dword v[0:1], v2 +; GCN-ALLOCA-NEXT: s_endpgm +; +; GCN-PROMOTE-LABEL: vector_read_alloca_multiuse: +; GCN-PROMOTE: ; %bb.0: ; %entry +; GCN-PROMOTE-NEXT: s_load_dword s4, s[0:1], 0x2c +; GCN-PROMOTE-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-PROMOTE-NEXT: s_waitcnt lgkmcnt(0) +; GCN-PROMOTE-NEXT: s_cmp_eq_u32 s4, 1 +; GCN-PROMOTE-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_lg_u32 s4, 2 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE-NEXT: s_cmp_lg_u32 s4, 3 +; GCN-PROMOTE-NEXT: v_cndmask_b32_e32 v0, 2, v0, vcc +; GCN-PROMOTE-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-PROMOTE-NEXT: 
v_cndmask_b32_e32 v0, 3, v0, vcc +; GCN-PROMOTE-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v0, s0 +; GCN-PROMOTE-NEXT: v_mov_b32_e32 v1, s1 +; GCN-PROMOTE-NEXT: flat_store_dword v[0:1], v2 +; GCN-PROMOTE-NEXT: s_endpgm entry: - %tmp = alloca [4 x i32], addrspace(5) - %y = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 1 - %z = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 2 - %w = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 3 - store i32 0, ptr addrspace(5) %tmp + %tmpvar = alloca [4 x i32], addrspace(5) + %y = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmpvar store i32 1, ptr addrspace(5) %y store i32 2, ptr addrspace(5) %z store i32 3, ptr addrspace(5) %w - %tmp1 = getelementptr [4 x i32], ptr addrspace(5) %tmp, i32 0, i32 %index - %tmp2 = load i32, ptr addrspace(5) %tmp1 - %tmp3 = load i32, ptr addrspace(5) %tmp - %tmp4 = load i32, ptr addrspace(5) %y - %add1 = add i32 %tmp2, %tmp3 - %add2 = add i32 %add1, %tmp4 + %tmpvar1 = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 %index + %tmpvar2 = load i32, ptr addrspace(5) %tmpvar1 + %tmpvar3 = load i32, ptr addrspace(5) %tmpvar + %tmpvar4 = load i32, ptr addrspace(5) %y + %add1 = add i32 %tmpvar2, %tmpvar3 + %add2 = add i32 %add1, %tmpvar4 store i32 %add2, ptr addrspace(1) %out ret void } -; OPT-LABEL: @bitcast_vector_to_vector( -; OPT-NOT: alloca -; OPT: store <4 x i32> , ptr addrspace(1) %out, align 16 - -; GCN-LABEL: {{^}}bitcast_vector_to_vector: -; GCN: v_mov_b32_e32 v0, 1 -; GCN: v_mov_b32_e32 v1, 2 -; GCN: v_mov_b32_e32 v2, 3 -; GCN: v_mov_b32_e32 v3, 4 - -; GCN: ScratchSize: 0 - define amdgpu_kernel void @bitcast_vector_to_vector(ptr addrspace(1) %out) { +; GCN-LABEL: bitcast_vector_to_vector: +; GCN: ; %bb.0: ; %.entry +; GCN-NEXT: 
s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: v_mov_b32_e32 v3, 4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: s_endpgm .entry: %alloca = alloca <4 x float>, align 16, addrspace(5) store <4 x i32> , ptr addrspace(5) %alloca @@ -355,19 +874,19 @@ ret void } -; OPT-LABEL: @vector_bitcast_from_alloca_array( -; OPT-NOT: alloca -; OPT: store <4 x i32> , ptr addrspace(1) %out, align 16 - -; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array: -; GCN: v_mov_b32_e32 v0, 1 -; GCN: v_mov_b32_e32 v1, 2 -; GCN: v_mov_b32_e32 v2, 3 -; GCN: v_mov_b32_e32 v3, 4 - -; GCN: ScratchSize: 0 - define amdgpu_kernel void @vector_bitcast_from_alloca_array(ptr addrspace(1) %out) { +; GCN-LABEL: vector_bitcast_from_alloca_array: +; GCN: ; %bb.0: ; %.entry +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: v_mov_b32_e32 v3, 4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: s_endpgm .entry: %alloca = alloca [4 x float], align 16, addrspace(5) store <4 x i32> , ptr addrspace(5) %alloca @@ -376,25 +895,19 @@ ret void } -; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array( -; OPT-NOT: alloca -; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4 -; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 1 -; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4 -; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 2 -; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4 -; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, 
i64 3 -; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4 - -; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array: -; GCN: v_mov_b32_e32 v0, 1 -; GCN: v_mov_b32_e32 v1, 2 -; GCN: v_mov_b32_e32 v2, 3 -; GCN: v_mov_b32_e32 v3, 4 - -; GCN: ScratchSize: 0 - define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array(ptr addrspace(1) %out) { +; GCN-LABEL: vector_bitcast_to_array_from_alloca_array: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: v_mov_b32_e32 v3, 4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: s_endpgm %alloca = alloca [4 x float], align 16, addrspace(5) store [4 x i32] [i32 1, i32 2, i32 3, i32 4], ptr addrspace(5) %alloca %load = load [4 x i32], ptr addrspace(5) %alloca, align 16 @@ -402,27 +915,21 @@ ret void } -; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array( -; OPT-NOT: alloca -; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4 -; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 1 -; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4 -; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 2 -; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4 -; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 3 -; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4 - -; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array: -; GCN: v_mov_b32_e32 v0, 1 -; GCN: v_mov_b32_e32 v1, 2 -; GCN: v_mov_b32_e32 v2, 3 -; GCN: v_mov_b32_e32 v3, 4 - -; GCN: ScratchSize: 0 - %struct.v4 = type { i32, i32, i32, i32 } define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(ptr addrspace(1) %out) { +; GCN-LABEL: 
vector_bitcast_to_struct_from_alloca_array: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 1 +; GCN-NEXT: v_mov_b32_e32 v1, 2 +; GCN-NEXT: v_mov_b32_e32 v2, 3 +; GCN-NEXT: v_mov_b32_e32 v3, 4 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v5, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GCN-NEXT: s_endpgm %alloca = alloca [4 x float], align 16, addrspace(5) store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, ptr addrspace(5) %alloca %load = load %struct.v4, ptr addrspace(5) %alloca, align 16 diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/store-new-type.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/store-new-type.ll --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/store-new-type.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/store-new-type.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt %s -passes=instcombine -S | FileCheck %s ;; Based on test/Transforms/InstCombine/shufflevec-bitcast.ll in which the @@ -5,10 +6,11 @@ ;; added by hand. Check the DIAssignID attachment on the store is preserved. 
define <2 x i4> @shuf_bitcast_insert_use2(<2 x i8> %v, i8 %x, ptr %p) { -; CHECK-LABEL: @shuf_bitcast_insert_use2( -; CHECK-NEXT: [[I:%.*]] = insertelement <2 x i8> [[V:%.*]], i8 [[X:%.*]], i64 0 -; CHECK-NEXT: store <2 x i8> [[I]], ptr [[P:%.*]], align 2, !DIAssignID ![[ID:[0-9]+]] -; CHECK-NEXT: dbg.assign(metadata <2 x i8> %i, {{.+}}, {{.+}}, metadata ![[ID]], metadata ptr %p,{{.+}}) +; CHECK-LABEL: define <2 x i4> @shuf_bitcast_insert_use2 +; CHECK-SAME: (<2 x i8> [[V:%.*]], i8 [[X:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[I:%.*]] = insertelement <2 x i8> [[V]], i8 [[X]], i64 0 +; CHECK-NEXT: store <2 x i8> [[I]], ptr [[P]], align 2, !DIAssignID [[DIASSIGNID6:![0-9]+]] +; CHECK-NEXT: call void @llvm.dbg.assign(metadata <2 x i8> [[I]], metadata [[META7:![0-9]+]], metadata !DIExpression(), metadata [[DIASSIGNID6]], metadata ptr [[P]], metadata !DIExpression()), !dbg [[DBG19:![0-9]+]] ; CHECK-NEXT: [[R:%.*]] = bitcast i8 [[X]] to <2 x i4> ; CHECK-NEXT: ret <2 x i4> [[R]] ; diff --git a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll --- a/llvm/test/Transforms/InstCombine/X86/blend_x86.ll +++ b/llvm/test/Transforms/InstCombine/X86/blend_x86.ll @@ -247,8 +247,8 @@ ; CHECK-NEXT: [[YCAST:%.*]] = bitcast <2 x i64> [[Y:%.*]] to <4 x i32> ; CHECK-NEXT: [[ZCAST:%.*]] = bitcast <2 x i64> [[Z:%.*]] to <4 x i32> ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt <4 x i32> [[YCAST]], [[ZCAST]] -; CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[CMP]], <4 x i32> zeroinitializer, <4 x i32> [[LD1]] -; CHECK-NEXT: [[RCAST:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = select <4 x i1> [[CMP]], <4 x i32> zeroinitializer, <4 x i32> [[LD1]] +; CHECK-NEXT: [[RCAST:%.*]] = bitcast <4 x i32> [[TMP1]] to <2 x i64> ; CHECK-NEXT: ret <2 x i64> [[RCAST]] ; %ld = load <16 x i8>, ptr %x, align 16 diff --git a/llvm/test/Transforms/InstCombine/X86/pr2645-1.ll b/llvm/test/Transforms/InstCombine/X86/pr2645-1.ll --- 
a/llvm/test/Transforms/InstCombine/X86/pr2645-1.ll +++ b/llvm/test/Transforms/InstCombine/X86/pr2645-1.ll @@ -20,8 +20,8 @@ ; CHECK-NEXT: [[I12:%.*]] = shufflevector <8 x i16> [[I11]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: [[I13:%.*]] = bitcast <8 x i16> [[I12]] to <4 x i32> ; CHECK-NEXT: [[I14:%.*]] = sitofp <4 x i32> [[I13]] to <4 x float> -; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[DOT0]] to i64 -; CHECK-NEXT: [[I15:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[DOT0]] to i64 +; CHECK-NEXT: [[I15:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 [[TMP1]] ; CHECK-NEXT: store <4 x float> [[I14]], ptr [[I15]], align 1 ; CHECK-NEXT: [[I17]] = add i32 [[DOT0]], 1 ; CHECK-NEXT: br label [[BB3]] diff --git a/llvm/test/Transforms/InstCombine/cast_phi.ll b/llvm/test/Transforms/InstCombine/cast_phi.ll --- a/llvm/test/Transforms/InstCombine/cast_phi.ll +++ b/llvm/test/Transforms/InstCombine/cast_phi.ll @@ -248,13 +248,13 @@ ; CHECK-NEXT: br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]] ; CHECK: t: ; CHECK-NEXT: [[Y:%.*]] = call i32 @get_i32() -; CHECK-NEXT: [[PHI_CAST:%.*]] = zext i32 [[Y]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[Y]] to i64 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: f: ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[P:%.*]] = phi i64 [ [[PHI_CAST]], [[T]] ], [ 3, [[F]] ] +; CHECK-NEXT: [[P:%.*]] = phi i64 [ [[TMP0]], [[T]] ], [ 3, [[F]] ] ; CHECK-NEXT: ret i64 [[P]] ; entry: @@ -282,13 +282,13 @@ ; CHECK-NEXT: br i1 [[CMP]], label [[T:%.*]], label [[F:%.*]] ; CHECK: t: ; CHECK-NEXT: [[Y:%.*]] = call i3 @get_i3() -; CHECK-NEXT: [[PHI_CAST:%.*]] = zext i3 [[Y]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = zext i3 [[Y]] to i64 ; CHECK-NEXT: br label [[EXIT:%.*]] ; CHECK: f: ; CHECK-NEXT: call void @bar() ; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: -; CHECK-NEXT: [[P:%.*]] = phi i64 [ [[PHI_CAST]], [[T]] ], [ 3, [[F]] ] +; CHECK-NEXT: [[P:%.*]] = phi i64 [ 
[[TMP0]], [[T]] ], [ 3, [[F]] ] ; CHECK-NEXT: ret i64 [[P]] ; entry: diff --git a/llvm/test/Transforms/InstCombine/insert-val-extract-elem.ll b/llvm/test/Transforms/InstCombine/insert-val-extract-elem.ll --- a/llvm/test/Transforms/InstCombine/insert-val-extract-elem.ll +++ b/llvm/test/Transforms/InstCombine/insert-val-extract-elem.ll @@ -1,10 +1,14 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt -S -passes=instcombine %s | FileCheck %s -; CHECK-LABEL: julia_2xdouble -; CHECK-NOT: insertvalue -; CHECK-NOT: extractelement -; CHECK: store <2 x double> define void @julia_2xdouble(ptr sret([2 x double]), ptr) { +; CHECK-LABEL: define void @julia_2xdouble +; CHECK-SAME: (ptr sret([2 x double]) [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; CHECK-NEXT: top: +; CHECK-NEXT: [[X:%.*]] = load <2 x double>, ptr [[TMP1]], align 16 +; CHECK-NEXT: store <2 x double> [[X]], ptr [[TMP0]], align 8 +; CHECK-NEXT: ret void +; top: %x = load <2 x double>, ptr %1 %x0 = extractelement <2 x double> %x, i32 0 @@ -16,11 +20,14 @@ } ; Test with two inserts to the same index -; CHECK-LABEL: julia_2xi64 -; CHECK-NOT: insertvalue -; CHECK-NOT: extractelement -; CHECK: store <2 x i64> define void @julia_2xi64(ptr sret([2 x i64]), ptr) { +; CHECK-LABEL: define void @julia_2xi64 +; CHECK-SAME: (ptr sret([2 x i64]) [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; CHECK-NEXT: top: +; CHECK-NEXT: [[X:%.*]] = load <2 x i64>, ptr [[TMP1]], align 16 +; CHECK-NEXT: store <2 x i64> [[X]], ptr [[TMP0]], align 4 +; CHECK-NEXT: ret void +; top: %x = load <2 x i64>, ptr %1 %x0 = extractelement <2 x i64> %x, i32 1 @@ -33,11 +40,14 @@ ret void } -; CHECK-LABEL: julia_4xfloat -; CHECK-NOT: insertvalue -; CHECK-NOT: extractelement -; CHECK: store <4 x float> define void @julia_4xfloat(ptr sret([4 x float]), ptr) { +; CHECK-LABEL: define void @julia_4xfloat +; CHECK-SAME: (ptr sret([4 x float]) [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; CHECK-NEXT: top: +; CHECK-NEXT: [[X:%.*]] = 
load <4 x float>, ptr [[TMP1]], align 16 +; CHECK-NEXT: store <4 x float> [[X]], ptr [[TMP0]], align 4 +; CHECK-NEXT: ret void +; top: %x = load <4 x float>, ptr %1 %x0 = extractelement <4 x float> %x, i32 0 @@ -54,11 +64,14 @@ %pseudovec = type { float, float, float, float } -; CHECK-LABEL: julia_pseudovec -; CHECK-NOT: insertvalue -; CHECK-NOT: extractelement -; CHECK: store <4 x float> define void @julia_pseudovec(ptr sret(%pseudovec), ptr) { +; CHECK-LABEL: define void @julia_pseudovec +; CHECK-SAME: (ptr sret([[PSEUDOVEC:%.*]]) [[TMP0:%.*]], ptr [[TMP1:%.*]]) { +; CHECK-NEXT: top: +; CHECK-NEXT: [[X:%.*]] = load <4 x float>, ptr [[TMP1]], align 16 +; CHECK-NEXT: store <4 x float> [[X]], ptr [[TMP0]], align 4 +; CHECK-NEXT: ret void +; top: %x = load <4 x float>, ptr %1 %x0 = extractelement <4 x float> %x, i32 0 diff --git a/llvm/test/Transforms/InstCombine/load-bitcast-select.ll b/llvm/test/Transforms/InstCombine/load-bitcast-select.ll --- a/llvm/test/Transforms/InstCombine/load-bitcast-select.ll +++ b/llvm/test/Transforms/InstCombine/load-bitcast-select.ll @@ -21,8 +21,8 @@ ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[CMP_I:%.*]] = fcmp fast olt float [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[CMP_I]], float [[TMP2]], float [[TMP1]] -; CHECK-NEXT: store float [[TMP3]], ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[DOTV:%.*]] = select i1 [[CMP_I]], float [[TMP2]], float [[TMP1]] +; CHECK-NEXT: store float [[DOTV]], ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 ; CHECK-NEXT: br label [[FOR_COND]] ; @@ -80,8 +80,8 @@ ; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[LOADADDR1:%.*]], align 4 ; CHECK-NEXT: [[LD2:%.*]] = load float, ptr [[LOADADDR2:%.*]], align 4 ; CHECK-NEXT: [[COND:%.*]] = fcmp ogt float [[LD1]], [[LD2]] -; CHECK-NEXT: [[LD3:%.*]] = select i1 [[COND]], float [[LD1]], float [[LD2]] -; CHECK-NEXT: store 
float [[LD3]], ptr [[STOREADDR:%.*]], align 4 +; CHECK-NEXT: [[LD_V:%.*]] = select i1 [[COND]], float [[LD1]], float [[LD2]] +; CHECK-NEXT: store float [[LD_V]], ptr [[STOREADDR:%.*]], align 4 ; CHECK-NEXT: ret void ; %ld1 = load float, ptr %loadaddr1, align 4 diff --git a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll --- a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll +++ b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll @@ -151,7 +151,7 @@ define void @test_load_cast_combine_nonnull(ptr %ptr) { ; CHECK-LABEL: @test_load_cast_combine_nonnull( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !nonnull +; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !nonnull !6 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[PTR]], i64 42 ; CHECK-NEXT: store ptr [[P]], ptr [[GEP]], align 8 ; CHECK-NEXT: ret void @@ -165,7 +165,7 @@ define i32 @test_load_cast_combine_noundef(ptr %ptr) { ; CHECK-LABEL: @test_load_cast_combine_noundef( -; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !noundef +; CHECK-NEXT: [[L1:%.*]] = load i32, ptr [[PTR:%.*]], align 4, !noundef !6 ; CHECK-NEXT: ret i32 [[L1]] ; %l = load float, ptr %ptr, !noundef !{} diff --git a/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll b/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll --- a/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll +++ b/llvm/test/Transforms/InstCombine/multiple-uses-load-bitcast-select.ll @@ -7,21 +7,21 @@ ; CHECK-NEXT: [[Z1:%.*]] = alloca double, align 8 ; CHECK-NEXT: [[LD1:%.*]] = load double, ptr [[Y1]], align 8 ; CHECK-NEXT: [[LD2:%.*]] = load double, ptr [[Z1]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = fcmp olt double [[LD1]], [[LD2]] -; CHECK-NEXT: [[TMP121:%.*]] = select i1 [[TMP10]], double [[LD1]], double [[LD2]] -; CHECK-NEXT: store double [[TMP121]], ptr [[ST1:%.*]], align 8 -; 
CHECK-NEXT: store double [[TMP121]], ptr [[ST2:%.*]], align 8 +; CHECK-NEXT: [[TMPVAR10:%.*]] = fcmp olt double [[LD1]], [[LD2]] +; CHECK-NEXT: [[TMPVAR12_V:%.*]] = select i1 [[TMPVAR10]], double [[LD1]], double [[LD2]] +; CHECK-NEXT: store double [[TMPVAR12_V]], ptr [[ST1:%.*]], align 8 +; CHECK-NEXT: store double [[TMPVAR12_V]], ptr [[ST2:%.*]], align 8 ; CHECK-NEXT: ret void ; %y1 = alloca double %z1 = alloca double %ld1 = load double, ptr %y1 %ld2 = load double, ptr %z1 - %tmp10 = fcmp olt double %ld1, %ld2 - %sel = select i1 %tmp10, ptr %y1, ptr %z1 - %tmp12 = load i64, ptr %sel - store i64 %tmp12, ptr %st1 - store i64 %tmp12, ptr %st2 + %tmpvar10 = fcmp olt double %ld1, %ld2 + %sel = select i1 %tmpvar10, ptr %y1, ptr %z1 + %tmpvar12 = load i64, ptr %sel + store i64 %tmpvar12, ptr %st1 + store i64 %tmpvar12, ptr %st2 ret void } diff --git a/llvm/test/Transforms/InstCombine/pr25342.ll b/llvm/test/Transforms/InstCombine/pr25342.ll --- a/llvm/test/Transforms/InstCombine/pr25342.ll +++ b/llvm/test/Transforms/InstCombine/pr25342.ll @@ -93,8 +93,8 @@ ; CHECK-NEXT: [[ADD_I:%.*]] = fadd float [[SUB_I]], [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_0]], 1 ; CHECK-NEXT: [[BIT0:%.*]] = and i32 [[INC]], 1 -; CHECK-NEXT: [[EVEN:%.*]] = icmp eq i32 [[BIT0]], 0 -; CHECK-NEXT: br i1 [[EVEN]], label [[EVEN_BB:%.*]], label [[ODD_BB]] +; CHECK-NEXT: [[EVEN_NOT_NOT:%.*]] = icmp eq i32 [[BIT0]], 0 +; CHECK-NEXT: br i1 [[EVEN_NOT_NOT]], label [[EVEN_BB:%.*]], label [[ODD_BB]] ; CHECK: even.bb: ; CHECK-NEXT: [[TMP5:%.*]] = fadd float [[SUB_I]], [[ADD_I]] ; CHECK-NEXT: br label [[ODD_BB]] diff --git a/llvm/test/Transforms/InstCombine/reduction-and-sext-zext-i1.ll b/llvm/test/Transforms/InstCombine/reduction-and-sext-zext-i1.ll --- a/llvm/test/Transforms/InstCombine/reduction-and-sext-zext-i1.ll +++ b/llvm/test/Transforms/InstCombine/reduction-and-sext-zext-i1.ll @@ -6,8 +6,8 @@ define i1 @reduce_and_self(<8 x i1> %x) { ; CHECK-LABEL: @reduce_and_self( ; CHECK-NEXT: 
[[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], -1 -; CHECK-NEXT: ret i1 [[TMP2]] +; CHECK-NEXT: [[RES:%.*]] = icmp eq i8 [[TMP1]], -1 +; CHECK-NEXT: ret i1 [[RES]] ; %res = call i1 @llvm.vector.reduce.and.v8i32(<8 x i1> %x) ret i1 %res @@ -17,8 +17,8 @@ ; CHECK-LABEL: @reduce_and_sext( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i4 [[TMP1]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: ret i32 [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP2]] to i32 +; CHECK-NEXT: ret i32 [[RES]] ; %sext = sext <4 x i1> %x to <4 x i32> %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %sext) @@ -29,8 +29,8 @@ ; CHECK-LABEL: @reduce_and_zext( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i64 -; CHECK-NEXT: ret i64 [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = zext i1 [[TMP2]] to i64 +; CHECK-NEXT: ret i64 [[RES]] ; %zext = zext <8 x i1> %x to <8 x i64> %res = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %zext) @@ -41,8 +41,8 @@ ; CHECK-LABEL: @reduce_and_sext_same( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i1> [[X:%.*]] to i16 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i16 -; CHECK-NEXT: ret i16 [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP2]] to i16 +; CHECK-NEXT: ret i16 [[RES]] ; %sext = sext <16 x i1> %x to <16 x i16> %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %sext) @@ -53,8 +53,8 @@ ; CHECK-LABEL: @reduce_and_zext_long( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <128 x i1> [[X:%.*]] to i128 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i128 [[TMP1]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i8 -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP2]] to i8 +; CHECK-NEXT: ret i8 [[RES]] ; %sext = sext <128 x i1> %x to <128 x 
i8> %res = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %sext) @@ -66,11 +66,11 @@ ; CHECK-LABEL: @reduce_and_zext_long_external_use( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <128 x i1> [[X:%.*]] to i128 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i128 [[TMP1]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <128 x i1> [[X]], i64 0 -; CHECK-NEXT: [[EXT:%.*]] = sext i1 [[TMP4]] to i8 +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP2]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <128 x i1> [[X]], i64 0 +; CHECK-NEXT: [[EXT:%.*]] = sext i1 [[TMP3]] to i8 ; CHECK-NEXT: store i8 [[EXT]], ptr @glob, align 1 -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: ret i8 [[RES]] ; %sext = sext <128 x i1> %x to <128 x i8> %res = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %sext) @@ -84,11 +84,11 @@ ; CHECK-LABEL: @reduce_and_zext_external_use( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8 ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i8 [[TMP1]], -1 -; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[X]], i64 0 -; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[TMP4]] to i64 +; CHECK-NEXT: [[RES:%.*]] = zext i1 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[X]], i64 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[TMP3]] to i64 ; CHECK-NEXT: store i64 [[EXT]], ptr @glob1, align 8 -; CHECK-NEXT: ret i64 [[TMP3]] +; CHECK-NEXT: ret i64 [[RES]] ; %zext = zext <8 x i1> %x to <8 x i64> %res = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %zext) @@ -102,8 +102,8 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[LHS1:%.*]] = load i64, ptr [[ARG1:%.*]], align 8 ; CHECK-NEXT: [[RHS2:%.*]] = load i64, ptr [[ARG:%.*]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[LHS1]], [[RHS2]] -; CHECK-NEXT: ret i1 [[TMP2]] +; CHECK-NEXT: [[ALL_EQ:%.*]] = icmp eq i64 [[LHS1]], [[RHS2]] +; CHECK-NEXT: ret i1 [[ALL_EQ]] ; bb: %lhs = load <8 x i8>, ptr %arg1 @@ -120,8 +120,8 @@ ; CHECK-NEXT: 
[[RHS:%.*]] = load <8 x i16>, ptr [[ARG:%.*]], align 16 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <8 x i16> [[LHS]], [[RHS]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i1> [[CMP]] to i8 -; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0 -; CHECK-NEXT: ret i1 [[TMP1]] +; CHECK-NEXT: [[ALL_EQ:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: ret i1 [[ALL_EQ]] ; bb: %lhs = load <8 x i16>, ptr %arg1 @@ -136,8 +136,8 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[LHS1:%.*]] = load i64, ptr [[ARG1:%.*]], align 8 ; CHECK-NEXT: [[RHS2:%.*]] = load i64, ptr [[ARG:%.*]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[LHS1]], [[RHS2]] -; CHECK-NEXT: ret i1 [[TMP2]] +; CHECK-NEXT: [[ALL_EQ:%.*]] = icmp ne i64 [[LHS1]], [[RHS2]] +; CHECK-NEXT: ret i1 [[ALL_EQ]] ; bb: %lhs = load <8 x i8>, ptr %arg1 @@ -155,8 +155,8 @@ ; CHECK-NEXT: [[RHS:%.*]] = load <8 x i16>, ptr [[ARG:%.*]], align 16 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <8 x i16> [[LHS]], [[RHS]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i1> [[CMP]] to i8 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i8 [[TMP0]], 0 -; CHECK-NEXT: ret i1 [[TMP1]] +; CHECK-NEXT: [[ALL_EQ:%.*]] = icmp ne i8 [[TMP0]], 0 +; CHECK-NEXT: ret i1 [[ALL_EQ]] ; bb: %lhs = load <8 x i16>, ptr %arg1 diff --git a/llvm/test/Transforms/InstCombine/reduction-or-sext-zext-i1.ll b/llvm/test/Transforms/InstCombine/reduction-or-sext-zext-i1.ll --- a/llvm/test/Transforms/InstCombine/reduction-or-sext-zext-i1.ll +++ b/llvm/test/Transforms/InstCombine/reduction-or-sext-zext-i1.ll @@ -6,8 +6,8 @@ define i1 @reduce_or_self(<8 x i1> %x) { ; CHECK-LABEL: @reduce_or_self( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i8 [[TMP1]], 0 -; CHECK-NEXT: ret i1 [[TMP2]] +; CHECK-NEXT: [[RES:%.*]] = icmp ne i8 [[TMP1]], 0 +; CHECK-NEXT: ret i1 [[RES]] ; %res = call i1 @llvm.vector.reduce.or.v8i32(<8 x i1> %x) ret i1 %res @@ -17,8 +17,8 @@ ; CHECK-LABEL: @reduce_or_sext( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i1> [[X:%.*]] to i4 ; CHECK-NEXT: 
[[TMP2:%.*]] = icmp ne i4 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: ret i32 [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP2]] to i32 +; CHECK-NEXT: ret i32 [[RES]] ; %sext = sext <4 x i1> %x to <4 x i32> %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %sext) @@ -29,8 +29,8 @@ ; CHECK-LABEL: @reduce_or_zext( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i8 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i64 -; CHECK-NEXT: ret i64 [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = zext i1 [[TMP2]] to i64 +; CHECK-NEXT: ret i64 [[RES]] ; %zext = zext <8 x i1> %x to <8 x i64> %res = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %zext) @@ -41,8 +41,8 @@ ; CHECK-LABEL: @reduce_or_sext_same( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i1> [[X:%.*]] to i16 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i16 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i16 -; CHECK-NEXT: ret i16 [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP2]] to i16 +; CHECK-NEXT: ret i16 [[RES]] ; %sext = sext <16 x i1> %x to <16 x i16> %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %sext) @@ -53,8 +53,8 @@ ; CHECK-LABEL: @reduce_or_zext_long( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <128 x i1> [[X:%.*]] to i128 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i8 -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP2]] to i8 +; CHECK-NEXT: ret i8 [[RES]] ; %sext = sext <128 x i1> %x to <128 x i8> %res = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %sext) @@ -66,11 +66,11 @@ ; CHECK-LABEL: @reduce_or_zext_long_external_use( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <128 x i1> [[X:%.*]] to i128 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i128 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = sext i1 [[TMP2]] to i8 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <128 x i1> [[X]], i64 0 -; CHECK-NEXT: [[EXT:%.*]] = sext i1 [[TMP4]] 
to i8 +; CHECK-NEXT: [[RES:%.*]] = sext i1 [[TMP2]] to i8 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <128 x i1> [[X]], i64 0 +; CHECK-NEXT: [[EXT:%.*]] = sext i1 [[TMP3]] to i8 ; CHECK-NEXT: store i8 [[EXT]], ptr @glob, align 1 -; CHECK-NEXT: ret i8 [[TMP3]] +; CHECK-NEXT: ret i8 [[RES]] ; %sext = sext <128 x i1> %x to <128 x i8> %res = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %sext) @@ -84,11 +84,11 @@ ; CHECK-LABEL: @reduce_or_zext_external_use( ; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i1> [[X:%.*]] to i8 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i8 [[TMP1]], 0 -; CHECK-NEXT: [[TMP3:%.*]] = zext i1 [[TMP2]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x i1> [[X]], i64 0 -; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[TMP4]] to i64 +; CHECK-NEXT: [[RES:%.*]] = zext i1 [[TMP2]] to i64 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x i1> [[X]], i64 0 +; CHECK-NEXT: [[EXT:%.*]] = zext i1 [[TMP3]] to i64 ; CHECK-NEXT: store i64 [[EXT]], ptr @glob1, align 8 -; CHECK-NEXT: ret i64 [[TMP3]] +; CHECK-NEXT: ret i64 [[RES]] ; %zext = zext <8 x i1> %x to <8 x i64> %res = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %zext) @@ -102,8 +102,8 @@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[LHS1:%.*]] = load i64, ptr [[ARG1:%.*]], align 8 ; CHECK-NEXT: [[RHS2:%.*]] = load i64, ptr [[ARG:%.*]], align 8 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i64 [[LHS1]], [[RHS2]] -; CHECK-NEXT: ret i1 [[DOTNOT]] +; CHECK-NEXT: [[ANY_NE_NOT:%.*]] = icmp eq i64 [[LHS1]], [[RHS2]] +; CHECK-NEXT: ret i1 [[ANY_NE_NOT]] ; bb: %lhs = load <8 x i8>, ptr %arg1 @@ -121,8 +121,8 @@ ; CHECK-NEXT: [[RHS:%.*]] = load <8 x i16>, ptr [[ARG:%.*]], align 16 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <8 x i16> [[LHS]], [[RHS]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i1> [[CMP]] to i8 -; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i8 [[TMP0]], 0 -; CHECK-NEXT: ret i1 [[DOTNOT]] +; CHECK-NEXT: [[ANY_NE_NOT:%.*]] = icmp eq i8 [[TMP0]], 0 +; CHECK-NEXT: ret i1 [[ANY_NE_NOT]] ; bb: %lhs = load <8 x i16>, ptr %arg1 @@ -139,8 +139,8 
@@ ; CHECK-NEXT: bb: ; CHECK-NEXT: [[LHS1:%.*]] = load i64, ptr [[ARG1:%.*]], align 8 ; CHECK-NEXT: [[RHS2:%.*]] = load i64, ptr [[ARG:%.*]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[LHS1]], [[RHS2]] -; CHECK-NEXT: ret i1 [[TMP2]] +; CHECK-NEXT: [[ANY_NE:%.*]] = icmp ne i64 [[LHS1]], [[RHS2]] +; CHECK-NEXT: ret i1 [[ANY_NE]] ; bb: %lhs = load <8 x i8>, ptr %arg1 @@ -157,8 +157,8 @@ ; CHECK-NEXT: [[RHS:%.*]] = load <8 x i16>, ptr [[ARG:%.*]], align 16 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne <8 x i16> [[LHS]], [[RHS]] ; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i1> [[CMP]] to i8 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i8 [[TMP0]], 0 -; CHECK-NEXT: ret i1 [[TMP1]] +; CHECK-NEXT: [[ANY_NE:%.*]] = icmp ne i8 [[TMP0]], 0 +; CHECK-NEXT: ret i1 [[ANY_NE]] ; bb: %lhs = load <8 x i16>, ptr %arg1 diff --git a/llvm/test/Transforms/InstCombine/select_meta.ll b/llvm/test/Transforms/InstCombine/select_meta.ll --- a/llvm/test/Transforms/InstCombine/select_meta.ll +++ b/llvm/test/Transforms/InstCombine/select_meta.ll @@ -65,8 +65,8 @@ define i64 @test43(i32 %a) nounwind { ; CHECK-LABEL: @test43( ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[A:%.*]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: ret i64 [[TMP2]] +; CHECK-NEXT: [[MAX:%.*]] = zext i32 [[TMP1]] to i64 +; CHECK-NEXT: ret i64 [[MAX]] ; %a_ext = sext i32 %a to i64 %is_a_nonnegative = icmp sgt i32 %a, -1 @@ -131,8 +131,8 @@ ; SMAX(SMAX(x, y), x) -> SMAX(x, y) define i32 @test30(i32 %x, i32 %y) { ; CHECK-LABEL: @test30( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 [[Y:%.*]]) +; CHECK-NEXT: ret i32 [[COND]] ; %cmp = icmp sgt i32 %x, %y %cond = select i1 %cmp, i32 %x, i32 %y, !prof !1 @@ -144,8 +144,8 @@ ; SMAX(SMAX(75, X), 36) -> SMAX(X, 75) define i32 @test70(i32 %x) { ; CHECK-LABEL: @test70( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 
@llvm.smax.i32(i32 [[X:%.*]], i32 75) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 75) +; CHECK-NEXT: ret i32 [[COND]] ; %cmp = icmp slt i32 %x, 75 %cond = select i1 %cmp, i32 75, i32 %x, !prof !1 @@ -158,8 +158,8 @@ ; SMIN(SMIN(X, 92), 11) -> SMIN(X, 11) define i32 @test72(i32 %x) { ; CHECK-LABEL: @test72( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 11) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[RETVAL:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 11) +; CHECK-NEXT: ret i32 [[RETVAL]] ; %cmp = icmp sgt i32 %x, 92 %cond = select i1 %cmp, i32 92, i32 %x, !prof !1 @@ -172,9 +172,9 @@ ; SMAX(SMAX(X, 36), 75) -> SMAX(X, 75) define i32 @test74(i32 %x) { ; CHECK-LABEL: @test74( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 36) -; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.umax.i32(i32 [[TMP1]], i32 75) -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[COND:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 36) +; CHECK-NEXT: [[RETVAL:%.*]] = call i32 @llvm.umax.i32(i32 [[COND]], i32 75) +; CHECK-NEXT: ret i32 [[RETVAL]] ; %cmp = icmp slt i32 %x, 36 %cond = select i1 %cmp, i32 36, i32 %x, !prof !1 @@ -187,8 +187,8 @@ define i32 @smin1(i32 %x) { ; CHECK-LABEL: @smin1( ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], -1 -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[SEL:%.*]] = xor i32 [[TMP1]], -1 +; CHECK-NEXT: ret i32 [[SEL]] ; %not_x = xor i32 %x, -1 %cmp = icmp sgt i32 %x, 0 @@ -200,8 +200,8 @@ define i32 @smin2(i32 %x) { ; CHECK-LABEL: @smin2( ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smax.i32(i32 [[X:%.*]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], -1 -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[SEL:%.*]] = xor i32 [[TMP1]], -1 +; CHECK-NEXT: ret i32 [[SEL]] ; %not_x = xor i32 %x, -1 %cmp = icmp slt i32 %x, 0 @@ -213,8 +213,8 @@ define i32 @smax1(i32 
%x) { ; CHECK-LABEL: @smax1( ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], -1 -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[SEL:%.*]] = xor i32 [[TMP1]], -1 +; CHECK-NEXT: ret i32 [[SEL]] ; %not_x = xor i32 %x, -1 %cmp = icmp slt i32 %x, 0 @@ -226,8 +226,8 @@ define i32 @smax2(i32 %x) { ; CHECK-LABEL: @smax2( ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.smin.i32(i32 [[X:%.*]], i32 0) -; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[TMP1]], -1 -; CHECK-NEXT: ret i32 [[TMP2]] +; CHECK-NEXT: [[SEL:%.*]] = xor i32 [[TMP1]], -1 +; CHECK-NEXT: ret i32 [[SEL]] ; %not_x = xor i32 %x, -1 %cmp = icmp sgt i32 %x, 0 @@ -238,8 +238,8 @@ ; The compare should change, but the metadata remains the same because the select operands are not swapped. define i32 @umin1(i32 %x) { ; CHECK-LABEL: @umin1( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 -2147483648) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 -2147483648) +; CHECK-NEXT: ret i32 [[SEL]] ; %cmp = icmp sgt i32 %x, -1 %sel = select i1 %cmp, i32 %x, i32 -2147483648, !prof !1 @@ -249,8 +249,8 @@ ; The compare should change, and the metadata is swapped because the select operands are swapped. define i32 @umin2(i32 %x) { ; CHECK-LABEL: @umin2( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[SEL:%.*]] = call i32 @llvm.umin.i32(i32 [[X:%.*]], i32 2147483647) +; CHECK-NEXT: ret i32 [[SEL]] ; %cmp = icmp slt i32 %x, 0 %sel = select i1 %cmp, i32 2147483647, i32 %x, !prof !1 @@ -260,8 +260,8 @@ ; The compare should change, but the metadata remains the same because the select operands are not swapped. 
define i32 @umax1(i32 %x) { ; CHECK-LABEL: @umax1( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.umax.i32(i32 [[X:%.*]], i32 2147483647) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[SEL:%.*]] = call i32 @llvm.umax.i32(i32 [[X:%.*]], i32 2147483647) +; CHECK-NEXT: ret i32 [[SEL]] ; %cmp = icmp slt i32 %x, 0 %sel = select i1 %cmp, i32 %x, i32 2147483647, !prof !1 @@ -271,8 +271,8 @@ ; The compare should change, and the metadata is swapped because the select operands are swapped. define i32 @umax2(i32 %x) { ; CHECK-LABEL: @umax2( -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.umax.i32(i32 [[X:%.*]], i32 -2147483648) -; CHECK-NEXT: ret i32 [[TMP1]] +; CHECK-NEXT: [[SEL:%.*]] = call i32 @llvm.umax.i32(i32 [[X:%.*]], i32 -2147483648) +; CHECK-NEXT: ret i32 [[SEL]] ; %cmp = icmp sgt i32 %x, -1 %sel = select i1 %cmp, i32 -2147483648, i32 %x, !prof !1 diff --git a/llvm/test/Transforms/PhaseOrdering/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/Transforms/PhaseOrdering/AMDGPU/vector-alloca-bitcast.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AMDGPU/vector-alloca-bitcast.ll @@ -0,0 +1,404 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-promote-alloca,sroa,instcombine' < %s | FileCheck -check-prefix=OPT %s + + +target datalayout = "A5" + +define amdgpu_kernel void @vector_read_alloca_bitcast(ptr addrspace(1) %out, i32 %index) { +; OPT-LABEL: define amdgpu_kernel void @vector_read_alloca_bitcast +; OPT-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[INDEX:%.*]]) { +; OPT-NEXT: entry: +; OPT-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> , i32 [[INDEX]] +; OPT-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT]], align 4 +; OPT-NEXT: ret void +; +entry: + %tmpvar = alloca [4 x i32], addrspace(5) + %y = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 2 + %w = getelementptr 
[4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmpvar + store i32 1, ptr addrspace(5) %y + store i32 2, ptr addrspace(5) %z + store i32 3, ptr addrspace(5) %w + %tmpvar1 = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 %index + %tmpvar2 = load i32, ptr addrspace(5) %tmpvar1 + store i32 %tmpvar2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_write_alloca_bitcast(ptr addrspace(1) %out, i32 %w_index, i32 %r_index) { +; OPT-LABEL: define amdgpu_kernel void @vector_write_alloca_bitcast +; OPT-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[W_INDEX:%.*]], i32 [[R_INDEX:%.*]]) { +; OPT-NEXT: entry: +; OPT-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> zeroinitializer, i32 1, i32 [[W_INDEX]] +; OPT-NEXT: [[TMP1:%.*]] = extractelement <4 x i32> [[TMP0]], i32 [[R_INDEX]] +; OPT-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[OUT]], align 4 +; OPT-NEXT: ret void +; +entry: + %tmpvar = alloca [4 x i32], addrspace(5) + %y = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmpvar + store i32 0, ptr addrspace(5) %y + store i32 0, ptr addrspace(5) %z + store i32 0, ptr addrspace(5) %w + %tmpvar1 = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 %w_index + store i32 1, ptr addrspace(5) %tmpvar1 + %tmpvar2 = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 %r_index + %tmpvar3 = load i32, ptr addrspace(5) %tmpvar2 + store i32 %tmpvar3, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_write_read_bitcast_to_float(ptr addrspace(1) %arg) { +; OPT-LABEL: define amdgpu_kernel void @vector_write_read_bitcast_to_float +; OPT-SAME: (ptr addrspace(1) [[ARG:%.*]]) { +; OPT-NEXT: bb: +; OPT-NEXT: br label [[BB2:%.*]] +; OPT: bb2: +; OPT-NEXT: [[TMPVAR_SROA_0_0:%.*]] = phi <6 x float> [ undef, 
[[BB:%.*]] ], [ [[TMP0:%.*]], [[BB2]] ] +; OPT-NEXT: [[TMPVAR3:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMPVAR13:%.*]], [[BB2]] ] +; OPT-NEXT: [[TMPVAR4:%.*]] = zext i32 [[TMPVAR3]] to i64 +; OPT-NEXT: [[TMPVAR5:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i64 [[TMPVAR4]] +; OPT-NEXT: [[TMPVAR72:%.*]] = load float, ptr addrspace(1) [[TMPVAR5]], align 4 +; OPT-NEXT: [[TMPVAR8:%.*]] = trunc i32 [[TMPVAR3]] to i16 +; OPT-NEXT: [[TMPVAR9:%.*]] = urem i16 [[TMPVAR8]], 6 +; OPT-NEXT: [[TMPVAR10:%.*]] = zext i16 [[TMPVAR9]] to i32 +; OPT-NEXT: [[TMP0]] = insertelement <6 x float> [[TMPVAR_SROA_0_0]], float [[TMPVAR72]], i32 [[TMPVAR10]] +; OPT-NEXT: [[TMPVAR13]] = add nuw nsw i32 [[TMPVAR3]], 1 +; OPT-NEXT: [[TMPVAR14:%.*]] = icmp eq i32 [[TMPVAR13]], 1000 +; OPT-NEXT: br i1 [[TMPVAR14]], label [[DOTPREHEADER:%.*]], label [[BB2]] +; OPT: bb15: +; OPT-NEXT: ret void +; OPT: .preheader: +; OPT-NEXT: [[TMPVAR16:%.*]] = phi i32 [ [[TMPVAR27:%.*]], [[DOTPREHEADER]] ], [ 0, [[BB2]] ] +; OPT-NEXT: [[TMPVAR17:%.*]] = trunc i32 [[TMPVAR16]] to i16 +; OPT-NEXT: [[TMPVAR18:%.*]] = urem i16 [[TMPVAR17]], 6 +; OPT-NEXT: [[TMPVAR19:%.*]] = sub nuw nsw i16 5, [[TMPVAR18]] +; OPT-NEXT: [[TMPVAR20:%.*]] = zext i16 [[TMPVAR19]] to i32 +; OPT-NEXT: [[BC:%.*]] = bitcast <6 x float> [[TMP0]] to <6 x i32> +; OPT-NEXT: [[TMP1:%.*]] = extractelement <6 x i32> [[BC]], i32 [[TMPVAR20]] +; OPT-NEXT: [[TMPVAR24:%.*]] = zext i32 [[TMPVAR16]] to i64 +; OPT-NEXT: [[TMPVAR25:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARG]], i64 [[TMPVAR24]] +; OPT-NEXT: store i32 [[TMP1]], ptr addrspace(1) [[TMPVAR25]], align 4 +; OPT-NEXT: [[TMPVAR27]] = add nuw nsw i32 [[TMPVAR16]], 1 +; OPT-NEXT: [[TMPVAR28:%.*]] = icmp eq i32 [[TMPVAR27]], 1000 +; OPT-NEXT: br i1 [[TMPVAR28]], label [[BB15:%.*]], label [[DOTPREHEADER]] +; +bb: + %tmpvar = alloca [6 x float], align 4, addrspace(5) + call void @llvm.lifetime.start.p5(i64 24, ptr addrspace(5) %tmpvar) #2 + br label %bb2 + +bb2: ; preds = 
%bb2, %bb + %tmpvar3 = phi i32 [ 0, %bb ], [ %tmpvar13, %bb2 ] + %tmpvar4 = zext i32 %tmpvar3 to i64 + %tmpvar5 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmpvar4 + %tmpvar7 = load i32, ptr addrspace(1) %tmpvar5, align 4 + %tmpvar8 = trunc i32 %tmpvar3 to i16 + %tmpvar9 = urem i16 %tmpvar8, 6 + %tmpvar10 = zext i16 %tmpvar9 to i32 + %tmpvar11 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar10 + store i32 %tmpvar7, ptr addrspace(5) %tmpvar11, align 4 + %tmpvar13 = add nuw nsw i32 %tmpvar3, 1 + %tmpvar14 = icmp eq i32 %tmpvar13, 1000 + br i1 %tmpvar14, label %.preheader, label %bb2 + +bb15: ; preds = %.preheader + call void @llvm.lifetime.end.p5(i64 24, ptr addrspace(5) %tmpvar) #2 + ret void + +.preheader: ; preds = %.preheader, %bb2 + %tmpvar16 = phi i32 [ %tmpvar27, %.preheader ], [ 0, %bb2 ] + %tmpvar17 = trunc i32 %tmpvar16 to i16 + %tmpvar18 = urem i16 %tmpvar17, 6 + %tmpvar19 = sub nuw nsw i16 5, %tmpvar18 + %tmpvar20 = zext i16 %tmpvar19 to i32 + %tmpvar21 = getelementptr inbounds [6 x float], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar20 + %tmpvar23 = load i32, ptr addrspace(5) %tmpvar21, align 4 + %tmpvar24 = zext i32 %tmpvar16 to i64 + %tmpvar25 = getelementptr inbounds float, ptr addrspace(1) %arg, i64 %tmpvar24 + store i32 %tmpvar23, ptr addrspace(1) %tmpvar25, align 4 + %tmpvar27 = add nuw nsw i32 %tmpvar16, 1 + %tmpvar28 = icmp eq i32 %tmpvar27, 1000 + br i1 %tmpvar28, label %bb15, label %.preheader +} + +define amdgpu_kernel void @vector_write_read_bitcast_to_double(ptr addrspace(1) %arg) { +; OPT-LABEL: define amdgpu_kernel void @vector_write_read_bitcast_to_double +; OPT-SAME: (ptr addrspace(1) [[ARG:%.*]]) { +; OPT-NEXT: bb: +; OPT-NEXT: br label [[BB2:%.*]] +; OPT: bb2: +; OPT-NEXT: [[TMPVAR_SROA_0_0:%.*]] = phi <6 x double> [ undef, [[BB:%.*]] ], [ [[TMP0:%.*]], [[BB2]] ] +; OPT-NEXT: [[TMPVAR3:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMPVAR13:%.*]], [[BB2]] ] +; OPT-NEXT: [[TMPVAR4:%.*]] = zext i32 
[[TMPVAR3]] to i64 +; OPT-NEXT: [[TMPVAR5:%.*]] = getelementptr inbounds double, ptr addrspace(1) [[ARG]], i64 [[TMPVAR4]] +; OPT-NEXT: [[TMPVAR72:%.*]] = load double, ptr addrspace(1) [[TMPVAR5]], align 8 +; OPT-NEXT: [[TMPVAR8:%.*]] = trunc i32 [[TMPVAR3]] to i16 +; OPT-NEXT: [[TMPVAR9:%.*]] = urem i16 [[TMPVAR8]], 6 +; OPT-NEXT: [[TMPVAR10:%.*]] = zext i16 [[TMPVAR9]] to i32 +; OPT-NEXT: [[TMP0]] = insertelement <6 x double> [[TMPVAR_SROA_0_0]], double [[TMPVAR72]], i32 [[TMPVAR10]] +; OPT-NEXT: [[TMPVAR13]] = add nuw nsw i32 [[TMPVAR3]], 1 +; OPT-NEXT: [[TMPVAR14:%.*]] = icmp eq i32 [[TMPVAR13]], 1000 +; OPT-NEXT: br i1 [[TMPVAR14]], label [[DOTPREHEADER:%.*]], label [[BB2]] +; OPT: bb15: +; OPT-NEXT: ret void +; OPT: .preheader: +; OPT-NEXT: [[TMPVAR16:%.*]] = phi i32 [ [[TMPVAR27:%.*]], [[DOTPREHEADER]] ], [ 0, [[BB2]] ] +; OPT-NEXT: [[TMPVAR17:%.*]] = trunc i32 [[TMPVAR16]] to i16 +; OPT-NEXT: [[TMPVAR18:%.*]] = urem i16 [[TMPVAR17]], 6 +; OPT-NEXT: [[TMPVAR19:%.*]] = sub nuw nsw i16 5, [[TMPVAR18]] +; OPT-NEXT: [[TMPVAR20:%.*]] = zext i16 [[TMPVAR19]] to i32 +; OPT-NEXT: [[BC:%.*]] = bitcast <6 x double> [[TMP0]] to <6 x i64> +; OPT-NEXT: [[TMP1:%.*]] = extractelement <6 x i64> [[BC]], i32 [[TMPVAR20]] +; OPT-NEXT: [[TMPVAR24:%.*]] = zext i32 [[TMPVAR16]] to i64 +; OPT-NEXT: [[TMPVAR25:%.*]] = getelementptr inbounds double, ptr addrspace(1) [[ARG]], i64 [[TMPVAR24]] +; OPT-NEXT: store i64 [[TMP1]], ptr addrspace(1) [[TMPVAR25]], align 8 +; OPT-NEXT: [[TMPVAR27]] = add nuw nsw i32 [[TMPVAR16]], 1 +; OPT-NEXT: [[TMPVAR28:%.*]] = icmp eq i32 [[TMPVAR27]], 1000 +; OPT-NEXT: br i1 [[TMPVAR28]], label [[BB15:%.*]], label [[DOTPREHEADER]] +; +bb: + %tmpvar = alloca [6 x double], align 8, addrspace(5) + call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmpvar) #2 + br label %bb2 + +bb2: ; preds = %bb2, %bb + %tmpvar3 = phi i32 [ 0, %bb ], [ %tmpvar13, %bb2 ] + %tmpvar4 = zext i32 %tmpvar3 to i64 + %tmpvar5 = getelementptr inbounds double, ptr addrspace(1) 
%arg, i64 %tmpvar4 + %tmpvar7 = load i64, ptr addrspace(1) %tmpvar5, align 8 + %tmpvar8 = trunc i32 %tmpvar3 to i16 + %tmpvar9 = urem i16 %tmpvar8, 6 + %tmpvar10 = zext i16 %tmpvar9 to i32 + %tmpvar11 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar10 + store i64 %tmpvar7, ptr addrspace(5) %tmpvar11, align 8 + %tmpvar13 = add nuw nsw i32 %tmpvar3, 1 + %tmpvar14 = icmp eq i32 %tmpvar13, 1000 + br i1 %tmpvar14, label %.preheader, label %bb2 + +bb15: ; preds = %.preheader + call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmpvar) #2 + ret void + +.preheader: ; preds = %.preheader, %bb2 + %tmpvar16 = phi i32 [ %tmpvar27, %.preheader ], [ 0, %bb2 ] + %tmpvar17 = trunc i32 %tmpvar16 to i16 + %tmpvar18 = urem i16 %tmpvar17, 6 + %tmpvar19 = sub nuw nsw i16 5, %tmpvar18 + %tmpvar20 = zext i16 %tmpvar19 to i32 + %tmpvar21 = getelementptr inbounds [6 x double], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar20 + %tmpvar23 = load i64, ptr addrspace(5) %tmpvar21, align 8 + %tmpvar24 = zext i32 %tmpvar16 to i64 + %tmpvar25 = getelementptr inbounds double, ptr addrspace(1) %arg, i64 %tmpvar24 + store i64 %tmpvar23, ptr addrspace(1) %tmpvar25, align 8 + %tmpvar27 = add nuw nsw i32 %tmpvar16, 1 + %tmpvar28 = icmp eq i32 %tmpvar27, 1000 + br i1 %tmpvar28, label %bb15, label %.preheader +} + +define amdgpu_kernel void @vector_write_read_bitcast_to_i64(ptr addrspace(1) %arg) { +; OPT-LABEL: define amdgpu_kernel void @vector_write_read_bitcast_to_i64 +; OPT-SAME: (ptr addrspace(1) [[ARG:%.*]]) { +; OPT-NEXT: bb: +; OPT-NEXT: br label [[BB2:%.*]] +; OPT: bb2: +; OPT-NEXT: [[TMPVAR_SROA_0_0:%.*]] = phi <6 x i64> [ undef, [[BB:%.*]] ], [ [[TMP0:%.*]], [[BB2]] ] +; OPT-NEXT: [[TMPVAR3:%.*]] = phi i32 [ 0, [[BB]] ], [ [[TMPVAR11:%.*]], [[BB2]] ] +; OPT-NEXT: [[TMPVAR4:%.*]] = zext i32 [[TMPVAR3]] to i64 +; OPT-NEXT: [[TMPVAR5:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[ARG]], i64 [[TMPVAR4]] +; OPT-NEXT: [[TMPVAR6:%.*]] = load i64, ptr 
addrspace(1) [[TMPVAR5]], align 8 +; OPT-NEXT: [[TMPVAR7:%.*]] = trunc i32 [[TMPVAR3]] to i16 +; OPT-NEXT: [[TMPVAR8:%.*]] = urem i16 [[TMPVAR7]], 6 +; OPT-NEXT: [[TMPVAR9:%.*]] = zext i16 [[TMPVAR8]] to i32 +; OPT-NEXT: [[TMP0]] = insertelement <6 x i64> [[TMPVAR_SROA_0_0]], i64 [[TMPVAR6]], i32 [[TMPVAR9]] +; OPT-NEXT: [[TMPVAR11]] = add nuw nsw i32 [[TMPVAR3]], 1 +; OPT-NEXT: [[TMPVAR12:%.*]] = icmp eq i32 [[TMPVAR11]], 1000 +; OPT-NEXT: br i1 [[TMPVAR12]], label [[DOTPREHEADER:%.*]], label [[BB2]] +; OPT: bb13: +; OPT-NEXT: ret void +; OPT: .preheader: +; OPT-NEXT: [[TMPVAR14:%.*]] = phi i32 [ [[TMPVAR23:%.*]], [[DOTPREHEADER]] ], [ 0, [[BB2]] ] +; OPT-NEXT: [[TMPVAR15:%.*]] = trunc i32 [[TMPVAR14]] to i16 +; OPT-NEXT: [[TMPVAR16:%.*]] = urem i16 [[TMPVAR15]], 6 +; OPT-NEXT: [[TMPVAR17:%.*]] = sub nuw nsw i16 5, [[TMPVAR16]] +; OPT-NEXT: [[TMPVAR18:%.*]] = zext i16 [[TMPVAR17]] to i32 +; OPT-NEXT: [[TMP1:%.*]] = extractelement <6 x i64> [[TMP0]], i32 [[TMPVAR18]] +; OPT-NEXT: [[TMPVAR21:%.*]] = zext i32 [[TMPVAR14]] to i64 +; OPT-NEXT: [[TMPVAR22:%.*]] = getelementptr inbounds i64, ptr addrspace(1) [[ARG]], i64 [[TMPVAR21]] +; OPT-NEXT: store i64 [[TMP1]], ptr addrspace(1) [[TMPVAR22]], align 8 +; OPT-NEXT: [[TMPVAR23]] = add nuw nsw i32 [[TMPVAR14]], 1 +; OPT-NEXT: [[TMPVAR24:%.*]] = icmp eq i32 [[TMPVAR23]], 1000 +; OPT-NEXT: br i1 [[TMPVAR24]], label [[BB13:%.*]], label [[DOTPREHEADER]] +; +bb: + %tmpvar = alloca [6 x i64], align 8, addrspace(5) + call void @llvm.lifetime.start.p5(i64 48, ptr addrspace(5) %tmpvar) #2 + br label %bb2 + +bb2: ; preds = %bb2, %bb + %tmpvar3 = phi i32 [ 0, %bb ], [ %tmpvar11, %bb2 ] + %tmpvar4 = zext i32 %tmpvar3 to i64 + %tmpvar5 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmpvar4 + %tmpvar6 = load i64, ptr addrspace(1) %tmpvar5, align 8 + %tmpvar7 = trunc i32 %tmpvar3 to i16 + %tmpvar8 = urem i16 %tmpvar7, 6 + %tmpvar9 = zext i16 %tmpvar8 to i32 + %tmpvar10 = getelementptr inbounds [6 x i64], ptr addrspace(5) 
%tmpvar, i32 0, i32 %tmpvar9 + store i64 %tmpvar6, ptr addrspace(5) %tmpvar10, align 8 + %tmpvar11 = add nuw nsw i32 %tmpvar3, 1 + %tmpvar12 = icmp eq i32 %tmpvar11, 1000 + br i1 %tmpvar12, label %.preheader, label %bb2 + +bb13: ; preds = %.preheader + call void @llvm.lifetime.end.p5(i64 48, ptr addrspace(5) %tmpvar) #2 + ret void + +.preheader: ; preds = %.preheader, %bb2 + %tmpvar14 = phi i32 [ %tmpvar23, %.preheader ], [ 0, %bb2 ] + %tmpvar15 = trunc i32 %tmpvar14 to i16 + %tmpvar16 = urem i16 %tmpvar15, 6 + %tmpvar17 = sub nuw nsw i16 5, %tmpvar16 + %tmpvar18 = zext i16 %tmpvar17 to i32 + %tmpvar19 = getelementptr inbounds [6 x i64], ptr addrspace(5) %tmpvar, i32 0, i32 %tmpvar18 + %tmpvar20 = load i64, ptr addrspace(5) %tmpvar19, align 8 + %tmpvar21 = zext i32 %tmpvar14 to i64 + %tmpvar22 = getelementptr inbounds i64, ptr addrspace(1) %arg, i64 %tmpvar21 + store i64 %tmpvar20, ptr addrspace(1) %tmpvar22, align 8 + %tmpvar23 = add nuw nsw i32 %tmpvar14, 1 + %tmpvar24 = icmp eq i32 %tmpvar23, 1000 + br i1 %tmpvar24, label %bb13, label %.preheader +} + +define amdgpu_kernel void @vector_read_alloca_bitcast_assume(ptr addrspace(1) %out, i32 %index) { +; OPT-LABEL: define amdgpu_kernel void @vector_read_alloca_bitcast_assume +; OPT-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[INDEX:%.*]]) { +; OPT-NEXT: entry: +; OPT-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 [[INDEX]] +; OPT-NEXT: store i32 [[TMP0]], ptr addrspace(1) [[OUT]], align 4 +; OPT-NEXT: ret void +; +entry: + %tmpvar = alloca [4 x i32], addrspace(5) + %cmp = icmp ne ptr addrspace(5) %tmpvar, null + call void @llvm.assume(i1 %cmp) + %y = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmpvar + store i32 1, ptr addrspace(5) %y + store i32 2, ptr addrspace(5) %z + store i32 3, ptr addrspace(5) %w + %tmpvar1 = 
getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 %index + %tmpvar2 = load i32, ptr addrspace(5) %tmpvar1 + store i32 %tmpvar2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_read_alloca_multiuse(ptr addrspace(1) %out, i32 %index) { +; OPT-LABEL: define amdgpu_kernel void @vector_read_alloca_multiuse +; OPT-SAME: (ptr addrspace(1) [[OUT:%.*]], i32 [[INDEX:%.*]]) { +; OPT-NEXT: entry: +; OPT-NEXT: [[TMP0:%.*]] = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 [[INDEX]] +; OPT-NEXT: [[ADD2:%.*]] = add nuw nsw i32 [[TMP0]], 1 +; OPT-NEXT: store i32 [[ADD2]], ptr addrspace(1) [[OUT]], align 4 +; OPT-NEXT: ret void +; +entry: + %tmpvar = alloca [4 x i32], addrspace(5) + %y = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 1 + %z = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 2 + %w = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 3 + store i32 0, ptr addrspace(5) %tmpvar + store i32 1, ptr addrspace(5) %y + store i32 2, ptr addrspace(5) %z + store i32 3, ptr addrspace(5) %w + %tmpvar1 = getelementptr [4 x i32], ptr addrspace(5) %tmpvar, i32 0, i32 %index + %tmpvar2 = load i32, ptr addrspace(5) %tmpvar1 + %tmpvar3 = load i32, ptr addrspace(5) %tmpvar + %tmpvar4 = load i32, ptr addrspace(5) %y + %add1 = add i32 %tmpvar2, %tmpvar3 + %add2 = add i32 %add1, %tmpvar4 + store i32 %add2, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @bitcast_vector_to_vector(ptr addrspace(1) %out) { +; OPT-LABEL: define amdgpu_kernel void @bitcast_vector_to_vector +; OPT-SAME: (ptr addrspace(1) [[OUT:%.*]]) { +; OPT-NEXT: .entry: +; OPT-NEXT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(1) [[OUT]], align 16 +; OPT-NEXT: ret void +; +.entry: + %alloca = alloca <4 x float>, align 16, addrspace(5) + store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca + %load = load <4 x i32>, ptr addrspace(5) %alloca, align 16 + store <4 x i32> %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void 
@vector_bitcast_from_alloca_array(ptr addrspace(1) %out) { +; OPT-LABEL: define amdgpu_kernel void @vector_bitcast_from_alloca_array +; OPT-SAME: (ptr addrspace(1) [[OUT:%.*]]) { +; OPT-NEXT: .entry: +; OPT-NEXT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(1) [[OUT]], align 16 +; OPT-NEXT: ret void +; +.entry: + %alloca = alloca [4 x float], align 16, addrspace(5) + store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %alloca + %load = load <4 x i32>, ptr addrspace(5) %alloca, align 16 + store <4 x i32> %load, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array(ptr addrspace(1) %out) { +; OPT-LABEL: define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array +; OPT-SAME: (ptr addrspace(1) [[OUT:%.*]]) { +; OPT-NEXT: store i32 1, ptr addrspace(1) [[OUT]], align 4 +; OPT-NEXT: [[OUT_REPACK1:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) [[OUT]], i64 0, i64 1 +; OPT-NEXT: store i32 2, ptr addrspace(1) [[OUT_REPACK1]], align 4 +; OPT-NEXT: [[OUT_REPACK2:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) [[OUT]], i64 0, i64 2 +; OPT-NEXT: store i32 3, ptr addrspace(1) [[OUT_REPACK2]], align 4 +; OPT-NEXT: [[OUT_REPACK3:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) [[OUT]], i64 0, i64 3 +; OPT-NEXT: store i32 4, ptr addrspace(1) [[OUT_REPACK3]], align 4 +; OPT-NEXT: ret void +; + %alloca = alloca [4 x float], align 16, addrspace(5) + store [4 x i32] [i32 1, i32 2, i32 3, i32 4], ptr addrspace(5) %alloca + %load = load [4 x i32], ptr addrspace(5) %alloca, align 16 + store [4 x i32] %load, ptr addrspace(1) %out + ret void +} + +%struct.v4 = type { i32, i32, i32, i32 } + +define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(ptr addrspace(1) %out) { +; OPT-LABEL: define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array +; OPT-SAME: (ptr addrspace(1) [[OUT:%.*]]) { +; OPT-NEXT: store i32 1, ptr addrspace(1) [[OUT]], align 4 +; OPT-NEXT: [[OUT_REPACK1:%.*]] = getelementptr 
inbounds [[STRUCT_V4:%.*]], ptr addrspace(1) [[OUT]], i64 0, i32 1 +; OPT-NEXT: store i32 2, ptr addrspace(1) [[OUT_REPACK1]], align 4 +; OPT-NEXT: [[OUT_REPACK2:%.*]] = getelementptr inbounds [[STRUCT_V4]], ptr addrspace(1) [[OUT]], i64 0, i32 2 +; OPT-NEXT: store i32 3, ptr addrspace(1) [[OUT_REPACK2]], align 4 +; OPT-NEXT: [[OUT_REPACK3:%.*]] = getelementptr inbounds [[STRUCT_V4]], ptr addrspace(1) [[OUT]], i64 0, i32 3 +; OPT-NEXT: store i32 4, ptr addrspace(1) [[OUT_REPACK3]], align 4 +; OPT-NEXT: ret void +; + %alloca = alloca [4 x float], align 16, addrspace(5) + store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, ptr addrspace(5) %alloca + %load = load %struct.v4, ptr addrspace(5) %alloca, align 16 + store %struct.v4 %load, ptr addrspace(1) %out + ret void +} + +declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) + +declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) + +declare void @llvm.assume(i1) diff --git a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -O3 -S < %s | FileCheck %s + +%struct.ss = type { ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr, ptr } + +define internal void @phantomLoad(ptr %p, ptr %y, ptr %x) { +entry: + %0 = load i32, ptr %x + store i32 %0, ptr %y + ret void +} + +define ptr @parent(ptr align 8 dereferenceable(72) %f, half %val1, i16 %val2, i32 %val3) align 2 { +; CHECK-LABEL: define nonnull ptr @parent +; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], half [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[F]], i64 64 +; CHECK-NEXT: [[F_VAL:%.*]] = 
load ptr, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[CMP_NOT_NOT_I:%.*]] = icmp eq i32 [[VAL3]], 0 +; CHECK-NEXT: br i1 [[CMP_NOT_NOT_I]], label [[IF_THEN_I:%.*]], label [[IF_ELSE_I:%.*]] +; CHECK: if.then.i: +; CHECK-NEXT: store half [[VAL1]], ptr [[F_VAL]], align 2 +; CHECK-NEXT: [[ADD_PTR_I_I_I_I_I:%.*]] = getelementptr inbounds i8, ptr [[F_VAL]], i64 16 +; CHECK-NEXT: br label [[BADCHILD_EXIT:%.*]] +; CHECK: if.else.i: +; CHECK-NEXT: [[ADD_PTR_I_I_I_I7_I:%.*]] = getelementptr inbounds i8, ptr [[F_VAL]], i64 16 +; CHECK-NEXT: store half [[VAL1]], ptr [[ADD_PTR_I_I_I_I7_I]], align 2 +; CHECK-NEXT: br label [[BADCHILD_EXIT]] +; CHECK: badChild.exit: +; CHECK-NEXT: [[THIS_64_VAL_SINK_I:%.*]] = phi ptr [ [[F_VAL]], [[IF_ELSE_I]] ], [ [[ADD_PTR_I_I_I_I_I]], [[IF_THEN_I]] ] +; CHECK-NEXT: store i16 [[VAL2]], ptr [[THIS_64_VAL_SINK_I]], align 2 +; CHECK-NEXT: ret ptr [[F]] +; +entry: + call void @badChild(ptr align 8 dereferenceable(72) %f, half %val1, i16 %val2, i32 %val3) #4 + ret ptr %f +} + + +define internal void @badChild(ptr align 8 dereferenceable(72) %this, half %val1, i16 %val2, i32 %val3) align 2 { +entry: + %othergep = getelementptr inbounds %struct.ss, ptr %this, i64 0, i32 2 + %load0 = load ptr, ptr %othergep, align 8 + %x = alloca i32 + %y = alloca i32 + call void @phantomLoad(ptr %load0, ptr %x, ptr %y) + %val1.cast = bitcast half %val1 to i16 + %cmp.not.not = icmp eq i32 %val3, 0 + br i1 %cmp.not.not, label %if.then, label %if.else +if.then: ; preds = %entry + %0 = getelementptr inbounds %struct.ss, ptr %this, i64 0, i32 8 + %1 = load ptr, ptr %0, align 8 + store i16 %val1.cast, ptr %1, align 2 + %add.ptr.i.i.i.i = getelementptr inbounds i8, ptr %1, i64 16 + store i16 %val2, ptr %add.ptr.i.i.i.i, align 2 + br label %if.end +if.else: ; preds = %entry + %2 = getelementptr inbounds %struct.ss, ptr %this, i64 0, i32 8 + %3 = load ptr, ptr %2, align 8 + %add.ptr.i.i.i.i7 = getelementptr inbounds i8, ptr %3, i64 16 + store i16 %val1.cast, ptr %add.ptr.i.i.i.i7, 
align 2 + store i16 %val2, ptr %3, align 2 + br label %if.end +if.end: ; preds = %if.else, %if.then + ret void +}