diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -14,6 +14,16 @@
 }
 
+class UniformSextInreg<ValueType VT> : PatFrag<
+  (ops node:$src),
+  (sext_inreg $src, VT),
+  [{ return !N->isDivergent(); }]>;
+
+class DivergentSextInreg<ValueType VT> : PatFrag<
+  (ops node:$src),
+  (sext_inreg $src, VT),
+  [{ return N->isDivergent(); }]>;
+
 include "SOPInstructions.td"
 include "VOPInstructions.td"
 include "SMInstructions.td"
@@ -1939,12 +1949,6 @@
 //===----------------------------------------------------------------------===//
 // Conversion Patterns
 //===----------------------------------------------------------------------===//
-
-class UniformSextInreg<ValueType VT> : PatFrag<
-  (ops node:$src),
-  (sext_inreg $src, VT),
-  [{ return !N->isDivergent(); }]>;
-
 def : GCNPat<(i32 (UniformSextInreg<i1> i32:$src)),
   (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
 
@@ -1979,13 +1983,8 @@
   (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
 >;
 
-
-class DivergentSextInreg<ValueType VT> : PatFrag<
-  (ops node:$src),
-  (sext_inreg $src, VT),
-  [{ return N->isDivergent(); }]>;
-
-def : GCNPat<(i32 (DivergentSextInreg<i1> i32:$src)),
+def : GCNPat<
+  (i32 (DivergentSextInreg<i1> i32:$src)),
   (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
 
 def : GCNPat <
@@ -1998,6 +1997,16 @@
   (V_BFE_I32_e64 $src, (i32 0), (i32 8))
 >;
 
+def : GCNPat<
+  (i32 (DivergentSextInreg<i8> i32:$src)),
+  (V_BFE_I32_e64 i32:$src, (i32 0), (i32 8))
+>;
+
+def : GCNPat <
+  (i32 (DivergentSextInreg<i16> i32:$src)),
+  (V_BFE_I32_e64 $src, (i32 0), (i32 16))
+>;
+
 def : GCNPat <
   (i64 (DivergentSextInreg<i32> i64:$src)),
   (REG_SEQUENCE VReg_64,
@@ -2051,11 +2060,17 @@
 // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
 // REG_SEQUENCE patterns don't support instructions with multiple outputs.
 def : GCNPat <
-  (i64 (sext i32:$src)),
+  (i64 (UniformUnaryFrag<sext> i32:$src)),
   (REG_SEQUENCE SReg_64, $src, sub0,
     (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
 >;
 
+def : GCNPat <
+  (i64 (DivergentUnaryFrag<sext> i32:$src)),
+  (REG_SEQUENCE VReg_64, $src, sub0,
+    (i32 (COPY_TO_REGCLASS (V_ASHRREV_I32_e64 (i32 31), $src), VGPR_32)), sub1)
+>;
+
 def : GCNPat <
   (i64 (sext i1:$src)),
   (REG_SEQUENCE VReg_64,
@@ -2232,6 +2247,18 @@
 // certainty what the source behavior is without more context on how
 // the src is lowered. e.g. fptrunc + fma may be lowered to a
 // v_fma_mix* instruction which does not zero, or may not.
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<zext> i16:$src)),
+  (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src)
+>;
+
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<zext> i16:$src)),
+  (REG_SEQUENCE VReg_64,
+    (V_AND_B32_e64 (S_MOV_B32 (i32 0xffff)), $src), sub0,
+    (S_MOV_B32 (i32 0)), sub1)
+>;
+
 def : GCNPat<
   (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
   (COPY VSrc_b16:$src)>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -276,10 +276,10 @@
 >;
 def S_FLBIT_I32_I64 : SOP1_32_64 <"s_flbit_i32_i64">;
 def S_SEXT_I32_I8 : SOP1_32 <"s_sext_i32_i8",
-  [(set i32:$sdst, (sext_inreg i32:$src0, i8))]
+  [(set i32:$sdst, (UniformSextInreg<i8> i32:$src0))]
 >;
 def S_SEXT_I32_I16 : SOP1_32 <"s_sext_i32_i16",
-  [(set i32:$sdst, (sext_inreg i32:$src0, i16))]
+  [(set i32:$sdst, (UniformSextInreg<i16> i32:$src0))]
 >;
 } // End isReMaterializable = 1
@@ -1408,7 +1408,7 @@
 // REG_SEQUENCE patterns don't support instructions with multiple
 // outputs.
 def : GCNPat<
-  (i64 (zext i16:$src)),
+  (i64 (UniformUnaryFrag<zext> i16:$src)),
   (REG_SEQUENCE SReg_64,
     (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0,
     (S_MOV_B32 (i32 0)), sub1)
@@ -1421,7 +1421,7 @@
 >;
 
 def : GCNPat<
-  (i32 (zext i16:$src)),
+  (i32 (UniformUnaryFrag<zext> i16:$src)),
   (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
 >;
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-sext-inreg.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn < %s | FileCheck -enable-var-scope --check-prefixes=GCN %s
+
+define amdgpu_kernel void @uniform_sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+; GCN-LABEL: uniform_sext_in_reg_i8_to_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_i32 s2, s4, s5
+; GCN-NEXT:    s_sext_i32_i8 s4, s2
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %c = add i32 %a, %b ; add to prevent folding into extload
+  %shl = shl i32 %c, 24
+  %ashr = ashr i32 %shl, 24
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @divergent_sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+; GCN-LABEL: divergent_sext_in_reg_i8_to_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_i32 s4, s4, s5
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %c = add i32 %a, %b ; add to prevent folding into extload
+  %c.divergent = add i32 %c, %tid
+  %shl = shl i32 %c.divergent, 24
+  %ashr = ashr i32 %shl, 24
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @uniform_sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+; GCN-LABEL: uniform_sext_in_reg_i16_to_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_i32 s2, s4, s5
+; GCN-NEXT:    s_sext_i32_i16 s4, s2
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %c = add i32 %a, %b ; add to prevent folding into extload
+  %shl = shl i32 %c, 16
+  %ashr = ashr i32 %shl, 16
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @divergent_sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+; GCN-LABEL: divergent_sext_in_reg_i16_to_i32:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_add_i32 s4, s4, s5
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %c = add i32 %a, %b ; add to prevent folding into extload
+  %c.divergent = add i32 %c, %tid
+  %shl = shl i32 %c.divergent, 16
+  %ashr = ashr i32 %shl, 16
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll
@@ -41,17 +41,15 @@
   ; GCN-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
   ; GCN-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 1, [[COPY1]], implicit $exec
   ; GCN-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 killed [[V_AND_B32_e64_]], 1, implicit $exec
-  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:sreg_32 = COPY [[COPY2]]
-  ; GCN-NEXT:   [[S_SEXT_I32_I16_:%[0-9]+]]:sreg_32 = S_SEXT_I32_I16 [[COPY3]]
+  ; GCN-NEXT:   [[V_BFE_I32_e64_:%[0-9]+]]:vgpr_32 = V_BFE_I32_e64 [[COPY2]], 0, 16, implicit $exec
   ; GCN-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
-  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:vgpr_32 = COPY killed [[S_MOV_B32_]]
-  ; GCN-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[S_SEXT_I32_I16_]], [[COPY4]], implicit $exec
+  ; GCN-NEXT:   [[V_CMP_LT_I32_e64_:%[0-9]+]]:sreg_64 = V_CMP_LT_I32_e64 killed [[V_BFE_I32_e64_]], killed [[S_MOV_B32_]], implicit $exec
   ; GCN-NEXT:   [[S_OR_B64_:%[0-9]+]]:sreg_64_xexec = S_OR_B64 killed [[V_CMP_LT_I32_e64_]], killed [[V_CMP_EQ_U32_e64_]], implicit-def dead $scc
   ; GCN-NEXT:   [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed [[S_OR_B64_]], implicit $exec
-  ; GCN-NEXT:   [[COPY5:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; GCN-NEXT:   [[COPY3:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
   ; GCN-NEXT:   $vgpr0 = COPY [[V_CNDMASK_B32_e64_]]
-  ; GCN-NEXT:   [[COPY6:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY5]]
-  ; GCN-NEXT:   S_SETPC_B64_return [[COPY6]], implicit $vgpr0
+  ; GCN-NEXT:   [[COPY4:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY3]]
+  ; GCN-NEXT:   S_SETPC_B64_return [[COPY4]], implicit $vgpr0
   %setcc = icmp slt i16 %x, 0
   %select = select i1 %setcc, i1 true, i1 %z
   ret i1 %select
diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
--- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
@@ -85,6 +85,47 @@
   ret void
 }
 
+define amdgpu_kernel void @sext_i32_to_i64_uniform(i64 addrspace(1)* %out, i32 %a, i64 %b) {
+; GCN-LABEL: sext_i32_to_i64_uniform:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s6, s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_ashr_i32 s7, s6, 31
+; GCN-NEXT:    s_add_u32 s4, s4, s6
+; GCN-NEXT:    s_addc_u32 s5, s5, s7
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %sext = sext i32 %a to i64
+  %res = add i64 %b, %sext
+  store i64 %res, i64 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @sext_i32_to_i64_divergent(i64 addrspace(1)* %out, i32 %a, i64 %b) {
+; GCN-LABEL: sext_i32_to_i64_divergent:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s3, 0xf000
+; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GCN-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %divergent.a = add i32 %a, %tid
+  %sext = sext i32 %divergent.a to i64
+  store i64 %sext, i64 addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
old mode 100755
new mode 100644
copy from llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
copy to llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
--- a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/zext-divergence-driven-isel.ll
@@ -1,50 +1,50 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
 
-define amdgpu_kernel void @sext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) {
-; GCN-LABEL: sext_i16_to_i32_uniform:
+define amdgpu_kernel void @zext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+; GCN-LABEL: zext_i16_to_i32_uniform:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_sext_i32_i16 s4, s4
+; GCN-NEXT:    s_and_b32 s4, s4, 0xffff
 ; GCN-NEXT:    s_add_i32 s4, s5, s4
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
-  %sext = sext i16 %a to i32
-  %res = add i32 %b, %sext
+  %zext = zext i16 %a to i32
+  %res = add i32 %b, %zext
   store i32 %res, i32 addrspace(1)* %out
   ret void
 }
 
-define amdgpu_kernel void @sext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %a, i64 %b) {
-; GCN-LABEL: sext_i16_to_i64_uniform:
+define amdgpu_kernel void @zext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+; GCN-LABEL: zext_i16_to_i64_uniform:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0xd
+; GCN-NEXT:    s_load_dword s6, s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
-; GCN-NEXT:    s_add_u32 s4, s6, s4
-; GCN-NEXT:    s_addc_u32 s5, s7, s5
+; GCN-NEXT:    s_and_b32 s6, s6, 0xffff
+; GCN-NEXT:    s_add_u32 s4, s4, s6
+; GCN-NEXT:    s_addc_u32 s5, s5, 0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
-  %sext = sext i16 %a to i64
-  %res = add i64 %b, %sext
+  %zext = zext i16 %a to i64
+  %res = add i64 %b, %zext
   store i64 %res, i64 addrspace(1)* %out
   ret void
 }
 
-define amdgpu_kernel void @sext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16 %a, i32 %b) {
-; GCN-LABEL: sext_i16_to_i32_divergent:
+define amdgpu_kernel void @zext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+; GCN-LABEL: zext_i16_to_i32_divergent:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
@@ -52,36 +52,36 @@
 ; GCN-NEXT:    s_mov_b32 s2, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.truncated = trunc i32 %tid to i16
   %divergent.a = add i16 %a, %tid.truncated
-  %sext = sext i16 %divergent.a to i32
-  store i32 %sext, i32 addrspace(1)* %out
+  %zext = zext i16 %divergent.a to i32
+  store i32 %zext, i32 addrspace(1)* %out
   ret void
 }
 
-define amdgpu_kernel void @sext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16 %a, i64 %b) {
-; GCN-LABEL: sext_i16_to_i64_divergent:
+define amdgpu_kernel void @zext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+; GCN-LABEL: zext_i16_to_i64_divergent:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s4, s[0:1], 0xb
 ; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NEXT:    s_mov_b32 s2, -1
+; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
-; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.truncated = trunc i32 %tid to i16
   %divergent.a = add i16 %a, %tid.truncated
-  %sext = sext i16 %divergent.a to i64
-  store i64 %sext, i64 addrspace(1)* %out
+  %zext = zext i16 %divergent.a to i64
+  store i64 %zext, i64 addrspace(1)* %out
   ret void
 }