diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1256,7 +1256,7 @@
 
 // Same as a 32-bit inreg
 def : GCNPat<
-  (i32 (sext i16:$src)),
+  (i32 (UniformUnaryFrag<sext> i16:$src)),
   (S_SEXT_I32_I16 $src)
 >;
 
@@ -1283,7 +1283,7 @@
 >;
 
 def : GCNPat <
-  (i64 (sext i16:$src)),
+  (i64 (UniformUnaryFrag<sext> i16:$src)),
     (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
     (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
 >;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -402,6 +402,20 @@
 } // End SubtargetPredicate = isGFX8Plus
 } // End SchedRW = [Write64Bit]
 
+def : GCNPat<
+  (i64 (getDivergentFrag<sext>.ret i16:$src)),
+  (REG_SEQUENCE VReg_64,
+    (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0,
+    (i32 (COPY_TO_REGCLASS
+      (V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+      ), VGPR_32)), sub1)
+>;
+
+def : GCNPat<
+  (i32 (getDivergentFrag<sext>.ret i16:$src)),
+  (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+>;
+
 let SubtargetPredicate = isGFX6GFX7GFX10 in {
 def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
 } // End SubtargetPredicate = isGFX6GFX7GFX10
diff --git a/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
new file mode 100755
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/sext-divergence-driven-isel.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+
+define amdgpu_kernel void @sext_i16_to_i32_uniform(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+; GCN-LABEL: sext_i16_to_i32_uniform:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xb
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_sext_i32_i16 s0, s0
+; GCN-NEXT:    s_add_i32 s0, s1, s0
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %sext = sext i16 %a to i32
+  %res = add i32 %b, %sext
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+
+define amdgpu_kernel void @sext_i16_to_i64_uniform(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+; GCN-LABEL: sext_i16_to_i64_uniform:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s2, s[0:1], 0xb
+; GCN-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0xd
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_bfe_i64 s[2:3], s[2:3], 0x100000
+; GCN-NEXT:    s_add_u32 s0, s0, s2
+; GCN-NEXT:    s_addc_u32 s1, s1, s3
+; GCN-NEXT:    v_mov_b32_e32 v0, s0
+; GCN-NEXT:    v_mov_b32_e32 v1, s1
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %sext = sext i16 %a to i64
+  %res = add i64 %b, %sext
+  store i64 %res, i64 addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @sext_i16_to_i32_divergent(i32 addrspace(1)* %out, i16 %a, i32 %b) {
+; GCN-LABEL: sext_i16_to_i32_divergent:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.truncated = trunc i32 %tid to i16
+  %divergent.a = add i16 %a, %tid.truncated
+  %sext = sext i16 %divergent.a to i32
+  store i32 %sext, i32 addrspace(1)* %out
+  ret void
+}
+
+
+define amdgpu_kernel void @sext_i16_to_i64_divergent(i64 addrspace(1)* %out, i16 %a, i64 %b) {
+; GCN-LABEL: sext_i16_to_i64_divergent:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
+; GCN-NEXT:    s_load_dword s0, s[0:1], 0xb
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
+; GCN-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.truncated = trunc i32 %tid to i16
+  %divergent.a = add i16 %a, %tid.truncated
+  %sext = sext i16 %divergent.a to i64
+  store i64 %sext, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }
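
For reviewers unfamiliar with the two helpers the patterns rely on: UniformUnaryFrag<Op> and getDivergentFrag<Op>.ret gate a pattern on the SelectionDAG divergence bit, so the scalar S_SEXT_I32_I16 patterns fire only for uniform values while the new V_BFE_I32 patterns handle divergent ones. A minimal sketch of the uniform-side helper, assuming the standard PatFrag predicate machinery (the actual definition lives in the AMDGPU backend's .td files, not in this patch):

// Sketch only, not part of the patch: a unary fragment that matches Op
// only when the selected node is uniform per the divergence analysis.
class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
  (ops node:$src0),
  (Op $src0),
  [{ return !N->isDivergent(); }]
>;

getDivergentFrag<Op>.ret is the complement: it yields a PatFrag whose predicate checks N->isDivergent(), which is what the VOP3 patterns above match, as the divergent tests in sext-divergence-driven-isel.ll exercise.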