diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -233,9 +233,8 @@ void SelectMAD_64_32(SDNode *N); void SelectFMA_W_CHAIN(SDNode *N); void SelectFMUL_W_CHAIN(SDNode *N); - - SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val, - uint32_t Offset, uint32_t Width); + SDNode *getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset, + uint32_t Width); void SelectS_BFEFromShifts(SDNode *N); void SelectS_BFE(SDNode *N); bool isCBranchSCC(const SDNode *N) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -641,8 +641,8 @@ uint32_t OffsetVal = Offset->getZExtValue(); uint32_t WidthVal = Width->getZExtValue(); - ReplaceNode(N, getS_BFE(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32, - SDLoc(N), N->getOperand(0), OffsetVal, WidthVal)); + ReplaceNode(N, getBFE32(Signed, SDLoc(N), N->getOperand(0), OffsetVal, + WidthVal)); return; } case AMDGPUISD::DIV_SCALE: { @@ -1947,9 +1947,17 @@ return true; } -SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL, +SDNode *AMDGPUDAGToDAGISel::getBFE32(bool IsSigned, const SDLoc &DL, SDValue Val, uint32_t Offset, uint32_t Width) { + if (Val->isDivergent()) { + unsigned Opcode = IsSigned ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64; + SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32); + SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32); + + return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W); + } + unsigned Opcode = IsSigned ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; // Transformation function, pack the offset and width of a BFE into // the format expected by the S_BFE_I32 / S_BFE_U32. 
In the second // source, bits [5:0] contain the offset and bits [22:16] the width. @@ -1974,10 +1982,8 @@ if (0 < BVal && BVal <= CVal && CVal < 32) { bool Signed = N->getOpcode() == ISD::SRA; - unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32; - - ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal, - 32 - CVal)); + ReplaceNode(N, getBFE32(Signed, SDLoc(N), Shl.getOperand(0), CVal - BVal, + 32 - CVal)); return; } } @@ -2000,9 +2006,8 @@ if (isMask_32(MaskVal)) { uint32_t WidthVal = countPopulation(MaskVal); - - ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), - Srl.getOperand(0), ShiftVal, WidthVal)); + ReplaceNode(N, getBFE32(false, SDLoc(N), Srl.getOperand(0), ShiftVal, + WidthVal)); return; } } @@ -2022,9 +2027,8 @@ if (isMask_32(MaskVal)) { uint32_t WidthVal = countPopulation(MaskVal); - - ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N), - And.getOperand(0), ShiftVal, WidthVal)); + ReplaceNode(N, getBFE32(false, SDLoc(N), And.getOperand(0), ShiftVal, + WidthVal)); return; } } @@ -2051,7 +2055,7 @@ break; unsigned Width = cast(N->getOperand(1))->getVT().getSizeInBits(); - ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0), + ReplaceNode(N, getBFE32(true, SDLoc(N), Src.getOperand(0), Amt->getZExtValue(), Width)); return; } diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1871,40 +1871,92 @@ // Conversion Patterns //===----------------------------------------------------------------------===// -def : GCNPat<(i32 (sext_inreg i32:$src, i1)), +class UniformSextInreg : PatFrag< + (ops node:$src), + (sext_inreg $src, VT), + [{ return !N->isDivergent(); }]>; + +def : GCNPat<(i32 (UniformSextInreg i32:$src)), (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16 // Handle sext_inreg in i64 def : GCNPat < - (i64 (sext_inreg i64:$src, i1)), + (i64 (UniformSextInreg 
i64:$src)), (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16 >; def : GCNPat < - (i16 (sext_inreg i16:$src, i1)), + (i16 (UniformSextInreg i16:$src)), (S_BFE_I32 $src, (i32 0x00010000)) // 0 | 1 << 16 >; def : GCNPat < - (i16 (sext_inreg i16:$src, i8)), + (i16 (UniformSextInreg i16:$src)), (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16 >; def : GCNPat < - (i64 (sext_inreg i64:$src, i8)), + (i64 (UniformSextInreg i64:$src)), (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16 >; def : GCNPat < - (i64 (sext_inreg i64:$src, i16)), + (i64 (UniformSextInreg i64:$src)), (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16 >; def : GCNPat < - (i64 (sext_inreg i64:$src, i32)), + (i64 (UniformSextInreg i64:$src)), (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16 >; + +class DivergentSextInreg : PatFrag< + (ops node:$src), + (sext_inreg $src, VT), + [{ return N->isDivergent(); }]>; + +def : GCNPat<(i32 (DivergentSextInreg i32:$src)), + (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>; + +def : GCNPat < + (i16 (DivergentSextInreg i16:$src)), + (V_BFE_I32_e64 $src, (i32 0), (i32 1)) // offset 0, width 1 +>; + +def : GCNPat < + (i16 (DivergentSextInreg i16:$src)), + (V_BFE_I32_e64 $src, (i32 0), (i32 8)) // offset 0, width 8 +>; + +def : GCNPat < + (i64 (DivergentSextInreg i64:$src)), + (REG_SEQUENCE VReg_64, + (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1)), sub0, + (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 1))), sub1) +>; + +def : GCNPat < + (i64 (DivergentSextInreg i64:$src)), + (REG_SEQUENCE VReg_64, + (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8)/* offset 0, width 8 */), sub0, + (V_ASHRREV_I32_e32 (i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 8))), sub1) +>; + +def : GCNPat < + (i64 (DivergentSextInreg i64:$src)), + (REG_SEQUENCE VReg_64, + (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16)/* offset 0, width 16 */), sub0, + (V_ASHRREV_I32_e32 
(i32 31), (V_BFE_I32_e64 (i32 (EXTRACT_SUBREG i64:$src, sub0)), (i32 0), (i32 16))), sub1) +>; + +def : GCNPat < + (i64 (DivergentSextInreg i64:$src)), + (REG_SEQUENCE VReg_64, + (i32 (EXTRACT_SUBREG i64:$src, sub0)), sub0, + (V_ASHRREV_I32_e32 (i32 31), (i32 (EXTRACT_SUBREG i64:$src, sub0))), sub1) +>; + def : GCNPat < (i64 (zext i32:$src)), (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1) diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-bfe-isel.ll @@ -0,0 +1,25 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +; GCN-LABEL: @bfe_uniform +; GCN: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x40010 +define amdgpu_kernel void @bfe_uniform(i32 %val, i32 addrspace(1)* %out) { + %hibits = lshr i32 %val, 16 + %masked = and i32 %hibits, 15 + store i32 %masked, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: @bfe_divergent +; GCN: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 4 +define amdgpu_kernel void @bfe_divergent(i32 %val, i32 addrspace(1)* %out) { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %divergent = add i32 %val, %tid + %hibits = lshr i32 %divergent, 16 + %masked = and i32 %hibits, 15 + store i32 %masked, i32 addrspace(1)* %out + ret void +} + + +declare i32 @llvm.amdgcn.workitem.id.x() + diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll @@ -79,14 +79,14 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[0:1], s[6:7] ; SI-NEXT: buffer_load_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64 -; SI-NEXT: s_movk_i32 s7, 0xfc01 +; SI-NEXT: s_movk_i32 s6, 0xfc01 ; SI-NEXT: s_mov_b32 s0, -1 ; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_brev_b32 s6, -2 +; SI-NEXT: s_brev_b32 s7, -2 ; SI-NEXT: v_mov_b32_e32 v8, 
0x3ff00000 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_bfe_u32 v4, v3, 20, 11 -; SI-NEXT: v_add_i32_e32 v6, vcc, s7, v4 +; SI-NEXT: v_add_i32_e32 v6, vcc, s6, v4 ; SI-NEXT: v_lshr_b64 v[4:5], s[0:1], v6 ; SI-NEXT: v_and_b32_e32 v7, 0x80000000, v3 ; SI-NEXT: v_not_b32_e32 v4, v4 @@ -100,7 +100,7 @@ ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v3, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; SI-NEXT: v_add_f64 v[6:7], v[2:3], -v[4:5] -; SI-NEXT: v_bfi_b32 v2, s6, v8, v3 +; SI-NEXT: v_bfi_b32 v2, s7, v8, v3 ; SI-NEXT: v_cmp_ge_f64_e64 vcc, |v[6:7]|, 0.5 ; SI-NEXT: s_mov_b64 s[6:7], s[2:3] ; SI-NEXT: v_cndmask_b32_e32 v3, 0, v2, vcc diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll --- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll @@ -62,13 +62,13 @@ ; CHECK-NEXT: v_bfe_i32 v3, v2, 0, 31 ; CHECK-NEXT: v_bfe_i32 v4, v1, 0, 31 ; CHECK-NEXT: v_bfe_i32 v5, v0, 0, 31 -; CHECK-NEXT: s_mov_b32 s6, 0x38e38e39 -; CHECK-NEXT: s_mov_b32 s7, 0xc71c71c7 -; CHECK-NEXT: s_brev_b32 s4, -2 -; CHECK-NEXT: s_mov_b32 s5, 0x7ffffffd -; CHECK-NEXT: v_mul_hi_i32 v5, v5, s6 -; CHECK-NEXT: v_mul_hi_i32 v4, v4, s6 -; CHECK-NEXT: v_mul_hi_i32 v3, v3, s7 +; CHECK-NEXT: s_mov_b32 s4, 0x38e38e39 +; CHECK-NEXT: s_mov_b32 s5, 0xc71c71c7 +; CHECK-NEXT: s_brev_b32 s6, -2 +; CHECK-NEXT: s_mov_b32 s7, 0x7ffffffd +; CHECK-NEXT: v_mul_hi_i32 v5, v5, s4 +; CHECK-NEXT: v_mul_hi_i32 v4, v4, s4 +; CHECK-NEXT: v_mul_hi_i32 v3, v3, s5 ; CHECK-NEXT: v_lshrrev_b32_e32 v6, 31, v5 ; CHECK-NEXT: v_lshrrev_b32_e32 v5, 1, v5 ; CHECK-NEXT: v_lshrrev_b32_e32 v7, 31, v4 @@ -84,12 +84,12 @@ ; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 ; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v4 ; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v2, v3 -; CHECK-NEXT: v_and_b32_e32 v2, s4, v2 -; CHECK-NEXT: v_and_b32_e32 v1, s4, v1 -; CHECK-NEXT: v_and_b32_e32 v0, s4, v0 +; CHECK-NEXT: v_and_b32_e32 v2, s6, v2 +; 
CHECK-NEXT: v_and_b32_e32 v1, s6, v1 +; CHECK-NEXT: v_and_b32_e32 v0, s6, v0 ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1 +; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1 ; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 3, v2 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc