diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -271,6 +271,8 @@
 
   SDNode *getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
                    uint32_t Offset, uint32_t Width);
+  SDNode *getV_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val,
+                   uint32_t Offset, uint32_t Width);
   void SelectS_BFEFromShifts(SDNode *N);
   void SelectS_BFE(SDNode *N);
   bool isCBranchSCC(const SDNode *N) const;
@@ -2224,6 +2226,15 @@
   return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, PackedConst);
 }
 
+SDNode *AMDGPUDAGToDAGISel::getV_BFE(unsigned Opcode, const SDLoc &DL,
+                                     SDValue Val, uint32_t Offset,
+                                     uint32_t Width) {
+  SDValue Off = CurDAG->getTargetConstant(Offset, DL, MVT::i32);
+  SDValue W = CurDAG->getTargetConstant(Width, DL, MVT::i32);
+
+  return CurDAG->getMachineNode(Opcode, DL, MVT::i32, Val, Off, W);
+}
+
 void AMDGPUDAGToDAGISel::SelectS_BFEFromShifts(SDNode *N) {
   // "(a << b) srl c)" ---> "BFE_U32 a, (c-b), (32-c)
   // "(a << b) sra c)" ---> "BFE_I32 a, (c-b), (32-c)
@@ -2239,10 +2250,17 @@
 
     if (0 < BVal && BVal <= CVal && CVal < 32) {
       bool Signed = N->getOpcode() == ISD::SRA;
-      unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
+      if (N->isDivergent()) {
+        unsigned Opcode =
+            Signed ? AMDGPU::V_BFE_I32_e64 : AMDGPU::V_BFE_U32_e64;
+        ReplaceNode(N, getV_BFE(Opcode, SDLoc(N),
+                                Shl.getOperand(0), CVal - BVal, 32 - CVal));
+      } else {
+        unsigned Opcode = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
-      ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0), CVal - BVal,
-                              32 - CVal));
+        ReplaceNode(N, getS_BFE(Opcode, SDLoc(N), Shl.getOperand(0),
+                                CVal - BVal, 32 - CVal));
+      }
       return;
     }
   }
 
@@ -2266,8 +2284,12 @@
       if (isMask_32(MaskVal)) {
         uint32_t WidthVal = countPopulation(MaskVal);
 
-        ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
-                                Srl.getOperand(0), ShiftVal, WidthVal));
+        if (N->isDivergent())
+          ReplaceNode(N, getV_BFE(AMDGPU::V_BFE_U32_e64, SDLoc(N),
+                                  Srl.getOperand(0), ShiftVal, WidthVal));
+        else
+          ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
+                                  Srl.getOperand(0), ShiftVal, WidthVal));
         return;
       }
     }
@@ -2287,9 +2309,12 @@
 
     if (isMask_32(MaskVal)) {
       uint32_t WidthVal = countPopulation(MaskVal);
-
-      ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
-                              And.getOperand(0), ShiftVal, WidthVal));
+      if (N->isDivergent())
+        ReplaceNode(N, getV_BFE(AMDGPU::V_BFE_U32_e64, SDLoc(N),
+                                And.getOperand(0), ShiftVal, WidthVal));
+      else
+        ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_U32, SDLoc(N),
+                                And.getOperand(0), ShiftVal, WidthVal));
       return;
     }
   }
@@ -2316,8 +2341,12 @@
       break;
 
     unsigned Width = cast<VTSDNode>(N->getOperand(1))->getVT().getSizeInBits();
-    ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
-                            Amt->getZExtValue(), Width));
+    if (N->isDivergent())
+      ReplaceNode(N, getV_BFE(AMDGPU::V_BFE_I32_e64, SDLoc(N),
+                              Src.getOperand(0), Amt->getZExtValue(), Width));
+    else
+      ReplaceNode(N, getS_BFE(AMDGPU::S_BFE_I32, SDLoc(N), Src.getOperand(0),
+                              Amt->getZExtValue(), Width));
     return;
   }
   }
diff --git a/llvm/test/CodeGen/AMDGPU/divergent_bfe.ll b/llvm/test/CodeGen/AMDGPU/divergent_bfe.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergent_bfe.ll
@@ -0,0 +1,24 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}bfe_uniform:
+; GCN: s_bfe_u32
+define amdgpu_kernel void @bfe_uniform(i32 %val, i32 addrspace(1)* %out) {
+  %hibits = lshr i32 %val, 16
+  %masked = and i32 %hibits, 15
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_divergent:
+; GCN: v_bfe_u32
+define amdgpu_kernel void @bfe_divergent(i32 %val, i32 addrspace(1)* %out) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %divergent = add i32 %val, %tid
+  %hibits = lshr i32 %divergent, 16
+  %masked = and i32 %hibits, 15
+  store i32 %masked, i32 addrspace(1)* %out
+  ret void
+}
+
+
+declare i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
--- a/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem-seteq-illegal-types.ll
@@ -62,13 +62,13 @@
 ; CHECK-NEXT:    v_bfe_i32 v3, v2, 0, 31
 ; CHECK-NEXT:    v_bfe_i32 v4, v1, 0, 31
 ; CHECK-NEXT:    v_bfe_i32 v5, v0, 0, 31
-; CHECK-NEXT:    s_mov_b32 s6, 0x38e38e39
-; CHECK-NEXT:    s_mov_b32 s7, 0xc71c71c7
-; CHECK-NEXT:    s_brev_b32 s4, -2
-; CHECK-NEXT:    s_mov_b32 s5, 0x7ffffffd
-; CHECK-NEXT:    v_mul_hi_i32 v5, v5, s6
-; CHECK-NEXT:    v_mul_hi_i32 v4, v4, s6
-; CHECK-NEXT:    v_mul_hi_i32 v3, v3, s7
+; CHECK-NEXT:    s_mov_b32 s4, 0x38e38e39
+; CHECK-NEXT:    s_mov_b32 s5, 0xc71c71c7
+; CHECK-NEXT:    s_brev_b32 s6, -2
+; CHECK-NEXT:    s_mov_b32 s7, 0x7ffffffd
+; CHECK-NEXT:    v_mul_hi_i32 v5, v5, s4
+; CHECK-NEXT:    v_mul_hi_i32 v4, v4, s4
+; CHECK-NEXT:    v_mul_hi_i32 v3, v3, s5
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v5, 1, v5
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v7, 31, v4
@@ -84,12 +84,12 @@
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT:    v_and_b32_e32 v2, s4, v2
-; CHECK-NEXT:    v_and_b32_e32 v1, s4, v1
-; CHECK-NEXT:    v_and_b32_e32 v0, s4, v0
+; CHECK-NEXT:    v_and_b32_e32 v2, s6, v2
+; CHECK-NEXT:    v_and_b32_e32 v1, s6, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, s6, v0
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s5, v1
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, s7, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc