diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2431,6 +2431,11 @@
 }
 
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<bitreverse> i64:$a)),
+  (REG_SEQUENCE VReg_64,
+    (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0,
+    (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>;
+
 // Prefer selecting to max when legal, but using mul is always valid.
 let AddedComplexity = -5 in {
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -235,10 +235,10 @@
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
 def S_BREV_B32 : SOP1_32 <"s_brev_b32",
-  [(set i32:$sdst, (bitreverse i32:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<bitreverse> i32:$src0))]
 >;
 def S_BREV_B64 : SOP1_64 <"s_brev_b64",
-  [(set i64:$sdst, (bitreverse i64:$src0))]
+  [(set i64:$sdst, (UniformUnaryFrag<bitreverse> i64:$src0))]
 >;
 } // End isReMaterializable = 1, isAsCheapAsAMove = 1
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -282,7 +282,7 @@
 } // End TRANS = 1, SchedRW = [WriteTrans32]
 
 defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
-defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>;
+defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, DivergentUnaryFrag<bitreverse>>;
 defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>;
 defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>;
 defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-bitreverse.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-bitreverse.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-bitreverse.ll
@@ -0,0 +1,43 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: name: uniform_bitreverse_i32
+; GCN: S_BREV_B32
+define amdgpu_kernel void @uniform_bitreverse_i32(i32 %val, i32 addrspace(1)* %out) {
+  %res = call i32 @llvm.bitreverse.i32(i32 %val)
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: divergent_bitreverse_i32
+; GCN: V_BFREV_B32
+define amdgpu_kernel void @divergent_bitreverse_i32(i32 %val, i32 addrspace(1)* %out) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %divergent = add i32 %val, %tid
+  %res = call i32 @llvm.bitreverse.i32(i32 %divergent)
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: uniform_bitreverse_i64
+; GCN: S_BREV_B64
+define amdgpu_kernel void @uniform_bitreverse_i64(i64 %val, i64 addrspace(1)* %out) {
+  %res = call i64 @llvm.bitreverse.i64(i64 %val)
+  store i64 %res, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: divergent_bitreverse_i64
+; GCN: V_BFREV_B32
+; GCN: V_BFREV_B32
+define amdgpu_kernel void @divergent_bitreverse_i64(i64 %val, i64 addrspace(1)* %out) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %ext = zext i32 %tid to i64
+  %divergent = add i64 %val, %ext
+  %res = call i64 @llvm.bitreverse.i64(i64 %divergent)
+  store i64 %res, i64 addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare i32 @llvm.bitreverse.i32(i32)
+declare i64 @llvm.bitreverse.i64(i64)