diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1326,27 +1326,44 @@
   Register DstReg = I.getOperand(0).getReg();
   const unsigned Size = MRI->getType(DstReg).getSizeInBits();
   const bool Is64 = Size == 64;
+  const bool IsWave32 = (STI.getWavefrontSize() == 32);
 
-  if (Size != STI.getWavefrontSize())
+  // In the common case, the return type matches the wave size.
+  // However, we also support emitting i64 ballots in wave32 mode.
+  if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
     return false;
 
   std::optional<ValueAndVReg> Arg =
       getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
 
+  const auto BuildCopy = [&](Register SrcReg) {
+    if (Size == STI.getWavefrontSize()) {
+      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+          .addReg(SrcReg);
+      return;
+    }
+
+    // If emitting an i64 ballot in wave32, fill the upper bits with zeroes.
+    Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
+    BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+        .addReg(SrcReg)
+        .addImm(AMDGPU::sub0)
+        .addReg(HiReg)
+        .addImm(AMDGPU::sub1);
+  };
+
   if (Arg) {
     const int64_t Value = Arg->Value.getSExtValue();
     if (Value == 0) {
       unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
       BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
-    } else if (Value == -1) { // all ones
-      Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
-      BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
-    } else
+    } else if (Value == -1) // all ones
+      BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+    else
       return false;
-  } else {
-    Register SrcReg = I.getOperand(2).getReg();
-    BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
-  }
+  } else
+    BuildCopy(I.getOperand(2).getReg());
 
   I.eraseFromParent();
   return true;
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -992,11 +992,18 @@
     (i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64))
   >;
 
-  let WaveSizePredicate = isWave32 in
-  def : GCNPat <
-    (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
-    (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
-  >;
+  let WaveSizePredicate = isWave32 in {
+    def : GCNPat <
+      (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+      (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
+    >;
+
+    // Support codegen of i64 setcc in wave32 mode.
+    def : GCNPat <
+      (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+      (i64 (REG_SEQUENCE SReg_64, (inst $src0, $src1), sub0, (S_MOV_B32 (i32 0)), sub1))
+    >;
+  }
 }
 
 defm : ICMP_Pattern ;
@@ -1056,13 +1063,22 @@
                                        DSTCLAMP.NONE), SReg_64))
   >;
 
-  let WaveSizePredicate = isWave32 in
-  def : GCNPat <
-    (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
-                      (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
-    (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
-                            DSTCLAMP.NONE), SReg_32))
-  >;
+  let WaveSizePredicate = isWave32 in {
+    def : GCNPat <
+      (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+                        (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+      (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+                              DSTCLAMP.NONE), SReg_32))
+    >;
+
+    def : GCNPat <
+      (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+                        (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+      (i64 (REG_SEQUENCE SReg_64, (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+                                   DSTCLAMP.NONE), sub0,
+                                  (S_MOV_B32 (i32 0)), sub1))
+    >;
+  }
 }
 
 defm : FCMP_Pattern ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll
@@ -0,0 +1,106 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL
+; RUN: llc -march=amdgcn -global-isel=0 -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,DAGISEL
+; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1010 < %s | FileCheck %s --check-prefixes=CHECK,GISEL
+; RUN: llc -march=amdgcn -global-isel -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck %s --check-prefixes=CHECK,GISEL
+
+declare i64 @llvm.amdgcn.ballot.i64(i1)
+declare i64 @llvm.ctpop.i64(i64)
+
+; Test ballot(0)
+
+define amdgpu_cs i64 @constant_false() {
+; CHECK-LABEL: constant_false:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    ; return to shader part epilog
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 0)
+  ret i64 %ballot
+}
+
+; Test ballot(1)
+
+define amdgpu_cs i64 @constant_true() {
+; DAGISEL-LABEL: constant_true:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    s_mov_b32 s0, exec_lo
+; DAGISEL-NEXT:    s_mov_b32 s1, exec_hi
+; DAGISEL-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: constant_true:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_mov_b32 s0, exec_lo
+; GISEL-NEXT:    s_mov_b32 s1, 0
+; GISEL-NEXT:    ; return to shader part epilog
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 1)
+  ret i64 %ballot
+}
+
+; Test ballot of a non-comparison operation
+
+define amdgpu_cs i64 @non_compare(i32 %x) {
+; CHECK-LABEL: non_compare:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_and_b32_e32 v0, 1, v0
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    v_cmp_ne_u32_e64 s0, 0, v0
+; CHECK-NEXT:    ; return to shader part epilog
+  %trunc = trunc i32 %x to i1
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %trunc)
+  ret i64 %ballot
+}
+
+; Test ballot of comparisons
+
+define amdgpu_cs i64 @compare_ints(i32 %x, i32 %y) {
+; CHECK-LABEL: compare_ints:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s0, v0, v1
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    ; return to shader part epilog
+  %cmp = icmp eq i32 %x, %y
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  ret i64 %ballot
+}
+
+define amdgpu_cs i64 @compare_int_with_constant(i32 %x) {
+; DAGISEL-LABEL: compare_int_with_constant:
+; DAGISEL:       ; %bb.0:
+; DAGISEL-NEXT:    v_cmp_lt_i32_e64 s0, 0x62, v0
+; DAGISEL-NEXT:    s_mov_b32 s1, 0
+; DAGISEL-NEXT:    ; return to shader part epilog
+;
+; GISEL-LABEL: compare_int_with_constant:
+; GISEL:       ; %bb.0:
+; GISEL-NEXT:    v_cmp_le_i32_e64 s0, 0x63, v0
+; GISEL-NEXT:    s_mov_b32 s1, 0
+; GISEL-NEXT:    ; return to shader part epilog
  %cmp = icmp sge i32 %x, 99
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  ret i64 %ballot
+}
+
+define amdgpu_cs i64 @compare_floats(float %x, float %y) {
+; CHECK-LABEL: compare_floats:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_gt_f32_e64 s0, v0, v1
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    ; return to shader part epilog
+  %cmp = fcmp ogt float %x, %y
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  ret i64 %ballot
+}
+
+define amdgpu_cs i64 @ctpop_of_ballot(float %x, float %y) {
+; CHECK-LABEL: ctpop_of_ballot:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    v_cmp_gt_f32_e64 s0, v0, v1
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; CHECK-NEXT:    ; return to shader part epilog
+  %cmp = fcmp ogt float %x, %y
+  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
+  %bcnt = call i64 @llvm.ctpop.i64(i64 %ballot)
+  ret i64 %bcnt
+}
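
For context, here is a minimal usage sketch (not part of the patch; the function and value names are illustrative) of what the change enables from the IR side: an i64 ballot taken in code compiled for a wave32 target such as gfx1010, where the low 32 bits hold the ballot of exec and the high 32 bits are zero, so ordinary 64-bit bit manipulation of the result works unchanged.

; Illustrative IR only, assuming a wave32 target (e.g. -mcpu=gfx1010).
declare i64 @llvm.amdgcn.ballot.i64(i1)
declare i64 @llvm.cttz.i64(i64, i1)

; Returns the index of the lowest lane for which %x < %y holds
; (64 if no lane matches, since the is_zero_poison flag is false).
define amdgpu_cs i32 @first_active_match(i32 %x, i32 %y) {
  %cmp = icmp ult i32 %x, %y
  %ballot = call i64 @llvm.amdgcn.ballot.i64(i1 %cmp)
  %tz = call i64 @llvm.cttz.i64(i64 %ballot, i1 false)
  %idx = trunc i64 %tz to i32
  ret i32 %idx
}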