Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp
===================================================================
--- lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -212,6 +212,82 @@
   }
 }
 
+/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
+/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
+/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
+/// XNOR (as a ^ b == ~(a ^ ~b)).
+/// \returns true if the caller should continue the machine function iterator
+static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
+                                MachineRegisterInfo &MRI,
+                                const SIInstrInfo *TII,
+                                MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  const MachineOperand *Dest = &MI.getOperand(0);
+  MachineOperand *Src0 = &MI.getOperand(1);
+  MachineOperand *Src1 = &MI.getOperand(2);
+  MachineOperand *SrcReg = Src0;
+  MachineOperand *SrcImm = Src1;
+
+  if (SrcImm->isImm() &&
+      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
+    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
+    uint32_t NewImm = 0;
+
+    if (Opc == AMDGPU::S_AND_B32) {
+      if (isPowerOf2_32(~Imm)) {
+        NewImm = countTrailingOnes(Imm);
+        Opc = AMDGPU::S_BITSET0_B32;
+      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_ANDN2_B32;
+      }
+    } else if (Opc == AMDGPU::S_OR_B32) {
+      if (isPowerOf2_32(Imm)) {
+        NewImm = countTrailingZeros(Imm);
+        Opc = AMDGPU::S_BITSET1_B32;
+      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_ORN2_B32;
+      }
+    } else if (Opc == AMDGPU::S_XOR_B32) {
+      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_XNOR_B32;
+      }
+    } else {
+      llvm_unreachable("unexpected opcode");
+    }
+
+    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
+        SrcImm == Src0) {
+      if (!TII->commuteInstruction(MI, false, 1, 2))
+        NewImm = 0;
+    }
+
+    if (NewImm != 0) {
+      if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+          SrcReg->isReg()) {
+        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+        return true;
+      }
+
+      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
+        MI.setDesc(TII->get(Opc));
+        if (Opc == AMDGPU::S_BITSET0_B32 ||
+            Opc == AMDGPU::S_BITSET1_B32) {
+          Src0->ChangeToImmediate(NewImm);
+          MI.RemoveOperand(2);
+        } else {
+          SrcImm->setImm(NewImm);
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
 // This is the same as MachineInstr::readsRegister/modifiesRegister except
 // it takes subregs into account.
 static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
@@ -512,6 +588,14 @@
         continue;
       }
 
+      // Shrink scalar logic operations.
+      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
+          MI.getOpcode() == AMDGPU::S_OR_B32 ||
+          MI.getOpcode() == AMDGPU::S_XOR_B32) {
+        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
+          continue;
+      }
+
       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
         continue;
Index: test/CodeGen/AMDGPU/andorbitset.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/andorbitset.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_clear_msb:
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
+define amdgpu_kernel void @s_clear_msb(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 2147483647
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_msb:
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+define amdgpu_kernel void @s_set_msb(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 2147483648
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_clear_lsb:
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, -2
+define amdgpu_kernel void @s_clear_lsb(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 4294967294
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_lsb:
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @s_set_lsb(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 1
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_clear_midbit:
+; SI: s_bitset0_b32 s{{[0-9]+}}, 8
+define amdgpu_kernel void @s_clear_midbit(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 4294967039
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_midbit:
+; SI: s_bitset1_b32 s{{[0-9]+}}, 8
+define amdgpu_kernel void @s_set_midbit(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 256
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/andorxorinvimm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/andorxorinvimm.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_or_to_orn2:
+; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_or_to_orn2(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_or_to_orn2_imm0:
+; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_or_to_orn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_and_to_andn2:
+; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_and_to_andn2(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_and_to_andn2_imm0:
+; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_and_to_andn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_xor_to_xnor:
+; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_xor_to_xnor(i32 addrspace(1)* %out, i32 %in) {
+  %x = xor i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_xor_to_xnor_imm0:
+; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_xor_to_xnor_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = xor i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
Index: test/CodeGen/AMDGPU/fabs.ll
===================================================================
--- test/CodeGen/AMDGPU/fabs.ll
+++ test/CodeGen/AMDGPU/fabs.ll
@@ -11,7 +11,8 @@
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @fabs(float %bc)
@@ -23,7 +24,8 @@
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
@@ -34,7 +36,8 @@
 ; FUNC-LABEL: {{^}}s_fabs_f32:
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   store float %fabs, float addrspace(1)* %out
Index: test/CodeGen/AMDGPU/fneg-fabs.ll
===================================================================
--- test/CodeGen/AMDGPU/fneg-fabs.ll
+++ test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
@@ -35,6 +35,7 @@
 ; R600: -PV
 
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; VI: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
Index: test/CodeGen/AMDGPU/gep-address-space.ll
===================================================================
--- test/CodeGen/AMDGPU/gep-address-space.ll
+++ test/CodeGen/AMDGPU/gep-address-space.ll
@@ -14,7 +14,7 @@
 ; CHECK-LABEL: {{^}}use_gep_address_space_large_offset:
 ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
 ; SI, which is why it is being OR'd with the base pointer.
-; SI: s_or_b32
+; SI: s_bitset1_b32
 ; CI: s_add_i32
 ; CHECK: ds_write_b32
 define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
Index: test/CodeGen/AMDGPU/local-64.ll
===================================================================
--- test/CodeGen/AMDGPU/local-64.ll
+++ test/CodeGen/AMDGPU/local-64.ll
@@ -48,7 +48,7 @@
 
 ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
 ; SI, which is why it is being OR'd with the base pointer.
-; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; SI-DAG: s_bitset1_b32 [[ADDR:s[0-9]+]], 16
 ; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
 ; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
 ; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
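
Editor's note (illustrative, not part of the patch): the rewrite conditions in shrinkScalarLogicOp can be modeled by a small standalone C++ program. This is a minimal sketch under stated assumptions: isInlinableLiteral32 below is a simplified stand-in for AMDGPU::isInlinableLiteral32 (it accepts only the integer inline-constant range -16..64 and ignores the floating-point and inv-2pi inline constants), classifyAnd/classifyOr are hypothetical helper names introduced here for illustration, and __builtin_ctz is a GCC/Clang builtin.

#include <cstdint>
#include <cstdio>

// Simplified stand-in for AMDGPU::isInlinableLiteral32: integer range only.
static bool isInlinableLiteral32(int32_t Imm) {
  return Imm >= -16 && Imm <= 64;
}

static bool isPowerOf2_32(uint32_t V) { return V && !(V & (V - 1)); }

// Callers below guarantee the argument to __builtin_ctz is nonzero.
static uint32_t countTrailingZeros(uint32_t V) { return __builtin_ctz(V); }
static uint32_t countTrailingOnes(uint32_t V) { return __builtin_ctz(~V); }

// Mirrors the S_AND_B32 arm: a mask with a single clear bit becomes
// S_BITSET0 (NewImm is the bit index); otherwise, if the inverted mask is
// an inline constant, S_ANDN2 with NewImm = ~Imm avoids the 32-bit literal.
static const char *classifyAnd(uint32_t Imm, uint32_t &NewImm) {
  if (isPowerOf2_32(~Imm)) {
    NewImm = countTrailingOnes(Imm);
    return "s_bitset0_b32";
  }
  if (isInlinableLiteral32((int32_t)~Imm)) {
    NewImm = ~Imm;
    return "s_andn2_b32";
  }
  NewImm = Imm; // no shrinking possible; keep the 32-bit literal
  return "s_and_b32";
}

// Mirrors the S_OR_B32 arm: a single-bit mask becomes S_BITSET1.
static const char *classifyOr(uint32_t Imm, uint32_t &NewImm) {
  if (isPowerOf2_32(Imm)) {
    NewImm = countTrailingZeros(Imm);
    return "s_bitset1_b32";
  }
  if (isInlinableLiteral32((int32_t)~Imm)) {
    NewImm = ~Imm;
    return "s_orn2_b32";
  }
  NewImm = Imm;
  return "s_or_b32";
}

int main() {
  uint32_t N;
  const char *Op;

  Op = classifyAnd(0x7fffffffu, N); // fabs mask: only bit 31 is clear
  printf("and 0x7fffffff -> %s %u\n", Op, N); // s_bitset0_b32 31

  Op = classifyAnd(0xffffffcdu, N); // -51: its inverse, 50, is inlineable
  printf("and 0xffffffcd -> %s %u\n", Op, N); // s_andn2_b32 50

  Op = classifyOr(0x10000u, N); // LDS offset 65536 from local-64.ll: bit 16
  printf("or  0x00010000 -> %s %u\n", Op, N); // s_bitset1_b32 16

  // The S_XOR_B32 -> S_XNOR_B32 rewrite uses the identity a ^ b == ~(a ^ ~b).
  uint32_t a = 0xdeadbeefu, b = 0xffffffcdu;
  printf("xnor identity holds: %d\n", (a ^ b) == ~(a ^ ~b));
  return 0;
}

One design point worth noting: the patch rewrites an instruction in place only when the register source and the destination are already the same register (the SrcReg->getReg() == Dest->getReg() check), which S_BITSET0/1 require since they modify their destination. For virtual registers it instead records register-allocation hints and returns true, so that a later run of the pass can apply the rewrite once the allocator has (hopefully) assigned the two operands the same physical register.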