Index: lib/Target/AMDGPU/SIShrinkInstructions.cpp =================================================================== --- lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -198,6 +198,10 @@ Orig.isInternalRead()); } +static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { + return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4); +} + bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const SIInstrInfo *TII = @@ -214,18 +218,6 @@ Next = std::next(I); MachineInstr &MI = *I; - // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. - if (MI.getOpcode() == AMDGPU::S_MOV_B32) { - const MachineOperand &Src = MI.getOperand(1); - - if (Src.isImm()) { - if (isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4)) - MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); - } - - continue; - } - if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) { // If this has a literal constant source that is the same as the // reversed bits of an inline immediate, replace with a bitreverse of @@ -250,6 +242,46 @@ } } + // FIXME: We also need to consider movs of constant operands since + // immediate operands are not folded if they have more than one use, and + // the operand folding pass is unaware if the immediate will be free since + // it won't know if the src == dest constraint will end up being + // satisfied. + if (MI.getOpcode() == AMDGPU::S_ADD_I32 || + MI.getOpcode() == AMDGPU::S_MUL_I32) { + const MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Src0 = MI.getOperand(1); + const MachineOperand &Src1 = MI.getOperand(2); + + // FIXME: This could work better if hints worked with subregisters. If + // we have a vector add of a constant, we usually don't get the correct + // allocation due to the subregister usage. 
+ if (TargetRegisterInfo::isVirtualRegister(Dest.getReg()) && + Src0.isReg()) { + MRI.setRegAllocationHint(Dest.getReg(), 0, Src0.getReg()); + } + + if (Src0.isReg() && Src0.getReg() == Dest.getReg()) { + if (Src1.isImm() && isKImmOperand(TII, Src1)) { + unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ? + AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32; + + MI.setDesc(TII->get(Opc)); + MI.tieOperands(0, 1); + } + } + } + + // Try to use S_MOVK_I32, which will save 4 bytes for small immediates. + if (MI.getOpcode() == AMDGPU::S_MOV_B32) { + const MachineOperand &Src = MI.getOperand(1); + + if (Src.isImm() && isKImmOperand(TII, Src)) + MI.setDesc(TII->get(AMDGPU::S_MOVK_I32)); + + continue; + } + if (!TII->hasVALU32BitEncoding(MI.getOpcode())) continue; Index: test/CodeGen/AMDGPU/fceil64.ll =================================================================== --- test/CodeGen/AMDGPU/fceil64.ll +++ test/CodeGen/AMDGPU/fceil64.ll @@ -13,8 +13,8 @@ ; CI: v_ceil_f64_e32 ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI-DAG: s_add_i32 [[A:s[0-9]+]], [[SEXP]], 0xfffffc01 -; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[A]] +; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01 +; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]] ; SI-DAG: s_not_b64 ; SI-DAG: s_and_b64 ; SI-DAG: cmp_gt_i32 Index: test/CodeGen/AMDGPU/ftrunc.f64.ll =================================================================== --- test/CodeGen/AMDGPU/ftrunc.f64.ll +++ test/CodeGen/AMDGPU/ftrunc.f64.ll @@ -25,8 +25,8 @@ ; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000 -; SI-DAG: s_add_i32 [[A:s[0-9]+]], [[SEXP]], 0xfffffc01 -; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[A]] +; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01 +; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]] ; SI-DAG: s_not_b64 ; SI-DAG: s_and_b64 ; SI-DAG: 
cmp_gt_i32 Index: test/CodeGen/AMDGPU/s_addk_i32.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/s_addk_i32.ll @@ -0,0 +1,93 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}s_addk_i32_k0: +; SI: s_load_dword [[VAL:s[0-9]+]] +; SI: s_addk_i32 [[VAL]], 0x41 +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[VRESULT]] +; SI: s_endpgm +define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { + %add = add i32 %b, 65 + store i32 %add, i32 addrspace(1)* %out + ret void +} + +; FIXME: This should be folded with any number of uses. +; SI-LABEL: {{^}}s_addk_i32_k0_x2: +; SI: s_movk_i32 [[K:s[0-9]+]], 0x41 +; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] +; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]] +; SI: s_endpgm +define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) { + %add0 = add i32 %a, 65 + %add1 = add i32 %b, 65 + store i32 %add0, i32 addrspace(1)* %out0 + store i32 %add1, i32 addrspace(1)* %out1 + ret void +} + +; SI-LABEL: {{^}}s_addk_i32_k1: +; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}} +; SI: s_endpgm +define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) { + %add = add i32 %b, 32767 ; (1 << 15) - 1 + store i32 %add, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_addk_i32_k2: +; SI: s_addk_i32 {{s[0-9]+}}, 0xffef{{$}} +; SI: s_endpgm +define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) { + %add = add i32 %b, -17 + store i32 %add, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_addk_v2i32_k0: +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 +; SI: s_endpgm +define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) { + %add = add <2 x i32> %b, <i32 65, i32 66> + store <2 x i32> %add, <2 x i32> addrspace(1)* 
%out + ret void +} + +; SI-LABEL: {{^}}s_addk_v4i32_k0: +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44 +; SI: s_endpgm +define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) { + %add = add <4 x i32> %b, <i32 65, i32 66, i32 67, i32 68> + store <4 x i32> %add, <4 x i32> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_addk_v8i32_k0: +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x45 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x46 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47 +; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48 +; SI: s_endpgm +define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) { + %add = add <8 x i32> %b, <i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72> + store <8 x i32> %add, <8 x i32> addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}no_s_addk_i32_k0: +; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}} +; SI: s_endpgm +define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) { + %add = add i32 %b, 32768 ; 1 << 15 + store i32 %add, i32 addrspace(1)* %out + ret void +} Index: test/CodeGen/AMDGPU/s_mulk_i32.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/s_mulk_i32.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}s_mulk_i32_k0: +; SI: s_load_dword [[VAL:s[0-9]+]] +; SI: s_mulk_i32 [[VAL]], 0x41 +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] +; SI: buffer_store_dword [[VRESULT]] +; SI: s_endpgm +define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { + %mul = mul i32 %b, 65 + store i32 %mul, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_mulk_i32_k1: +; SI: s_mulk_i32 {{s[0-9]+}}, 
0x7fff{{$}} +; SI: s_endpgm +define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) { + %mul = mul i32 %b, 32767 ; (1 << 15) - 1 + store i32 %mul, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}s_mulk_i32_k2: +; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}} +; SI: s_endpgm +define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) { + %mul = mul i32 %b, -17 + store i32 %mul, i32 addrspace(1)* %out + ret void +} + +; SI-LABEL: {{^}}no_s_mulk_i32_k0: +; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}} +; SI: s_endpgm +define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) { + %mul = mul i32 %b, 32769 ; (1 << 15) + 1 + store i32 %mul, i32 addrspace(1)* %out + ret void +} Index: test/CodeGen/AMDGPU/shl_add_constant.ll =================================================================== --- test/CodeGen/AMDGPU/shl_add_constant.ll +++ test/CodeGen/AMDGPU/shl_add_constant.ll @@ -74,8 +74,8 @@ ; SI-DAG: s_load_dword [[Y:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc ; SI: s_lshl_b32 [[SHL3:s[0-9]+]], [[X]], 3 ; SI: s_add_i32 [[TMP:s[0-9]+]], [[Y]], [[SHL3]] -; SI: s_add_i32 [[RESULT:s[0-9]+]], [[TMP]], 0x3d8 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] +; SI: s_addk_i32 [[TMP]], 0x3d8 +; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]] ; SI: buffer_store_dword [[VRESULT]] define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {