Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
===================================================================
--- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -54,6 +54,11 @@
   Register Base;
 };
 
+struct ShiftChain {
+  int64_t Imm;
+  Register Base;
+};
+
 struct RegisterImmPair {
   Register Reg;
   int64_t Imm;
 };
@@ -228,6 +233,9 @@
   bool matchPtrAddImmedChain(MachineInstr &MI, PtrAddChain &MatchInfo);
   bool applyPtrAddImmedChain(MachineInstr &MI, PtrAddChain &MatchInfo);
 
+  bool matchShiftImmedChain(MachineInstr &MI, ShiftChain &MatchInfo);
+  bool applyShiftImmedChain(MachineInstr &MI, ShiftChain &MatchInfo);
+
   /// Transform a multiply by a power-of-2 value to a left shift.
   bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
   bool applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal);
Index: llvm/include/llvm/Target/GlobalISel/Combine.td
===================================================================
--- llvm/include/llvm/Target/GlobalISel/Combine.td
+++ llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -158,6 +158,13 @@
          [{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]),
   (apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>;
 
+def shift_immed_matchdata : GIDefMatchData<"ShiftChain">;
+def shift_immed_chain : GICombineRule<
+  (defs root:$d, shift_immed_matchdata:$matchinfo),
+  (match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$d,
+         [{ return Helper.matchShiftImmedChain(*${d}, ${matchinfo}); }]),
+  (apply [{ Helper.applyShiftImmedChain(*${d}, ${matchinfo}); }])>;
+
 def mul_to_shl_matchdata : GIDefMatchData<"unsigned">;
 def mul_to_shl : GICombineRule<
   (defs root:$d, mul_to_shl_matchdata:$matchinfo),
@@ -542,4 +549,5 @@
     not_cmp_fold, opt_brcond_by_inverting_cond,
     unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc,
     unmerge_zext_to_zext, trunc_ext_fold, trunc_shl,
-    const_combines, xor_of_and_with_same_reg, ptr_add_with_zero]>;
+    const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
+    shift_immed_chain]>;
Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1539,6 +1539,73 @@
   return true;
 }
 
+bool CombinerHelper::matchShiftImmedChain(MachineInstr &MI,
+                                          ShiftChain &MatchInfo) {
+  // We're trying to match the following pattern with any of the
+  // G_SHL/G_ASHR/G_LSHR shift instructions:
+  //   %t1 = SHIFT %base, G_CONSTANT imm1
+  //   %root = SHIFT %t1, G_CONSTANT imm2
+  // -->
+  //   %root = SHIFT %base, G_CONSTANT (imm1 + imm2)
+
+  unsigned Opcode = MI.getOpcode();
+  assert((Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_ASHR ||
+          Opcode == TargetOpcode::G_LSHR) &&
+         "Expected G_SHL, G_ASHR or G_LSHR");
+
+  Register Shl2 = MI.getOperand(1).getReg();
+  Register Imm1 = MI.getOperand(2).getReg();
+  auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm1, MRI);
+  if (!MaybeImmVal)
+    return false;
+
+  MachineInstr *Shl2Def = MRI.getUniqueVRegDef(Shl2);
+  if (Shl2Def->getOpcode() != Opcode)
+    return false;
+
+  Register Base = Shl2Def->getOperand(1).getReg();
+  Register Imm2 = Shl2Def->getOperand(2).getReg();
+  auto MaybeImm2Val = getConstantVRegValWithLookThrough(Imm2, MRI);
+  if (!MaybeImm2Val)
+    return false;
+
+  // Pass the combined immediate to the apply function.
+  MatchInfo.Imm = MaybeImmVal->Value + MaybeImm2Val->Value;
+  MatchInfo.Base = Base;
+  return true;
+}
+
+bool CombinerHelper::applyShiftImmedChain(MachineInstr &MI,
+                                          ShiftChain &MatchInfo) {
+  unsigned Opcode = MI.getOpcode();
+  assert((Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_ASHR ||
+          Opcode == TargetOpcode::G_LSHR) &&
+         "Expected G_SHL, G_ASHR or G_LSHR");
+
+  MachineIRBuilder MIB(MI);
+  LLT Ty = MRI.getType(MI.getOperand(1).getReg());
+  auto Imm = MatchInfo.Imm;
+
+  if (Imm >= Ty.getScalarSizeInBits()) {
+    // Any logical shift by at least the scalar size produces zero.
+    if (Opcode != TargetOpcode::G_ASHR) {
+      MIB.buildConstant(MI.getOperand(0), 0);
+      MI.eraseFromParent();
+      return true;
+    }
+    // Arithmetic shifts of scalar size or more only copy the sign bit; clamp.
+    Imm = Ty.getScalarSizeInBits() - 1;
+  }
+
+  LLT ImmTy = MRI.getType(MI.getOperand(2).getReg());
+  Register NewImm = MIB.buildConstant(ImmTy, Imm).getReg(0);
+  Observer.changingInstr(MI);
+  MI.getOperand(1).setReg(MatchInfo.Base);
+  MI.getOperand(2).setReg(NewImm);
+  Observer.changedInstr(MI);
+  return true;
+}
+
 bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
                                           unsigned &ShiftVal) {
   assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -40,8 +40,7 @@
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX8-NEXT:    v_mov_b32_e32 v1, 7
-; GFX8-NEXT:    v_ashrrev_i16_sdwa v0, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX8-NEXT:    v_ashrrev_i16_e32 v0, 15, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_ashr_i8_7:
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
@@ -0,0 +1,164 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_cs i32 @test_shl_1(i32 inreg %arg1) {
+; CHECK-LABEL: test_shl_1:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 5
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = shl i32 %arg1, 2
+  %z2 = shl i32 %z1, 3
+  ret i32 %z2
+}
+
+define amdgpu_cs i32 @test_shl_2(i32 inreg %arg1) {
+; CHECK-LABEL: test_shl_2:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_lshl_b32 s0, s0, 10
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = shl i32 %arg1, 1
+  %z2 = shl i32 %z1, 2
+  %z3 = shl i32 %z2, 3
+  %z4 = shl i32 %z3, 4
+  ret i32 %z4
+}
+
+define amdgpu_cs i32 @test_shl_i32(i32 inreg %arg1) {
+; CHECK-LABEL: test_shl_i32:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = shl i32 %arg1, 10
+  %z2 = shl i32 %z1, 10
+  %z3 = shl i32 %z2, 10
+  %z4 = shl i32 %z3, 10
+  ret i32 %z4
+}
+
+define amdgpu_cs i64 @test_shl_i64(i64 inreg %arg1) {
+; CHECK-LABEL: test_shl_i64:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = shl i64 %arg1, 10
+  %z2 = shl i64 %z1, 10
+  %z3 = shl i64 %z2, 10
+  %z4 = shl i64 %z3, 10
+  %z5 = shl i64 %z4, 10
+  %z6 = shl i64 %z5, 10
+  %z7 = shl i64 %z6, 10
+  ret i64 %z7
+}
+
+define amdgpu_cs i32 @test_ashr_1(i32 inreg %arg1) {
+; CHECK-LABEL: test_ashr_1:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_ashr_i32 s0, s0, 5
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = ashr i32 %arg1, 2
+  %z2 = ashr i32 %z1, 3
+  ret i32 %z2
+}
+
+define amdgpu_cs i32 @test_ashr_2(i32 inreg %arg1) {
+; CHECK-LABEL: test_ashr_2:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_ashr_i32 s0, s0, 10
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = ashr i32 %arg1, 1
+  %z2 = ashr i32 %z1, 2
+  %z3 = ashr i32 %z2, 3
+  %z4 = ashr i32 %z3, 4
+  ret i32 %z4
+}
+
+define amdgpu_cs i32 @test_ashr_i32(i32 inreg %arg1) {
+; CHECK-LABEL: test_ashr_i32:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_ashr_i32 s0, s0, 31
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = ashr i32 %arg1, 10
+  %z2 = ashr i32 %z1, 10
+  %z3 = ashr i32 %z2, 10
+  %z4 = ashr i32 %z3, 10
+  ret i32 %z4
+}
+
+define amdgpu_cs i64 @test_ashr_i64(i64 inreg %arg1) {
+; CHECK-LABEL: test_ashr_i64:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_ashr_i32 s0, s1, 31
+; CHECK-NEXT:    s_mov_b32 s1, s0
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = ashr i64 %arg1, 10
+  %z2 = ashr i64 %z1, 10
+  %z3 = ashr i64 %z2, 10
+  %z4 = ashr i64 %z3, 10
+  %z5 = ashr i64 %z4, 10
+  %z6 = ashr i64 %z5, 10
+  %z7 = ashr i64 %z6, 10
+  ret i64 %z7
+}
+
+define amdgpu_cs i32 @test_lshr_1(i32 inreg %arg1) {
+; CHECK-LABEL: test_lshr_1:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_lshr_b32 s0, s0, 5
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = lshr i32 %arg1, 2
+  %z2 = lshr i32 %z1, 3
+  ret i32 %z2
+}
+
+define amdgpu_cs i32 @test_lshr_2(i32 inreg %arg1) {
+; CHECK-LABEL: test_lshr_2:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_lshr_b32 s0, s0, 10
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = lshr i32 %arg1, 1
+  %z2 = lshr i32 %z1, 2
+  %z3 = lshr i32 %z2, 3
+  %z4 = lshr i32 %z3, 4
+  ret i32 %z4
+}
+
+define amdgpu_cs i32 @test_lshr_i32(i32 inreg %arg1) {
+; CHECK-LABEL: test_lshr_i32:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = lshr i32 %arg1, 10
+  %z2 = lshr i32 %z1, 10
+  %z3 = lshr i32 %z2, 10
+  %z4 = lshr i32 %z3, 10
+  ret i32 %z4
+}
+
+define amdgpu_cs i64 @test_lshr_i64(i64 inreg %arg1) {
+; CHECK-LABEL: test_lshr_i64:
+; CHECK:       ; %bb.0: ; %.entry
+; CHECK-NEXT:    s_mov_b32 s0, 0
+; CHECK-NEXT:    s_mov_b32 s1, 0
+; CHECK-NEXT:    ; return to shader part epilog
+.entry:
+  %z1 = lshr i64 %arg1, 10
+  %z2 = lshr i64 %z1, 10
+  %z3 = lshr i64 %z2, 10
+  %z4 = lshr i64 %z3, 10
+  %z5 = lshr i64 %z4, 10
+  %z6 = lshr i64 %z5, 10
+  %z7 = lshr i64 %z6, 10
+  ret i64 %z7
+}