Index: llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h =================================================================== --- llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -59,6 +59,13 @@ Register Base; }; +struct ShiftOfShiftedLogic { + MachineInstr *Logic; + MachineInstr *Shift2; + Register LogicNonShiftReg; + uint64_t ValSum; +}; + struct RegisterImmPair { Register Reg; int64_t Imm; @@ -236,6 +243,15 @@ bool matchShiftImmedChain(MachineInstr &MI, ShiftChain &MatchInfo); bool applyShiftImmedChain(MachineInstr &MI, ShiftChain &MatchInfo); + /// If we have a shift-by-constant of a bitwise logic op that itself has a + /// shift-by-constant operand with identical opcode, we may be able to convert + /// that into 2 independent shifts followed by the logic op. This is a + /// throughput improvement. + bool matchShiftOfShiftedLogic(MachineInstr &MI, + ShiftOfShiftedLogic &MatchInfo); + bool applyShiftOfShiftedLogic(MachineInstr &MI, + ShiftOfShiftedLogic &MatchInfo); + /// Transform a multiply by a power-of-2 value to a left shift. bool matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); bool applyCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal); Index: llvm/include/llvm/Target/GlobalISel/Combine.td =================================================================== --- llvm/include/llvm/Target/GlobalISel/Combine.td +++ llvm/include/llvm/Target/GlobalISel/Combine.td @@ -165,6 +165,13 @@ [{ return Helper.matchShiftImmedChain(*${d}, ${matchinfo}); }]), (apply [{ Helper.applyShiftImmedChain(*${d}, ${matchinfo}); }])>; +def shift_of_shifted_logic_matchdata : GIDefMatchData<"ShiftOfShiftedLogic">; +def shift_of_shifted_logic_chain : GICombineRule< + (defs root:$d, shift_of_shifted_logic_matchdata:$matchinfo), + (match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$d, + [{ return Helper.matchShiftOfShiftedLogic(*${d}, ${matchinfo}); }]), + (apply [{ Helper.applyShiftOfShiftedLogic(*${d}, ${matchinfo}); }])>; + def mul_to_shl_matchdata : GIDefMatchData<"unsigned">; def mul_to_shl : GICombineRule< (defs root:$d, mul_to_shl_matchdata:$matchinfo), @@ -550,4 +557,4 @@ unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc, unmerge_zext_to_zext, trunc_ext_fold, trunc_shl, const_combines, xor_of_and_with_same_reg, ptr_add_with_zero, - shift_immed_chain]>; + shift_immed_chain, shift_of_shifted_logic_chain]>; Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp =================================================================== --- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -1604,6 +1604,131 @@ return true; } + +bool CombinerHelper::matchShiftOfShiftedLogic(MachineInstr &MI, + ShiftOfShiftedLogic &MatchInfo) { + // We're trying to match the following pattern with any of G_SHL/G_ASHR/G_LSHR + // shift instructions in combination with any of G_AND/G_OR/G_XOR logic + // instructions. + // %t1 = SHIFT %X, G_CONSTANT C0 + // %t2 = LOGIC %t1, %Y + // %root = SHIFT %t2, G_CONSTANT C1 + // --> + // %t3 = SHIFT %X, G_CONSTANT (C0+C1) + // %t4 = SHIFT %Y, G_CONSTANT C1 + // %root = LOGIC %t3, %t4 + unsigned ShiftOpcode = MI.getOpcode(); + if (ShiftOpcode != TargetOpcode::G_SHL && + ShiftOpcode != TargetOpcode::G_ASHR && + ShiftOpcode != TargetOpcode::G_LSHR) + return false; + + // Match a one-use bitwise logic op. 
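+  // Restricting the logic op to a single use guarantees that it (and the
+  // one-use shift matched below) becomes dead once the root shift is
+  // rewritten, so the combine never duplicates instructions.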
+  Register LogicDest = MI.getOperand(1).getReg();
+  if (!MRI.hasOneNonDBGUse(LogicDest))
+    return false;
+
+  MachineInstr *LogicMI = MRI.getUniqueVRegDef(LogicDest);
+  unsigned LogicOpcode = LogicMI->getOpcode();
+  if (LogicOpcode != TargetOpcode::G_AND && LogicOpcode != TargetOpcode::G_OR &&
+      LogicOpcode != TargetOpcode::G_XOR)
+    return false;
+
+  // Find a matching one-use shift by constant.
+  const Register C1 = MI.getOperand(2).getReg();
+  auto MaybeImmVal = getConstantVRegValWithLookThrough(C1, MRI);
+  if (!MaybeImmVal)
+    return false;
+
+  const uint64_t C1Val = MaybeImmVal->Value;
+  const unsigned BitWidth = MRI.getType(C1).getScalarSizeInBits();
+
+  auto matchFirstShift = [&](const MachineInstr *MI, uint64_t &ShiftVal) {
+    // The shift must have the same opcode as the root and a single use.
+    if (MI->getOpcode() != ShiftOpcode ||
+        !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+      return false;
+
+    const Register Imm = MI->getOperand(2).getReg();
+
+    // Shift amount types do not have to match the type of the shifted value,
+    // so check that the two shift constants have the same width.
+    if (MRI.getType(Imm).getScalarSizeInBits() != BitWidth)
+      return false;
+
+    // Must be a constant.
+    auto MaybeImmVal = getConstantVRegValWithLookThrough(Imm, MRI);
+    if (!MaybeImmVal)
+      return false;
+    ShiftVal = MaybeImmVal->Value;
+
+    // The fold is not valid if the sum of the shift values would be an
+    // out-of-range shift amount (i.e. >= the bit width).
+    if ((ShiftVal + C1Val) >= BitWidth)
+      return false;
+
+    return true;
+  };
+
+  // Logic ops are commutative, so check each operand for a match.
+  Register LogicMIReg1 = LogicMI->getOperand(1).getReg();
+  MachineInstr *LogicMIOp1 = MRI.getUniqueVRegDef(LogicMIReg1);
+  Register LogicMIReg2 = LogicMI->getOperand(2).getReg();
+  MachineInstr *LogicMIOp2 = MRI.getUniqueVRegDef(LogicMIReg2);
+  uint64_t C0Val;
+
+  if (matchFirstShift(LogicMIOp1, C0Val)) {
+    MatchInfo.LogicNonShiftReg = LogicMIReg2;
+    MatchInfo.Shift2 = LogicMIOp1;
+  } else if (matchFirstShift(LogicMIOp2, C0Val)) {
+    MatchInfo.LogicNonShiftReg = LogicMIReg1;
+    MatchInfo.Shift2 = LogicMIOp2;
+  } else
+    return false;
+
+  MatchInfo.Logic = LogicMI;
+  MatchInfo.ValSum = C0Val + C1Val;
+
+  return true;
+}
+
+bool CombinerHelper::applyShiftOfShiftedLogic(MachineInstr &MI,
+                                              ShiftOfShiftedLogic &MatchInfo) {
+  unsigned Opcode = MI.getOpcode();
+  assert((Opcode == TargetOpcode::G_SHL || Opcode == TargetOpcode::G_ASHR ||
+          Opcode == TargetOpcode::G_LSHR) &&
+         "Expected G_SHL, G_ASHR or G_LSHR");
+
+  MachineIRBuilder MIB(MI);
+  LLT ShlType = MRI.getType(MI.getOperand(2).getReg());
+
+  Register Const = MIB.buildConstant(ShlType, MatchInfo.ValSum).getReg(0);
+
+  Register Shift1Base = MatchInfo.Shift2->getOperand(1).getReg();
+  Register Shift1 =
+      MIB.buildInstr(Opcode, {ShlType}, {Shift1Base, Const}).getReg(0);
+
+  Register Shift2Const = MI.getOperand(2).getReg();
+  Register Shift2 = MIB.buildInstr(Opcode, {ShlType},
+                                   {MatchInfo.LogicNonShiftReg, Shift2Const})
+                        .getReg(0);
+
+  Register Dest = MI.getOperand(0).getReg();
+  MIB.buildInstr(MatchInfo.Logic->getOpcode(), {Dest}, {Shift1, Shift2});
+
+  // The G_CONSTANT for C0 may be shared, so only erase it if the matched shift
+  // was its sole user.
+  Register ConstC0 = MatchInfo.Shift2->getOperand(2).getReg();
+  if (MRI.hasOneNonDBGUse(ConstC0))
+    MRI.getUniqueVRegDef(ConstC0)->eraseFromParent();
+
+  // The matched shift and the logic op each had a single use, so it is safe to
+  // remove them.
+ MatchInfo.Shift2->eraseFromParent(); + MatchInfo.Logic->eraseFromParent(); + + MI.eraseFromParent(); + return true; +} + bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI, unsigned &ShiftVal) { assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL"); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll @@ -0,0 +1,348 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -verify-machineinstrs < %s | FileCheck %s + +define amdgpu_cs i32 @test_shl_and_1(i32 inreg %arg1) { +; CHECK-LABEL: test_shl_and_1: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshl_b32 s0, s0, 4 +; CHECK-NEXT: s_and_b32 s0, s0, -16 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = shl i32 %arg1, 2 + %z2 = and i32 %z1, 1073741820 + %z3 = shl i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_shl_and_2(i32 inreg %arg1) { +; CHECK-LABEL: test_shl_and_2: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshl_b32 s0, s0, 8 +; CHECK-NEXT: s_and_b32 s0, s0, 0xffffff00 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = shl i32 %arg1, 5 + %z2 = and i32 %z1, 536870880 + %z3 = shl i32 %z2, 3 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_shl_and_3(i32 inreg %arg1) { +; CHECK-LABEL: test_shl_and_3: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshl_b32 s0, s0, 5 +; CHECK-NEXT: s_and_b32 s0, s0, 0x7ffffff0 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = shl i32 %arg1, 3 + %z2 = and i32 %z1, 536870908 + %z3 = shl i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_lshr_and_1(i32 inreg %arg1) { +; CHECK-LABEL: test_lshr_and_1: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshr_b32 s0, s0, 4 +; CHECK-NEXT: s_and_b32 s0, s0, 0xfffffff +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = lshr i32 %arg1, 2 + %z2 = and i32 %z1, 1073741820 + %z3 = lshr i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_lshr_and_2(i32 inreg %arg1) { +; CHECK-LABEL: test_lshr_and_2: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshr_b32 s0, s0, 8 +; CHECK-NEXT: s_and_b32 s0, s0, 0x3fffffc +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = lshr i32 %arg1, 5 + %z2 = and i32 %z1, 536870880 + %z3 = lshr i32 %z2, 3 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_lshr_and_3(i32 inreg %arg1) { +; CHECK-LABEL: test_lshr_and_3: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshr_b32 s0, s0, 5 +; CHECK-NEXT: s_and_b32 s0, s0, 0x7ffffff +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = lshr i32 %arg1, 3 + %z2 = and i32 %z1, 536870908 + %z3 = lshr i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_ashr_and_1(i32 inreg %arg1) { +; CHECK-LABEL: test_ashr_and_1: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_ashr_i32 s0, s0, 4 +; CHECK-NEXT: s_and_b32 s0, s0, 0xfffffff +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = ashr i32 %arg1, 2 + %z2 = and i32 %z1, 1073741820 + %z3 = ashr i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_ashr_and_2(i32 inreg %arg1) { +; CHECK-LABEL: test_ashr_and_2: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_ashr_i32 s0, s0, 8 +; CHECK-NEXT: s_and_b32 s0, s0, 0x3fffffc +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = ashr i32 %arg1, 5 + %z2 = and i32 %z1, 536870880 + %z3 = ashr i32 %z2, 3 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_ashr_and_3(i32 inreg 
%arg1) { +; CHECK-LABEL: test_ashr_and_3: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_ashr_i32 s0, s0, 5 +; CHECK-NEXT: s_and_b32 s0, s0, 0x7ffffff +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = ashr i32 %arg1, 3 + %z2 = and i32 %z1, 536870908 + %z3 = ashr i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_shl_or_1(i32 inreg %arg1) { +; CHECK-LABEL: test_shl_or_1: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshl_b32 s0, s0, 4 +; CHECK-NEXT: s_or_b32 s0, s0, 12 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = shl i32 %arg1, 2 + %z2 = or i32 %z1, 3221225475 + %z3 = shl i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_shl_or_2(i32 inreg %arg1) { +; CHECK-LABEL: test_shl_or_2: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshl_b32 s0, s0, 8 +; CHECK-NEXT: s_or_b32 s0, s0, 0xfffffc00 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = shl i32 %arg1, 3 + %z2 = or i32 %z1, 536870880 + %z3 = shl i32 %z2, 5 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_shl_or_3(i32 inreg %arg1) { +; CHECK-LABEL: test_shl_or_3: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshl_b32 s0, s0, 5 +; CHECK-NEXT: s_or_b32 s0, s0, 0x7fffff80 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = shl i32 %arg1, 2 + %z2 = or i32 %z1, 268435440 + %z3 = shl i32 %z2, 3 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_lshr_or_1(i32 inreg %arg1) { +; CHECK-LABEL: test_lshr_or_1: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshr_b32 s0, s0, 4 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = lshr i32 %arg1, 2 + %z2 = or i32 %z1, 3 + %z3 = lshr i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_lshr_or_2(i32 inreg %arg1) { +; CHECK-LABEL: test_lshr_or_2: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshr_b32 s0, s0, 8 +; CHECK-NEXT: s_or_b32 s0, s0, 0xffffff +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = lshr i32 %arg1, 3 + %z2 = or i32 %z1, 536870880 + %z3 = lshr i32 %z2, 5 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_lshr_or_3(i32 inreg %arg1) { +; CHECK-LABEL: test_lshr_or_3: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshr_b32 s0, s0, 5 +; CHECK-NEXT: s_or_b32 s0, s0, 0x1fffffe +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = lshr i32 %arg1, 2 + %z2 = or i32 %z1, 268435440 + %z3 = lshr i32 %z2, 3 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_ashr_or_1(i32 inreg %arg1) { +; CHECK-LABEL: test_ashr_or_1: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_ashr_i32 s0, s0, 4 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = ashr i32 %arg1, 2 + %z2 = or i32 %z1, 3 + %z3 = ashr i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_ashr_or_2(i32 inreg %arg1) { +; CHECK-LABEL: test_ashr_or_2: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_ashr_i32 s0, s0, 8 +; CHECK-NEXT: s_or_b32 s0, s0, 0xffffff +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = ashr i32 %arg1, 3 + %z2 = or i32 %z1, 536870880 + %z3 = ashr i32 %z2, 5 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_ashr_or_3(i32 inreg %arg1) { +; CHECK-LABEL: test_ashr_or_3: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_ashr_i32 s0, s0, 5 +; CHECK-NEXT: s_or_b32 s0, s0, 0x1fffffe +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = ashr i32 %arg1, 2 + %z2 = or i32 %z1, 268435440 + %z3 = ashr i32 %z2, 3 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_shl_xor_1(i32 inreg %arg1) { +; CHECK-LABEL: test_shl_xor_1: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshl_b32 s0, s0, 4 +; CHECK-NEXT: s_xor_b32 s0, s0, -16 +; CHECK-NEXT: ; return to 
shader part epilog +.entry: + %z1 = shl i32 %arg1, 2 + %z2 = xor i32 %z1, 1073741820 + %z3 = shl i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_shl_xor_2(i32 inreg %arg1) { +; CHECK-LABEL: test_shl_xor_2: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshl_b32 s0, s0, 6 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = shl i32 %arg1, 1 + %z2 = xor i32 %z1, 4160749568 + %z3 = shl i32 %z2, 5 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_shl_xor_3(i32 inreg %arg1) { +; CHECK-LABEL: test_shl_xor_3: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshl_b32 s0, s0, 5 +; CHECK-NEXT: s_xor_b32 s0, s0, 56 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = shl i32 %arg1, 2 + %z2 = xor i32 %z1, 3221225479 + %z3 = shl i32 %z2, 3 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_lshr_xor_1(i32 inreg %arg1) { +; CHECK-LABEL: test_lshr_xor_1: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshr_b32 s0, s0, 4 +; CHECK-NEXT: s_xor_b32 s0, s0, 0xfffffff +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = lshr i32 %arg1, 2 + %z2 = xor i32 %z1, 1073741820 + %z3 = lshr i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_lshr_xor_2(i32 inreg %arg1) { +; CHECK-LABEL: test_lshr_xor_2: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshr_b32 s0, s0, 6 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = lshr i32 %arg1, 1 + %z2 = xor i32 %z1, 31 + %z3 = lshr i32 %z2, 5 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_lshr_xor_3(i32 inreg %arg1) { +; CHECK-LABEL: test_lshr_xor_3: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_lshr_b32 s0, s0, 5 +; CHECK-NEXT: s_xor_b32 s0, s0, 0x18000000 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = lshr i32 %arg1, 2 + %z2 = xor i32 %z1, 3221225479 + %z3 = lshr i32 %z2, 3 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_ashr_xor_1(i32 inreg %arg1) { +; CHECK-LABEL: test_ashr_xor_1: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_ashr_i32 s0, s0, 4 +; CHECK-NEXT: s_xor_b32 s0, s0, 0xfffffff +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = ashr i32 %arg1, 2 + %z2 = xor i32 %z1, 1073741820 + %z3 = ashr i32 %z2, 2 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_ashr_xor_2(i32 inreg %arg1) { +; CHECK-LABEL: test_ashr_xor_2: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_ashr_i32 s0, s0, 6 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = ashr i32 %arg1, 1 + %z2 = xor i32 %z1, 31 + %z3 = ashr i32 %z2, 5 + ret i32 %z3 +} + +define amdgpu_cs i32 @test_ashr_xor_3(i32 inreg %arg1) { +; CHECK-LABEL: test_ashr_xor_3: +; CHECK: ; %bb.0: ; %.entry +; CHECK-NEXT: s_ashr_i32 s0, s0, 5 +; CHECK-NEXT: s_xor_b32 s0, s0, 0xf8000000 +; CHECK-NEXT: ; return to shader part epilog +.entry: + %z1 = ashr i32 %arg1, 2 + %z2 = xor i32 %z1, 3221225479 + %z3 = ashr i32 %z2, 3 + ret i32 %z3 +}
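+
+; Illustrative note (a worked example, not produced by the update script): the
+; combine folds the two shift amounts into one shift of the variable operand
+; and shifts the logic op's other operand by the outer amount. For
+; test_shl_and_1 above:
+;   ((%arg1 << 2) & 0x3ffffffc) << 2
+;     --> (%arg1 << (2 + 2)) & (0x3ffffffc << 2)
+;     --> (%arg1 << 4) & 0xfffffff0
+; and 0xfffffff0 is -16 as a signed 32-bit immediate, which is exactly the
+; s_lshl_b32/s_and_b32 pair expected by the CHECK lines.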