diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -789,6 +789,9 @@ /// (X ^ Y) != X -> Y != 0 bool matchRedundantBinOpInEquality(MachineInstr &MI, BuildFnTy &MatchInfo); + /// Match shifts greater than or equal to the bitwidth of the operation. + bool matchShiftsTooBig(MachineInstr &MI); + private: /// Given a non-indexed load or store instruction \p MI, find an offset that /// can be usefully and legally folded into it as a post-indexing operation. diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td --- a/llvm/include/llvm/Target/GlobalISel/Combine.td +++ b/llvm/include/llvm/Target/GlobalISel/Combine.td @@ -205,6 +205,12 @@ [{ return Helper.matchPtrAddImmedChain(*${d}, ${matchinfo}); }]), (apply [{ Helper.applyPtrAddImmedChain(*${d}, ${matchinfo}); }])>; +def shifts_too_big : GICombineRule< + (defs root:$root), + (match (wip_match_opcode G_SHL, G_ASHR, G_LSHR):$root, + [{ return Helper.matchShiftsTooBig(*${root}); }]), + (apply [{ Helper.replaceInstWithUndef(*${root}); }])>; + // Fold shift (shift base x), y -> shift base, (x+y), if shifts are same def shift_immed_matchdata : GIDefMatchData<"RegisterImmPair">; def shift_immed_chain : GICombineRule< @@ -1089,7 +1095,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines, extract_vec_elt_combines, combines_for_extload, combine_indexed_load_store, undef_combines, identity_combines, phi_combines, - simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, + simplify_add_to_sub, hoist_logic_op_with_same_opcode_hands, shifts_too_big, reassocs, ptr_add_immed_chain, shl_ashr_to_sext_inreg, sext_inreg_of_load, width_reduction_combines, select_combines, diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp 
b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -6194,6 +6194,16 @@ return CmpInst::isEquality(Pred) && Y.isValid(); } +bool CombinerHelper::matchShiftsTooBig(MachineInstr &MI) { + Register ShiftReg = MI.getOperand(2).getReg(); + LLT ResTy = MRI.getType(MI.getOperand(0).getReg()); + auto IsShiftTooBig = [&](const Constant *C) { + auto *CI = dyn_cast<ConstantInt>(C); + return CI && CI->uge(ResTy.getScalarSizeInBits()); + }; + return matchUnaryPredicate(MRI, ShiftReg, IsShiftTooBig); +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shifts-undef.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shifts-undef.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shifts-undef.mir @@ -0,0 +1,132 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- +name: shl_by_ge_bw +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$w0' } +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: shl_by_ge_bw + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[DEF]](s16) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %1:_(s32) = COPY $w0 + %0:_(s16) = G_TRUNC %1(s32) + %2:_(s16) = G_CONSTANT i16 20 + %3:_(s16) = G_SHL %0, %2(s16) + %4:_(s32) = G_ANYEXT %3(s16) + $w0 = COPY %4(s32) + RET_ReallyLR implicit $w0 + +... 
+--- +name: lshr_by_ge_bw +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$w0' } +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: lshr_by_ge_bw + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[DEF]](s16) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %1:_(s32) = COPY $w0 + %0:_(s16) = G_TRUNC %1(s32) + %2:_(s16) = G_CONSTANT i16 16 + %3:_(s16) = G_LSHR %0, %2(s16) + %4:_(s32) = G_ANYEXT %3(s16) + $w0 = COPY %4(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: ashr_by_ge_bw +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$w0' } +body: | + bb.1: + liveins: $w0 + + ; CHECK-LABEL: name: ashr_by_ge_bw + ; CHECK: liveins: $w0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[DEF]](s16) + ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32) + ; CHECK-NEXT: RET_ReallyLR implicit $w0 + %1:_(s32) = COPY $w0 + %0:_(s16) = G_TRUNC %1(s32) + %2:_(s16) = G_CONSTANT i16 20 + %3:_(s16) = G_ASHR %0, %2(s16) + %4:_(s32) = G_ANYEXT %3(s16) + $w0 = COPY %4(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: shl_by_ge_bw_vector +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: shl_by_ge_bw_vector + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %shl:_(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: $q0 = COPY %shl(<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %1:_(<4 x s32>) = COPY $q0 + %0:_(s32) = G_CONSTANT i32 32 + %bv:_(<4 x s32>) = G_BUILD_VECTOR %0, %0, %0, %0 + %shl:_(<4 x s32>) = G_SHL %1, %bv(<4 x s32>) + $q0 = COPY %shl(<4 x s32>) + RET_ReallyLR implicit $q0 + +... 
+--- +name: shl_by_ge_bw_vector_partial +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$q0' } +body: | + bb.1: + liveins: $q0 + + ; CHECK-LABEL: name: shl_by_ge_bw_vector_partial + ; CHECK: liveins: $q0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 32 + ; CHECK-NEXT: %small:_(s32) = G_CONSTANT i32 4 + ; CHECK-NEXT: %bv:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), %small(s32) + ; CHECK-NEXT: %shl:_(<4 x s32>) = G_SHL [[COPY]], %bv(<4 x s32>) + ; CHECK-NEXT: $q0 = COPY %shl(<4 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit $q0 + %1:_(<4 x s32>) = COPY $q0 + %0:_(s32) = G_CONSTANT i32 32 + %small:_(s32) = G_CONSTANT i32 4 + %bv:_(<4 x s32>) = G_BUILD_VECTOR %0, %0, %0, %small + %shl:_(<4 x s32>) = G_SHL %1, %bv(<4 x s32>) + $q0 = COPY %shl(<4 x s32>) + RET_ReallyLR implicit $q0 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-amdgpu-cvt-f32-ubyte.mir @@ -261,8 +261,9 @@ ; CHECK-LABEL: name: cvt_f32_ubyte0_zext_lshr_16 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %arg:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE2 %arg + ; CHECK-NEXT: %shift:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT %shift(s16) + ; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %zext ; CHECK-NEXT: $vgpr0 = COPY %result(s32) %arg:_(s32) = COPY $vgpr0 %trunc:_(s16) = G_TRUNC %arg @@ -283,8 +284,9 @@ ; CHECK-LABEL: name: cvt_f32_ubyte0_zext_lshr_24 ; CHECK: liveins: $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: %arg:_(s32) = COPY $vgpr0 - ; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE3 %arg + ; CHECK-NEXT: %shift:_(s16) = G_IMPLICIT_DEF + ; CHECK-NEXT: %zext:_(s32) = G_ZEXT 
%shift(s16) + ; CHECK-NEXT: %result:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %zext ; CHECK-NEXT: $vgpr0 = COPY %result(s32) %arg:_(s32) = COPY $vgpr0 %trunc:_(s16) = G_TRUNC %arg diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir @@ -121,10 +121,8 @@ ; CHECK-LABEL: name: narrow_ashr_s64_64 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 - ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY]], [[C]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ASHR]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[DEF]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 64 %2:_(s64) = G_ASHR %0, %1 @@ -141,10 +139,8 @@ ; CHECK-LABEL: name: narrow_ashr_s64_65 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65 - ; CHECK-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY]], [[C]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[ASHR]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[DEF]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 65 %2:_(s64) = G_ASHR %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir @@ -119,10 +119,8 @@ ; CHECK-LABEL: name: narrow_lshr_s64_64 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = 
G_CONSTANT i32 64 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY]], [[C]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[LSHR]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[DEF]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 64 %2:_(s64) = G_LSHR %0, %1 @@ -139,10 +137,8 @@ ; CHECK-LABEL: name: narrow_lshr_s64_65 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65 - ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[COPY]], [[C]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[LSHR]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[DEF]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 65 %2:_(s64) = G_LSHR %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir @@ -120,10 +120,8 @@ ; CHECK-LABEL: name: narrow_shl_s64_64 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[SHL]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[DEF]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 64 %2:_(s64) = G_SHL %0, %1 @@ -140,10 +138,8 @@ ; CHECK-LABEL: name: narrow_shl_s64_65 ; CHECK: liveins: $vgpr0_vgpr1 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65 - ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s32) - ; CHECK-NEXT: $vgpr0_vgpr1 = 
COPY [[SHL]](s64) + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[DEF]](s64) %0:_(s64) = COPY $vgpr0_vgpr1 %1:_(s32) = G_CONSTANT i32 65 %2:_(s64) = G_SHL %0, %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll @@ -889,7 +889,7 @@ ; ; GFX8-LABEL: s_sext_inreg_v4i16_14: ; GFX8: ; %bb.0: -; GFX8-NEXT: s_bfe_u32 s0, 0, 0x100000 +; GFX8-NEXT: s_bfe_u32 s0, -1, 0x100000 ; GFX8-NEXT: s_mov_b32 s1, s0 ; GFX8-NEXT: ; return to shader part epilog ; @@ -978,7 +978,7 @@ ; GFX8-LABEL: v_sext_inreg_v8i16_11: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_bfe_u32 s4, 0, 0x100000 +; GFX8-NEXT: s_bfe_u32 s4, -1, 0x100000 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: v_mov_b32_e32 v2, s4