diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -45,6 +45,12 @@ [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]), (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>; +def insert_vec_elt_to_shuffle : GICombineRule< + (defs root:$insertelt, unsigned_matchinfo:$matchinfo), + (match (wip_match_opcode G_INSERT_VECTOR_ELT):$insertelt, + [{ return PreLegalizerHelper.matchInsertVectorEltToShuffle(*${insertelt}, ${matchinfo}); }]), + (apply [{ PreLegalizerHelper.applyInsertVectorEltToShuffle(*${insertelt}, ${matchinfo}); }])>; + def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">; def clamp_i64_to_i16 : GICombineRule< @@ -109,7 +115,7 @@ def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper< "AMDGPUGenPreLegalizerCombinerHelper", - [all_combines, clamp_i64_to_i16, foldable_fneg]> { + [all_combines, clamp_i64_to_i16, foldable_fneg, insert_vec_elt_to_shuffle]> { let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule"; let StateClass = "AMDGPUPreLegalizerCombinerHelperState"; let AdditionalArguments = []; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp @@ -55,6 +55,9 @@ void applyClampI64ToI16(MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo); + + bool matchInsertVectorEltToShuffle(MachineInstr &MI, unsigned &Idx); + void applyInsertVectorEltToShuffle(MachineInstr &MI, unsigned &Idx); }; bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16( @@ -154,6 +157,73 @@ MI.eraseFromParent(); } +bool AMDGPUPreLegalizerCombinerHelper::matchInsertVectorEltToShuffle( + MachineInstr &MI, unsigned &Idx) { + 
// Transforms a G_INSERT_VECTOR_ELT into an equivalent G_SHUFFLE_VECTOR if: + // - Scalar Pack insts are present (for <32 bits element types) + // - The vector has <= 4 elements. + // as this is a preferred canonical form of the operation. + // + // Note that both restrictions are arbitrary. Currently, it's mostly targeted + // towards 2x16 vectors. Restrictions could be relaxed or entirely removed in + // the future if codegen can handle it without causing regressions. + + LLT VecTy = MRI.getType(MI.getOperand(0).getReg()); + const unsigned EltSize = VecTy.getElementType().getSizeInBits(); + if (EltSize < 32 && + !MI.getMF()->getSubtarget<GCNSubtarget>().hasScalarPackInsts()) + return false; + + if (VecTy.isScalable() || VecTy.getNumElements() > 4) + return false; + + Optional<ValueAndVReg> MaybeIdxVal = + getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); + if (!MaybeIdxVal) + return false; + + Idx = MaybeIdxVal->Value.getZExtValue(); + return true; +} + +void AMDGPUPreLegalizerCombinerHelper::applyInsertVectorEltToShuffle( + MachineInstr &MI, unsigned &Idx) { + B.setInstrAndDebugLoc(MI); + + Register Ins = MI.getOperand(2).getReg(); + Register Vec = MI.getOperand(1).getReg(); + Register Dst = MI.getOperand(0).getReg(); + + LLT VecTy = MRI.getType(Dst); + LLT EltTy = VecTy.getElementType(); + const unsigned NumElts = VecTy.getNumElements(); + + const auto Undef = MRI.createGenericVirtualRegister(EltTy); + B.buildUndef(Undef); + + const auto OtherVec = MRI.createGenericVirtualRegister(VecTy); + + SmallVector<Register, 4> Srcs; + Srcs.push_back(Ins); + for (unsigned K = 1; K < NumElts; ++K) + Srcs.push_back(Undef); + + B.buildBuildVector(OtherVec, Srcs); + + // NumElts == Ins in OtherVec + // 0...(NumElts-1) = Original elements + SmallVector<int, 4> ShuffleMask; + for (unsigned CurIdx = 0; CurIdx < NumElts; ++CurIdx) { + if (CurIdx == Idx) + ShuffleMask.push_back(NumElts); + else + ShuffleMask.push_back(CurIdx); + } + + B.buildShuffleVector(Dst, Vec, OtherVec, ShuffleMask); + Helper.eraseInst(MI); 
+} + class AMDGPUPreLegalizerCombinerHelperState { protected: AMDGPUCombinerHelper &Helper; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir @@ -0,0 +1,169 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GFX9PLUS +# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,VI + +--- +name: test_v2s16_idx0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GFX9PLUS-LABEL: name: test_v2s16_idx0 + ; GFX9PLUS: liveins: $vgpr0 + ; GFX9PLUS-NEXT: {{ $}} + ; GFX9PLUS-NEXT: %src:_(<2 x s16>) = COPY $vgpr0 + ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42 + ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16) + ; GFX9PLUS-NEXT: %ins:_(<2 x s16>) = G_SHUFFLE_VECTOR %src(<2 x s16>), [[BUILD_VECTOR]], shufflemask(2, 1) + ; GFX9PLUS-NEXT: $vgpr0 = COPY %ins(<2 x s16>) + ; VI-LABEL: name: test_v2s16_idx0 + ; VI: liveins: $vgpr0 + ; VI-NEXT: {{ $}} + ; VI-NEXT: %src:_(<2 x s16>) = COPY $vgpr0 + ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 0 + ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42 + ; VI-NEXT: %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt(s16), %idx(s32) + ; VI-NEXT: $vgpr0 = COPY %ins(<2 x s16>) + %src:_(<2 x s16>) = COPY $vgpr0 + %idx:_(s32) = G_CONSTANT i32 0 + %elt:_(s16) = G_CONSTANT i16 42 + %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx + $vgpr0 = COPY %ins 
+... + +--- +name: test_v2s16_idx1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; GFX9PLUS-LABEL: name: test_v2s16_idx1 + ; GFX9PLUS: liveins: $vgpr0 + ; GFX9PLUS-NEXT: {{ $}} + ; GFX9PLUS-NEXT: %src:_(<2 x s16>) = COPY $vgpr0 + ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42 + ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16) + ; GFX9PLUS-NEXT: %ins:_(<2 x s16>) = G_SHUFFLE_VECTOR %src(<2 x s16>), [[BUILD_VECTOR]], shufflemask(0, 2) + ; GFX9PLUS-NEXT: $vgpr0 = COPY %ins(<2 x s16>) + ; VI-LABEL: name: test_v2s16_idx1 + ; VI: liveins: $vgpr0 + ; VI-NEXT: {{ $}} + ; VI-NEXT: %src:_(<2 x s16>) = COPY $vgpr0 + ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 1 + ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42 + ; VI-NEXT: %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt(s16), %idx(s32) + ; VI-NEXT: $vgpr0 = COPY %ins(<2 x s16>) + %src:_(<2 x s16>) = COPY $vgpr0 + %idx:_(s32) = G_CONSTANT i32 1 + %elt:_(s16) = G_CONSTANT i16 42 + %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx + $vgpr0 = COPY %ins +... + +--- +name: test_v2s16_idx2_nofold +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + ; CHECK-LABEL: name: test_v2s16_idx2_nofold + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %ins:_(<2 x s16>) = G_IMPLICIT_DEF + ; CHECK-NEXT: $vgpr0 = COPY %ins(<2 x s16>) + %src:_(<2 x s16>) = COPY $vgpr0 + %idx:_(s32) = G_CONSTANT i32 2 + %elt:_(s16) = G_CONSTANT i16 42 + %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx + $vgpr0 = COPY %ins +... 
+ +--- +name: test_v3s16_idx2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2 + ; GFX9PLUS-LABEL: name: test_v3s16_idx2 + ; GFX9PLUS: liveins: $vgpr0_vgpr1_vgpr2 + ; GFX9PLUS-NEXT: {{ $}} + ; GFX9PLUS-NEXT: %src:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; GFX9PLUS-NEXT: %truncsrc:_(<3 x s16>) = G_TRUNC %src(<3 x s32>) + ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42 + ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<3 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16), [[DEF]](s16) + ; GFX9PLUS-NEXT: %ins:_(<3 x s16>) = G_SHUFFLE_VECTOR %truncsrc(<3 x s16>), [[BUILD_VECTOR]], shufflemask(0, 1, 3) + ; GFX9PLUS-NEXT: %zextins:_(<3 x s32>) = G_ZEXT %ins(<3 x s16>) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1_vgpr2 = COPY %zextins(<3 x s32>) + ; VI-LABEL: name: test_v3s16_idx2 + ; VI: liveins: $vgpr0_vgpr1_vgpr2 + ; VI-NEXT: {{ $}} + ; VI-NEXT: %src:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + ; VI-NEXT: %truncsrc:_(<3 x s16>) = G_TRUNC %src(<3 x s32>) + ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 2 + ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42 + ; VI-NEXT: %ins:_(<3 x s16>) = G_INSERT_VECTOR_ELT %truncsrc, %elt(s16), %idx(s32) + ; VI-NEXT: %zextins:_(<3 x s32>) = G_ZEXT %ins(<3 x s16>) + ; VI-NEXT: $vgpr0_vgpr1_vgpr2 = COPY %zextins(<3 x s32>) + %src:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2 + %truncsrc:_(<3 x s16>) = G_TRUNC %src + %idx:_(s32) = G_CONSTANT i32 2 + %elt:_(s16) = G_CONSTANT i16 42 + %ins:_(<3 x s16>) = G_INSERT_VECTOR_ELT %truncsrc, %elt, %idx + %zextins:_(<3 x s32>) = G_ZEXT %ins + $vgpr0_vgpr1_vgpr2 = COPY %zextins +... 
+ +--- +name: test_v2s32_idx1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; CHECK-LABEL: name: test_v2s32_idx1 + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: %src:_(<2 x s32>) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: %elt:_(s32) = G_CONSTANT i32 42 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR %elt(s32), [[DEF]](s32) + ; CHECK-NEXT: %ins:_(<2 x s32>) = G_SHUFFLE_VECTOR %src(<2 x s32>), [[BUILD_VECTOR]], shufflemask(0, 2) + ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %ins(<2 x s32>) + %src:_(<2 x s32>) = COPY $vgpr0_vgpr1 + %idx:_(s32) = G_CONSTANT i32 1 + %elt:_(s32) = G_CONSTANT i32 42 + %ins:_(<2 x s32>) = G_INSERT_VECTOR_ELT %src, %elt, %idx + $vgpr0_vgpr1 = COPY %ins +... + +--- +name: test_v4s16_idx3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + ; GFX9PLUS-LABEL: name: test_v4s16_idx3 + ; GFX9PLUS: liveins: $vgpr0_vgpr1 + ; GFX9PLUS-NEXT: {{ $}} + ; GFX9PLUS-NEXT: %src:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42 + ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF + ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16) + ; GFX9PLUS-NEXT: %ins:_(<4 x s16>) = G_SHUFFLE_VECTOR %src(<4 x s16>), [[BUILD_VECTOR]], shufflemask(0, 1, 2, 4) + ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY %ins(<4 x s16>) + ; VI-LABEL: name: test_v4s16_idx3 + ; VI: liveins: $vgpr0_vgpr1 + ; VI-NEXT: {{ $}} + ; VI-NEXT: %src:_(<4 x s16>) = COPY $vgpr0_vgpr1 + ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 3 + ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42 + ; VI-NEXT: %ins:_(<4 x s16>) = G_INSERT_VECTOR_ELT %src, %elt(s16), %idx(s32) + ; VI-NEXT: $vgpr0_vgpr1 = COPY %ins(<4 x s16>) + %src:_(<4 x s16>) = COPY $vgpr0_vgpr1 + %idx:_(s32) = G_CONSTANT i32 3 + %elt:_(s16) = G_CONSTANT i16 42 + %ins:_(<4 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx + $vgpr0_vgpr1 
= COPY %ins +...