diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -45,6 +45,12 @@
     [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
   (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
 
+def insert_vec_elt_to_shuffle : GICombineRule<
+  (defs root:$insertelt, unsigned_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_INSERT_VECTOR_ELT):$insertelt,
+    [{ return PreLegalizerHelper.matchInsertVectorEltToShuffle(*${insertelt}, ${matchinfo}); }]),
+  (apply [{ PreLegalizerHelper.applyInsertVectorEltToShuffle(*${insertelt}, ${matchinfo}); }])>;
+
 def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
 
 def clamp_i64_to_i16 : GICombineRule<
@@ -109,7 +115,7 @@
 
 def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
   "AMDGPUGenPreLegalizerCombinerHelper",
-  [all_combines, clamp_i64_to_i16, foldable_fneg]> {
+  [all_combines, clamp_i64_to_i16, foldable_fneg, insert_vec_elt_to_shuffle]> {
   let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
   let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
   let AdditionalArguments = [];
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -55,6 +55,9 @@
 
   void applyClampI64ToI16(MachineInstr &MI,
                           const ClampI64ToI16MatchInfo &MatchInfo);
+
+  bool matchInsertVectorEltToShuffle(MachineInstr &MI, unsigned &Idx);
+  void applyInsertVectorEltToShuffle(MachineInstr &MI, unsigned &Idx);
 };
 
 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
@@ -154,6 +157,65 @@
   MI.eraseFromParent();
 }
 
+// Matches a G_INSERT_VECTOR_ELT that is profitable to rewrite as a
+// G_SHUFFLE_VECTOR: the element type is s16, the vector's total size is a
+// multiple of 32 bits, the lane index is a known constant, and the subtarget
+// has scalar pack instructions to lower the resulting shuffle. On success,
+// Idx receives the constant lane index.
+bool AMDGPUPreLegalizerCombinerHelper::matchInsertVectorEltToShuffle(
+    MachineInstr &MI, unsigned &Idx) {
+  // This is only beneficial if scalar pack insts are present.
+  if (!MI.getMF()->getSubtarget<GCNSubtarget>().hasScalarPackInsts())
+    return false;
+
+  // TODO: Only on small vectors?
+  LLT VecTy = MRI.getType(MI.getOperand(0).getReg());
+  if (VecTy.getElementType() != LLT::scalar(16) ||
+      (VecTy.getSizeInBits() % 32) != 0)
+    return false;
+
+  // Operand 3 of G_INSERT_VECTOR_ELT is the lane index; it must be a
+  // constant (looking through copies/truncs) for a static shuffle mask.
+  Optional<ValueAndVReg> MaybeIdxVal =
+      getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
+  if (!MaybeIdxVal)
+    return false;
+
+  Idx = MaybeIdxVal->Value.getZExtValue();
+  return true;
+}
+
+// Rewrites the insert as:
+//   OtherVec = G_BUILD_VECTOR Ins, undef, ..., undef
+//   Dst      = G_SHUFFLE_VECTOR Vec, OtherVec, mask
+// where the mask selects source lane NumElts (i.e. OtherVec[0] == Ins) at
+// position Idx and the original lane everywhere else.
+void AMDGPUPreLegalizerCombinerHelper::applyInsertVectorEltToShuffle(
+    MachineInstr &MI, unsigned &Idx) {
+  B.setInstrAndDebugLoc(MI);
+
+  Register Ins = MI.getOperand(2).getReg();
+  Register Vec = MI.getOperand(1).getReg();
+  Register Dst = MI.getOperand(0).getReg();
+
+  LLT VecTy = MRI.getType(Dst);
+  LLT EltTy = VecTy.getElementType();
+  const unsigned NumElts = VecTy.getNumElements();
+
+  const auto Undef = MRI.createGenericVirtualRegister(EltTy);
+  B.buildUndef(Undef);
+
+  const auto OtherVec = MRI.createGenericVirtualRegister(VecTy);
+
+  SmallVector<Register, 8> Srcs;
+  Srcs.push_back(Ins);
+  for (unsigned K = 1; K < NumElts; ++K)
+    Srcs.push_back(Undef);
+
+  B.buildBuildVector(OtherVec, Srcs);
+
+  // NumElts == Ins in OtherVec
+  // 0...(NumElts-1) = Original elements
+  SmallVector<int, 8> ShuffleMask;
+  for (unsigned CurIdx = 0; CurIdx < NumElts; ++CurIdx) {
+    if (CurIdx == Idx)
+      ShuffleMask.push_back(NumElts);
+    else
+      ShuffleMask.push_back(CurIdx);
+  }
+
+  B.buildShuffleVector(Dst, Vec, OtherVec, ShuffleMask);
+  Helper.eraseInst(MI);
+}
+
 class AMDGPUPreLegalizerCombinerHelperState {
 protected:
   AMDGPUCombinerHelper &Helper;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/prelegalizer-combiner-insertvecelt-to-shufflevector.mir
@@ -0,0
+1,157 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GFX9PLUS
+# RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,VI
+
+# Insert of s16 into lane 0 of <2 x s16>. gfx900 (has scalar pack insts)
+# folds to G_BUILD_VECTOR + G_SHUFFLE_VECTOR, with mask index 2 picking the
+# inserted element (lane 0 of the second vector); fiji (no scalar pack
+# insts) must keep the G_INSERT_VECTOR_ELT untouched.
+---
+name: test_v2s16_idx0
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GFX9PLUS-LABEL: name: test_v2s16_idx0
+    ; GFX9PLUS: liveins: $vgpr0
+    ; GFX9PLUS-NEXT: {{ $}}
+    ; GFX9PLUS-NEXT: %src:_(<2 x s16>) = COPY $vgpr0
+    ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16)
+    ; GFX9PLUS-NEXT: %ins:_(<2 x s16>) = G_SHUFFLE_VECTOR %src(<2 x s16>), [[BUILD_VECTOR]], shufflemask(2, 1)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY %ins(<2 x s16>)
+    ; VI-LABEL: name: test_v2s16_idx0
+    ; VI: liveins: $vgpr0
+    ; VI-NEXT: {{ $}}
+    ; VI-NEXT: %src:_(<2 x s16>) = COPY $vgpr0
+    ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 0
+    ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+    ; VI-NEXT: %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt(s16), %idx(s32)
+    ; VI-NEXT: $vgpr0 = COPY %ins(<2 x s16>)
+    %src:_(<2 x s16>) = COPY $vgpr0
+    %idx:_(s32) = G_CONSTANT i32 0
+    %elt:_(s16) = G_CONSTANT i16 42
+    %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx
+    $vgpr0 = COPY %ins
+...
+
+# Insert into lane 1 of <2 x s16>: gfx900 folds to a shuffle taking lane 1
+# from the inserted vector (mask index 2); fiji keeps the insert.
+---
+name: test_v2s16_idx1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; GFX9PLUS-LABEL: name: test_v2s16_idx1
+    ; GFX9PLUS: liveins: $vgpr0
+    ; GFX9PLUS-NEXT: {{ $}}
+    ; GFX9PLUS-NEXT: %src:_(<2 x s16>) = COPY $vgpr0
+    ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16)
+    ; GFX9PLUS-NEXT: %ins:_(<2 x s16>) = G_SHUFFLE_VECTOR %src(<2 x s16>), [[BUILD_VECTOR]], shufflemask(0, 2)
+    ; GFX9PLUS-NEXT: $vgpr0 = COPY %ins(<2 x s16>)
+    ; VI-LABEL: name: test_v2s16_idx1
+    ; VI: liveins: $vgpr0
+    ; VI-NEXT: {{ $}}
+    ; VI-NEXT: %src:_(<2 x s16>) = COPY $vgpr0
+    ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 1
+    ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+    ; VI-NEXT: %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt(s16), %idx(s32)
+    ; VI-NEXT: $vgpr0 = COPY %ins(<2 x s16>)
+    %src:_(<2 x s16>) = COPY $vgpr0
+    %idx:_(s32) = G_CONSTANT i32 1
+    %elt:_(s16) = G_CONSTANT i16 42
+    %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx
+    $vgpr0 = COPY %ins
+...
+
+# Constant index 2 is out of bounds for <2 x s16>. The checked output is
+# G_IMPLICIT_DEF — presumably another combine folds the poison insert;
+# the point here is that no shuffle is emitted on either target.
+---
+name: test_v2s16_idx2_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+    ; CHECK-LABEL: name: test_v2s16_idx2_nofold
+    ; CHECK: liveins: $vgpr0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %ins:_(<2 x s16>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: $vgpr0 = COPY %ins(<2 x s16>)
+    %src:_(<2 x s16>) = COPY $vgpr0
+    %idx:_(s32) = G_CONSTANT i32 2
+    %elt:_(s16) = G_CONSTANT i16 42
+    %ins:_(<2 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx
+    $vgpr0 = COPY %ins
+...
+
+# <3 x s16> is 48 bits, which is not a multiple of 32, so the combine must
+# not fire on either target and the G_INSERT_VECTOR_ELT is preserved.
+---
+name: test_v3s16_idx2_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2
+    ; CHECK-LABEL: name: test_v3s16_idx2_nofold
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %src:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+    ; CHECK-NEXT: %truncsrc:_(<3 x s16>) = G_TRUNC %src(<3 x s32>)
+    ; CHECK-NEXT: %idx:_(s32) = G_CONSTANT i32 2
+    ; CHECK-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+    ; CHECK-NEXT: %ins:_(<3 x s16>) = G_INSERT_VECTOR_ELT %truncsrc, %elt(s16), %idx(s32)
+    ; CHECK-NEXT: %zextins:_(<3 x s32>) = G_ZEXT %ins(<3 x s16>)
+    ; CHECK-NEXT: $vgpr0_vgpr1_vgpr2 = COPY %zextins(<3 x s32>)
+    %src:_(<3 x s32>) = COPY $vgpr0_vgpr1_vgpr2
+    %truncsrc:_(<3 x s16>) = G_TRUNC %src
+    %idx:_(s32) = G_CONSTANT i32 2
+    %elt:_(s16) = G_CONSTANT i16 42
+    %ins:_(<3 x s16>) = G_INSERT_VECTOR_ELT %truncsrc, %elt, %idx
+    %zextins:_(<3 x s32>) = G_ZEXT %ins
+    $vgpr0_vgpr1_vgpr2 = COPY %zextins
+...
+
+# Element type is s32, not s16, so the combine must not fire on either
+# target.
+---
+name: test_v2s32_idx1_nofold
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; CHECK-LABEL: name: test_v2s32_idx1_nofold
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: %src:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; CHECK-NEXT: %idx:_(s32) = G_CONSTANT i32 1
+    ; CHECK-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+    ; CHECK-NEXT: %ins:_(<2 x s32>) = G_INSERT_VECTOR_ELT %src, %elt(s16), %idx(s32)
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %ins(<2 x s32>)
+    %src:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %idx:_(s32) = G_CONSTANT i32 1
+    %elt:_(s16) = G_CONSTANT i16 42
+    %ins:_(<2 x s32>) = G_INSERT_VECTOR_ELT %src, %elt, %idx
+    $vgpr0_vgpr1 = COPY %ins
+...
+
+# Insert into the last lane of <4 x s16>: gfx900 folds to a shuffle taking
+# lane 3 from the inserted vector (mask index 4 == NumElts); fiji keeps the
+# insert.
+---
+name: test_v4s16_idx3
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-LABEL: name: test_v4s16_idx3
+    ; GFX9PLUS: liveins: $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: {{ $}}
+    ; GFX9PLUS-NEXT: %src:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; GFX9PLUS-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+    ; GFX9PLUS-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+    ; GFX9PLUS-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR %elt(s16), [[DEF]](s16), [[DEF]](s16), [[DEF]](s16)
+    ; GFX9PLUS-NEXT: %ins:_(<4 x s16>) = G_SHUFFLE_VECTOR %src(<4 x s16>), [[BUILD_VECTOR]], shufflemask(0, 1, 2, 4)
+    ; GFX9PLUS-NEXT: $vgpr0_vgpr1 = COPY %ins(<4 x s16>)
+    ; VI-LABEL: name: test_v4s16_idx3
+    ; VI: liveins: $vgpr0_vgpr1
+    ; VI-NEXT: {{ $}}
+    ; VI-NEXT: %src:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    ; VI-NEXT: %idx:_(s32) = G_CONSTANT i32 3
+    ; VI-NEXT: %elt:_(s16) = G_CONSTANT i16 42
+    ; VI-NEXT: %ins:_(<4 x s16>) = G_INSERT_VECTOR_ELT %src, %elt(s16), %idx(s32)
+    ; VI-NEXT: $vgpr0_vgpr1 = COPY %ins(<4 x s16>)
+    %src:_(<4 x s16>) = COPY $vgpr0_vgpr1
+    %idx:_(s32) = G_CONSTANT i32 3
+    %elt:_(s16) = G_CONSTANT i16 42
+    %ins:_(<4 x s16>) = G_INSERT_VECTOR_ELT %src, %elt, %idx
+    $vgpr0_vgpr1 = COPY %ins
+...