Index: llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1151,7 +1151,7 @@
   // Construct MMOs for the accesses.
   auto *LoadMMO = MF.getMachineMemOperand(&SrcMMO, CurrOffset,
                                           CopyTy.getSizeInBytes());
-  auto *StoreMMO = 
+  auto *StoreMMO =
       MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
 
   // Create the load.
@@ -1405,7 +1405,8 @@
 bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI,
                                                 unsigned TargetShiftSize,
                                                 unsigned &ShiftVal) {
-  assert(MI.getOpcode() == TargetOpcode::G_LSHR && "Expected a shift");
+  assert((MI.getOpcode() == TargetOpcode::G_SHL ||
+          MI.getOpcode() == TargetOpcode::G_LSHR) && "Expected a shift");
 
   LLT Ty = MRI.getType(MI.getOperand(0).getReg());
   if (Ty.isVector()) // TODO:
@@ -1425,32 +1426,55 @@
   return ShiftVal >= Size / 2 && ShiftVal < Size;
 }
 
-// dst = G_LSHR s64:x, C for C >= 32
-// =>
-//   lo, hi = G_UNMERGE_VALUES x
-//   dst = merge_values (G_LSHR hi, C - 32), 0
 bool CombinerHelper::applyCombineShiftToUnmerge(MachineInstr &MI,
                                                 const unsigned &ShiftVal) {
   Register DstReg = MI.getOperand(0).getReg();
   Register SrcReg = MI.getOperand(1).getReg();
   LLT Ty = MRI.getType(SrcReg);
   unsigned Size = Ty.getSizeInBits();
+  unsigned HalfSize = Size / 2;
 
-  assert(ShiftVal >= Size / 2);
-  LLT HalfTy = LLT::scalar(Size / 2);
+  assert(ShiftVal >= HalfSize);
+  LLT HalfTy = LLT::scalar(HalfSize);
 
   Builder.setInstr(MI);
   auto Unmerge = Builder.buildUnmerge(HalfTy, SrcReg);
+  unsigned NarrowShiftAmt = ShiftVal - HalfSize;
+
+  if (MI.getOpcode() == TargetOpcode::G_LSHR) {
+    Register Narrowed = Unmerge.getReg(1);
+
+    //  dst = G_LSHR s64:x, C for C >= 32
+    // =>
+    //   lo, hi = G_UNMERGE_VALUES x
+    //   dst = G_MERGE_VALUES (G_LSHR hi, C - 32), 0
+
+    if (NarrowShiftAmt != 0) {
+      Narrowed = Builder.buildLShr(HalfTy, Narrowed,
+          Builder.buildConstant(HalfTy, NarrowShiftAmt)).getReg(0);
+    }
+
+    auto Zero = Builder.buildConstant(HalfTy, 0);
+    Builder.buildMerge(DstReg, { Narrowed, Zero });
+  } else {
+    Register Narrowed = Unmerge.getReg(0);
+
+    //  dst = G_SHL s64:x, C for C >= 32
+    // =>
+    //   lo, hi = G_UNMERGE_VALUES x
+    //   dst = G_MERGE_VALUES 0, (G_SHL lo, C - 32)
+
+    // TODO: ashr
+    assert(MI.getOpcode() == TargetOpcode::G_SHL);
+
+    if (NarrowShiftAmt != 0) {
+      Narrowed = Builder.buildShl(HalfTy, Narrowed,
+          Builder.buildConstant(HalfTy, NarrowShiftAmt)).getReg(0);
+    }
 
-  Register Narrowed = Unmerge.getReg(1);
-  if (ShiftVal > Size / 2) {
-    Narrowed = Builder.buildLShr(
-        HalfTy, Unmerge.getReg(1),
-        Builder.buildConstant(HalfTy, ShiftVal - Size / 2)).getReg(0);
+    auto Zero = Builder.buildConstant(HalfTy, 0);
+    Builder.buildMerge(DstReg, { Zero, Narrowed });
   }
 
-  Builder.buildMerge(
-      DstReg, { Narrowed, Builder.buildConstant(HalfTy, 0).getReg(0) });
   MI.eraseFromParent();
   return true;
 }
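As a sanity check on the rewrite above, the identity it relies on can be
verified in plain C++ for the s64 case. This is a standalone sketch, not part
of the patch; Lo and Hi stand in for the little-endian G_UNMERGE_VALUES
results, and Narrow mirrors NarrowShiftAmt:

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t X = 0x123456789abcdef0ull;
    for (unsigned C = 32; C < 64; ++C) {
      const uint32_t Lo = static_cast<uint32_t>(X);       // Unmerge.getReg(0)
      const uint32_t Hi = static_cast<uint32_t>(X >> 32); // Unmerge.getReg(1)
      const unsigned Narrow = C - 32;                     // NarrowShiftAmt
      // G_SHL:  dst = G_MERGE_VALUES 0, (G_SHL lo, C - 32)
      assert((X << C) == (static_cast<uint64_t>(Lo << Narrow) << 32));
      // G_LSHR: dst = G_MERGE_VALUES (G_LSHR hi, C - 32), 0
      assert((X >> C) == static_cast<uint64_t>(Hi >> Narrow));
    }
    return 0;
  }
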
Index: llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -165,6 +165,7 @@
     return true;
 
   switch (MI.getOpcode()) {
+  case TargetOpcode::G_SHL:
   case TargetOpcode::G_LSHR:
     // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
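The tests below probe the boundaries of the match. Here is a standalone
sketch of the final amount check from matchCombineShiftToUnmerge; the s32
tests are expected to stay unchanged because s32 is not wider than the 32-bit
target shift size (a check outside the hunk shown above), and vectors are an
explicit TODO:

  #include <cstdio>

  // Same predicate as the last check in matchCombineShiftToUnmerge: the
  // constant amount must clear the low half entirely, yet still be a
  // defined shift for the full width.
  static bool narrowsShift(unsigned Size, unsigned ShiftVal) {
    return ShiftVal >= Size / 2 && ShiftVal < Size;
  }

  int main() {
    // Expected for s64: 31 kept, 32/33/63 narrowed, 64/65 kept.
    const unsigned Amounts[] = {31, 32, 33, 63, 64, 65};
    for (unsigned C : Amounts)
      std::printf("s64 shift by %u: %s\n", C,
                  narrowsShift(64, C) ? "narrowed" : "kept");
    return 0;
  }
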
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir
@@ -0,0 +1,203 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+
+---
+name: narrow_shl_s64_32_s64amt
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_shl_s64_32_s64amt
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s64) = G_CONSTANT i64 32
+    %2:_(s64) = G_SHL %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_shl_s64_32
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_shl_s64_32
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C]](s32), [[UV]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 32
+    %2:_(s64) = G_SHL %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_shl_s64_33
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_shl_s64_33
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C1]](s32), [[SHL]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 33
+    %2:_(s64) = G_SHL %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_shl_s64_31
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_shl_s64_31
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+    ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[SHL]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 31
+    %2:_(s64) = G_SHL %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_shl_s64_63
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_shl_s64_63
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 31
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[UV]], [[C]](s32)
+    ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+    ; CHECK: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[C1]](s32), [[SHL]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 63
+    %2:_(s64) = G_SHL %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_shl_s64_64
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_shl_s64_64
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 64
+    ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[SHL]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 64
+    %2:_(s64) = G_SHL %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_shl_s64_65
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_shl_s64_65
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65
+    ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[SHL]](s64)
+    %0:_(s64) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 65
+    %2:_(s64) = G_SHL %0, %1
+    $vgpr0_vgpr1 = COPY %2
+...
+
+---
+name: narrow_shl_s32_16
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: narrow_shl_s32_16
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK: $vgpr0 = COPY [[SHL]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 16
+    %2:_(s32) = G_SHL %0, %1
+    $vgpr0 = COPY %2
+...
+
+---
+name: narrow_shl_s32_17
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0
+
+    ; CHECK-LABEL: name: narrow_shl_s32_17
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
+    ; CHECK: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK: $vgpr0 = COPY [[SHL]](s32)
+    %0:_(s32) = COPY $vgpr0
+    %1:_(s32) = G_CONSTANT i32 17
+    %2:_(s32) = G_SHL %0, %1
+    $vgpr0 = COPY %2
+...
+
+---
+name: narrow_shl_v2s32_17
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1
+
+    ; CHECK-LABEL: name: narrow_shl_v2s32_17
+    ; CHECK: liveins: $vgpr0_vgpr1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 17
+    ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32)
+    ; CHECK: [[SHL:%[0-9]+]]:_(<2 x s32>) = G_SHL [[COPY]], [[BUILD_VECTOR]](<2 x s32>)
+    ; CHECK: $vgpr0_vgpr1 = COPY [[SHL]](<2 x s32>)
+    %0:_(<2 x s32>) = COPY $vgpr0_vgpr1
+    %1:_(s32) = G_CONSTANT i32 17
+    %2:_(<2 x s32>) = G_BUILD_VECTOR %1, %1
+    %3:_(<2 x s32>) = G_SHL %0, %2
+    $vgpr0_vgpr1 = COPY %3
+...
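
One detail the shift-by-32 tests pin down: when the amount equals the half
width, NarrowShiftAmt is 0, the narrow shift is skipped entirely, and the
64-bit shift reduces to a merge with zero, effectively a register move. A
standalone check of that special case (again illustrative C++, not part of
the patch):

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t X = 0xdeadbeefcafef00dull;
    // x << 32 == merge(0, lo(x)): the low half simply moves up.
    assert((X << 32) == static_cast<uint64_t>(static_cast<uint32_t>(X)) << 32);
    // x >> 32 == merge(hi(x), 0): the high half simply moves down.
    assert((X >> 32) == static_cast<uint64_t>(static_cast<uint32_t>(X >> 32)));
    return 0;
  }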