Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -931,6 +931,33 @@
   return false;
 }
 
+static bool isConstantSplatVectorMaskForType(SDNode *N, EVT ScalarTy) {
+  if (!ScalarTy.isSimple())
+    return false;
+
+  uint64_t MaskForTy = 0ULL;
+  switch (ScalarTy.getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    MaskForTy = 0xFFULL;
+    break;
+  case MVT::i16:
+    MaskForTy = 0xFFFFULL;
+    break;
+  case MVT::i32:
+    MaskForTy = 0xFFFFFFFFULL;
+    break;
+  default:
+    return false;
+    break;
+  }
+
+  APInt Val;
+  if (ISD::isConstantSplatVector(N, Val))
+    return Val.getLimitedValue() == MaskForTy;
+
+  return false;
+}
+
 // Returns the SDNode if it is a constant float BuildVector
 // or constant float.
 static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
@@ -5647,6 +5674,28 @@
     }
   }
 
+  // fold (and (masked_gather x)) -> (zext_masked_gather x)
+  if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
+    EVT MemVT = GN0->getMemoryVT();
+    EVT ScalarVT = MemVT.getScalarType();
+
+    if (SDValue(GN0, 0).hasOneUse() &&
+        isConstantSplatVectorMaskForType(N1.getNode(), ScalarVT) &&
+        TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+      SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
+                       GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
+
+      SDValue ZExtLoad = DAG.getMaskedGather(
+          DAG.getVTList(VT, MVT::Other), MemVT, SDLoc(N), Ops,
+          GN0->getMemOperand(), GN0->getIndexType(), ISD::ZEXTLOAD);
+
+      CombineTo(N, ZExtLoad);
+      AddToWorklist(ZExtLoad.getNode());
+      // Avoid recheck of N.
+      return SDValue(N, 0);
+    }
+  }
+
   // fold (and (load x), 255) -> (zextload x, i8)
   // fold (and (extload x, i16), 255) -> (zextload x, i8)
   // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
@@ -11504,6 +11553,25 @@
     }
   }
 
+  // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x)
+  if (auto *GN0 = dyn_cast<MaskedGatherSDNode>(N0)) {
+    if (SDValue(GN0, 0).hasOneUse() &&
+        ExtVT == GN0->getMemoryVT() &&
+        TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) {
+      SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(),
+                       GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()};
+
+      SDValue ExtLoad = DAG.getMaskedGather(
+          DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops,
+          GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD);
+
+      CombineTo(N, ExtLoad);
+      CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+      AddToWorklist(ExtLoad.getNode());
+      return SDValue(N, 0); // Return N so it doesn't get rechecked!
+    }
+  }
+
   // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
   if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
     if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3826,6 +3826,26 @@
   return AddrModes.find(Key)->second;
 }
 
+unsigned getExtendedGatherOpcode(unsigned Opcode) {
+  switch (Opcode) {
+  default:
+    llvm_unreachable("unimplemented opcode");
+    return Opcode;
+  case AArch64ISD::GLD1_MERGE_ZERO:
+    return AArch64ISD::GLD1S_MERGE_ZERO;
+  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
+  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
+  case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
+  case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
+  case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+    return AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
+  }
+}
+
 bool getGatherScatterIndexIsExtended(SDValue Index) {
   unsigned Opcode = Index.getOpcode();
   if (Opcode == ISD::SIGN_EXTEND_INREG)
@@ -3855,6 +3875,7 @@
   SDValue PassThru = MGT->getPassThru();
   SDValue Mask = MGT->getMask();
   SDValue BasePtr = MGT->getBasePtr();
+  ISD::LoadExtType ExtTy = MGT->getExtensionType();
 
   ISD::MemIndexType IndexType = MGT->getIndexType();
   bool IsScaled =
@@ -3864,6 +3885,7 @@
   bool IdxNeedsExtend =
       getGatherScatterIndexIsExtended(Index) ||
      Index.getSimpleValueType().getVectorElementType() == MVT::i32;
+  bool ResNeedsExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
 
   EVT VT = PassThru.getSimpleValueType();
   EVT MemVT = MGT->getMemoryVT();
@@ -3890,9 +3912,12 @@
   if (getGatherScatterIndexIsExtended(Index))
     Index = Index.getOperand(0);
 
+  unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
+  if (ResNeedsExtend)
+    Opcode = getExtendedGatherOpcode(Opcode);
+
   SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
-  return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend), DL,
-                     VTs, Ops);
+  return DAG.getNode(Opcode, DL, VTs, Ops);
 }
 
 SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef)
@@ -21,7 +21,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i32, i32* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef)
@@ -72,9 +71,7 @@
 define @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw #1]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef)
@@ -85,9 +82,7 @@
 define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw #2]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i32, i32* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef)
@@ -103,7 +98,6 @@
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, %offsets
   %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef)
@@ -144,9 +138,7 @@
 define @masked_sgather_nxv4i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw #1]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, %offsets
   %vals = call @llvm.masked.gather.nxv4i16( %ptrs, i32 2, %mask, undef)
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef)
@@ -21,7 +21,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -34,7 +33,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -90,9 +88,7 @@
 define @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef)
@@ -103,9 +99,7 @@
 define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -117,9 +111,7 @@
 define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -136,7 +128,6 @@
 ; CHECK-LABEL: masked_gather_nxv4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, %offsets
   %vals = call @llvm.masked.gather.nxv4i8( %ptrs, i32 1, %mask, undef)
@@ -148,7 +139,6 @@
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -193,9 +183,7 @@
 define @masked_sgather_nxv4i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, %offsets
   %vals = call @llvm.masked.gather.nxv4i8( %ptrs, i32 1, %mask, undef)
@@ -206,9 +194,7 @@
 define @masked_sgather_nxv4i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i16, i16* %base, %offsets.zext
@@ -22,7 +22,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i32, i32* %base, %offsets.zext
@@ -78,9 +77,7 @@
 define @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw #1]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i16, i16* %base, %offsets.zext
@@ -92,9 +89,7 @@
 define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw #2]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i32, i32* %base, %offsets.zext
@@ -111,7 +106,6 @@
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i16, i16* %base, %offsets.zext
@@ -156,9 +150,7 @@
 define @masked_sgather_nxv4i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw #1]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i16, i16* %base, %offsets.zext
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ; unscaled unpacked 32-bit offsets
@@ -9,7 +10,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -22,7 +22,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -36,7 +35,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -97,9 +95,7 @@
 define @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -111,9 +107,7 @@
 define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -126,9 +120,7 @@
 define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -146,7 +138,6 @@
 ; CHECK-LABEL: masked_gather_nxv4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -159,7 +150,6 @@
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    and z0.s, z0.s, #0xffff
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -208,9 +198,7 @@
 define @masked_sgather_nxv4i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sb { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -222,9 +210,7 @@
 define @masked_sgather_nxv4i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
-; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    ld1sh { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT:    ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-64b-scaled.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 define @masked_gather_nxv2i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef)
@@ -17,7 +17,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i32, i32* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef)
@@ -68,9 +67,7 @@
 define @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d, lsl #1]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d, lsl #1]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i16, i16* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef)
@@ -81,9 +78,7 @@
 define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d, lsl #2]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d, lsl #2]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i32, i32* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef)
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
 
 define @masked_gather_nxv2i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    and z0.d, z0.d, #0xff
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef)
@@ -17,7 +17,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffff
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -30,7 +29,6 @@
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -86,9 +84,7 @@
 define @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT:    ret
   %ptrs = getelementptr i8, i8* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef)
@@ -99,9 +95,7 @@
 define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -113,9 +107,7 @@
 define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d]
-; CHECK-NEXT:    ptrue p0.d
-; CHECK-NEXT:    sxtw z0.d, p0/m, z0.d
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT:    ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -1,5 +1,27 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve -aarch64-enable-mgather-combine=1 < %s | FileCheck %s
+
+; Test for multiple uses of the mgather where the sext should not be combined
+
+define @masked_sgather_nxv2i8(i8* %base, %offsets, %mask, %vals) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtb z2.d, p0/m, z0.d
+; CHECK-NEXT:    add z0.d, z0.d, z1.d
+; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    mul z0.d, p0/m, z0.d, z2.d
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, i8* %base, %offsets
+  %data = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef)
+  %data.sext = sext %data to
+  %add = add %data, %vals
+  %add.sext = sext %add to
+  %mul = mul %data.sext, %add.sext
+  ret %mul
+}
 
 ; Tests that exercise various type legalisation scenarios for ISD::MGATHER.
 
@@ -7,7 +29,7 @@
 define @masked_gather_nxv2i32( %ptrs, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK-DAG: mov x8, xzr
-; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-DAG: ld1sw { z0.d }, p0/z, [x8, z0.d]
 ; CHECK: ret
   %data = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef)
   ret %data
@@ -41,8 +63,8 @@
 ; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
 ; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT:    ld1b { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ld1sb { z1.d }, p2/z, [x8, z1.d]
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x8, z0.d]
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
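Editor's note, not part of the patch: the sketch below illustrates the zero-extend pattern the new DAGCombiner fold targets, written in the style of the tests above but with the scalable-vector types spelled out in full (the intrinsic naming follows the test files; the function name @zext_gather_example is illustrative only). Before this patch, the zext of the gathered i8 values survived as an `and` of the widened gather result with a splat of 0xff, so codegen emitted ld1b followed by `and z0.d, z0.d, #0xff`; with the fold, the mask is absorbed into a zero-extending gather and a single ld1b is expected (mirroring the sve-masked-gather-64b-unscaled.ll changes above).

; Illustrative sketch only: i8 gather zero-extended to i64.
define <vscale x 2 x i64> @zext_gather_example(i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %mask) {
  %ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
  ; The gather loads i8 elements; the zext below is what becomes
  ; (and (masked_gather ...), splat(0xff)) once the result is widened.
  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
  ret <vscale x 2 x i64> %vals.zext
}

declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)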