Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9428,13 +9428,13 @@
 }
 
 // Fold sext/zext of index into index type.
-bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,
-                     SelectionDAG &DAG) {
+bool refineIndexType(MaskedGatherScatterSDNode *MGS, SDValue &Index,
+                     bool Scaled, SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Op = Index.getOperand(0);
 
   if (Index.getOpcode() == ISD::ZERO_EXTEND) {
-    MSC->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
+    MGS->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
     if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
       Index = Op;
       return true;
@@ -9442,7 +9442,7 @@
   }
 
   if (Index.getOpcode() == ISD::SIGN_EXTEND) {
-    MSC->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
+    MGS->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
     if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
       Index = Op;
       return true;
@@ -9511,11 +9511,30 @@
 SDValue DAGCombiner::visitMGATHER(SDNode *N) {
   MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
   SDValue Mask = MGT->getMask();
+  SDValue Chain = MGT->getChain();
+  SDValue Index = MGT->getIndex();
+  SDValue Scale = MGT->getScale();
+  SDValue PassThru = MGT->getPassThru();
+  SDValue BasePtr = MGT->getBasePtr();
   SDLoc DL(N);
 
   // Zap gathers with a zero mask.
   if (ISD::isBuildVectorAllZeros(Mask.getNode()))
-    return CombineTo(N, MGT->getPassThru(), MGT->getChain());
+    return CombineTo(N, PassThru, MGT->getChain());
+
+  if (refineUniformBase(BasePtr, Index, DAG)) {
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+                               PassThru.getValueType(), DL, Ops,
+                               MGT->getMemOperand(), MGT->getIndexType());
+  }
+
+  if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) {
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+                               PassThru.getValueType(), DL, Ops,
+                               MGT->getMemOperand(), MGT->getIndexType());
+  }
 
   return SDValue();
 }
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3891,6 +3891,9 @@
 
   SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
 
+  if (getGatherScatterIndexIsExtended(Index))
+    Index = Index.getOperand(0);
+
   SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
   return DAG.getNode(getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend),
                      DL, VTs, Ops);
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-scaled.ll
@@ -8,8 +8,6 @@
 define @masked_gather_nxv2i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffff
 ; CHECK-NEXT: ret
@@ -22,8 +20,6 @@
 define @masked_gather_nxv2i32(i32* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT: ret
@@ -36,8 +32,6 @@
 define @masked_gather_nxv2i64(i64* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
 ; CHECK-NEXT: ret
   %ptrs = getelementptr i64, i64* %base, %offsets
@@ -48,8 +42,6 @@
 define @masked_gather_nxv2f16(half* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
 ; CHECK-NEXT: ret
   %ptrs = getelementptr half, half* %base, %offsets
@@ -60,8 +52,6 @@
 define @masked_gather_nxv2f32(float* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
 ; CHECK-NEXT: ret
   %ptrs = getelementptr float, float* %base, %offsets
@@ -72,8 +62,6 @@
 define @masked_gather_nxv2f64(double* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
 ; CHECK-NEXT: ret
   %ptrs = getelementptr double, double* %base, %offsets
@@ -84,10 +72,9 @@
 define @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT: sxth z0.d, p1/m, z0.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxth z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
   %ptrs = getelementptr i16, i16* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i16( %ptrs, i32 2, %mask, undef)
@@ -98,10 +85,9 @@
 define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
   %ptrs = getelementptr i32, i32* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef)
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -8,8 +8,6 @@
 define @masked_gather_nxv2i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: and z0.d, z0.d, #0xff
 ; CHECK-NEXT: ret
@@ -22,12 +20,7 @@
 define @masked_gather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffff
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
@@ -40,12 +33,7 @@
 define @masked_gather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
@@ -58,12 +46,7 @@
 define @masked_gather_nxv2i64(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -74,12 +57,7 @@
 define @masked_gather_nxv2f16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -90,12 +68,7 @@
 define @masked_gather_nxv2f32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -106,12 +79,7 @@
 define @masked_gather_nxv2f64(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -122,10 +90,9 @@
 define @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
 ; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: sxtb z0.d, p1/m, z0.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
   %ptrs = getelementptr i8, i8* %base, %offsets
   %vals = call @llvm.masked.gather.nxv2i8( %ptrs, i32 1, %mask, undef)
@@ -136,13 +103,9 @@
 define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: sxth z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxth z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -154,13 +117,9 @@
 define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -188,18 +147,7 @@
 define @masked_gather_nxv4i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: sunpklo z2.d, z0.s
-; CHECK-NEXT: sunpkhi z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z2.d, z1.d, z2.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: and z0.s, z0.s, #0xffff
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
@@ -212,18 +160,7 @@
 define @masked_gather_nxv4i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: sunpklo z2.d, z0.s
-; CHECK-NEXT: sunpkhi z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z2.d, z1.d, z2.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
-; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -234,18 +171,7 @@
 define @masked_gather_nxv4f16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: sunpklo z2.d, z0.s
-; CHECK-NEXT: sunpkhi z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z2.d, z1.d, z2.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -256,18 +182,7 @@
 define @masked_gather_nxv4f32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: sunpklo z2.d, z0.s
-; CHECK-NEXT: sunpkhi z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z2.d, z1.d, z2.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d]
-; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -291,19 +206,8 @@
 define @masked_sgather_nxv4i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: sunpklo z2.d, z0.s
-; CHECK-NEXT: sunpkhi z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z2.d, z1.d, z2.d
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d]
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
 ; CHECK-NEXT: sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll
@@ -8,8 +8,7 @@
 define @masked_gather_nxv2i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffff
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
@@ -22,8 +21,7 @@
 define @masked_gather_nxv2i32(i32* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
@@ -36,8 +34,7 @@
 define @masked_gather_nxv2i64(i64* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i64, i64* %base, %offsets.zext
@@ -48,8 +45,7 @@
 define @masked_gather_nxv2f16(half* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr half, half* %base, %offsets.zext
@@ -60,8 +56,7 @@
 define @masked_gather_nxv2f32(float* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr float, float* %base, %offsets.zext
@@ -72,8 +67,7 @@
 define @masked_gather_nxv2f64(double* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr double, double* %base, %offsets.zext
@@ -84,8 +78,7 @@
 define @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1]
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sxth z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
@@ -99,8 +92,7 @@
 define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2]
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
@@ -118,14 +110,7 @@
 define @masked_gather_nxv4i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
 ; CHECK-NEXT: and z0.s, z0.s, #0xffff
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
@@ -138,14 +123,7 @@
 define @masked_gather_nxv4i32(i32* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr i32, i32* %base, %offsets.zext
@@ -156,14 +134,7 @@
 define @masked_gather_nxv4f16(half* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr half, half* %base, %offsets.zext
@@ -174,14 +145,7 @@
 define @masked_gather_nxv4f32(float* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2]
-; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %ptrs = getelementptr float, float* %base, %offsets.zext
@@ -192,15 +156,8 @@
 define @masked_sgather_nxv4i16(i16* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1]
-; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1]
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1]
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
 ; CHECK-NEXT: sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll
@@ -8,8 +8,7 @@
 define @masked_gather_nxv2i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: and z0.d, z0.d, #0xff
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
@@ -22,11 +21,7 @@
 define @masked_gather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffff
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
@@ -40,11 +35,7 @@
 define @masked_gather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
@@ -58,11 +49,7 @@
 define @masked_gather_nxv2i64(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -74,11 +61,7 @@
 define @masked_gather_nxv2f16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -90,11 +73,7 @@
 define @masked_gather_nxv2f32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -106,11 +85,7 @@
 define @masked_gather_nxv2f64(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -122,8 +97,7 @@
 define @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sxtb z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
@@ -137,11 +111,7 @@
 define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sxth z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
@@ -156,11 +126,7 @@
 define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw]
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
@@ -179,14 +145,7 @@
 define @masked_gather_nxv4i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw]
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: and z0.s, z0.s, #0xff
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
@@ -199,18 +158,7 @@
 define @masked_gather_nxv4i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: and z0.s, z0.s, #0xffff
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
@@ -224,18 +172,7 @@
 define @masked_gather_nxv4i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -247,18 +184,7 @@
 define @masked_gather_nxv4f16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -270,18 +196,7 @@
 define @masked_gather_nxv4f32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv4f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
   %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext
@@ -293,15 +208,8 @@
 define @masked_sgather_nxv4i8(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: uunpklo z1.d, z0.s
-; CHECK-NEXT: uunpkhi z0.d, z0.s
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw]
-; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw]
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
 ; CHECK-NEXT: sxtb z0.s, p0/m, z0.s
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
@@ -314,19 +222,8 @@
 define @masked_sgather_nxv4i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: uunpkhi z2.d, z0.s
-; CHECK-NEXT: uunpklo z0.d, z0.s
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: add z1.d, z1.d, z2.d
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw]
 ; CHECK-NEXT: ptrue p0.s
-; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT: sxth z0.s, p0/m, z0.s
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll
@@ -16,10 +16,7 @@
 define @masked_gather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffff
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
@@ -32,10 +29,7 @@
 define @masked_gather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
@@ -48,10 +42,7 @@
 define @masked_gather_nxv2i64(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -62,10 +53,7 @@
 define @masked_gather_nxv2f16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -76,10 +64,7 @@
 define @masked_gather_nxv2f32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -90,10 +75,7 @@
 define @masked_gather_nxv2f64(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT: ret
   %byte_ptrs = getelementptr i8, i8* %base, %offsets
   %ptrs = bitcast %byte_ptrs to
@@ -117,10 +99,7 @@
 define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sxth z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
@@ -134,10 +113,7 @@
 define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d]
 ; CHECK-NEXT: ptrue p0.d
 ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
Index: llvm/test/CodeGen/X86/masked_gather_scatter.ll
===================================================================
--- llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -765,45 +765,41 @@
 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 ; KNL_64-LABEL: test14:
 ; KNL_64: # %bb.0:
-; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0
-; KNL_64-NEXT: vmovd %esi, %xmm1
-; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1
-; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1
-; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1
-; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; KNL_64-NEXT: vmovq %xmm0, %rax
+; KNL_64-NEXT: vmovd %esi, %xmm0
+; KNL_64-NEXT: vpbroadcastd %xmm0, %ymm0
+; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0
+; KNL_64-NEXT: vpsllq $2, %zmm0, %zmm0
 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1
-; KNL_64-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test14:
 ; KNL_32: # %bb.0:
-; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
+; KNL_32-NEXT: vmovd %xmm0, %eax
 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
-; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test14:
 ; SKX: # %bb.0:
-; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
-; SKX-NEXT: vpbroadcastd %esi, %ymm1
-; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
-; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
-; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vmovq %xmm0, %rax
+; SKX-NEXT: vpbroadcastd %esi, %ymm0
+; SKX-NEXT: vpmovsxdq %ymm0, %zmm0
+; SKX-NEXT: vpsllq $2, %zmm0, %zmm0
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; SKX-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
 ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
 ; SKX-NEXT: retq
 ;
 ; SKX_32-LABEL: test14:
 ; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
+; SKX_32-NEXT: vmovd %xmm0, %eax
 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
-; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
+; SKX_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
 ; SKX_32-NEXT: retl
   %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1
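
The new visitMGATHER hunks fold a splatted base pointer into a scalar base (refineUniformBase) and a sign/zero-extend of a 32-bit offset vector into the gather's index type (refineIndexType), which is what lets the AArch64 tests above select a single sxtw/uxtw-addressed load. As a minimal sketch only, not part of the patch: the IR below shows the input pattern that now folds, written in the same style as the tests; the function name is invented for illustration, and the expected instruction is taken from the masked_gather_nxv2i64 test in sve-masked-gather-32b-signed-scaled.ll.

declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)

define <vscale x 2 x i64> @example_gather_sext_offsets(i64* %base, <vscale x 2 x i32> %offsets, <vscale x 2 x i1> %mask) {
  ; %ptrs = splatted %base plus sign-extended 32-bit offsets; with this change
  ; the sext is folded into the gather's SIGNED_SCALED index type instead of
  ; being widened to 64-bit offsets in vector registers.
  %ptrs = getelementptr i64, i64* %base, <vscale x 2 x i32> %offsets
  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
  ; expected codegen per the updated tests: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3]
  ret <vscale x 2 x i64> %vals
}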