Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9428,13 +9428,13 @@
 }
 
 // Fold sext/zext of index into index type.
-bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,
-                     SelectionDAG &DAG) {
+bool refineIndexType(MaskedGatherScatterSDNode *MGS, SDValue &Index,
+                     bool Scaled, SelectionDAG &DAG) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   SDValue Op = Index.getOperand(0);
 
   if (Index.getOpcode() == ISD::ZERO_EXTEND) {
-    MSC->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
+    MGS->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
     if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
       Index = Op;
       return true;
@@ -9442,7 +9442,7 @@
   }
 
   if (Index.getOpcode() == ISD::SIGN_EXTEND) {
-    MSC->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
+    MGS->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
     if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
       Index = Op;
       return true;
@@ -9511,11 +9511,30 @@
 SDValue DAGCombiner::visitMGATHER(SDNode *N) {
   MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(N);
   SDValue Mask = MGT->getMask();
+  SDValue Chain = MGT->getChain();
+  SDValue Index = MGT->getIndex();
+  SDValue Scale = MGT->getScale();
+  SDValue PassThru = MGT->getPassThru();
+  SDValue BasePtr = MGT->getBasePtr();
   SDLoc DL(N);
 
   // Zap gathers with a zero mask.
   if (ISD::isBuildVectorAllZeros(Mask.getNode()))
-    return CombineTo(N, MGT->getPassThru(), MGT->getChain());
+    return CombineTo(N, PassThru, MGT->getChain());
+
+  if (refineUniformBase(BasePtr, Index, DAG)) {
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+                               PassThru.getValueType(), DL, Ops,
+                               MGT->getMemOperand(), MGT->getIndexType());
+  }
+
+  if (refineIndexType(MGT, Index, MGT->isIndexScaled(), DAG)) {
+    SDValue Ops[] = {Chain, PassThru, Mask, BasePtr, Index, Scale};
+    return DAG.getMaskedGather(DAG.getVTList(N->getValueType(0), MVT::Other),
+                               PassThru.getValueType(), DL, Ops,
+                               MGT->getMemOperand(), MGT->getIndexType());
+  }
 
   return SDValue();
 }
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4416,7 +4416,7 @@
   if (!UniformBase) {
     Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
     Index = getValue(Ptr);
-    IndexType = ISD::SIGNED_SCALED;
+    IndexType = ISD::SIGNED_UNSCALED;
     Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
   }
   SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };
Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
===================================================================
--- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
+++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-signed-unscaled.ll
@@ -20,12 +20,7 @@
 define @masked_gather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffff
 ; CHECK-NEXT: ret
 %byte_ptrs = getelementptr i8, i8* %base, %offsets
@@ -38,12 +33,7 @@
 define @masked_gather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff
 ; CHECK-NEXT: ret
 %byte_ptrs = getelementptr i8, i8* %base, %offsets
@@ -56,12 +46,7 @@
 define @masked_gather_nxv2i64(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
 %byte_ptrs = getelementptr i8, i8* %base, %offsets
 %ptrs = bitcast %byte_ptrs to
@@ -72,12 +57,7 @@
 define @masked_gather_nxv2f16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1]
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
 %byte_ptrs = getelementptr i8, i8* %base, %offsets
 %ptrs = bitcast %byte_ptrs to
@@ -88,12 +68,7 @@
 define @masked_gather_nxv2f32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
 %byte_ptrs = getelementptr i8, i8* %base, %offsets
 %ptrs = bitcast %byte_ptrs to
@@ -104,12 +79,7 @@
 define @masked_gather_nxv2f64(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_gather_nxv2f64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3]
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw]
 ; CHECK-NEXT: ret
 %byte_ptrs = getelementptr i8, i8* %base, %offsets
 %ptrs = bitcast %byte_ptrs to
@@ -133,13 +103,9 @@
 define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1]
-; CHECK-NEXT: sxth z0.d, p1/m, z0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: sxth z0.d, p0/m, z0.d
 ; CHECK-NEXT: ret
 %byte_ptrs = getelementptr i8, i8* %base, %offsets
 %ptrs = bitcast %byte_ptrs to
@@ -151,13 +117,9 @@
 define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) {
 ; CHECK-LABEL: masked_sgather_nxv2i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: sxtw z0.d, p1/m, z0.d
-; CHECK-NEXT: mov z1.d, x0
-; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: add z0.d, z1.d, z0.d
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d,
lsl #2] -; CHECK-NEXT: sxtw z0.d, p1/m, z0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -185,18 +147,7 @@ define @masked_gather_nxv4i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: sunpkhi z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z2.d, z1.d, z2.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d, lsl #1] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d, lsl #1] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets @@ -209,18 +160,7 @@ define @masked_gather_nxv4i32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: sunpkhi z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z2.d, z1.d, z2.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d, lsl #2] -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d, lsl #2] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -231,18 +171,7 @@ define @masked_gather_nxv4f16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: sunpkhi z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z2.d, z1.d, z2.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d, lsl #1] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d, lsl #1] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -253,18 +182,7 @@ define @masked_gather_nxv4f32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: sunpkhi z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z2.d, z1.d, z2.d -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z0.d }, p2/z, [x8, z0.d, lsl #2] -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x8, z2.d, lsl #2] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -288,19 +206,8 @@ define @masked_sgather_nxv4i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: sunpklo z2.d, z0.s -; CHECK-NEXT: sunpkhi z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z2.d, z1.d, z2.d -; CHECK-NEXT: add z0.d, 
z1.d, z0.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x8, z0.d, lsl #1] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x8, z2.d, lsl #1] +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: sxth z0.s, p0/m, z0.s ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll +++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-scaled.ll @@ -8,7 +8,7 @@ define @masked_gather_nxv2i16(i16* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] ; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to @@ -21,7 +21,7 @@ define @masked_gather_nxv2i32(i32* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to @@ -34,7 +34,7 @@ define @masked_gather_nxv2i64(i64* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i64, i64* %base, %offsets.zext @@ -45,7 +45,7 @@ define @masked_gather_nxv2f16(half* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr half, half* %base, %offsets.zext @@ -56,7 +56,7 @@ define @masked_gather_nxv2f32(float* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr float, float* %base, %offsets.zext @@ -67,7 +67,7 @@ define @masked_gather_nxv2f64(double* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, sxtw #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw #3] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr double, double* %base, %offsets.zext @@ -78,7 +78,7 @@ define @masked_sgather_nxv2i16(i16* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, sxtw #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw #1] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxth z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -92,7 +92,7 @@ define @masked_sgather_nxv2i32(i32* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, sxtw #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw #2] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -110,14 +110,7 @@ define @masked_gather_nxv4i16(i16* %base, %offsets, %mask) { ; 
CHECK-LABEL: masked_gather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1] ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to @@ -130,14 +123,7 @@ define @masked_gather_nxv4i32(i32* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2] -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr i32, i32* %base, %offsets.zext @@ -148,14 +134,7 @@ define @masked_gather_nxv4f16(half* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr half, half* %base, %offsets.zext @@ -166,14 +145,7 @@ define @masked_gather_nxv4f32(float* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z0.d }, p2/z, [x0, z0.d, sxtw #2] -; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, z1.d, sxtw #2] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw #2] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %ptrs = getelementptr float, float* %base, %offsets.zext @@ -184,15 +156,8 @@ define @masked_sgather_nxv4i16(i16* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z0.d }, p2/z, [x0, z0.d, sxtw #1] -; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, z1.d, sxtw #1] +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw #1] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: sxth z0.s, p0/m, z0.s ; CHECK-NEXT: ret %offsets.zext = zext %offsets to Index: llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll +++ llvm/test/CodeGen/AArch64/sve-masked-gather-32b-unsigned-unscaled.ll @@ -8,7 +8,7 @@ define @masked_gather_nxv2i8(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: and 
z0.d, z0.d, #0xff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to @@ -21,11 +21,7 @@ define @masked_gather_nxv2i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to @@ -39,11 +35,7 @@ define @masked_gather_nxv2i32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to @@ -57,11 +49,7 @@ define @masked_gather_nxv2i64(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -73,11 +61,7 @@ define @masked_gather_nxv2f16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -89,11 +73,7 @@ define @masked_gather_nxv2f32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -105,11 +85,7 @@ define @masked_gather_nxv2f64(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -121,7 +97,7 @@ define @masked_sgather_nxv2i8(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, sxtw] +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtb z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -135,11 +111,7 @@ define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; 
CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxth z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -154,11 +126,7 @@ define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: and z0.d, z0.d, #0xffffffff -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d, uxtw] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -177,14 +145,7 @@ define @masked_gather_nxv4i8(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw] -; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw] -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: and z0.s, z0.s, #0xff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to @@ -197,18 +158,7 @@ define @masked_gather_nxv4i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z1.d, z1.d, z2.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d, lsl #1] -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1] -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: and z0.s, z0.s, #0xffff ; CHECK-NEXT: ret %offsets.zext = zext %offsets to @@ -222,18 +172,7 @@ define @masked_gather_nxv4i32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z1.d, z1.d, z2.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d, lsl #2] -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2] -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -245,18 +184,7 @@ define @masked_gather_nxv4f16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv4f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z1.d, z1.d, z2.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d, lsl #1] -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1] -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -268,18 +196,7 @@ define @masked_gather_nxv4f32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: 
masked_gather_nxv4f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z1.d, z1.d, z2.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1w { z1.d }, p2/z, [x8, z1.d, lsl #2] -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2] -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ret %offsets.zext = zext %offsets to %byte_ptrs = getelementptr i8, i8* %base, %offsets.zext @@ -291,15 +208,8 @@ define @masked_sgather_nxv4i8(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv4i8: ; CHECK: // %bb.0: -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: uunpklo z1.d, z0.s -; CHECK-NEXT: uunpkhi z0.d, z0.s -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1b { z0.d }, p2/z, [x0, z0.d, sxtw] -; CHECK-NEXT: ld1b { z1.d }, p0/z, [x0, z1.d, sxtw] +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s ; CHECK-NEXT: sxtb z0.s, p0/m, z0.s ; CHECK-NEXT: ret %offsets.zext = zext %offsets to @@ -312,19 +222,8 @@ define @masked_sgather_nxv4i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv4i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: uunpkhi z2.d, z0.s -; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: pfalse p1.b -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: add z1.d, z1.d, z2.d -; CHECK-NEXT: zip2 p2.s, p0.s, p1.s -; CHECK-NEXT: zip1 p0.s, p0.s, p1.s -; CHECK-NEXT: ld1h { z1.d }, p2/z, [x8, z1.d, lsl #1] -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1] +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, uxtw] ; CHECK-NEXT: ptrue p0.s -; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s ; CHECK-NEXT: sxth z0.s, p0/m, z0.s ; CHECK-NEXT: ret %offsets.zext = zext %offsets to Index: llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll +++ llvm/test/CodeGen/AArch64/sve-masked-gather-64b-unscaled.ll @@ -16,10 +16,7 @@ define @masked_gather_nxv2i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: and z0.d, z0.d, #0xffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets @@ -32,10 +29,7 @@ define @masked_gather_nxv2i32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: and z0.d, z0.d, #0xffffffff ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets @@ -48,10 +42,7 @@ define @masked_gather_nxv2i64(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2i64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = 
bitcast %byte_ptrs to @@ -62,10 +53,7 @@ define @masked_gather_nxv2f16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2f16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -76,10 +64,7 @@ define @masked_gather_nxv2f32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -90,10 +75,7 @@ define @masked_gather_nxv2f64(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_gather_nxv2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x8, z0.d, lsl #3] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ret %byte_ptrs = getelementptr i8, i8* %base, %offsets %ptrs = bitcast %byte_ptrs to @@ -117,10 +99,7 @@ define @masked_sgather_nxv2i16(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i16: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1h { z0.d }, p0/z, [x8, z0.d, lsl #1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxth z0.d, p0/m, z0.d ; CHECK-NEXT: ret @@ -134,10 +113,7 @@ define @masked_sgather_nxv2i32(i8* %base, %offsets, %mask) { ; CHECK-LABEL: masked_sgather_nxv2i32: ; CHECK: // %bb.0: -; CHECK-NEXT: mov z1.d, x0 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0, z0.d] ; CHECK-NEXT: ptrue p0.d ; CHECK-NEXT: sxtw z0.d, p0/m, z0.d ; CHECK-NEXT: ret Index: llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll =================================================================== --- llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll +++ llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll @@ -7,7 +7,7 @@ define @masked_gather_nxv2i32( %ptrs, %mask) { ; CHECK-LABEL: masked_gather_nxv2i32: ; CHECK-DAG: mov x8, xzr -; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d, lsl #2] +; CHECK-DAG: ld1w { z0.d }, p0/z, [x8, z0.d] ; CHECK: ret %data = call @llvm.masked.gather.nxv2i32( %ptrs, i32 4, %mask, undef) ret %data Index: llvm/test/CodeGen/X86/masked_gather_scatter.ll =================================================================== --- llvm/test/CodeGen/X86/masked_gather_scatter.ll +++ llvm/test/CodeGen/X86/masked_gather_scatter.ll @@ -765,45 +765,41 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) { ; KNL_64-LABEL: test14: ; KNL_64: # %bb.0: -; KNL_64-NEXT: vpbroadcastq %xmm0, %zmm0 -; KNL_64-NEXT: vmovd %esi, %xmm1 -; KNL_64-NEXT: vpbroadcastd %xmm1, %ymm1 -; KNL_64-NEXT: vpmovsxdq %ymm1, %zmm1 -; KNL_64-NEXT: vpsllq $2, %zmm1, %zmm1 -; KNL_64-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; KNL_64-NEXT: vmovq %xmm0, %rax +; KNL_64-NEXT: vmovd %esi, %xmm0 +; KNL_64-NEXT: vpbroadcastd %xmm0, %ymm0 +; KNL_64-NEXT: vpmovsxdq %ymm0, %zmm0 +; KNL_64-NEXT: vpsllq $2, %zmm0, %zmm0 ; KNL_64-NEXT: kxnorw %k0, %k0, %k1 -; KNL_64-NEXT: vgatherqps 
(,%zmm0), %ymm1 {%k1}
+; KNL_64-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
 ; KNL_64-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
 ; KNL_64-NEXT: retq
 ;
 ; KNL_32-LABEL: test14:
 ; KNL_32: # %bb.0:
-; KNL_32-NEXT: vpbroadcastd %xmm0, %zmm0
+; KNL_32-NEXT: vmovd %xmm0, %eax
 ; KNL_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
-; KNL_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
 ; KNL_32-NEXT: kxnorw %k0, %k0, %k1
-; KNL_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
+; KNL_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
 ; KNL_32-NEXT: retl
 ;
 ; SKX-LABEL: test14:
 ; SKX: # %bb.0:
-; SKX-NEXT: vpbroadcastq %xmm0, %zmm0
-; SKX-NEXT: vpbroadcastd %esi, %ymm1
-; SKX-NEXT: vpmovsxdq %ymm1, %zmm1
-; SKX-NEXT: vpsllq $2, %zmm1, %zmm1
-; SKX-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; SKX-NEXT: vmovq %xmm0, %rax
+; SKX-NEXT: vpbroadcastd %esi, %ymm0
+; SKX-NEXT: vpmovsxdq %ymm0, %zmm0
+; SKX-NEXT: vpsllq $2, %zmm0, %zmm0
 ; SKX-NEXT: kxnorw %k0, %k0, %k1
-; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k1}
+; SKX-NEXT: vgatherqps (%rax,%zmm0), %ymm1 {%k1}
 ; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm1, %zmm0
 ; SKX-NEXT: retq
 ;
 ; SKX_32-LABEL: test14:
 ; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpbroadcastd %xmm0, %zmm0
+; SKX_32-NEXT: vmovd %xmm0, %eax
 ; SKX_32-NEXT: vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
-; SKX_32-NEXT: vpaddd %zmm1, %zmm0, %zmm1
 ; SKX_32-NEXT: kxnorw %k0, %k0, %k1
-; SKX_32-NEXT: vgatherdps (,%zmm1), %zmm0 {%k1}
+; SKX_32-NEXT: vgatherdps (%eax,%zmm1), %zmm0 {%k1}
 ; SKX_32-NEXT: retl
 %broadcast.splatinsert = insertelement <16 x float*> %vec, float* %base, i32 1