Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31560,6 +31560,22 @@
   SDValue Chain = N->getChain();
   SDValue BasePtr = N->getBasePtr();
 
+  // Only scales up to 8 are supported.
+  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
+  if (ScaleVal > 8) {
+    EVT IndexVT = Index.getValueType();
+    Index = DAG.getNode(ISD::SHL, dl, IndexVT, Index,
+                        DAG.getConstant(Log2_32(ScaleVal), dl, IndexVT));
+    Scale = DAG.getTargetConstant(1, dl, Scale.getValueType());
+
+    SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+    ISD::MemIndexType IndexType =
+        N->isIndexSigned() ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED;
+    return DAG.getMaskedScatter(N->getVTList(), N->getMemoryVT(), dl, Ops,
+                                N->getMemOperand(), IndexType,
+                                N->isTruncatingStore());
+  }
+
   if (VT == MVT::v2f32 || VT == MVT::v2i32) {
     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
     // If the index is v2i64 and we have VLX we can use xmm for data and index.
@@ -31714,6 +31730,30 @@
                              N->isTruncatingStore(), N->isCompressingStore());
 }
 
+static SDValue LegalizeMGATHERScale(MaskedGatherSDNode *N, SelectionDAG &DAG) {
+  // Only scales up to 8 are supported.
+  SDValue Scale = N->getScale();
+  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
+  if (ScaleVal > 8) {
+    SDLoc dl(N);
+    SDValue Index = N->getIndex();
+    EVT IndexVT = Index.getValueType();
+    Index = DAG.getNode(ISD::SHL, dl, IndexVT, Index,
+                        DAG.getConstant(Log2_32(ScaleVal), dl, IndexVT));
+    Scale = DAG.getTargetConstant(1, dl, Scale.getValueType());
+
+    SDValue Ops[] = {N->getChain(), N->getPassThru(),
+                     N->getMask(),  N->getBasePtr(),
+                     Index,         Scale};
+    ISD::MemIndexType IndexType =
+        N->isIndexSigned() ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED;
+    return DAG.getMaskedGather(N->getVTList(), N->getMemoryVT(), dl, Ops,
+                               N->getMemOperand(), IndexType,
+                               N->getExtensionType());
+  }
+  return SDValue();
+}
+
 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
   assert(Subtarget.hasAVX2() &&
@@ -31733,6 +31773,9 @@
   if (IndexVT == MVT::v2i32)
     return SDValue();
 
+  if (SDValue Res = LegalizeMGATHERScale(N, DAG))
+    return Res;
+
   // If we don't have VLX and neither the passthru or index is 512-bits, we
   // need to widen until one is.
   MVT OrigVT = VT;
@@ -32898,6 +32941,11 @@
     SDValue Index = Gather->getIndex();
     if (Index.getValueType() != MVT::v2i64)
       return;
+    if (SDValue Res = LegalizeMGATHERScale(Gather, DAG)) {
+      Results.push_back(Res);
+      Results.push_back(Res.getValue(1));
+      return;
+    }
     assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
           "Unexpected type action!");
     EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT);
Index: llvm/test/CodeGen/X86/gather-scatter-opaque-ptr-2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/gather-scatter-opaque-ptr-2.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux -mcpu=skylake -mattr=+avx2 < %s | FileCheck %s
+
+define <2 x float> @gather_v2f32_scale_512(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsllq $9, %xmm0, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqps %xmm1, (%rdi,%xmm2), %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [512 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+define <2 x float> @gather_v2f32_scale_16(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsllq $4, %xmm0, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqps %xmm1, (%rdi,%xmm2), %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [16 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+define <2 x float> @gather_v2f32_scale_8(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm2
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,8), %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [8 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+define <2 x float> @gather_v2f32_scale_4(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm2
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,4), %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [4 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+define <2 x float> @gather_v2f32_scale_3(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm2
+; CHECK-NEXT:    vpaddq %xmm0, %xmm2, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqps %xmm1, (%rdi,%xmm2), %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [3 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x float>)
Index: llvm/test/CodeGen/X86/gather-scatter-opaque-ptr.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/gather-scatter-opaque-ptr.ll
@@ -0,0 +1,150 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux -mcpu=skylake-avx512 < %s | FileCheck %s
+
+define void @scatter_scale_512(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpsllq $9, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [512 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_16(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpsllq $4, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [16 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_8(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,8) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [8 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_4(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [4 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_3(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [3 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define <4 x double> @gather_scale_512(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpsllq $9, %ymm0, %ymm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1), %ymm0 {%k1}
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [512 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_16(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpsllq $4, %ymm0, %ymm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1), %ymm0 {%k1}
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [16 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_8(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm0,8), %ymm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [8 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_4(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm0,4), %ymm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [4 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_3(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1), %ymm0 {%k1}
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [3 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32 immarg, <4 x i1>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32 immarg, <4 x i1>, <4 x double>)
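
Note on the lowering strategy (summary, not part of the patch): x86 gather/scatter addressing modes can only encode scales of 1, 2, 4, and 8, while opaque-pointer GEPs over byte arrays can request an arbitrary element size as the scale. For power-of-two scales above 8, the patch folds the scale into the index with a left shift by log2(scale) and re-emits the gather/scatter with scale 1; that shift is what the vpsllq $4 (scale 16) and vpsllq $9 (scale 512) instructions in the tests correspond to. Below is a minimal standalone C++ sketch of the address identity the transform relies on; it is illustrative only, independent of the LLVM APIs, and the helper names are invented for the sketch.

// addr_scale_fold_sketch.cpp: demonstrates base + index*scale ==
// base + (index << log2(scale)) * 1 for power-of-two scales, the identity
// that LegalizeMGATHERScale and the scatter path above rely on.
#include <cassert>
#include <cstdint>

// Log2 of a power of two; stands in for llvm::Log2_32 in this sketch.
static unsigned log2PowerOf2(uint64_t V) {
  unsigned L = 0;
  while (V > 1) {
    V >>= 1;
    ++L;
  }
  return L;
}

// Computes the effective address, folding scales the hardware cannot
// encode (anything above 8) into the index, as the patch does.
static uint64_t effectiveAddress(uint64_t Base, int64_t Index, uint64_t Scale) {
  if (Scale > 8) {
    Index <<= log2PowerOf2(Scale); // corresponds to the ISD::SHL on the index
    Scale = 1;                     // hardware scale reset to 1
  }
  return Base + static_cast<uint64_t>(Index) * Scale;
}

int main() {
  // Mirrors gather_scale_512 / gather_v2f32_scale_16: lane i must address
  // Base + i*512 (resp. Base + i*16) whether or not the scale is folded.
  for (int64_t I = 0; I < 4; ++I) {
    assert(effectiveAddress(0x1000, I, 512) == 0x1000 + uint64_t(I) * 512);
    assert(effectiveAddress(0x1000, I, 16) == 0x1000 + uint64_t(I) * 16);
  }
  return 0;
}

Non-power-of-two scales never take this path: scale 3 in the tests stays below the ScaleVal > 8 cutoff, and the checks show it handled elsewhere by materializing index*3 with two vpaddq instructions and a scale-1 memory operand, while scales 4 and 8 remain directly encodable in the addressing mode.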