Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31546,6 +31546,22 @@
   SDValue Chain = N->getChain();
   SDValue BasePtr = N->getBasePtr();
 
+  // Only scales up to 8 are supported.
+  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
+  if (ScaleVal > 8) {
+    EVT IndexVT = Index.getValueType();
+    Index = DAG.getNode(ISD::SHL, dl, IndexVT, Index,
+                        DAG.getConstant(Log2_32(ScaleVal), dl, IndexVT));
+    Scale = DAG.getTargetConstant(1, dl, Scale.getValueType());
+
+    SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
+    ISD::MemIndexType IndexType =
+        N->isIndexSigned() ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED;
+    return DAG.getMaskedScatter(N->getVTList(), N->getMemoryVT(), dl, Ops,
+                                N->getMemOperand(), IndexType,
+                                N->isTruncatingStore());
+  }
+
   if (VT == MVT::v2f32 || VT == MVT::v2i32) {
     assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
     // If the index is v2i64 and we have VLX we can use xmm for data and index.
@@ -31709,6 +31725,7 @@
   SDLoc dl(Op);
   MVT VT = Op.getSimpleValueType();
   SDValue Index = N->getIndex();
+  SDValue Scale = N->getScale();
   SDValue Mask = N->getMask();
   SDValue PassThru = N->getPassThru();
   MVT IndexVT = Index.getSimpleValueType();
@@ -31719,6 +31736,23 @@
   if (IndexVT == MVT::v2i32)
     return SDValue();
 
+  // Only scales up to 8 are supported.
+  uint64_t ScaleVal = cast<ConstantSDNode>(Scale)->getZExtValue();
+  if (ScaleVal > 8) {
+    EVT IndexVT = Index.getValueType();
+    Index = DAG.getNode(ISD::SHL, dl, IndexVT, Index,
+                        DAG.getConstant(Log2_32(ScaleVal), dl, IndexVT));
+    Scale = DAG.getTargetConstant(1, dl, Scale.getValueType());
+
+    SDValue Ops[] = {N->getChain(), PassThru, Mask,
+                     N->getBasePtr(), Index, Scale};
+    ISD::MemIndexType IndexType =
+        N->isIndexSigned() ? ISD::SIGNED_UNSCALED : ISD::UNSIGNED_UNSCALED;
+    return DAG.getMaskedGather(N->getVTList(), N->getMemoryVT(), dl, Ops,
+                               N->getMemOperand(), IndexType,
+                               N->getExtensionType());
+  }
+
   // If we don't have VLX and neither the passthru or index is 512-bits, we
   // need to widen until one is.
   MVT OrigVT = VT;
Index: llvm/test/CodeGen/X86/gather-scatter-opaque-ptr.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/gather-scatter-opaque-ptr.ll
@@ -0,0 +1,164 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux -mcpu=skylake-avx512 < %s | FileCheck %s
+
+define void @scatter_scale_512(ptr %result, ptr %idx.ptr, ptr %mask.ptr) {
+; CHECK-LABEL: scatter_scale_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovb (%rdx), %k1
+; CHECK-NEXT: vpsllq $9, (%rsi), %ymm0
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0) {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+  %idx = load <4 x i64>, ptr %idx.ptr
+  %mask = load <4 x i1>, ptr %mask.ptr
+  %gep = getelementptr inbounds [512 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_16(ptr %result, ptr %idx.ptr, ptr %mask.ptr) {
+; CHECK-LABEL: scatter_scale_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovb (%rdx), %k1
+; CHECK-NEXT: vpsllq $4, (%rsi), %ymm0
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0) {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+  %idx = load <4 x i64>, ptr %idx.ptr
+  %mask = load <4 x i1>, ptr %mask.ptr
+  %gep = getelementptr inbounds [16 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_8(ptr %result, ptr %idx.ptr, ptr %mask.ptr) {
+; CHECK-LABEL: scatter_scale_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rsi), %ymm0
+; CHECK-NEXT: kmovb (%rdx), %k1
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,8) {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+  %idx = load <4 x i64>, ptr %idx.ptr
+  %mask = load <4 x i1>, ptr %mask.ptr
+  %gep = getelementptr inbounds [8 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_4(ptr %result, ptr %idx.ptr, ptr %mask.ptr) {
+; CHECK-LABEL: scatter_scale_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rsi), %ymm0
+; CHECK-NEXT: kmovb (%rdx), %k1
+; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+  %idx = load <4 x i64>, ptr %idx.ptr
+  %mask = load <4 x i1>, ptr %mask.ptr
+  %gep = getelementptr inbounds [4 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_3(ptr %result, ptr %idx.ptr, ptr %mask.ptr) {
+; CHECK-LABEL: scatter_scale_3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rsi), %ymm0
+; CHECK-NEXT: kmovb (%rdx), %k1
+; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT: vscatterqpd %ymm1, (%rdi,%ymm0) {%k1}
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
+  %idx = load <4 x i64>, ptr %idx.ptr
+  %mask = load <4 x i1>, ptr %mask.ptr
+  %gep = getelementptr inbounds [3 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define <4 x double> @gather_scale_512(ptr %result, ptr %idx.ptr, ptr %mask.ptr) {
+; CHECK-LABEL: gather_scale_512:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovb (%rdx), %k1
+; CHECK-NEXT: vpsllq $9, (%rsi), %ymm1
+; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vgatherqpd (%rdi,%ymm1), %ymm0 {%k1}
+; CHECK-NEXT: retq
+  %idx = load <4 x i64>, ptr %idx.ptr
+  %mask = load <4 x i1>, ptr %mask.ptr
+  %gep = getelementptr inbounds [512 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_16(ptr %result, ptr %idx.ptr, ptr %mask.ptr) {
+; CHECK-LABEL: gather_scale_16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: kmovb (%rdx), %k1
+; CHECK-NEXT: vpsllq $4, (%rsi), %ymm1
+; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vgatherqpd (%rdi,%ymm1), %ymm0 {%k1}
+; CHECK-NEXT: retq
+  %idx = load <4 x i64>, ptr %idx.ptr
+  %mask = load <4 x i1>, ptr %mask.ptr
+  %gep = getelementptr inbounds [16 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_8(ptr %result, ptr %idx.ptr, ptr %mask.ptr) {
+; CHECK-LABEL: gather_scale_8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rsi), %ymm1
+; CHECK-NEXT: kmovb (%rdx), %k1
+; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,8), %ymm0 {%k1}
+; CHECK-NEXT: retq
+  %idx = load <4 x i64>, ptr %idx.ptr
+  %mask = load <4 x i1>, ptr %mask.ptr
+  %gep = getelementptr inbounds [8 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_4(ptr %result, ptr %idx.ptr, ptr %mask.ptr) {
+; CHECK-LABEL: gather_scale_4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovapd (%rsi), %ymm1
+; CHECK-NEXT: kmovb (%rdx), %k1
+; CHECK-NEXT: vxorpd %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
+; CHECK-NEXT: retq
+  %idx = load <4 x i64>, ptr %idx.ptr
+  %mask = load <4 x i1>, ptr %mask.ptr
+  %gep = getelementptr inbounds [4 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_3(ptr %result, ptr %idx.ptr, ptr %mask.ptr) {
+; CHECK-LABEL: gather_scale_3:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rsi), %ymm0
+; CHECK-NEXT: kmovb (%rdx), %k1
+; CHECK-NEXT: vpaddq %ymm0, %ymm0, %ymm1
+; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm1
+; CHECK-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vgatherqpd (%rdi,%ymm1), %ymm0 {%k1}
+; CHECK-NEXT: retq
+  %idx = load <4 x i64>, ptr %idx.ptr
+  %mask = load <4 x i1>, ptr %mask.ptr
+  %gep = getelementptr inbounds [3 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32 immarg, <4 x i1>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32 immarg, <4 x i1>, <4 x double>)
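
Note (illustration for reviewers, not part of the patch itself): the x86 SIB/VSIB addressing mode can only encode scales of 1, 2, 4 and 8, so for larger constant scales the new lowering folds the scale into the index, i.e. base + idx * 512 becomes base + (idx << 9) * 1; that is why scatter_scale_512/gather_scale_512 above select a vpsllq $9 ahead of the scatter/gather. A minimal standalone input in the same spirit as the tests, with a hypothetical function name not taken from the patch:

define <4 x double> @example_scale_512(ptr %base, <4 x i64> %idx, <4 x i1> %mask) {
  ; A GEP over [512 x i8] produces a masked gather whose scale (512) exceeds 8,
  ; so the X86 lowering rewrites the index as (idx << 9) and uses scale 1.
  %gep = getelementptr inbounds [512 x i8], ptr %base, <4 x i64> %idx
  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
  ret <4 x double> %res
}

declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32 immarg, <4 x i1>, <4 x double>)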