Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4361,7 +4361,8 @@
 // In all other cases the function returns 'false'.
 static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index,
                            ISD::MemIndexType &IndexType, SDValue &Scale,
-                           SelectionDAGBuilder *SDB, const BasicBlock *CurBB) {
+                           SelectionDAGBuilder *SDB, const BasicBlock *CurBB,
+                           uint64_t ElemSize) {
   SelectionDAG& DAG = SDB->DAG;
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   const DataLayout &DL = DAG.getDataLayout();
@@ -4402,9 +4403,11 @@
   Index = SDB->getValue(IndexVal);
   IndexType = ISD::SIGNED_SCALED;
 
-  // MGATHER/MSCATTER only support scaling by a power-of-two.
+  // MGATHER/MSCATTER are only required to support scaling by one or by the
+  // element size. Other scales may be produced using target-specific DAG
+  // combines.
   uint64_t ScaleVal = DL.getTypeAllocSize(GEP->getResultElementType());
-  if (!isPowerOf2_64(ScaleVal))
+  if (ScaleVal != ElemSize && ScaleVal != 1)
     return false;
 
   Scale =
@@ -4430,7 +4433,7 @@
   ISD::MemIndexType IndexType;
   SDValue Scale;
   bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
-                                    I.getParent());
+                                    I.getParent(), VT.getScalarStoreSize());
 
   unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
@@ -4538,7 +4541,7 @@
   ISD::MemIndexType IndexType;
   SDValue Scale;
   bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
-                                    I.getParent());
+                                    I.getParent(), VT.getScalarStoreSize());
   unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
       MachinePointerInfo(AS), MachineMemOperand::MOLoad,
@@ -7408,7 +7411,8 @@
     SDValue Base, Index, Scale;
     ISD::MemIndexType IndexType;
     bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
-                                      this, VPIntrin.getParent());
+                                      this, VPIntrin.getParent(),
+                                      VT.getScalarStoreSize());
     if (!UniformBase) {
       Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()));
       Index = getValue(PtrOperand);
@@ -7464,7 +7468,8 @@
     SDValue Base, Index, Scale;
     ISD::MemIndexType IndexType;
     bool UniformBase = getUniformBase(PtrOperand, Base, Index, IndexType, Scale,
-                                      this, VPIntrin.getParent());
+                                      this, VPIntrin.getParent(),
+                                      VT.getScalarStoreSize());
     if (!UniformBase) {
       Base = DAG.getConstant(0, DL, TLI.getPointerTy(DAG.getDataLayout()));
       Index = getValue(PtrOperand);
Index: llvm/test/CodeGen/X86/gather-scatter-opaque-ptr-2.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/gather-scatter-opaque-ptr-2.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux -mcpu=skylake -mattr=+avx2 < %s | FileCheck %s
+
+define <2 x float> @gather_v2f32_scale_512(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsllq $9, %xmm0, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqps %xmm1, (%rdi,%xmm2), %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [512 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+define <2 x float> @gather_v2f32_scale_16(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsllq $4, %xmm0, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqps %xmm1, (%rdi,%xmm2), %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [16 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+define <2 x float> @gather_v2f32_scale_8(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm2
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,8), %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [8 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+define <2 x float> @gather_v2f32_scale_4(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm2
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,4), %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [4 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+define <2 x float> @gather_v2f32_scale_3(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm2
+; CHECK-NEXT:    vpaddq %xmm0, %xmm2, %xmm2
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm0, %xmm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqps %xmm1, (%rdi,%xmm2), %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [3 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+define <2 x float> @gather_v2f32_scale_1(ptr %result, <2 x i64> %idx, <2 x i1> %mask) {
+; CHECK-LABEL: gather_v2f32_scale_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm2
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0), %xmm1
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [1 x i8], ptr %result, <2 x i64> %idx
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr> %gep, i32 0, <2 x i1> %mask, <2 x float> zeroinitializer)
+  ret <2 x float> %res
+}
+
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0(<2 x ptr>, i32 immarg, <2 x i1>, <2 x float>)
Index: llvm/test/CodeGen/X86/gather-scatter-opaque-ptr.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/X86/gather-scatter-opaque-ptr.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux -mcpu=skylake-avx512 < %s | FileCheck %s
+
+define void @scatter_scale_512(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpsllq $9, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [512 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_16(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpsllq $4, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [16 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_8(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,8) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [8 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_4(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [4 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_3(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [3 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define void @scatter_scale_1(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: scatter_scale_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0) {%k1}
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [1 x i8], ptr %result, <4 x i64> %idx
+  call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> zeroinitializer, <4 x ptr> %gep, i32 0, <4 x i1> %mask)
+  ret void
+}
+
+define <4 x double> @gather_scale_512(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpsllq $9, %ymm0, %ymm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1), %ymm0 {%k1}
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [512 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_16(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpsllq $4, %ymm0, %ymm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1), %ymm0 {%k1}
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [16 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_8(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm0,8), %ymm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [8 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_4(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm0,4), %ymm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [4 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_3(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm1
+; CHECK-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1), %ymm0 {%k1}
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [3 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+define <4 x double> @gather_scale_1(ptr %result, <4 x i64> %idx, <4 x i1> %mask) {
+; CHECK-LABEL: gather_scale_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpslld $31, %xmm1, %xmm1
+; CHECK-NEXT:    vpmovd2m %xmm1, %k1
+; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm0), %ymm1 {%k1}
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %gep = getelementptr inbounds [1 x i8], ptr %result, <4 x i64> %idx
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %gep, i32 0, <4 x i1> %mask, <4 x double> zeroinitializer)
+  ret <4 x double> %res
+}
+
+declare void @llvm.masked.scatter.v4f64.v4p0(<4 x double>, <4 x ptr>, i32 immarg, <4 x i1>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr>, i32 immarg, <4 x i1>, <4 x double>)