diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -368,6 +368,7 @@
     bool optimizeInst(Instruction *I, bool &ModifiedDT);
     bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
                             Type *AccessTy, unsigned AddrSpace);
+    bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
     bool optimizeInlineAsmInst(CallInst *CS);
     bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
     bool optimizeExt(Instruction *&I);
@@ -2041,7 +2042,12 @@
       II->eraseFromParent();
       return true;
     }
+    break;
   }
+  case Intrinsic::masked_gather:
+    return optimizeGatherScatterInst(II, II->getArgOperand(0));
+  case Intrinsic::masked_scatter:
+    return optimizeGatherScatterInst(II, II->getArgOperand(1));
   }
 
   SmallVector<Value *, 2> PtrOps;
@@ -5182,6 +5188,119 @@
   return true;
 }
 
+/// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
+/// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
+/// only handle a 2 operand GEP in the same basic block or a splat constant
+/// vector. The 2 operands to the GEP must have a scalar pointer and a vector
+/// index.
+///
+/// If the existing GEP has a vector base pointer that is splat, we can look
+/// through the splat to find the scalar pointer. If we can't find a scalar
+/// pointer there's nothing we can do.
+///
+/// If we have a GEP with more than 2 indices where the middle indices are all
+/// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
+///
+/// If the final index isn't a vector or is a splat, we can emit a scalar GEP
+/// followed by a GEP with an all zeroes vector index. This will enable
+/// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a
+/// zero index.
+bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
+                                               Value *Ptr) {
+  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!GEP || !GEP->hasIndices())
+    return false;
+
+  // If the GEP and the gather/scatter aren't in the same BB, don't optimize.
+  // FIXME: We should support this by sinking the GEP.
+  if (MemoryInst->getParent() != GEP->getParent())
+    return false;
+
+  SmallVector<Value *, 2> Ops(GEP->op_begin(), GEP->op_end());
+
+  bool RewriteGEP = false;
+
+  if (Ops[0]->getType()->isVectorTy()) {
+    Ops[0] = const_cast<Value *>(getSplatValue(Ops[0]));
+    if (!Ops[0])
+      return false;
+    RewriteGEP = true;
+  }
+
+  unsigned FinalIndex = Ops.size() - 1;
+
+  // Ensure all but the last index are 0.
+  // FIXME: This isn't strictly required. All that's required is that they are
+  // all scalars or splats.
+  for (unsigned i = 1; i < FinalIndex; ++i) {
+    auto *C = dyn_cast<Constant>(Ops[i]);
+    if (!C)
+      return false;
+    if (isa<VectorType>(C->getType()))
+      C = C->getSplatValue();
+    auto *CI = dyn_cast_or_null<ConstantInt>(C);
+    if (!CI || !CI->isZero())
+      return false;
+    // Scalarize the index if needed.
+    Ops[i] = CI;
+  }
+
+  // Try to scalarize the final index.
+  if (Ops[FinalIndex]->getType()->isVectorTy()) {
+    if (Value *V = const_cast<Value *>(getSplatValue(Ops[FinalIndex]))) {
+      auto *C = dyn_cast<ConstantInt>(V);
+      // Don't scalarize an all zeroes vector.
+      if (!C || !C->isZero()) {
+        Ops[FinalIndex] = V;
+        RewriteGEP = true;
+      }
+    }
+  }
+
+  // If we made any changes or we have extra operands, we need to generate
+  // new instructions.
+  if (!RewriteGEP && Ops.size() == 2)
+    return false;
+
+  unsigned NumElts = Ptr->getType()->getVectorNumElements();
+
+  IRBuilder<> Builder(MemoryInst);
+
+  Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
+
+  Value *NewAddr;
+
+  // If the final index isn't a vector, emit a scalar GEP containing all ops
+  // and a vector GEP with an all zeroes final index.
+  if (!Ops[FinalIndex]->getType()->isVectorTy()) {
+    NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front());
+    Type *IndexTy = VectorType::get(ScalarIndexTy, NumElts);
+    NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy));
+  } else {
+    Value *Base = Ops[0];
+    Value *Index = Ops[FinalIndex];
+
+    // Create a scalar GEP if there are more than 2 operands.
+    if (Ops.size() != 2) {
+      // Replace the last index with 0.
+      Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy);
+      Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front());
+    }
+
+    // Now create the GEP with scalar pointer and vector index.
+    NewAddr = Builder.CreateGEP(Base, Index);
+  }
+
+  MemoryInst->replaceUsesOfWith(Ptr, NewAddr);
+
+  // If the original pointer has no uses, recursively delete it and all dead
+  // instructions using it.
+  if (Ptr->use_empty())
+    RecursivelyDeleteTriviallyDeadInstructions(Ptr, TLInfo);
+
+  return true;
+}
+
 /// If there are any memory operands, use OptimizeMemoryInst to sink their
 /// address computing into the block when possible / profitable.
 bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
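The new CodeGenPrepare test added at the end of this patch only exercises masked gathers. As a rough illustration of the masked_scatter path wired up above (this sketch is not part of the patch; all value and function names are made up), the same splat-base rewrite applies to the pointer operand of a scatter:

; Illustrative sketch only (not from the patch): a scatter whose GEP has a
; splat vector base. optimizeGatherScatterInst would rewrite %gep to use the
; scalar %base directly, i.e. getelementptr i32, i32* %base, <4 x i64> %index.
define void @splat_base_scatter(i32* %base, <4 x i64> %index, <4 x i32> %val) {
  %insert = insertelement <4 x i32*> undef, i32* %base, i32 0
  %splat = shufflevector <4 x i32*> %insert, <4 x i32*> undef, <4 x i32> zeroinitializer
  %gep = getelementptr i32, <4 x i32*> %splat, <4 x i64> %index
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %val, <4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ret void
}
declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)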
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -518,7 +518,6 @@
   void resolveOrClearDbgInfo();
 
   SDValue getValue(const Value *V);
-  bool findValue(const Value *V) const;
 
   /// Return the SDNode for the specified IR value if it exists.
   SDNode *getNodeForIRValue(const Value *V) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1435,12 +1435,6 @@
   return Val;
 }
 
-// Return true if SDValue exists for the given Value
-bool SelectionDAGBuilder::findValue(const Value *V) const {
-  return (NodeMap.find(V) != NodeMap.end()) ||
-         (FuncInfo.ValueMap.find(V) != FuncInfo.ValueMap.end());
-}
-
 /// getNonRegisterValue - Return an SDValue for the given Value, but
 /// don't look in FuncInfo.ValueMap for a virtual register.
 SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
@@ -4254,70 +4248,49 @@
 // In all other cases the function returns 'false'.
 static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index,
                            ISD::MemIndexType &IndexType, SDValue &Scale,
-                           SelectionDAGBuilder *SDB) {
+                           SelectionDAGBuilder *SDB, const BasicBlock *CurBB) {
   SelectionDAG& DAG = SDB->DAG;
-  LLVMContext &Context = *DAG.getContext();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const DataLayout &DL = DAG.getDataLayout();
 
   assert(Ptr->getType()->isVectorTy() && "Uexpected pointer type");
 
-  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
-  if (!GEP)
-    return false;
-
-  const Value *BasePtr = GEP->getPointerOperand();
-  if (BasePtr->getType()->isVectorTy()) {
-    BasePtr = getSplatValue(BasePtr);
-    if (!BasePtr)
+  // Handle splat constant pointer.
+  if (auto *C = dyn_cast<Constant>(Ptr)) {
+    C = C->getSplatValue();
+    if (!C)
       return false;
-  }
 
-  unsigned FinalIndex = GEP->getNumOperands() - 1;
-  Value *IndexVal = GEP->getOperand(FinalIndex);
-  gep_type_iterator GTI = gep_type_begin(*GEP);
+    Base = SDB->getValue(C);
 
-  // Ensure all the other indices are 0.
-  for (unsigned i = 1; i < FinalIndex; ++i, ++GTI) {
-    auto *C = dyn_cast<Constant>(GEP->getOperand(i));
-    if (!C)
-      return false;
-    if (isa<VectorType>(C->getType()))
-      C = C->getSplatValue();
-    auto *CI = dyn_cast_or_null<ConstantInt>(C);
-    if (!CI || !CI->isZero())
-      return false;
+    unsigned NumElts = Ptr->getType()->getVectorNumElements();
+    EVT VT = EVT::getVectorVT(*DAG.getContext(), TLI.getPointerTy(DL), NumElts);
+    Index = DAG.getConstant(0, SDB->getCurSDLoc(), VT);
+    IndexType = ISD::SIGNED_SCALED;
+    Scale = DAG.getTargetConstant(1, SDB->getCurSDLoc(), TLI.getPointerTy(DL));
+    return true;
   }
 
-  // The operands of the GEP may be defined in another basic block.
-  // In this case we'll not find nodes for the operands.
-  if (!SDB->findValue(BasePtr))
+  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!GEP || GEP->getParent() != CurBB)
     return false;
-  Constant *C = dyn_cast<Constant>(IndexVal);
-  if (!C && !SDB->findValue(IndexVal))
+
+  if (GEP->getNumOperands() != 2)
     return false;
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  const DataLayout &DL = DAG.getDataLayout();
-  StructType *STy = GTI.getStructTypeOrNull();
+  const Value *BasePtr = GEP->getPointerOperand();
+  const Value *IndexVal = GEP->getOperand(GEP->getNumOperands() - 1);
+
+  // Make sure the base is scalar and the index is a vector.
+  if (BasePtr->getType()->isVectorTy() || !IndexVal->getType()->isVectorTy())
+    return false;
 
-  if (STy) {
-    const StructLayout *SL = DL.getStructLayout(STy);
-    unsigned Field = cast<Constant>(IndexVal)->getUniqueInteger().getZExtValue();
-    Scale = DAG.getTargetConstant(1, SDB->getCurSDLoc(), TLI.getPointerTy(DL));
-    Index = DAG.getConstant(SL->getElementOffset(Field),
-                            SDB->getCurSDLoc(), TLI.getPointerTy(DL));
-  } else {
-    Scale = DAG.getTargetConstant(
-                DL.getTypeAllocSize(GEP->getResultElementType()),
-                SDB->getCurSDLoc(), TLI.getPointerTy(DL));
-    Index = SDB->getValue(IndexVal);
-  }
   Base = SDB->getValue(BasePtr);
+  Index = SDB->getValue(IndexVal);
   IndexType = ISD::SIGNED_SCALED;
-
-  if (STy || !Index.getValueType().isVector()) {
-    unsigned GEPWidth = cast<VectorType>(GEP->getType())->getNumElements();
-    EVT VT = EVT::getVectorVT(Context, Index.getValueType(), GEPWidth);
-    Index = DAG.getSplatBuildVector(VT, SDLoc(Index), Index);
-  }
+  Scale = DAG.getTargetConstant(
+              DL.getTypeAllocSize(GEP->getResultElementType()),
+              SDB->getCurSDLoc(), TLI.getPointerTy(DL));
 
   return true;
 }
@@ -4341,7 +4314,8 @@
   SDValue Index;
   ISD::MemIndexType IndexType;
   SDValue Scale;
-  bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this);
+  bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+                                    I.getParent());
 
   unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
@@ -4452,7 +4426,8 @@
   SDValue Index;
   ISD::MemIndexType IndexType;
   SDValue Scale;
-  bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this);
+  bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+                                    I.getParent());
   unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
   MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
     MachinePointerInfo(AS), MachineMemOperand::MOLoad,
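For reference, after this change getUniformBase only recognizes two shapes of pointer operand; everything else is expected to have been canonicalized by CodeGenPrepare. A minimal sketch of the two shapes (illustrative IR, not taken from the patch; @g and the value names are hypothetical):

; 1) A splat constant pointer vector: Base is the splat element, Index is an
;    all-zeroes vector, and Scale is 1.
;      %r1 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(
;                <4 x i32*> <i32* @g, i32* @g, i32* @g, i32* @g>, ...)
; 2) A two-operand GEP, defined in the same basic block as the gather/scatter,
;    with a scalar base and a vector index: Base and Index come from the GEP
;    operands and Scale is the element allocation size.
;      %gep = getelementptr i32, i32* %base, <4 x i64> %index
;      %r2 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, ...)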
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -1721,11 +1721,10 @@
 ; AVX512-NEXT:    vptestnmd %zmm0, %zmm0, %k0
 ; AVX512-NEXT:    kshiftlw $8, %k0, %k0
 ; AVX512-NEXT:    kshiftrw $8, %k0, %k1
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3]
+; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT:    kmovw %k1, %k2
-; AVX512-NEXT:    vpgatherdd c(,%zmm0,4), %zmm1 {%k2}
-; AVX512-NEXT:    vpbroadcastd {{.*#+}} zmm0 = [28,28,28,28,28,28,28,28,28,28,28,28,28,28,28,28]
-; AVX512-NEXT:    vpgatherdd c(,%zmm0), %zmm2 {%k1}
+; AVX512-NEXT:    vpgatherdd c+12(,%zmm0), %zmm1 {%k2}
+; AVX512-NEXT:    vpgatherdd c+28(,%zmm0), %zmm2 {%k1}
 ; AVX512-NEXT:    vpaddd %ymm2, %ymm2, %ymm0
 ; AVX512-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -638,30 +638,38 @@
 define <16 x float> @test11(float* %base, i32 %ind) {
 ; KNL_64-LABEL: test11:
 ; KNL_64:       # %bb.0:
-; KNL_64-NEXT:    vpbroadcastd %esi, %zmm1
+; KNL_64-NEXT:    movslq %esi, %rax
+; KNL_64-NEXT:    leaq (%rdi,%rax,4), %rax
+; KNL_64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; KNL_64-NEXT:    kxnorw %k0, %k0, %k1
-; KNL_64-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; KNL_64-NEXT:    vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test11:
 ; KNL_32:       # %bb.0:
 ; KNL_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; KNL_32-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %zmm1
+; KNL_32-NEXT:    shll $2, %eax
+; KNL_32-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; KNL_32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; KNL_32-NEXT:    kxnorw %k0, %k0, %k1
 ; KNL_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test11:
 ; SKX:       # %bb.0:
-; SKX-NEXT:    vpbroadcastd %esi, %zmm1
+; SKX-NEXT:    movslq %esi, %rax
+; SKX-NEXT:    leaq (%rdi,%rax,4), %rax
+; SKX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
-; SKX-NEXT:    vgatherdps (%rdi,%zmm1,4), %zmm0 {%k1}
+; SKX-NEXT:    vgatherdps (%rax,%zmm1,4), %zmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test11:
 ; SKX_32:       # %bb.0:
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %zmm1
+; SKX_32-NEXT:    shll $2, %eax
+; SKX_32-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; SKX_32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; SKX_32-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX_32-NEXT:    vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
 ; SKX_32-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr45067.ll b/llvm/test/CodeGen/X86/pr45067.ll
--- a/llvm/test/CodeGen/X86/pr45067.ll
+++ b/llvm/test/CodeGen/X86/pr45067.ll
@@ -6,13 +6,13 @@
 define void @foo(<8 x i32>* %x, <8 x i1> %y) {
 ; CHECK-LABEL: foo:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpbroadcastq _global@{{.*}}(%rip), %ymm2
-; CHECK-NEXT:    vpgatherqd %xmm1, (,%ymm2), %xmm3
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm1, %ymm1
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    movq _global@{{.*}}(%rip), %rax
+; CHECK-NEXT:    vpgatherdd %ymm1, (%rax,%ymm2), %ymm3
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; CHECK-NEXT:    vpslld $31, %ymm0, %ymm0
-; CHECK-NEXT:    vinserti128 $1, %xmm3, %ymm3, %ymm1
-; CHECK-NEXT:    vpmaskmovd %ymm1, %ymm0, (%rdi)
+; CHECK-NEXT:    vpmaskmovd %ymm3, %ymm0, (%rdi)
 ; CHECK-NEXT:    ud2
   %tmp = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32 4, <8 x i1> , <8 x i32> undef)
   call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %tmp, <8 x i32>* %x, i32 4, <8 x i1> %y)
diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/CodeGenPrepare/X86/gather-scatter-opt.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.a = type { i32, i32 }
+@c = external dso_local global %struct.a, align 4
+@glob_array = internal unnamed_addr constant [16 x i32] [i32 1, i32 1, i32 2, i32 3, i32 5, i32 8, i32 13, i32 21, i32 34, i32 55, i32 89, i32 144, i32 233, i32 377, i32 610, i32 987], align 16
+
+define <4 x i32> @splat_base(i32* %base, <4 x i64> %index) {
+; CHECK-LABEL: @splat_base(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], <4 x i64> [[INDEX:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <4 x i32*> undef, i32* %base, i32 0
+  %broadcast.splat = shufflevector <4 x i32*> %broadcast.splatinsert, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %gep = getelementptr i32, <4 x i32*> %broadcast.splat, <4 x i64> %index
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @splat_struct(%struct.a* %base) {
+; CHECK-LABEL: @splat_struct(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr [[STRUCT_A:%.*]], %struct.a* [[BASE:%.*]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %gep = getelementptr %struct.a, %struct.a* %base, <4 x i64> zeroinitializer, i32 1
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @scalar_index(i32* %base, i64 %index) {
+; CHECK-LABEL: @scalar_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <4 x i32*> undef, i32* %base, i32 0
+  %broadcast.splat = shufflevector <4 x i32*> %broadcast.splatinsert, <4 x i32*> undef, <4 x i32> zeroinitializer
+  %gep = getelementptr i32, <4 x i32*> %broadcast.splat, i64 %index
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @splat_index(i32* %base, i64 %index) {
+; CHECK-LABEL: @splat_index(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* [[BASE:%.*]], i64 [[INDEX:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], <4 x i64> zeroinitializer
+; CHECK-NEXT:    [[RES:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP2]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[RES]]
+;
+  %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0
+  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
+  %gep = getelementptr i32, i32* %base, <4 x i64> %broadcast.splat
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %res
+}
+
+define <4 x i32> @test_global_array(<4 x i64> %indxs) {
+; CHECK-LABEL: @test_global_array(
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i32, i32* getelementptr inbounds ([16 x i32], [16 x i32]* @glob_array, i64 0, i64 0), <4 x i64> [[INDXS:%.*]]
+; CHECK-NEXT:    [[G:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> [[TMP1]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[G]]
+;
+  %p = getelementptr inbounds [16 x i32], [16 x i32]* @glob_array, i64 0, <4 x i64> %indxs
+  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %p, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %g
+}
+
+define <4 x i32> @global_struct_splat() {
+; CHECK-LABEL: @global_struct_splat(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> , i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+;
+  %1 = insertelement <4 x %struct.a*> undef, %struct.a* @c, i32 0
+  %2 = shufflevector <4 x %struct.a*> %1, <4 x %struct.a*> undef, <4 x i32> zeroinitializer
+  %3 = getelementptr %struct.a, <4 x %struct.a*> %2, <4 x i64> zeroinitializer, i32 1
+  %4 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  ret <4 x i32> %4
+}
+
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
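One case the new CodeGenPrepare transform deliberately skips (per the FIXME about sinking the GEP): a GEP that does not live in the same basic block as its gather/scatter. A hedged sketch of that situation follows; it is not part of the new test file, the names are illustrative, and it reuses the gather declaration above.

; Sketch only: %gep is defined in a different block than the gather, so
; optimizeGatherScatterInst returns false, and getUniformBase will not see a
; same-block two-operand GEP either.
define <4 x i32> @cross_block(i32* %base, <4 x i64> %index, i1 %c) {
entry:
  %insert = insertelement <4 x i32*> undef, i32* %base, i32 0
  %splat = shufflevector <4 x i32*> %insert, <4 x i32*> undef, <4 x i32> zeroinitializer
  %gep = getelementptr i32, <4 x i32*> %splat, <4 x i64> %index
  br i1 %c, label %do.gather, label %exit

do.gather:
  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %res

exit:
  ret <4 x i32> zeroinitializer
}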