diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -370,18 +370,17 @@
     return Dummy;
   };
 
-  const auto CreateTempPtrIntCast =
-      [&Builder, VecStoreSize](Value *Val, Type *PtrTy) -> Value * {
-    const unsigned TempIntSize = (VecStoreSize * 8);
+  const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
+                                                   Type *PtrTy) -> Value * {
+    assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
+    const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
     if (!PtrTy->isVectorTy())
-      return Builder.CreateBitOrPointerCast(Val,
-                                            Builder.getIntNTy(TempIntSize));
+      return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
     const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
     // If we want to cast, e.g., a <2 x ptr> into a <4 x i32>, we need to
     // first cast the ptr vector to <2 x i64>.
-    assert(alignTo(TempIntSize, NumPtrElts) == TempIntSize &&
-           "Vector size not divisble");
-    Type *EltTy = Builder.getIntNTy(TempIntSize / NumPtrElts);
+    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
+    Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
     return Builder.CreateBitOrPointerCast(
         Val, FixedVectorType::get(EltTy, NumPtrElts));
   };
@@ -399,21 +398,46 @@
         cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
 
     // We're loading the full vector.
-    if (DL.getTypeStoreSize(Inst->getType()) == VecStoreSize) {
-      assert(cast<ConstantInt>(Index)->isZeroValue());
-      Type *InstTy = Inst->getType();
-      if (InstTy->isPtrOrPtrVectorTy())
-        CurVal = CreateTempPtrIntCast(CurVal, InstTy);
-      Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, InstTy);
+    Type *AccessTy = Inst->getType();
+    unsigned AccessSize = DL.getTypeStoreSize(AccessTy);
+    if (AccessSize == VecStoreSize && cast<ConstantInt>(Index)->isZeroValue()) {
+      if (AccessTy->isPtrOrPtrVectorTy())
+        CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
+      else if (CurVal->getType()->isPtrOrPtrVectorTy())
+        CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
+      Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
       Inst->replaceAllUsesWith(NewVal);
       return nullptr;
     }
 
+    // Loading a subvector.
+    if (isa<FixedVectorType>(AccessTy)) {
+      assert(AccessSize % DL.getTypeStoreSize(VecEltTy) == 0);
+      const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
+      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts);
+      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
+
+      unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue();
+      Value *SubVec = PoisonValue::get(SubVecTy);
+      for (unsigned K = 0; K < NumElts; ++K) {
+        SubVec = Builder.CreateInsertElement(
+            SubVec, Builder.CreateExtractElement(CurVal, IndexVal + K), K);
+      }
+
+      if (AccessTy->isPtrOrPtrVectorTy())
+        SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
+      else if (SubVecTy->isPtrOrPtrVectorTy())
+        SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
+
+      SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
+      Inst->replaceAllUsesWith(SubVec);
+      return nullptr;
+    }
+
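+    // For example: with the alloca promoted to <4 x i64>, loading a
+    // <2 x i64> subvector at index 1 expands (before the builder folds
+    // anything) to roughly:
+    //   %elt0 = extractelement <4 x i64> %cur, i64 1
+    //   %sub0 = insertelement <2 x i64> poison, i64 %elt0, i64 0
+    //   %elt1 = extractelement <4 x i64> %cur, i64 2
+    //   %sub1 = insertelement <2 x i64> %sub0, i64 %elt1, i64 1
+    // (%cur/%elt*/%sub* are illustrative names, not what the builder emits.)
+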
     // We're loading one element.
     Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
-    if (Inst->getType() != VecEltTy)
-      ExtractElement =
-          Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
+    if (AccessTy != VecEltTy)
+      ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
 
     Inst->replaceAllUsesWith(ExtractElement);
     return nullptr;
@@ -428,14 +452,39 @@
     Value *Val = SI->getValueOperand();
 
     // We're storing the full vector, we can handle this without knowing CurVal.
-    if (DL.getTypeStoreSize(Val->getType()) == VecStoreSize) {
-      assert(cast<ConstantInt>(Index)->isZeroValue());
-      Type *SrcTy = Val->getType();
-      if (SrcTy->isPtrOrPtrVectorTy())
-        Val = CreateTempPtrIntCast(Val, SrcTy);
+    Type *AccessTy = Val->getType();
+    unsigned AccessSize = DL.getTypeStoreSize(AccessTy);
+    if (AccessSize == VecStoreSize && cast<ConstantInt>(Index)->isZeroValue()) {
+      if (AccessTy->isPtrOrPtrVectorTy())
+        Val = CreateTempPtrIntCast(Val, AccessTy);
+      else if (VectorTy->isPtrOrPtrVectorTy())
+        Val = CreateTempPtrIntCast(Val, VectorTy);
       return Builder.CreateBitOrPointerCast(Val, VectorTy);
     }
 
+    // Storing a subvector.
+    if (isa<FixedVectorType>(AccessTy)) {
+      assert(AccessSize % DL.getTypeStoreSize(VecEltTy) == 0);
+      const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
+      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts);
+      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
+
+      if (SubVecTy->isPtrOrPtrVectorTy())
+        Val = CreateTempPtrIntCast(Val, SubVecTy);
+      else if (AccessTy->isPtrOrPtrVectorTy())
+        Val = CreateTempPtrIntCast(Val, AccessTy);
+
+      Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
+
+      unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue();
+      Value *CurVec = GetOrLoadCurrentVectorValue();
+      for (unsigned K = 0; (IndexVal + K) < NumElts; ++K) {
+        CurVec = Builder.CreateInsertElement(
+            CurVec, Builder.CreateExtractElement(Val, K), IndexVal + K);
+      }
+      return CurVec;
+    }
+
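+    // For example: storing <2 x i64> %val at index 0 of the promoted
+    // <4 x i64> vector %cur expands (before folding) to roughly:
+    //   %elt0 = extractelement <2 x i64> %val, i64 0
+    //   %cur0 = insertelement <4 x i64> %cur, i64 %elt0, i64 0
+    //   %elt1 = extractelement <2 x i64> %val, i64 1
+    //   %cur1 = insertelement <4 x i64> %cur0, i64 %elt1, i64 1
+    // (%val/%cur* are illustrative names; lanes that would land past the end
+    // of the alloca vector are simply not written.)
+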
     if (Val->getType() != VecEltTy)
       Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
     return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
@@ -486,6 +535,29 @@
   llvm_unreachable("Did not return after promoting instruction!");
 }
 
+static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
+                                  const DataLayout &DL) {
+  // Access as a vector type can work if the size of the access vector is a
+  // multiple of the size of the alloca's vector element type.
+  //
+  // Examples:
+  //    - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
+  //    - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
+  //    - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
+  //        - 3*32 is not a multiple of 64
+  //
+  // We could handle more complicated cases, but it'd make things a lot more
+  // complicated.
+  if (isa<FixedVectorType>(AccessTy)) {
+    TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
+    TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
+    return AccTS.isKnownMultipleOf(VecTS);
+  }
+
+  return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
+                                              DL);
+}
+
 /// Iterates over an instruction worklist that may contain multiple instructions
 /// from the same basic block, but in a different order.
 template <typename InstContainer>
@@ -616,10 +688,10 @@
       // Check that this is a simple access of a vector element.
       bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
                                           : cast<StoreInst>(Inst)->isSimple();
-      if (!IsSimple ||
-          !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, *DL))
-        return RejectUser(Inst, "not simple and/or vector element type not "
-                                "castable to access type");
+      if (!IsSimple)
+        return RejectUser(Inst, "not a simple load or store");
+      if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
+        return RejectUser(Inst, "not a supported access type");
 
       WorkList.push_back(Inst);
       continue;
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
--- a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll
@@ -38,6 +38,24 @@
   ret void
 }
 
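+; The store below starts at element 3, so only lane 0 of %arg is written to
+; the promoted vector; the lanes the reload pulls from past the end of the
+; vector fold to undef/poison.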
+define <4 x i64> @test_fullvec_out_of_bounds(<4 x i64> %arg) {
+; CHECK-LABEL: define <4 x i64> @test_fullvec_out_of_bounds
+; CHECK-SAME: (<4 x i64> [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <4 x i64> [[ARG]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i64 3
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> <i64 undef, i64 poison, i64 poison, i64 poison>, i64 [[TMP0]], i64 1
+; CHECK-NEXT:    ret <4 x i64> [[TMP2]]
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  %stack.2 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 2
+  %stack.3 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 3
+  store <4 x i64> %arg, ptr addrspace(5) %stack.3
+  %reload = load <4 x i64>, ptr addrspace(5) %stack.2
+  ret <4 x i64> %reload
+}
+
 define amdgpu_kernel void @test_no_overwrite(i64 %val, i1 %cond) {
 ; CHECK-LABEL: define amdgpu_kernel void @test_no_overwrite
 ; CHECK-SAME: (i64 [[VAL:%.*]], i1 [[COND:%.*]]) {
@@ -123,16 +141,40 @@
   ret <4 x ptr addrspace(3)> %tmp
 }
 
-; Currently rejected due to the store not being cast-able.
-; TODO: We should probably be able to vectorize this
+define <8 x i16> @ptralloca_load_store_ints_full(<2 x i64> %arg) {
+; CHECK-LABEL: define <8 x i16> @ptralloca_load_store_ints_full
+; CHECK-SAME: (<2 x i64> [[ARG:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[ARG]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = inttoptr <4 x i32> [[TMP0]] to <4 x ptr addrspace(5)>
+; CHECK-NEXT:    [[TMP2:%.*]] = ptrtoint <4 x ptr addrspace(5)> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+;
+entry:
+  %stack = alloca [4 x ptr addrspace(5)], align 4, addrspace(5)
+  store <2 x i64> %arg, ptr addrspace(5) %stack
+  %reload = load <8 x i16>, ptr addrspace(5) %stack
+  ret <8 x i16> %reload
+}
+
 define void @alloca_load_store_ptr_mixed_ptrvec(<2 x ptr addrspace(3)> %arg) {
 ; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_ptrvec
 ; CHECK-SAME: (<2 x ptr addrspace(3)> [[ARG:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ALLOCA:%.*]] = alloca [8 x i32], align 8, addrspace(5)
-; CHECK-NEXT:    store <2 x ptr addrspace(3)> [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8
-; CHECK-NEXT:    [[TMP:%.*]] = load <2 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8
-; CHECK-NEXT:    [[TMP_FULL:%.*]] = load <4 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[ARG]] to <2 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i32> [[TMP5]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i32> [[TMP6]] to <2 x ptr addrspace(3)>
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 undef, i64 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 undef, i64 3
+; CHECK-NEXT:    [[TMP12:%.*]] = inttoptr <4 x i32> [[TMP11]] to <4 x ptr addrspace(3)>
 ; CHECK-NEXT:    ret void
 ;
 entry:
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll
@@ -0,0 +1,334 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
+
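+; Simple case: <2 x i64> slices of a [4 x i64] alloca, stored to and reloaded
+; from element offsets 0, 1 and 2.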
+define void @test_trivial_subvector(<2 x i64> %val.0, <2 x i64> %val.1) {
+; CHECK-LABEL: define void @test_trivial_subvector
+; CHECK-SAME: (<2 x i64> [[VAL_0:%.*]], <2 x i64> [[VAL_1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = extractelement <2 x i64> [[VAL_0]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i64> [[VAL_0]], i64 1
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[VAL_1]], i64 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[TMP4]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> poison, i64 [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[TMP4]], i64 1
+; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <2 x i64> [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP4]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 undef, i64 1
+; CHECK-NEXT:    [[DUMMYUSER_1:%.*]] = freeze <2 x i64> [[TMP9]]
+; CHECK-NEXT:    [[DUMMYUSER_2:%.*]] = freeze <2 x i64> undef
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  %stack.1 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 1
+  %stack.2 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 2
+
+  store <2 x i64> %val.0, ptr addrspace(5) %stack
+  store <2 x i64> %val.1, ptr addrspace(5) %stack.1
+  store <2 x i64> %val.1, ptr addrspace(5) %stack.2
+
+  %reload = load <2 x i64>, ptr addrspace(5) %stack
+  %dummyuser = freeze <2 x i64> %reload
+
+  %reload.1 = load <2 x i64>, ptr addrspace(5) %stack.1
+  %dummyuser.1 = freeze <2 x i64> %reload.1
+
+  %reload.2 = load <2 x i64>, ptr addrspace(5) %stack.2
+  %dummyuser.2 = freeze <2 x i64> %reload.2
+  ret void
+}
+
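+; Mixed access types: every subvector store/reload is routed through the
+; promoted <4 x i64> vector, with bitcasts to and from <2 x i64> on each
+; access.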
+define void @test_different_type_subvector(<4 x i32> %val.0, <8 x i16> %val.1, <16 x i8> %val.2, <128 x i1> %val.3) {
+; CHECK-LABEL: define void @test_different_type_subvector
+; CHECK-SAME: (<4 x i32> [[VAL_0:%.*]], <8 x i16> [[VAL_1:%.*]], <16 x i8> [[VAL_2:%.*]], <128 x i1> [[VAL_3:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> undef, i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <2 x i64> [[TMP6]] to <16 x i8>
+; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <16 x i8> [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast <8 x i16> [[VAL_1]] to <2 x i64>
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x i64> [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP9]], i64 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x i64> [[TMP11]], i64 undef, i64 1
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast <2 x i64> [[TMP12]] to <8 x i16>
+; CHECK-NEXT:    [[DUMMYUSE_1:%.*]] = freeze <8 x i16> [[TMP13]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <16 x i8> [[VAL_2]] to <2 x i64>
+; CHECK-NEXT:    [[DUMMYUSE_2:%.*]] = freeze <4 x i32> undef
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <128 x i1> [[VAL_3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <2 x i64> poison, i64 [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <2 x i64> [[TMP16]], i64 undef, i64 1
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast <2 x i64> [[TMP17]] to <128 x i1>
+; CHECK-NEXT:    [[DUMMYUSE_I1:%.*]] = freeze <128 x i1> [[TMP18]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  %stack.1 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 1
+  %stack.2 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 2
+
+  store <4 x i32> %val.0, ptr addrspace(5) %stack
+  %reload = load <16 x i8>, ptr addrspace(5) %stack
+  %dummyuser = freeze <16 x i8> %reload
+
+  store <8 x i16> %val.1, ptr addrspace(5) %stack.1
+  %reload.1 = load <8 x i16>, ptr addrspace(5) %stack.1
+  %dummyuse.1 = freeze <8 x i16> %reload.1
+
+  store <16 x i8> %val.2, ptr addrspace(5) %stack.2
+  %reload.2 = load <4 x i32>, ptr addrspace(5) %stack.2
+  %dummyuse.2 = freeze <4 x i32> %reload.2
+
+  store <128 x i1> %val.3, ptr addrspace(5) %stack.2
+  %reload.i1 = load <128 x i1>, ptr addrspace(5) %stack.1
+  %dummyuse.i1 = freeze <128 x i1> %reload.i1
+
+  ret void
+}
+
+; Not vectorized, >16 elts is not supported.
+define void @test_different_type_subvector_i1alloca(<4 x i32> %val.0, <8 x i16> %val.1, <16 x i8> %val.2, <128 x i1> %val.3) {
+; CHECK-LABEL: define void @test_different_type_subvector_i1alloca
+; CHECK-SAME: (<4 x i32> [[VAL_0:%.*]], <8 x i16> [[VAL_1:%.*]], <16 x i8> [[VAL_2:%.*]], <128 x i1> [[VAL_3:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [256 x i1], align 4, addrspace(5)
+; CHECK-NEXT:    [[STACK_1:%.*]] = getelementptr inbounds [256 x i1], ptr addrspace(5) [[STACK]], i32 0, i32 63
+; CHECK-NEXT:    [[STACK_2:%.*]] = getelementptr inbounds [256 x i1], ptr addrspace(5) [[STACK]], i32 0, i32 127
+; CHECK-NEXT:    store <4 x i32> [[VAL_0]], ptr addrspace(5) [[STACK]], align 16
+; CHECK-NEXT:    [[RELOAD:%.*]] = load <16 x i8>, ptr addrspace(5) [[STACK]], align 16
+; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <16 x i8> [[RELOAD]]
+; CHECK-NEXT:    store <8 x i16> [[VAL_1]], ptr addrspace(5) [[STACK_1]], align 16
+; CHECK-NEXT:    [[RELOAD_1:%.*]] = load <8 x i16>, ptr addrspace(5) [[STACK_1]], align 16
+; CHECK-NEXT:    [[DUMMYUSE_1:%.*]] = freeze <8 x i16> [[RELOAD_1]]
+; CHECK-NEXT:    store <16 x i8> [[VAL_2]], ptr addrspace(5) [[STACK_2]], align 16
+; CHECK-NEXT:    [[RELOAD_2:%.*]] = load <4 x i32>, ptr addrspace(5) [[STACK_2]], align 16
+; CHECK-NEXT:    [[DUMMYUSE_2:%.*]] = freeze <4 x i32> [[RELOAD_2]]
+; CHECK-NEXT:    store <128 x i1> [[VAL_3]], ptr addrspace(5) [[STACK_2]], align 16
+; CHECK-NEXT:    [[RELOAD_I1:%.*]] = load <128 x i1>, ptr addrspace(5) [[STACK_1]], align 16
+; CHECK-NEXT:    [[DUMMYUSE_I1:%.*]] = freeze <128 x i1> [[RELOAD_I1]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [256 x i1], align 4, addrspace(5)
+  %stack.1 = getelementptr inbounds [256 x i1], ptr addrspace(5) %stack, i32 0, i32 63
+  %stack.2 = getelementptr inbounds [256 x i1], ptr addrspace(5) %stack, i32 0, i32 127
+
+  store <4 x i32> %val.0, ptr addrspace(5) %stack
+  %reload = load <16 x i8>, ptr addrspace(5) %stack
+  %dummyuser = freeze <16 x i8> %reload
+
+  store <8 x i16> %val.1, ptr addrspace(5) %stack.1
+  %reload.1 = load <8 x i16>, ptr addrspace(5) %stack.1
+  %dummyuse.1 = freeze <8 x i16> %reload.1
+
+  store <16 x i8> %val.2, ptr addrspace(5) %stack.2
+  %reload.2 = load <4 x i32>, ptr addrspace(5) %stack.2
+  %dummyuse.2 = freeze <4 x i32> %reload.2
+
+  store <128 x i1> %val.3, ptr addrspace(5) %stack.2
+  %reload.i1 = load <128 x i1>, ptr addrspace(5) %stack.1
+  %dummyuse.i1 = freeze <128 x i1> %reload.i1
+
+  ret void
+}
+
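+; Same pattern with floating-point subvectors, routed through a promoted
+; <4 x double> vector.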
+define void @test_different_type_subvector_fp(<2 x double> %val.0, <4 x float> %val.1, <8 x half> %val.2) {
+; CHECK-LABEL: define void @test_different_type_subvector_fp
+; CHECK-SAME: (<2 x double> [[VAL_0:%.*]], <4 x float> [[VAL_1:%.*]], <8 x half> [[VAL_2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VAL_2]] to <2 x double>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x double> undef, double [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x double> [[TMP2]], double [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[TMP3]], i64 1
+; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <2 x double> [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <4 x float> [[VAL_1]] to <2 x double>
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x double> [[TMP7]], i64 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x double> [[TMP4]], double [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x double> [[TMP7]], i64 1
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i64 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <2 x double> poison, double [[TMP8]], i64 0
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <2 x double> [[TMP12]], double [[TMP10]], i64 1
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <2 x double> [[TMP13]] to <4 x float>
+; CHECK-NEXT:    [[DUMMYUSE_1:%.*]] = freeze <4 x float> [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x double> [[VAL_0]], i64 0
+; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <4 x double> [[TMP11]], double [[TMP15]], i64 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <2 x double> [[VAL_0]], i64 1
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x double> [[TMP16]], double [[TMP17]], i64 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x double> poison, double [[TMP15]], i64 0
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x double> [[TMP19]], double [[TMP17]], i64 1
+; CHECK-NEXT:    [[TMP21:%.*]] = bitcast <2 x double> [[TMP20]] to <8 x half>
+; CHECK-NEXT:    [[DUMMYUSE_2:%.*]] = freeze <8 x half> [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x double> [[TMP18]], double 2.075080e-322, i64 0
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <4 x double> [[TMP22]], double 3.162020e-322, i64 1
+; CHECK-NEXT:    [[DUMMYUSE_3:%.*]] = freeze <4 x i32> <i32 42, i32 0, i32 64, i32 0>
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x double], align 4, addrspace(5)
+
+  store <8 x half> %val.2, ptr addrspace(5) %stack
+  %reload = load <2 x double>, ptr addrspace(5) %stack
+  %dummyuser = freeze <2 x double> %reload
+
+  store <4 x float> %val.1, ptr addrspace(5) %stack
+  %reload.1 = load <4 x float>, ptr addrspace(5) %stack
+  %dummyuse.1 = freeze <4 x float> %reload.1
+
+  store <2 x double> %val.0, ptr addrspace(5) %stack
+  %reload.2 = load <8 x half>, ptr addrspace(5) %stack
+  %dummyuse.2 = freeze <8 x half> %reload.2
+
+  store <2 x i64> <i64 42, i64 64>, ptr addrspace(5) %stack
+  %reload.3 = load <4 x i32>, ptr addrspace(5) %stack
+  %dummyuse.3 = freeze <4 x i32> %reload.3
+
+  ret void
+}
+
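+; Subvectors of pointers cannot be bitcast directly, so they take a detour
+; through integer vectors (ptrtoint/inttoptr).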
+define void @test_different_type_subvector_ptrs(<2 x ptr addrspace(1)> %val.0, <4 x ptr addrspace(3)> %val.1) {
+; CHECK-LABEL: define void @test_different_type_subvector_ptrs
+; CHECK-SAME: (<2 x ptr addrspace(1)> [[VAL_0:%.*]], <4 x ptr addrspace(3)> [[VAL_1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(1)> [[VAL_0]] to <2 x i64>
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i64> [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i64> undef, i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i64> [[TMP0]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP3]], i64 1
+; CHECK-NEXT:    [[TMP7:%.*]] = inttoptr <2 x i64> [[TMP6]] to <2 x ptr addrspace(1)>
+; CHECK-NEXT:    [[DUMMYUSER:%.*]] = freeze <2 x ptr addrspace(1)> [[TMP7]]
+; CHECK-NEXT:    [[TMP8:%.*]] = ptrtoint <4 x ptr addrspace(3)> [[VAL_1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <2 x i64>
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i64 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP10]], i64 0
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i64 1
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i64> [[TMP11]], i64 [[TMP12]], i64 1
+; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> [[TMP14]], i64 [[TMP12]], i64 1
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <2 x i64> [[TMP15]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = inttoptr <4 x i32> [[TMP16]] to <4 x ptr addrspace(3)>
+; CHECK-NEXT:    [[DUMMYUSER_1:%.*]] = freeze <4 x ptr addrspace(3)> [[TMP17]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+
+  store <2 x ptr addrspace(1)> %val.0, ptr addrspace(5) %stack
+  %reload = load <2 x ptr addrspace(1)>, ptr addrspace(5) %stack
+  %dummyuser = freeze <2 x ptr addrspace(1)> %reload
+
+  store <4 x ptr addrspace(3)> %val.1, ptr addrspace(5) %stack
+  %reload.1 = load <4 x ptr addrspace(3)>, ptr addrspace(5) %stack
+  %dummyuser.1 = freeze <4 x ptr addrspace(3)> %reload.1
+
+  ret void
+}
+
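+; Here the promoted vector itself has pointer elements
+; (<8 x ptr addrspace(5)>), so integer subvectors are inttoptr'd into it and
+; ptrtoint'd back out.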
+define void @test_different_type_subvector_ptralloca(<2 x i64> %val.0, <8 x i16> %val.1, <2 x ptr addrspace(3)> %val.2) {
+; CHECK-LABEL: define void @test_different_type_subvector_ptralloca
+; CHECK-SAME: (<2 x i64> [[VAL_0:%.*]], <8 x i16> [[VAL_1:%.*]], <2 x ptr addrspace(3)> [[VAL_2:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x i64> [[VAL_0]] to <4 x i32>
+; CHECK-NEXT:    [[TMP1:%.*]] = inttoptr <4 x i32> [[TMP0]] to <4 x ptr addrspace(5)>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP1]], i64 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <8 x ptr addrspace(5)> undef, ptr addrspace(5) [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP1]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP3]], ptr addrspace(5) [[TMP4]], i64 1
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP5]], ptr addrspace(5) [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP1]], i64 3
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP7]], ptr addrspace(5) [[TMP8]], i64 3
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x ptr addrspace(5)> poison, ptr addrspace(5) [[TMP2]], i64 0
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x ptr addrspace(5)> [[TMP10]], ptr addrspace(5) [[TMP4]], i64 1
+; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x ptr addrspace(5)> [[TMP11]], ptr addrspace(5) [[TMP6]], i64 2
+; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x ptr addrspace(5)> [[TMP12]], ptr addrspace(5) [[TMP8]], i64 3
+; CHECK-NEXT:    [[TMP14:%.*]] = ptrtoint <4 x ptr addrspace(5)> [[TMP13]] to <4 x i32>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <4 x i32> [[TMP14]] to <2 x i64>
+; CHECK-NEXT:    [[DUMMYUSER_1:%.*]] = freeze <2 x i64> [[TMP15]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <8 x i16> [[VAL_1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP17:%.*]] = inttoptr <4 x i32> [[TMP16]] to <4 x ptr addrspace(5)>
+; CHECK-NEXT:    [[TMP18:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP17]], i64 0
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP9]], ptr addrspace(5) [[TMP18]], i64 0
+; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP17]], i64 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP19]], ptr addrspace(5) [[TMP20]], i64 1
+; CHECK-NEXT:    [[TMP22:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP17]], i64 2
+; CHECK-NEXT:    [[TMP23:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP21]], ptr addrspace(5) [[TMP22]], i64 2
+; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x ptr addrspace(5)> [[TMP17]], i64 3
+; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP23]], ptr addrspace(5) [[TMP24]], i64 3
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x ptr addrspace(5)> poison, ptr addrspace(5) [[TMP18]], i64 0
+; CHECK-NEXT:    [[TMP27:%.*]] = insertelement <4 x ptr addrspace(5)> [[TMP26]], ptr addrspace(5) [[TMP20]], i64 1
+; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <4 x ptr addrspace(5)> [[TMP27]], ptr addrspace(5) [[TMP22]], i64 2
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x ptr addrspace(5)> [[TMP28]], ptr addrspace(5) [[TMP24]], i64 3
+; CHECK-NEXT:    [[TMP30:%.*]] = ptrtoint <4 x ptr addrspace(5)> [[TMP29]] to <4 x i32>
+; CHECK-NEXT:    [[TMP31:%.*]] = bitcast <4 x i32> [[TMP30]] to <8 x i16>
+; CHECK-NEXT:    [[DUMMYUSER_2:%.*]] = freeze <8 x i16> [[TMP31]]
+; CHECK-NEXT:    [[TMP32:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VAL_2]] to <2 x i32>
+; CHECK-NEXT:    [[TMP33:%.*]] = inttoptr <2 x i32> [[TMP32]] to <2 x ptr addrspace(5)>
+; CHECK-NEXT:    [[TMP34:%.*]] = extractelement <2 x ptr addrspace(5)> [[TMP33]], i64 0
+; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP25]], ptr addrspace(5) [[TMP34]], i64 0
+; CHECK-NEXT:    [[TMP36:%.*]] = extractelement <2 x ptr addrspace(5)> [[TMP33]], i64 1
+; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <8 x ptr addrspace(5)> [[TMP35]], ptr addrspace(5) [[TMP36]], i64 1
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <2 x ptr addrspace(5)> poison, ptr addrspace(5) [[TMP34]], i64 0
+; CHECK-NEXT:    [[TMP39:%.*]] = insertelement <2 x ptr addrspace(5)> [[TMP38]], ptr addrspace(5) [[TMP36]], i64 1
+; CHECK-NEXT:    [[TMP40:%.*]] = ptrtoint <2 x ptr addrspace(5)> [[TMP39]] to <2 x i32>
+; CHECK-NEXT:    [[TMP41:%.*]] = inttoptr <2 x i32> [[TMP40]] to <2 x ptr addrspace(3)>
+; CHECK-NEXT:    [[DUMMYUSER_3:%.*]] = freeze <2 x ptr addrspace(3)> [[TMP41]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [8 x ptr addrspace(5)], align 4, addrspace(5)
+
+  store <2 x i64> %val.0, ptr addrspace(5) %stack
+  %reload = load <2 x i64>, ptr addrspace(5) %stack
+  %dummyuser.1 = freeze <2 x i64> %reload
+
+  store <8 x i16> %val.1, ptr addrspace(5) %stack
+  %reload.1 = load <8 x i16>, ptr addrspace(5) %stack
+  %dummyuser.2 = freeze <8 x i16> %reload.1
+
+  store <2 x ptr addrspace(3)> %val.2, ptr addrspace(5) %stack
+  %reload.2 = load <2 x ptr addrspace(3)>, ptr addrspace(5) %stack
+  %dummyuser.3 = freeze <2 x ptr addrspace(3)> %reload.2
+
+  ret void
+}
+
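+; The <2 x i64> store at element 3 would run past the end of the <4 x i64>
+; vector; nothing survives promotion here.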
+define void @test_out_of_bounds_subvec(<2 x i64> %val) {
+; CHECK-LABEL: define void @test_out_of_bounds_subvec
+; CHECK-SAME: (<2 x i64> [[VAL:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  %stack.3 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 3
+  store <2 x i64> %val, ptr addrspace(5) %stack.3
+  ret void
+}
+
+define void @test_different_type_subvector_not_divisible(<3 x i32> %val) {
+; CHECK-LABEL: define void @test_different_type_subvector_not_divisible
+; CHECK-SAME: (<3 x i32> [[VAL:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT:    store <3 x i32> [[VAL]], ptr addrspace(5) [[STACK]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %stack = alloca [4 x i64], align 4, addrspace(5)
+  store <3 x i32> %val, ptr addrspace(5) %stack
+  ret void
+}