diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -399,21 +399,47 @@ cast(Inst)->getPointerOperand(), GEPVectorIdx); // We're loading the full vector. - if (DL.getTypeStoreSize(Inst->getType()) == VecStoreSize) { + Type *AccessTy = Inst->getType(); + unsigned AccessSize = DL.getTypeStoreSize(AccessTy); + if (AccessSize == VecStoreSize) { assert(cast(Index)->isZeroValue()); - Type *InstTy = Inst->getType(); - if (InstTy->isPtrOrPtrVectorTy()) - CurVal = CreateTempPtrIntCast(CurVal, InstTy); - Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, InstTy); + if (AccessTy->isPtrOrPtrVectorTy()) + CurVal = CreateTempPtrIntCast(CurVal, AccessTy); + Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy); Inst->replaceAllUsesWith(NewVal); return nullptr; } + // Loading a subvector. + if (isa(AccessTy)) { + assert(AccessSize % (DL.getTypeStoreSize(VecEltTy)) == 0); + const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy); + auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts); + + // extract_vector is only legal if (Index % NumElts == 0) + // If we can't use it, use insertelements. + unsigned IndexVal = cast(Index)->getZExtValue(); + Value *SubVec = nullptr; + if (IndexVal % NumElts == 0) { + Value *I64Index = Builder.CreateZExt(Index, Builder.getInt64Ty()); + SubVec = Builder.CreateExtractVector(SubVecTy, CurVal, I64Index); + } else { + SubVec = UndefValue::get(SubVecTy); + for (unsigned K = 0; K < NumElts; ++K) { + SubVec = Builder.CreateInsertElement( + SubVec, Builder.CreateExtractElement(CurVal, K + IndexVal), K); + } + } + + SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy); + Inst->replaceAllUsesWith(SubVec); + return nullptr; + } + // We're loading one element. Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index); - if (Inst->getType() != VecEltTy) - ExtractElement = - Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType()); + if (AccessTy != VecEltTy) + ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy); Inst->replaceAllUsesWith(ExtractElement); return nullptr; @@ -428,14 +454,40 @@ Value *Val = SI->getValueOperand(); // We're storing the full vector, we can handle this without knowing CurVal. - if (DL.getTypeStoreSize(Val->getType()) == VecStoreSize) { + Type *AccessTy = Val->getType(); + unsigned AccessSize = DL.getTypeStoreSize(AccessTy); + if (AccessSize == VecStoreSize) { assert(cast(Index)->isZeroValue()); - Type *SrcTy = Val->getType(); - if (SrcTy->isPtrOrPtrVectorTy()) - Val = CreateTempPtrIntCast(Val, SrcTy); + if (AccessTy->isPtrOrPtrVectorTy()) + Val = CreateTempPtrIntCast(Val, AccessTy); return Builder.CreateBitOrPointerCast(Val, VectorTy); } + // Storing a subvector. + if (isa(AccessTy)) { + assert(AccessSize % (DL.getTypeStoreSize(VecEltTy)) == 0); + const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy); + auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts); + + Val = Builder.CreateBitOrPointerCast(Val, SubVecTy); + + // insert_vector is only legal if (Index % NumElts == 0) + // If we can't use it, use insertelements. + unsigned IndexVal = cast(Index)->getZExtValue(); + Value *CurVec = GetOrLoadCurrentVectorValue(); + if (IndexVal % NumElts == 0) { + CurVec = Builder.CreateInsertVector( + VectorTy, CurVec, Val, + Builder.CreateZExt(Index, Builder.getInt64Ty())); + } else { + for (unsigned K = 0; K < NumElts; ++K) { + CurVec = Builder.CreateInsertElement( + CurVec, Builder.CreateExtractElement(Val, K), IndexVal + K); + } + } + return CurVec; + } + if (Val->getType() != VecEltTy) Val = Builder.CreateBitOrPointerCast(Val, VecEltTy); return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val, @@ -486,6 +538,32 @@ llvm_unreachable("Did not return after promoting instruction!"); } +static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy, + const DataLayout &DL) { + // Access as a vector type can work if the size of the access vector is a + // multiple of the size of the alloca's vector element type. + // + // Examples: + // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK + // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK + // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK + // - 3*32 is not a multiple of 64 + // + // We could handle more complicated cases, but it'd make things a lot more + // complicated. + if (isa(AccessTy)) { + const unsigned AccessVecSize = DL.getTypeStoreSize(AccessTy); + const unsigned VecEltSize = DL.getTypeStoreSize(VecTy->getElementType()); + return (AccessVecSize % VecEltSize) == 0; + } + + if (CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy, + DL)) + return true; + + return false; +} + /// Iterates over an instruction worklist that may contain multiple instructions /// from the same basic block, but in a different order. template @@ -616,10 +694,10 @@ // Check that this is a simple access of a vector element. bool IsSimple = isa(Inst) ? cast(Inst)->isSimple() : cast(Inst)->isSimple(); - if (!IsSimple || - !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, *DL)) - return RejectUser(Inst, "not simple and/or vector element type not " - "castable to access type"); + if (!IsSimple) + return RejectUser(Inst, "not a simple load or store"); + if (!isSupportedAccessType(VectorTy, AccessTy, *DL)) + return RejectUser(Inst, "not a supported access type"); WorkList.push_back(Inst); continue; diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll --- a/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-loadstores.ll @@ -123,16 +123,16 @@ ret <4 x ptr addrspace(3)> %tmp } -; Currently rejected due to the store not being cast-able. -; TODO: We should probably be able to vectorize this define void @alloca_load_store_ptr_mixed_ptrvec(<2 x ptr addrspace(3)> %arg) { ; CHECK-LABEL: define void @alloca_load_store_ptr_mixed_ptrvec ; CHECK-SAME: (<2 x ptr addrspace(3)> [[ARG:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i32], align 8, addrspace(5) -; CHECK-NEXT: store <2 x ptr addrspace(3)> [[ARG]], ptr addrspace(5) [[ALLOCA]], align 8 -; CHECK-NEXT: [[TMP:%.*]] = load <2 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8 -; CHECK-NEXT: [[TMP_FULL:%.*]] = load <4 x ptr addrspace(3)>, ptr addrspace(5) [[ALLOCA]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[ARG]] to <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> undef, <2 x i32> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i32> @llvm.vector.extract.v2i32.v8i32(<8 x i32> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr <2 x i32> [[TMP2]] to <2 x ptr addrspace(3)> +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = inttoptr <4 x i32> [[TMP4]] to <4 x ptr addrspace(3)> ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-subvecs.ll @@ -0,0 +1,125 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s + +target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" + +define void @test_trivial_subvector(<2 x i64> %val.0, <2 x i64> %val.1) { +; CHECK-LABEL: define void @test_trivial_subvector +; CHECK-SAME: (<2 x i64> [[VAL_0:%.*]], <2 x i64> [[VAL_1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> undef, <2 x i64> [[VAL_0]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i64> [[VAL_1]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[VAL_1]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP4]], <2 x i64> [[VAL_1]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> [[TMP5]], i64 0) +; CHECK-NEXT: [[DUMMYUSER:%.*]] = freeze <2 x i64> [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i64> [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP5]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP9]], i64 1 +; CHECK-NEXT: [[DUMMYUSER_1:%.*]] = freeze <2 x i64> [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> [[TMP5]], i64 2) +; CHECK-NEXT: [[DUMMYUSER_2:%.*]] = freeze <2 x i64> [[TMP11]] +; CHECK-NEXT: ret void +; +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + %stack.1 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 1 + %stack.2 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 2 + + store <2 x i64> %val.0, ptr addrspace(5) %stack + store <2 x i64> %val.1, ptr addrspace(5) %stack.1 + store <2 x i64> %val.1, ptr addrspace(5) %stack.2 + + %reload = load <2 x i64>, ptr addrspace(5) %stack + %dummyuser = freeze <2 x i64> %reload + + %reload.1 = load <2 x i64>, ptr addrspace(5) %stack.1 + %dummyuser.1 = freeze <2 x i64> %reload.1 + + %reload.2 = load <2 x i64>, ptr addrspace(5) %stack.2 + %dummyuser.2 = freeze <2 x i64> %reload.2 + ret void +} + +define void @test_different_type_subvector(<4 x i32> %val.0, <8 x i16> %val.1, <16 x i8> %val.2) { +; CHECK-LABEL: define void @test_different_type_subvector +; CHECK-SAME: (<4 x i32> [[VAL_0:%.*]], <8 x i16> [[VAL_1:%.*]], <16 x i8> [[VAL_2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x i32> [[VAL_0]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> undef, <2 x i64> [[TMP0]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i16> [[VAL_1]] to <2 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP1]], i64 [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <2 x i64> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[TMP5]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast <16 x i8> [[VAL_2]] to <2 x i64> +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i64> @llvm.vector.insert.v4i64.v2i64(<4 x i64> [[TMP6]], <2 x i64> [[TMP7]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> [[TMP8]], i64 0) +; CHECK-NEXT: [[TMP10:%.*]] = bitcast <2 x i64> [[TMP9]] to <16 x i8> +; CHECK-NEXT: [[DUMMYUSER:%.*]] = freeze <16 x i8> [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <4 x i64> [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <2 x i64> undef, i64 [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <4 x i64> [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP13]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = bitcast <2 x i64> [[TMP14]] to <8 x i16> +; CHECK-NEXT: [[DUMMYUSE_1:%.*]] = freeze <8 x i16> [[TMP15]] +; CHECK-NEXT: [[TMP16:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP17:%.*]] = bitcast <2 x i64> [[TMP16]] to <4 x i32> +; CHECK-NEXT: [[DUMMYUSE_2:%.*]] = freeze <4 x i32> [[TMP17]] +; CHECK-NEXT: ret void +; +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + %stack.1 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 1 + %stack.2 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 2 + + store <4 x i32> %val.0, ptr addrspace(5) %stack + store <8 x i16> %val.1, ptr addrspace(5) %stack.1 + store <16 x i8> %val.2, ptr addrspace(5) %stack.2 + + %reload = load <16 x i8>, ptr addrspace(5) %stack + %dummyuser = freeze <16 x i8> %reload + + %reload.1 = load <8 x i16>, ptr addrspace(5) %stack.1 + %dummyuse.1 = freeze <8 x i16> %reload.1 + + %reload.2 = load <4 x i32>, ptr addrspace(5) %stack.2 + %dummyuse.2 = freeze <4 x i32> %reload.2 + + ret void +} + +define void @test_out_of_bounds_subvec(<2 x i64> %val) { +; CHECK-LABEL: define void @test_out_of_bounds_subvec +; CHECK-SAME: (<2 x i64> [[VAL:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[VAL]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> undef, i64 [[TMP0]], i64 3 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[VAL]], i64 1 +; CHECK-NEXT: ret void +; +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + %stack.3 = getelementptr inbounds [4 x i64], ptr addrspace(5) %stack, i32 0, i32 3 + + store <2 x i64> %val, ptr addrspace(5) %stack.3 + ret void +} + + +define void @test_different_type_subvector_not_divisible(<3 x i32> %val) { +; CHECK-LABEL: define void @test_different_type_subvector_not_divisible +; CHECK-SAME: (<3 x i32> [[VAL:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[STACK:%.*]] = alloca [4 x i64], align 4, addrspace(5) +; CHECK-NEXT: store <3 x i32> [[VAL]], ptr addrspace(5) [[STACK]], align 16 +; CHECK-NEXT: ret void +; +entry: + %stack = alloca [4 x i64], align 4, addrspace(5) + store <3 x i32> %val, ptr addrspace(5) %stack + ret void +}