diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp --- a/clang/lib/CodeGen/CGAtomic.cpp +++ b/clang/lib/CodeGen/CGAtomic.cpp @@ -119,7 +119,7 @@ ValueTy = lvalue.getType(); ValueSizeInBits = C.getTypeSize(ValueTy); AtomicTy = ValueTy = CGF.getContext().getExtVectorType( - lvalue.getType(), cast( + lvalue.getType(), cast( lvalue.getExtVectorAddress().getElementType()) ->getNumElements()); AtomicSizeInBits = C.getTypeSize(AtomicTy); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4561,11 +4561,11 @@ getTarget().getTriple().getArch()); } -static llvm::VectorType *GetNeonType(CodeGenFunction *CGF, - NeonTypeFlags TypeFlags, - bool HasLegalHalfType = true, - bool V1Ty = false, - bool AllowBFloatArgsAndRet = true) { +static llvm::FixedVectorType *GetNeonType(CodeGenFunction *CGF, + NeonTypeFlags TypeFlags, + bool HasLegalHalfType = true, + bool V1Ty = false, + bool AllowBFloatArgsAndRet = true) { int IsQuad = TypeFlags.isQuad(); switch (TypeFlags.getEltType()) { case NeonTypeFlags::Int8: @@ -5621,8 +5621,8 @@ const bool AllowBFloatArgsAndRet = getTargetHooks().getABIInfo().allowBFloatArgsAndRet(); - llvm::VectorType *VTy = GetNeonType(this, Type, HasLegalHalfType, false, - AllowBFloatArgsAndRet); + llvm::FixedVectorType *VTy = + GetNeonType(this, Type, HasLegalHalfType, false, AllowBFloatArgsAndRet); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -5663,8 +5663,8 @@ return EmitNeonCall(CGM.getIntrinsic(Intrinsic::fabs, Ty), Ops, "vabs"); return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Ty), Ops, "vabs"); case NEON::BI__builtin_neon_vaddhn_v: { - llvm::VectorType *SrcTy = - llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); // %sum = add <4 x i32> %lhs, %rhs Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); @@ -5936,14 +5936,16 @@ return Builder.CreateDefaultAlignedStore(Ops[1], Ops[0]); } case NEON::BI__builtin_neon_vmovl_v: { - llvm::Type *DTy =llvm::VectorType::getTruncatedElementVectorType(VTy); + llvm::FixedVectorType *DTy = + llvm::FixedVectorType::getTruncatedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], DTy); if (Usgn) return Builder.CreateZExt(Ops[0], Ty, "vmovl"); return Builder.CreateSExt(Ops[0], Ty, "vmovl"); } case NEON::BI__builtin_neon_vmovn_v: { - llvm::Type *QTy = llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *QTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], QTy); return Builder.CreateTrunc(Ops[0], Ty, "vmovn"); } @@ -5989,7 +5991,7 @@ case NEON::BI__builtin_neon_vqdmulh_lane_v: case NEON::BI__builtin_neon_vqrdmulhq_lane_v: case NEON::BI__builtin_neon_vqrdmulh_lane_v: { - auto *RTy = cast(Ty); + auto *RTy = cast(Ty); if (BuiltinID == NEON::BI__builtin_neon_vqdmulhq_lane_v || BuiltinID == NEON::BI__builtin_neon_vqrdmulhq_lane_v) RTy = llvm::FixedVectorType::get(RTy->getElementType(), @@ -6038,7 +6040,8 @@ return Builder.CreateShl(Builder.CreateBitCast(Ops[0],Ty), Ops[1], "vshl_n"); case NEON::BI__builtin_neon_vshll_n_v: { - llvm::Type *SrcTy = llvm::VectorType::getTruncatedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getTruncatedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); if (Usgn) Ops[0] = Builder.CreateZExt(Ops[0], VTy); @@ -6048,7 +6051,8 @@ return Builder.CreateShl(Ops[0], Ops[1], 
"vshll_n"); } case NEON::BI__builtin_neon_vshrn_n_v: { - llvm::Type *SrcTy = llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); Ops[1] = EmitNeonShiftVector(Ops[1], SrcTy, false); if (Usgn) @@ -6097,8 +6101,8 @@ return EmitNeonCall(CGM.getIntrinsic(LLVMIntrinsic, Tys), Ops, ""); } case NEON::BI__builtin_neon_vsubhn_v: { - llvm::VectorType *SrcTy = - llvm::VectorType::getExtendedElementVectorType(VTy); + llvm::FixedVectorType *SrcTy = + llvm::FixedVectorType::getExtendedElementVectorType(VTy); // %sum = add <4 x i32> %lhs, %rhs Ops[0] = Builder.CreateBitCast(Ops[0], SrcTy); @@ -6310,7 +6314,7 @@ // Build a vector containing sequential number like (0, 1, 2, ..., 15) SmallVector Indices; - llvm::VectorType *TblTy = cast(Ops[0]->getType()); + auto *TblTy = cast(Ops[0]->getType()); for (unsigned i = 0, e = TblTy->getNumElements(); i != e; ++i) { Indices.push_back(2*i); Indices.push_back(2*i+1); @@ -7151,10 +7155,9 @@ bool usgn = Type.isUnsigned(); bool rightShift = false; - llvm::VectorType *VTy = GetNeonType(this, Type, - getTarget().hasLegalHalfType(), - false, - getTarget().hasBFloat16Type()); + llvm::FixedVectorType *VTy = + GetNeonType(this, Type, getTarget().hasLegalHalfType(), false, + getTarget().hasBFloat16Type()); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -7362,7 +7365,7 @@ // or odds, as desired). SmallVector Indices; unsigned InputElements = - cast(V->getType())->getNumElements(); + cast(V->getType())->getNumElements(); for (unsigned i = 0; i < InputElements; i += 2) Indices.push_back(i + Odd); return Builder.CreateShuffleVector(V, llvm::UndefValue::get(V->getType()), @@ -7375,7 +7378,7 @@ assert(V0->getType() == V1->getType() && "Can't zip different vector types"); SmallVector Indices; unsigned InputElements = - cast(V0->getType())->getNumElements(); + cast(V0->getType())->getNumElements(); for (unsigned i = 0; i < InputElements; i++) { Indices.push_back(i); Indices.push_back(i + InputElements); @@ -7571,7 +7574,7 @@ // Determine the type of this overloaded NEON intrinsic. 
NeonTypeFlags Type = Result->getZExtValue(); - llvm::VectorType *Ty = GetNeonType(&CGF, Type); + llvm::FixedVectorType *Ty = GetNeonType(&CGF, Type); if (!Ty) return nullptr; @@ -9773,7 +9776,7 @@ } } - llvm::VectorType *VTy = GetNeonType(this, Type); + llvm::FixedVectorType *VTy = GetNeonType(this, Type); llvm::Type *Ty = VTy; if (!Ty) return nullptr; @@ -9834,13 +9837,13 @@ return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "fmla"); } case NEON::BI__builtin_neon_vfma_laneq_v: { - llvm::VectorType *VTy = cast(Ty); + auto *VTy = cast(Ty); // v1f64 fma should be mapped to Neon scalar f64 fma if (VTy && VTy->getElementType() == DoubleTy) { Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); Ops[1] = Builder.CreateBitCast(Ops[1], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, true)); + llvm::FixedVectorType *VTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, true)); Ops[2] = Builder.CreateBitCast(Ops[2], VTy); Ops[2] = Builder.CreateExtractElement(Ops[2], Ops[3], "extract"); Value *Result; @@ -10208,8 +10211,8 @@ if (BuiltinID == NEON::BI__builtin_neon_vmul_laneq_v) Quad = true; Ops[0] = Builder.CreateBitCast(Ops[0], DoubleTy); - llvm::Type *VTy = GetNeonType(this, - NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); + llvm::FixedVectorType *VTy = + GetNeonType(this, NeonTypeFlags(NeonTypeFlags::Float64, false, Quad)); Ops[1] = Builder.CreateBitCast(Ops[1], VTy); Ops[1] = Builder.CreateExtractElement(Ops[1], Ops[2], "extract"); Value *Result = Builder.CreateFMul(Ops[0], Ops[1]); @@ -11081,7 +11084,8 @@ llvm::PointerType::getUnqual(Ops[1]->getType())); Value *MaskVec = getMaskVecValue( - CGF, Ops[2], cast(Ops[1]->getType())->getNumElements()); + CGF, Ops[2], + cast(Ops[1]->getType())->getNumElements()); return CGF.Builder.CreateMaskedStore(Ops[1], Ptr, Alignment, MaskVec); } @@ -11093,7 +11097,8 @@ llvm::PointerType::getUnqual(Ops[1]->getType())); Value *MaskVec = getMaskVecValue( - CGF, Ops[2], cast(Ops[1]->getType())->getNumElements()); + CGF, Ops[2], + cast(Ops[1]->getType())->getNumElements()); return CGF.Builder.CreateMaskedLoad(Ptr, Alignment, MaskVec, Ops[1]); } @@ -11107,7 +11112,8 @@ Value *Ptr = CGF.Builder.CreateBitCast(Ops[0], llvm::PointerType::getUnqual(PtrTy)); - Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); + Value *MaskVec = getMaskVecValue( + CGF, Ops[2], cast(ResultTy)->getNumElements()); llvm::Function *F = CGF.CGM.getIntrinsic(Intrinsic::masked_expandload, ResultTy); @@ -11117,7 +11123,7 @@ static Value *EmitX86CompressExpand(CodeGenFunction &CGF, ArrayRef Ops, bool IsCompress) { - auto *ResultTy = cast(Ops[1]->getType()); + auto *ResultTy = cast(Ops[1]->getType()); Value *MaskVec = getMaskVecValue(CGF, Ops[2], ResultTy->getNumElements()); @@ -11129,7 +11135,7 @@ static Value *EmitX86CompressStore(CodeGenFunction &CGF, ArrayRef Ops) { - auto *ResultTy = cast(Ops[1]->getType()); + auto *ResultTy = cast(Ops[1]->getType()); llvm::Type *PtrTy = ResultTy->getElementType(); // Cast the pointer to element type. @@ -11165,7 +11171,7 @@ // Funnel shifts amounts are treated as modulo and types are all power-of-2 so // we only care about the lowest log2 bits anyway. 
if (Amt->getType() != Ty) { - unsigned NumElts = cast(Ty)->getNumElements(); + unsigned NumElts = cast(Ty)->getNumElements(); Amt = CGF.Builder.CreateIntCast(Amt, Ty->getScalarType(), false); Amt = CGF.Builder.CreateVectorSplat(NumElts, Amt); } @@ -11224,7 +11230,7 @@ return Op0; Mask = getMaskVecValue( - CGF, Mask, cast(Op0->getType())->getNumElements()); + CGF, Mask, cast(Op0->getType())->getNumElements()); return CGF.Builder.CreateSelect(Mask, Op0, Op1); } @@ -11271,7 +11277,7 @@ assert((Ops.size() == 2 || Ops.size() == 4) && "Unexpected number of arguments"); unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *Cmp; if (CC == 3) { @@ -11548,7 +11554,8 @@ static Value *EmitX86SExtMask(CodeGenFunction &CGF, Value *Op, llvm::Type *DstTy) { - unsigned NumberOfElements = cast(DstTy)->getNumElements(); + unsigned NumberOfElements = + cast(DstTy)->getNumElements(); Value *Mask = getMaskVecValue(CGF, Op, NumberOfElements); return CGF.Builder.CreateSExt(Mask, DstTy, "vpmovm2"); } @@ -11584,11 +11591,12 @@ return CGF.Builder.CreateCall(F, {Ops[0], Ops[1], Ops[2], Ops[3]}); } - unsigned NumDstElts = cast(DstTy)->getNumElements(); + unsigned NumDstElts = cast(DstTy)->getNumElements(); Value *Src = Ops[0]; // Extract the subvector. - if (NumDstElts != cast(Src->getType())->getNumElements()) { + if (NumDstElts != + cast(Src->getType())->getNumElements()) { assert(NumDstElts == 4 && "Unexpected vector size"); Src = CGF.Builder.CreateShuffleVector(Src, UndefValue::get(Src->getType()), ArrayRef{0, 1, 2, 3}); @@ -11887,7 +11895,7 @@ case X86::BI__builtin_ia32_vec_ext_v8si: case X86::BI__builtin_ia32_vec_ext_v4di: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); uint64_t Index = cast(Ops[1])->getZExtValue(); Index &= NumElts - 1; // These builtins exist so we can ensure the index is an ICE and in range. @@ -11903,7 +11911,7 @@ case X86::BI__builtin_ia32_vec_set_v8si: case X86::BI__builtin_ia32_vec_set_v4di: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned Index = cast(Ops[2])->getZExtValue(); Index &= NumElts - 1; // These builtins exist so we can ensure the index is an ICE and in range. 
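Every X86 hunk above makes the same substitution: element counts are now read off llvm::FixedVectorType rather than the base vector type, which may be scalable. A minimal sketch of that pattern, assuming only the usual LLVM headers — this helper is illustrative and is not something the patch adds:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/Casting.h"

// Only fixed-width vectors carry a compile-time element count, so cast to
// FixedVectorType explicitly; the cast asserts (in debug builds) if the type
// is scalable or not a vector at all.
static unsigned fixedElementCount(llvm::Type *Ty) {
  return llvm::cast<llvm::FixedVectorType>(Ty)->getNumElements();
}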
@@ -12329,9 +12337,9 @@ break; } - unsigned MinElts = - std::min(cast(Ops[0]->getType())->getNumElements(), - cast(Ops[2]->getType())->getNumElements()); + unsigned MinElts = std::min( + cast(Ops[0]->getType())->getNumElements(), + cast(Ops[2]->getType())->getNumElements()); Ops[3] = getMaskVecValue(*this, Ops[3], MinElts); Function *Intr = CGM.getIntrinsic(IID); return Builder.CreateCall(Intr, Ops); @@ -12438,9 +12446,9 @@ break; } - unsigned MinElts = - std::min(cast(Ops[2]->getType())->getNumElements(), - cast(Ops[3]->getType())->getNumElements()); + unsigned MinElts = std::min( + cast(Ops[2]->getType())->getNumElements(), + cast(Ops[3]->getType())->getNumElements()); Ops[1] = getMaskVecValue(*this, Ops[1], MinElts); Function *Intr = CGM.getIntrinsic(IID); return Builder.CreateCall(Intr, Ops); @@ -12462,10 +12470,10 @@ case X86::BI__builtin_ia32_extracti64x2_256_mask: case X86::BI__builtin_ia32_extractf64x2_512_mask: case X86::BI__builtin_ia32_extracti64x2_512_mask: { - auto *DstTy = cast(ConvertType(E->getType())); + auto *DstTy = cast(ConvertType(E->getType())); unsigned NumElts = DstTy->getNumElements(); unsigned SrcNumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned SubVectors = SrcNumElts / NumElts; unsigned Index = cast(Ops[1])->getZExtValue(); assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); @@ -12503,9 +12511,9 @@ case X86::BI__builtin_ia32_insertf64x2_512: case X86::BI__builtin_ia32_inserti64x2_512: { unsigned DstNumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned SrcNumElts = - cast(Ops[1]->getType())->getNumElements(); + cast(Ops[1]->getType())->getNumElements(); unsigned SubVectors = DstNumElts / SrcNumElts; unsigned Index = cast(Ops[2])->getZExtValue(); assert(llvm::isPowerOf2_32(SubVectors) && "Expected power of 2 subvectors"); @@ -12570,7 +12578,7 @@ case X86::BI__builtin_ia32_pblendd128: case X86::BI__builtin_ia32_pblendd256: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned Imm = cast(Ops[2])->getZExtValue(); int Indices[16]; @@ -12587,7 +12595,7 @@ case X86::BI__builtin_ia32_pshuflw256: case X86::BI__builtin_ia32_pshuflw512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // Splat the 8-bits of immediate 4 times to help the loop wrap around. @@ -12611,7 +12619,7 @@ case X86::BI__builtin_ia32_pshufhw256: case X86::BI__builtin_ia32_pshufhw512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // Splat the 8-bits of immediate 4 times to help the loop wrap around. 
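The pshuflw/pshufhw lowerings above decode an 8-bit immediate into per-lane shuffle indices. A worked sketch of that decoding for a single 128-bit lane of i16 — illustrative only, assuming the standard pshuflw semantics where the low four half-words are permuted and the high four pass through:

#include <array>
#include <cstdint>

static std::array<int, 8> pshuflwIndices(uint8_t Imm) {
  std::array<int, 8> Indices;
  for (unsigned i = 0; i != 4; ++i)
    Indices[i] = (Imm >> (2 * i)) & 3; // two immediate bits select each low half-word
  for (unsigned i = 4; i != 8; ++i)
    Indices[i] = i;                    // high half-words stay in place
  return Indices;
}
// Example: Imm = 0x1B yields {3, 2, 1, 0, 4, 5, 6, 7}.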
@@ -12641,7 +12649,7 @@ case X86::BI__builtin_ia32_vpermilpd512: case X86::BI__builtin_ia32_vpermilps512: { uint32_t Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; @@ -12668,7 +12676,7 @@ case X86::BI__builtin_ia32_shufps256: case X86::BI__builtin_ia32_shufps512: { uint32_t Imm = cast(Ops[2])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() / 128; unsigned NumLaneElts = NumElts / NumLanes; @@ -12696,7 +12704,7 @@ case X86::BI__builtin_ia32_permdi512: case X86::BI__builtin_ia32_permdf512: { unsigned Imm = cast(Ops[1])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); // These intrinsics operate on 256-bit lanes of four 64-bit elements. @@ -12715,7 +12723,7 @@ unsigned ShiftVal = cast(Ops[2])->getZExtValue() & 0xff; unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); assert(NumElts % 16 == 0); // If palignr is shifting the pair of vectors more than the size of two @@ -12753,7 +12761,7 @@ case X86::BI__builtin_ia32_alignq256: case X86::BI__builtin_ia32_alignq512: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); unsigned ShiftVal = cast(Ops[2])->getZExtValue() & 0xff; // Mask the shift amount to width of two vectors. @@ -12776,7 +12784,7 @@ case X86::BI__builtin_ia32_shuf_i32x4: case X86::BI__builtin_ia32_shuf_i64x2: { unsigned Imm = cast(Ops[2])->getZExtValue(); - auto *Ty = cast(Ops[0]->getType()); + auto *Ty = cast(Ops[0]->getType()); unsigned NumElts = Ty->getNumElements(); unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2; unsigned NumLaneElts = NumElts / NumLanes; @@ -12803,7 +12811,7 @@ case X86::BI__builtin_ia32_permti256: { unsigned Imm = cast(Ops[2])->getZExtValue(); unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); // This takes a very simple approach since there are two lanes and a // shuffle can have 2 inputs. So we reserve the first input for the first @@ -12841,7 +12849,7 @@ case X86::BI__builtin_ia32_pslldqi256_byteshift: case X86::BI__builtin_ia32_pslldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); + auto *ResultType = cast(Ops[0]->getType()); // Builtin type is vXi64 so multiply by 8 to get bytes. unsigned NumElts = ResultType->getNumElements() * 8; @@ -12871,7 +12879,7 @@ case X86::BI__builtin_ia32_psrldqi256_byteshift: case X86::BI__builtin_ia32_psrldqi512_byteshift: { unsigned ShiftVal = cast(Ops[1])->getZExtValue() & 0xff; - auto *ResultType = cast(Ops[0]->getType()); + auto *ResultType = cast(Ops[0]->getType()); // Builtin type is vXi64 so multiply by 8 to get bytes. 
unsigned NumElts = ResultType->getNumElements() * 8; @@ -13518,7 +13526,7 @@ case X86::BI__builtin_ia32_fpclasspd256_mask: case X86::BI__builtin_ia32_fpclasspd512_mask: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *MaskIn = Ops[2]; Ops.erase(&Ops[2]); @@ -13556,7 +13564,7 @@ case X86::BI__builtin_ia32_vp2intersect_d_256: case X86::BI__builtin_ia32_vp2intersect_d_128: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Intrinsic::ID ID; switch (BuiltinID) { @@ -13615,7 +13623,7 @@ case X86::BI__builtin_ia32_vpshufbitqmb256_mask: case X86::BI__builtin_ia32_vpshufbitqmb512_mask: { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *MaskIn = Ops[2]; Ops.erase(&Ops[2]); @@ -13762,7 +13770,7 @@ Function *Intr = CGM.getIntrinsic(IID); if (IsMaskFCmp) { unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Ops[3] = getMaskVecValue(*this, Ops[3], NumElts); Value *Cmp = Builder.CreateCall(Intr, Ops); return EmitX86MaskedCompareResult(*this, Cmp, NumElts, nullptr); @@ -13777,7 +13785,7 @@ // We ignore SAE if strict FP is disabled. We only keep precise // exception behavior under strict FP. unsigned NumElts = - cast(Ops[0]->getType())->getNumElements(); + cast(Ops[0]->getType())->getNumElements(); Value *Cmp; if (IsSignaling) Cmp = Builder.CreateFCmpS(Pred, Ops[0], Ops[1]); @@ -13835,7 +13843,7 @@ case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: { Ops[2] = getMaskVecValue( *this, Ops[2], - cast(Ops[0]->getType())->getNumElements()); + cast(Ops[0]->getType())->getNumElements()); Intrinsic::ID IID = Intrinsic::x86_avx512bf16_mask_cvtneps2bf16_128; return Builder.CreateCall(CGM.getIntrinsic(IID), Ops); } diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -1685,7 +1685,7 @@ if (Ty->isVectorType()) { const llvm::Type *EltTy = Addr.getElementType(); - const auto *VTy = cast(EltTy); + const auto *VTy = cast(EltTy); // Handle vectors of size 3 like size 4 for better performance. if (VTy->getNumElements() == 3) { @@ -1770,8 +1770,9 @@ auto *VectorTy = dyn_cast( cast(Addr.getPointer()->getType())->getElementType()); if (VectorTy && !IsVector) { - auto *ArrayTy = llvm::ArrayType::get(VectorTy->getElementType(), - VectorTy->getNumElements()); + auto *ArrayTy = llvm::ArrayType::get( + VectorTy->getElementType(), + cast(VectorTy)->getNumElements()); return Address(CGF.Builder.CreateElementBitCast(Addr, ArrayTy)); } @@ -1802,7 +1803,7 @@ llvm::Type *SrcTy = Value->getType(); auto *VecTy = dyn_cast(SrcTy); // Handle vec3 special. - if (VecTy && VecTy->getNumElements() == 3) { + if (VecTy && cast(VecTy)->getNumElements() == 3) { // Our source is a vec3, do a shuffle vector to make it a vec4. 
Value = Builder.CreateShuffleVector(Value, llvm::UndefValue::get(VecTy), ArrayRef{0, 1, 2, -1}, @@ -2217,7 +2218,7 @@ if (const VectorType *VTy = Dst.getType()->getAs()) { unsigned NumSrcElts = VTy->getNumElements(); unsigned NumDstElts = - cast(Vec->getType())->getNumElements(); + cast(Vec->getType())->getNumElements(); if (NumDstElts == NumSrcElts) { // Use shuffle vector is the src and destination are the same number of // elements and restore the vector mask since it is on the side it will be diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1320,7 +1320,7 @@ "Splatted expr doesn't match with vector element type?"); // Splat the element across to all elements - unsigned NumElements = cast(DstTy)->getNumElements(); + unsigned NumElements = cast(DstTy)->getNumElements(); return Builder.CreateVectorSplat(NumElements, Src, "splat"); } @@ -1553,12 +1553,12 @@ Value *RHS = CGF.EmitScalarExpr(E->getExpr(1)); Value *Mask; - llvm::VectorType *LTy = cast(LHS->getType()); + auto *LTy = cast(LHS->getType()); unsigned LHSElts = LTy->getNumElements(); Mask = RHS; - llvm::VectorType *MTy = cast(Mask->getType()); + auto *MTy = cast(Mask->getType()); // Mask off the high bits of each shuffle index. Value *MaskBits = @@ -1763,7 +1763,7 @@ return Visit(E->getInit(0)); } - unsigned ResElts = VType->getNumElements(); + unsigned ResElts = cast(VType)->getNumElements(); // Loop over initializers collecting the Value for each, and remembering // whether the source was swizzle (ExtVectorElementExpr). This will allow @@ -1787,7 +1787,8 @@ if (isa(IE)) { llvm::ExtractElementInst *EI = cast(Init); - if (EI->getVectorOperandType()->getNumElements() == ResElts) { + if (cast(EI->getVectorOperandType()) + ->getNumElements() == ResElts) { llvm::ConstantInt *C = cast(EI->getIndexOperand()); Value *LHS = nullptr, *RHS = nullptr; if (CurIdx == 0) { @@ -1825,7 +1826,7 @@ continue; } - unsigned InitElts = VVT->getNumElements(); + unsigned InitElts = cast(VVT)->getNumElements(); // If the initializer is an ExtVecEltExpr (a swizzle), and the swizzle's // input is the same width as the vector being constructed, generate an @@ -1834,7 +1835,7 @@ if (isa(IE)) { llvm::ShuffleVectorInst *SVI = cast(Init); Value *SVOp = SVI->getOperand(0); - llvm::VectorType *OpTy = cast(SVOp->getType()); + auto *OpTy = cast(SVOp->getType()); if (OpTy->getNumElements() == ResElts) { for (unsigned j = 0; j != CurIdx; ++j) { @@ -2170,7 +2171,7 @@ llvm::Type *DstTy = ConvertType(DestTy); Value *Elt = Visit(const_cast(E)); // Splat the element across to all elements - unsigned NumElements = cast(DstTy)->getNumElements(); + unsigned NumElements = cast(DstTy)->getNumElements(); return Builder.CreateVectorSplat(NumElements, Elt, "splat"); } @@ -4331,7 +4332,7 @@ llvm::Value *RHS = Visit(rhsExpr); llvm::Type *condType = ConvertType(condExpr->getType()); - llvm::VectorType *vecTy = cast(condType); + auto *vecTy = cast(condType); unsigned numElem = vecTy->getNumElements(); llvm::Type *elemType = vecTy->getElementType(); @@ -4534,10 +4535,14 @@ llvm::Type *DstTy = ConvertType(E->getType()); llvm::Type *SrcTy = Src->getType(); - unsigned NumElementsSrc = isa(SrcTy) ? - cast(SrcTy)->getNumElements() : 0; - unsigned NumElementsDst = isa(DstTy) ? - cast(DstTy)->getNumElements() : 0; + unsigned NumElementsSrc = + isa(SrcTy) + ? cast(SrcTy)->getNumElements() + : 0; + unsigned NumElementsDst = + isa(DstTy) + ? 
cast(DstTy)->getNumElements() + : 0; // Going from vec3 to non-vec3 is a special case and requires a shuffle // vector to get a vec4, then a bitcast if the target type is different. diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp --- a/clang/lib/CodeGen/SwiftCallingConv.cpp +++ b/clang/lib/CodeGen/SwiftCallingConv.cpp @@ -320,9 +320,12 @@ // If we have a vector type, split it. if (auto vecTy = dyn_cast_or_null(type)) { auto eltTy = vecTy->getElementType(); - CharUnits eltSize = (end - begin) / vecTy->getNumElements(); + CharUnits eltSize = + (end - begin) / cast(vecTy)->getNumElements(); assert(eltSize == getTypeStoreSize(CGM, eltTy)); - for (unsigned i = 0, e = vecTy->getNumElements(); i != e; ++i) { + for (unsigned i = 0, + e = cast(vecTy)->getNumElements(); + i != e; ++i) { addEntry(eltTy, begin, begin + eltSize); begin += eltSize; } @@ -674,8 +677,9 @@ bool swiftcall::isLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, llvm::VectorType *vectorTy) { - return isLegalVectorType(CGM, vectorSize, vectorTy->getElementType(), - vectorTy->getNumElements()); + return isLegalVectorType( + CGM, vectorSize, vectorTy->getElementType(), + cast(vectorTy)->getNumElements()); } bool swiftcall::isLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, @@ -688,7 +692,7 @@ std::pair swiftcall::splitLegalVectorType(CodeGenModule &CGM, CharUnits vectorSize, llvm::VectorType *vectorTy) { - auto numElts = vectorTy->getNumElements(); + auto numElts = cast(vectorTy)->getNumElements(); auto eltTy = vectorTy->getElementType(); // Try to split the vector type in half. @@ -710,7 +714,7 @@ } // Try to split the vector into legal subvectors. - auto numElts = origVectorTy->getNumElements(); + auto numElts = cast(origVectorTy)->getNumElements(); auto eltTy = origVectorTy->getElementType(); assert(numElts != 1); diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -342,6 +342,7 @@ raw_ostream &operator<<(raw_ostream &OS, const SampleRecord &Sample); class FunctionSamples; +class SampleProfileReaderItaniumRemapper; using BodySampleMap = std::map; // NOTE: Using a StringMap here makes parsed profiles consume around 17% more @@ -428,35 +429,15 @@ return &iter->second; } - /// Returns a pointer to FunctionSamples at the given callsite location \p Loc - /// with callee \p CalleeName. If no callsite can be found, relax the - /// restriction to return the FunctionSamples at callsite location \p Loc - /// with the maximum total sample count. - const FunctionSamples *findFunctionSamplesAt(const LineLocation &Loc, - StringRef CalleeName) const { - std::string CalleeGUID; - CalleeName = getRepInFormat(CalleeName, UseMD5, CalleeGUID); - - auto iter = CallsiteSamples.find(Loc); - if (iter == CallsiteSamples.end()) - return nullptr; - auto FS = iter->second.find(CalleeName); - if (FS != iter->second.end()) - return &FS->second; - // If we cannot find exact match of the callee name, return the FS with - // the max total count. Only do this when CalleeName is not provided, - // i.e., only for indirect calls. 
- if (!CalleeName.empty()) - return nullptr; - uint64_t MaxTotalSamples = 0; - const FunctionSamples *R = nullptr; - for (const auto &NameFS : iter->second) - if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { - MaxTotalSamples = NameFS.second.getTotalSamples(); - R = &NameFS.second; - } - return R; - } + /// Returns a pointer to FunctionSamples at the given callsite location + /// \p Loc with callee \p CalleeName. If no callsite can be found, relax + /// the restriction to return the FunctionSamples at callsite location + /// \p Loc with the maximum total sample count. If \p Remapper is not + /// nullptr, use \p Remapper to find FunctionSamples with equivalent name + /// as \p CalleeName. + const FunctionSamples * + findFunctionSamplesAt(const LineLocation &Loc, StringRef CalleeName, + SampleProfileReaderItaniumRemapper *Remapper) const; bool empty() const { return TotalSamples == 0; } @@ -630,7 +611,11 @@ /// tree nodes in the profile. /// /// \returns the FunctionSamples pointer to the inlined instance. - const FunctionSamples *findFunctionSamples(const DILocation *DIL) const; + /// If \p Remapper is not nullptr, it will be used to find matching + /// FunctionSamples with not exactly the same but equivalent name. + const FunctionSamples *findFunctionSamples( + const DILocation *DIL, + SampleProfileReaderItaniumRemapper *Remapper = nullptr) const; static SampleProfileFormat Format; @@ -648,6 +633,10 @@ return UseMD5 ? std::stoull(Name.data()) : Function::getGUID(Name); } + // Find all the names in the current FunctionSamples including names in + // all the inline instances and names of call targets. + void findAllNames(DenseSet<StringRef> &NameSet) const; + private: /// Mangled name of the function. StringRef Name; diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -208,6 +208,7 @@ #ifndef LLVM_PROFILEDATA_SAMPLEPROFREADER_H #define LLVM_PROFILEDATA_SAMPLEPROFREADER_H +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -275,15 +276,18 @@ return Remappings->lookup(FunctionName); } - /// Return the samples collected for function \p F if remapper knows - /// it is present in SampleMap. - FunctionSamples *getSamplesFor(StringRef FunctionName); + /// Return the equivalent name in the profile for \p FunctionName if + /// it exists. + Optional<StringRef> lookUpNameInProfile(StringRef FunctionName); private: // The buffer holding the content read from remapping file. std::unique_ptr<MemoryBuffer> Buffer; std::unique_ptr<SymbolRemappingReader> Remappings; - DenseMap<SymbolRemappingReader::Key, FunctionSamples *> SampleMap; + // Map remapping key to the name in the profile. By looking up the + // key in the remapper, a given new name can be mapped to the + // canonical name using the NameMap. + DenseMap<SymbolRemappingReader::Key, StringRef> NameMap; // The Reader the remapper is servicing. SampleProfileReader &Reader; // Indicate whether remapping has been applied to the profile read @@ -370,15 +374,19 @@ /// Return the samples collected for function \p F.
virtual FunctionSamples *getSamplesFor(StringRef Fname) { - if (Remapper) { - if (auto FS = Remapper->getSamplesFor(Fname)) - return FS; - } std::string FGUID; Fname = getRepInFormat(Fname, useMD5(), FGUID); auto It = Profiles.find(Fname); if (It != Profiles.end()) return &It->second; + + if (Remapper) { + if (auto NameInProfile = Remapper->lookUpNameInProfile(Fname)) { + auto It = Profiles.find(*NameInProfile); + if (It != Profiles.end()) + return &It->second; + } + } return nullptr; } @@ -423,6 +431,8 @@ /// Return whether names in the profile are all MD5 numbers. virtual bool useMD5() { return false; } + SampleProfileReaderItaniumRemapper *getRemapper() { return Remapper.get(); } + protected: /// Map every function to its associated profile. /// diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -4302,7 +4302,7 @@ auto *ValC = dyn_cast(Val); auto *IdxC = dyn_cast(Idx); if (VecC && ValC && IdxC) - return ConstantFoldInsertElementInstruction(VecC, ValC, IdxC); + return ConstantExpr::getInsertElement(VecC, ValC, IdxC); // For fixed-length vector, fold into undef if index is out of bounds. if (auto *CI = dyn_cast(Idx)) { @@ -4367,7 +4367,7 @@ auto *VecVTy = cast(Vec->getType()); if (auto *CVec = dyn_cast(Vec)) { if (auto *CIdx = dyn_cast(Idx)) - return ConstantFoldExtractElementInstruction(CVec, CIdx); + return ConstantExpr::getExtractElement(CVec, CIdx); // The index is not relevant if our vector is a splat. if (auto *Splat = CVec->getSplatValue()) @@ -4565,8 +4565,8 @@ // If all operands are constant, constant fold the shuffle. This // transformation depends on the value of the mask which is not known at // compile time for scalable vectors - if (!Scalable && Op0Const && Op1Const) - return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask); + if (Op0Const && Op1Const) + return ConstantExpr::getShuffleVector(Op0Const, Op1Const, Mask); // Canonicalization: if only one input vector is constant, it shall be the // second one. This transformation depends on the value of the mask which diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -466,8 +466,6 @@ internalizeModule(*MergedModule, mustPreserveGV); - MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1); - ScopeRestrictionsDone = true; } @@ -559,6 +557,9 @@ // Mark which symbols can not be internalized this->applyScopeRestrictions(); + // Write LTOPostLink flag for passes that require all the modules. + MergedModule->addModuleFlag(Module::Error, "LTOPostLink", 1); + // Instantiate the pass manager to organize the passes. 
legacy::PassManager passes; diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp --- a/llvm/lib/ProfileData/SampleProf.cpp +++ b/llvm/lib/ProfileData/SampleProf.cpp @@ -14,6 +14,7 @@ #include "llvm/ProfileData/SampleProf.h" #include "llvm/Config/llvm-config.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/Error.h" @@ -174,8 +175,8 @@ 0xffff; } -const FunctionSamples * -FunctionSamples::findFunctionSamples(const DILocation *DIL) const { +const FunctionSamples *FunctionSamples::findFunctionSamples( + const DILocation *DIL, SampleProfileReaderItaniumRemapper *Remapper) const { assert(DIL); SmallVector, 10> S; @@ -190,11 +191,59 @@ return this; const FunctionSamples *FS = this; for (int i = S.size() - 1; i >= 0 && FS != nullptr; i--) { - FS = FS->findFunctionSamplesAt(S[i].first, S[i].second); + FS = FS->findFunctionSamplesAt(S[i].first, S[i].second, Remapper); } return FS; } +void FunctionSamples::findAllNames(DenseSet &NameSet) const { + NameSet.insert(Name); + for (const auto &BS : BodySamples) + for (const auto &TS : BS.second.getCallTargets()) + NameSet.insert(TS.getKey()); + + for (const auto &CS : CallsiteSamples) { + for (const auto &NameFS : CS.second) { + NameSet.insert(NameFS.first); + NameFS.second.findAllNames(NameSet); + } + } +} + +const FunctionSamples *FunctionSamples::findFunctionSamplesAt( + const LineLocation &Loc, StringRef CalleeName, + SampleProfileReaderItaniumRemapper *Remapper) const { + std::string CalleeGUID; + CalleeName = getRepInFormat(CalleeName, UseMD5, CalleeGUID); + + auto iter = CallsiteSamples.find(Loc); + if (iter == CallsiteSamples.end()) + return nullptr; + auto FS = iter->second.find(CalleeName); + if (FS != iter->second.end()) + return &FS->second; + if (Remapper) { + if (auto NameInProfile = Remapper->lookUpNameInProfile(CalleeName)) { + auto FS = iter->second.find(*NameInProfile); + if (FS != iter->second.end()) + return &FS->second; + } + } + // If we cannot find exact match of the callee name, return the FS with + // the max total count. Only do this when CalleeName is not provided, + // i.e., only for indirect calls. 
+ if (!CalleeName.empty()) + return nullptr; + uint64_t MaxTotalSamples = 0; + const FunctionSamples *R = nullptr; + for (const auto &NameFS : iter->second) + if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { + MaxTotalSamples = NameFS.second.getTotalSamples(); + R = &NameFS.second; + } + return R; +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void FunctionSamples::dump() const { print(dbgs(), 0); } #endif diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -1291,18 +1291,22 @@ } assert(Remappings && "should be initialized while creating remapper"); - for (auto &Sample : Reader.getProfiles()) - if (auto Key = Remappings->insert(Sample.first())) - SampleMap.insert({Key, &Sample.second}); + for (auto &Sample : Reader.getProfiles()) { + DenseSet NamesInSample; + Sample.second.findAllNames(NamesInSample); + for (auto &Name : NamesInSample) + if (auto Key = Remappings->insert(Name)) + NameMap.insert({Key, Name}); + } RemappingApplied = true; } -FunctionSamples * -SampleProfileReaderItaniumRemapper::getSamplesFor(StringRef Fname) { +Optional +SampleProfileReaderItaniumRemapper::lookUpNameInProfile(StringRef Fname) { if (auto Key = Remappings->lookup(Fname)) - return SampleMap.lookup(Key); - return nullptr; + return NameMap.lookup(Key); + return None; } /// Prepare a memory buffer for the contents of \p Filename. diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -5487,7 +5487,7 @@ def : InstAlias(NAME # _IMM) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), 0>; def : InstAlias(NAME # _IMM) Z_s:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; + (!cast(NAME # _IMM) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>; def : Pat<(op (nxv2i64 ZPR:$data), (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt), (!cast(NAME # _IMM) ZPR:$data, PPR:$gp, ZPR:$ptrs, imm_ty:$index)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -1304,9 +1304,9 @@ } else if (const ConstantSDNode *CAddr = dyn_cast(Addr)) { unsigned OffsetValue0 = CAddr->getZExtValue() / Align; unsigned OffsetValue1 = OffsetValue0 + 1; - assert(Align * OffsetValue0 == CAddr->getZExtValue()); + bool OffsetIsAligned = Align * OffsetValue0 == CAddr->getZExtValue(); - if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1)) { + if (isUInt<8>(OffsetValue0) && isUInt<8>(OffsetValue1) && OffsetIsAligned) { SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32); MachineSDNode *MovZero = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32, Zero); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -469,8 +469,7 @@ SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const; SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerHvxStore(SDValue Op, SelectionDAG &DAG) const; - SDValue HvxVecPredBitcastComputation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const; SDValue 
SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const; SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp --- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp @@ -97,6 +97,8 @@ setOperationAction(ISD::CTTZ, T, Custom); setOperationAction(ISD::LOAD, T, Custom); + setOperationAction(ISD::MLOAD, T, Custom); + setOperationAction(ISD::MSTORE, T, Custom); setOperationAction(ISD::MUL, T, Custom); setOperationAction(ISD::MULHS, T, Custom); setOperationAction(ISD::MULHU, T, Custom); @@ -150,6 +152,8 @@ setOperationAction(ISD::LOAD, T, Custom); setOperationAction(ISD::STORE, T, Custom); + setOperationAction(ISD::MLOAD, T, Custom); + setOperationAction(ISD::MSTORE, T, Custom); setOperationAction(ISD::CTLZ, T, Custom); setOperationAction(ISD::CTTZ, T, Custom); setOperationAction(ISD::CTPOP, T, Custom); @@ -188,6 +192,9 @@ setOperationAction(ISD::AND, BoolW, Custom); setOperationAction(ISD::OR, BoolW, Custom); setOperationAction(ISD::XOR, BoolW, Custom); + // Masked load/store takes a mask that may need splitting. + setOperationAction(ISD::MLOAD, BoolW, Custom); + setOperationAction(ISD::MSTORE, BoolW, Custom); } for (MVT T : LegalV) { @@ -1593,7 +1600,7 @@ SDValue HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const { - const SDLoc &dl(Op); + const SDLoc &dl(Op); MVT ResTy = ty(Op); unsigned IntNo = cast(Op.getOperand(0))->getZExtValue(); @@ -1613,6 +1620,75 @@ return Op; } +SDValue +HexagonTargetLowering::LowerHvxMaskedOp(SDValue Op, SelectionDAG &DAG) const { + const SDLoc &dl(Op); + unsigned HwLen = Subtarget.getVectorLength(); + auto *MaskN = cast(Op.getNode()); + SDValue Mask = MaskN->getMask(); + SDValue Chain = MaskN->getChain(); + SDValue Base = MaskN->getBasePtr(); + auto *MemOp = MaskN->getMemOperand(); + + unsigned Opc = Op->getOpcode(); + assert(Opc == ISD::MLOAD || Opc == ISD::MSTORE); + + if (Opc == ISD::MLOAD) { + MVT ValTy = ty(Op); + SDValue Load = DAG.getLoad(ValTy, dl, Chain, Base, MaskN->getMemOperand()); + SDValue Thru = cast(MaskN)->getPassThru(); + if (isUndef(Thru)) + return Load; + SDValue VSel = DAG.getNode(ISD::VSELECT, dl, ValTy, Mask, Load, Thru); + return DAG.getMergeValues({VSel, Load.getValue(1)}, dl); + } + + // MSTORE + // HVX only has aligned masked stores. + + // TODO: Fold negations of the mask into the store. + unsigned StoreOpc = Hexagon::V6_vS32b_qpred_ai; + SDValue Value = cast(MaskN)->getValue(); + SDValue Offset0 = DAG.getTargetConstant(0, dl, ty(Base)); + + if (MaskN->getAlign().value() % HwLen == 0) { + SDValue Store = getInstr(StoreOpc, dl, MVT::Other, + {Mask, Base, Offset0, Value, Chain}, DAG); + DAG.setNodeMemRefs(cast(Store.getNode()), {MemOp}); + return Store; + } + + // Unaligned case. + auto StoreAlign = [&](SDValue V, SDValue A) { + SDValue Z = getZero(dl, ty(V), DAG); + // TODO: use funnel shifts? + // vlalign(Vu,Vv,Rt) rotates the pair Vu:Vv left by Rt and takes the + // upper half. 
+ SDValue LoV = getInstr(Hexagon::V6_vlalignb, dl, ty(V), {V, Z, A}, DAG); + SDValue HiV = getInstr(Hexagon::V6_vlalignb, dl, ty(V), {Z, V, A}, DAG); + return std::make_pair(LoV, HiV); + }; + + MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen); + MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen); + SDValue MaskV = DAG.getNode(HexagonISD::Q2V, dl, ByteTy, Mask); + VectorPair Tmp = StoreAlign(MaskV, Base); + VectorPair MaskU = {DAG.getNode(HexagonISD::V2Q, dl, BoolTy, Tmp.first), + DAG.getNode(HexagonISD::V2Q, dl, BoolTy, Tmp.second)}; + VectorPair ValueU = StoreAlign(Value, Base); + + SDValue Offset1 = DAG.getTargetConstant(HwLen, dl, MVT::i32); + SDValue StoreLo = + getInstr(StoreOpc, dl, MVT::Other, + {MaskU.first, Base, Offset0, ValueU.first, Chain}, DAG); + SDValue StoreHi = + getInstr(StoreOpc, dl, MVT::Other, + {MaskU.second, Base, Offset1, ValueU.second, Chain}, DAG); + DAG.setNodeMemRefs(cast(StoreLo.getNode()), {MemOp}); + DAG.setNodeMemRefs(cast(StoreHi.getNode()), {MemOp}); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, {StoreLo, StoreHi}); +} + SDValue HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const { assert(!Op.isMachineOpcode()); @@ -1648,45 +1724,81 @@ SDValue HexagonTargetLowering::SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const { - LSBaseSDNode *BN = cast(Op.getNode()); - assert(BN->isUnindexed()); - MVT MemTy = BN->getMemoryVT().getSimpleVT(); + auto *MemN = cast(Op.getNode()); + + MVT MemTy = MemN->getMemoryVT().getSimpleVT(); if (!isHvxPairTy(MemTy)) return Op; const SDLoc &dl(Op); unsigned HwLen = Subtarget.getVectorLength(); MVT SingleTy = typeSplit(MemTy).first; - SDValue Chain = BN->getChain(); - SDValue Base0 = BN->getBasePtr(); + SDValue Chain = MemN->getChain(); + SDValue Base0 = MemN->getBasePtr(); SDValue Base1 = DAG.getMemBasePlusOffset(Base0, TypeSize::Fixed(HwLen), dl); MachineMemOperand *MOp0 = nullptr, *MOp1 = nullptr; - if (MachineMemOperand *MMO = BN->getMemOperand()) { + if (MachineMemOperand *MMO = MemN->getMemOperand()) { MachineFunction &MF = DAG.getMachineFunction(); MOp0 = MF.getMachineMemOperand(MMO, 0, HwLen); MOp1 = MF.getMachineMemOperand(MMO, HwLen, HwLen); } - unsigned MemOpc = BN->getOpcode(); - SDValue NewOp; + unsigned MemOpc = MemN->getOpcode(); if (MemOpc == ISD::LOAD) { + assert(cast(Op)->isUnindexed()); SDValue Load0 = DAG.getLoad(SingleTy, dl, Chain, Base0, MOp0); SDValue Load1 = DAG.getLoad(SingleTy, dl, Chain, Base1, MOp1); - NewOp = DAG.getMergeValues( - { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1), - DAG.getNode(ISD::TokenFactor, dl, MVT::Other, - Load0.getValue(1), Load1.getValue(1)) }, dl); - } else { - assert(MemOpc == ISD::STORE); + return DAG.getMergeValues( + { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, Load0, Load1), + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + Load0.getValue(1), Load1.getValue(1)) }, dl); + } + if (MemOpc == ISD::STORE) { + assert(cast(Op)->isUnindexed()); VectorPair Vals = opSplit(cast(Op)->getValue(), dl, DAG); SDValue Store0 = DAG.getStore(Chain, dl, Vals.first, Base0, MOp0); SDValue Store1 = DAG.getStore(Chain, dl, Vals.second, Base1, MOp1); - NewOp = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store0, Store1); + } + + assert(MemOpc == ISD::MLOAD || MemOpc == ISD::MSTORE); + + auto MaskN = cast(Op); + assert(MaskN->isUnindexed()); + VectorPair Masks = opSplit(MaskN->getMask(), dl, DAG); + SDValue Offset = DAG.getUNDEF(MVT::i32); + + if (MemOpc == ISD::MLOAD) { + VectorPair Thru = 
+ opSplit(cast(Op)->getPassThru(), dl, DAG); + SDValue MLoad0 = + DAG.getMaskedLoad(SingleTy, dl, Chain, Base0, Offset, Masks.first, + Thru.first, SingleTy, MOp0, ISD::UNINDEXED, + ISD::NON_EXTLOAD, false); + SDValue MLoad1 = + DAG.getMaskedLoad(SingleTy, dl, Chain, Base1, Offset, Masks.second, + Thru.second, SingleTy, MOp1, ISD::UNINDEXED, + ISD::NON_EXTLOAD, false); + return DAG.getMergeValues( + { DAG.getNode(ISD::CONCAT_VECTORS, dl, MemTy, MLoad0, MLoad1), + DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + MLoad0.getValue(1), MLoad1.getValue(1)) }, dl); + } + if (MemOpc == ISD::MSTORE) { + VectorPair Vals = opSplit(cast(Op)->getValue(), dl, DAG); + SDValue MStore0 = DAG.getMaskedStore(Chain, dl, Vals.first, Base0, Offset, + Masks.first, SingleTy, MOp0, + ISD::UNINDEXED, false, false); + SDValue MStore1 = DAG.getMaskedStore(Chain, dl, Vals.second, Base1, Offset, + Masks.second, SingleTy, MOp1, + ISD::UNINDEXED, false, false); + return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MStore0, MStore1); } - return NewOp; + std::string Name = "Unexpected operation: " + Op->getOperationName(&DAG); + llvm_unreachable(Name.c_str()); } SDValue @@ -1749,6 +1861,8 @@ case ISD::SETCC: case ISD::INTRINSIC_VOID: return Op; case ISD::INTRINSIC_WO_CHAIN: return LowerHvxIntrinsic(Op, DAG); + case ISD::MLOAD: + case ISD::MSTORE: return LowerHvxMaskedOp(Op, DAG); // Unaligned loads will be handled by the default lowering. case ISD::LOAD: return SDValue(); } @@ -1761,6 +1875,25 @@ void HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl &Results, SelectionDAG &DAG) const { + unsigned Opc = N->getOpcode(); + SDValue Op(N, 0); + + switch (Opc) { + case ISD::MLOAD: + if (isHvxPairTy(ty(Op))) { + SDValue S = SplitHvxMemOp(Op, DAG); + assert(S->getOpcode() == ISD::MERGE_VALUES); + Results.push_back(S.getOperand(0)); + Results.push_back(S.getOperand(1)); + } + break; + case ISD::MSTORE: + if (isHvxPairTy(ty(Op->getOperand(1)))) { // Stored value + SDValue S = SplitHvxMemOp(Op, DAG); + Results.push_back(S); + } + break; + } } void @@ -1783,6 +1916,8 @@ SDValue HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); const SDLoc &dl(N); SDValue Op(N, 0); diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -2721,6 +2721,8 @@ case Hexagon::PS_vloadrw_nt_ai: case Hexagon::V6_vL32b_ai: case Hexagon::V6_vS32b_ai: + case Hexagon::V6_vS32b_qpred_ai: + case Hexagon::V6_vS32b_nqpred_ai: case Hexagon::V6_vL32b_nt_ai: case Hexagon::V6_vS32b_nt_ai: case Hexagon::V6_vL32Ub_ai: diff --git a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td --- a/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td +++ b/llvm/lib/Target/Hexagon/HexagonPatternsHVX.td @@ -364,6 +364,14 @@ (V6_vasrw (V6_vaslw HVI32:$Vs, (A2_tfrsi 16)), (A2_tfrsi 16))>; } + // Take a pair of vectors Vt:Vs and shift them towards LSB by (Rt & HwLen). 
+ def: Pat<(VecI8 (valign HVI8:$Vt, HVI8:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(VecI16 (valign HVI16:$Vt, HVI16:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(VecI32 (valign HVI32:$Vt, HVI32:$Vs, I32:$Rt)), + (LoVec (V6_valignb HvxVR:$Vt, HvxVR:$Vs, I32:$Rt))>; + def: Pat<(HexagonVASL HVI8:$Vs, I32:$Rt), (V6_vpackeb (V6_vaslh (HiVec (VZxtb HvxVR:$Vs)), I32:$Rt), (V6_vaslh (LoVec (VZxtb HvxVR:$Vs)), I32:$Rt))>; diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -155,6 +155,9 @@ return 1; } + bool isLegalMaskedStore(Type *DataType, Align Alignment); + bool isLegalMaskedLoad(Type *DataType, Align Alignment); + /// @} int getUserCost(const User *U, ArrayRef Operands, diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -35,6 +35,9 @@ cl::init(true), cl::Hidden, cl::desc("Control lookup table emission on Hexagon target")); +static cl::opt HexagonMaskedVMem("hexagon-masked-vmem", cl::init(true), + cl::Hidden, cl::desc("Enable loop vectorizer for HVX")); + // Constant "cost factor" to make floating point operations more expensive // in terms of vectorization cost. This isn't the best way, but it should // do. Ultimately, the cost should use cycles. @@ -45,8 +48,7 @@ } bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const { - assert(VecTy->isVectorTy()); - if (isa(VecTy)) + if (!VecTy->isVectorTy() || isa(VecTy)) return false; // Avoid types like <2 x i32*>. if (!cast(VecTy)->getElementType()->isIntegerTy()) @@ -308,6 +310,14 @@ return 1; } +bool HexagonTTIImpl::isLegalMaskedStore(Type *DataType, Align /*Alignment*/) { + return HexagonMaskedVMem && isTypeForHVX(DataType); +} + +bool HexagonTTIImpl::isLegalMaskedLoad(Type *DataType, Align /*Alignment*/) { + return HexagonMaskedVMem && isTypeForHVX(DataType); +} + /// --- Vector TTI end --- unsigned HexagonTTIImpl::getPrefetchDistance() const { diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -840,7 +840,7 @@ return FS->findFunctionSamplesAt(LineLocation(FunctionSamples::getOffset(DIL), DIL->getBaseDiscriminator()), - CalleeName); + CalleeName, Reader->getRemapper()); } /// Returns a vector of FunctionSamples that are the indirect call targets @@ -903,7 +903,7 @@ auto it = DILocation2SampleMap.try_emplace(DIL,nullptr); if (it.second) - it.first->second = Samples->findFunctionSamples(DIL); + it.first->second = Samples->findFunctionSamples(DIL, Reader->getRemapper()); return it.first->second; } @@ -1050,24 +1050,23 @@ PSI->getOrCompHotCountThreshold()); continue; } - auto CalleeFunctionName = FS->getFuncName(); + if (!callsiteIsHot(FS, PSI)) + continue; + + const char *Reason = "Callee function not available"; + // R->getValue() != &F is to prevent promoting a recursive call. // If it is a recursive call, we do not inline it as it could bloat // the code exponentially. There is way to better handle this, e.g. // clone the caller first, and inline the cloned caller if it is // recursive. 
As llvm does not inline recursive calls, we will // simply ignore it instead of handling it explicitly. - if (CalleeFunctionName == F.getName()) - continue; - - if (!callsiteIsHot(FS, PSI)) - continue; - - const char *Reason = "Callee function not available"; + auto CalleeFunctionName = FS->getFuncName(); auto R = SymbolMap.find(CalleeFunctionName); if (R != SymbolMap.end() && R->getValue() && !R->getValue()->isDeclaration() && R->getValue()->getSubprogram() && R->getValue()->hasFnAttribute("use-sample-profile") && + R->getValue() != &F && isLegalToPromote(*I, R->getValue(), &Reason)) { uint64_t C = FS->getEntrySamples(); auto &DI = @@ -1854,7 +1853,6 @@ FunctionAnalysisManager *FAM) { auto &Ctx = M.getContext(); - std::unique_ptr RemapReader; auto ReaderOrErr = SampleProfileReader::create(Filename, Ctx, RemappingFilename); if (std::error_code EC = ReaderOrErr.getError()) { @@ -1910,6 +1908,7 @@ for (const auto &I : Reader->getProfiles()) TotalCollectedSamples += I.second.getTotalSamples(); + auto Remapper = Reader->getRemapper(); // Populate the symbol map. for (const auto &N_F : M.getValueSymbolTable()) { StringRef OrigName = N_F.getKey(); @@ -1927,6 +1926,15 @@ // to nullptr to avoid confusion. if (!r.second) r.first->second = nullptr; + OrigName = NewName; + } + // Insert the remapped names into SymbolMap. + if (Remapper) { + if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) { + if (*MapName == OrigName) + continue; + SymbolMap.insert(std::make_pair(*MapName, F)); + } } } diff --git a/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll b/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll --- a/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll +++ b/llvm/test/Analysis/ConstantFolding/vscale-shufflevector.ll @@ -15,9 +15,7 @@ ; the compiler. It happens to be the case that this will be the result. ; CHECK-LABEL: define @vscale_version() -; CHECK-NEXT: %splatter = insertelement undef, i1 true, i32 0 -; CHECK-NEXT: %foo = shufflevector %splatter, undef, zeroinitializer -; CHECK-NEXT: ret %foo +; CHECK-NEXT: ret shufflevector ( insertelement ( undef, i1 true, i32 0), undef, zeroinitializer) define @vscale_version() { %splatter = insertelement undef, i1 true, i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_read2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_read2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s ; FIXME: We don't get cases where the address was an SGPR because we ; get a copy to the address register for each one. 
@@ -317,7 +318,9 @@ ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-NOT: ds_read2_b32 +; CI-COUNT-4: ds_read_u8 +; GFX9-ALIGNED-4: ds_read_u8 +; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}} ; GCN: s_endpgm define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -336,7 +339,9 @@ ; CI-DAG: s_mov_b32 m0 ; GFX9-NOT: m0 -; GCN-NOT: ds_read2_b32 +; CI-COUNT-2: ds_read_u16 +; GFX9-ALIGNED-2: ds_read_u16 +; GFX9-UNALIGNED-4: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}} ; GCN: s_endpgm define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 { %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 @@ -655,6 +660,22 @@ ret <2 x float> %r1 } +@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 + +; GCN-LABEL: {{^}}read2_v2i32_align1_odd_offset: +; CI-COUNT-8: ds_read_u8 + +; GFX9-ALIGNED-COUNT-8: ds_read_u8 + +; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}} +; GFX9-UNALIGNED: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE_ADDR]] offset1:1{{$}} +define amdgpu_kernel void @read2_v2i32_align1_odd_offset(<2 x i32> addrspace(1)* %out) { +entry: + %load = load <2 x i32>, <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 + store <2 x i32> %load, <2 x i32> addrspace(1)* %out + ret void +} + declare void @void_func_void() #3 declare i32 @llvm.amdgcn.workgroup.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -1,5 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,CI %s -; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9 %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,-unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-ALIGNED %s +; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+load-store-opt,+flat-for-global,+unaligned-access-mode < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s @lds = addrspace(3) global [512 x float] undef, align 4 @lds.f64 = addrspace(3) global [512 x double] undef, align 8 @@ -523,6 +524,21 @@ ret void } +@v2i32_align1 = internal addrspace(3) global [100 x <2 x i32>] undef, align 1 + +; GCN-LABEL: {{^}}write2_v2i32_align1_odd_offset: +; CI-COUNT-8: ds_write_b8 + +; GFX9-ALIGNED-COUNT-8: ds_write_b8 + +; GFX9-UNALIGNED: v_mov_b32_e32 [[BASE_ADDR:v[0-9]+]], 0x41{{$}} +; GFX9-UNALIGNED: ds_write2_b32 [[BASE_ADDR]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}} +define amdgpu_kernel void @write2_v2i32_align1_odd_offset() { +entry: + store <2 x i32> , <2 x i32> addrspace(3)* bitcast (i8 addrspace(3)* getelementptr (i8, i8 addrspace(3)* bitcast ([100 x <2 x i32>] addrspace(3)* @v2i32_align1 to i8 addrspace(3)*), i32 65) to <2 x i32> addrspace(3)*), align 1 + ret void +} + declare i32 @llvm.amdgcn.workgroup.id.x() #1 declare i32 
@llvm.amdgcn.workgroup.id.y() #1 declare i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll b/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/autohvx/masked-vmem-basic.ll @@ -0,0 +1,35 @@ +; RUN: llc -march=hexagon < %s | FileCheck %s + +; CHECK-LABEL: f0: +; CHECK: vmemu +; CHECK: vmux +define <128 x i8> @f0(<128 x i8>* %a0, i32 %a1, i32 %a2) #0 { + %q0 = call <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32 %a2) + %v0 = call <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32 %a1) + %v1 = bitcast <32 x i32> %v0 to <128 x i8> + %v2 = call <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>* %a0, i32 4, <128 x i1> %q0, <128 x i8> %v1) + ret <128 x i8> %v2 +} + +; CHECK-LABEL: f1: +; CHECK: vlalign +; CHECK: if (q{{.}}) vmem{{.*}} = v +define void @f1(<128 x i8>* %a0, i32 %a1, i32 %a2) #0 { + %q0 = call <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32 %a2) + %v0 = call <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32 %a1) + %v1 = bitcast <32 x i32> %v0 to <128 x i8> + call void @llvm.masked.store.v128i8.p0v128i8(<128 x i8> %v1, <128 x i8>* %a0, i32 4, <128 x i1> %q0) + ret void +} + +declare <128 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32) #1 +declare <32 x i32> @llvm.hexagon.V6.lvsplatb.128B(i32) #1 +declare <128 x i8> @llvm.masked.load.v128i8.p0v128i8(<128 x i8>*, i32 immarg, <128 x i1>, <128 x i8>) #2 +declare void @llvm.masked.store.v128i8.p0v128i8(<128 x i8>, <128 x i8>*, i32 immarg, <128 x i1>) #2 + +attributes #0 = { "target-cpu"="hexagonv65" "target-features"="+hvx,+hvx-length128b" } +attributes #1 = { nounwind readnone } +attributes #2 = { argmemonly nounwind readonly willreturn } +attributes #3 = { argmemonly nounwind willreturn } + + diff --git a/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll b/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll --- a/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll +++ b/llvm/test/CodeGen/Hexagon/hvx-bitcast-v64i1.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-instsimplify=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-instsimplify=0 -hexagon-masked-vmem=0 < %s | FileCheck %s ; Test that LLVM does not assert and bitcast v64i1 to i64 is lowered ; without crashing. diff --git a/llvm/test/CodeGen/Hexagon/store-vector-pred.ll b/llvm/test/CodeGen/Hexagon/store-vector-pred.ll --- a/llvm/test/CodeGen/Hexagon/store-vector-pred.ll +++ b/llvm/test/CodeGen/Hexagon/store-vector-pred.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -hexagon-instsimplify=0 < %s | FileCheck %s +; RUN: llc -march=hexagon -hexagon-instsimplify=0 -hexagon-masked-vmem=0 < %s | FileCheck %s ; This test checks that store a vector predicate of type v128i1 is lowered ; without crashing. 
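The ds_read2.ll/ds_write2.ll updates above split the GFX9 runs on the unaligned-access-mode target feature: with it disabled, an align-1 LDS access is expanded into ds_read_u8/ds_write_b8, while with it enabled the load-store optimizer may still pair the two dwords into ds_read2_b32/ds_write2_b32. Roughly, the gating decision looks like this (an illustrative predicate, not the SILoadStoreOptimizer code):

#include <cstdint>

// May a 2 x 32-bit LDS access with this byte alignment be emitted as a single
// ds_read2_b32 / ds_write2_b32? (Sketch; the real hook also consults the
// subtarget generation and address space.)
bool mayFormDS2(uint64_t AlignInBytes, bool HasUnalignedAccessMode) {
  if (AlignInBytes >= 4)           // each dword element naturally aligned
    return true;
  return HasUnalignedAccessMode;   // under-aligned: only if the feature is on
}

The new Hexagon test then covers llvm.masked.load/llvm.masked.store lowering on HVX (vmemu plus vmux for the load, a predicated vmem for the store), which the other two Hexagon tests now disable via -hexagon-masked-vmem=0. The element-wise semantics being lowered are simply:

// Scalar model of llvm.masked.load / llvm.masked.store for <128 x i8>:
// masked-off lanes take the pass-through value and are never stored.
void maskedLoadV128i8(const uint8_t *Ptr, const bool *Mask,
                      const uint8_t *PassThru, uint8_t *Out) {
  for (int I = 0; I < 128; ++I)
    Out[I] = Mask[I] ? Ptr[I] : PassThru[I];
}

void maskedStoreV128i8(uint8_t *Ptr, const bool *Mask, const uint8_t *Val) {
  for (int I = 0; I < 128; ++I)
    if (Mask[I])
      Ptr[I] = Val[I];
}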
diff --git a/llvm/test/LTO/ARM/lto-linking-metadata.ll b/llvm/test/LTO/ARM/lto-linking-metadata.ll --- a/llvm/test/LTO/ARM/lto-linking-metadata.ll +++ b/llvm/test/LTO/ARM/lto-linking-metadata.ll @@ -1,7 +1,8 @@ ; RUN: opt %s -o %t1.bc -; RUN: llvm-lto %t1.bc -o %t1.save.opt -save-merged-module -O1 --exported-symbol=foo +; RUN: llvm-lto %t1.bc -o %t1.save.opt -save-linked-module -save-merged-module -O1 --exported-symbol=foo ; RUN: llvm-dis < %t1.save.opt.merged.bc | FileCheck %s +; RUN: llvm-dis < %t1.save.opt.linked.bc | FileCheck %s --check-prefix=CHECK-LINKED ; RUN: llvm-lto2 run %t1.bc -o %t.out.o -save-temps \ ; RUN: -r=%t1.bc,foo,pxl @@ -17,3 +18,6 @@ ; CHECK: !llvm.module.flags = !{[[MD_NUM:![0-9]+]]} ; CHECK: [[MD_NUM]] = !{i32 1, !"LTOPostLink", i32 1} + +; CHECK-LINKED: @foo +; CHECK-LINKED-NOT: LTOPostLink diff --git a/llvm/test/MC/AArch64/SVE/st1b.s b/llvm/test/MC/AArch64/SVE/st1b.s --- a/llvm/test/MC/AArch64/SVE/st1b.s +++ b/llvm/test/MC/AArch64/SVE/st1b.s @@ -168,3 +168,27 @@ // CHECK-ENCODING: [0xff,0xbf,0x5f,0xe4] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf 5f e4 + +st1b { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1b { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e4 + +st1b { z0.s }, p7, [z0.s] +// CHECK-INST: st1b { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e4 + +st1b { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1b { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e4 + +st1b { z0.d }, p7, [z0.d] +// CHECK-INST: st1b { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e4 diff --git a/llvm/test/MC/AArch64/SVE/st1d.s b/llvm/test/MC/AArch64/SVE/st1d.s --- a/llvm/test/MC/AArch64/SVE/st1d.s +++ b/llvm/test/MC/AArch64/SVE/st1d.s @@ -78,3 +78,15 @@ // CHECK-ENCODING: [0xff,0xbf,0xdf,0xe5] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf df e5 + +st1d { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1d { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e5 + +st1d { z0.d }, p7, [z0.d] +// CHECK-INST: st1d { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e5 diff --git a/llvm/test/MC/AArch64/SVE/st1h.s b/llvm/test/MC/AArch64/SVE/st1h.s --- a/llvm/test/MC/AArch64/SVE/st1h.s +++ b/llvm/test/MC/AArch64/SVE/st1h.s @@ -168,3 +168,27 @@ // CHECK-ENCODING: [0xff,0xbf,0xdf,0xe4] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf df e4 + +st1h { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1h { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0xe0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc e0 e4 + +st1h { z0.s }, p7, [z0.s] +// CHECK-INST: st1h { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0xe0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc e0 e4 + +st1h { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1h { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe4] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc c0 e4 + +st1h { z0.d }, p7, [z0.d] +// CHECK-INST: st1h { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0xc0,0xe4] +// CHECK-ERROR: instruction requires: sve 
+// CHECK-UNKNOWN: 00 bc c0 e4 diff --git a/llvm/test/MC/AArch64/SVE/st1w.s b/llvm/test/MC/AArch64/SVE/st1w.s --- a/llvm/test/MC/AArch64/SVE/st1w.s +++ b/llvm/test/MC/AArch64/SVE/st1w.s @@ -138,3 +138,27 @@ // CHECK-ENCODING: [0xff,0xbf,0x5f,0xe5] // CHECK-ERROR: instruction requires: sve // CHECK-UNKNOWN: ff bf 5f e5 + +st1w { z0.s }, p7, [z0.s, #0] +// CHECK-INST: st1w { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e5 + +st1w { z0.s }, p7, [z0.s] +// CHECK-INST: st1w { z0.s }, p7, [z0.s] +// CHECK-ENCODING: [0x00,0xbc,0x60,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 60 e5 + +st1w { z0.d }, p7, [z0.d, #0] +// CHECK-INST: st1w { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e5 + +st1w { z0.d }, p7, [z0.d] +// CHECK-INST: st1w { z0.d }, p7, [z0.d] +// CHECK-ENCODING: [0x00,0xbc,0x40,0xe5] +// CHECK-ERROR: instruction requires: sve +// CHECK-UNKNOWN: 00 bc 40 e5 diff --git a/llvm/test/Transforms/InstSimplify/vscale.ll b/llvm/test/Transforms/InstSimplify/vscale.ll --- a/llvm/test/Transforms/InstSimplify/vscale.ll +++ b/llvm/test/Transforms/InstSimplify/vscale.ll @@ -51,6 +51,23 @@ ret %r } +define @insertelement_inline_to_ret() { +; CHECK-LABEL: @insertelement_inline_to_ret( +; CHECK-NEXT: ret insertelement ( undef, i32 1, i32 0) +; + %i = insertelement undef, i32 1, i32 0 + ret %i +} + +define @insertelement_shufflevector_inline_to_ret() { +; CHECK-LABEL: @insertelement_shufflevector_inline_to_ret( +; CHECK-NEXT: ret shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) +; + %i = insertelement undef, i32 1, i32 0 + %i2 = shufflevector %i, undef, zeroinitializer + ret %i2 +} + ; extractelement define i32 @extractelement_idx_undef( %a) { @@ -120,6 +137,16 @@ ret %cmp } +define @bitcast() { +; CHECK-LABEL: @bitcast( +; CHECK-NEXT: ret bitcast ( shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) to ) +; + %i1 = insertelement undef, i32 1, i32 0 + %i2 = shufflevector %i1, undef, zeroinitializer + %i3 = bitcast %i2 to + ret %i3 +} + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Memory Access and Addressing Operations ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/llvm/test/Transforms/SampleProfile/Inputs/remap-2.prof b/llvm/test/Transforms/SampleProfile/Inputs/remap-2.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/remap-2.prof @@ -0,0 +1,16 @@ +test:15680:2500 + 1: 100 + 4: 100 + 5: 3000 xoo:1000 + 5: _ZN3foo3barERKN1N1XINS_4quuxEEE:2000 + 1: 2000 + 6: _ZN1N1XE:2500 + 1: 2500 + +_ZN1N1X1YE:15680:2500 + 1: 100 + 4: 100 + 5: 3000 xoo:1000 + 5: _ZN1N1X1YE:2000 + 1: 2000 + diff --git a/llvm/test/Transforms/SampleProfile/remap-2.ll b/llvm/test/Transforms/SampleProfile/remap-2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/remap-2.ll @@ -0,0 +1,74 @@ +; RUN: opt %s -passes=sample-profile -sample-profile-file=%S/Inputs/remap-2.prof -sample-profile-remapping-file=%S/Inputs/remap.map -S | FileCheck %s +; Check profile remapping works for searching inline instance, searching +; indirect call promotion candidate and prevent recursive inline. 
+ +@x.addr = common global i32 zeroinitializer, align 16 +@y.addr = common global i32 zeroinitializer, align 16 + +define i32 @_ZN3foo3barERKN1M1XINS_6detail3quxEEE() #0 !dbg !9 { +entry: + %t0 = load i32, i32* @x.addr, align 4 + %t1 = load i32, i32* @y.addr, align 4 + %add = add nsw i32 %t0, %t1 + ret i32 %add +} + +define i32 @_ZN1M1XE() #0 !dbg !10 { +entry: + %t0 = load i32, i32* @x.addr, align 4 + %t1 = load i32, i32* @y.addr, align 4 + %sub = sub nsw i32 %t0, %t1 + ret i32 %sub +} + +define void @test(i32 ()*) #0 !dbg !4 { + %t2 = alloca i32 ()* + store i32 ()* %0, i32 ()** %t2 + %t3 = load i32 ()*, i32 ()** %t2 +; Check call i32 %t3 has been indirect call promoted and call i32 @_ZN1M1XE +; has been inlined. +; CHECK-LABEL: @test( +; CHECK: icmp eq i32 ()* %t3, @_ZN3foo3barERKN1M1XINS_6detail3quxEEE +; CHECK-NOT: call i32 @_ZN1M1XE + %t4 = call i32 %t3(), !dbg !7 + %t5 = call i32 @_ZN1M1XE(), !dbg !8 + ret void +} + +define void @_ZN1M1X1YE(i32 ()*) #0 !dbg !11 { + %t2 = alloca i32 ()* + store i32 ()* %0, i32 ()** %t2 + %t3 = load i32 ()*, i32 ()** %t2 +; Check call i32 %t3 has got its profile but is not indirect call promoted +; because the promotion candidate is a recursive call to the current function. +; CHECK-LABEL: @_ZN1M1X1YE( +; CHECK: call i32 %t3(), {{.*}} !prof ![[PROFID:[0-9]+]] +; CHECK-NOT: icmp eq i32 ()* %t3, @_ZN1M1X1YE + %t4 = call i32 %t3(), !dbg !12 + ret void +} + +; CHECK: ![[PROFID]] = !{!"VP", i32 0, i64 3000 + +attributes #0 = { "use-sample-profile" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!13, !14} +!llvm.ident = !{!15} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!4 = distinct !DISubprogram(name: "test", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!5 = !DIFile(filename: "calls.cc", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = !DILocation(line: 8, scope: !4) +!8 = !DILocation(line: 9, scope: !4) +!9 = distinct !DISubprogram(name: "_ZN3foo3barERKN1M1XINS_6detail3quxEEE", line: 15, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!10 = distinct !DISubprogram(name: "_ZN1M1XE", line: 20, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!11 = distinct !DISubprogram(name: "_ZN1M1X1YE", line: 25, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!12 = !DILocation(line: 30, scope: !11) +!13 = !{i32 2, !"Dwarf Version", i32 4} +!14 = !{i32 1, !"Debug Info Version", i32 3} +!15 = !{!"clang version 3.5 "} + diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -181,6 +181,10 @@ cl::desc("Save ThinLTO generated object files using filenames created in " "the given directory.")); +static cl::opt SaveLinkedModuleFile( + "save-linked-module", cl::init(false), + cl::desc("Write linked LTO module to file before optimize")); + 
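The CHECK lines in remap-2.ll above describe the shape indirect-call promotion gives a hot call site once the remapper has matched the profile's target name: the indirect call is guarded by a compare against the hot target and followed by a direct, inlinable call, with the original indirect call kept as the fallback; in @_ZN1M1X1YE the promotion is skipped because the only candidate is the function itself. In C-like form (hypothetical names, only to show the transformation's shape):

using FnTy = int (*)();

// After promotion of the hot target `Hot`:
//   if (Fp == &Hot)  r = Hot();   // direct call, visible to the inliner
//   else             r = Fp();    // original indirect call as fallback
int callAfterPromotion(FnTy Fp, FnTy Hot) {
  if (Fp == Hot)
    return Hot();
  return Fp();
}
// Promotion is not performed when `Hot` would be the enclosing function,
// since the inliner does not inline recursive calls.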
static cl::opt SaveModuleFile("save-merged-module", cl::init(false), cl::desc("Write merged LTO module to file before CodeGen")); @@ -1029,6 +1033,15 @@ CodeGen.setFileType(FT.getValue()); if (!OutputFilename.empty()) { + if (SaveLinkedModuleFile) { + std::string ModuleFilename = OutputFilename; + ModuleFilename += ".linked.bc"; + std::string ErrMsg; + + if (!CodeGen.writeMergedModules(ModuleFilename)) + error("writing linked module failed."); + } + if (!CodeGen.optimize(DisableVerify, DisableInline, DisableGVNLoadPRE, DisableLTOVectorization)) { // Diagnostic messages should have been printed by the handler. diff --git a/llvm/unittests/ProfileData/SampleProfTest.cpp b/llvm/unittests/ProfileData/SampleProfTest.cpp --- a/llvm/unittests/ProfileData/SampleProfTest.cpp +++ b/llvm/unittests/ProfileData/SampleProfTest.cpp @@ -89,8 +89,8 @@ auto VerifySummary = [IsPartialProfile, PartialProfileRatio]( ProfileSummary &Summary) mutable { ASSERT_EQ(ProfileSummary::PSK_Sample, Summary.getKind()); - ASSERT_EQ(137392u, Summary.getTotalCount()); - ASSERT_EQ(8u, Summary.getNumCounts()); + ASSERT_EQ(138211u, Summary.getTotalCount()); + ASSERT_EQ(10u, Summary.getNumCounts()); ASSERT_EQ(4u, Summary.getNumFunctions()); ASSERT_EQ(1437u, Summary.getMaxFunctionCount()); ASSERT_EQ(60351u, Summary.getMaxCount()); @@ -112,7 +112,7 @@ ASSERT_EQ(60000u, EightyPerc->MinCount); ASSERT_EQ(12557u, NinetyPerc->MinCount); ASSERT_EQ(12557u, NinetyFivePerc->MinCount); - ASSERT_EQ(610u, NinetyNinePerc->MinCount); + ASSERT_EQ(600u, NinetyNinePerc->MinCount); }; VerifySummary(Summary); @@ -155,6 +155,22 @@ FooSamples.addBodySamples(8, 0, 60351); FooSamples.addBodySamples(10, 0, 605); + // Add inline instance with name "_Z3gooi". + StringRef GooName("_Z3gooi"); + auto &GooSamples = + FooSamples.functionSamplesAt(LineLocation(7, 0))[GooName.str()]; + GooSamples.setName(GooName); + GooSamples.addTotalSamples(502); + GooSamples.addBodySamples(3, 0, 502); + + // Add inline instance with name "_Z3hooi". + StringRef HooName("_Z3hooi"); + auto &HooSamples = + GooSamples.functionSamplesAt(LineLocation(9, 0))[HooName.str()]; + HooSamples.setName(HooName); + HooSamples.addTotalSamples(317); + HooSamples.addBodySamples(4, 0, 317); + StringRef BarName("_Z3bari"); FunctionSamples BarSamples; BarSamples.setName(BarName); @@ -197,6 +213,8 @@ createRemapFile(RemapPath, RemapFile); FooName = "_Z4fauxi"; BarName = "_Z3barl"; + GooName = "_Z3gool"; + HooName = "_Z3hool"; } M.getOrInsertFunction(FooName, fn_type); @@ -235,6 +253,33 @@ ASSERT_EQ(7711u, ReadFooSamples->getTotalSamples()); ASSERT_EQ(610u, ReadFooSamples->getHeadSamples()); + // Try to find a FunctionSamples with GooName at given callsites containing + // inline instance for GooName. Test the correct FunctionSamples can be + // found with Remapper support. + const FunctionSamples *ReadGooSamples = + ReadFooSamples->findFunctionSamplesAt(LineLocation(7, 0), GooName, + Reader->getRemapper()); + ASSERT_TRUE(ReadGooSamples != nullptr); + ASSERT_EQ(502u, ReadGooSamples->getTotalSamples()); + + // Try to find a FunctionSamples with GooName at given callsites containing + // no inline instance for GooName. Test no FunctionSamples will be + // found with Remapper support. + const FunctionSamples *ReadGooSamplesAgain = + ReadFooSamples->findFunctionSamplesAt(LineLocation(9, 0), GooName, + Reader->getRemapper()); + ASSERT_TRUE(ReadGooSamplesAgain == nullptr); + + // The inline instance of Hoo is inside of the inline instance of Goo. 
+ // Try to find a FunctionSamples with HooName at given callsites containing + // inline instance for HooName. Test the correct FunctionSamples can be + // found with Remapper support. + const FunctionSamples *ReadHooSamples = + ReadGooSamples->findFunctionSamplesAt(LineLocation(9, 0), HooName, + Reader->getRemapper()); + ASSERT_TRUE(ReadHooSamples != nullptr); + ASSERT_EQ(317u, ReadHooSamples->getTotalSamples()); + FunctionSamples *ReadBarSamples = Reader->getSamplesFor(BarName); ASSERT_TRUE(ReadBarSamples != nullptr); if (!UseMD5) { diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -213,6 +213,7 @@ "HAVE_SIGALTSTACK=", "HAVE_STRERROR_R=", "HAVE_SYSCONF=", + "HAVE_SYSEXITS_H=", "HAVE_SYS_IOCTL_H=", "HAVE_SYS_MMAN_H=", "HAVE_SYS_PARAM_H=", @@ -249,6 +250,7 @@ "HAVE_SIGALTSTACK=1", "HAVE_STRERROR_R=1", "HAVE_SYSCONF=1", + "HAVE_SYSEXITS_H=1", "HAVE_SYS_IOCTL_H=1", "HAVE_SYS_MMAN_H=1", "HAVE_SYS_PARAM_H=1", diff --git a/polly/include/polly/ScopInfo.h b/polly/include/polly/ScopInfo.h --- a/polly/include/polly/ScopInfo.h +++ b/polly/include/polly/ScopInfo.h @@ -2319,7 +2319,7 @@ /// ScopBuilder::buildAccessRelations. Therefore, if this /// method is called before buildAccessRelations, false /// must be passed. - void removeStmts(std::function<bool(ScopStmt &)> ShouldDelete, + void removeStmts(function_ref<bool(ScopStmt &)> ShouldDelete, bool AfterHoisting = true); /// Get an isl string representing the context. diff --git a/polly/lib/Analysis/ScopInfo.cpp b/polly/lib/Analysis/ScopInfo.cpp --- a/polly/lib/Analysis/ScopInfo.cpp +++ b/polly/lib/Analysis/ScopInfo.cpp @@ -1752,7 +1752,7 @@ } } -void Scop::removeStmts(std::function<bool(ScopStmt &)> ShouldDelete, +void Scop::removeStmts(function_ref<bool(ScopStmt &)> ShouldDelete, bool AfterHoisting) { for (auto StmtIt = Stmts.begin(), StmtEnd = Stmts.end(); StmtIt != StmtEnd;) { if (!ShouldDelete(*StmtIt)) { @@ -1773,40 +1773,39 @@ } void Scop::removeStmtNotInDomainMap() { - auto ShouldDelete = [this](ScopStmt &Stmt) -> bool { + removeStmts([this](ScopStmt &Stmt) -> bool { isl::set Domain = DomainMap.lookup(Stmt.getEntryBlock()); if (!Domain) return true; return Domain.is_empty(); - }; - removeStmts(ShouldDelete, false); + }); } void Scop::simplifySCoP(bool AfterHoisting) { - auto ShouldDelete = [AfterHoisting](ScopStmt &Stmt) -> bool { - // Never delete statements that contain calls to debug functions. - if (hasDebugCall(&Stmt)) - return false; - - bool RemoveStmt = Stmt.isEmpty(); - - // Remove read only statements only after invariant load hoisting.
+ if (!RemoveStmt && AfterHoisting) { + bool OnlyRead = true; + for (MemoryAccess *MA : Stmt) { + if (MA->isRead()) + continue; + + OnlyRead = false; + break; + } + + RemoveStmt = OnlyRead; + } + return RemoveStmt; + }, + AfterHoisting); } InvariantEquivClassTy *Scop::lookupInvariantEquivClass(Value *Val) { diff --git a/polly/lib/Transform/Simplify.cpp b/polly/lib/Transform/Simplify.cpp --- a/polly/lib/Transform/Simplify.cpp +++ b/polly/lib/Transform/Simplify.cpp @@ -169,12 +169,11 @@ void removeEmptyDomainStmts() { size_t NumStmtsBefore = S->getSize(); - auto ShouldDelete = [](ScopStmt &Stmt) -> bool { + S->removeStmts([](ScopStmt &Stmt) -> bool { auto EffectiveDomain = Stmt.getDomain().intersect_params(Stmt.getParent()->getContext()); return EffectiveDomain.is_empty(); - }; - S->removeStmts(ShouldDelete); + }); assert(NumStmtsBefore >= S->getSize()); EmptyDomainsRemoved = NumStmtsBefore - S->getSize();
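The Polly change swaps std::function parameters for llvm::function_ref and passes the lambdas inline at each call site. function_ref is a non-owning, non-allocating view of a callable, which fits a predicate that is only invoked for the duration of the removeStmts call and must not be stored beyond it. A minimal usage sketch under those assumptions (stand-in ScopStmt, not the Polly class):

#include "llvm/ADT/STLExtras.h" // llvm::function_ref, llvm::erase_if
#include <vector>

struct ScopStmt { bool Empty = false; }; // stand-in for polly::ScopStmt

// The callee only borrows the callable, so a temporary lambda can be passed
// directly and no std::function allocation is involved.
void removeStmts(std::vector<ScopStmt> &Stmts,
                 llvm::function_ref<bool(ScopStmt &)> ShouldDelete) {
  llvm::erase_if(Stmts, ShouldDelete);
}

void removeEmptyStmts(std::vector<ScopStmt> &Stmts) {
  removeStmts(Stmts, [](ScopStmt &S) { return S.Empty; });
}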