Index: llvm/lib/Transforms/Vectorize/VectorCombine.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -43,9 +43,9 @@
 STATISTIC(NumScalarBO, "Number of scalar binops formed");
 STATISTIC(NumScalarCmp, "Number of scalar compares formed");
 
-static cl::opt<bool> DisableVectorCombine(
-    "disable-vector-combine", cl::init(false), cl::Hidden,
-    cl::desc("Disable all vector combine transforms"));
+static cl::opt<bool>
+    DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden,
+                         cl::desc("Disable all vector combine transforms"));
 
 static cl::opt<bool> DisableBinopExtractShuffle(
     "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
@@ -890,8 +890,8 @@
   // The original scalar pattern is:
   // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
   CmpInst::Predicate Pred = P0;
-  unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
-                                                    : Instruction::ICmp;
+  unsigned CmpOpcode =
+      CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
   auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
   if (!VecTy)
     return false;
@@ -935,8 +935,8 @@
   Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
 
   Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
-  Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
-                                        VCmp, Shuf);
+  Value *VecLogic =
+      Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(), VCmp, Shuf);
   Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
   replaceValue(I, *NewExt);
   ++NumVecCmpBO;
@@ -1013,19 +1013,36 @@
 /// Check if it is legal to scalarize a memory access to \p VecTy at index \p
 /// Idx. \p Idx must access a valid vector element.
-static ScalarizationResult canScalarizeAccess(FixedVectorType *VecTy,
-                                              Value *Idx, Instruction *CtxI,
+static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
+                                              Instruction *CtxI,
                                               AssumptionCache &AC,
                                               const DominatorTree &DT) {
+  // This is the number of elements of fixed vector types,
+  // or the minimum number of elements of scalable vector types.
+  uint64_t NumElements;
+
+  // We handle both fixed and scalable vector types here; scalable vectors
+  // are only supported on little-endian targets.
+  if (isa<FixedVectorType>(VecTy)) {
+    auto FixedVecTy = cast<FixedVectorType>(VecTy);
+    NumElements = FixedVecTy->getNumElements();
+  } else {
+    const DataLayout &DL = CtxI->getModule()->getDataLayout();
+    if (DL.isBigEndian())
+      return ScalarizationResult::unsafe();
+    auto ScalableVecTy = cast<ScalableVectorType>(VecTy);
+    NumElements = ScalableVecTy->getMinNumElements();
+  }
+
   if (auto *C = dyn_cast<ConstantInt>(Idx)) {
-    if (C->getValue().ult(VecTy->getNumElements()))
+    if (C->getValue().ult(NumElements))
       return ScalarizationResult::safe();
     return ScalarizationResult::unsafe();
   }
 
   unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
   APInt Zero(IntWidth, 0);
-  APInt MaxElts(IntWidth, VecTy->getNumElements());
+  APInt MaxElts(IntWidth, NumElements);
   ConstantRange ValidIndices(Zero, MaxElts);
   ConstantRange IdxRange(IntWidth, true);
@@ -1074,8 +1091,7 @@
 //   store i32 %b, i32* %1
 bool VectorCombine::foldSingleElementStore(Instruction &I) {
   auto *SI = cast<StoreInst>(&I);
-  if (!SI->isSimple() ||
-      !isa<FixedVectorType>(SI->getValueOperand()->getType()))
+  if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
     return false;
 
   // TODO: Combine more complicated patterns (multiple insert) by referencing
@@ -1089,7 +1105,7 @@
     return false;
 
   if (auto *Load = dyn_cast<LoadInst>(Source)) {
-    auto VecTy = cast<FixedVectorType>(SI->getValueOperand()->getType());
+    auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
     const DataLayout &DL = I.getModule()->getDataLayout();
     Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
     // Don't optimize for atomic/volatile load or store. Ensure memory is not
@@ -1136,9 +1152,8 @@
   if (LI->isVolatile() || !DL.typeSizeEqualsStoreSize(FixedVT))
     return false;
 
-  InstructionCost OriginalCost =
-      TTI.getMemoryOpCost(Instruction::Load, FixedVT, LI->getAlign(),
-                          LI->getPointerAddressSpace());
+  InstructionCost OriginalCost = TTI.getMemoryOpCost(
+      Instruction::Load, FixedVT, LI->getAlign(), LI->getPointerAddressSpace());
   InstructionCost ScalarizedCost = 0;
 
   Instruction *LastCheckedInst = LI;
@@ -1739,7 +1754,6 @@
     if (Opcode == Instruction::Store)
       MadeChange |= foldSingleElementStore(I);
 
-    // If this is an early pipeline invocation of this pass, we are done.
     if (TryEarlyFoldsOnly)
       return;
 
Index: llvm/test/Transforms/VectorCombine/load-insert-store.ll
===================================================================
--- llvm/test/Transforms/VectorCombine/load-insert-store.ll
+++ llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -47,12 +47,18 @@
 }
 
 define void @insert_store_vscale(ptr %q, i16 zeroext %s) {
-; CHECK-LABEL: @insert_store_vscale(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 8 x i16> [[TMP0]], i16 [[S:%.*]], i32 3
-; CHECK-NEXT:    store <vscale x 8 x i16> [[VECINS]], ptr [[Q]], align 16
-; CHECK-NEXT:    ret void
+; LE-LABEL: @insert_store_vscale(
+; LE-NEXT:  entry:
+; LE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 8 x i16>, ptr [[Q:%.*]], i32 0, i32 3
+; LE-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 2
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @insert_store_vscale(
+; BE-NEXT:  entry:
+; BE-NEXT:    [[TMP0:%.*]] = load <vscale x 8 x i16>, ptr [[Q:%.*]], align 16
+; BE-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 8 x i16> [[TMP0]], i16 [[S:%.*]], i32 3
+; BE-NEXT:    store <vscale x 8 x i16> [[VECINS]], ptr [[Q]], align 16
+; BE-NEXT:    ret void
 ;
 entry:
   %0 = load <vscale x 8 x i16>, ptr %q
@@ -247,14 +253,22 @@
 ; To verify the index is not a constant but valid by assume,
 ; for scalable vector types.
 define void @insert_store_vscale_nonconst_index_known_valid_by_assume(ptr %q, i8 zeroext %s, i32 %idx) {
-; CHECK-LABEL: @insert_store_vscale_nonconst_index_known_valid_by_assume(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX:%.*]], 4
-; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP]])
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX]]
-; CHECK-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
-; CHECK-NEXT:    ret void
+; LE-LABEL: @insert_store_vscale_nonconst_index_known_valid_by_assume(
+; LE-NEXT:  entry:
+; LE-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX:%.*]], 4
+; LE-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; LE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 16 x i8>, ptr [[Q:%.*]], i32 0, i32 [[IDX]]
+; LE-NEXT:    store i8 [[S:%.*]], ptr [[TMP0]], align 1
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @insert_store_vscale_nonconst_index_known_valid_by_assume(
+; BE-NEXT:  entry:
+; BE-NEXT:    [[CMP:%.*]] = icmp ult i32 [[IDX:%.*]], 4
+; BE-NEXT:    call void @llvm.assume(i1 [[CMP]])
+; BE-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
+; BE-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX]]
+; BE-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
+; BE-NEXT:    ret void
 ;
 entry:
   %cmp = icmp ult i32 %idx, 4
@@ -349,13 +363,20 @@
 ; To verify the index is not a constant but valid by and,
 ; for scalable vector types.
 define void @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_and(ptr %q, i8 zeroext %s, i32 noundef %idx) {
-; CHECK-LABEL: @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_and(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[IDX_CLAMPED:%.*]] = and i32 [[IDX:%.*]], 7
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX_CLAMPED]]
-; CHECK-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
-; CHECK-NEXT:    ret void
+; LE-LABEL: @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_and(
+; LE-NEXT:  entry:
+; LE-NEXT:    [[IDX_CLAMPED:%.*]] = and i32 [[IDX:%.*]], 7
+; LE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 16 x i8>, ptr [[Q:%.*]], i32 0, i32 [[IDX_CLAMPED]]
+; LE-NEXT:    store i8 [[S:%.*]], ptr [[TMP0]], align 1
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_and(
+; BE-NEXT:  entry:
+; BE-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
+; BE-NEXT:    [[IDX_CLAMPED:%.*]] = and i32 [[IDX:%.*]], 7
+; BE-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX_CLAMPED]]
+; BE-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
+; BE-NEXT:    ret void
 ;
 entry:
   %0 = load <vscale x 16 x i8>, ptr %q
@@ -491,13 +512,20 @@
 ; To verify the index is not a constant but valid by urem,
 ; for scalable vector types.
 define void @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_urem(ptr %q, i8 zeroext %s, i32 noundef %idx) {
-; CHECK-LABEL: @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_urem(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[IDX_CLAMPED:%.*]] = urem i32 [[IDX:%.*]], 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX_CLAMPED]]
-; CHECK-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
-; CHECK-NEXT:    ret void
+; LE-LABEL: @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_urem(
+; LE-NEXT:  entry:
+; LE-NEXT:    [[IDX_CLAMPED:%.*]] = urem i32 [[IDX:%.*]], 16
+; LE-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <vscale x 16 x i8>, ptr [[Q:%.*]], i32 0, i32 [[IDX_CLAMPED]]
+; LE-NEXT:    store i8 [[S:%.*]], ptr [[TMP0]], align 1
+; LE-NEXT:    ret void
+;
+; BE-LABEL: @insert_store_vscale_nonconst_index_known_noundef_and_valid_by_urem(
+; BE-NEXT:  entry:
+; BE-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
+; BE-NEXT:    [[IDX_CLAMPED:%.*]] = urem i32 [[IDX:%.*]], 16
+; BE-NEXT:    [[VECINS:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX_CLAMPED]]
+; BE-NEXT:    store <vscale x 16 x i8> [[VECINS]], ptr [[Q]], align 16
+; BE-NEXT:    ret void
 ;
 entry:
   %0 = load <vscale x 16 x i8>, ptr %q
@@ -818,6 +846,3 @@
 
 declare i32 @bar(i32, i1) readonly
 declare double @llvm.log2.f64(double)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; BE: {{.*}}
-; LE: {{.*}}
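
Note (not part of the patch): a minimal before/after sketch of the rewrite this change enables for scalable vectors, mirroring the LE check lines above. The function names are illustrative only, and it assumes the same <vscale x 8 x i16> shape as the first updated test; on big-endian targets canScalarizeAccess now bails out and the IR is left unchanged.

; Input: whole-vector load, single-lane insert, whole-vector store.
define void @sketch_before(ptr %q, i16 zeroext %s) {
entry:
  %0 = load <vscale x 8 x i16>, ptr %q
  %vecins = insertelement <vscale x 8 x i16> %0, i16 %s, i32 3
  store <vscale x 8 x i16> %vecins, ptr %q
  ret void
}

; Expected result on little-endian targets: foldSingleElementStore collapses the
; sequence into one scalar store through a GEP to the addressed lane in memory.
define void @sketch_after(ptr %q, i16 zeroext %s) {
entry:
  %0 = getelementptr inbounds <vscale x 8 x i16>, ptr %q, i32 0, i32 3
  store i16 %s, ptr %0, align 2
  ret void
}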