diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -834,8 +834,17 @@
       if (SI->getAlign() < NSI->getAlign())
         NSI->setAlignment(SI->getAlign());
       replaceValue(I, *NSI);
+      auto *IEI = cast<InsertElementInst>(SI->getOperand(0));
       // Need to erase the store manually.
       I.eraseFromParent();
+      // Removing the insertelement instruction may unlock further load/extract
+      // scalarization opportunities.
+      if (IEI->use_empty())
+        IEI->eraseFromParent();
+      if (!Load->user_empty()) {
+        Instruction *UI = cast<Instruction>(*Load->user_begin());
+        scalarizeLoadExtract(*UI);
+      }
       return true;
     }
   }
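Roughly, this lets a single foldSingleElementStore visit finish the whole chain: once the store of the inserted element has been scalarized, the insertelement is dead; erasing it can leave the wide load with a lone extractelement user, which the follow-up scalarizeLoadExtract call rewrites into a scalar load. A minimal before/after sketch of that chain follows (a hypothetical function, not taken from the tests below; the leading GEP index types and alignments are best guesses modeled on the check lines):

  ; Before VectorCombine: wide load feeding one extract and one
  ; single-element insert that is stored back to the same address.
  define double @sketch(<4 x double>* %p, double %v) {
  entry:
    %lv = load <4 x double>, <4 x double>* %p, align 8
    %ext = extractelement <4 x double> %lv, i64 0
    %ins = insertelement <4 x double> %lv, double %v, i64 1
    store <4 x double> %ins, <4 x double>* %p, align 8
    ret double %ext
  }

  ; After: the store is scalarized, the dead insertelement is erased, and
  ; the remaining extract becomes a scalar load as well.
  define double @sketch(<4 x double>* %p, double %v) {
  entry:
    %gep.0 = getelementptr inbounds <4 x double>, <4 x double>* %p, i32 0, i64 0
    %ext = load double, double* %gep.0, align 8
    %gep.1 = getelementptr inbounds <4 x double>, <4 x double>* %p, i64 0, i64 1
    store double %v, double* %gep.1, align 8
    ret double %ext
  }

The PhaseOrdering test below exercises this chain after inlining and InstCombine cleanup, while the VectorCombine tests check the pass output directly.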
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll
@@ -20,13 +20,14 @@
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], 225
 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP6]])
 ; CHECK-NEXT: [[TMP7:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>*
-; CHECK-NEXT: [[TMP8:%.*]] = load <225 x double>, <225 x double>* [[TMP7]], align 8
-; CHECK-NEXT: [[MATRIXEXT4:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP5]]
+; CHECK-NEXT: [[MATRIXEXT4:%.*]] = load double, double* [[TMP8]], align 8
 ; CHECK-NEXT: [[MUL:%.*]] = fmul double [[MATRIXEXT]], [[MATRIXEXT4]]
-; CHECK-NEXT: [[MATRIXEXT7:%.*]] = extractelement <225 x double> [[TMP8]], i64 [[TMP1]]
-; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
-; CHECK-NEXT: store double [[SUB]], double* [[TMP9]], align 8
+; CHECK-NEXT: [[MATRIXEXT7:%.*]] = load double, double* [[TMP9]], align 8
+; CHECK-NEXT: [[SUB:%.*]] = fsub double [[MATRIXEXT7]], [[MUL]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP7]], i64 0, i64 [[TMP1]]
+; CHECK-NEXT: store double [[SUB]], double* [[TMP10]], align 8
 ; CHECK-NEXT: ret void
 ;
 entry:
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll
--- a/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extract-insert-store-scalarization.ll
@@ -6,18 +6,19 @@
 define void @load_extract_insert_store_const_idx(<225 x double>* %A) {
 ; CHECK-LABEL: @load_extract_insert_store_const_idx(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
-; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 0
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 1
+; CHECK-NEXT: [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
 ; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
-; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 1
+; CHECK-NEXT: [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
 ; CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1
-; CHECK-NEXT: store double [[SUB]], double* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 1
+; CHECK-NEXT: store double [[SUB]], double* [[TMP2]], align 8
 ; CHECK-NEXT: ret void
 ;
 entry:
   %lv = load <225 x double>, <225 x double>* %A, align 8
-  %ext.0 = extractelement <225 x double> %lv, i64 0
+  %ext.0 = extractelement <225 x double> %lv, i64 1
   %mul = fmul double 20.0, %ext.0
   %ext.1 = extractelement <225 x double> %lv, i64 1
   %sub = fsub double %ext.1, %mul
@@ -33,13 +34,14 @@
 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_1]])
 ; CHECK-NEXT: [[CMP_2:%.*]] = icmp ult i64 [[IDX_2:%.*]], 225
 ; CHECK-NEXT: call void @llvm.assume(i1 [[CMP_2]])
-; CHECK-NEXT: [[LV:%.*]] = load <225 x double>, <225 x double>* [[A:%.*]], align 8
-; CHECK-NEXT: [[EXT_0:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_1]]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A:%.*]], i32 0, i64 [[IDX_1]]
+; CHECK-NEXT: [[EXT_0:%.*]] = load double, double* [[TMP0]], align 8
 ; CHECK-NEXT: [[MUL:%.*]] = fmul double 2.000000e+01, [[EXT_0]]
-; CHECK-NEXT: [[EXT_1:%.*]] = extractelement <225 x double> [[LV]], i64 [[IDX_2]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i32 0, i64 [[IDX_2]]
+; CHECK-NEXT: [[EXT_1:%.*]] = load double, double* [[TMP1]], align 8
 ; CHECK-NEXT: [[SUB:%.*]] = fsub double [[EXT_1]], [[MUL]]
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
-; CHECK-NEXT: store double [[SUB]], double* [[TMP0]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[A]], i64 0, i64 [[IDX_1]]
+; CHECK-NEXT: store double [[SUB]], double* [[TMP2]], align 8
 ; CHECK-NEXT: ret void
 ;
 entry:
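Note: the RUN lines and target triple at the top of these test files fall outside the diff context and are not shown above. To reproduce the VectorCombine output locally, a header along the following lines is typical for this directory (an assumption; the actual files may use a different triple or pass spelling):

  ; RUN: opt -passes=vector-combine -S %s | FileCheck %s
  target triple = "arm64-apple-darwin"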