diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1559,10 +1559,29 @@
     if (LoadOp0 && LoadOp1 && Store) {
       // The store address must dominate the MatMul instruction, otherwise
       // we create invalid IR.
-      // FIXME: See if we can hoist the store address computation.
-      auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
-      if (AddrI && (!DT->dominates(AddrI, MatMul)))
-        return;
+      SetVector<Value *> WorkList;
+      WorkList.insert(Store->getOperand(1));
+      SmallVector<Instruction *> ToHoist;
+      for (unsigned I = 0; I != WorkList.size(); ++I) {
+        Value *Current = WorkList[I];
+        auto *CurrI = dyn_cast<Instruction>(Current);
+        if (!CurrI)
+          continue;
+        if (isa<PHINode>(CurrI))
+          return;
+        if (DT->dominates(CurrI, MatMul))
+          continue;
+        if (CurrI->mayHaveSideEffects() || CurrI->mayReadFromMemory())
+          return;
+        ToHoist.push_back(CurrI);
+        WorkList.insert(CurrI->op_begin(), CurrI->op_end());
+      }
+
+      sort(ToHoist, [this](Instruction *A, Instruction *B) {
+        return DT->dominates(A, B);
+      });
+      for (Instruction *I : ToHoist)
+        I->moveBefore(MatMul);
 
       emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
       return;
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
--- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll
@@ -10,29 +10,86 @@
 define void @multiply_can_hoist_cast(<4 x double>* noalias %A, <4 x double> * %B, [4 x double]* %C) {
 ; CHECK-LABEL: @multiply_can_hoist_cast(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
-; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
-; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B:%.*]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8
-; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B]], i64 0, i64 2
-; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
-; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT10]], <2 x double> [[TMP0]])
-; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT13]]
-; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP2]])
-; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast [4 x double]* [[C:%.*]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[VEC_CAST17]], align 8
-; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 0, i64 2
-; CHECK-NEXT: [[VEC_CAST19:%.*]] = bitcast double* [[VEC_GEP18]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[VEC_CAST19]], align 8
+; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint [4 x double]* [[C:%.*]] to i64
+; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32
+; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint <4 x double>* [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
+; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; CHECK: alias_cont:
+; CHECK-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 32
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
+; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; CHECK: copy:
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x double>* [[TMP2]] to i8*
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double>* [[B]] to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 32 dereferenceable(32) [[TMP3]], i8* noundef nonnull align 8 dereferenceable(32) [[TMP4]], i64 32, i1 false)
+; CHECK-NEXT: br label [[NO_ALIAS]]
+; CHECK: no_alias:
+; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x double>* [ [[B]], [[ENTRY:%.*]] ], [ [[B]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD3]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast double* [[TMP7]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST5]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST8:%.*]] = bitcast double* [[TMP8]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST8]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD6]], <1 x double> [[COL_LOAD9]], <1 x double> [[TMP6]])
+; CHECK-NEXT: [[VEC_CAST15:%.*]] = bitcast [4 x double]* [[C]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP9]], <1 x double>* [[VEC_CAST15]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[TMP10]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD18:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST17]], align 8
+; CHECK-NEXT: [[VEC_CAST20:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD21:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST20]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = fmul contract <1 x double> [[COL_LOAD18]], [[COL_LOAD21]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST26:%.*]] = bitcast double* [[TMP12]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD27:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST26]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST29:%.*]] = bitcast double* [[TMP13]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD30:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST29]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD27]], <1 x double> [[COL_LOAD30]], <1 x double> [[TMP11]])
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST36:%.*]] = bitcast double* [[TMP15]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP14]], <1 x double>* [[VEC_CAST36]], align 8
+; CHECK-NEXT: [[VEC_CAST38:%.*]] = bitcast <4 x double>* [[A]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD39:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST38]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST41:%.*]] = bitcast double* [[TMP16]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD42:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST41]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = fmul contract <1 x double> [[COL_LOAD39]], [[COL_LOAD42]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST47:%.*]] = bitcast double* [[TMP18]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD48:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST47]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST50:%.*]] = bitcast double* [[TMP19]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD51:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST50]], align 8
+; CHECK-NEXT: [[TMP20:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD48]], <1 x double> [[COL_LOAD51]], <1 x double> [[TMP17]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST57:%.*]] = bitcast double* [[TMP21]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP20]], <1 x double>* [[VEC_CAST57]], align 8
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST59:%.*]] = bitcast double* [[TMP22]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD60:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST59]], align 8
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST62:%.*]] = bitcast double* [[TMP23]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD63:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST62]], align 8
+; CHECK-NEXT: [[TMP24:%.*]] = fmul contract <1 x double> [[COL_LOAD60]], [[COL_LOAD63]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST68:%.*]] = bitcast double* [[TMP25]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD69:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST68]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST71:%.*]] = bitcast double* [[TMP26]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD72:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST71]], align 8
+; CHECK-NEXT: [[TMP27:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD69]], <1 x double> [[COL_LOAD72]], <1 x double> [[TMP24]])
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST78:%.*]] = bitcast double* [[TMP28]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP27]], <1 x double>* [[VEC_CAST78]], align 8
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -47,30 +104,87 @@
 define void @multiply_can_hoist_multiple_insts(<4 x double>* noalias %A, <4 x double> * %B, [4 x double]* %C) {
 ; CHECK-LABEL: @multiply_can_hoist_multiple_insts(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
-; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
-; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B:%.*]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8
-; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B]], i64 0, i64 2
-; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
-; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT10]], <2 x double> [[TMP0]])
-; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT13]]
-; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [4 x double], [4 x double]* [[C:%.*]], i64 2, i64 0
-; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[VEC_CAST17]], align 8
-; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 2, i64 2
-; CHECK-NEXT: [[VEC_CAST19:%.*]] = bitcast double* [[VEC_GEP18]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[VEC_CAST19]], align 8
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x double], [4 x double]* [[C:%.*]], i64 2
+; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint [4 x double]* [[GEP]] to i64
+; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32
+; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint <4 x double>* [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
+; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; CHECK: alias_cont:
+; CHECK-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 32
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
+; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; CHECK: copy:
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x double>* [[TMP2]] to i8*
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double>* [[B]] to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 32 dereferenceable(32) [[TMP3]], i8* noundef nonnull align 8 dereferenceable(32) [[TMP4]], i64 32, i1 false)
+; CHECK-NEXT: br label [[NO_ALIAS]]
+; CHECK: no_alias:
+; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x double>* [ [[B]], [[ENTRY:%.*]] ], [ [[B]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD3]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast double* [[TMP7]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST5]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST8:%.*]] = bitcast double* [[TMP8]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST8]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD6]], <1 x double> [[COL_LOAD9]], <1 x double> [[TMP6]])
+; CHECK-NEXT: [[VEC_CAST15:%.*]] = bitcast [4 x double]* [[GEP]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP9]], <1 x double>* [[VEC_CAST15]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[TMP10]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD18:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST17]], align 8
+; CHECK-NEXT: [[VEC_CAST20:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD21:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST20]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = fmul contract <1 x double> [[COL_LOAD18]], [[COL_LOAD21]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST26:%.*]] = bitcast double* [[TMP12]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD27:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST26]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST29:%.*]] = bitcast double* [[TMP13]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD30:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST29]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD27]], <1 x double> [[COL_LOAD30]], <1 x double> [[TMP11]])
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 2, i64 1
+; CHECK-NEXT: [[VEC_CAST36:%.*]] = bitcast double* [[TMP15]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP14]], <1 x double>* [[VEC_CAST36]], align 8
+; CHECK-NEXT: [[VEC_CAST38:%.*]] = bitcast <4 x double>* [[A]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD39:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST38]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST41:%.*]] = bitcast double* [[TMP16]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD42:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST41]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = fmul contract <1 x double> [[COL_LOAD39]], [[COL_LOAD42]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST47:%.*]] = bitcast double* [[TMP18]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD48:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST47]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST50:%.*]] = bitcast double* [[TMP19]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD51:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST50]], align 8
+; CHECK-NEXT: [[TMP20:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD48]], <1 x double> [[COL_LOAD51]], <1 x double> [[TMP17]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 2, i64 2
+; CHECK-NEXT: [[VEC_CAST57:%.*]] = bitcast double* [[TMP21]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP20]], <1 x double>* [[VEC_CAST57]], align 8
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST59:%.*]] = bitcast double* [[TMP22]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD60:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST59]], align 8
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST62:%.*]] = bitcast double* [[TMP23]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD63:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST62]], align 8
+; CHECK-NEXT: [[TMP24:%.*]] = fmul contract <1 x double> [[COL_LOAD60]], [[COL_LOAD63]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST68:%.*]] = bitcast double* [[TMP25]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD69:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST68]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST71:%.*]] = bitcast double* [[TMP26]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD72:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST71]], align 8
+; CHECK-NEXT: [[TMP27:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD69]], <1 x double> [[COL_LOAD72]], <1 x double> [[TMP24]])
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 2, i64 3
+; CHECK-NEXT: [[VEC_CAST78:%.*]] = bitcast double* [[TMP28]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP27]], <1 x double>* [[VEC_CAST78]], align 8
 ; CHECK-NEXT: ret void
 ;
 entry:
@@ -87,30 +201,87 @@
 define void @multiply_can_hoist_multiple_insts2(<4 x double>* noalias %A, <4 x double> * %B, [4 x double]* %C) {
 ; CHECK-LABEL: @multiply_can_hoist_multiple_insts2(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST]], align 8
-; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
-; CHECK-NEXT: [[VEC_CAST1:%.*]] = bitcast double* [[VEC_GEP]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST1]], align 8
-; CHECK-NEXT: [[VEC_CAST3:%.*]] = bitcast <4 x double>* [[B:%.*]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST3]], align 8
-; CHECK-NEXT: [[VEC_GEP5:%.*]] = getelementptr <4 x double>, <4 x double>* [[B]], i64 0, i64 2
-; CHECK-NEXT: [[VEC_CAST6:%.*]] = bitcast double* [[VEC_GEP5]] to <2 x double>*
-; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <2 x double>, <2 x double>* [[VEC_CAST6]], align 8
-; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
-; CHECK-NEXT: [[SPLAT_SPLAT10:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP1:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT10]], <2 x double> [[TMP0]])
-; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT13]]
-; CHECK-NEXT: [[SPLAT_SPLAT16:%.*]] = shufflevector <2 x double> [[COL_LOAD7]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD2]], <2 x double> [[SPLAT_SPLAT16]], <2 x double> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr [4 x double], [4 x double]* [[C:%.*]], i64 42, i64 0
-; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[TMP4]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP1]], <2 x double>* [[VEC_CAST17]], align 8
-; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 42, i64 2
-; CHECK-NEXT: [[VEC_CAST19:%.*]] = bitcast double* [[VEC_GEP18]] to <2 x double>*
-; CHECK-NEXT: store <2 x double> [[TMP3]], <2 x double>* [[VEC_CAST19]], align 8
+; CHECK-NEXT: [[GEP_179:%.*]] = getelementptr [4 x double], [4 x double]* [[C:%.*]], i64 42
+; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint [4 x double]* [[GEP_179]] to i64
+; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32
+; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint <4 x double>* [[B:%.*]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[STORE_END]], [[LOAD_BEGIN]]
+; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
+; CHECK: alias_cont:
+; CHECK-NEXT: [[LOAD_END:%.*]] = add nuw nsw i64 [[LOAD_BEGIN]], 32
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LOAD_END]], [[STORE_BEGIN]]
+; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
+; CHECK: copy:
+; CHECK-NEXT: [[TMP2:%.*]] = alloca <4 x double>, align 32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x double>* [[TMP2]] to i8*
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x double>* [[B]] to i8*
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 32 dereferenceable(32) [[TMP3]], i8* noundef nonnull align 8 dereferenceable(32) [[TMP4]], i64 32, i1 false)
+; CHECK-NEXT: br label [[NO_ALIAS]]
+; CHECK: no_alias:
+; CHECK-NEXT: [[TMP5:%.*]] = phi <4 x double>* [ [[B]], [[ENTRY:%.*]] ], [ [[B]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
+; CHECK-NEXT: [[VEC_CAST:%.*]] = bitcast <4 x double>* [[A:%.*]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST]], align 8
+; CHECK-NEXT: [[VEC_CAST2:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST2]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD3]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST5:%.*]] = bitcast double* [[TMP7]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST5]], align 8
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST8:%.*]] = bitcast double* [[TMP8]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST8]], align 8
+; CHECK-NEXT: [[TMP9:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD6]], <1 x double> [[COL_LOAD9]], <1 x double> [[TMP6]])
+; CHECK-NEXT: [[VEC_CAST15:%.*]] = bitcast [4 x double]* [[GEP_179]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP9]], <1 x double>* [[VEC_CAST15]], align 8
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST17:%.*]] = bitcast double* [[TMP10]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD18:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST17]], align 8
+; CHECK-NEXT: [[VEC_CAST20:%.*]] = bitcast <4 x double>* [[TMP5]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD21:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST20]], align 8
+; CHECK-NEXT: [[TMP11:%.*]] = fmul contract <1 x double> [[COL_LOAD18]], [[COL_LOAD21]]
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST26:%.*]] = bitcast double* [[TMP12]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD27:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST26]], align 8
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST29:%.*]] = bitcast double* [[TMP13]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD30:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST29]], align 8
+; CHECK-NEXT: [[TMP14:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD27]], <1 x double> [[COL_LOAD30]], <1 x double> [[TMP11]])
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 42, i64 1
+; CHECK-NEXT: [[VEC_CAST36:%.*]] = bitcast double* [[TMP15]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP14]], <1 x double>* [[VEC_CAST36]], align 8
+; CHECK-NEXT: [[VEC_CAST38:%.*]] = bitcast <4 x double>* [[A]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD39:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST38]], align 8
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST41:%.*]] = bitcast double* [[TMP16]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD42:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST41]], align 8
+; CHECK-NEXT: [[TMP17:%.*]] = fmul contract <1 x double> [[COL_LOAD39]], [[COL_LOAD42]]
+; CHECK-NEXT: [[TMP18:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST47:%.*]] = bitcast double* [[TMP18]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD48:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST47]], align 8
+; CHECK-NEXT: [[TMP19:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST50:%.*]] = bitcast double* [[TMP19]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD51:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST50]], align 8
+; CHECK-NEXT: [[TMP20:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD48]], <1 x double> [[COL_LOAD51]], <1 x double> [[TMP17]])
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 42, i64 2
+; CHECK-NEXT: [[VEC_CAST57:%.*]] = bitcast double* [[TMP21]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP20]], <1 x double>* [[VEC_CAST57]], align 8
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 1
+; CHECK-NEXT: [[VEC_CAST59:%.*]] = bitcast double* [[TMP22]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD60:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST59]], align 8
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 2
+; CHECK-NEXT: [[VEC_CAST62:%.*]] = bitcast double* [[TMP23]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD63:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST62]], align 8
+; CHECK-NEXT: [[TMP24:%.*]] = fmul contract <1 x double> [[COL_LOAD60]], [[COL_LOAD63]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr <4 x double>, <4 x double>* [[A]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST68:%.*]] = bitcast double* [[TMP25]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD69:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST68]], align 8
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr <4 x double>, <4 x double>* [[TMP5]], i64 0, i64 3
+; CHECK-NEXT: [[VEC_CAST71:%.*]] = bitcast double* [[TMP26]] to <1 x double>*
+; CHECK-NEXT: [[COL_LOAD72:%.*]] = load <1 x double>, <1 x double>* [[VEC_CAST71]], align 8
+; CHECK-NEXT: [[TMP27:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD69]], <1 x double> [[COL_LOAD72]], <1 x double> [[TMP24]])
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr [4 x double], [4 x double]* [[C]], i64 42, i64 3
+; CHECK-NEXT: [[VEC_CAST78:%.*]] = bitcast double* [[TMP28]] to <1 x double>*
+; CHECK-NEXT: store <1 x double> [[TMP27]], <1 x double>* [[VEC_CAST78]], align 8
 ; CHECK-NEXT: ret void
 ;
 entry: