diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -271,9 +271,12 @@
   }

   unsigned getMaxInterleaveFactor(ElementCount VF) {
+    // Don't interleave if the loop has been vectorized with scalable vectors.
+    if (VF.isScalable())
+      return 1;
     // If the loop will not be vectorized, don't interleave the loop.
     // Let regular unroll to unroll the loop.
-    return VF.getKnownMinValue() == 1 ? 1 : ST->getMaxInterleaveFactor();
+    return VF.isScalar() ? 1 : ST->getMaxInterleaveFactor();
   }

   enum RISCVRegisterClass { GPRRC, FPRRC, VRRC };
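For reviewers: the new early return composes with the existing scalar-VF check, so the hook now answers 1 for both scalable and scalar VFs and only reports the subtarget maximum for fixed-width vector VFs. Below is a minimal standalone sketch of that decision table; the ElementCountMock type and the SubtargetMax parameter are illustrative stand-ins for llvm::ElementCount and ST->getMaxInterleaveFactor(), not LLVM code:

  #include <cassert>

  // Mock of the two llvm::ElementCount queries the hook relies on
  // (illustration only, not the real class).
  struct ElementCountMock {
    unsigned MinVal;  // known minimum element count of the chosen VF
    bool Scalable;    // true for scalable VFs such as <vscale x 4 x i32>
    bool isScalable() const { return Scalable; }
    bool isScalar() const { return !Scalable && MinVal == 1; }
  };

  // Mirrors the patched hook; the subtarget maximum is passed as a plain
  // parameter here instead of being read off the subtarget.
  unsigned getMaxInterleaveFactor(ElementCountMock VF, unsigned SubtargetMax) {
    // Scalable VFs already scale with the hardware vector length, so don't
    // interleave on top of them.
    if (VF.isScalable())
      return 1;
    // A scalar VF means the loop will not be vectorized; leave unrolling to
    // the regular unroller.
    return VF.isScalar() ? 1 : SubtargetMax;
  }

  int main() {
    assert(getMaxInterleaveFactor({4, true}, 2) == 1);   // vscale x 4: no interleave
    assert(getMaxInterleaveFactor({1, false}, 2) == 1);  // scalar: no interleave
    assert(getMaxInterleaveFactor({8, false}, 2) == 2);  // fixed 8 x: subtarget max
  }

Before this change a scalable VF with a known minimum greater than 1 fell through to ST->getMaxInterleaveFactor(); with it, every scalable VF takes the early return, which is what all of the test updates below reflect.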
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll
@@ -945,51 +945,32 @@
 ; CHECK-LABEL: @predicated_sdiv_by_minus_one(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 1
-; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP14]]
-; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP15]], align 1
-; CHECK-NEXT: [[TMP16:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 -128, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP17:%.*]] = icmp ne [[WIDE_LOAD1]], shufflevector ( insertelement ( poison, i8 -128, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP18:%.*]] = select [[TMP16]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP19:%.*]] = select [[TMP17]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP20:%.*]] = sdiv [[WIDE_LOAD]], [[TMP18]]
-; CHECK-NEXT: [[TMP21:%.*]] = sdiv [[WIDE_LOAD1]], [[TMP19]]
-; CHECK-NEXT: [[TMP22:%.*]] = xor [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[TMP23:%.*]] = xor [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
-; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP16]], [[TMP20]], [[WIDE_LOAD]]
-; CHECK-NEXT: [[PREDPHI2:%.*]] = select [[TMP17]], [[TMP21]], [[WIDE_LOAD1]]
-; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP12]], align 1
-; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 8
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP25]]
-; CHECK-NEXT: store [[PREDPHI2]], ptr [[TMP26]], align 1
-; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 16
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP28]]
-; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 1
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i8 -128, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = select [[TMP7]], shufflevector ( insertelement ( poison, i8 -1, i64 0), poison, zeroinitializer), shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[TMP9:%.*]] = sdiv [[WIDE_LOAD]], [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
+; CHECK-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], [[TMP9]], [[WIDE_LOAD]]
+; CHECK-NEXT: store [[PREDPHI]], ptr [[TMP6]], align 1
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll
@@ -13,63 +13,49 @@
 ; OUTLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; OUTLOOP: for.body.preheader:
 ; OUTLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
+; OUTLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 2
 ; OUTLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]
 ; OUTLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; OUTLOOP: vector.ph:
 ; OUTLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
+; OUTLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 2
 ; OUTLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]
 ; OUTLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; OUTLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
 ; OUTLOOP: vector.body:
 ; OUTLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
-; OUTLOOP-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
+; OUTLOOP-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
 ; OUTLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
-; OUTLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 2
-; OUTLOOP-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0
-; OUTLOOP-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 1
-; OUTLOOP-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]]
-; OUTLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
-; OUTLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[TMP9]]
-; OUTLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
-; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 2
-; OUTLOOP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; OUTLOOP-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
-; OUTLOOP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP14]]
-; OUTLOOP-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 2
-; OUTLOOP-NEXT: [[TMP16:%.*]] = sext [[WIDE_LOAD]] to
-; OUTLOOP-NEXT: [[TMP17:%.*]] = sext [[WIDE_LOAD2]] to
-; OUTLOOP-NEXT: [[TMP18]] = add [[VEC_PHI]], [[TMP16]]
-; OUTLOOP-NEXT: [[TMP19]] = add [[VEC_PHI1]], [[TMP17]]
-; OUTLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32()
-; OUTLOOP-NEXT: [[TMP21:%.*]] = mul i32 [[TMP20]], 4
-; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP21]]
-; OUTLOOP-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; OUTLOOP-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; OUTLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
+; OUTLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; OUTLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2
+; OUTLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to
+; OUTLOOP-NEXT: [[TMP8]] = add [[VEC_PHI]], [[TMP7]]
+; OUTLOOP-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
+; OUTLOOP-NEXT: [[TMP10:%.*]] = mul i32 [[TMP9]], 2
+; OUTLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP10]]
+; OUTLOOP-NEXT: [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; OUTLOOP-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; OUTLOOP: middle.block:
-; OUTLOOP-NEXT: [[BIN_RDX:%.*]] = add [[TMP19]], [[TMP18]]
-; OUTLOOP-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[BIN_RDX]])
+; OUTLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32( [[TMP8]])
 ; OUTLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; OUTLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; OUTLOOP: scalar.ph:
 ; OUTLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; OUTLOOP-NEXT: br label [[FOR_BODY:%.*]]
 ; OUTLOOP: for.body:
 ; OUTLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; OUTLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
 ; OUTLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; OUTLOOP-NEXT: [[TMP24:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP24]] to i32
+; OUTLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; OUTLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
 ; OUTLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
 ; OUTLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; OUTLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; OUTLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; OUTLOOP: for.cond.cleanup.loopexit:
-; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP23]], [[MIDDLE_BLOCK]] ]
+; OUTLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; OUTLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
 ; OUTLOOP: for.cond.cleanup:
 ; OUTLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
@@ -81,64 +67,49 @@
 ; INLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; INLOOP: for.body.preheader:
 ; INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 8
+; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 4
 ; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]]
 ; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; INLOOP: vector.ph:
 ; INLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 8
+; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 4
 ; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]]
 ; INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]]
 ; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]]
 ; INLOOP: vector.body:
 ; INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ]
-; INLOOP-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; INLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0
-; INLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4
-; INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0
-; INLOOP-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 1
-; INLOOP-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]]
-; INLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
-; INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[TMP9]]
-; INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i32 0
-; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 2
-; INLOOP-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; INLOOP-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; INLOOP-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP10]], i64 [[TMP14]]
-; INLOOP-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 2
-; INLOOP-NEXT: [[TMP16:%.*]] = sext [[WIDE_LOAD]] to
-; INLOOP-NEXT: [[TMP17:%.*]] = sext [[WIDE_LOAD2]] to
-; INLOOP-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]])
-; INLOOP-NEXT: [[TMP19]] = add i32 [[TMP18]], [[VEC_PHI]]
-; INLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP17]])
-; INLOOP-NEXT: [[TMP21]] = add i32 [[TMP20]], [[VEC_PHI1]]
-; INLOOP-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32()
-; INLOOP-NEXT: [[TMP23:%.*]] = mul i32 [[TMP22]], 8
-; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP23]]
-; INLOOP-NEXT: [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; INLOOP-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; INLOOP-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[X:%.*]], i32 [[TMP4]]
+; INLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0
+; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 2
+; INLOOP-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD]] to
+; INLOOP-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP7]])
+; INLOOP-NEXT: [[TMP9]] = add i32 [[TMP8]], [[VEC_PHI]]
+; INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32()
+; INLOOP-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4
+; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP11]]
+; INLOOP-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; INLOOP-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; INLOOP: middle.block:
-; INLOOP-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP21]], [[TMP19]]
 ; INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; INLOOP: scalar.ph:
 ; INLOOP-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; INLOOP-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; INLOOP-NEXT: br label [[FOR_BODY:%.*]]
 ; INLOOP: for.body:
 ; INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ]
 ; INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[X]], i32 [[I_08]]
-; INLOOP-NEXT: [[TMP25:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
-; INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP25]] to i32
+; INLOOP-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32
 ; INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]]
 ; INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
 ; INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]]
 ; INLOOP-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
 ; INLOOP: for.cond.cleanup.loopexit:
-; INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ]
+; INLOOP-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ]
 ; INLOOP-NEXT: br label [[FOR_COND_CLEANUP]]
 ; INLOOP: for.cond.cleanup:
 ; INLOOP-NEXT: [[R_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/lmul.ll
@@ -49,43 +49,28 @@
 ; LMUL2-LABEL: @load_store(
 ; LMUL2-NEXT: entry:
 ; LMUL2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL2-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; LMUL2-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; LMUL2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; LMUL2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; LMUL2: vector.ph:
 ; LMUL2-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL2-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; LMUL2-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; LMUL2-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; LMUL2-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; LMUL2-NEXT: br label [[VECTOR_BODY:%.*]]
 ; LMUL2: vector.body:
 ; LMUL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; LMUL2-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; LMUL2-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL2-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; LMUL2-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; LMUL2-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; LMUL2-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; LMUL2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
-; LMUL2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]
-; LMUL2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4
-; LMUL2-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL2-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
-; LMUL2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
-; LMUL2-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP15]], align 4
-; LMUL2-NEXT: [[TMP16:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
-; LMUL2-NEXT: [[TMP17:%.*]] = add [[WIDE_LOAD1]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
-; LMUL2-NEXT: store [[TMP16]], ptr [[TMP12]], align 4
-; LMUL2-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL2-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
-; LMUL2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP19]]
-; LMUL2-NEXT: store [[TMP17]], ptr [[TMP20]], align 4
-; LMUL2-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL2-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
-; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
-; LMUL2-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; LMUL2-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
+; LMUL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; LMUL2-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
+; LMUL2-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
+; LMUL2-NEXT: store [[TMP7]], ptr [[TMP6]], align 4
+; LMUL2-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; LMUL2-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; LMUL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; LMUL2-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL2-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; LMUL2: middle.block:
 ; LMUL2-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; LMUL2-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -107,43 +92,28 @@
 ; LMUL4-LABEL: @load_store(
 ; LMUL4-NEXT: entry:
 ; LMUL4-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL4-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
+; LMUL4-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; LMUL4-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; LMUL4-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; LMUL4: vector.ph:
 ; LMUL4-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL4-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
+; LMUL4-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
 ; LMUL4-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; LMUL4-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; LMUL4-NEXT: br label [[VECTOR_BODY:%.*]]
 ; LMUL4: vector.body:
 ; LMUL4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; LMUL4-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; LMUL4-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL4-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4
-; LMUL4-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; LMUL4-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; LMUL4-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; LMUL4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
-; LMUL4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]
-; LMUL4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4
-; LMUL4-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL4-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4
-; LMUL4-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
-; LMUL4-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP15]], align 4
-; LMUL4-NEXT: [[TMP16:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
-; LMUL4-NEXT: [[TMP17:%.*]] = add [[WIDE_LOAD1]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
-; LMUL4-NEXT: store [[TMP16]], ptr [[TMP12]], align 4
-; LMUL4-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL4-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 4
-; LMUL4-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP19]]
-; LMUL4-NEXT: store [[TMP17]], ptr [[TMP20]], align 4
-; LMUL4-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL4-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 8
-; LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
-; LMUL4-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; LMUL4-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
+; LMUL4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; LMUL4-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
+; LMUL4-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
+; LMUL4-NEXT: store [[TMP7]], ptr [[TMP6]], align 4
+; LMUL4-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; LMUL4-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
+; LMUL4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; LMUL4-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL4-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; LMUL4: middle.block:
 ; LMUL4-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; LMUL4-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -165,43 +135,28 @@
 ; LMUL8-LABEL: @load_store(
 ; LMUL8-NEXT: entry:
 ; LMUL8-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL8-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; LMUL8-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8
 ; LMUL8-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; LMUL8-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; LMUL8: vector.ph:
 ; LMUL8-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL8-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; LMUL8-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8
 ; LMUL8-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; LMUL8-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; LMUL8-NEXT: br label [[VECTOR_BODY:%.*]]
 ; LMUL8: vector.body:
 ; LMUL8-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; LMUL8-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; LMUL8-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL8-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8
-; LMUL8-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; LMUL8-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; LMUL8-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; LMUL8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
-; LMUL8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 [[TMP9]]
-; LMUL8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0
-; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4
-; LMUL8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL8-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8
-; LMUL8-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]]
-; LMUL8-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP15]], align 4
-; LMUL8-NEXT: [[TMP16:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
-; LMUL8-NEXT: [[TMP17:%.*]] = add [[WIDE_LOAD1]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
-; LMUL8-NEXT: store [[TMP16]], ptr [[TMP12]], align 4
-; LMUL8-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL8-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8
-; LMUL8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP19]]
-; LMUL8-NEXT: store [[TMP17]], ptr [[TMP20]], align 4
-; LMUL8-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; LMUL8-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16
-; LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
-; LMUL8-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; LMUL8-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL8-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 [[TMP4]]
+; LMUL8-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 0
+; LMUL8-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
+; LMUL8-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer)
+; LMUL8-NEXT: store [[TMP7]], ptr [[TMP6]], align 4
+; LMUL8-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; LMUL8-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8
+; LMUL8-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; LMUL8-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL8-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; LMUL8: middle.block:
 ; LMUL8-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; LMUL8-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/mask-index-type.ll
@@ -12,12 +12,12 @@
 ; VLENUNK-LABEL: @test(
 ; VLENUNK-NEXT: entry:
 ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK: vector.ph:
 ; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT: [[TMP4:%.*]] = call @llvm.experimental.stepvector.nxv2i64()
@@ -31,49 +31,27 @@
 ; VLENUNK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer
 ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0
 ; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement poison, i32 [[V]], i64 0
-; VLENUNK-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector [[BROADCAST_SPLATINSERT4]], poison, zeroinitializer
 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; VLENUNK: vector.body:
 ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VLENUNK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; VLENUNK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]]
 ; VLENUNK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 0
-; VLENUNK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2
-; VLENUNK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 0
-; VLENUNK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 1
-; VLENUNK-NEXT: [[TMP15:%.*]] = add i64 [[INDEX]], [[TMP14]]
-; VLENUNK-NEXT: [[TMP16:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer)
-; VLENUNK-NEXT: [[TMP17:%.*]] = icmp ult [[STEP_ADD]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer)
-; VLENUNK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]]
-; VLENUNK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP15]]
-; VLENUNK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP18]], i32 0
-; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0(ptr [[TMP20]], i32 4, [[TMP16]], poison)
-; VLENUNK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 2
-; VLENUNK-NEXT: [[TMP23:%.*]] = getelementptr i32, ptr [[TMP18]], i64 [[TMP22]]
-; VLENUNK-NEXT: [[WIDE_MASKED_LOAD2:%.*]] = call @llvm.masked.load.nxv2i32.p0(ptr [[TMP23]], i32 4, [[TMP17]], poison)
-; VLENUNK-NEXT: [[TMP24:%.*]] = xor [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
-; VLENUNK-NEXT: [[TMP25:%.*]] = xor [[TMP17]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
-; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP24]], zeroinitializer, [[WIDE_MASKED_LOAD]]
-; VLENUNK-NEXT: [[PREDPHI3:%.*]] = select [[TMP25]], zeroinitializer, [[WIDE_MASKED_LOAD2]]
-; VLENUNK-NEXT: [[TMP26:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]]
-; VLENUNK-NEXT: [[TMP27:%.*]] = add [[PREDPHI3]], [[BROADCAST_SPLAT5]]
-; VLENUNK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP10]]
-; VLENUNK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP15]]
-; VLENUNK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 0
-; VLENUNK-NEXT: store [[TMP26]], ptr [[TMP30]], align 4
-; VLENUNK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 2
-; VLENUNK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i64 [[TMP32]]
-; VLENUNK-NEXT: store [[TMP27]], ptr [[TMP33]], align 4
-; VLENUNK-NEXT: [[TMP34:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP35:%.*]] = mul i64 [[TMP34]], 4
-; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP35]]
-; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[STEP_ADD]], [[DOTSPLAT]]
-; VLENUNK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT: br i1 [[TMP36]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; VLENUNK-NEXT: [[TMP11:%.*]] = icmp ult [[VEC_IND]], shufflevector ( insertelement ( poison, i64 512, i64 0), poison, zeroinitializer)
+; VLENUNK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 [[TMP10]]
+; VLENUNK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; VLENUNK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i32.p0(ptr [[TMP13]], i32 4, [[TMP11]], poison)
+; VLENUNK-NEXT: [[TMP14:%.*]] = xor [[TMP11]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)
+; VLENUNK-NEXT: [[PREDPHI:%.*]] = select [[TMP14]], zeroinitializer, [[WIDE_MASKED_LOAD]]
+; VLENUNK-NEXT: [[TMP15:%.*]] = add [[PREDPHI]], [[BROADCAST_SPLAT]]
+; VLENUNK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP10]]
+; VLENUNK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
+; VLENUNK-NEXT: store [[TMP15]], ptr [[TMP17]], align 4
+; VLENUNK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
+; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; VLENUNK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]]
+; VLENUNK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; VLENUNK: middle.block:
 ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -106,7 +106,7 @@
 ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT: LV: Loop cost is 23
-; CHECK-NEXT: LV: IC is 2
+; CHECK-NEXT: LV: IC is 1
 ; CHECK-NEXT: LV: VF is vscale x 4
 ; CHECK-NEXT: LV: Not Interleaving.
 ; CHECK-NEXT: LV: Interleaving is not beneficial.
@@ -238,7 +238,7 @@
 ; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
 ; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
 ; CHECK-NEXT: LV: Loop cost is 23
-; CHECK-NEXT: LV: IC is 2
+; CHECK-NEXT: LV: IC is 1
 ; CHECK-NEXT: LV: VF is vscale x 4
 ; CHECK-NEXT: LV: Not Interleaving.
 ; CHECK-NEXT: LV: Interleaving is not beneficial.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll
@@ -121,47 +121,30 @@
 ; VLENUNK-LABEL: @vector_add_i32(
 ; VLENUNK-NEXT: entry:
 ; VLENUNK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; VLENUNK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; VLENUNK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLENUNK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLENUNK: vector.ph:
 ; VLENUNK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; VLENUNK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; VLENUNK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLENUNK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0
 ; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[V]], i64 0
-; VLENUNK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer
 ; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; VLENUNK: vector.body:
 ; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VLENUNK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; VLENUNK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; VLENUNK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; VLENUNK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; VLENUNK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; VLENUNK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
-; VLENUNK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
-; VLENUNK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
-; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4
-; VLENUNK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
-; VLENUNK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]]
-; VLENUNK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP15]], align 4
-; VLENUNK-NEXT: [[TMP16:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; VLENUNK-NEXT: [[TMP17:%.*]] = add [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
-; VLENUNK-NEXT: store [[TMP16]], ptr [[TMP12]], align 4
-; VLENUNK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
-; VLENUNK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP19]]
-; VLENUNK-NEXT: store [[TMP17]], ptr [[TMP20]], align 4
-; VLENUNK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; VLENUNK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
-; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
-; VLENUNK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLENUNK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
+; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
+; VLENUNK-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; VLENUNK-NEXT: store [[TMP7]], ptr [[TMP6]], align 4
+; VLENUNK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; VLENUNK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VLENUNK: middle.block:
 ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -183,47 +166,30 @@
 ; VLEN128-LABEL: @vector_add_i32(
 ; VLEN128-NEXT: entry:
 ; VLEN128-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; VLEN128-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; VLEN128-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
 ; VLEN128-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; VLEN128: vector.ph:
 ; VLEN128-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; VLEN128-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
 ; VLEN128-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
 ; VLEN128-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; VLEN128-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[V:%.*]], i64 0
 ; VLEN128-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer
-; VLEN128-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[V]], i64 0
-; VLEN128-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer
 ; VLEN128-NEXT: br label [[VECTOR_BODY:%.*]]
 ; VLEN128: vector.body:
 ; VLEN128-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; VLEN128-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
-; VLEN128-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
-; VLEN128-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0
-; VLEN128-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
-; VLEN128-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], [[TMP8]]
-; VLEN128-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
-; VLEN128-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP9]]
-; VLEN128-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
-; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4
-; VLEN128-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
-; VLEN128-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP14]]
-; VLEN128-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP15]], align 4
-; VLEN128-NEXT: [[TMP16:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; VLEN128-NEXT: [[TMP17:%.*]] = add [[WIDE_LOAD1]], [[BROADCAST_SPLAT3]]
-; VLEN128-NEXT: store [[TMP16]], ptr [[TMP12]], align 4
-; VLEN128-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
-; VLEN128-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i64 [[TMP19]]
-; VLEN128-NEXT: store [[TMP17]], ptr [[TMP20]], align 4
-; VLEN128-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; VLEN128-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 4
-; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
-; VLEN128-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; VLEN128-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; VLEN128-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP4]]
+; VLEN128-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; VLEN128-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 4
+; VLEN128-NEXT: [[TMP7:%.*]] = add [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; VLEN128-NEXT: store [[TMP7]], ptr [[TMP6]], align 4
+; VLEN128-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; VLEN128-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2
+; VLEN128-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]]
+; VLEN128-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; VLEN128-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; VLEN128: middle.block:
 ; VLEN128-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; VLEN128-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
@@ -2,7 +2,8 @@
 ; RUN: -riscv-v-vector-bits-min=128 -riscv-v-vector-bits-max=128 \
 ; RUN: -pass-remarks=loop-vectorize -pass-remarks-analysis=loop-vectorize \
 ; RUN: -pass-remarks-missed=loop-vectorize -mtriple riscv64-linux-gnu \
-; RUN: -mattr=+v,+f -S 2>%t | FileCheck %s -check-prefix=CHECK
+; RUN: -force-target-max-vector-interleave=2 -mattr=+v,+f -S 2>%t \
+; RUN: | FileCheck %s -check-prefix=CHECK
 ; RUN: cat %t | FileCheck %s -check-prefix=CHECK-REMARK

 ; Reduction can be vectorized
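The switched CHECK lines in riscv-vector-reverse.ll ("LV: IC is 2" becoming "LV: IC is 1", followed by "LV: Not Interleaving.") are the downstream effect of the hook: the vectorizer clamps whatever interleave count its cost model would otherwise pick to the target's maximum. A simplified sketch of that clamp; both inputs are illustrative assumptions, and the real selection in LoopVectorize's selectInterleaveCount also weighs register pressure, trip count, and loop cost:

  #include <algorithm>
  #include <cstdio>

  // CostModelIC is the count the cost model would pick on its own (2 in the
  // logs above); TargetMaxIC is the answer from getMaxInterleaveFactor,
  // which is now 1 for scalable VFs on RISC-V.
  unsigned selectInterleaveCount(unsigned CostModelIC, unsigned TargetMaxIC) {
    return std::max(1u, std::min(CostModelIC, TargetMaxIC));
  }

  int main() {
    unsigned IC = selectInterleaveCount(/*CostModelIC=*/2, /*TargetMaxIC=*/1);
    std::printf("LV: IC is %u\n", IC);  // was 2 before the patch, 1 after
    if (IC == 1)
      std::puts("LV: Not Interleaving.");
  }

Tests that still want to exercise the interleaved scalable-vector codepaths can override the target's answer from the command line, which is what the added -force-target-max-vector-interleave=2 in the scalable-reductions.ll RUN line above does.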