diff --git a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVGatherScatterLowering.cpp
@@ -338,9 +338,9 @@
   if (Ops[0]->getType()->isVectorTy())
     return std::make_pair(nullptr, nullptr);
 
-  // Make sure we're in a loop and it is in loop simplify form.
+  // Make sure we're in a loop and that it has a pre-header and a single latch.
   Loop *L = LI->getLoopFor(GEP->getParent());
-  if (!L || !L->isLoopSimplifyForm())
+  if (!L || !L->getLoopPreheader() || !L->getLoopLatch())
     return std::make_pair(nullptr, nullptr);
 
   Optional<unsigned> VecOperand;
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
@@ -877,243 +877,25 @@
 declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32 immarg, <16 x i1>)
 
 define void @gather_no_scalar_remainder(i8* noalias nocapture noundef %arg, i8* noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
-; V-LABEL: gather_no_scalar_remainder:
-; V: # %bb.0: # %bb
-; V-NEXT: slli a2, a2, 4
-; V-NEXT: beqz a2, .LBB13_3
-; V-NEXT: # %bb.1: # %bb2
-; V-NEXT: vsetivli zero, 16, e64, m4, ta, mu
-; V-NEXT: vid.v v8
-; V-NEXT: li a3, 5
-; V-NEXT: li a4, 16
-; V-NEXT: .LBB13_2: # %bb4
-; V-NEXT: # =>This Inner Loop Header: Depth=1
-; V-NEXT: vmul.vx v12, v8, a3
-; V-NEXT: vsetvli zero, zero, e8, mf2, ta, mu
-; V-NEXT: vluxei64.v v16, (a1), v12
-; V-NEXT: vle8.v v12, (a0)
-; V-NEXT: vadd.vv v12, v12, v16
-; V-NEXT: vse8.v v12, (a0)
-; V-NEXT: vsetvli zero, zero, e64, m4, ta, mu
-; V-NEXT: vadd.vx v8, v8, a4
-; V-NEXT: addi a2, a2, -16
-; V-NEXT: addi a0, a0, 16
-; V-NEXT: bnez a2, .LBB13_2
-; V-NEXT: .LBB13_3: # %bb16
-; V-NEXT: ret
-;
-; ZVE32F-LABEL: gather_no_scalar_remainder:
-; ZVE32F: # %bb.0: # %bb
-; ZVE32F-NEXT: addi sp, sp, -240
-; ZVE32F-NEXT: .cfi_def_cfa_offset 240
-; ZVE32F-NEXT: sd ra, 232(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s0, 224(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s1, 216(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s2, 208(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s3, 200(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s4, 192(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s5, 184(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s6, 176(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s7, 168(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s8, 160(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s9, 152(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s10, 144(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd s11, 136(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: .cfi_offset ra, -8
-; ZVE32F-NEXT: .cfi_offset s0, -16
-; ZVE32F-NEXT: .cfi_offset s1, -24
-; ZVE32F-NEXT: .cfi_offset s2, -32
-; ZVE32F-NEXT: .cfi_offset s3, -40
-; ZVE32F-NEXT: .cfi_offset s4, -48
-; ZVE32F-NEXT: .cfi_offset s5, -56
-; ZVE32F-NEXT: .cfi_offset s6, -64
-; ZVE32F-NEXT: .cfi_offset s7, -72
-; ZVE32F-NEXT: .cfi_offset s8, -80
-; ZVE32F-NEXT: .cfi_offset s9, -88
-; ZVE32F-NEXT: .cfi_offset s10, -96
-; ZVE32F-NEXT: .cfi_offset s11, -104
-; ZVE32F-NEXT: slli a2, a2, 4
-; ZVE32F-NEXT: beqz a2, .LBB13_3
-; ZVE32F-NEXT: # %bb.1: # %bb2
-; ZVE32F-NEXT: li a3, 0
-; ZVE32F-NEXT: li a4, 15
-; ZVE32F-NEXT: li a5, 14
-; ZVE32F-NEXT: li a6, 13
-; ZVE32F-NEXT: li a7, 12
-; ZVE32F-NEXT: li t0, 11
-; ZVE32F-NEXT: li t1, 10
-; ZVE32F-NEXT: li t2, 9
-; ZVE32F-NEXT: li t3, 8
-; ZVE32F-NEXT: li t4, 7
-; ZVE32F-NEXT: li t5, 6
-; ZVE32F-NEXT: li t6, 5
-; ZVE32F-NEXT: li s0, 4
-; ZVE32F-NEXT: li s1, 3
-; ZVE32F-NEXT: li s2, 2
-; ZVE32F-NEXT: li s3, 1
-; ZVE32F-NEXT: vsetivli zero, 16, e8, mf2, ta, mu
-; ZVE32F-NEXT: .LBB13_2: # %bb4
-; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
-; ZVE32F-NEXT: sd s0, 56(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd t0, 64(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd a5, 72(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd a4, 80(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd a3, 88(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd a0, 96(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: sd a2, 104(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: slli s4, a3, 2
-; ZVE32F-NEXT: add a0, s4, a3
-; ZVE32F-NEXT: sd a0, 48(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: slli s5, s3, 2
-; ZVE32F-NEXT: add a0, s5, s3
-; ZVE32F-NEXT: sd a0, 40(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: slli s6, s2, 2
-; ZVE32F-NEXT: add a0, s6, s2
-; ZVE32F-NEXT: sd a0, 32(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: slli s7, s1, 2
-; ZVE32F-NEXT: add a0, s7, s1
-; ZVE32F-NEXT: sd a0, 24(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: slli s8, s0, 2
-; ZVE32F-NEXT: add a0, s8, s0
-; ZVE32F-NEXT: slli s9, t6, 2
-; ZVE32F-NEXT: add a3, s9, t6
-; ZVE32F-NEXT: slli s10, t5, 2
-; ZVE32F-NEXT: add s10, s10, t5
-; ZVE32F-NEXT: slli s11, t4, 2
-; ZVE32F-NEXT: add s11, s11, t4
-; ZVE32F-NEXT: slli ra, t3, 2
-; ZVE32F-NEXT: add ra, ra, t3
-; ZVE32F-NEXT: slli s4, t2, 2
-; ZVE32F-NEXT: add s4, s4, t2
-; ZVE32F-NEXT: slli s5, t1, 2
-; ZVE32F-NEXT: add s5, s5, t1
-; ZVE32F-NEXT: slli s6, t0, 2
-; ZVE32F-NEXT: add s6, s6, t0
-; ZVE32F-NEXT: slli s7, a7, 2
-; ZVE32F-NEXT: add s7, s7, a7
-; ZVE32F-NEXT: slli s8, a6, 2
-; ZVE32F-NEXT: add s8, s8, a6
-; ZVE32F-NEXT: slli a2, a5, 2
-; ZVE32F-NEXT: add a2, a2, a5
-; ZVE32F-NEXT: slli s9, a4, 2
-; ZVE32F-NEXT: add s9, s9, a4
-; ZVE32F-NEXT: add a4, a1, s9
-; ZVE32F-NEXT: sd a4, 16(sp) # 8-byte Folded Spill
-; ZVE32F-NEXT: add a2, a1, a2
-; ZVE32F-NEXT: add s8, a1, s8
-; ZVE32F-NEXT: add s7, a1, s7
-; ZVE32F-NEXT: add s6, a1, s6
-; ZVE32F-NEXT: add s5, a1, s5
-; ZVE32F-NEXT: add s4, a1, s4
-; ZVE32F-NEXT: add ra, a1, ra
-; ZVE32F-NEXT: add s11, a1, s11
-; ZVE32F-NEXT: add s10, a1, s10
-; ZVE32F-NEXT: add a3, a1, a3
-; ZVE32F-NEXT: add a0, a1, a0
-; ZVE32F-NEXT: ld a4, 24(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: add a4, a1, a4
-; ZVE32F-NEXT: ld a5, 32(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: add a5, a1, a5
-; ZVE32F-NEXT: mv t0, a6
-; ZVE32F-NEXT: ld a6, 40(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: add a6, a1, a6
-; ZVE32F-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: add s9, a1, s0
-; ZVE32F-NEXT: lb s9, 0(s9)
-; ZVE32F-NEXT: lb a6, 0(a6)
-; ZVE32F-NEXT: lb a5, 0(a5)
-; ZVE32F-NEXT: lb a4, 0(a4)
-; ZVE32F-NEXT: lb a0, 0(a0)
-; ZVE32F-NEXT: lb a3, 0(a3)
-; ZVE32F-NEXT: lb s10, 0(s10)
-; ZVE32F-NEXT: lb s11, 0(s11)
-; ZVE32F-NEXT: lb ra, 0(ra)
-; ZVE32F-NEXT: lb s4, 0(s4)
-; ZVE32F-NEXT: lb s5, 0(s5)
-; ZVE32F-NEXT: lb s6, 0(s6)
-; ZVE32F-NEXT: lb s7, 0(s7)
-; ZVE32F-NEXT: lb s8, 0(s8)
-; ZVE32F-NEXT: lb a2, 0(a2)
-; ZVE32F-NEXT: mv s0, t6
-; ZVE32F-NEXT: mv t6, t5
-; ZVE32F-NEXT: mv t5, t4
-; ZVE32F-NEXT: mv t4, t3
-; ZVE32F-NEXT: mv t3, t2
-; ZVE32F-NEXT: mv t2, t1
-; ZVE32F-NEXT: mv t1, a7
-; ZVE32F-NEXT: ld a7, 16(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: lb a7, 0(a7)
-; ZVE32F-NEXT: sb s9, 112(sp)
-; ZVE32F-NEXT: sb a6, 113(sp)
-; ZVE32F-NEXT: mv a6, t0
-; ZVE32F-NEXT: ld t0, 64(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: sb a5, 114(sp)
-; ZVE32F-NEXT: ld a5, 72(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: sb a4, 115(sp)
-; ZVE32F-NEXT: ld a4, 80(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: sb a0, 116(sp)
-; ZVE32F-NEXT: ld a0, 96(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: sb a3, 117(sp)
-; ZVE32F-NEXT: ld a3, 88(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: sb s10, 118(sp)
-; ZVE32F-NEXT: sb s11, 119(sp)
-; ZVE32F-NEXT: sb ra, 120(sp)
-; ZVE32F-NEXT: sb s4, 121(sp)
-; ZVE32F-NEXT: sb s5, 122(sp)
-; ZVE32F-NEXT: sb s6, 123(sp)
-; ZVE32F-NEXT: sb s7, 124(sp)
-; ZVE32F-NEXT: sb s8, 125(sp)
-; ZVE32F-NEXT: sb a2, 126(sp)
-; ZVE32F-NEXT: ld a2, 104(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: sb a7, 127(sp)
-; ZVE32F-NEXT: mv a7, t1
-; ZVE32F-NEXT: mv t1, t2
-; ZVE32F-NEXT: mv t2, t3
-; ZVE32F-NEXT: mv t3, t4
-; ZVE32F-NEXT: mv t4, t5
-; ZVE32F-NEXT: mv t5, t6
-; ZVE32F-NEXT: mv t6, s0
-; ZVE32F-NEXT: ld s0, 56(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: addi s4, sp, 112
-; ZVE32F-NEXT: vle8.v v8, (s4)
-; ZVE32F-NEXT: vle8.v v9, (a0)
-; ZVE32F-NEXT: vadd.vv v8, v9, v8
-; ZVE32F-NEXT: vse8.v v8, (a0)
-; ZVE32F-NEXT: addi a3, a3, 16
-; ZVE32F-NEXT: addi s3, s3, 16
-; ZVE32F-NEXT: addi s2, s2, 16
-; ZVE32F-NEXT: addi s1, s1, 16
-; ZVE32F-NEXT: addi s0, s0, 16
-; ZVE32F-NEXT: addi t6, t6, 16
-; ZVE32F-NEXT: addi t5, t5, 16
-; ZVE32F-NEXT: addi t4, t4, 16
-; ZVE32F-NEXT: addi t3, t3, 16
-; ZVE32F-NEXT: addi t2, t2, 16
-; ZVE32F-NEXT: addi t1, t1, 16
-; ZVE32F-NEXT: addi t0, t0, 16
-; ZVE32F-NEXT: addi a7, a7, 16
-; ZVE32F-NEXT: addi a6, a6, 16
-; ZVE32F-NEXT: addi a5, a5, 16
-; ZVE32F-NEXT: addi a4, a4, 16
-; ZVE32F-NEXT: addi a2, a2, -16
-; ZVE32F-NEXT: addi a0, a0, 16
-; ZVE32F-NEXT: bnez a2, .LBB13_2
-; ZVE32F-NEXT: .LBB13_3: # %bb16
-; ZVE32F-NEXT: ld ra, 232(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s0, 224(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s1, 216(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s2, 208(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s3, 200(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s4, 192(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s5, 184(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s6, 176(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s7, 168(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s8, 160(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s9, 152(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s10, 144(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: ld s11, 136(sp) # 8-byte Folded Reload
-; ZVE32F-NEXT: addi sp, sp, 240
-; ZVE32F-NEXT: ret
+; CHECK-LABEL: gather_no_scalar_remainder:
+; CHECK: # %bb.0: # %bb
+; CHECK-NEXT: slli a2, a2, 4
+; CHECK-NEXT: beqz a2, .LBB13_3
+; CHECK-NEXT: # %bb.1: # %bb2
+; CHECK-NEXT: li a3, 5
+; CHECK-NEXT: vsetivli zero, 16, e8, mf2, ta, mu
+; CHECK-NEXT: .LBB13_2: # %bb4
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: vlse8.v v8, (a1), a3
+; CHECK-NEXT: vle8.v v9, (a0)
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: vse8.v v8, (a0)
+; CHECK-NEXT: addi a2, a2, -16
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: addi a1, a1, 80
+; CHECK-NEXT: bnez a2, .LBB13_2
+; CHECK-NEXT: .LBB13_3: # %bb16
+; CHECK-NEXT: ret
 bb:
   %i = shl i64 %arg2, 4
   %i3 = icmp eq i64 %i, 0
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll
@@ -867,10 +867,9 @@
 ; CHECK-NEXT: br label [[BB4:%.*]]
 ; CHECK: bb4:
 ; CHECK-NEXT: [[I5:%.*]] = phi i64 [ [[I13:%.*]], [[BB4]] ], [ 0, [[BB2]] ]
-; CHECK-NEXT: [[I6:%.*]] = phi <16 x i64> [ [[I14:%.*]], [[BB4]] ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BB2]] ]
-; CHECK-NEXT: [[I7:%.*]] = mul <16 x i64> [[I6]], <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
-; CHECK-NEXT: [[I8:%.*]] = getelementptr inbounds i8, i8* [[ARG1:%.*]], <16 x i64> [[I7]]
-; CHECK-NEXT: [[I9:%.*]] = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> [[I8]], i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
+; CHECK-NEXT: [[I6_SCALAR:%.*]] = phi i64 [ 0, [[BB2]] ], [ [[I14_SCALAR:%.*]], [[BB4]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, i8* [[ARG1:%.*]], i64 [[I6_SCALAR]]
+; CHECK-NEXT: [[I9:%.*]] = call <16 x i8> @llvm.riscv.masked.strided.load.v16i8.p0i8.i64(<16 x i8> undef, i8* [[TMP0]], i64 5, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT: [[I10:%.*]] = getelementptr inbounds i8, i8* [[ARG:%.*]], i64 [[I5]]
 ; CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[I10]] to <16 x i8>*
 ; CHECK-NEXT: [[I11:%.*]] = load <16 x i8>, <16 x i8>* [[CAST]], align 1
@@ -878,7 +877,7 @@
 ; CHECK-NEXT: [[CAST2:%.*]] = bitcast i8* [[I10]] to <16 x i8>*
 ; CHECK-NEXT: store <16 x i8> [[I12]], <16 x i8>* [[CAST2]], align 1
 ; CHECK-NEXT: [[I13]] = add nuw i64 [[I5]], 16
-; CHECK-NEXT: [[I14]] = add <16 x i64> [[I6]], <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT: [[I14_SCALAR]] = add i64 [[I6_SCALAR]], 80
 ; CHECK-NEXT: [[I15:%.*]] = icmp eq i64 [[I13]], [[I]]
 ; CHECK-NEXT: br i1 [[I15]], label [[BB16]], label [[BB4]]
 ; CHECK: bb16:
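
Context note (not part of the patch): `Loop::isLoopSimplifyForm()` requires a preheader, a single latch, and dedicated exits, while this rewrite only relies on the first two: the preheader receives the hoisted scalar base/stride setup, and the latch carries the scalar increment of the new pointer recurrence (the `I6_SCALAR`/`I14_SCALAR` lines in the updated checks). A minimal sketch contrasting the two guards, assuming only the standard `llvm/Analysis/LoopInfo.h` API:

```cpp
// Illustrative sketch only; the names oldGuard/newGuard are not from the patch.
#include "llvm/Analysis/LoopInfo.h"

using namespace llvm;

// Old guard: full loop-simplify form, i.e. a preheader, a single latch, and
// dedicated exits (every exit block's predecessors all lie inside the loop).
static bool oldGuard(const Loop *L) {
  return L && L->isLoopSimplifyForm();
}

// New guard: only what the strided-access rewrite needs -- a preheader to
// hoist the scalar start value and stride into, and a single latch to hold
// the scalar increment of the rewritten pointer recurrence.
static bool newGuard(const Loop *L) {
  return L && L->getLoopPreheader() && L->getLoopLatch();
}
```

With the relaxed guard, a loop like `gather_no_scalar_remainder` above, whose exit block `bb16` is also reached directly from the entry block (see the `beqz a2, .LBB13_3` in the old checks) and therefore has no dedicated exits, is no longer skipped; that is what the regenerated CHECK lines reflect.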