diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll @@ -872,3 +872,267 @@ %47 = icmp eq i32 %46, 1024 br i1 %47, label %36, label %37 } + +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32 immarg, <16 x i1>, <16 x i8>) +declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32 immarg, <16 x i1>) + +define void @gather_no_scalar_remainder(i8* noalias nocapture noundef %0, i8* noalias nocapture noundef readonly %1, i64 noundef %2) { +; V-LABEL: gather_no_scalar_remainder: +; V: # %bb.0: +; V-NEXT: slli a2, a2, 4 +; V-NEXT: beqz a2, .LBB13_3 +; V-NEXT: # %bb.1: # %.preheader +; V-NEXT: vsetivli zero, 16, e64, m4, ta, mu +; V-NEXT: vid.v v8 +; V-NEXT: li a3, 5 +; V-NEXT: li a4, 16 +; V-NEXT: .LBB13_2: # =>This Inner Loop Header: Depth=1 +; V-NEXT: vmul.vx v12, v8, a3 +; V-NEXT: vsetvli zero, zero, e8, mf2, ta, mu +; V-NEXT: vluxei64.v v16, (a1), v12 +; V-NEXT: vle8.v v12, (a0) +; V-NEXT: vadd.vv v12, v12, v16 +; V-NEXT: vse8.v v12, (a0) +; V-NEXT: vsetvli zero, zero, e64, m4, ta, mu +; V-NEXT: vadd.vx v8, v8, a4 +; V-NEXT: addi a2, a2, -16 +; V-NEXT: addi a0, a0, 16 +; V-NEXT: bnez a2, .LBB13_2 +; V-NEXT: .LBB13_3: +; V-NEXT: ret +; +; ZVE32F-LABEL: gather_no_scalar_remainder: +; ZVE32F: # %bb.0: +; ZVE32F-NEXT: addi sp, sp, -240 +; ZVE32F-NEXT: .cfi_def_cfa_offset 240 +; ZVE32F-NEXT: sd ra, 232(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s0, 224(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s1, 216(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s2, 208(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s3, 200(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s4, 192(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s5, 184(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s6, 176(sp) # 8-byte Folded Spill +; 
ZVE32F-NEXT: sd s7, 168(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s8, 160(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s9, 152(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s10, 144(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd s11, 136(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: .cfi_offset ra, -8 +; ZVE32F-NEXT: .cfi_offset s0, -16 +; ZVE32F-NEXT: .cfi_offset s1, -24 +; ZVE32F-NEXT: .cfi_offset s2, -32 +; ZVE32F-NEXT: .cfi_offset s3, -40 +; ZVE32F-NEXT: .cfi_offset s4, -48 +; ZVE32F-NEXT: .cfi_offset s5, -56 +; ZVE32F-NEXT: .cfi_offset s6, -64 +; ZVE32F-NEXT: .cfi_offset s7, -72 +; ZVE32F-NEXT: .cfi_offset s8, -80 +; ZVE32F-NEXT: .cfi_offset s9, -88 +; ZVE32F-NEXT: .cfi_offset s10, -96 +; ZVE32F-NEXT: .cfi_offset s11, -104 +; ZVE32F-NEXT: slli a2, a2, 4 +; ZVE32F-NEXT: beqz a2, .LBB13_3 +; ZVE32F-NEXT: # %bb.1: # %.preheader +; ZVE32F-NEXT: li a3, 0 +; ZVE32F-NEXT: li a4, 15 +; ZVE32F-NEXT: li a5, 14 +; ZVE32F-NEXT: li a6, 13 +; ZVE32F-NEXT: li a7, 12 +; ZVE32F-NEXT: li t0, 11 +; ZVE32F-NEXT: li t1, 10 +; ZVE32F-NEXT: li t2, 9 +; ZVE32F-NEXT: li t3, 8 +; ZVE32F-NEXT: li t4, 7 +; ZVE32F-NEXT: li t5, 6 +; ZVE32F-NEXT: li t6, 5 +; ZVE32F-NEXT: li s0, 4 +; ZVE32F-NEXT: li s1, 3 +; ZVE32F-NEXT: li s2, 2 +; ZVE32F-NEXT: li s3, 1 +; ZVE32F-NEXT: vsetivli zero, 16, e8, mf2, ta, mu +; ZVE32F-NEXT: .LBB13_2: # =>This Inner Loop Header: Depth=1 +; ZVE32F-NEXT: sd s0, 56(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd t0, 64(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd a5, 72(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd a4, 80(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd a3, 88(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd a0, 96(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: sd a2, 104(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: slli s4, a3, 2 +; ZVE32F-NEXT: add a0, s4, a3 +; ZVE32F-NEXT: sd a0, 48(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: slli s5, s3, 2 +; ZVE32F-NEXT: add a0, s5, s3 +; ZVE32F-NEXT: sd a0, 40(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: slli s6, s2, 2 +; ZVE32F-NEXT: 
add a0, s6, s2 +; ZVE32F-NEXT: sd a0, 32(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: slli s7, s1, 2 +; ZVE32F-NEXT: add a0, s7, s1 +; ZVE32F-NEXT: sd a0, 24(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: slli s8, s0, 2 +; ZVE32F-NEXT: add a0, s8, s0 +; ZVE32F-NEXT: slli s9, t6, 2 +; ZVE32F-NEXT: add a3, s9, t6 +; ZVE32F-NEXT: slli s10, t5, 2 +; ZVE32F-NEXT: add s10, s10, t5 +; ZVE32F-NEXT: slli s11, t4, 2 +; ZVE32F-NEXT: add s11, s11, t4 +; ZVE32F-NEXT: slli ra, t3, 2 +; ZVE32F-NEXT: add ra, ra, t3 +; ZVE32F-NEXT: slli s4, t2, 2 +; ZVE32F-NEXT: add s4, s4, t2 +; ZVE32F-NEXT: slli s5, t1, 2 +; ZVE32F-NEXT: add s5, s5, t1 +; ZVE32F-NEXT: slli s6, t0, 2 +; ZVE32F-NEXT: add s6, s6, t0 +; ZVE32F-NEXT: slli s7, a7, 2 +; ZVE32F-NEXT: add s7, s7, a7 +; ZVE32F-NEXT: slli s8, a6, 2 +; ZVE32F-NEXT: add s8, s8, a6 +; ZVE32F-NEXT: slli a2, a5, 2 +; ZVE32F-NEXT: add a2, a2, a5 +; ZVE32F-NEXT: slli s9, a4, 2 +; ZVE32F-NEXT: add s9, s9, a4 +; ZVE32F-NEXT: add a4, a1, s9 +; ZVE32F-NEXT: sd a4, 16(sp) # 8-byte Folded Spill +; ZVE32F-NEXT: add a2, a1, a2 +; ZVE32F-NEXT: add s8, a1, s8 +; ZVE32F-NEXT: add s7, a1, s7 +; ZVE32F-NEXT: add s6, a1, s6 +; ZVE32F-NEXT: add s5, a1, s5 +; ZVE32F-NEXT: add s4, a1, s4 +; ZVE32F-NEXT: add ra, a1, ra +; ZVE32F-NEXT: add s11, a1, s11 +; ZVE32F-NEXT: add s10, a1, s10 +; ZVE32F-NEXT: add a3, a1, a3 +; ZVE32F-NEXT: add a0, a1, a0 +; ZVE32F-NEXT: ld a4, 24(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: add a4, a1, a4 +; ZVE32F-NEXT: ld a5, 32(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: add a5, a1, a5 +; ZVE32F-NEXT: mv t0, a6 +; ZVE32F-NEXT: ld a6, 40(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: add a6, a1, a6 +; ZVE32F-NEXT: ld s0, 48(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: add s9, a1, s0 +; ZVE32F-NEXT: lb s9, 0(s9) +; ZVE32F-NEXT: lb a6, 0(a6) +; ZVE32F-NEXT: lb a5, 0(a5) +; ZVE32F-NEXT: lb a4, 0(a4) +; ZVE32F-NEXT: lb a0, 0(a0) +; ZVE32F-NEXT: lb a3, 0(a3) +; ZVE32F-NEXT: lb s10, 0(s10) +; ZVE32F-NEXT: lb s11, 0(s11) +; ZVE32F-NEXT: lb ra, 0(ra) +; 
ZVE32F-NEXT: lb s4, 0(s4) +; ZVE32F-NEXT: lb s5, 0(s5) +; ZVE32F-NEXT: lb s6, 0(s6) +; ZVE32F-NEXT: lb s7, 0(s7) +; ZVE32F-NEXT: lb s8, 0(s8) +; ZVE32F-NEXT: lb a2, 0(a2) +; ZVE32F-NEXT: mv s0, t6 +; ZVE32F-NEXT: mv t6, t5 +; ZVE32F-NEXT: mv t5, t4 +; ZVE32F-NEXT: mv t4, t3 +; ZVE32F-NEXT: mv t3, t2 +; ZVE32F-NEXT: mv t2, t1 +; ZVE32F-NEXT: mv t1, a7 +; ZVE32F-NEXT: ld a7, 16(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: lb a7, 0(a7) +; ZVE32F-NEXT: sb s9, 112(sp) +; ZVE32F-NEXT: sb a6, 113(sp) +; ZVE32F-NEXT: mv a6, t0 +; ZVE32F-NEXT: ld t0, 64(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: sb a5, 114(sp) +; ZVE32F-NEXT: ld a5, 72(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: sb a4, 115(sp) +; ZVE32F-NEXT: ld a4, 80(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: sb a0, 116(sp) +; ZVE32F-NEXT: ld a0, 96(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: sb a3, 117(sp) +; ZVE32F-NEXT: ld a3, 88(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: sb s10, 118(sp) +; ZVE32F-NEXT: sb s11, 119(sp) +; ZVE32F-NEXT: sb ra, 120(sp) +; ZVE32F-NEXT: sb s4, 121(sp) +; ZVE32F-NEXT: sb s5, 122(sp) +; ZVE32F-NEXT: sb s6, 123(sp) +; ZVE32F-NEXT: sb s7, 124(sp) +; ZVE32F-NEXT: sb s8, 125(sp) +; ZVE32F-NEXT: sb a2, 126(sp) +; ZVE32F-NEXT: ld a2, 104(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: sb a7, 127(sp) +; ZVE32F-NEXT: mv a7, t1 +; ZVE32F-NEXT: mv t1, t2 +; ZVE32F-NEXT: mv t2, t3 +; ZVE32F-NEXT: mv t3, t4 +; ZVE32F-NEXT: mv t4, t5 +; ZVE32F-NEXT: mv t5, t6 +; ZVE32F-NEXT: mv t6, s0 +; ZVE32F-NEXT: ld s0, 56(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: addi s4, sp, 112 +; ZVE32F-NEXT: vle8.v v8, (s4) +; ZVE32F-NEXT: vle8.v v9, (a0) +; ZVE32F-NEXT: vadd.vv v8, v9, v8 +; ZVE32F-NEXT: vse8.v v8, (a0) +; ZVE32F-NEXT: addi a3, a3, 16 +; ZVE32F-NEXT: addi s3, s3, 16 +; ZVE32F-NEXT: addi s2, s2, 16 +; ZVE32F-NEXT: addi s1, s1, 16 +; ZVE32F-NEXT: addi s0, s0, 16 +; ZVE32F-NEXT: addi t6, t6, 16 +; ZVE32F-NEXT: addi t5, t5, 16 +; ZVE32F-NEXT: addi t4, t4, 16 +; ZVE32F-NEXT: addi t3, t3, 16 +; ZVE32F-NEXT: addi t2, t2, 16 +; 
ZVE32F-NEXT: addi t1, t1, 16 +; ZVE32F-NEXT: addi t0, t0, 16 +; ZVE32F-NEXT: addi a7, a7, 16 +; ZVE32F-NEXT: addi a6, a6, 16 +; ZVE32F-NEXT: addi a5, a5, 16 +; ZVE32F-NEXT: addi a4, a4, 16 +; ZVE32F-NEXT: addi a2, a2, -16 +; ZVE32F-NEXT: addi a0, a0, 16 +; ZVE32F-NEXT: bnez a2, .LBB13_2 +; ZVE32F-NEXT: .LBB13_3: +; ZVE32F-NEXT: ld ra, 232(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s0, 224(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s1, 216(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s2, 208(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s3, 200(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s4, 192(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s5, 184(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s6, 176(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s7, 168(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s8, 160(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s9, 152(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s10, 144(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: ld s11, 136(sp) # 8-byte Folded Reload +; ZVE32F-NEXT: addi sp, sp, 240 +; ZVE32F-NEXT: ret + %4 = shl i64 %2, 4 + %5 = icmp eq i64 %4, 0 + br i1 %5, label %18, label %6 + +6: ; preds = %3, %6 + %7 = phi i64 [ %15, %6 ], [ 0, %3 ] + %8 = phi <16 x i64> [ %16, %6 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %3 ] + %9 = mul <16 x i64> %8, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5> + %10 = getelementptr inbounds i8, i8* %1, <16 x i64> %9 + %11 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %10, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) + %12 = getelementptr inbounds i8, i8* %0, i64 %7 + %cast = bitcast i8* %12 to <16 x i8>* + %13 = load <16 x i8>, <16 x i8>* %cast, align 1 + %14 = add <16 x i8> %13, %11 + %cast2 = bitcast i8* %12 to <16 x i8>* + store <16 x i8> %14, <16 x i8>* %cast2, align 1 + %15 = add nuw i64 %7, 16 + %16 = add <16 x i64> %8, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> + %17 = icmp eq i64 %15, %4 + br i1 %17, label %18, label %6 + +18: ; preds = %6, %3 + ret void +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll
b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll @@ -853,3 +853,55 @@ %47 = icmp eq i32 %46, 1024 br i1 %47, label %36, label %37 } + +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32 immarg, <16 x i1>, <16 x i8>) +declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32 immarg, <16 x i1>) + +define void @gather_no_scalar_remainder(i8* noalias nocapture noundef %0, i8* noalias nocapture noundef readonly %1, i64 noundef %2) { +; CHECK-LABEL: @gather_no_scalar_remainder( +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP2:%.*]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label [[TMP18:%.*]], label [[TMP6:%.*]] +; CHECK: 6: +; CHECK-NEXT: [[TMP7:%.*]] = phi i64 [ [[TMP15:%.*]], [[TMP6]] ], [ 0, [[TMP3:%.*]] ] +; CHECK-NEXT: [[TMP8:%.*]] = phi <16 x i64> [ [[TMP16:%.*]], [[TMP6]] ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[TMP3]] ] +; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i64> [[TMP8]], <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5> +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[TMP1:%.*]], <16 x i64> [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> [[TMP10]], i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, i8* [[TMP0:%.*]], i64 [[TMP7]] +; CHECK-NEXT: [[CAST:%.*]] = bitcast i8* [[TMP12]] to <16 x i8>* +; CHECK-NEXT: [[TMP13:%.*]] = load <16 x i8>, <16 x i8>* [[CAST]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = add <16 x i8> [[TMP13]], [[TMP11]] +; CHECK-NEXT: [[CAST2:%.*]] = bitcast i8* [[TMP12]] to <16 x i8>* +; CHECK-NEXT: store <16 x i8> [[TMP14]], <16 x i8>* [[CAST2]], align 1 +; CHECK-NEXT: [[TMP15]] = add nuw i64 [[TMP7]], 16 +; CHECK-NEXT: [[TMP16]] = add <16 x i64> [[TMP8]], <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> +; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[TMP15]], [[TMP4]] +; CHECK-NEXT: br i1 [[TMP17]], label [[TMP18]], label [[TMP6]] +; CHECK:
18: +; CHECK-NEXT: ret void +; + %4 = shl i64 %2, 4 + %5 = icmp eq i64 %4, 0 + br i1 %5, label %18, label %6 + +6: ; preds = %3, %6 + %7 = phi i64 [ %15, %6 ], [ 0, %3 ] + %8 = phi <16 x i64> [ %16, %6 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %3 ] + %9 = mul <16 x i64> %8, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5> + %10 = getelementptr inbounds i8, i8* %1, <16 x i64> %9 + %11 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %10, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) + %12 = getelementptr inbounds i8, i8* %0, i64 %7 + %cast = bitcast i8* %12 to <16 x i8>* + %13 = load <16 x i8>, <16 x i8>* %cast, align 1 + %14 = add <16 x i8> %13, %11 + %cast2 = bitcast i8* %12 to <16 x i8>* + store <16 x i8> %14, <16 x i8>* %cast2, align 1 + %15 = add nuw i64 %7, 16 + %16 = add <16 x i64> %8, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16> + %17 = icmp eq i64 %15, %4 + br i1 %17, label %18, label %6 + +18: ; preds = %6, %3 + ret void +}