diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll
@@ -872,3 +872,273 @@
   %47 = icmp eq i32 %46, 1024
   br i1 %47, label %36, label %37
 }
+
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32 immarg, <16 x i1>, <16 x i8>)
+declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32 immarg, <16 x i1>)
+
+define void @gather_no_scalar_remainder(i8* noalias nocapture noundef %arg, i8* noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
+; V-LABEL: gather_no_scalar_remainder:
+; V:       # %bb.0: # %bb
+; V-NEXT:    slli a2, a2, 4
+; V-NEXT:    beqz a2, .LBB13_3
+; V-NEXT:  # %bb.1: # %bb2
+; V-NEXT:    vsetivli zero, 16, e64, m4, ta, mu
+; V-NEXT:    vid.v v8
+; V-NEXT:    li a3, 5
+; V-NEXT:    li a4, 16
+; V-NEXT:  .LBB13_2: # %bb4
+; V-NEXT:    # =>This Inner Loop Header: Depth=1
+; V-NEXT:    vmul.vx v12, v8, a3
+; V-NEXT:    vsetvli zero, zero, e8, mf2, ta, mu
+; V-NEXT:    vluxei64.v v16, (a1), v12
+; V-NEXT:    vle8.v v12, (a0)
+; V-NEXT:    vadd.vv v12, v12, v16
+; V-NEXT:    vse8.v v12, (a0)
+; V-NEXT:    vsetvli zero, zero, e64, m4, ta, mu
+; V-NEXT:    vadd.vx v8, v8, a4
+; V-NEXT:    addi a2, a2, -16
+; V-NEXT:    addi a0, a0, 16
+; V-NEXT:    bnez a2, .LBB13_2
+; V-NEXT:  .LBB13_3: # %bb16
+; V-NEXT:    ret
+;
+; ZVE32F-LABEL: gather_no_scalar_remainder:
+; ZVE32F:       # %bb.0: # %bb
+; ZVE32F-NEXT:    addi sp, sp, -240
+; ZVE32F-NEXT:    .cfi_def_cfa_offset 240
+; ZVE32F-NEXT:    sd ra, 232(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s0, 224(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s1, 216(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s2, 208(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s3, 200(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s4, 192(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s5, 184(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s6, 176(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s7, 168(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s8, 160(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s9, 152(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s10, 144(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd s11, 136(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    .cfi_offset ra, -8
+; ZVE32F-NEXT:    .cfi_offset s0, -16
+; ZVE32F-NEXT:    .cfi_offset s1, -24
+; ZVE32F-NEXT:    .cfi_offset s2, -32
+; ZVE32F-NEXT:    .cfi_offset s3, -40
+; ZVE32F-NEXT:    .cfi_offset s4, -48
+; ZVE32F-NEXT:    .cfi_offset s5, -56
+; ZVE32F-NEXT:    .cfi_offset s6, -64
+; ZVE32F-NEXT:    .cfi_offset s7, -72
+; ZVE32F-NEXT:    .cfi_offset s8, -80
+; ZVE32F-NEXT:    .cfi_offset s9, -88
+; ZVE32F-NEXT:    .cfi_offset s10, -96
+; ZVE32F-NEXT:    .cfi_offset s11, -104
+; ZVE32F-NEXT:    slli a2, a2, 4
+; ZVE32F-NEXT:    beqz a2, .LBB13_3
+; ZVE32F-NEXT:  # %bb.1: # %bb2
+; ZVE32F-NEXT:    li a3, 0
+; ZVE32F-NEXT:    li a4, 15
+; ZVE32F-NEXT:    li a5, 14
+; ZVE32F-NEXT:    li a6, 13
+; ZVE32F-NEXT:    li a7, 12
+; ZVE32F-NEXT:    li t0, 11
+; ZVE32F-NEXT:    li t1, 10
+; ZVE32F-NEXT:    li t2, 9
+; ZVE32F-NEXT:    li t3, 8
+; ZVE32F-NEXT:    li t4, 7
+; ZVE32F-NEXT:    li t5, 6
+; ZVE32F-NEXT:    li t6, 5
+; ZVE32F-NEXT:    li s0, 4
+; ZVE32F-NEXT:    li s1, 3
+; ZVE32F-NEXT:    li s2, 2
+; ZVE32F-NEXT:    li s3, 1
+; ZVE32F-NEXT:    vsetivli zero, 16, e8, mf2, ta, mu
+; ZVE32F-NEXT:  .LBB13_2: # %bb4
+; ZVE32F-NEXT:    # =>This Inner Loop Header: Depth=1
+; ZVE32F-NEXT:    sd s0, 56(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd t0, 64(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd a5, 72(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd a4, 80(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd a3, 88(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd a0, 96(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    sd a2, 104(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    slli s4, a3, 2
+; ZVE32F-NEXT:    add a0, s4, a3
+; ZVE32F-NEXT:    sd a0, 48(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    slli s5, s3, 2
+; ZVE32F-NEXT:    add a0, s5, s3
+; ZVE32F-NEXT:    sd a0, 40(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    slli s6, s2, 2
+; ZVE32F-NEXT:    add a0, s6, s2
+; ZVE32F-NEXT:    sd a0, 32(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    slli s7, s1, 2
+; ZVE32F-NEXT:    add a0, s7, s1
+; ZVE32F-NEXT:    sd a0, 24(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    slli s8, s0, 2
+; ZVE32F-NEXT:    add a0, s8, s0
+; ZVE32F-NEXT:    slli s9, t6, 2
+; ZVE32F-NEXT:    add a3, s9, t6
+; ZVE32F-NEXT:    slli s10, t5, 2
+; ZVE32F-NEXT:    add s10, s10, t5
+; ZVE32F-NEXT:    slli s11, t4, 2
+; ZVE32F-NEXT:    add s11, s11, t4
+; ZVE32F-NEXT:    slli ra, t3, 2
+; ZVE32F-NEXT:    add ra, ra, t3
+; ZVE32F-NEXT:    slli s4, t2, 2
+; ZVE32F-NEXT:    add s4, s4, t2
+; ZVE32F-NEXT:    slli s5, t1, 2
+; ZVE32F-NEXT:    add s5, s5, t1
+; ZVE32F-NEXT:    slli s6, t0, 2
+; ZVE32F-NEXT:    add s6, s6, t0
+; ZVE32F-NEXT:    slli s7, a7, 2
+; ZVE32F-NEXT:    add s7, s7, a7
+; ZVE32F-NEXT:    slli s8, a6, 2
+; ZVE32F-NEXT:    add s8, s8, a6
+; ZVE32F-NEXT:    slli a2, a5, 2
+; ZVE32F-NEXT:    add a2, a2, a5
+; ZVE32F-NEXT:    slli s9, a4, 2
+; ZVE32F-NEXT:    add s9, s9, a4
+; ZVE32F-NEXT:    add a4, a1, s9
+; ZVE32F-NEXT:    sd a4, 16(sp) # 8-byte Folded Spill
+; ZVE32F-NEXT:    add a2, a1, a2
+; ZVE32F-NEXT:    add s8, a1, s8
+; ZVE32F-NEXT:    add s7, a1, s7
+; ZVE32F-NEXT:    add s6, a1, s6
+; ZVE32F-NEXT:    add s5, a1, s5
+; ZVE32F-NEXT:    add s4, a1, s4
+; ZVE32F-NEXT:    add ra, a1, ra
+; ZVE32F-NEXT:    add s11, a1, s11
+; ZVE32F-NEXT:    add s10, a1, s10
+; ZVE32F-NEXT:    add a3, a1, a3
+; ZVE32F-NEXT:    add a0, a1, a0
+; ZVE32F-NEXT:    ld a4, 24(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    add a4, a1, a4
+; ZVE32F-NEXT:    ld a5, 32(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    add a5, a1, a5
+; ZVE32F-NEXT:    mv t0, a6
+; ZVE32F-NEXT:    ld a6, 40(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    add a6, a1, a6
+; ZVE32F-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    add s9, a1, s0
+; ZVE32F-NEXT:    lb s9, 0(s9)
+; ZVE32F-NEXT:    lb a6, 0(a6)
+; ZVE32F-NEXT:    lb a5, 0(a5)
+; ZVE32F-NEXT:    lb a4, 0(a4)
+; ZVE32F-NEXT:    lb a0, 0(a0)
+; ZVE32F-NEXT:    lb a3, 0(a3)
+; ZVE32F-NEXT:    lb s10, 0(s10)
+; ZVE32F-NEXT:    lb s11, 0(s11)
+; ZVE32F-NEXT:    lb ra, 0(ra)
+; ZVE32F-NEXT:    lb s4, 0(s4)
+; ZVE32F-NEXT:    lb s5, 0(s5)
+; ZVE32F-NEXT:    lb s6, 0(s6)
+; ZVE32F-NEXT:    lb s7, 0(s7)
+; ZVE32F-NEXT:    lb s8, 0(s8)
+; ZVE32F-NEXT:    lb a2, 0(a2)
+; ZVE32F-NEXT:    mv s0, t6
+; ZVE32F-NEXT:    mv t6, t5
+; ZVE32F-NEXT:    mv t5, t4
+; ZVE32F-NEXT:    mv t4, t3
+; ZVE32F-NEXT:    mv t3, t2
+; ZVE32F-NEXT:    mv t2, t1
+; ZVE32F-NEXT:    mv t1, a7
+; ZVE32F-NEXT:    ld a7, 16(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    lb a7, 0(a7)
+; ZVE32F-NEXT:    sb s9, 112(sp)
+; ZVE32F-NEXT:    sb a6, 113(sp)
+; ZVE32F-NEXT:    mv a6, t0
+; ZVE32F-NEXT:    ld t0, 64(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    sb a5, 114(sp)
+; ZVE32F-NEXT:    ld a5, 72(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    sb a4, 115(sp)
+; ZVE32F-NEXT:    ld a4, 80(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    sb a0, 116(sp)
+; ZVE32F-NEXT:    ld a0, 96(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    sb a3, 117(sp)
+; ZVE32F-NEXT:    ld a3, 88(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    sb s10, 118(sp)
+; ZVE32F-NEXT:    sb s11, 119(sp)
+; ZVE32F-NEXT:    sb ra, 120(sp)
+; ZVE32F-NEXT:    sb s4, 121(sp)
+; ZVE32F-NEXT:    sb s5, 122(sp)
+; ZVE32F-NEXT:    sb s6, 123(sp)
+; ZVE32F-NEXT:    sb s7, 124(sp)
+; ZVE32F-NEXT:    sb s8, 125(sp)
+; ZVE32F-NEXT:    sb a2, 126(sp)
+; ZVE32F-NEXT:    ld a2, 104(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    sb a7, 127(sp)
+; ZVE32F-NEXT:    mv a7, t1
+; ZVE32F-NEXT:    mv t1, t2
+; ZVE32F-NEXT:    mv t2, t3
+; ZVE32F-NEXT:    mv t3, t4
+; ZVE32F-NEXT:    mv t4, t5
+; ZVE32F-NEXT:    mv t5, t6
+; ZVE32F-NEXT:    mv t6, s0
+; ZVE32F-NEXT:    ld s0, 56(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    addi s4, sp, 112
+; ZVE32F-NEXT:    vle8.v v8, (s4)
+; ZVE32F-NEXT:    vle8.v v9, (a0)
+; ZVE32F-NEXT:    vadd.vv v8, v9, v8
+; ZVE32F-NEXT:    vse8.v v8, (a0)
+; ZVE32F-NEXT:    addi a3, a3, 16
+; ZVE32F-NEXT:    addi s3, s3, 16
+; ZVE32F-NEXT:    addi s2, s2, 16
+; ZVE32F-NEXT:    addi s1, s1, 16
+; ZVE32F-NEXT:    addi s0, s0, 16
+; ZVE32F-NEXT:    addi t6, t6, 16
+; ZVE32F-NEXT:    addi t5, t5, 16
+; ZVE32F-NEXT:    addi t4, t4, 16
+; ZVE32F-NEXT:    addi t3, t3, 16
+; ZVE32F-NEXT:    addi t2, t2, 16
+; ZVE32F-NEXT:    addi t1, t1, 16
+; ZVE32F-NEXT:    addi t0, t0, 16
+; ZVE32F-NEXT:    addi a7, a7, 16
+; ZVE32F-NEXT:    addi a6, a6, 16
+; ZVE32F-NEXT:    addi a5, a5, 16
+; ZVE32F-NEXT:    addi a4, a4, 16
+; ZVE32F-NEXT:    addi a2, a2, -16
+; ZVE32F-NEXT:    addi a0, a0, 16
+; ZVE32F-NEXT:    bnez a2, .LBB13_2
+; ZVE32F-NEXT:  .LBB13_3: # %bb16
+; ZVE32F-NEXT:    ld ra, 232(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s0, 224(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s1, 216(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s2, 208(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s3, 200(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s4, 192(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s5, 184(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s6, 176(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s7, 168(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s8, 160(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s9, 152(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s10, 144(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    ld s11, 136(sp) # 8-byte Folded Reload
+; ZVE32F-NEXT:    addi sp, sp, 240
+; ZVE32F-NEXT:    ret
+bb:
+  %i = shl i64 %arg2, 4
+  %i3 = icmp eq i64 %i, 0
+  br i1 %i3, label %bb16, label %bb2
+
+bb2:
+  br label %bb4
+
+bb4:                                              ; preds = %bb4, %bb
+  %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
+  %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
+  %i7 = mul <16 x i64> %i6, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %i8 = getelementptr inbounds i8, i8* %arg1, <16 x i64> %i7
+  %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %i8, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
+  %i10 = getelementptr inbounds i8, i8* %arg, i64 %i5
+  %cast = bitcast i8* %i10 to <16 x i8>*
+  %i11 = load <16 x i8>, <16 x i8>* %cast, align 1
+  %i12 = add <16 x i8> %i11, %i9
+  %cast2 = bitcast i8* %i10 to <16 x i8>*
+  store <16 x i8> %i12, <16 x i8>* %cast2, align 1
+  %i13 = add nuw i64 %i5, 16
+  %i14 = add <16 x i64> %i6, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+  %i15 = icmp eq i64 %i13, %i
+  br i1 %i15, label %bb16, label %bb4
+
+bb16:                                             ; preds = %bb4, %bb
+  ret void
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store.ll
@@ -853,3 +853,62 @@
   %47 = icmp eq i32 %46, 1024
   br i1 %47, label %36, label %37
 }
+
+declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32 immarg, <16 x i1>, <16 x i8>)
+declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32 immarg, <16 x i1>)
+
+define void @gather_no_scalar_remainder(i8* noalias nocapture noundef %arg, i8* noalias nocapture noundef readonly %arg1, i64 noundef %arg2) {
+; CHECK-LABEL: @gather_no_scalar_remainder(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[I:%.*]] = shl i64 [[ARG2:%.*]], 4
+; CHECK-NEXT:    [[I3:%.*]] = icmp eq i64 [[I]], 0
+; CHECK-NEXT:    br i1 [[I3]], label [[BB16:%.*]], label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[BB4:%.*]]
+; CHECK:       bb4:
+; CHECK-NEXT:    [[I5:%.*]] = phi i64 [ [[I13:%.*]], [[BB4]] ], [ 0, [[BB2]] ]
+; CHECK-NEXT:    [[I6:%.*]] = phi <16 x i64> [ [[I14:%.*]], [[BB4]] ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, [[BB2]] ]
+; CHECK-NEXT:    [[I7:%.*]] = mul <16 x i64> [[I6]], <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+; CHECK-NEXT:    [[I8:%.*]] = getelementptr inbounds i8, i8* [[ARG1:%.*]], <16 x i64> [[I7]]
+; CHECK-NEXT:    [[I9:%.*]] = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> [[I8]], i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
+; CHECK-NEXT:    [[I10:%.*]] = getelementptr inbounds i8, i8* [[ARG:%.*]], i64 [[I5]]
+; CHECK-NEXT:    [[CAST:%.*]] = bitcast i8* [[I10]] to <16 x i8>*
+; CHECK-NEXT:    [[I11:%.*]] = load <16 x i8>, <16 x i8>* [[CAST]], align 1
+; CHECK-NEXT:    [[I12:%.*]] = add <16 x i8> [[I11]], [[I9]]
+; CHECK-NEXT:    [[CAST2:%.*]] = bitcast i8* [[I10]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[I12]], <16 x i8>* [[CAST2]], align 1
+; CHECK-NEXT:    [[I13]] = add nuw i64 [[I5]], 16
+; CHECK-NEXT:    [[I14]] = add <16 x i64> [[I6]], <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+; CHECK-NEXT:    [[I15:%.*]] = icmp eq i64 [[I13]], [[I]]
+; CHECK-NEXT:    br i1 [[I15]], label [[BB16]], label [[BB4]]
+; CHECK:       bb16:
+; CHECK-NEXT:    ret void
+;
+bb:
+  %i = shl i64 %arg2, 4
+  %i3 = icmp eq i64 %i, 0
+  br i1 %i3, label %bb16, label %bb2
+
+bb2:
+  br label %bb4
+
+bb4:                                              ; preds = %bb4, %bb
+  %i5 = phi i64 [ %i13, %bb4 ], [ 0, %bb2 ]
+  %i6 = phi <16 x i64> [ %i14, %bb4 ], [ <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>, %bb2 ]
+  %i7 = mul <16 x i64> %i6, <i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5, i64 5>
+  %i8 = getelementptr inbounds i8, i8* %arg1, <16 x i64> %i7
+  %i9 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %i8, i32 1, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef)
+  %i10 = getelementptr inbounds i8, i8* %arg, i64 %i5
+  %cast = bitcast i8* %i10 to <16 x i8>*
+  %i11 = load <16 x i8>, <16 x i8>* %cast, align 1
+  %i12 = add <16 x i8> %i11, %i9
+  %cast2 = bitcast i8* %i10 to <16 x i8>*
+  store <16 x i8> %i12, <16 x i8>* %cast2, align 1
+  %i13 = add nuw i64 %i5, 16
+  %i14 = add <16 x i64> %i6, <i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16, i64 16>
+  %i15 = icmp eq i64 %i13, %i
+  br i1 %i15, label %bb16, label %bb4
+
+bb16:                                             ; preds = %bb4, %bb
+  ret void
+}