diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -12182,8 +12182,16 @@
     break;
     auto *Store = cast<StoreSDNode>(N);
+    SDValue Chain = Store->getChain();
     EVT MemVT = Store->getMemoryVT();
     SDValue Val = Store->getValue();
+    SDLoc DL(N);
+
+    bool IsScalarizable =
+        MemVT.isFixedLengthVector() && ISD::isNormalStore(Store) &&
+        MemVT.getVectorElementType().bitsLE(Subtarget.getXLenVT()) &&
+        isPowerOf2_64(MemVT.getSizeInBits()) &&
+        MemVT.getSizeInBits() <= Subtarget.getXLen();

     // Using vector to store zeros requires e.g.:
     //   vsetivli zero, 2, e64, m1, ta, ma
@@ -12191,17 +12199,11 @@
     //   vse64.v v8, (a0)
     // If sufficiently aligned, we can use at most one scalar store to zero
     // initialize any power-of-two size up to XLen bits.
-    if (DCI.isBeforeLegalize() && !Store->isTruncatingStore() &&
-        !Store->isIndexed() && ISD::isBuildVectorAllZeros(Val.getNode()) &&
-        MemVT.getVectorElementType().bitsLE(Subtarget.getXLenVT()) &&
-        isPowerOf2_64(MemVT.getSizeInBits()) &&
-        MemVT.getSizeInBits() <= Subtarget.getXLen()) {
-      assert(!MemVT.isScalableVector());
+    if (DCI.isBeforeLegalize() && IsScalarizable &&
+        ISD::isBuildVectorAllZeros(Val.getNode())) {
       auto NewVT = MVT::getIntegerVT(MemVT.getSizeInBits());
       if (allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
                                          NewVT, *Store->getMemOperand())) {
-        SDLoc DL(N);
-        SDValue Chain = Store->getChain();
         auto NewV = DAG.getConstant(0, DL, NewVT);
         return DAG.getStore(Chain, DL, NewV, Store->getBasePtr(),
                             Store->getPointerInfo(), Store->getOriginalAlign(),
@@ -12209,6 +12211,29 @@
       }
     }

+    // Similarly, if sufficiently aligned we can scalarize vector copies, e.g.
+    //   vsetivli zero, 2, e16, m1, ta, ma
+    //   vle16.v v8, (a0)
+    //   vse16.v v8, (a1)
+    if (auto *L = dyn_cast<LoadSDNode>(Val);
+        L && DCI.isBeforeLegalize() && IsScalarizable &&
+        L->hasNUsesOfValue(1, 0) && L->hasNUsesOfValue(1, 1) &&
+        Store->getChain() == SDValue(L, 1) && ISD::isNormalLoad(L) &&
+        L->getMemoryVT() == MemVT) {
+      MVT NewVT = MVT::getIntegerVT(MemVT.getSizeInBits());
+      if (allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                         NewVT, *Store->getMemOperand()) &&
+          allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+                                         NewVT, *L->getMemOperand())) {
+        SDValue NewL = DAG.getLoad(NewVT, DL, L->getChain(), L->getBasePtr(),
+                                   L->getPointerInfo(), L->getOriginalAlign(),
+                                   L->getMemOperand()->getFlags());
+        return DAG.getStore(Chain, DL, NewL, Store->getBasePtr(),
+                            Store->getPointerInfo(), Store->getOriginalAlign(),
+                            Store->getMemOperand()->getFlags());
+      }
+    }
+
     // Combine store of vmv.x.s/vfmv.f.s to vse with VL of 1.
     // vfmv.f.s is represented as extract element from 0. Match it late to avoid
     // any illegal types.
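To make the new pattern concrete: the copy block above fires before legalization on a fixed-length vector load whose only use is a normal store of the same memory type, provided the whole vector is a power-of-two size no wider than XLen and both accesses are sufficiently aligned for the equivalent integer type. A minimal hand-written IR shape that qualifies is sketched below (the function and value names are illustrative, not taken from the patch); the test updates that follow show the scalar lw/sw (or ld/sd) pair it now lowers to.

define void @copy_v2i16(ptr %p, ptr %q) {
  ; <2 x i16> is 32 bits: a power of two no wider than XLen on either rv32 or rv64.
  ; The load has a single use (the store) and both accesses are aligned to 4 bytes,
  ; so the pair can be rewritten as an i32 load/store before legalization.
  %v = load <2 x i16>, ptr %p, align 4
  store <2 x i16> %v, ptr %q, align 4
  ret void
}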
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-load-store.ll
@@ -5,9 +5,8 @@
 define void @v2i8(ptr %p, ptr %q) {
 ; CHECK-LABEL: v2i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    lh a0, 0(a0)
+; CHECK-NEXT:    sh a0, 0(a1)
 ; CHECK-NEXT:    ret
   %v = load <2 x i8>, ptr %p
   store <2 x i8> %v, ptr %q
@@ -17,9 +16,8 @@
 define void @v2i16(ptr %p, ptr %q) {
 ; CHECK-LABEL: v2i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    lw a0, 0(a0)
+; CHECK-NEXT:    sw a0, 0(a1)
 ; CHECK-NEXT:    ret
   %v = load <2 x i16>, ptr %p
   store <2 x i16> %v, ptr %q
@@ -27,12 +25,18 @@
 }

 define void @v2i32(ptr %p, ptr %q) {
-; CHECK-LABEL: v2i32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vse32.v v8, (a1)
-; CHECK-NEXT:    ret
+; RV32-LABEL: v2i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vse32.v v8, (a1)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: v2i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a0, 0(a0)
+; RV64-NEXT:    sd a0, 0(a1)
+; RV64-NEXT:    ret
   %v = load <2 x i32>, ptr %p
   store <2 x i32> %v, ptr %q
   ret void
@@ -53,9 +57,8 @@
 define void @v2f16(ptr %p, ptr %q) {
 ; CHECK-LABEL: v2f16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vse16.v v8, (a1)
+; CHECK-NEXT:    lw a0, 0(a0)
+; CHECK-NEXT:    sw a0, 0(a1)
 ; CHECK-NEXT:    ret
   %v = load <2 x half>, ptr %p
   store <2 x half> %v, ptr %q
@@ -63,12 +66,18 @@
 }

 define void @v2f32(ptr %p, ptr %q) {
-; CHECK-LABEL: v2f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vse32.v v8, (a1)
-; CHECK-NEXT:    ret
+; RV32-LABEL: v2f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
+; RV32-NEXT:    vle32.v v8, (a0)
+; RV32-NEXT:    vse32.v v8, (a1)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: v2f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a0, 0(a0)
+; RV64-NEXT:    sd a0, 0(a1)
+; RV64-NEXT:    ret
   %v = load <2 x float>, ptr %p
   store <2 x float> %v, ptr %q
   ret void
@@ -89,9 +98,8 @@
 define void @v4i8(ptr %p, ptr %q) {
 ; CHECK-LABEL: v4i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    lw a0, 0(a0)
+; CHECK-NEXT:    sw a0, 0(a1)
 ; CHECK-NEXT:    ret
   %v = load <4 x i8>, ptr %p
   store <4 x i8> %v, ptr %q
@@ -99,12 +107,18 @@
 }

 define void @v4i16(ptr %p, ptr %q) {
-; CHECK-LABEL: v4i16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vse16.v v8, (a1)
-; CHECK-NEXT:    ret
+; RV32-LABEL: v4i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    vse16.v v8, (a1)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: v4i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a0, 0(a0)
+; RV64-NEXT:    sd a0, 0(a1)
+; RV64-NEXT:    ret
   %v = load <4 x i16>, ptr %p
   store <4 x i16> %v, ptr %q
   ret void
@@ -135,12 +149,18 @@
 }

 define void @v4f16(ptr %p, ptr %q) {
-; CHECK-LABEL: v4f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vse16.v v8, (a1)
-; CHECK-NEXT:    ret
+; RV32-LABEL: v4f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; RV32-NEXT:    vle16.v v8, (a0)
+; RV32-NEXT:    vse16.v v8, (a1)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: v4f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a0, 0(a0)
+; RV64-NEXT:    sd a0, 0(a1)
+; RV64-NEXT:    ret
   %v = load <4 x half>, ptr %p
   store <4 x half> %v, ptr %q
   ret void
@@ -171,12 +191,18 @@
 }

 define void @v8i8(ptr %p, ptr %q) {
-; CHECK-LABEL: v8i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vse8.v v8, (a1)
-; CHECK-NEXT:    ret
+; RV32-LABEL: v8i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; RV32-NEXT:    vle8.v v8, (a0)
+; RV32-NEXT:    vse8.v v8, (a1)
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: v8i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    ld a0, 0(a0)
+; RV64-NEXT:    sd a0, 0(a1)
+; RV64-NEXT:    ret
   %v = load <8 x i8>, ptr %p
   store <8 x i8> %v, ptr %q
   ret void
@@ -233,9 +259,8 @@
 define void @v2i8_align2(ptr %p, ptr %q) {
 ; CHECK-LABEL: v2i8_align2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    lh a0, 0(a0)
+; CHECK-NEXT:    sh a0, 0(a1)
 ; CHECK-NEXT:    ret
   %v = load <2 x i8>, ptr %p, align 2
   store <2 x i8> %v, ptr %q
@@ -245,9 +270,8 @@
 define void @v2i8_align4(ptr %p, ptr %q) {
 ; CHECK-LABEL: v2i8_align4:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    lh a0, 0(a0)
+; CHECK-NEXT:    sh a0, 0(a1)
 ; CHECK-NEXT:    ret
   %v = load <2 x i8>, ptr %p, align 4
   store <2 x i8> %v, ptr %q
@@ -259,7 +283,8 @@
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
 ; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    lh a0, 0(a0)
+; CHECK-NEXT:    sh a0, 0(a1)
 ; CHECK-NEXT:    ret
   %v = load volatile <2 x i8>, ptr %p
   store <2 x i8> %v, ptr %q
@@ -269,15 +294,10 @@
 define void @v2i8_volatile_store(ptr %p, ptr %q) {
 ; CHECK-LABEL: v2i8_volatile_store:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    vse8.v v8, (a1)
+; CHECK-NEXT:    lh a0, 0(a0)
+; CHECK-NEXT:    sh a0, 0(a1)
 ; CHECK-NEXT:    ret
   %v = load <2 x i8>, ptr %p
   store volatile <2 x i8> %v, ptr %q
   ret void
 }
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; RV32: {{.*}}
-; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-load-store.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-load-store.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-load-store.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-load-store.ll
@@ -7,17 +7,9 @@
 define void @load_store_v1i1(ptr %x, ptr %y) {
 ; CHECK-LABEL: load_store_v1i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vsetivli zero, 1, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-NEXT:    vsm.v v8, (a1)
+; CHECK-NEXT:    lbu a0, 0(a0)
+; CHECK-NEXT:    andi a0, a0, 1
+; CHECK-NEXT:    sb a0, 0(a1)
 ; CHECK-NEXT:    ret
   %a = load <1 x i1>, ptr %x
   store <1 x i1> %a, ptr %y
@@ -27,17 +19,9 @@
 define void @load_store_v2i1(ptr %x, ptr %y) {
 ; CHECK-LABEL: load_store_v2i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vsetivli zero, 2, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-NEXT:    vsm.v v8, (a1)
+; CHECK-NEXT:    lbu a0, 0(a0)
+; CHECK-NEXT:    andi a0, a0, 3
+; CHECK-NEXT:    sb a0, 0(a1)
 ; CHECK-NEXT:    ret
   %a = load <2 x i1>, ptr %x
   store <2 x i1> %a, ptr %y
@@ -47,17 +31,9 @@
 define void @load_store_v4i1(ptr %x, ptr %y) {
 ; CHECK-LABEL: load_store_v4i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vlm.v v0, (a0)
-; CHECK-NEXT:    vmv.v.i v8, 0
-; CHECK-NEXT:    vmerge.vim v8, v8, 1, v0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmv.v.i v9, 0
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 0
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vmsne.vi v8, v9, 0
-; CHECK-NEXT:    vsm.v v8, (a1)
+; CHECK-NEXT:    lbu a0, 0(a0)
+; CHECK-NEXT:    andi a0, a0, 15
+; CHECK-NEXT:    sb a0, 0(a1)
 ; CHECK-NEXT:    ret
   %a = load <4 x i1>, ptr %x
   store <4 x i1> %a, ptr %y
@@ -67,9 +43,8 @@
 define void @load_store_v8i1(ptr %x, ptr %y) {
 ; CHECK-LABEL: load_store_v8i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
-; CHECK-NEXT:    vlm.v v8, (a0)
-; CHECK-NEXT:    vsm.v v8, (a1)
+; CHECK-NEXT:    lbu a0, 0(a0)
+; CHECK-NEXT:    sb a0, 0(a1)
 ; CHECK-NEXT:    ret
   %a = load <8 x i1>, ptr %x
   store <8 x i1> %a, ptr %y
@@ -79,9 +54,8 @@
 define void @load_store_v16i1(ptr %x, ptr %y) {
 ; CHECK-LABEL: load_store_v16i1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
-; CHECK-NEXT:    vlm.v v8, (a0)
-; CHECK-NEXT:    vsm.v v8, (a1)
+; CHECK-NEXT:    lh a0, 0(a0)
+; CHECK-NEXT:    sh a0, 0(a1)
 ; CHECK-NEXT:    ret
   %a = load <16 x i1>, ptr %x
   store <16 x i1> %a, ptr %y
@@ -89,26 +63,16 @@
 }

 define void @load_store_v32i1(ptr %x, ptr %y) {
-; LMULMAX2-LABEL: load_store_v32i1:
-; LMULMAX2:       # %bb.0:
-; LMULMAX2-NEXT:    li a2, 32
-; LMULMAX2-NEXT:    vsetvli zero, a2, e8, m2, ta, ma
-; LMULMAX2-NEXT:    vlm.v v8, (a0)
-; LMULMAX2-NEXT:    vsm.v v8, (a1)
-; LMULMAX2-NEXT:    ret
-;
-; LMULMAX1-RV32-LABEL: load_store_v32i1:
-; LMULMAX1-RV32:       # %bb.0:
-; LMULMAX1-RV32-NEXT:    lw a0, 0(a0)
-; LMULMAX1-RV32-NEXT:    sw a0, 0(a1)
-; LMULMAX1-RV32-NEXT:    ret
-;
-; LMULMAX1-RV64-LABEL: load_store_v32i1:
-; LMULMAX1-RV64:       # %bb.0:
-; LMULMAX1-RV64-NEXT:    lw a0, 0(a0)
-; LMULMAX1-RV64-NEXT:    sw a0, 0(a1)
-; LMULMAX1-RV64-NEXT:    ret
+; CHECK-LABEL: load_store_v32i1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lw a0, 0(a0)
+; CHECK-NEXT:    sw a0, 0(a1)
+; CHECK-NEXT:    ret
   %a = load <32 x i1>, ptr %x
   store <32 x i1> %a, ptr %y
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; LMULMAX1-RV32: {{.*}}
+; LMULMAX1-RV64: {{.*}}
+; LMULMAX2: {{.*}}
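For anyone who wants to exercise the combine outside of the updated tests, a standalone reproducer along the following lines can be fed to llc and FileCheck. The riscv64 triple and -mattr=+v configuration are only one plausible setup (the RUN lines of the actual test files are not shown in this patch), but the expected lw/sw pair matches what the v4i8 case above checks under the shared CHECK prefix.

; RUN: llc -mtriple=riscv64 -mattr=+v < %s | FileCheck %s

define void @copy_v4i8(ptr %p, ptr %q) {
; CHECK-LABEL: copy_v4i8:
; CHECK:    lw a0, 0(a0)
; CHECK:    sw a0, 0(a1)
  ; A 32-bit fixed vector copy: with this patch the single-use load/store pair
  ; is rewritten as an i32 access instead of a vle8.v/vse8.v sequence.
  %v = load <4 x i8>, ptr %p
  store <4 x i8> %v, ptr %q
  ret void
}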