diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -452,6 +452,11 @@
                            "true", "Has reasonably performant unaligned scalar "
                            "loads and stores">;
 
+def FeatureOptimizedZeroStrideLoad
+    : SubtargetFeature<"optimized-zero-stride-load", "HasOptimizedZeroStrideLoad",
+                       "true", "Has optimized (perform fewer memory operations) "
+                       "zero-stride vector load">;
+
 def TuneLUIADDIFusion
     : SubtargetFeature<"lui-addi-fusion", "HasLUIADDIFusion",
                        "true", "Enable LUI+ADDI macrofusion">;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -1790,6 +1790,10 @@
   case RISCVISD::VFMV_S_F_VL:
   case RISCVISD::VMV_V_X_VL:
   case RISCVISD::VFMV_V_F_VL: {
+    // Only do this if the target has an optimized zero-stride vector load.
+    if (!Subtarget->hasOptimizedZeroStrideLoad())
+      break;
+
     // Try to match splat of a scalar load to a strided load with stride of x0.
     bool IsScalarMove = Node->getOpcode() == RISCVISD::VMV_S_X_VL ||
                         Node->getOpcode() == RISCVISD::VFMV_S_F_VL;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -101,6 +101,7 @@
   bool HasShortForwardBranchOpt = false;
   bool HasLUIADDIFusion = false;
   bool HasForcedAtomics = false;
+  bool HasOptimizedZeroStrideLoad = false;
   unsigned XLen = 32;
   unsigned ZvlLen = 0;
   MVT XLenVT = MVT::i32;
@@ -199,6 +200,7 @@
   bool enableUnalignedScalarMem() const { return EnableUnalignedScalarMem; }
   bool hasLUIADDIFusion() const { return HasLUIADDIFusion; }
   bool hasForcedAtomics() const { return HasForcedAtomics; }
+  bool hasOptimizedZeroStrideLoad() const { return HasOptimizedZeroStrideLoad; }
   MVT getXLenVT() const { return XLenVT; }
   unsigned getXLen() const { return XLen; }
   unsigned getFLen() const {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll
@@ -99,15 +99,15 @@
 define void @buildvec_dominant0_v4f32(<4 x float>* %x) {
 ; CHECK-LABEL: buildvec_dominant0_v4f32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
 ; CHECK-NEXT: lui a1, %hi(.LCPI4_0)
-; CHECK-NEXT: addi a1, a1, %lo(.LCPI4_0)
-; CHECK-NEXT: vlse32.v v8, (a1), zero
-; CHECK-NEXT: vmv.s.x v9, zero
+; CHECK-NEXT: flw ft0, %lo(.LCPI4_0)(a1)
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vmv.s.x v8, zero
+; CHECK-NEXT: vfmv.v.f v9, ft0
 ; CHECK-NEXT: vsetivli zero, 3, e32, m1, tu, ma
-; CHECK-NEXT: vslideup.vi v8, v9, 2
+; CHECK-NEXT: vslideup.vi v9, v8, 2
 ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT: vse32.v v8, (a0)
+; CHECK-NEXT: vse32.v v9, (a0)
 ; CHECK-NEXT: ret
   store <4 x float> , <4 x float>* %x
   ret void
@@ -246,7 +246,8 @@
 ; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma
 ; LMULMAX1-NEXT: vse32.v v9, (a2)
 ; LMULMAX1-NEXT: vse32.v v8, (a1)
-; LMULMAX1-NEXT: vlse32.v v8, (a0), zero
+; LMULMAX1-NEXT: flw ft0, 0(a0)
+; LMULMAX1-NEXT: vfmv.v.f v8, ft0
 ; LMULMAX1-NEXT: vmv.v.v v9, v8
 ; LMULMAX1-NEXT: addi sp, sp, 32
 ; LMULMAX1-NEXT: ret
@@ -268,10 +269,10 @@
 ; RV32-LABEL: splat_load_licm:
 ; RV32: # %bb.0:
 ; RV32-NEXT: lui a1, %hi(.LCPI12_0)
-; RV32-NEXT: addi a1, a1, %lo(.LCPI12_0)
-; RV32-NEXT: vsetivli zero, 
4, e32, m1, ta, ma -; RV32-NEXT: vlse32.v v8, (a1), zero +; RV32-NEXT: flw ft0, %lo(.LCPI12_0)(a1) ; RV32-NEXT: li a1, 1024 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vfmv.v.f v8, ft0 ; RV32-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: addi a1, a1, -4 @@ -283,10 +284,10 @@ ; RV64-LABEL: splat_load_licm: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, %hi(.LCPI12_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI12_0) -; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV64-NEXT: vlse32.v v8, (a1), zero +; RV64-NEXT: flw ft0, %lo(.LCPI12_0)(a1) ; RV64-NEXT: li a1, 1024 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vfmv.v.f v8, ft0 ; RV64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1 ; RV64-NEXT: vse32.v v8, (a0) ; RV64-NEXT: addiw a1, a1, -4 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-shuffles.ll @@ -166,11 +166,11 @@ ; RV32-NEXT: vmv.s.x v0, a0 ; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; RV32-NEXT: lui a0, %hi(.LCPI7_0) -; RV32-NEXT: addi a0, a0, %lo(.LCPI7_0) -; RV32-NEXT: vlse64.v v10, (a0), zero -; RV32-NEXT: vid.v v12 -; RV32-NEXT: vrsub.vi v12, v12, 4 +; RV32-NEXT: fld ft0, %lo(.LCPI7_0)(a0) +; RV32-NEXT: vid.v v10 +; RV32-NEXT: vrsub.vi v12, v10, 4 ; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfmv.v.f v10, ft0 ; RV32-NEXT: vrgatherei16.vv v10, v8, v12, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret @@ -180,12 +180,12 @@ ; RV64-NEXT: li a0, 12 ; RV64-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; RV64-NEXT: vmv.s.x v0, a0 -; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu ; RV64-NEXT: lui a0, %hi(.LCPI7_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI7_0) -; RV64-NEXT: vlse64.v v10, (a0), zero -; RV64-NEXT: vid.v v12 -; RV64-NEXT: vrsub.vi v12, v12, 4 +; RV64-NEXT: fld ft0, %lo(.LCPI7_0)(a0) +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; RV64-NEXT: vid.v v10 +; RV64-NEXT: vrsub.vi v12, v10, 4 +; RV64-NEXT: vfmv.v.f v10, ft0 ; RV64-NEXT: vrgather.vv v10, v8, v12, v0.t ; RV64-NEXT: vmv.v.v v8, v10 ; RV64-NEXT: ret @@ -197,14 +197,14 @@ ; RV32-LABEL: vrgather_shuffle_vx_v4f64: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; RV32-NEXT: vid.v v12 +; RV32-NEXT: vid.v v10 ; RV32-NEXT: li a0, 3 ; RV32-NEXT: lui a1, %hi(.LCPI8_0) -; RV32-NEXT: addi a1, a1, %lo(.LCPI8_0) -; RV32-NEXT: vlse64.v v10, (a1), zero -; RV32-NEXT: vmul.vx v12, v12, a0 +; RV32-NEXT: fld ft0, %lo(.LCPI8_0)(a1) +; RV32-NEXT: vmul.vx v12, v10, a0 ; RV32-NEXT: vmv.s.x v0, a0 ; RV32-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; RV32-NEXT: vfmv.v.f v10, ft0 ; RV32-NEXT: vrgatherei16.vv v10, v8, v12, v0.t ; RV32-NEXT: vmv.v.v v8, v10 ; RV32-NEXT: ret @@ -212,13 +212,13 @@ ; RV64-LABEL: vrgather_shuffle_vx_v4f64: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; RV64-NEXT: vid.v v12 +; RV64-NEXT: vid.v v10 ; RV64-NEXT: lui a0, %hi(.LCPI8_0) -; RV64-NEXT: addi a0, a0, %lo(.LCPI8_0) -; RV64-NEXT: vlse64.v v10, (a0), zero +; RV64-NEXT: fld ft0, %lo(.LCPI8_0)(a0) ; RV64-NEXT: li a0, 3 +; RV64-NEXT: vmul.vx v12, v10, a0 ; RV64-NEXT: vmv.s.x v0, a0 -; RV64-NEXT: vmul.vx v12, v12, a0 +; RV64-NEXT: vfmv.v.f v10, ft0 ; RV64-NEXT: vrgather.vv v10, v8, v12, v0.t ; RV64-NEXT: vmv.v.v v8, v10 ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-vrgather.ll @@ -7,9 +7,9 @@ define void @gather_const_v8f16(<8 x half>* %x) { ; CHECK-LABEL: gather_const_v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 10 +; CHECK-NEXT: flh ft0, 10(a0) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vfmv.v.f v8, ft0 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x half>, <8 x half>* %x @@ -23,9 +23,9 @@ define void @gather_const_v4f32(<4 x float>* %x) { ; CHECK-LABEL: gather_const_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 8 +; CHECK-NEXT: flw ft0, 8(a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vfmv.v.f v8, ft0 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load <4 x float>, <4 x float>* %x @@ -39,8 +39,9 @@ define void @gather_const_v2f64(<2 x double>* %x) { ; CHECK-LABEL: gather_const_v2f64: ; CHECK: # %bb.0: +; CHECK-NEXT: fld ft0, 0(a0) ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v8, (a0), zero +; CHECK-NEXT: vfmv.v.f v8, ft0 ; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret %a = load <2 x double>, <2 x double>* %x @@ -54,10 +55,10 @@ define void @gather_const_v64f16(<64 x half>* %x) { ; LMULMAX8-LABEL: gather_const_v64f16: ; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: flh ft0, 94(a0) ; LMULMAX8-NEXT: li a1, 64 -; LMULMAX8-NEXT: addi a2, a0, 94 ; LMULMAX8-NEXT: vsetvli zero, a1, e16, m8, ta, ma -; LMULMAX8-NEXT: vlse16.v v8, (a2), zero +; LMULMAX8-NEXT: vfmv.v.f v8, ft0 ; LMULMAX8-NEXT: vse16.v v8, (a0) ; LMULMAX8-NEXT: ret ; @@ -67,12 +68,12 @@ ; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: addi a3, a0, 48 ; LMULMAX1-NEXT: addi a4, a0, 32 -; LMULMAX1-NEXT: addi a5, a0, 94 -; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vlse16.v v8, (a5), zero ; LMULMAX1-NEXT: addi a5, a0, 64 +; LMULMAX1-NEXT: flh ft0, 94(a0) ; LMULMAX1-NEXT: addi a6, a0, 112 ; LMULMAX1-NEXT: addi a7, a0, 96 +; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; LMULMAX1-NEXT: vfmv.v.f v8, ft0 ; LMULMAX1-NEXT: vse16.v v8, (a7) ; LMULMAX1-NEXT: vse16.v v8, (a6) ; LMULMAX1-NEXT: vse16.v v8, (a5) @@ -93,10 +94,10 @@ define void @gather_const_v32f32(<32 x float>* %x) { ; LMULMAX8-LABEL: gather_const_v32f32: ; LMULMAX8: # %bb.0: +; LMULMAX8-NEXT: flw ft0, 68(a0) ; LMULMAX8-NEXT: li a1, 32 -; LMULMAX8-NEXT: addi a2, a0, 68 ; LMULMAX8-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; LMULMAX8-NEXT: vlse32.v v8, (a2), zero +; LMULMAX8-NEXT: vfmv.v.f v8, ft0 ; LMULMAX8-NEXT: vse32.v v8, (a0) ; LMULMAX8-NEXT: ret ; @@ -106,12 +107,12 @@ ; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: addi a3, a0, 48 ; LMULMAX1-NEXT: addi a4, a0, 32 -; LMULMAX1-NEXT: addi a5, a0, 68 -; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vlse32.v v8, (a5), zero ; LMULMAX1-NEXT: addi a5, a0, 80 +; LMULMAX1-NEXT: flw ft0, 68(a0) ; LMULMAX1-NEXT: addi a6, a0, 112 ; LMULMAX1-NEXT: addi a7, a0, 96 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vfmv.v.f v8, ft0 ; LMULMAX1-NEXT: vse32.v v8, (a7) ; LMULMAX1-NEXT: vse32.v v8, (a6) ; LMULMAX1-NEXT: vse32.v v8, (a1) @@ -132,9 +133,9 @@ define void @gather_const_v16f64(<16 x double>* %x) { ; LMULMAX8-LABEL: gather_const_v16f64: ; LMULMAX8: # %bb.0: -; LMULMAX8-NEXT: addi a1, a0, 80 +; LMULMAX8-NEXT: fld ft0, 80(a0) ; LMULMAX8-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; 
LMULMAX8-NEXT: vlse64.v v8, (a1), zero +; LMULMAX8-NEXT: vfmv.v.f v8, ft0 ; LMULMAX8-NEXT: vse64.v v8, (a0) ; LMULMAX8-NEXT: ret ; @@ -144,11 +145,12 @@ ; LMULMAX1-NEXT: addi a2, a0, 16 ; LMULMAX1-NEXT: addi a3, a0, 48 ; LMULMAX1-NEXT: addi a4, a0, 32 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vlse64.v v8, (a1), zero ; LMULMAX1-NEXT: addi a5, a0, 64 +; LMULMAX1-NEXT: fld ft0, 80(a0) ; LMULMAX1-NEXT: addi a6, a0, 112 ; LMULMAX1-NEXT: addi a7, a0, 96 +; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; LMULMAX1-NEXT: vfmv.v.f v8, ft0 ; LMULMAX1-NEXT: vse64.v v8, (a7) ; LMULMAX1-NEXT: vse64.v v8, (a6) ; LMULMAX1-NEXT: vse64.v v8, (a5) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-insert.ll @@ -41,13 +41,13 @@ ; RV32-LABEL: insertelt_v3i64: ; RV32: # %bb.0: ; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: lw a3, 20(a0) ; RV32-NEXT: vle64.v v8, (a0) -; RV32-NEXT: lw a3, 16(a0) -; RV32-NEXT: addi a4, a0, 20 +; RV32-NEXT: lw a4, 16(a0) ; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vlse32.v v10, (a4), zero +; RV32-NEXT: vmv.v.x v10, a3 ; RV32-NEXT: vsetvli zero, zero, e32, m1, tu, ma -; RV32-NEXT: vmv.s.x v10, a3 +; RV32-NEXT: vmv.s.x v10, a4 ; RV32-NEXT: vsetvli zero, zero, e64, m2, tu, ma ; RV32-NEXT: vslideup.vi v8, v10, 2 ; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-buildvec.ll @@ -459,9 +459,9 @@ ; RV64-LABEL: buildvec_seq_v16i8_v2i64: ; RV64: # %bb.0: ; RV64-NEXT: lui a1, %hi(.LCPI24_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI24_0) +; RV64-NEXT: ld a1, %lo(.LCPI24_0)(a1) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vlse64.v v8, (a1), zero +; RV64-NEXT: vmv.v.x v8, a1 ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-vrgather.ll @@ -7,9 +7,9 @@ define void @gather_const_v16i8(<16 x i8>* %x) { ; CHECK-LABEL: gather_const_v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 12 +; CHECK-NEXT: lb a1, 12(a0) ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vlse8.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vse8.v v8, (a0) ; CHECK-NEXT: ret %a = load <16 x i8>, <16 x i8>* %x @@ -23,9 +23,9 @@ define void @gather_const_v8i16(<8 x i16>* %x) { ; CHECK-LABEL: gather_const_v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 10 +; CHECK-NEXT: lh a1, 10(a0) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vse16.v v8, (a0) ; CHECK-NEXT: ret %a = load <8 x i16>, <8 x i16>* %x @@ -39,9 +39,9 @@ define void @gather_const_v4i32(<4 x i32>* %x) { ; CHECK-LABEL: gather_const_v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 12 +; CHECK-NEXT: lw a1, 12(a0) ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 ; CHECK-NEXT: vse32.v v8, (a0) ; CHECK-NEXT: ret %a = load 
<4 x i32>, <4 x i32>* %x @@ -53,13 +53,6 @@ } define void @gather_const_v2i64(<2 x i64>* %x) { -; CHECK-LABEL: gather_const_v2i64: -; CHECK: # %bb.0: -; CHECK-NEXT: addi a1, a0, 8 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v8, (a1), zero -; CHECK-NEXT: vse64.v v8, (a0) -; CHECK-NEXT: ret %a = load <2 x i64>, <2 x i64>* %x %b = extractelement <2 x i64> %a, i32 1 %c = insertelement <2 x i64> poison, i64 %b, i32 0 @@ -71,24 +64,25 @@ define void @gather_const_v64i8(<64 x i8>* %x) { ; LMULMAX4-LABEL: gather_const_v64i8: ; LMULMAX4: # %bb.0: -; LMULMAX4-NEXT: li a1, 64 -; LMULMAX4-NEXT: addi a2, a0, 32 -; LMULMAX4-NEXT: vsetvli zero, a1, e8, m4, ta, ma -; LMULMAX4-NEXT: vlse8.v v8, (a2), zero +; LMULMAX4-NEXT: lb a1, 32(a0) +; LMULMAX4-NEXT: li a2, 64 +; LMULMAX4-NEXT: vsetvli zero, a2, e8, m4, ta, ma +; LMULMAX4-NEXT: vmv.v.x v8, a1 ; LMULMAX4-NEXT: vse8.v v8, (a0) ; LMULMAX4-NEXT: ret ; ; LMULMAX1-LABEL: gather_const_v64i8: ; LMULMAX1: # %bb.0: ; LMULMAX1-NEXT: addi a1, a0, 32 +; LMULMAX1-NEXT: lb a2, 32(a0) +; LMULMAX1-NEXT: addi a3, a0, 16 +; LMULMAX1-NEXT: addi a4, a0, 48 ; LMULMAX1-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; LMULMAX1-NEXT: vlse8.v v8, (a1), zero -; LMULMAX1-NEXT: addi a2, a0, 16 -; LMULMAX1-NEXT: addi a3, a0, 48 +; LMULMAX1-NEXT: vmv.v.x v8, a2 ; LMULMAX1-NEXT: vse8.v v8, (a1) -; LMULMAX1-NEXT: vse8.v v8, (a3) +; LMULMAX1-NEXT: vse8.v v8, (a4) ; LMULMAX1-NEXT: vse8.v v8, (a0) -; LMULMAX1-NEXT: vse8.v v8, (a2) +; LMULMAX1-NEXT: vse8.v v8, (a3) ; LMULMAX1-NEXT: ret %a = load <64 x i8>, <64 x i8>* %x %b = extractelement <64 x i8> %a, i32 32 @@ -101,25 +95,25 @@ define void @gather_const_v16i16(<32 x i16>* %x) { ; LMULMAX4-LABEL: gather_const_v16i16: ; LMULMAX4: # %bb.0: -; LMULMAX4-NEXT: li a1, 32 -; LMULMAX4-NEXT: addi a2, a0, 50 -; LMULMAX4-NEXT: vsetvli zero, a1, e16, m4, ta, ma -; LMULMAX4-NEXT: vlse16.v v8, (a2), zero +; LMULMAX4-NEXT: lh a1, 50(a0) +; LMULMAX4-NEXT: li a2, 32 +; LMULMAX4-NEXT: vsetvli zero, a2, e16, m4, ta, ma +; LMULMAX4-NEXT: vmv.v.x v8, a1 ; LMULMAX4-NEXT: vse16.v v8, (a0) ; LMULMAX4-NEXT: ret ; ; LMULMAX1-LABEL: gather_const_v16i16: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi a1, a0, 50 -; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; LMULMAX1-NEXT: vlse16.v v8, (a1), zero ; LMULMAX1-NEXT: addi a1, a0, 48 -; LMULMAX1-NEXT: addi a2, a0, 16 -; LMULMAX1-NEXT: addi a3, a0, 32 -; LMULMAX1-NEXT: vse16.v v8, (a3) +; LMULMAX1-NEXT: lh a2, 50(a0) +; LMULMAX1-NEXT: addi a3, a0, 16 +; LMULMAX1-NEXT: addi a4, a0, 32 +; LMULMAX1-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; LMULMAX1-NEXT: vmv.v.x v8, a2 +; LMULMAX1-NEXT: vse16.v v8, (a4) ; LMULMAX1-NEXT: vse16.v v8, (a1) ; LMULMAX1-NEXT: vse16.v v8, (a0) -; LMULMAX1-NEXT: vse16.v v8, (a2) +; LMULMAX1-NEXT: vse16.v v8, (a3) ; LMULMAX1-NEXT: ret %a = load <32 x i16>, <32 x i16>* %x %b = extractelement <32 x i16> %a, i32 25 @@ -132,24 +126,24 @@ define void @gather_const_v16i32(<16 x i32>* %x) { ; LMULMAX4-LABEL: gather_const_v16i32: ; LMULMAX4: # %bb.0: -; LMULMAX4-NEXT: addi a1, a0, 36 +; LMULMAX4-NEXT: lw a1, 36(a0) ; LMULMAX4-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; LMULMAX4-NEXT: vlse32.v v8, (a1), zero +; LMULMAX4-NEXT: vmv.v.x v8, a1 ; LMULMAX4-NEXT: vse32.v v8, (a0) ; LMULMAX4-NEXT: ret ; ; LMULMAX1-LABEL: gather_const_v16i32: ; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi a1, a0, 36 -; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; LMULMAX1-NEXT: vlse32.v v8, (a1), zero ; LMULMAX1-NEXT: addi a1, a0, 32 -; LMULMAX1-NEXT: addi a2, a0, 16 -; LMULMAX1-NEXT: addi a3, a0, 48 
+; LMULMAX1-NEXT: lw a2, 36(a0) +; LMULMAX1-NEXT: addi a3, a0, 16 +; LMULMAX1-NEXT: addi a4, a0, 48 +; LMULMAX1-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; LMULMAX1-NEXT: vmv.v.x v8, a2 ; LMULMAX1-NEXT: vse32.v v8, (a1) -; LMULMAX1-NEXT: vse32.v v8, (a3) +; LMULMAX1-NEXT: vse32.v v8, (a4) ; LMULMAX1-NEXT: vse32.v v8, (a0) -; LMULMAX1-NEXT: vse32.v v8, (a2) +; LMULMAX1-NEXT: vse32.v v8, (a3) ; LMULMAX1-NEXT: ret %a = load <16 x i32>, <16 x i32>* %x %b = extractelement <16 x i32> %a, i32 9 @@ -160,27 +154,6 @@ } define void @gather_const_v8i64(<8 x i64>* %x) { -; LMULMAX4-LABEL: gather_const_v8i64: -; LMULMAX4: # %bb.0: -; LMULMAX4-NEXT: addi a1, a0, 24 -; LMULMAX4-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; LMULMAX4-NEXT: vlse64.v v8, (a1), zero -; LMULMAX4-NEXT: vse64.v v8, (a0) -; LMULMAX4-NEXT: ret -; -; LMULMAX1-LABEL: gather_const_v8i64: -; LMULMAX1: # %bb.0: -; LMULMAX1-NEXT: addi a1, a0, 24 -; LMULMAX1-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; LMULMAX1-NEXT: vlse64.v v8, (a1), zero -; LMULMAX1-NEXT: addi a1, a0, 16 -; LMULMAX1-NEXT: addi a2, a0, 48 -; LMULMAX1-NEXT: addi a3, a0, 32 -; LMULMAX1-NEXT: vse64.v v8, (a3) -; LMULMAX1-NEXT: vse64.v v8, (a2) -; LMULMAX1-NEXT: vse64.v v8, (a0) -; LMULMAX1-NEXT: vse64.v v8, (a1) -; LMULMAX1-NEXT: ret %a = load <8 x i64>, <8 x i64>* %x %b = extractelement <8 x i64> %a, i32 3 %c = insertelement <8 x i64> poison, i64 %b, i32 0 @@ -192,9 +165,9 @@ define void @splat_concat_low(<4 x i16>* %x, <4 x i16>* %y, <8 x i16>* %z) { ; CHECK-LABEL: splat_concat_low: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, a0, 2 +; CHECK-NEXT: lh a0, 2(a0) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v8, (a0), zero +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vse16.v v8, (a2) ; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x @@ -208,9 +181,9 @@ define void @splat_concat_high(<4 x i16>* %x, <4 x i16>* %y, <8 x i16>* %z) { ; CHECK-LABEL: splat_concat_high: ; CHECK: # %bb.0: -; CHECK-NEXT: addi a0, a1, 2 +; CHECK-NEXT: lh a0, 2(a1) ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v8, (a0), zero +; CHECK-NEXT: vmv.v.x v8, a0 ; CHECK-NEXT: vse16.v v8, (a2) ; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int.ll @@ -1032,15 +1032,15 @@ ; ; RV64-LABEL: mulhu_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: lui a1, %hi(.LCPI55_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI55_0) -; RV64-NEXT: vlse64.v v8, (a1), zero -; RV64-NEXT: lui a1, %hi(.LCPI55_1) -; RV64-NEXT: ld a1, %lo(.LCPI55_1)(a1) +; RV64-NEXT: ld a1, %lo(.LCPI55_0)(a1) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: lui a2, %hi(.LCPI55_1) +; RV64-NEXT: ld a2, %lo(.LCPI55_1)(a2) +; RV64-NEXT: vmv.v.x v8, a1 ; RV64-NEXT: vle64.v v9, (a0) ; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma -; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vmv.s.x v8, a2 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; RV64-NEXT: vmulhu.vv v8, v9, v8 ; RV64-NEXT: vid.v v9 @@ -1166,12 +1166,12 @@ ; ; RV64-LABEL: mulhs_v4i32: ; RV64: # %bb.0: +; RV64-NEXT: lui a1, %hi(.LCPI58_0) +; RV64-NEXT: ld a1, %lo(.LCPI58_0)(a1) ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vle32.v v8, (a0) -; RV64-NEXT: lui a1, %hi(.LCPI58_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI58_0) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; RV64-NEXT: vlse64.v v9, 
(a1), zero +; RV64-NEXT: vmv.v.x v9, a1 ; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; RV64-NEXT: vmulh.vv v8, v8, v9 ; RV64-NEXT: vsra.vi v8, v8, 1 @@ -1221,15 +1221,15 @@ ; ; RV64-LABEL: mulhs_v2i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: lui a1, %hi(.LCPI59_0) -; RV64-NEXT: addi a1, a1, %lo(.LCPI59_0) -; RV64-NEXT: vlse64.v v8, (a1), zero -; RV64-NEXT: lui a1, %hi(.LCPI59_1) -; RV64-NEXT: ld a1, %lo(.LCPI59_1)(a1) +; RV64-NEXT: ld a1, %lo(.LCPI59_0)(a1) +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: lui a2, %hi(.LCPI59_1) +; RV64-NEXT: ld a2, %lo(.LCPI59_1)(a2) +; RV64-NEXT: vmv.v.x v8, a1 ; RV64-NEXT: vle64.v v9, (a0) ; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma -; RV64-NEXT: vmv.s.x v8, a1 +; RV64-NEXT: vmv.s.x v8, a2 ; RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; RV64-NEXT: vmulh.vv v8, v9, v8 ; RV64-NEXT: vid.v v10 @@ -4730,22 +4730,22 @@ ; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma ; LMULMAX1-RV64-NEXT: vmv.s.x v10, a2 ; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI156_0) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI156_0) +; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI156_0)(a2) +; LMULMAX1-RV64-NEXT: lui a3, %hi(.LCPI156_1) +; LMULMAX1-RV64-NEXT: ld a3, %lo(.LCPI156_1)(a3) ; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma -; LMULMAX1-RV64-NEXT: vlse64.v v11, (a2), zero -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI156_1) -; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI156_1)(a2) +; LMULMAX1-RV64-NEXT: vmv.v.x v11, a2 ; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma -; LMULMAX1-RV64-NEXT: vmv.s.x v11, a2 +; LMULMAX1-RV64-NEXT: vmv.s.x v11, a3 ; LMULMAX1-RV64-NEXT: vsetvli zero, zero, e64, m1, ta, ma ; LMULMAX1-RV64-NEXT: vmulhu.vv v11, v9, v11 ; LMULMAX1-RV64-NEXT: vsub.vv v9, v9, v11 +; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI156_2) +; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI156_2)(a2) ; LMULMAX1-RV64-NEXT: vmulhu.vv v9, v9, v10 ; LMULMAX1-RV64-NEXT: vadd.vv v9, v9, v11 ; LMULMAX1-RV64-NEXT: vid.v v10 -; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI156_2) -; LMULMAX1-RV64-NEXT: addi a2, a2, %lo(.LCPI156_2) -; LMULMAX1-RV64-NEXT: vlse64.v v11, (a2), zero +; LMULMAX1-RV64-NEXT: vmv.v.x v11, a2 ; LMULMAX1-RV64-NEXT: lui a2, %hi(.LCPI156_3) ; LMULMAX1-RV64-NEXT: ld a2, %lo(.LCPI156_3)(a2) ; LMULMAX1-RV64-NEXT: vadd.vi v12, v10, 2 @@ -4935,12 +4935,12 @@ ; ; LMULMAX2-RV64-LABEL: mulhs_v8i32: ; LMULMAX2-RV64: # %bb.0: +; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI159_0) +; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI159_0)(a1) ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; LMULMAX2-RV64-NEXT: vle32.v v8, (a0) -; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI159_0) -; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI159_0) ; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV64-NEXT: vlse64.v v10, (a1), zero +; LMULMAX2-RV64-NEXT: vmv.v.x v10, a1 ; LMULMAX2-RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma ; LMULMAX2-RV64-NEXT: vmulh.vv v8, v8, v10 ; LMULMAX2-RV64-NEXT: vsra.vi v8, v8, 1 @@ -5035,12 +5035,12 @@ ; ; LMULMAX2-RV64-LABEL: mulhs_v4i64: ; LMULMAX2-RV64: # %bb.0: -; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; LMULMAX2-RV64-NEXT: li a1, 5 -; LMULMAX2-RV64-NEXT: vmv.s.x v0, a1 ; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI160_0) -; LMULMAX2-RV64-NEXT: addi a1, a1, %lo(.LCPI160_0) -; LMULMAX2-RV64-NEXT: vlse64.v v8, (a1), zero +; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI160_0)(a1) +; LMULMAX2-RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; LMULMAX2-RV64-NEXT: li a2, 5 +; LMULMAX2-RV64-NEXT: vmv.s.x v0, a2 +; LMULMAX2-RV64-NEXT: vmv.v.x 
v8, a1 ; LMULMAX2-RV64-NEXT: lui a1, %hi(.LCPI160_1) ; LMULMAX2-RV64-NEXT: ld a1, %lo(.LCPI160_1)(a1) ; LMULMAX2-RV64-NEXT: vle64.v v10, (a0) @@ -5077,11 +5077,11 @@ ; ; LMULMAX1-RV64-LABEL: mulhs_v4i64: ; LMULMAX1-RV64: # %bb.0: +; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI160_0) +; LMULMAX1-RV64-NEXT: ld a1, %lo(.LCPI160_0)(a1) ; LMULMAX1-RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; LMULMAX1-RV64-NEXT: vle64.v v8, (a0) -; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI160_0) -; LMULMAX1-RV64-NEXT: addi a1, a1, %lo(.LCPI160_0) -; LMULMAX1-RV64-NEXT: vlse64.v v9, (a1), zero +; LMULMAX1-RV64-NEXT: vmv.v.x v9, a1 ; LMULMAX1-RV64-NEXT: lui a1, %hi(.LCPI160_1) ; LMULMAX1-RV64-NEXT: ld a1, %lo(.LCPI160_1)(a1) ; LMULMAX1-RV64-NEXT: addi a2, a0, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-mask-buildvec.ll @@ -674,9 +674,9 @@ ; RV64-LMULMAX4-LABEL: buildvec_mask_v64i1: ; RV64-LMULMAX4: # %bb.0: ; RV64-LMULMAX4-NEXT: lui a0, %hi(.LCPI19_0) -; RV64-LMULMAX4-NEXT: addi a0, a0, %lo(.LCPI19_0) +; RV64-LMULMAX4-NEXT: ld a0, %lo(.LCPI19_0)(a0) ; RV64-LMULMAX4-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-LMULMAX4-NEXT: vlse64.v v0, (a0), zero +; RV64-LMULMAX4-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX4-NEXT: ret ; ; RV32-LMULMAX8-LABEL: buildvec_mask_v64i1: @@ -695,9 +695,9 @@ ; RV64-LMULMAX8-LABEL: buildvec_mask_v64i1: ; RV64-LMULMAX8: # %bb.0: ; RV64-LMULMAX8-NEXT: lui a0, %hi(.LCPI19_0) -; RV64-LMULMAX8-NEXT: addi a0, a0, %lo(.LCPI19_0) +; RV64-LMULMAX8-NEXT: ld a0, %lo(.LCPI19_0)(a0) ; RV64-LMULMAX8-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-LMULMAX8-NEXT: vlse64.v v0, (a0), zero +; RV64-LMULMAX8-NEXT: vmv.s.x v0, a0 ; RV64-LMULMAX8-NEXT: ret ret <64 x i1> } @@ -804,12 +804,12 @@ ; RV64-LMULMAX4-LABEL: buildvec_mask_v128i1: ; RV64-LMULMAX4: # %bb.0: ; RV64-LMULMAX4-NEXT: lui a0, %hi(.LCPI20_0) -; RV64-LMULMAX4-NEXT: addi a0, a0, %lo(.LCPI20_0) +; RV64-LMULMAX4-NEXT: ld a0, %lo(.LCPI20_0)(a0) +; RV64-LMULMAX4-NEXT: lui a1, %hi(.LCPI20_1) +; RV64-LMULMAX4-NEXT: ld a1, %lo(.LCPI20_1)(a1) ; RV64-LMULMAX4-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-LMULMAX4-NEXT: vlse64.v v0, (a0), zero -; RV64-LMULMAX4-NEXT: lui a0, %hi(.LCPI20_1) -; RV64-LMULMAX4-NEXT: addi a0, a0, %lo(.LCPI20_1) -; RV64-LMULMAX4-NEXT: vlse64.v v8, (a0), zero +; RV64-LMULMAX4-NEXT: vmv.s.x v0, a0 +; RV64-LMULMAX4-NEXT: vmv.s.x v8, a1 ; RV64-LMULMAX4-NEXT: ret ; ; RV32-LMULMAX8-LABEL: buildvec_mask_v128i1: @@ -944,12 +944,12 @@ ; RV64-LMULMAX4-LABEL: buildvec_mask_optsize_v128i1: ; RV64-LMULMAX4: # %bb.0: ; RV64-LMULMAX4-NEXT: lui a0, %hi(.LCPI21_0) -; RV64-LMULMAX4-NEXT: addi a0, a0, %lo(.LCPI21_0) +; RV64-LMULMAX4-NEXT: ld a0, %lo(.LCPI21_0)(a0) +; RV64-LMULMAX4-NEXT: lui a1, %hi(.LCPI21_1) +; RV64-LMULMAX4-NEXT: ld a1, %lo(.LCPI21_1)(a1) ; RV64-LMULMAX4-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; RV64-LMULMAX4-NEXT: vlse64.v v0, (a0), zero -; RV64-LMULMAX4-NEXT: lui a0, %hi(.LCPI21_1) -; RV64-LMULMAX4-NEXT: addi a0, a0, %lo(.LCPI21_1) -; RV64-LMULMAX4-NEXT: vlse64.v v8, (a0), zero +; RV64-LMULMAX4-NEXT: vmv.s.x v0, a0 +; RV64-LMULMAX4-NEXT: vmv.s.x v8, a1 ; RV64-LMULMAX4-NEXT: ret ; ; RV32-LMULMAX8-LABEL: buildvec_mask_optsize_v128i1: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-gather.ll @@ -41,8 +41,9 @@ ; RV64ZVE32F-NEXT: andi a1, a1, 1 ; RV64ZVE32F-NEXT: beqz a1, .LBB0_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lb a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e8, mf4, ta, ma -; RV64ZVE32F-NEXT: vlse8.v v8, (a0), zero +; RV64ZVE32F-NEXT: vmv.v.x v8, a0 ; RV64ZVE32F-NEXT: .LBB0_2: # %else ; RV64ZVE32F-NEXT: ret %v = call <1 x i8> @llvm.masked.gather.v1i8.v1p0i8(<1 x i8*> %ptrs, i32 1, <1 x i1> %m, <1 x i8> %passthru) @@ -891,8 +892,9 @@ ; RV64ZVE32F-NEXT: andi a1, a1, 1 ; RV64ZVE32F-NEXT: beqz a1, .LBB13_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lh a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero +; RV64ZVE32F-NEXT: vmv.v.x v8, a0 ; RV64ZVE32F-NEXT: .LBB13_2: # %else ; RV64ZVE32F-NEXT: ret %v = call <1 x i16> @llvm.masked.gather.v1i16.v1p0i16(<1 x i16*> %ptrs, i32 2, <1 x i1> %m, <1 x i16> %passthru) @@ -2109,8 +2111,9 @@ ; RV64ZVE32F-NEXT: andi a1, a1, 1 ; RV64ZVE32F-NEXT: beqz a1, .LBB27_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: lw a0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero +; RV64ZVE32F-NEXT: vmv.v.x v8, a0 ; RV64ZVE32F-NEXT: .LBB27_2: # %else ; RV64ZVE32F-NEXT: ret %v = call <1 x i32> @llvm.masked.gather.v1i32.v1p0i32(<1 x i32*> %ptrs, i32 4, <1 x i1> %m, <1 x i32> %passthru) @@ -7198,8 +7201,9 @@ ; RV64ZVE32F-NEXT: andi a1, a1, 1 ; RV64ZVE32F-NEXT: beqz a1, .LBB58_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: flh ft0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e16, mf2, ta, ma -; RV64ZVE32F-NEXT: vlse16.v v8, (a0), zero +; RV64ZVE32F-NEXT: vfmv.v.f v8, ft0 ; RV64ZVE32F-NEXT: .LBB58_2: # %else ; RV64ZVE32F-NEXT: ret %v = call <1 x half> @llvm.masked.gather.v1f16.v1p0f16(<1 x half*> %ptrs, i32 2, <1 x i1> %m, <1 x half> %passthru) @@ -8175,8 +8179,9 @@ ; RV64ZVE32F-NEXT: andi a1, a1, 1 ; RV64ZVE32F-NEXT: beqz a1, .LBB68_2 ; RV64ZVE32F-NEXT: # %bb.1: # %cond.load +; RV64ZVE32F-NEXT: flw ft0, 0(a0) ; RV64ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; RV64ZVE32F-NEXT: vlse32.v v8, (a0), zero +; RV64ZVE32F-NEXT: vfmv.v.f v8, ft0 ; RV64ZVE32F-NEXT: .LBB68_2: # %else ; RV64ZVE32F-NEXT: ret %v = call <1 x float> @llvm.masked.gather.v1f32.v1p0f32(<1 x float*> %ptrs, i32 4, <1 x i1> %m, <1 x float> %passthru) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-scatter.ll @@ -1714,12 +1714,12 @@ ; ; RV32ZVE32F-LABEL: mscatter_v2i64_truncstore_v2i32: ; RV32ZVE32F: # %bb.0: -; RV32ZVE32F-NEXT: lw a1, 0(a0) -; RV32ZVE32F-NEXT: addi a0, a0, 8 +; RV32ZVE32F-NEXT: lw a1, 8(a0) +; RV32ZVE32F-NEXT: lw a0, 0(a0) ; RV32ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, ma -; RV32ZVE32F-NEXT: vlse32.v v9, (a0), zero +; RV32ZVE32F-NEXT: vmv.v.x v9, a1 ; RV32ZVE32F-NEXT: vsetvli zero, zero, e32, m1, tu, ma -; RV32ZVE32F-NEXT: vmv.s.x v9, a1 +; RV32ZVE32F-NEXT: vmv.s.x v9, a0 ; RV32ZVE32F-NEXT: vsoxei32.v v9, (zero), v8, v0.t ; RV32ZVE32F-NEXT: ret ; diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-reduction-fp.ll @@ -1216,18 +1216,6 @@ declare 
half @llvm.vector.reduce.fmin.v2f16(<2 x half>) define half @vreduce_fmin_v2f16(<2 x half>* %x) { -; CHECK-LABEL: vreduce_fmin_v2f16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI68_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI68_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <2 x half>, <2 x half>* %x %red = call half @llvm.vector.reduce.fmin.v2f16(<2 x half> %v) ret half %red @@ -1236,54 +1224,18 @@ declare half @llvm.vector.reduce.fmin.v4f16(<4 x half>) define half @vreduce_fmin_v4f16(<4 x half>* %x) { -; CHECK-LABEL: vreduce_fmin_v4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI69_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI69_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call half @llvm.vector.reduce.fmin.v4f16(<4 x half> %v) ret half %red } define half @vreduce_fmin_v4f16_nonans(<4 x half>* %x) { -; CHECK-LABEL: vreduce_fmin_v4f16_nonans: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI70_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI70_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call nnan half @llvm.vector.reduce.fmin.v4f16(<4 x half> %v) ret half %red } define half @vreduce_fmin_v4f16_nonans_noinfs(<4 x half>* %x) { -; CHECK-LABEL: vreduce_fmin_v4f16_nonans_noinfs: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI71_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI71_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call nnan ninf half @llvm.vector.reduce.fmin.v4f16(<4 x half> %v) ret half %red @@ -1299,11 +1251,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: lui a0, %hi(.LCPI72_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI72_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI72_0)(a0) +; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v16, (a0), zero +; CHECK-NEXT: vfmv.s.f v16, ft0 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -1316,18 +1268,6 @@ declare float @llvm.vector.reduce.fmin.v2f32(<2 x float>) define float @vreduce_fmin_v2f32(<2 x float>* %x) { -; CHECK-LABEL: vreduce_fmin_v2f32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI73_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI73_0) -; CHECK-NEXT: vsetivli 
zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <2 x float>, <2 x float>* %x %red = call float @llvm.vector.reduce.fmin.v2f32(<2 x float> %v) ret float %red @@ -1336,54 +1276,18 @@ declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) define float @vreduce_fmin_v4f32(<4 x float>* %x) { -; CHECK-LABEL: vreduce_fmin_v4f32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI74_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI74_0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) ret float %red } define float @vreduce_fmin_v4f32_nonans(<4 x float>* %x) { -; CHECK-LABEL: vreduce_fmin_v4f32_nonans: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI75_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI75_0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) ret float %red } define float @vreduce_fmin_v4f32_nonans_noinfs(<4 x float>* %x) { -; CHECK-LABEL: vreduce_fmin_v4f32_nonans_noinfs: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI76_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI76_0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call nnan ninf float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v) ret float %red @@ -1396,20 +1300,20 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: vle32.v v8, (a2) +; CHECK-NEXT: addi a2, a0, 128 ; CHECK-NEXT: vle32.v v16, (a2) -; CHECK-NEXT: addi a2, a0, 256 -; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vle32.v v0, (a2) -; CHECK-NEXT: vfmin.vv v16, v24, v16 -; CHECK-NEXT: vfmin.vv v8, v8, v0 -; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: addi a0, a0, 256 +; CHECK-NEXT: vle32.v v0, (a0) +; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI77_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI77_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI77_0)(a0) +; CHECK-NEXT: vfmin.vv v16, v24, v0 +; CHECK-NEXT: vfmin.vv v8, v16, v8 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v16, (a0), zero +; CHECK-NEXT: vfmv.s.f v16, ft0 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -1422,18 +1326,6 @@ declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) define double @vreduce_fmin_v2f64(<2 x double>* %x) { -; CHECK-LABEL: vreduce_fmin_v2f64: -; CHECK: # %bb.0: -; 
CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI78_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI78_0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <2 x double>, <2 x double>* %x %red = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> %v) ret double %red @@ -1442,54 +1334,18 @@ declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) define double @vreduce_fmin_v4f64(<4 x double>* %x) { -; CHECK-LABEL: vreduce_fmin_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI79_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI79_0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> %v) ret double %red } define double @vreduce_fmin_v4f64_nonans(<4 x double>* %x) { -; CHECK-LABEL: vreduce_fmin_v4f64_nonans: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI80_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI80_0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call nnan double @llvm.vector.reduce.fmin.v4f64(<4 x double> %v) ret double %red } define double @vreduce_fmin_v4f64_nonans_noinfs(<4 x double>* %x) { -; CHECK-LABEL: vreduce_fmin_v4f64_nonans_noinfs: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI81_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI81_0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call nnan ninf double @llvm.vector.reduce.fmin.v4f64(<4 x double> %v) ret double %red @@ -1498,21 +1354,6 @@ declare double @llvm.vector.reduce.fmin.v32f64(<32 x double>) define double @vreduce_fmin_v32f64(<32 x double>* %x) { -; CHECK-LABEL: vreduce_fmin_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vfmin.vv v8, v8, v16 -; CHECK-NEXT: lui a0, %hi(.LCPI82_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI82_0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v16, (a0), zero -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vfredmin.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <32 x double>, <32 x double>* %x %red = call double @llvm.vector.reduce.fmin.v32f64(<32 x double> %v) ret double %red @@ -1521,18 +1362,6 @@ declare half @llvm.vector.reduce.fmax.v2f16(<2 x half>) define half @vreduce_fmax_v2f16(<2 x half>* %x) { -; CHECK-LABEL: vreduce_fmax_v2f16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 
2, e16, mf4, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI83_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI83_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <2 x half>, <2 x half>* %x %red = call half @llvm.vector.reduce.fmax.v2f16(<2 x half> %v) ret half %red @@ -1541,54 +1370,18 @@ declare half @llvm.vector.reduce.fmax.v4f16(<4 x half>) define half @vreduce_fmax_v4f16(<4 x half>* %x) { -; CHECK-LABEL: vreduce_fmax_v4f16: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI84_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI84_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call half @llvm.vector.reduce.fmax.v4f16(<4 x half> %v) ret half %red } define half @vreduce_fmax_v4f16_nonans(<4 x half>* %x) { -; CHECK-LABEL: vreduce_fmax_v4f16_nonans: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI85_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI85_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call nnan half @llvm.vector.reduce.fmax.v4f16(<4 x half> %v) ret half %red } define half @vreduce_fmax_v4f16_nonans_noinfs(<4 x half>* %x) { -; CHECK-LABEL: vreduce_fmax_v4f16_nonans_noinfs: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI86_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI86_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x half>, <4 x half>* %x %red = call nnan ninf half @llvm.vector.reduce.fmax.v4f16(<4 x half> %v) ret half %red @@ -1604,11 +1397,11 @@ ; CHECK-NEXT: vle16.v v8, (a0) ; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle16.v v16, (a0) -; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: lui a0, %hi(.LCPI87_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI87_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI87_0)(a0) +; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v16, (a0), zero +; CHECK-NEXT: vfmv.s.f v16, ft0 ; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -1621,18 +1414,6 @@ declare float @llvm.vector.reduce.fmax.v2f32(<2 x float>) define float @vreduce_fmax_v2f32(<2 x float>* %x) { -; CHECK-LABEL: vreduce_fmax_v2f32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI88_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI88_0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; 
CHECK-NEXT: ret %v = load <2 x float>, <2 x float>* %x %red = call float @llvm.vector.reduce.fmax.v2f32(<2 x float> %v) ret float %red @@ -1641,54 +1422,18 @@ declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) define float @vreduce_fmax_v4f32(<4 x float>* %x) { -; CHECK-LABEL: vreduce_fmax_v4f32: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI89_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI89_0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v) ret float %red } define float @vreduce_fmax_v4f32_nonans(<4 x float>* %x) { -; CHECK-LABEL: vreduce_fmax_v4f32_nonans: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI90_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI90_0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call nnan float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v) ret float %red } define float @vreduce_fmax_v4f32_nonans_noinfs(<4 x float>* %x) { -; CHECK-LABEL: vreduce_fmax_v4f32_nonans_noinfs: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI91_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI91_0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x float>, <4 x float>* %x %red = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v) ret float %red @@ -1701,20 +1446,20 @@ ; CHECK: # %bb.0: ; CHECK-NEXT: li a1, 32 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: addi a2, a0, 384 +; CHECK-NEXT: vle32.v v8, (a2) +; CHECK-NEXT: addi a2, a0, 128 ; CHECK-NEXT: vle32.v v16, (a2) -; CHECK-NEXT: addi a2, a0, 256 -; CHECK-NEXT: addi a0, a0, 128 ; CHECK-NEXT: vle32.v v24, (a0) -; CHECK-NEXT: vle32.v v0, (a2) -; CHECK-NEXT: vfmax.vv v16, v24, v16 -; CHECK-NEXT: vfmax.vv v8, v8, v0 -; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: addi a0, a0, 256 +; CHECK-NEXT: vle32.v v0, (a0) +; CHECK-NEXT: vfmax.vv v8, v16, v8 ; CHECK-NEXT: lui a0, %hi(.LCPI92_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI92_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI92_0)(a0) +; CHECK-NEXT: vfmax.vv v16, v24, v0 +; CHECK-NEXT: vfmax.vv v8, v16, v8 ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v16, (a0), zero +; CHECK-NEXT: vfmv.s.f v16, ft0 ; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -1727,18 +1472,6 @@ declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) define double @vreduce_fmax_v2f64(<2 x double>* %x) { -; CHECK-LABEL: vreduce_fmax_v2f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI93_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI93_0) -; CHECK-NEXT: vsetivli zero, 1, 
e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v9, (a0), zero -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v9 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <2 x double>, <2 x double>* %x %red = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> %v) ret double %red @@ -1747,54 +1480,18 @@ declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) define double @vreduce_fmax_v4f64(<4 x double>* %x) { -; CHECK-LABEL: vreduce_fmax_v4f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI94_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI94_0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> %v) ret double %red } define double @vreduce_fmax_v4f64_nonans(<4 x double>* %x) { -; CHECK-LABEL: vreduce_fmax_v4f64_nonans: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI95_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI95_0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call nnan double @llvm.vector.reduce.fmax.v4f64(<4 x double> %v) ret double %red } define double @vreduce_fmax_v4f64_nonans_noinfs(<4 x double>* %x) { -; CHECK-LABEL: vreduce_fmax_v4f64_nonans_noinfs: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a0, %hi(.LCPI96_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI96_0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v10, (a0), zero -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v10 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <4 x double>, <4 x double>* %x %red = call nnan ninf double @llvm.vector.reduce.fmax.v4f64(<4 x double> %v) ret double %red @@ -1803,21 +1500,6 @@ declare double @llvm.vector.reduce.fmax.v32f64(<32 x double>) define double @vreduce_fmax_v32f64(<32 x double>* %x) { -; CHECK-LABEL: vreduce_fmax_v32f64: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: addi a0, a0, 128 -; CHECK-NEXT: vle64.v v16, (a0) -; CHECK-NEXT: vfmax.vv v8, v8, v16 -; CHECK-NEXT: lui a0, %hi(.LCPI97_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI97_0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v16, (a0), zero -; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma -; CHECK-NEXT: vfredmax.vs v8, v8, v16 -; CHECK-NEXT: vfmv.f.s fa0, v8 -; CHECK-NEXT: ret %v = load <32 x double>, <32 x double>* %x %red = call double @llvm.vector.reduce.fmax.v32f64(<32 x double> %v) ret double %red diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwadd.ll @@ -665,9 +665,11 @@ define <8 x i16> @vwadd_vx_v8i16_i16(<8 x i8>* %x, i16* %y) { ; CHECK-LABEL: vwadd_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, 
mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwadd.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x @@ -718,9 +720,11 @@ define <4 x i32> @vwadd_vx_v4i32_i32(<4 x i16>* %x, i32* %y) { ; CHECK-LABEL: vwadd_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwadd.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x @@ -849,9 +853,11 @@ ; ; RV64-LABEL: vwadd_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse64.v v8, (a1), zero +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwadd.wv v8, v8, v9 ; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwaddu.ll @@ -665,9 +665,11 @@ define <8 x i16> @vwaddu_vx_v8i16_i16(<8 x i8>* %x, i16* %y) { ; CHECK-LABEL: vwaddu_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwaddu.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x @@ -718,9 +720,11 @@ define <4 x i32> @vwaddu_vx_v4i32_i32(<4 x i16>* %x, i32* %y) { ; CHECK-LABEL: vwaddu_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwaddu.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x @@ -846,9 +850,11 @@ ; ; RV64-LABEL: vwaddu_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse64.v v8, (a1), zero +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwaddu.wv v8, v8, v9 ; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwmulsu.ll @@ -702,8 +702,9 @@ ; CHECK-LABEL: vwmulsu_vx_v8i16_i8_swap: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: lb a1, 0(a1) ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse8.v v10, (a1), zero +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vwmulsu.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x diff --git 
a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsub.ll @@ -648,8 +648,9 @@ ; CHECK-LABEL: vwsub_vx_v8i16_i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: lb a1, 0(a1) ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse8.v v10, (a1), zero +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vwsub.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x @@ -665,9 +666,11 @@ define <8 x i16> @vwsub_vx_v8i16_i16(<8 x i8>* %x, i16* %y) { ; CHECK-LABEL: vwsub_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse16.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwsub.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x @@ -702,8 +705,9 @@ ; CHECK-LABEL: vwsub_vx_v4i32_i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse16.v v10, (a1), zero +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vwsub.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x @@ -719,9 +723,11 @@ define <4 x i32> @vwsub_vx_v4i32_i32(<4 x i16>* %x, i32* %y) { ; CHECK-LABEL: vwsub_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwsub.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x @@ -820,8 +826,9 @@ ; RV64-LABEL: vwsub_vx_v2i64_i32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: lw a1, 0(a1) ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse32.v v10, (a1), zero +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vwsub.vv v8, v10, v9 ; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x @@ -852,9 +859,11 @@ ; ; RV64-LABEL: vwsub_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse64.v v8, (a1), zero +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwsub.wv v8, v8, v9 ; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vwsubu.ll @@ -648,8 +648,9 @@ ; CHECK-LABEL: vwsubu_vx_v8i16_i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: lbu a1, 0(a1) ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse8.v v10, (a1), zero +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vwsubu.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x @@ -665,9 +666,11 @@ define <8 x i16> @vwsubu_vx_v8i16_i16(<8 x i8>* %x, i16* %y) { ; CHECK-LABEL: vwsubu_vx_v8i16_i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: lh a1, 0(a1) ; CHECK-NEXT: vle8.v v9, (a0) -; CHECK-NEXT: vlse16.v v8, 
(a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma ; CHECK-NEXT: vwsubu.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <8 x i8>, <8 x i8>* %x @@ -702,8 +705,9 @@ ; CHECK-LABEL: vwsubu_vx_v4i32_i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: lhu a1, 0(a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse16.v v10, (a1), zero +; CHECK-NEXT: vmv.v.x v10, a1 ; CHECK-NEXT: vwsubu.vv v8, v10, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x @@ -719,9 +723,11 @@ define <4 x i32> @vwsubu_vx_v4i32_i32(<4 x i16>* %x, i32* %y) { ; CHECK-LABEL: vwsubu_vx_v4i32_i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: lw a1, 0(a1) ; CHECK-NEXT: vle16.v v9, (a0) -; CHECK-NEXT: vlse32.v v8, (a1), zero +; CHECK-NEXT: vmv.v.x v8, a1 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma ; CHECK-NEXT: vwsubu.wv v8, v8, v9 ; CHECK-NEXT: ret %a = load <4 x i16>, <4 x i16>* %x @@ -817,8 +823,9 @@ ; RV64-LABEL: vwsubu_vx_v2i64_i32: ; RV64: # %bb.0: ; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: lwu a1, 0(a1) ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse32.v v10, (a1), zero +; RV64-NEXT: vmv.v.x v10, a1 ; RV64-NEXT: vwsubu.vv v8, v10, v9 ; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x @@ -849,9 +856,11 @@ ; ; RV64-LABEL: vwsubu_vx_v2i64_i64: ; RV64: # %bb.0: -; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-NEXT: ld a1, 0(a1) ; RV64-NEXT: vle32.v v9, (a0) -; RV64-NEXT: vlse64.v v8, (a1), zero +; RV64-NEXT: vmv.v.x v8, a1 +; RV64-NEXT: vsetvli zero, zero, e32, mf2, ta, ma ; RV64-NEXT: vwsubu.wv v8, v8, v9 ; RV64-NEXT: ret %a = load <2 x i32>, <2 x i32>* %x diff --git a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll --- a/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vreductions-fp-sdnode.ll @@ -442,9 +442,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv1f16: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI30_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI30_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI30_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -457,9 +457,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv1f16_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI31_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI31_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI31_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -472,9 +472,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv1f16_nonans_noinfs: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI32_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI32_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI32_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -489,9 +489,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI33_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI33_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI33_0)(a0) 
; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -506,9 +506,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI34_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI34_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI34_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -522,13 +522,11 @@ define half @vreduce_fmin_nxv64f16( %v) { ; CHECK-LABEL: vreduce_fmin_nxv64f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: lui a0, %hi(.LCPI35_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI35_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v16, (a0), zero +; CHECK-NEXT: flh ft0, %lo(.LCPI35_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vfmv.s.f v16, ft0 ; CHECK-NEXT: vfredmin.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -542,9 +540,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI36_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI36_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI36_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -557,9 +555,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv1f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI37_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI37_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI37_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -572,9 +570,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv1f32_nonans_noinfs: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI38_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI38_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI38_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -589,9 +587,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI39_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI39_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI39_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -606,9 +604,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI40_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI40_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI40_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v10, (a0), zero +; CHECK-NEXT: vfmv.s.f v10, ft0 ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -622,13 +620,11 @@ define float @vreduce_fmin_nxv32f32( %v) { ; 
CHECK-LABEL: vreduce_fmin_nxv32f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: lui a0, %hi(.LCPI41_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI41_0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v16, (a0), zero +; CHECK-NEXT: flw ft0, %lo(.LCPI41_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vfmv.s.f v16, ft0 ; CHECK-NEXT: vfredmin.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -642,9 +638,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI42_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI42_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI42_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -657,9 +653,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv1f64_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI43_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI43_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI43_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -672,9 +668,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv1f64_nonans_noinfs: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI44_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI44_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI44_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -689,9 +685,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI45_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI45_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI45_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v10, (a0), zero +; CHECK-NEXT: vfmv.s.f v10, ft0 ; CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -706,9 +702,9 @@ ; CHECK-LABEL: vreduce_fmin_nxv4f64: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI46_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI46_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v12, (a0), zero +; CHECK-NEXT: vfmv.s.f v12, ft0 ; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma ; CHECK-NEXT: vfredmin.vs v8, v8, v12 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -722,13 +718,11 @@ define double @vreduce_fmin_nxv16f64( %v) { ; CHECK-LABEL: vreduce_fmin_nxv16f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma -; CHECK-NEXT: vfmin.vv v8, v8, v16 ; CHECK-NEXT: lui a0, %hi(.LCPI47_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI47_0) -; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v16, (a0), zero +; CHECK-NEXT: fld ft0, %lo(.LCPI47_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma +; CHECK-NEXT: vfmin.vv v8, v8, v16 +; CHECK-NEXT: vfmv.s.f v16, ft0 ; CHECK-NEXT: vfredmin.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -742,9 +736,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv1f16: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI48_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI48_0) +; CHECK-NEXT: flh ft0, 
%lo(.LCPI48_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -757,9 +751,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv1f16_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI49_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI49_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI49_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -772,9 +766,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv1f16_nonans_noinfs: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI50_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI50_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI50_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -789,9 +783,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv2f16: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI51_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI51_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI51_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -806,9 +800,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI52_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI52_0) +; CHECK-NEXT: flh ft0, %lo(.LCPI52_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -822,13 +816,11 @@ define half @vreduce_fmax_nxv64f16( %v) { ; CHECK-LABEL: vreduce_fmax_nxv64f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma -; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: lui a0, %hi(.LCPI53_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI53_0) -; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma -; CHECK-NEXT: vlse16.v v16, (a0), zero +; CHECK-NEXT: flh ft0, %lo(.LCPI53_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vfmv.s.f v16, ft0 ; CHECK-NEXT: vfredmax.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -842,9 +834,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv1f32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI54_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI54_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI54_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -857,9 +849,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv1f32_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI55_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI55_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI55_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -872,9 +864,9 @@ ; CHECK-LABEL: 
vreduce_fmax_nxv1f32_nonans_noinfs: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI56_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI56_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI56_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -889,9 +881,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv2f32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI57_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI57_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI57_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -906,9 +898,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI58_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI58_0) +; CHECK-NEXT: flw ft0, %lo(.LCPI58_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v10, (a0), zero +; CHECK-NEXT: vfmv.s.f v10, ft0 ; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v10 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -922,13 +914,11 @@ define float @vreduce_fmax_nxv32f32( %v) { ; CHECK-LABEL: vreduce_fmax_nxv32f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma -; CHECK-NEXT: vfmax.vv v8, v8, v16 ; CHECK-NEXT: lui a0, %hi(.LCPI59_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI59_0) -; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma -; CHECK-NEXT: vlse32.v v16, (a0), zero +; CHECK-NEXT: flw ft0, %lo(.LCPI59_0)(a0) ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma +; CHECK-NEXT: vfmax.vv v8, v8, v16 +; CHECK-NEXT: vfmv.s.f v16, ft0 ; CHECK-NEXT: vfredmax.vs v8, v8, v16 ; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: ret @@ -942,9 +932,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv1f64: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI60_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI60_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI60_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -957,9 +947,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv1f64_nonans: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI61_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI61_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI61_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -972,9 +962,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv1f64_nonans_noinfs: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI62_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI62_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI62_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v9, (a0), zero +; CHECK-NEXT: vfmv.s.f v9, ft0 ; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma ; CHECK-NEXT: vfredmax.vs v8, v8, v9 ; CHECK-NEXT: vfmv.f.s fa0, v8 @@ -989,9 +979,9 @@ ; CHECK-LABEL: vreduce_fmax_nxv2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: lui a0, %hi(.LCPI63_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI63_0) +; CHECK-NEXT: fld ft0, %lo(.LCPI63_0)(a0) ; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma -; CHECK-NEXT: vlse64.v v10, (a0), zero +; CHECK-NEXT: vfmv.s.f v10, ft0 ; 
CHECK-NEXT: vsetvli a0, zero, e64, m2, ta, ma
; CHECK-NEXT: vfredmax.vs v8, v8, v10
; CHECK-NEXT: vfmv.f.s fa0, v8
@@ -1006,9 +996,9 @@
; CHECK-LABEL: vreduce_fmax_nxv4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a0, %hi(.LCPI64_0)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI64_0)
+; CHECK-NEXT: fld ft0, %lo(.LCPI64_0)(a0)
; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT: vlse64.v v12, (a0), zero
+; CHECK-NEXT: vfmv.s.f v12, ft0
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vfredmax.vs v8, v8, v12
; CHECK-NEXT: vfmv.f.s fa0, v8
@@ -1022,13 +1012,11 @@
define double @vreduce_fmax_nxv16f64(<vscale x 16 x double> %v) {
; CHECK-LABEL: vreduce_fmax_nxv16f64:
; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
-; CHECK-NEXT: vfmax.vv v8, v8, v16
; CHECK-NEXT: lui a0, %hi(.LCPI65_0)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI65_0)
-; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, ma
-; CHECK-NEXT: vlse64.v v16, (a0), zero
+; CHECK-NEXT: fld ft0, %lo(.LCPI65_0)(a0)
; CHECK-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; CHECK-NEXT: vfmax.vv v8, v8, v16
+; CHECK-NEXT: vfmv.s.f v16, ft0
; CHECK-NEXT: vfredmax.vs v8, v8, v16
; CHECK-NEXT: vfmv.f.s fa0, v8
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsetvli-insert-crossbb.ll
@@ -125,12 +125,12 @@
; CHECK-NEXT: beqz a1, .LBB3_2
; CHECK-NEXT: # %bb.1: # %if.then
; CHECK-NEXT: lui a1, %hi(.LCPI3_0)
-; CHECK-NEXT: addi a1, a1, %lo(.LCPI3_0)
-; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
-; CHECK-NEXT: vlse64.v v10, (a1), zero
+; CHECK-NEXT: fld ft0, %lo(.LCPI3_0)(a1)
; CHECK-NEXT: lui a1, %hi(.LCPI3_1)
-; CHECK-NEXT: addi a1, a1, %lo(.LCPI3_1)
-; CHECK-NEXT: vlse64.v v11, (a1), zero
+; CHECK-NEXT: fld ft1, %lo(.LCPI3_1)(a1)
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vfmv.v.f v10, ft0
+; CHECK-NEXT: vfmv.v.f v11, ft1
; CHECK-NEXT: vfadd.vv v10, v10, v11
; CHECK-NEXT: lui a1, %hi(scratch)
; CHECK-NEXT: addi a1, a1, %lo(scratch)
@@ -138,12 +138,12 @@
; CHECK-NEXT: j .LBB3_3
; CHECK-NEXT: .LBB3_2: # %if.else
; CHECK-NEXT: lui a1, %hi(.LCPI3_2)
-; CHECK-NEXT: addi a1, a1, %lo(.LCPI3_2)
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vlse32.v v10, (a1), zero
+; CHECK-NEXT: flw ft0, %lo(.LCPI3_2)(a1)
; CHECK-NEXT: lui a1, %hi(.LCPI3_3)
-; CHECK-NEXT: addi a1, a1, %lo(.LCPI3_3)
-; CHECK-NEXT: vlse32.v v11, (a1), zero
+; CHECK-NEXT: flw ft1, %lo(.LCPI3_3)(a1)
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vfmv.v.f v10, ft0
+; CHECK-NEXT: vfmv.v.f v11, ft1
; CHECK-NEXT: vfadd.vv v10, v10, v11
; CHECK-NEXT: lui a1, %hi(scratch)
; CHECK-NEXT: addi a1, a1, %lo(scratch)
@@ -245,13 +245,13 @@
; CHECK-NEXT: andi a1, a1, 2
; CHECK-NEXT: beqz a1, .LBB5_4
; CHECK-NEXT: .LBB5_2: # %if.then4
+; CHECK-NEXT: lui a1, %hi(.LCPI5_0)
+; CHECK-NEXT: fld ft0, %lo(.LCPI5_0)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI5_1)
+; CHECK-NEXT: fld ft1, %lo(.LCPI5_1)(a1)
; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, mu
-; CHECK-NEXT: lui a0, %hi(.LCPI5_0)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_0)
-; CHECK-NEXT: vlse64.v v9, (a0), zero
-; CHECK-NEXT: lui a0, %hi(.LCPI5_1)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_1)
-; CHECK-NEXT: vlse64.v v10, (a0), zero
+; CHECK-NEXT: vfmv.v.f v9, ft0
+; CHECK-NEXT: vfmv.v.f v10, ft1
; CHECK-NEXT: vfadd.vv v9, v9, v10
; CHECK-NEXT: lui a0, %hi(scratch)
; CHECK-NEXT: addi a0, a0, %lo(scratch)
@@ -262,13 +262,13 @@
; CHECK-NEXT: andi a1, a1, 2
; CHECK-NEXT: bnez a1, .LBB5_2
; CHECK-NEXT: .LBB5_4: # %if.else5
+; CHECK-NEXT: lui a1, %hi(.LCPI5_2)
+; CHECK-NEXT: flw ft0, %lo(.LCPI5_2)(a1)
+; CHECK-NEXT: lui a1, %hi(.LCPI5_3)
+; CHECK-NEXT: flw ft1, %lo(.LCPI5_3)(a1)
; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, mu
-; CHECK-NEXT: lui a0, %hi(.LCPI5_2)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_2)
-; CHECK-NEXT: vlse32.v v9, (a0), zero
-; CHECK-NEXT: lui a0, %hi(.LCPI5_3)
-; CHECK-NEXT: addi a0, a0, %lo(.LCPI5_3)
-; CHECK-NEXT: vlse32.v v10, (a0), zero
+; CHECK-NEXT: vfmv.v.f v9, ft0
+; CHECK-NEXT: vfmv.v.f v10, ft1
; CHECK-NEXT: vfadd.vv v9, v9, v10
; CHECK-NEXT: lui a0, %hi(scratch)
; CHECK-NEXT: addi a0, a0, %lo(scratch)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vsplats-fp.ll b/llvm/test/CodeGen/RISCV/rvv/vsplats-fp.ll
--- a/llvm/test/CodeGen/RISCV/rvv/vsplats-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vsplats-fp.ll
@@ -1,8 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+f,+d,+zfh,+experimental-zvfh,+v -target-abi ilp32d -verify-machineinstrs < %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefixes=CHECK,NO-OPTIMIZED
; RUN: llc -mtriple=riscv64 -mattr=+f,+d,+zfh,+experimental-zvfh,+v -target-abi lp64d -verify-machineinstrs < %s \
-; RUN: | FileCheck %s
+; RUN: | FileCheck %s --check-prefixes=CHECK,NO-OPTIMIZED
+; RUN: llc -mtriple=riscv32 -mattr=+f,+d,+zfh,+experimental-zvfh,+v,+optimized-zero-stride-load -target-abi ilp32d -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,OPTIMIZED
+; RUN: llc -mtriple=riscv64 -mattr=+f,+d,+zfh,+experimental-zvfh,+v,+optimized-zero-stride-load -target-abi lp64d -verify-machineinstrs < %s \
+; RUN: | FileCheck %s --check-prefixes=CHECK,OPTIMIZED

define <vscale x 8 x half> @vsplat_nxv8f16(half %f) {
; CHECK-LABEL: vsplat_nxv8f16:
@@ -72,11 +76,18 @@

; Test that we fold this to a vlse with 0 stride.
define <vscale x 8 x float> @vsplat_load_nxv8f32(float* %ptr) {
-; CHECK-LABEL: vsplat_load_nxv8f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; CHECK-NEXT: vlse32.v v8, (a0), zero
-; CHECK-NEXT: ret
+; NO-OPTIMIZED-LABEL: vsplat_load_nxv8f32:
+; NO-OPTIMIZED: # %bb.0:
+; NO-OPTIMIZED-NEXT: flw ft0, 0(a0)
+; NO-OPTIMIZED-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; NO-OPTIMIZED-NEXT: vfmv.v.f v8, ft0
+; NO-OPTIMIZED-NEXT: ret
+;
+; OPTIMIZED-LABEL: vsplat_load_nxv8f32:
+; OPTIMIZED: # %bb.0:
+; OPTIMIZED-NEXT: vsetvli a1, zero, e32, m4, ta, ma
+; OPTIMIZED-NEXT: vlse32.v v8, (a0), zero
+; OPTIMIZED-NEXT: ret
%f = load float, float* %ptr
%head = insertelement <vscale x 8 x float> poison, float %f, i32 0
%splat = shufflevector <vscale x 8 x float> %head, <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer