Index: llvm/lib/Target/RISCV/RISCVSubtarget.cpp =================================================================== --- llvm/lib/Target/RISCV/RISCVSubtarget.cpp +++ llvm/lib/Target/RISCV/RISCVSubtarget.cpp @@ -44,7 +44,7 @@ "with zero meaning no minimum size is assumed. A value of -1 " "means use Zvl*b extension. This is primarily used to enable " "autovectorization with fixed width vectors."), - cl::init(0), cl::Hidden); + cl::init(-1), cl::Hidden); static cl::opt RVVVectorLMULMax( "riscv-v-fixed-length-vector-lmul-max", Index: llvm/test/Analysis/CostModel/RISCV/active_lane_mask.ll =================================================================== --- llvm/test/Analysis/CostModel/RISCV/active_lane_mask.ll +++ llvm/test/Analysis/CostModel/RISCV/active_lane_mask.ll @@ -15,16 +15,16 @@ ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv1i1_i32 = call @llvm.get.active.lane.mask.nxv1i1.i32(i32 undef, i32 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %mask_nxv32i1_i64 = call @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv16i1_i16 = call @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, 
i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %mask_nxv16i1_i64 = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef) Index: llvm/test/CodeGen/RISCV/fold-vector-cmp.ll =================================================================== --- llvm/test/CodeGen/RISCV/fold-vector-cmp.ll +++ llvm/test/CodeGen/RISCV/fold-vector-cmp.ll @@ -1,14 +1,31 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -start-after codegenprepare -mtriple=riscv64 -mattr=-v -o - %s | FileCheck %s -; RUN: llc -start-after codegenprepare -mtriple=riscv64 -mattr=+v -o - %s | FileCheck %s +; RUN: llc -start-after codegenprepare -mtriple=riscv64 -mattr=-v -o - %s | FileCheck --check-prefix=CHECK-NOV %s +; RUN: llc -start-after codegenprepare -mtriple=riscv64 -mattr=+v -o - %s | FileCheck --check-prefix=CHECK-V %s ; Reproducer for https://github.com/llvm/llvm-project/issues/55168. ; We should always return 1 (and not -1). 
define i32 @test(i32 %call.i) { -; CHECK-LABEL: test: -; CHECK: # %bb.0: -; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: ret +; CHECK-NOV-LABEL: test: +; CHECK-NOV: # %bb.0: +; CHECK-NOV-NEXT: li a0, 1 +; CHECK-NOV-NEXT: ret +; +; CHECK-V-LABEL: test: +; CHECK-V: # %bb.0: +; CHECK-V-NEXT: lui a1, 524288 +; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-V-NEXT: vmv.v.x v8, a1 +; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, tu, mu +; CHECK-V-NEXT: vmv.s.x v8, a0 +; CHECK-V-NEXT: addiw a0, a1, 2 +; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-V-NEXT: vmslt.vx v0, v8, a0 +; CHECK-V-NEXT: vmv.v.i v8, 0 +; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0 +; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-V-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-V-NEXT: vmv.x.s a0, v8 +; CHECK-V-NEXT: ret %t2 = insertelement <2 x i32> , i32 %call.i, i64 0 %t3 = icmp slt <2 x i32> %t2, %t4 = zext <2 x i1> %t3 to <2 x i32> Index: llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll =================================================================== --- llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll +++ llvm/test/CodeGen/RISCV/fpclamptosat_vec.ll @@ -7,30 +7,14 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) { ; CHECK-LABEL: stest_f64i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.l.d a1, fa1, rtz -; CHECK-NEXT: lui a2, 524288 -; CHECK-NEXT: addiw a3, a2, -1 -; CHECK-NEXT: fcvt.l.d a0, fa0, rtz -; CHECK-NEXT: bge a1, a3, .LBB0_5 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bge a0, a3, .LBB0_6 -; CHECK-NEXT: .LBB0_2: # %entry -; CHECK-NEXT: bge a2, a0, .LBB0_7 -; CHECK-NEXT: .LBB0_3: # %entry -; CHECK-NEXT: bge a2, a1, .LBB0_8 -; CHECK-NEXT: .LBB0_4: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB0_5: # %entry -; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: blt a0, a3, .LBB0_2 -; CHECK-NEXT: .LBB0_6: # %entry -; CHECK-NEXT: mv a0, a3 -; CHECK-NEXT: blt a2, a0, .LBB0_3 -; CHECK-NEXT: .LBB0_7: # %entry +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: blt a2, a1, .LBB0_4 -; CHECK-NEXT: .LBB0_8: # %entry -; CHECK-NEXT: lui a1, 524288 +; CHECK-NEXT: addiw a1, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a1 +; CHECK-NEXT: vmax.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -45,20 +29,13 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) { ; CHECK-LABEL: utest_f64i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.lu.d a0, fa0, rtz -; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: srli a2, a1, 32 -; CHECK-NEXT: fcvt.lu.d a1, fa1, rtz -; CHECK-NEXT: bgeu a0, a2, .LBB1_3 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bgeu a1, a2, .LBB1_4 -; CHECK-NEXT: .LBB1_2: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB1_3: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a1, a2, .LBB1_2 -; CHECK-NEXT: .LBB1_4: # %entry -; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vminu.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptoui <2 x double> %x to <2 x i64> @@ -71,30 +48,14 @@ define <2 x i32> @ustest_f64i32(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.l.d a1, fa1, rtz +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-NEXT: li a0, 
-1 -; CHECK-NEXT: srli a2, a0, 32 -; CHECK-NEXT: fcvt.l.d a0, fa0, rtz -; CHECK-NEXT: bge a1, a2, .LBB2_5 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bge a0, a2, .LBB2_6 -; CHECK-NEXT: .LBB2_2: # %entry -; CHECK-NEXT: blez a0, .LBB2_7 -; CHECK-NEXT: .LBB2_3: # %entry -; CHECK-NEXT: blez a1, .LBB2_8 -; CHECK-NEXT: .LBB2_4: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_5: # %entry -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: blt a0, a2, .LBB2_2 -; CHECK-NEXT: .LBB2_6: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bgtz a0, .LBB2_3 -; CHECK-NEXT: .LBB2_7: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: bgtz a1, .LBB2_4 -; CHECK-NEXT: .LBB2_8: # %entry -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vmin.vx v8, v8, a0 +; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -109,59 +70,16 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) { ; CHECK-LABEL: stest_f32i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.l.s a1, fa3, rtz -; CHECK-NEXT: lui a3, 524288 -; CHECK-NEXT: addiw a6, a3, -1 -; CHECK-NEXT: fcvt.l.s a2, fa2, rtz -; CHECK-NEXT: bge a1, a6, .LBB3_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a4, fa1, rtz -; CHECK-NEXT: bge a2, a6, .LBB3_11 -; CHECK-NEXT: .LBB3_2: # %entry -; CHECK-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NEXT: bge a4, a6, .LBB3_12 -; CHECK-NEXT: .LBB3_3: # %entry -; CHECK-NEXT: bge a5, a6, .LBB3_13 -; CHECK-NEXT: .LBB3_4: # %entry -; CHECK-NEXT: bge a3, a5, .LBB3_14 -; CHECK-NEXT: .LBB3_5: # %entry -; CHECK-NEXT: bge a3, a4, .LBB3_15 -; CHECK-NEXT: .LBB3_6: # %entry -; CHECK-NEXT: bge a3, a2, .LBB3_16 -; CHECK-NEXT: .LBB3_7: # %entry -; CHECK-NEXT: blt a3, a1, .LBB3_9 -; CHECK-NEXT: .LBB3_8: # %entry -; CHECK-NEXT: lui a1, 524288 -; CHECK-NEXT: .LBB3_9: # %entry -; CHECK-NEXT: sw a1, 12(a0) -; CHECK-NEXT: sw a2, 8(a0) -; CHECK-NEXT: sw a4, 4(a0) -; CHECK-NEXT: sw a5, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: addiw a1, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v10, a1 +; CHECK-NEXT: vmax.vx v10, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB3_10: # %entry -; CHECK-NEXT: mv a1, a6 -; CHECK-NEXT: fcvt.l.s a4, fa1, rtz -; CHECK-NEXT: blt a2, a6, .LBB3_2 -; CHECK-NEXT: .LBB3_11: # %entry -; CHECK-NEXT: mv a2, a6 -; CHECK-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NEXT: blt a4, a6, .LBB3_3 -; CHECK-NEXT: .LBB3_12: # %entry -; CHECK-NEXT: mv a4, a6 -; CHECK-NEXT: blt a5, a6, .LBB3_4 -; CHECK-NEXT: .LBB3_13: # %entry -; CHECK-NEXT: mv a5, a6 -; CHECK-NEXT: blt a3, a5, .LBB3_5 -; CHECK-NEXT: .LBB3_14: # %entry -; CHECK-NEXT: lui a5, 524288 -; CHECK-NEXT: blt a3, a4, .LBB3_6 -; CHECK-NEXT: .LBB3_15: # %entry -; CHECK-NEXT: lui a4, 524288 -; CHECK-NEXT: blt a3, a2, .LBB3_7 -; CHECK-NEXT: .LBB3_16: # %entry -; CHECK-NEXT: lui a2, 524288 -; CHECK-NEXT: bge a3, a1, .LBB3_8 -; CHECK-NEXT: j .LBB3_9 entry: %conv = fptosi <4 x float> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, @@ -175,39 +93,15 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) { ; CHECK-LABEL: utest_f32i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.lu.s a1, fa0, rtz -; CHECK-NEXT: li a2, -1 -; CHECK-NEXT: srli a3, a2, 32 -; CHECK-NEXT: fcvt.lu.s a2, fa1, rtz -; CHECK-NEXT: bgeu a1, a3, .LBB4_6 -; CHECK-NEXT: # %bb.1: # 
%entry -; CHECK-NEXT: fcvt.lu.s a4, fa2, rtz -; CHECK-NEXT: bgeu a2, a3, .LBB4_7 -; CHECK-NEXT: .LBB4_2: # %entry -; CHECK-NEXT: fcvt.lu.s a5, fa3, rtz -; CHECK-NEXT: bgeu a4, a3, .LBB4_8 -; CHECK-NEXT: .LBB4_3: # %entry -; CHECK-NEXT: bltu a5, a3, .LBB4_5 -; CHECK-NEXT: .LBB4_4: # %entry -; CHECK-NEXT: mv a5, a3 -; CHECK-NEXT: .LBB4_5: # %entry -; CHECK-NEXT: sw a5, 12(a0) -; CHECK-NEXT: sw a4, 8(a0) -; CHECK-NEXT: sw a2, 4(a0) -; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vminu.vx v10, v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB4_6: # %entry -; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: fcvt.lu.s a4, fa2, rtz -; CHECK-NEXT: bltu a2, a3, .LBB4_2 -; CHECK-NEXT: .LBB4_7: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: fcvt.lu.s a5, fa3, rtz -; CHECK-NEXT: bltu a4, a3, .LBB4_3 -; CHECK-NEXT: .LBB4_8: # %entry -; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bgeu a5, a3, .LBB4_4 -; CHECK-NEXT: j .LBB4_5 entry: %conv = fptoui <4 x float> %x to <4 x i64> %0 = icmp ult <4 x i64> %conv, @@ -219,59 +113,16 @@ define <4 x i32> @ustest_f32i32(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.l.s a1, fa3, rtz -; CHECK-NEXT: li a2, -1 -; CHECK-NEXT: srli a5, a2, 32 -; CHECK-NEXT: fcvt.l.s a2, fa2, rtz -; CHECK-NEXT: bge a1, a5, .LBB5_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a3, fa1, rtz -; CHECK-NEXT: bge a2, a5, .LBB5_11 -; CHECK-NEXT: .LBB5_2: # %entry -; CHECK-NEXT: fcvt.l.s a4, fa0, rtz -; CHECK-NEXT: bge a3, a5, .LBB5_12 -; CHECK-NEXT: .LBB5_3: # %entry -; CHECK-NEXT: bge a4, a5, .LBB5_13 -; CHECK-NEXT: .LBB5_4: # %entry -; CHECK-NEXT: blez a4, .LBB5_14 -; CHECK-NEXT: .LBB5_5: # %entry -; CHECK-NEXT: blez a3, .LBB5_15 -; CHECK-NEXT: .LBB5_6: # %entry -; CHECK-NEXT: blez a2, .LBB5_16 -; CHECK-NEXT: .LBB5_7: # %entry -; CHECK-NEXT: bgtz a1, .LBB5_9 -; CHECK-NEXT: .LBB5_8: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: .LBB5_9: # %entry -; CHECK-NEXT: sw a1, 12(a0) -; CHECK-NEXT: sw a2, 8(a0) -; CHECK-NEXT: sw a3, 4(a0) -; CHECK-NEXT: sw a4, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v10, a0 +; CHECK-NEXT: vmax.vx v10, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB5_10: # %entry -; CHECK-NEXT: mv a1, a5 -; CHECK-NEXT: fcvt.l.s a3, fa1, rtz -; CHECK-NEXT: blt a2, a5, .LBB5_2 -; CHECK-NEXT: .LBB5_11: # %entry -; CHECK-NEXT: mv a2, a5 -; CHECK-NEXT: fcvt.l.s a4, fa0, rtz -; CHECK-NEXT: blt a3, a5, .LBB5_3 -; CHECK-NEXT: .LBB5_12: # %entry -; CHECK-NEXT: mv a3, a5 -; CHECK-NEXT: blt a4, a5, .LBB5_4 -; CHECK-NEXT: .LBB5_13: # %entry -; CHECK-NEXT: mv a4, a5 -; CHECK-NEXT: bgtz a4, .LBB5_5 -; CHECK-NEXT: .LBB5_14: # %entry -; CHECK-NEXT: li a4, 0 -; CHECK-NEXT: bgtz a3, .LBB5_6 -; CHECK-NEXT: .LBB5_15: # %entry -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: bgtz a2, .LBB5_7 -; CHECK-NEXT: .LBB5_16: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: blez a1, .LBB5_8 -; CHECK-NEXT: j .LBB5_9 entry: %conv = fptosi <4 x float> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, @@ -291,96 +142,59 @@ ; CHECK-NEXT: sd s0, 48(sp) # 
8-byte Folded Spill ; CHECK-NEXT: sd s1, 40(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 0(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 ; CHECK-NEXT: .cfi_offset s2, -32 -; CHECK-NEXT: .cfi_offset s3, -40 -; CHECK-NEXT: .cfi_offset fs0, -48 -; CHECK-NEXT: .cfi_offset fs1, -56 -; CHECK-NEXT: .cfi_offset fs2, -64 -; CHECK-NEXT: lhu s1, 24(a1) -; CHECK-NEXT: lhu s2, 0(a1) -; CHECK-NEXT: lhu s3, 8(a1) -; CHECK-NEXT: lhu a1, 16(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 -; CHECK-NEXT: mv a0, s3 +; CHECK-NEXT: lhu s0, 24(a0) +; CHECK-NEXT: lhu s1, 16(a0) +; CHECK-NEXT: lhu s2, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 24(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.l.s s2, fs2, rtz +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 0(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NEXT: lui a1, 524288 -; CHECK-NEXT: addiw a4, a1, -1 -; CHECK-NEXT: bge a0, a4, .LBB6_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a2, fs1, rtz -; CHECK-NEXT: bge s2, a4, .LBB6_11 -; CHECK-NEXT: .LBB6_2: # %entry -; CHECK-NEXT: fcvt.l.s a3, fs0, rtz -; CHECK-NEXT: bge a2, a4, .LBB6_12 -; CHECK-NEXT: .LBB6_3: # %entry -; CHECK-NEXT: bge a3, a4, .LBB6_13 -; CHECK-NEXT: .LBB6_4: # %entry -; CHECK-NEXT: bge a1, a3, .LBB6_14 -; CHECK-NEXT: .LBB6_5: # %entry -; CHECK-NEXT: bge a1, a2, .LBB6_15 -; CHECK-NEXT: .LBB6_6: # %entry -; CHECK-NEXT: bge a1, s2, .LBB6_16 -; CHECK-NEXT: .LBB6_7: # %entry -; CHECK-NEXT: blt a1, a0, .LBB6_9 -; CHECK-NEXT: .LBB6_8: # %entry +; CHECK-NEXT: sd a0, 16(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 3, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 2 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 3 ; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: .LBB6_9: # %entry -; CHECK-NEXT: sw a0, 12(s0) -; CHECK-NEXT: sw s2, 8(s0) -; CHECK-NEXT: sw a2, 4(s0) -; CHECK-NEXT: sw a3, 0(s0) +; CHECK-NEXT: addiw a1, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v10, a1 +; CHECK-NEXT: vmax.vx v10, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 48(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s1, 40(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 24(sp) # 
8-byte Folded Reload -; CHECK-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 0(sp) # 8-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 64 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB6_10: # %entry -; CHECK-NEXT: mv a0, a4 -; CHECK-NEXT: fcvt.l.s a2, fs1, rtz -; CHECK-NEXT: blt s2, a4, .LBB6_2 -; CHECK-NEXT: .LBB6_11: # %entry -; CHECK-NEXT: mv s2, a4 -; CHECK-NEXT: fcvt.l.s a3, fs0, rtz -; CHECK-NEXT: blt a2, a4, .LBB6_3 -; CHECK-NEXT: .LBB6_12: # %entry -; CHECK-NEXT: mv a2, a4 -; CHECK-NEXT: blt a3, a4, .LBB6_4 -; CHECK-NEXT: .LBB6_13: # %entry -; CHECK-NEXT: mv a3, a4 -; CHECK-NEXT: blt a1, a3, .LBB6_5 -; CHECK-NEXT: .LBB6_14: # %entry -; CHECK-NEXT: lui a3, 524288 -; CHECK-NEXT: blt a1, a2, .LBB6_6 -; CHECK-NEXT: .LBB6_15: # %entry -; CHECK-NEXT: lui a2, 524288 -; CHECK-NEXT: blt a1, s2, .LBB6_7 -; CHECK-NEXT: .LBB6_16: # %entry -; CHECK-NEXT: lui s2, 524288 -; CHECK-NEXT: bge a1, a0, .LBB6_8 -; CHECK-NEXT: j .LBB6_9 entry: %conv = fptosi <4 x half> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, @@ -400,76 +214,58 @@ ; CHECK-NEXT: sd s0, 48(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s1, 40(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 0(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 ; CHECK-NEXT: .cfi_offset s2, -32 -; CHECK-NEXT: .cfi_offset s3, -40 -; CHECK-NEXT: .cfi_offset fs0, -48 -; CHECK-NEXT: .cfi_offset fs1, -56 -; CHECK-NEXT: .cfi_offset fs2, -64 -; CHECK-NEXT: lhu s1, 0(a1) -; CHECK-NEXT: lhu s2, 24(a1) -; CHECK-NEXT: lhu s3, 16(a1) -; CHECK-NEXT: lhu a1, 8(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 -; CHECK-NEXT: mv a0, s3 +; CHECK-NEXT: lhu s0, 24(a0) +; CHECK-NEXT: lhu s1, 16(a0) +; CHECK-NEXT: lhu s2, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 24(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.lu.s s2, fs2, rtz +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 0(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: srli a1, a1, 32 -; CHECK-NEXT: bgeu a0, a1, .LBB7_6 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.lu.s a2, fs1, rtz -; CHECK-NEXT: bgeu s2, a1, .LBB7_7 -; CHECK-NEXT: .LBB7_2: # %entry -; CHECK-NEXT: fcvt.lu.s a3, fs0, rtz -; CHECK-NEXT: bgeu a2, a1, .LBB7_8 -; CHECK-NEXT: .LBB7_3: # %entry -; CHECK-NEXT: bltu a3, a1, .LBB7_5 -; CHECK-NEXT: .LBB7_4: # %entry -; CHECK-NEXT: mv a3, a1 -; CHECK-NEXT: .LBB7_5: # %entry -; CHECK-NEXT: sw a3, 12(s0) -; CHECK-NEXT: sw a2, 8(s0) -; CHECK-NEXT: sw s2, 4(s0) -; CHECK-NEXT: sw a0, 0(s0) +; CHECK-NEXT: sd a0, 16(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: 
addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 3, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 2 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 3 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vminu.vx v10, v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 48(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s1, 40(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 0(sp) # 8-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 64 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB7_6: # %entry -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: fcvt.lu.s a2, fs1, rtz -; CHECK-NEXT: bltu s2, a1, .LBB7_2 -; CHECK-NEXT: .LBB7_7: # %entry -; CHECK-NEXT: mv s2, a1 -; CHECK-NEXT: fcvt.lu.s a3, fs0, rtz -; CHECK-NEXT: bltu a2, a1, .LBB7_3 -; CHECK-NEXT: .LBB7_8: # %entry -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: bgeu a3, a1, .LBB7_4 -; CHECK-NEXT: j .LBB7_5 entry: %conv = fptoui <4 x half> %x to <4 x i64> %0 = icmp ult <4 x i64> %conv, @@ -487,96 +283,59 @@ ; CHECK-NEXT: sd s0, 48(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s1, 40(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 0(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 ; CHECK-NEXT: .cfi_offset s2, -32 -; CHECK-NEXT: .cfi_offset s3, -40 -; CHECK-NEXT: .cfi_offset fs0, -48 -; CHECK-NEXT: .cfi_offset fs1, -56 -; CHECK-NEXT: .cfi_offset fs2, -64 -; CHECK-NEXT: lhu s1, 24(a1) -; CHECK-NEXT: lhu s2, 0(a1) -; CHECK-NEXT: lhu s3, 8(a1) -; CHECK-NEXT: lhu a1, 16(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 -; CHECK-NEXT: mv a0, s3 +; CHECK-NEXT: lhu s0, 24(a0) +; CHECK-NEXT: lhu s1, 16(a0) +; CHECK-NEXT: lhu s2, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 24(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.l.s s2, fs2, rtz +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 0(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: srli a3, a1, 32 -; CHECK-NEXT: bge a0, a3, .LBB8_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a1, fs1, rtz -; CHECK-NEXT: bge s2, a3, .LBB8_11 -; CHECK-NEXT: .LBB8_2: # %entry -; CHECK-NEXT: fcvt.l.s a2, fs0, rtz -; CHECK-NEXT: bge a1, a3, .LBB8_12 -; CHECK-NEXT: .LBB8_3: # %entry -; CHECK-NEXT: bge a2, a3, .LBB8_13 -; CHECK-NEXT: .LBB8_4: # %entry -; CHECK-NEXT: blez a2, .LBB8_14 -; CHECK-NEXT: .LBB8_5: # %entry -; CHECK-NEXT: blez a1, .LBB8_15 -; CHECK-NEXT: .LBB8_6: # %entry -; CHECK-NEXT: 
blez s2, .LBB8_16 -; CHECK-NEXT: .LBB8_7: # %entry -; CHECK-NEXT: bgtz a0, .LBB8_9 -; CHECK-NEXT: .LBB8_8: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: .LBB8_9: # %entry -; CHECK-NEXT: sw a0, 12(s0) -; CHECK-NEXT: sw s2, 8(s0) -; CHECK-NEXT: sw a1, 4(s0) -; CHECK-NEXT: sw a2, 0(s0) +; CHECK-NEXT: sd a0, 16(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 3, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 2 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 3 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v10, a0 +; CHECK-NEXT: vmax.vx v10, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 48(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s1, 40(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 0(sp) # 8-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 64 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB8_10: # %entry -; CHECK-NEXT: mv a0, a3 -; CHECK-NEXT: fcvt.l.s a1, fs1, rtz -; CHECK-NEXT: blt s2, a3, .LBB8_2 -; CHECK-NEXT: .LBB8_11: # %entry -; CHECK-NEXT: mv s2, a3 -; CHECK-NEXT: fcvt.l.s a2, fs0, rtz -; CHECK-NEXT: blt a1, a3, .LBB8_3 -; CHECK-NEXT: .LBB8_12: # %entry -; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: blt a2, a3, .LBB8_4 -; CHECK-NEXT: .LBB8_13: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: bgtz a2, .LBB8_5 -; CHECK-NEXT: .LBB8_14: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: bgtz a1, .LBB8_6 -; CHECK-NEXT: .LBB8_15: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: bgtz s2, .LBB8_7 -; CHECK-NEXT: .LBB8_16: # %entry -; CHECK-NEXT: li s2, 0 -; CHECK-NEXT: blez a0, .LBB8_8 -; CHECK-NEXT: j .LBB8_9 entry: %conv = fptosi <4 x half> %x to <4 x i64> %0 = icmp slt <4 x i64> %conv, @@ -592,32 +351,15 @@ define <2 x i16> @stest_f64i16(<2 x double> %x) { ; CHECK-LABEL: stest_f64i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.w.d a1, fa1, rtz +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8 ; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: addiw a2, a0, -1 -; CHECK-NEXT: fcvt.w.d a0, fa0, rtz -; CHECK-NEXT: bge a1, a2, .LBB9_5 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bge a0, a2, .LBB9_6 -; CHECK-NEXT: .LBB9_2: # %entry -; CHECK-NEXT: lui a2, 1048568 -; CHECK-NEXT: bge a2, a0, .LBB9_7 -; CHECK-NEXT: .LBB9_3: # %entry -; CHECK-NEXT: bge a2, a1, .LBB9_8 -; CHECK-NEXT: .LBB9_4: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB9_5: # %entry -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: blt a0, a2, .LBB9_2 -; CHECK-NEXT: .LBB9_6: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: lui a2, 1048568 -; CHECK-NEXT: blt a2, a0, .LBB9_3 -; CHECK-NEXT: 
.LBB9_7: # %entry +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v9, a0 ; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: blt a2, a1, .LBB9_4 -; CHECK-NEXT: .LBB9_8: # %entry -; CHECK-NEXT: lui a1, 1048568 +; CHECK-NEXT: vmax.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -632,20 +374,13 @@ define <2 x i16> @utest_f64i16(<2 x double> %x) { ; CHECK-LABEL: utest_f64i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.wu.d a0, fa0, rtz -; CHECK-NEXT: lui a1, 16 -; CHECK-NEXT: addiw a2, a1, -1 -; CHECK-NEXT: fcvt.wu.d a1, fa1, rtz -; CHECK-NEXT: bgeu a0, a2, .LBB10_3 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bgeu a1, a2, .LBB10_4 -; CHECK-NEXT: .LBB10_2: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB10_3: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a1, a2, .LBB10_2 -; CHECK-NEXT: .LBB10_4: # %entry -; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vminu.vx v8, v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptoui <2 x double> %x to <2 x i32> @@ -658,30 +393,14 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.w.d a1, fa1, rtz +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8 ; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addiw a2, a0, -1 -; CHECK-NEXT: fcvt.w.d a0, fa0, rtz -; CHECK-NEXT: bge a1, a2, .LBB11_5 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bge a0, a2, .LBB11_6 -; CHECK-NEXT: .LBB11_2: # %entry -; CHECK-NEXT: blez a0, .LBB11_7 -; CHECK-NEXT: .LBB11_3: # %entry -; CHECK-NEXT: blez a1, .LBB11_8 -; CHECK-NEXT: .LBB11_4: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB11_5: # %entry -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: blt a0, a2, .LBB11_2 -; CHECK-NEXT: .LBB11_6: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bgtz a0, .LBB11_3 -; CHECK-NEXT: .LBB11_7: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: bgtz a1, .LBB11_4 -; CHECK-NEXT: .LBB11_8: # %entry -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v9, a0 +; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -696,61 +415,16 @@ define <4 x i16> @stest_f32i16(<4 x float> %x) { ; CHECK-LABEL: stest_f32i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.w.s a1, fa3, rtz -; CHECK-NEXT: lui a2, 8 -; CHECK-NEXT: addiw a5, a2, -1 -; CHECK-NEXT: fcvt.w.s a2, fa2, rtz -; CHECK-NEXT: bge a1, a5, .LBB12_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NEXT: bge a2, a5, .LBB12_11 -; CHECK-NEXT: .LBB12_2: # %entry -; CHECK-NEXT: fcvt.w.s a4, fa0, rtz -; CHECK-NEXT: bge a3, a5, .LBB12_12 -; CHECK-NEXT: .LBB12_3: # %entry -; CHECK-NEXT: bge a4, a5, .LBB12_13 -; CHECK-NEXT: .LBB12_4: # %entry -; CHECK-NEXT: lui a5, 1048568 -; CHECK-NEXT: bge a5, a4, .LBB12_14 -; CHECK-NEXT: .LBB12_5: # %entry -; CHECK-NEXT: bge a5, a3, .LBB12_15 -; CHECK-NEXT: .LBB12_6: # %entry -; CHECK-NEXT: bge a5, a2, .LBB12_16 -; CHECK-NEXT: .LBB12_7: # %entry -; CHECK-NEXT: blt a5, a1, .LBB12_9 -; CHECK-NEXT: .LBB12_8: # %entry -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: .LBB12_9: # %entry -; 
CHECK-NEXT: sh a1, 6(a0) -; CHECK-NEXT: sh a2, 4(a0) -; CHECK-NEXT: sh a3, 2(a0) -; CHECK-NEXT: sh a4, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: vmax.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB12_10: # %entry -; CHECK-NEXT: mv a1, a5 -; CHECK-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NEXT: blt a2, a5, .LBB12_2 -; CHECK-NEXT: .LBB12_11: # %entry -; CHECK-NEXT: mv a2, a5 -; CHECK-NEXT: fcvt.w.s a4, fa0, rtz -; CHECK-NEXT: blt a3, a5, .LBB12_3 -; CHECK-NEXT: .LBB12_12: # %entry -; CHECK-NEXT: mv a3, a5 -; CHECK-NEXT: blt a4, a5, .LBB12_4 -; CHECK-NEXT: .LBB12_13: # %entry -; CHECK-NEXT: mv a4, a5 -; CHECK-NEXT: lui a5, 1048568 -; CHECK-NEXT: blt a5, a4, .LBB12_5 -; CHECK-NEXT: .LBB12_14: # %entry -; CHECK-NEXT: lui a4, 1048568 -; CHECK-NEXT: blt a5, a3, .LBB12_6 -; CHECK-NEXT: .LBB12_15: # %entry -; CHECK-NEXT: lui a3, 1048568 -; CHECK-NEXT: blt a5, a2, .LBB12_7 -; CHECK-NEXT: .LBB12_16: # %entry -; CHECK-NEXT: lui a2, 1048568 -; CHECK-NEXT: bge a5, a1, .LBB12_8 -; CHECK-NEXT: j .LBB12_9 entry: %conv = fptosi <4 x float> %x to <4 x i32> %0 = icmp slt <4 x i32> %conv, @@ -764,39 +438,14 @@ define <4 x i16> @utest_f32i16(<4 x float> %x) { ; CHECK-LABEL: utest_f32i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.wu.s a1, fa0, rtz -; CHECK-NEXT: lui a2, 16 -; CHECK-NEXT: addiw a3, a2, -1 -; CHECK-NEXT: fcvt.wu.s a2, fa1, rtz -; CHECK-NEXT: bgeu a1, a3, .LBB13_6 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.wu.s a4, fa2, rtz -; CHECK-NEXT: bgeu a2, a3, .LBB13_7 -; CHECK-NEXT: .LBB13_2: # %entry -; CHECK-NEXT: fcvt.wu.s a5, fa3, rtz -; CHECK-NEXT: bgeu a4, a3, .LBB13_8 -; CHECK-NEXT: .LBB13_3: # %entry -; CHECK-NEXT: bltu a5, a3, .LBB13_5 -; CHECK-NEXT: .LBB13_4: # %entry -; CHECK-NEXT: mv a5, a3 -; CHECK-NEXT: .LBB13_5: # %entry -; CHECK-NEXT: sh a5, 6(a0) -; CHECK-NEXT: sh a4, 4(a0) -; CHECK-NEXT: sh a2, 2(a0) -; CHECK-NEXT: sh a1, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vminu.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB13_6: # %entry -; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: fcvt.wu.s a4, fa2, rtz -; CHECK-NEXT: bltu a2, a3, .LBB13_2 -; CHECK-NEXT: .LBB13_7: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: fcvt.wu.s a5, fa3, rtz -; CHECK-NEXT: bltu a4, a3, .LBB13_3 -; CHECK-NEXT: .LBB13_8: # %entry -; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bgeu a5, a3, .LBB13_4 -; CHECK-NEXT: j .LBB13_5 entry: %conv = fptoui <4 x float> %x to <4 x i32> %0 = icmp ult <4 x i32> %conv, @@ -808,59 +457,15 @@ define <4 x i16> @ustest_f32i16(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.w.s a1, fa3, rtz -; CHECK-NEXT: lui a2, 16 -; CHECK-NEXT: addiw a5, a2, -1 -; CHECK-NEXT: fcvt.w.s a2, fa2, rtz -; CHECK-NEXT: bge a1, a5, .LBB14_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NEXT: bge a2, a5, .LBB14_11 -; CHECK-NEXT: .LBB14_2: # %entry -; CHECK-NEXT: fcvt.w.s a4, fa0, rtz -; CHECK-NEXT: bge a3, a5, .LBB14_12 -; CHECK-NEXT: .LBB14_3: # %entry -; CHECK-NEXT: bge a4, a5, .LBB14_13 -; CHECK-NEXT: .LBB14_4: # %entry -; CHECK-NEXT: blez a4, 
.LBB14_14 -; CHECK-NEXT: .LBB14_5: # %entry -; CHECK-NEXT: blez a3, .LBB14_15 -; CHECK-NEXT: .LBB14_6: # %entry -; CHECK-NEXT: blez a2, .LBB14_16 -; CHECK-NEXT: .LBB14_7: # %entry -; CHECK-NEXT: bgtz a1, .LBB14_9 -; CHECK-NEXT: .LBB14_8: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: .LBB14_9: # %entry -; CHECK-NEXT: sh a1, 6(a0) -; CHECK-NEXT: sh a2, 4(a0) -; CHECK-NEXT: sh a3, 2(a0) -; CHECK-NEXT: sh a4, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a0 +; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB14_10: # %entry -; CHECK-NEXT: mv a1, a5 -; CHECK-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NEXT: blt a2, a5, .LBB14_2 -; CHECK-NEXT: .LBB14_11: # %entry -; CHECK-NEXT: mv a2, a5 -; CHECK-NEXT: fcvt.w.s a4, fa0, rtz -; CHECK-NEXT: blt a3, a5, .LBB14_3 -; CHECK-NEXT: .LBB14_12: # %entry -; CHECK-NEXT: mv a3, a5 -; CHECK-NEXT: blt a4, a5, .LBB14_4 -; CHECK-NEXT: .LBB14_13: # %entry -; CHECK-NEXT: mv a4, a5 -; CHECK-NEXT: bgtz a4, .LBB14_5 -; CHECK-NEXT: .LBB14_14: # %entry -; CHECK-NEXT: li a4, 0 -; CHECK-NEXT: bgtz a3, .LBB14_6 -; CHECK-NEXT: .LBB14_15: # %entry -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: bgtz a2, .LBB14_7 -; CHECK-NEXT: .LBB14_16: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: blez a1, .LBB14_8 -; CHECK-NEXT: j .LBB14_9 entry: %conv = fptosi <4 x float> %x to <4 x i32> %0 = icmp slt <4 x i32> %conv, @@ -874,24 +479,16 @@ define <8 x i16> @stest_f16i16(<8 x half> %x) { ; CHECK-LABEL: stest_f16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -128 -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 104(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 96(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 88(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s4, 80(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s5, 72(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s6, 64(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s7, 56(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 48(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 40(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs4, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs5, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs6, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s5, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s6, 32(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -900,170 +497,100 @@ ; CHECK-NEXT: .cfi_offset s4, -48 ; CHECK-NEXT: .cfi_offset s5, -56 ; CHECK-NEXT: .cfi_offset s6, -64 -; CHECK-NEXT: .cfi_offset s7, -72 -; CHECK-NEXT: .cfi_offset fs0, -80 -; CHECK-NEXT: .cfi_offset fs1, -88 -; CHECK-NEXT: .cfi_offset fs2, -96 -; CHECK-NEXT: .cfi_offset fs3, -104 -; CHECK-NEXT: .cfi_offset fs4, -112 -; CHECK-NEXT: .cfi_offset 
fs5, -120 -; CHECK-NEXT: .cfi_offset fs6, -128 -; CHECK-NEXT: lhu s1, 56(a1) -; CHECK-NEXT: lhu s2, 0(a1) -; CHECK-NEXT: lhu s3, 8(a1) -; CHECK-NEXT: lhu s4, 16(a1) -; CHECK-NEXT: lhu s5, 24(a1) -; CHECK-NEXT: lhu s6, 32(a1) -; CHECK-NEXT: lhu s7, 40(a1) -; CHECK-NEXT: lhu a1, 48(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs6, fa0 -; CHECK-NEXT: mv a0, s7 +; CHECK-NEXT: lhu s0, 56(a0) +; CHECK-NEXT: lhu s1, 48(a0) +; CHECK-NEXT: lhu s2, 40(a0) +; CHECK-NEXT: lhu s3, 32(a0) +; CHECK-NEXT: lhu s4, 24(a0) +; CHECK-NEXT: lhu s5, 16(a0) +; CHECK-NEXT: lhu s6, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs5, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 28(sp) ; CHECK-NEXT: mv a0, s6 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs4, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 0(sp) ; CHECK-NEXT: mv a0, s5 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs3, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 24(sp) ; CHECK-NEXT: mv a0, s4 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 20(sp) ; CHECK-NEXT: mv a0, s3 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 16(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.l.s s2, fs6, rtz +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 12(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NEXT: lui a1, 8 -; CHECK-NEXT: addiw a7, a1, -1 -; CHECK-NEXT: bge a0, a7, .LBB15_18 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NEXT: bge s2, a7, .LBB15_19 -; CHECK-NEXT: .LBB15_2: # %entry -; CHECK-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NEXT: bge a1, a7, .LBB15_20 -; CHECK-NEXT: .LBB15_3: # %entry -; CHECK-NEXT: fcvt.l.s a3, fs3, rtz -; CHECK-NEXT: bge a2, a7, .LBB15_21 -; CHECK-NEXT: .LBB15_4: # %entry -; CHECK-NEXT: fcvt.l.s a4, fs2, rtz -; CHECK-NEXT: bge a3, a7, .LBB15_22 -; CHECK-NEXT: .LBB15_5: # %entry -; CHECK-NEXT: fcvt.l.s a5, fs1, rtz -; CHECK-NEXT: bge a4, a7, .LBB15_23 -; CHECK-NEXT: .LBB15_6: # %entry -; CHECK-NEXT: fcvt.l.s a6, fs0, rtz -; CHECK-NEXT: bge a5, a7, .LBB15_24 -; CHECK-NEXT: .LBB15_7: # %entry -; CHECK-NEXT: bge a6, a7, .LBB15_25 -; CHECK-NEXT: .LBB15_8: # %entry -; CHECK-NEXT: lui a7, 1048568 -; CHECK-NEXT: bge a7, a6, .LBB15_26 -; CHECK-NEXT: .LBB15_9: # %entry -; CHECK-NEXT: bge a7, a5, .LBB15_27 -; CHECK-NEXT: .LBB15_10: # %entry -; CHECK-NEXT: bge a7, a4, .LBB15_28 -; CHECK-NEXT: .LBB15_11: # %entry -; CHECK-NEXT: bge a7, a3, .LBB15_29 -; CHECK-NEXT: .LBB15_12: # %entry -; CHECK-NEXT: bge a7, a2, .LBB15_30 -; CHECK-NEXT: .LBB15_13: # %entry -; CHECK-NEXT: bge a7, a1, .LBB15_31 -; CHECK-NEXT: .LBB15_14: # %entry -; CHECK-NEXT: bge a7, s2, .LBB15_32 -; CHECK-NEXT: .LBB15_15: # %entry -; CHECK-NEXT: blt a7, a0, .LBB15_17 -; CHECK-NEXT: .LBB15_16: # %entry +; CHECK-NEXT: sw a0, 8(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 4(sp) +; CHECK-NEXT: addi a0, sp, 28 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; 
CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: addi a0, sp, 20 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 3 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: addi a0, sp, 12 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 5 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 6 +; CHECK-NEXT: addi a0, sp, 4 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: .LBB15_17: # %entry -; CHECK-NEXT: sh a0, 14(s0) -; CHECK-NEXT: sh s2, 12(s0) -; CHECK-NEXT: sh a1, 10(s0) -; CHECK-NEXT: sh a2, 8(s0) -; CHECK-NEXT: sh a3, 6(s0) -; CHECK-NEXT: sh a4, 4(s0) -; CHECK-NEXT: sh a5, 2(s0) -; CHECK-NEXT: sh a6, 0(s0) -; CHECK-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 104(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 96(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 88(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s4, 80(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s5, 72(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s6, 64(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s7, 56(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 48(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 40(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs4, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs5, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs6, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 128 +; CHECK-NEXT: vmax.vx v10, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 +; CHECK-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s5, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s6, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 96 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB15_18: # %entry -; CHECK-NEXT: mv a0, a7 -; CHECK-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NEXT: blt s2, a7, .LBB15_2 -; CHECK-NEXT: .LBB15_19: # %entry -; CHECK-NEXT: mv s2, a7 -; CHECK-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NEXT: blt a1, a7, .LBB15_3 -; CHECK-NEXT: .LBB15_20: # %entry -; CHECK-NEXT: mv a1, a7 -; CHECK-NEXT: fcvt.l.s a3, fs3, rtz -; CHECK-NEXT: blt a2, a7, .LBB15_4 -; CHECK-NEXT: 
.LBB15_21: # %entry -; CHECK-NEXT: mv a2, a7 -; CHECK-NEXT: fcvt.l.s a4, fs2, rtz -; CHECK-NEXT: blt a3, a7, .LBB15_5 -; CHECK-NEXT: .LBB15_22: # %entry -; CHECK-NEXT: mv a3, a7 -; CHECK-NEXT: fcvt.l.s a5, fs1, rtz -; CHECK-NEXT: blt a4, a7, .LBB15_6 -; CHECK-NEXT: .LBB15_23: # %entry -; CHECK-NEXT: mv a4, a7 -; CHECK-NEXT: fcvt.l.s a6, fs0, rtz -; CHECK-NEXT: blt a5, a7, .LBB15_7 -; CHECK-NEXT: .LBB15_24: # %entry -; CHECK-NEXT: mv a5, a7 -; CHECK-NEXT: blt a6, a7, .LBB15_8 -; CHECK-NEXT: .LBB15_25: # %entry -; CHECK-NEXT: mv a6, a7 -; CHECK-NEXT: lui a7, 1048568 -; CHECK-NEXT: blt a7, a6, .LBB15_9 -; CHECK-NEXT: .LBB15_26: # %entry -; CHECK-NEXT: lui a6, 1048568 -; CHECK-NEXT: blt a7, a5, .LBB15_10 -; CHECK-NEXT: .LBB15_27: # %entry -; CHECK-NEXT: lui a5, 1048568 -; CHECK-NEXT: blt a7, a4, .LBB15_11 -; CHECK-NEXT: .LBB15_28: # %entry -; CHECK-NEXT: lui a4, 1048568 -; CHECK-NEXT: blt a7, a3, .LBB15_12 -; CHECK-NEXT: .LBB15_29: # %entry -; CHECK-NEXT: lui a3, 1048568 -; CHECK-NEXT: blt a7, a2, .LBB15_13 -; CHECK-NEXT: .LBB15_30: # %entry -; CHECK-NEXT: lui a2, 1048568 -; CHECK-NEXT: blt a7, a1, .LBB15_14 -; CHECK-NEXT: .LBB15_31: # %entry -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: blt a7, s2, .LBB15_15 -; CHECK-NEXT: .LBB15_32: # %entry -; CHECK-NEXT: lui s2, 1048568 -; CHECK-NEXT: bge a7, a0, .LBB15_16 -; CHECK-NEXT: j .LBB15_17 entry: %conv = fptosi <8 x half> %x to <8 x i32> %0 = icmp slt <8 x i32> %conv, @@ -1077,24 +604,16 @@ define <8 x i16> @utesth_f16i16(<8 x half> %x) { ; CHECK-LABEL: utesth_f16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -128 -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 104(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 96(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 88(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s4, 80(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s5, 72(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s6, 64(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s7, 56(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 48(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 40(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs4, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs5, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs6, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s5, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s6, 32(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -1103,128 +622,98 @@ ; CHECK-NEXT: .cfi_offset s4, -48 ; CHECK-NEXT: .cfi_offset s5, -56 ; CHECK-NEXT: .cfi_offset s6, -64 -; CHECK-NEXT: .cfi_offset s7, -72 -; CHECK-NEXT: .cfi_offset fs0, -80 -; CHECK-NEXT: .cfi_offset fs1, -88 -; CHECK-NEXT: .cfi_offset fs2, -96 -; CHECK-NEXT: .cfi_offset fs3, -104 -; CHECK-NEXT: .cfi_offset fs4, -112 -; CHECK-NEXT: .cfi_offset fs5, -120 -; CHECK-NEXT: .cfi_offset fs6, -128 -; CHECK-NEXT: lhu s1, 0(a1) -; CHECK-NEXT: lhu s2, 56(a1) -; CHECK-NEXT: lhu s3, 48(a1) -; 
CHECK-NEXT: lhu s4, 40(a1) -; CHECK-NEXT: lhu s5, 32(a1) -; CHECK-NEXT: lhu s6, 24(a1) -; CHECK-NEXT: lhu s7, 16(a1) -; CHECK-NEXT: lhu a1, 8(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs6, fa0 -; CHECK-NEXT: mv a0, s7 +; CHECK-NEXT: lhu s0, 56(a0) +; CHECK-NEXT: lhu s1, 48(a0) +; CHECK-NEXT: lhu s2, 40(a0) +; CHECK-NEXT: lhu s3, 32(a0) +; CHECK-NEXT: lhu s4, 24(a0) +; CHECK-NEXT: lhu s5, 16(a0) +; CHECK-NEXT: lhu s6, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs5, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 28(sp) ; CHECK-NEXT: mv a0, s6 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs4, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 0(sp) ; CHECK-NEXT: mv a0, s5 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs3, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 24(sp) ; CHECK-NEXT: mv a0, s4 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 20(sp) ; CHECK-NEXT: mv a0, s3 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 16(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.lu.s s2, fs6, rtz +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 12(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-NEXT: lui a1, 16 -; CHECK-NEXT: addiw a1, a1, -1 -; CHECK-NEXT: bgeu a0, a1, .LBB16_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.lu.s a2, fs5, rtz -; CHECK-NEXT: bgeu s2, a1, .LBB16_11 -; CHECK-NEXT: .LBB16_2: # %entry -; CHECK-NEXT: fcvt.lu.s a3, fs4, rtz -; CHECK-NEXT: bgeu a2, a1, .LBB16_12 -; CHECK-NEXT: .LBB16_3: # %entry -; CHECK-NEXT: fcvt.lu.s a4, fs3, rtz -; CHECK-NEXT: bgeu a3, a1, .LBB16_13 -; CHECK-NEXT: .LBB16_4: # %entry -; CHECK-NEXT: fcvt.lu.s a5, fs2, rtz -; CHECK-NEXT: bgeu a4, a1, .LBB16_14 -; CHECK-NEXT: .LBB16_5: # %entry -; CHECK-NEXT: fcvt.lu.s a6, fs1, rtz -; CHECK-NEXT: bgeu a5, a1, .LBB16_15 -; CHECK-NEXT: .LBB16_6: # %entry -; CHECK-NEXT: fcvt.lu.s a7, fs0, rtz -; CHECK-NEXT: bgeu a6, a1, .LBB16_16 -; CHECK-NEXT: .LBB16_7: # %entry -; CHECK-NEXT: bltu a7, a1, .LBB16_9 -; CHECK-NEXT: .LBB16_8: # %entry -; CHECK-NEXT: mv a7, a1 -; CHECK-NEXT: .LBB16_9: # %entry -; CHECK-NEXT: sh a7, 14(s0) -; CHECK-NEXT: sh a6, 12(s0) -; CHECK-NEXT: sh a5, 10(s0) -; CHECK-NEXT: sh a4, 8(s0) -; CHECK-NEXT: sh a3, 6(s0) -; CHECK-NEXT: sh a2, 4(s0) -; CHECK-NEXT: sh s2, 2(s0) -; CHECK-NEXT: sh a0, 0(s0) -; CHECK-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 104(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 96(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 88(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s4, 80(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s5, 72(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s6, 64(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s7, 56(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 48(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 40(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs4, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs5, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs6, 
0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 128 +; CHECK-NEXT: sw a0, 8(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 4(sp) +; CHECK-NEXT: addi a0, sp, 28 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: addi a0, sp, 20 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 3 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: addi a0, sp, 12 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 5 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 6 +; CHECK-NEXT: addi a0, sp, 4 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vminu.vx v10, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 +; CHECK-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s5, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s6, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 96 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB16_10: # %entry -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: fcvt.lu.s a2, fs5, rtz -; CHECK-NEXT: bltu s2, a1, .LBB16_2 -; CHECK-NEXT: .LBB16_11: # %entry -; CHECK-NEXT: mv s2, a1 -; CHECK-NEXT: fcvt.lu.s a3, fs4, rtz -; CHECK-NEXT: bltu a2, a1, .LBB16_3 -; CHECK-NEXT: .LBB16_12: # %entry -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: fcvt.lu.s a4, fs3, rtz -; CHECK-NEXT: bltu a3, a1, .LBB16_4 -; CHECK-NEXT: .LBB16_13: # %entry -; CHECK-NEXT: mv a3, a1 -; CHECK-NEXT: fcvt.lu.s a5, fs2, rtz -; CHECK-NEXT: bltu a4, a1, .LBB16_5 -; CHECK-NEXT: .LBB16_14: # %entry -; CHECK-NEXT: mv a4, a1 -; CHECK-NEXT: fcvt.lu.s a6, fs1, rtz -; CHECK-NEXT: bltu a5, a1, .LBB16_6 -; CHECK-NEXT: .LBB16_15: # %entry -; CHECK-NEXT: mv a5, a1 -; CHECK-NEXT: fcvt.lu.s a7, fs0, rtz -; CHECK-NEXT: bltu a6, a1, .LBB16_7 -; CHECK-NEXT: .LBB16_16: # %entry -; CHECK-NEXT: mv a6, a1 -; CHECK-NEXT: bgeu a7, a1, .LBB16_8 -; CHECK-NEXT: j .LBB16_9 entry: %conv = fptoui <8 x half> %x to <8 x i32> %0 = icmp ult <8 x i32> %conv, @@ -1236,24 +725,16 @@ define <8 x i16> @ustest_f16i16(<8 x half> %x) { ; CHECK-LABEL: ustest_f16i16: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -128 -; 
CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 104(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 96(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 88(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s4, 80(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s5, 72(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s6, 64(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s7, 56(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 48(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 40(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs4, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs5, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs6, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s5, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s6, 32(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -1262,168 +743,99 @@ ; CHECK-NEXT: .cfi_offset s4, -48 ; CHECK-NEXT: .cfi_offset s5, -56 ; CHECK-NEXT: .cfi_offset s6, -64 -; CHECK-NEXT: .cfi_offset s7, -72 -; CHECK-NEXT: .cfi_offset fs0, -80 -; CHECK-NEXT: .cfi_offset fs1, -88 -; CHECK-NEXT: .cfi_offset fs2, -96 -; CHECK-NEXT: .cfi_offset fs3, -104 -; CHECK-NEXT: .cfi_offset fs4, -112 -; CHECK-NEXT: .cfi_offset fs5, -120 -; CHECK-NEXT: .cfi_offset fs6, -128 -; CHECK-NEXT: lhu s1, 56(a1) -; CHECK-NEXT: lhu s2, 0(a1) -; CHECK-NEXT: lhu s3, 8(a1) -; CHECK-NEXT: lhu s4, 16(a1) -; CHECK-NEXT: lhu s5, 24(a1) -; CHECK-NEXT: lhu s6, 32(a1) -; CHECK-NEXT: lhu s7, 40(a1) -; CHECK-NEXT: lhu a1, 48(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs6, fa0 -; CHECK-NEXT: mv a0, s7 +; CHECK-NEXT: lhu s0, 56(a0) +; CHECK-NEXT: lhu s1, 48(a0) +; CHECK-NEXT: lhu s2, 40(a0) +; CHECK-NEXT: lhu s3, 32(a0) +; CHECK-NEXT: lhu s4, 24(a0) +; CHECK-NEXT: lhu s5, 16(a0) +; CHECK-NEXT: lhu s6, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs5, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 28(sp) ; CHECK-NEXT: mv a0, s6 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs4, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 0(sp) ; CHECK-NEXT: mv a0, s5 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs3, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 24(sp) ; CHECK-NEXT: mv a0, s4 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 20(sp) ; CHECK-NEXT: mv a0, s3 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 16(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.l.s s2, fs6, rtz +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 12(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NEXT: lui 
a1, 16 -; CHECK-NEXT: addiw a7, a1, -1 -; CHECK-NEXT: bge a0, a7, .LBB17_18 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NEXT: bge s2, a7, .LBB17_19 -; CHECK-NEXT: .LBB17_2: # %entry -; CHECK-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NEXT: bge a1, a7, .LBB17_20 -; CHECK-NEXT: .LBB17_3: # %entry -; CHECK-NEXT: fcvt.l.s a3, fs3, rtz -; CHECK-NEXT: bge a2, a7, .LBB17_21 -; CHECK-NEXT: .LBB17_4: # %entry -; CHECK-NEXT: fcvt.l.s a4, fs2, rtz -; CHECK-NEXT: bge a3, a7, .LBB17_22 -; CHECK-NEXT: .LBB17_5: # %entry -; CHECK-NEXT: fcvt.l.s a5, fs1, rtz -; CHECK-NEXT: bge a4, a7, .LBB17_23 -; CHECK-NEXT: .LBB17_6: # %entry -; CHECK-NEXT: fcvt.l.s a6, fs0, rtz -; CHECK-NEXT: bge a5, a7, .LBB17_24 -; CHECK-NEXT: .LBB17_7: # %entry -; CHECK-NEXT: bge a6, a7, .LBB17_25 -; CHECK-NEXT: .LBB17_8: # %entry -; CHECK-NEXT: blez a6, .LBB17_26 -; CHECK-NEXT: .LBB17_9: # %entry -; CHECK-NEXT: blez a5, .LBB17_27 -; CHECK-NEXT: .LBB17_10: # %entry -; CHECK-NEXT: blez a4, .LBB17_28 -; CHECK-NEXT: .LBB17_11: # %entry -; CHECK-NEXT: blez a3, .LBB17_29 -; CHECK-NEXT: .LBB17_12: # %entry -; CHECK-NEXT: blez a2, .LBB17_30 -; CHECK-NEXT: .LBB17_13: # %entry -; CHECK-NEXT: blez a1, .LBB17_31 -; CHECK-NEXT: .LBB17_14: # %entry -; CHECK-NEXT: blez s2, .LBB17_32 -; CHECK-NEXT: .LBB17_15: # %entry -; CHECK-NEXT: bgtz a0, .LBB17_17 -; CHECK-NEXT: .LBB17_16: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: .LBB17_17: # %entry -; CHECK-NEXT: sh a0, 14(s0) -; CHECK-NEXT: sh s2, 12(s0) -; CHECK-NEXT: sh a1, 10(s0) -; CHECK-NEXT: sh a2, 8(s0) -; CHECK-NEXT: sh a3, 6(s0) -; CHECK-NEXT: sh a4, 4(s0) -; CHECK-NEXT: sh a5, 2(s0) -; CHECK-NEXT: sh a6, 0(s0) -; CHECK-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 104(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 96(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 88(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s4, 80(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s5, 72(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s6, 64(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s7, 56(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 48(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 40(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs4, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs5, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs6, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 128 +; CHECK-NEXT: sw a0, 8(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 4(sp) +; CHECK-NEXT: addi a0, sp, 28 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: addi a0, sp, 20 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 3 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, 
v10, 4 +; CHECK-NEXT: addi a0, sp, 12 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 5 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 6 +; CHECK-NEXT: addi a0, sp, 4 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v8, a0 +; CHECK-NEXT: vmax.vx v10, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 +; CHECK-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s5, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s6, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 96 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB17_18: # %entry -; CHECK-NEXT: mv a0, a7 -; CHECK-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NEXT: blt s2, a7, .LBB17_2 -; CHECK-NEXT: .LBB17_19: # %entry -; CHECK-NEXT: mv s2, a7 -; CHECK-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NEXT: blt a1, a7, .LBB17_3 -; CHECK-NEXT: .LBB17_20: # %entry -; CHECK-NEXT: mv a1, a7 -; CHECK-NEXT: fcvt.l.s a3, fs3, rtz -; CHECK-NEXT: blt a2, a7, .LBB17_4 -; CHECK-NEXT: .LBB17_21: # %entry -; CHECK-NEXT: mv a2, a7 -; CHECK-NEXT: fcvt.l.s a4, fs2, rtz -; CHECK-NEXT: blt a3, a7, .LBB17_5 -; CHECK-NEXT: .LBB17_22: # %entry -; CHECK-NEXT: mv a3, a7 -; CHECK-NEXT: fcvt.l.s a5, fs1, rtz -; CHECK-NEXT: blt a4, a7, .LBB17_6 -; CHECK-NEXT: .LBB17_23: # %entry -; CHECK-NEXT: mv a4, a7 -; CHECK-NEXT: fcvt.l.s a6, fs0, rtz -; CHECK-NEXT: blt a5, a7, .LBB17_7 -; CHECK-NEXT: .LBB17_24: # %entry -; CHECK-NEXT: mv a5, a7 -; CHECK-NEXT: blt a6, a7, .LBB17_8 -; CHECK-NEXT: .LBB17_25: # %entry -; CHECK-NEXT: mv a6, a7 -; CHECK-NEXT: bgtz a6, .LBB17_9 -; CHECK-NEXT: .LBB17_26: # %entry -; CHECK-NEXT: li a6, 0 -; CHECK-NEXT: bgtz a5, .LBB17_10 -; CHECK-NEXT: .LBB17_27: # %entry -; CHECK-NEXT: li a5, 0 -; CHECK-NEXT: bgtz a4, .LBB17_11 -; CHECK-NEXT: .LBB17_28: # %entry -; CHECK-NEXT: li a4, 0 -; CHECK-NEXT: bgtz a3, .LBB17_12 -; CHECK-NEXT: .LBB17_29: # %entry -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: bgtz a2, .LBB17_13 -; CHECK-NEXT: .LBB17_30: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: bgtz a1, .LBB17_14 -; CHECK-NEXT: .LBB17_31: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: bgtz s2, .LBB17_15 -; CHECK-NEXT: .LBB17_32: # %entry -; CHECK-NEXT: li s2, 0 -; CHECK-NEXT: blez a0, .LBB17_16 -; CHECK-NEXT: j .LBB17_17 entry: %conv = fptosi <8 x half> %x to <8 x i32> %0 = icmp slt <8 x i32> %conv, @@ -1439,79 +851,95 @@ define <2 x i64> @stest_f64i64(<2 x double> %x) { ; CHECK-LABEL: stest_f64i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 
80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.d fs0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixdfti@plt ; CHECK-NEXT: mv s0, a0 ; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.d fa0, fs0 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixdfti@plt -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: srli a3, a0, 1 -; CHECK-NEXT: beqz a1, .LBB18_3 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: srli a3, a2, 1 +; CHECK-NEXT: beqz s1, .LBB18_3 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: slti a4, a1, 0 -; CHECK-NEXT: bnez s1, .LBB18_4 +; CHECK-NEXT: slti a4, s1, 0 +; CHECK-NEXT: bnez a1, .LBB18_4 ; CHECK-NEXT: .LBB18_2: -; CHECK-NEXT: sltu a5, s0, a3 -; CHECK-NEXT: beqz a5, .LBB18_5 +; CHECK-NEXT: sltu a5, a0, a3 +; CHECK-NEXT: beqz a4, .LBB18_5 ; CHECK-NEXT: j .LBB18_6 ; CHECK-NEXT: .LBB18_3: -; CHECK-NEXT: sltu a4, a2, a3 -; CHECK-NEXT: beqz s1, .LBB18_2 +; CHECK-NEXT: sltu a4, s0, a3 +; CHECK-NEXT: beqz a1, .LBB18_2 ; CHECK-NEXT: .LBB18_4: # %entry -; CHECK-NEXT: slti a5, s1, 0 -; CHECK-NEXT: bnez a5, .LBB18_6 +; CHECK-NEXT: slti a5, a1, 0 +; CHECK-NEXT: bnez a4, .LBB18_6 ; CHECK-NEXT: .LBB18_5: # %entry ; CHECK-NEXT: li s1, 0 ; CHECK-NEXT: mv s0, a3 ; CHECK-NEXT: .LBB18_6: # %entry -; CHECK-NEXT: beqz a4, .LBB18_10 +; CHECK-NEXT: beqz a5, .LBB18_10 ; CHECK-NEXT: # %bb.7: # %entry -; CHECK-NEXT: slli a3, a0, 63 -; CHECK-NEXT: beq a1, a0, .LBB18_11 +; CHECK-NEXT: slli a3, a2, 63 +; CHECK-NEXT: beq s1, a2, .LBB18_11 ; CHECK-NEXT: .LBB18_8: # %entry -; CHECK-NEXT: slt a1, a0, a1 -; CHECK-NEXT: bne s1, a0, .LBB18_12 +; CHECK-NEXT: slt a4, a2, s1 +; CHECK-NEXT: bne a1, a2, .LBB18_12 ; CHECK-NEXT: .LBB18_9: -; CHECK-NEXT: sltu a0, a3, s0 -; CHECK-NEXT: beqz a0, .LBB18_13 +; CHECK-NEXT: sltu a1, a3, a0 +; CHECK-NEXT: beqz a4, .LBB18_13 ; CHECK-NEXT: j .LBB18_14 ; CHECK-NEXT: .LBB18_10: # %entry ; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: slli a3, a0, 63 -; CHECK-NEXT: bne a1, a0, .LBB18_8 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: slli a3, a2, 63 +; CHECK-NEXT: bne s1, a2, .LBB18_8 ; CHECK-NEXT: .LBB18_11: -; CHECK-NEXT: sltu a1, a3, a2 -; CHECK-NEXT: beq s1, a0, .LBB18_9 +; CHECK-NEXT: sltu a4, a3, s0 +; CHECK-NEXT: beq a1, a2, .LBB18_9 ; CHECK-NEXT: .LBB18_12: # %entry -; CHECK-NEXT: slt a0, a0, s1 -; CHECK-NEXT: bnez a0, .LBB18_14 +; CHECK-NEXT: slt a1, a2, a1 +; CHECK-NEXT: bnez a4, .LBB18_14 ; CHECK-NEXT: .LBB18_13: # %entry ; CHECK-NEXT: mv s0, a3 ; CHECK-NEXT: .LBB18_14: # %entry ; CHECK-NEXT: bnez a1, .LBB18_16 ; CHECK-NEXT: # %bb.15: # %entry -; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: mv a0, a3 ; CHECK-NEXT: .LBB18_16: # %entry -; CHECK-NEXT: mv a0, s0 -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; 
CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: sd s0, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i128> @@ -1526,37 +954,54 @@ define <2 x i64> @utest_f64i64(<2 x double> %x) { ; CHECK-LABEL: utest_f64i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.d fs0, fa0 -; CHECK-NEXT: fmv.d fa0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixunsdfti@plt ; CHECK-NEXT: mv s0, a0 ; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.d fa0, fs0 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixunsdfti@plt -; CHECK-NEXT: beqz a1, .LBB19_2 +; CHECK-NEXT: beqz s1, .LBB19_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: li s0, 0 ; CHECK-NEXT: .LBB19_2: # %entry -; CHECK-NEXT: beqz s1, .LBB19_4 +; CHECK-NEXT: beqz a1, .LBB19_4 ; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: li s0, 0 +; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: .LBB19_4: # %entry -; CHECK-NEXT: mv a1, s0 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: sd s0, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret entry: %conv = fptoui <2 x double> %x to <2 x i128> @@ -1569,73 +1014,84 @@ define <2 x i64> @ustest_f64i64(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i64: ; CHECK: # %bb.0: # 
%entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.d fs0, fa0 -; CHECK-NEXT: fmv.d fa0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixdfti@plt ; CHECK-NEXT: mv s0, a0 ; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.d fa0, fs0 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixdfti@plt ; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: bgtz a1, .LBB20_7 +; CHECK-NEXT: bgtz a1, .LBB20_6 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: mv a3, s1 -; CHECK-NEXT: bgtz s1, .LBB20_8 +; CHECK-NEXT: bgtz s1, .LBB20_7 ; CHECK-NEXT: .LBB20_2: # %entry -; CHECK-NEXT: bgtz a1, .LBB20_9 +; CHECK-NEXT: bgtz a1, .LBB20_8 ; CHECK-NEXT: .LBB20_3: # %entry -; CHECK-NEXT: bgtz s1, .LBB20_10 +; CHECK-NEXT: beqz a2, .LBB20_9 ; CHECK-NEXT: .LBB20_4: # %entry -; CHECK-NEXT: beqz a3, .LBB20_11 -; CHECK-NEXT: .LBB20_5: # %entry -; CHECK-NEXT: sgtz a1, a3 -; CHECK-NEXT: bnez a2, .LBB20_12 -; CHECK-NEXT: .LBB20_6: -; CHECK-NEXT: snez a2, a0 -; CHECK-NEXT: beqz a2, .LBB20_13 -; CHECK-NEXT: j .LBB20_14 -; CHECK-NEXT: .LBB20_7: # %entry +; CHECK-NEXT: sgtz a1, a2 +; CHECK-NEXT: bnez s1, .LBB20_10 +; CHECK-NEXT: .LBB20_5: +; CHECK-NEXT: snez a2, s0 +; CHECK-NEXT: beqz a2, .LBB20_11 +; CHECK-NEXT: j .LBB20_12 +; CHECK-NEXT: .LBB20_6: # %entry ; CHECK-NEXT: li a2, 1 -; CHECK-NEXT: mv a3, s1 ; CHECK-NEXT: blez s1, .LBB20_2 -; CHECK-NEXT: .LBB20_8: # %entry -; CHECK-NEXT: li a3, 1 +; CHECK-NEXT: .LBB20_7: # %entry +; CHECK-NEXT: li s0, 0 +; CHECK-NEXT: li s1, 1 ; CHECK-NEXT: blez a1, .LBB20_3 -; CHECK-NEXT: .LBB20_9: # %entry +; CHECK-NEXT: .LBB20_8: # %entry ; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: blez s1, .LBB20_4 +; CHECK-NEXT: bnez a2, .LBB20_4 +; CHECK-NEXT: .LBB20_9: +; CHECK-NEXT: snez a1, a0 +; CHECK-NEXT: beqz s1, .LBB20_5 ; CHECK-NEXT: .LBB20_10: # %entry +; CHECK-NEXT: sgtz a2, s1 +; CHECK-NEXT: bnez a2, .LBB20_12 +; CHECK-NEXT: .LBB20_11: # %entry ; CHECK-NEXT: li s0, 0 -; CHECK-NEXT: bnez a3, .LBB20_5 -; CHECK-NEXT: .LBB20_11: -; CHECK-NEXT: snez a1, s0 -; CHECK-NEXT: beqz a2, .LBB20_6 ; CHECK-NEXT: .LBB20_12: # %entry -; CHECK-NEXT: sgtz a2, a2 -; CHECK-NEXT: bnez a2, .LBB20_14 -; CHECK-NEXT: .LBB20_13: # %entry +; CHECK-NEXT: bnez a1, .LBB20_14 +; CHECK-NEXT: # %bb.13: # %entry ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: .LBB20_14: # %entry -; CHECK-NEXT: bnez a1, .LBB20_16 -; CHECK-NEXT: # %bb.15: # %entry -; CHECK-NEXT: li s0, 0 -; CHECK-NEXT: .LBB20_16: # %entry -; CHECK-NEXT: mv a1, s0 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 
8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: sd s0, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i128> @@ -1650,79 +1106,95 @@ define <2 x i64> @stest_f32i64(<2 x float> %x) { ; CHECK-LABEL: stest_f32i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.s fs0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixsfti@plt ; CHECK-NEXT: mv s0, a0 ; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.s fa0, fs0 +; CHECK-NEXT: vsetivli zero, 0, e32, mf2, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixsfti@plt -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: srli a3, a0, 1 -; CHECK-NEXT: beqz a1, .LBB21_3 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: srli a3, a2, 1 +; CHECK-NEXT: beqz s1, .LBB21_3 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: slti a4, a1, 0 -; CHECK-NEXT: bnez s1, .LBB21_4 +; CHECK-NEXT: slti a4, s1, 0 +; CHECK-NEXT: bnez a1, .LBB21_4 ; CHECK-NEXT: .LBB21_2: -; CHECK-NEXT: sltu a5, s0, a3 -; CHECK-NEXT: beqz a5, .LBB21_5 +; CHECK-NEXT: sltu a5, a0, a3 +; CHECK-NEXT: beqz a4, .LBB21_5 ; CHECK-NEXT: j .LBB21_6 ; CHECK-NEXT: .LBB21_3: -; CHECK-NEXT: sltu a4, a2, a3 -; CHECK-NEXT: beqz s1, .LBB21_2 +; CHECK-NEXT: sltu a4, s0, a3 +; CHECK-NEXT: beqz a1, .LBB21_2 ; CHECK-NEXT: .LBB21_4: # %entry -; CHECK-NEXT: slti a5, s1, 0 -; CHECK-NEXT: bnez a5, .LBB21_6 +; CHECK-NEXT: slti a5, a1, 0 +; CHECK-NEXT: bnez a4, .LBB21_6 ; CHECK-NEXT: .LBB21_5: # %entry ; CHECK-NEXT: li s1, 0 ; CHECK-NEXT: mv s0, a3 ; CHECK-NEXT: .LBB21_6: # %entry -; CHECK-NEXT: beqz a4, .LBB21_10 +; CHECK-NEXT: beqz a5, .LBB21_10 ; CHECK-NEXT: # %bb.7: # %entry -; CHECK-NEXT: slli a3, a0, 63 -; CHECK-NEXT: beq a1, a0, .LBB21_11 +; CHECK-NEXT: slli a3, a2, 63 +; CHECK-NEXT: beq s1, a2, .LBB21_11 ; CHECK-NEXT: .LBB21_8: # %entry -; CHECK-NEXT: slt a1, a0, a1 -; CHECK-NEXT: bne s1, a0, .LBB21_12 +; CHECK-NEXT: slt a4, a2, s1 +; CHECK-NEXT: bne a1, a2, .LBB21_12 ; 
CHECK-NEXT: .LBB21_9: -; CHECK-NEXT: sltu a0, a3, s0 -; CHECK-NEXT: beqz a0, .LBB21_13 +; CHECK-NEXT: sltu a1, a3, a0 +; CHECK-NEXT: beqz a4, .LBB21_13 ; CHECK-NEXT: j .LBB21_14 ; CHECK-NEXT: .LBB21_10: # %entry ; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: slli a3, a0, 63 -; CHECK-NEXT: bne a1, a0, .LBB21_8 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: slli a3, a2, 63 +; CHECK-NEXT: bne s1, a2, .LBB21_8 ; CHECK-NEXT: .LBB21_11: -; CHECK-NEXT: sltu a1, a3, a2 -; CHECK-NEXT: beq s1, a0, .LBB21_9 +; CHECK-NEXT: sltu a4, a3, s0 +; CHECK-NEXT: beq a1, a2, .LBB21_9 ; CHECK-NEXT: .LBB21_12: # %entry -; CHECK-NEXT: slt a0, a0, s1 -; CHECK-NEXT: bnez a0, .LBB21_14 +; CHECK-NEXT: slt a1, a2, a1 +; CHECK-NEXT: bnez a4, .LBB21_14 ; CHECK-NEXT: .LBB21_13: # %entry ; CHECK-NEXT: mv s0, a3 ; CHECK-NEXT: .LBB21_14: # %entry ; CHECK-NEXT: bnez a1, .LBB21_16 ; CHECK-NEXT: # %bb.15: # %entry -; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: mv a0, a3 ; CHECK-NEXT: .LBB21_16: # %entry -; CHECK-NEXT: mv a0, s0 -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: sd s0, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x float> %x to <2 x i128> @@ -1737,37 +1209,54 @@ define <2 x i64> @utest_f32i64(<2 x float> %x) { ; CHECK-LABEL: utest_f32i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fmv.s fa0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixunssfti@plt ; CHECK-NEXT: mv s0, a0 ; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.s fa0, fs0 +; CHECK-NEXT: vsetivli zero, 0, e32, mf2, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixunssfti@plt -; CHECK-NEXT: beqz a1, .LBB22_2 +; CHECK-NEXT: beqz s1, .LBB22_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: li s0, 0 ; CHECK-NEXT: .LBB22_2: # 
%entry -; CHECK-NEXT: beqz s1, .LBB22_4 +; CHECK-NEXT: beqz a1, .LBB22_4 ; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: li s0, 0 +; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: .LBB22_4: # %entry -; CHECK-NEXT: mv a1, s0 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: sd s0, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret entry: %conv = fptoui <2 x float> %x to <2 x i128> @@ -1780,73 +1269,84 @@ define <2 x i64> @ustest_f32i64(<2 x float> %x) { ; CHECK-LABEL: ustest_f32i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fmv.s fa0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixsfti@plt ; CHECK-NEXT: mv s0, a0 ; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.s fa0, fs0 +; CHECK-NEXT: vsetivli zero, 0, e32, mf2, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixsfti@plt ; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: bgtz a1, .LBB23_7 +; CHECK-NEXT: bgtz a1, .LBB23_6 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: mv a3, s1 -; CHECK-NEXT: bgtz s1, .LBB23_8 +; CHECK-NEXT: bgtz s1, .LBB23_7 ; CHECK-NEXT: .LBB23_2: # %entry -; CHECK-NEXT: bgtz a1, .LBB23_9 +; CHECK-NEXT: bgtz a1, .LBB23_8 ; CHECK-NEXT: .LBB23_3: # %entry -; CHECK-NEXT: bgtz s1, .LBB23_10 +; CHECK-NEXT: beqz a2, .LBB23_9 ; CHECK-NEXT: .LBB23_4: # %entry -; CHECK-NEXT: beqz a3, .LBB23_11 -; CHECK-NEXT: .LBB23_5: # %entry -; CHECK-NEXT: sgtz a1, a3 -; CHECK-NEXT: bnez a2, .LBB23_12 -; CHECK-NEXT: .LBB23_6: -; CHECK-NEXT: snez a2, a0 -; CHECK-NEXT: beqz a2, .LBB23_13 -; CHECK-NEXT: j .LBB23_14 -; CHECK-NEXT: .LBB23_7: # %entry +; CHECK-NEXT: sgtz a1, a2 +; CHECK-NEXT: bnez s1, .LBB23_10 +; CHECK-NEXT: .LBB23_5: +; CHECK-NEXT: snez a2, s0 +; CHECK-NEXT: beqz a2, .LBB23_11 +; CHECK-NEXT: j .LBB23_12 +; CHECK-NEXT: .LBB23_6: # %entry ; CHECK-NEXT: li a2, 1 -; CHECK-NEXT: mv a3, s1 ; CHECK-NEXT: blez 
s1, .LBB23_2 -; CHECK-NEXT: .LBB23_8: # %entry -; CHECK-NEXT: li a3, 1 +; CHECK-NEXT: .LBB23_7: # %entry +; CHECK-NEXT: li s0, 0 +; CHECK-NEXT: li s1, 1 ; CHECK-NEXT: blez a1, .LBB23_3 -; CHECK-NEXT: .LBB23_9: # %entry +; CHECK-NEXT: .LBB23_8: # %entry ; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: blez s1, .LBB23_4 +; CHECK-NEXT: bnez a2, .LBB23_4 +; CHECK-NEXT: .LBB23_9: +; CHECK-NEXT: snez a1, a0 +; CHECK-NEXT: beqz s1, .LBB23_5 ; CHECK-NEXT: .LBB23_10: # %entry +; CHECK-NEXT: sgtz a2, s1 +; CHECK-NEXT: bnez a2, .LBB23_12 +; CHECK-NEXT: .LBB23_11: # %entry ; CHECK-NEXT: li s0, 0 -; CHECK-NEXT: bnez a3, .LBB23_5 -; CHECK-NEXT: .LBB23_11: -; CHECK-NEXT: snez a1, s0 -; CHECK-NEXT: beqz a2, .LBB23_6 ; CHECK-NEXT: .LBB23_12: # %entry -; CHECK-NEXT: sgtz a2, a2 -; CHECK-NEXT: bnez a2, .LBB23_14 -; CHECK-NEXT: .LBB23_13: # %entry +; CHECK-NEXT: bnez a1, .LBB23_14 +; CHECK-NEXT: # %bb.13: # %entry ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: .LBB23_14: # %entry -; CHECK-NEXT: bnez a1, .LBB23_16 -; CHECK-NEXT: # %bb.15: # %entry -; CHECK-NEXT: li s0, 0 -; CHECK-NEXT: .LBB23_16: # %entry -; CHECK-NEXT: mv a1, s0 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: sd s0, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x float> %x to <2 x i128> @@ -1861,12 +1361,12 @@ define <2 x i64> @stest_f16i64(<2 x half> %x) { ; CHECK-LABEL: stest_f16i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -1879,9 +1379,8 @@ ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: call __fixsfti@plt -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: srli a3, a0, 1 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: srli a3, a2, 1 ; CHECK-NEXT: beqz a1, .LBB24_3 ; CHECK-NEXT: # %bb.1: # %entry ; CHECK-NEXT: slti a4, a1, 0 @@ -1891,7 +1390,7 @@ ; CHECK-NEXT: beqz a5, .LBB24_5 ; CHECK-NEXT: j .LBB24_6 ; CHECK-NEXT: .LBB24_3: -; CHECK-NEXT: sltu a4, a2, a3 +; CHECK-NEXT: sltu a4, a0, a3 ; CHECK-NEXT: beqz s1, .LBB24_2 ; CHECK-NEXT: .LBB24_4: # %entry ; CHECK-NEXT: slti a5, s1, 0 @@ -1902,40 +1401,47 @@ ; CHECK-NEXT: .LBB24_6: # %entry ; CHECK-NEXT: beqz a4, .LBB24_10 ; CHECK-NEXT: # %bb.7: # %entry -; CHECK-NEXT: slli a3, a0, 63 -; 
CHECK-NEXT: beq a1, a0, .LBB24_11 +; CHECK-NEXT: slli a3, a2, 63 +; CHECK-NEXT: beq a1, a2, .LBB24_11 ; CHECK-NEXT: .LBB24_8: # %entry -; CHECK-NEXT: slt a1, a0, a1 -; CHECK-NEXT: bne s1, a0, .LBB24_12 +; CHECK-NEXT: slt a1, a2, a1 +; CHECK-NEXT: bne s1, a2, .LBB24_12 ; CHECK-NEXT: .LBB24_9: -; CHECK-NEXT: sltu a0, a3, s0 -; CHECK-NEXT: beqz a0, .LBB24_13 +; CHECK-NEXT: sltu a2, a3, s0 +; CHECK-NEXT: beqz a2, .LBB24_13 ; CHECK-NEXT: j .LBB24_14 ; CHECK-NEXT: .LBB24_10: # %entry ; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: slli a3, a0, 63 -; CHECK-NEXT: bne a1, a0, .LBB24_8 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: slli a3, a2, 63 +; CHECK-NEXT: bne a1, a2, .LBB24_8 ; CHECK-NEXT: .LBB24_11: -; CHECK-NEXT: sltu a1, a3, a2 -; CHECK-NEXT: beq s1, a0, .LBB24_9 +; CHECK-NEXT: sltu a1, a3, a0 +; CHECK-NEXT: beq s1, a2, .LBB24_9 ; CHECK-NEXT: .LBB24_12: # %entry -; CHECK-NEXT: slt a0, a0, s1 -; CHECK-NEXT: bnez a0, .LBB24_14 +; CHECK-NEXT: slt a2, a2, s1 +; CHECK-NEXT: bnez a2, .LBB24_14 ; CHECK-NEXT: .LBB24_13: # %entry ; CHECK-NEXT: mv s0, a3 ; CHECK-NEXT: .LBB24_14: # %entry ; CHECK-NEXT: bnez a1, .LBB24_16 ; CHECK-NEXT: # %bb.15: # %entry -; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: mv a0, a3 ; CHECK-NEXT: .LBB24_16: # %entry -; CHECK-NEXT: mv a0, s0 -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: sd s0, 0(sp) +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 48 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x half> %x to <2 x i128> @@ -1950,12 +1456,12 @@ define <2 x i64> @utesth_f16i64(<2 x half> %x) { ; CHECK-LABEL: utesth_f16i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -1977,12 +1483,20 @@ ; CHECK-NEXT: # %bb.3: # %entry ; CHECK-NEXT: li s0, 0 ; CHECK-NEXT: .LBB25_4: # %entry -; CHECK-NEXT: mv a1, s0 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd s0, 8(sp) +; CHECK-NEXT: sd a0, 0(sp) +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: 
vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 48 ; CHECK-NEXT: ret entry: %conv = fptoui <2 x half> %x to <2 x i128> @@ -1995,12 +1509,12 @@ define <2 x i64> @ustest_f16i64(<2 x half> %x) { ; CHECK-LABEL: ustest_f16i64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -2058,12 +1572,20 @@ ; CHECK-NEXT: # %bb.15: # %entry ; CHECK-NEXT: li s0, 0 ; CHECK-NEXT: .LBB26_16: # %entry -; CHECK-NEXT: mv a1, s0 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd s0, 8(sp) +; CHECK-NEXT: sd a0, 0(sp) +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 48 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x half> %x to <2 x i128> @@ -2082,30 +1604,14 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: stest_f64i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.l.d a1, fa1, rtz -; CHECK-NEXT: lui a2, 524288 -; CHECK-NEXT: addiw a3, a2, -1 -; CHECK-NEXT: fcvt.l.d a0, fa0, rtz -; CHECK-NEXT: bge a1, a3, .LBB27_5 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bge a0, a3, .LBB27_6 -; CHECK-NEXT: .LBB27_2: # %entry -; CHECK-NEXT: bge a2, a0, .LBB27_7 -; CHECK-NEXT: .LBB27_3: # %entry -; CHECK-NEXT: bge a2, a1, .LBB27_8 -; CHECK-NEXT: .LBB27_4: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB27_5: # %entry -; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: blt a0, a3, .LBB27_2 -; CHECK-NEXT: .LBB27_6: # %entry -; CHECK-NEXT: mv a0, a3 -; CHECK-NEXT: blt a2, a0, .LBB27_3 -; CHECK-NEXT: .LBB27_7: # %entry +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: blt a2, a1, .LBB27_4 -; CHECK-NEXT: .LBB27_8: # %entry -; CHECK-NEXT: lui a1, 524288 +; CHECK-NEXT: addiw a1, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a1 +; CHECK-NEXT: vmax.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -2118,20 +1624,13 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: utest_f64i32_mm: ; CHECK: # %bb.0: # %entry -; 
CHECK-NEXT: fcvt.lu.d a0, fa0, rtz -; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: srli a2, a1, 32 -; CHECK-NEXT: fcvt.lu.d a1, fa1, rtz -; CHECK-NEXT: bgeu a0, a2, .LBB28_3 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bgeu a1, a2, .LBB28_4 -; CHECK-NEXT: .LBB28_2: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB28_3: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a1, a2, .LBB28_2 -; CHECK-NEXT: .LBB28_4: # %entry -; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vminu.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptoui <2 x double> %x to <2 x i64> @@ -2143,30 +1642,14 @@ define <2 x i32> @ustest_f64i32_mm(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.l.d a1, fa1, rtz +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 ; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: srli a2, a0, 32 -; CHECK-NEXT: fcvt.l.d a0, fa0, rtz -; CHECK-NEXT: bge a1, a2, .LBB29_5 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bge a0, a2, .LBB29_6 -; CHECK-NEXT: .LBB29_2: # %entry -; CHECK-NEXT: blez a0, .LBB29_7 -; CHECK-NEXT: .LBB29_3: # %entry -; CHECK-NEXT: blez a1, .LBB29_8 -; CHECK-NEXT: .LBB29_4: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB29_5: # %entry -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: blt a0, a2, .LBB29_2 -; CHECK-NEXT: .LBB29_6: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bgtz a0, .LBB29_3 -; CHECK-NEXT: .LBB29_7: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: bgtz a1, .LBB29_4 -; CHECK-NEXT: .LBB29_8: # %entry -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vmin.vx v8, v8, a0 +; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i64> @@ -2179,59 +1662,16 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: stest_f32i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.l.s a1, fa3, rtz -; CHECK-NEXT: lui a3, 524288 -; CHECK-NEXT: addiw a6, a3, -1 -; CHECK-NEXT: fcvt.l.s a2, fa2, rtz -; CHECK-NEXT: bge a1, a6, .LBB30_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a4, fa1, rtz -; CHECK-NEXT: bge a2, a6, .LBB30_11 -; CHECK-NEXT: .LBB30_2: # %entry -; CHECK-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NEXT: bge a4, a6, .LBB30_12 -; CHECK-NEXT: .LBB30_3: # %entry -; CHECK-NEXT: bge a5, a6, .LBB30_13 -; CHECK-NEXT: .LBB30_4: # %entry -; CHECK-NEXT: bge a3, a5, .LBB30_14 -; CHECK-NEXT: .LBB30_5: # %entry -; CHECK-NEXT: bge a3, a4, .LBB30_15 -; CHECK-NEXT: .LBB30_6: # %entry -; CHECK-NEXT: bge a3, a2, .LBB30_16 -; CHECK-NEXT: .LBB30_7: # %entry -; CHECK-NEXT: blt a3, a1, .LBB30_9 -; CHECK-NEXT: .LBB30_8: # %entry -; CHECK-NEXT: lui a1, 524288 -; CHECK-NEXT: .LBB30_9: # %entry -; CHECK-NEXT: sw a1, 12(a0) -; CHECK-NEXT: sw a2, 8(a0) -; CHECK-NEXT: sw a4, 4(a0) -; CHECK-NEXT: sw a5, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; CHECK-NEXT: lui a0, 524288 +; CHECK-NEXT: addiw a1, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v10, a1 +; CHECK-NEXT: vmax.vx v10, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB30_10: # %entry -; CHECK-NEXT: mv a1, a6 -; CHECK-NEXT: 
fcvt.l.s a4, fa1, rtz -; CHECK-NEXT: blt a2, a6, .LBB30_2 -; CHECK-NEXT: .LBB30_11: # %entry -; CHECK-NEXT: mv a2, a6 -; CHECK-NEXT: fcvt.l.s a5, fa0, rtz -; CHECK-NEXT: blt a4, a6, .LBB30_3 -; CHECK-NEXT: .LBB30_12: # %entry -; CHECK-NEXT: mv a4, a6 -; CHECK-NEXT: blt a5, a6, .LBB30_4 -; CHECK-NEXT: .LBB30_13: # %entry -; CHECK-NEXT: mv a5, a6 -; CHECK-NEXT: blt a3, a5, .LBB30_5 -; CHECK-NEXT: .LBB30_14: # %entry -; CHECK-NEXT: lui a5, 524288 -; CHECK-NEXT: blt a3, a4, .LBB30_6 -; CHECK-NEXT: .LBB30_15: # %entry -; CHECK-NEXT: lui a4, 524288 -; CHECK-NEXT: blt a3, a2, .LBB30_7 -; CHECK-NEXT: .LBB30_16: # %entry -; CHECK-NEXT: lui a2, 524288 -; CHECK-NEXT: bge a3, a1, .LBB30_8 -; CHECK-NEXT: j .LBB30_9 entry: %conv = fptosi <4 x float> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2243,39 +1683,15 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: utest_f32i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.lu.s a1, fa0, rtz -; CHECK-NEXT: li a2, -1 -; CHECK-NEXT: srli a3, a2, 32 -; CHECK-NEXT: fcvt.lu.s a2, fa1, rtz -; CHECK-NEXT: bgeu a1, a3, .LBB31_6 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.lu.s a4, fa2, rtz -; CHECK-NEXT: bgeu a2, a3, .LBB31_7 -; CHECK-NEXT: .LBB31_2: # %entry -; CHECK-NEXT: fcvt.lu.s a5, fa3, rtz -; CHECK-NEXT: bgeu a4, a3, .LBB31_8 -; CHECK-NEXT: .LBB31_3: # %entry -; CHECK-NEXT: bltu a5, a3, .LBB31_5 -; CHECK-NEXT: .LBB31_4: # %entry -; CHECK-NEXT: mv a5, a3 -; CHECK-NEXT: .LBB31_5: # %entry -; CHECK-NEXT: sw a5, 12(a0) -; CHECK-NEXT: sw a4, 8(a0) -; CHECK-NEXT: sw a2, 4(a0) -; CHECK-NEXT: sw a1, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfwcvt.rtz.xu.f.v v10, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vminu.vx v10, v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB31_6: # %entry -; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: fcvt.lu.s a4, fa2, rtz -; CHECK-NEXT: bltu a2, a3, .LBB31_2 -; CHECK-NEXT: .LBB31_7: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: fcvt.lu.s a5, fa3, rtz -; CHECK-NEXT: bltu a4, a3, .LBB31_3 -; CHECK-NEXT: .LBB31_8: # %entry -; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bgeu a5, a3, .LBB31_4 -; CHECK-NEXT: j .LBB31_5 entry: %conv = fptoui <4 x float> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2286,59 +1702,16 @@ define <4 x i32> @ustest_f32i32_mm(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i32_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.l.s a1, fa3, rtz -; CHECK-NEXT: li a2, -1 -; CHECK-NEXT: srli a5, a2, 32 -; CHECK-NEXT: fcvt.l.s a2, fa2, rtz -; CHECK-NEXT: bge a1, a5, .LBB32_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a3, fa1, rtz -; CHECK-NEXT: bge a2, a5, .LBB32_11 -; CHECK-NEXT: .LBB32_2: # %entry -; CHECK-NEXT: fcvt.l.s a4, fa0, rtz -; CHECK-NEXT: bge a3, a5, .LBB32_12 -; CHECK-NEXT: .LBB32_3: # %entry -; CHECK-NEXT: bge a4, a5, .LBB32_13 -; CHECK-NEXT: .LBB32_4: # %entry -; CHECK-NEXT: blez a4, .LBB32_14 -; CHECK-NEXT: .LBB32_5: # %entry -; CHECK-NEXT: blez a3, .LBB32_15 -; CHECK-NEXT: .LBB32_6: # %entry -; CHECK-NEXT: blez a2, .LBB32_16 -; CHECK-NEXT: .LBB32_7: # %entry -; CHECK-NEXT: bgtz a1, .LBB32_9 -; CHECK-NEXT: .LBB32_8: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: .LBB32_9: # %entry -; CHECK-NEXT: sw a1, 12(a0) -; CHECK-NEXT: sw a2, 8(a0) -; CHECK-NEXT: sw a3, 4(a0) -; CHECK-NEXT: 
sw a4, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfwcvt.rtz.x.f.v v10, v8 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v10, a0 +; CHECK-NEXT: vmax.vx v10, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB32_10: # %entry -; CHECK-NEXT: mv a1, a5 -; CHECK-NEXT: fcvt.l.s a3, fa1, rtz -; CHECK-NEXT: blt a2, a5, .LBB32_2 -; CHECK-NEXT: .LBB32_11: # %entry -; CHECK-NEXT: mv a2, a5 -; CHECK-NEXT: fcvt.l.s a4, fa0, rtz -; CHECK-NEXT: blt a3, a5, .LBB32_3 -; CHECK-NEXT: .LBB32_12: # %entry -; CHECK-NEXT: mv a3, a5 -; CHECK-NEXT: blt a4, a5, .LBB32_4 -; CHECK-NEXT: .LBB32_13: # %entry -; CHECK-NEXT: mv a4, a5 -; CHECK-NEXT: bgtz a4, .LBB32_5 -; CHECK-NEXT: .LBB32_14: # %entry -; CHECK-NEXT: li a4, 0 -; CHECK-NEXT: bgtz a3, .LBB32_6 -; CHECK-NEXT: .LBB32_15: # %entry -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: bgtz a2, .LBB32_7 -; CHECK-NEXT: .LBB32_16: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: blez a1, .LBB32_8 -; CHECK-NEXT: j .LBB32_9 entry: %conv = fptosi <4 x float> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2356,96 +1729,59 @@ ; CHECK-NEXT: sd s0, 48(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s1, 40(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 0(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 ; CHECK-NEXT: .cfi_offset s2, -32 -; CHECK-NEXT: .cfi_offset s3, -40 -; CHECK-NEXT: .cfi_offset fs0, -48 -; CHECK-NEXT: .cfi_offset fs1, -56 -; CHECK-NEXT: .cfi_offset fs2, -64 -; CHECK-NEXT: lhu s1, 24(a1) -; CHECK-NEXT: lhu s2, 0(a1) -; CHECK-NEXT: lhu s3, 8(a1) -; CHECK-NEXT: lhu a1, 16(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 -; CHECK-NEXT: mv a0, s3 +; CHECK-NEXT: lhu s0, 24(a0) +; CHECK-NEXT: lhu s1, 16(a0) +; CHECK-NEXT: lhu s2, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 24(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.l.s s2, fs2, rtz +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 0(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NEXT: lui a1, 524288 -; CHECK-NEXT: addiw a4, a1, -1 -; CHECK-NEXT: bge a0, a4, .LBB33_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a2, fs1, rtz -; CHECK-NEXT: bge s2, a4, .LBB33_11 -; CHECK-NEXT: .LBB33_2: # %entry -; CHECK-NEXT: fcvt.l.s a3, fs0, rtz -; CHECK-NEXT: bge a2, a4, .LBB33_12 -; CHECK-NEXT: .LBB33_3: # %entry -; CHECK-NEXT: bge a3, a4, .LBB33_13 -; CHECK-NEXT: .LBB33_4: # %entry -; CHECK-NEXT: bge a1, a3, .LBB33_14 -; CHECK-NEXT: .LBB33_5: # %entry -; CHECK-NEXT: bge a1, a2, .LBB33_15 -; CHECK-NEXT: .LBB33_6: # %entry -; CHECK-NEXT: bge a1, s2, .LBB33_16 -; CHECK-NEXT: .LBB33_7: # %entry -; CHECK-NEXT: blt a1, a0, .LBB33_9 -; CHECK-NEXT: .LBB33_8: # %entry +; CHECK-NEXT: sd a0, 16(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.l.s 
a0, fa0, rtz +; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 3, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 2 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 3 ; CHECK-NEXT: lui a0, 524288 -; CHECK-NEXT: .LBB33_9: # %entry -; CHECK-NEXT: sw a0, 12(s0) -; CHECK-NEXT: sw s2, 8(s0) -; CHECK-NEXT: sw a2, 4(s0) -; CHECK-NEXT: sw a3, 0(s0) +; CHECK-NEXT: addiw a1, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v10, a1 +; CHECK-NEXT: vmax.vx v10, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 48(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s1, 40(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 0(sp) # 8-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 64 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB33_10: # %entry -; CHECK-NEXT: mv a0, a4 -; CHECK-NEXT: fcvt.l.s a2, fs1, rtz -; CHECK-NEXT: blt s2, a4, .LBB33_2 -; CHECK-NEXT: .LBB33_11: # %entry -; CHECK-NEXT: mv s2, a4 -; CHECK-NEXT: fcvt.l.s a3, fs0, rtz -; CHECK-NEXT: blt a2, a4, .LBB33_3 -; CHECK-NEXT: .LBB33_12: # %entry -; CHECK-NEXT: mv a2, a4 -; CHECK-NEXT: blt a3, a4, .LBB33_4 -; CHECK-NEXT: .LBB33_13: # %entry -; CHECK-NEXT: mv a3, a4 -; CHECK-NEXT: blt a1, a3, .LBB33_5 -; CHECK-NEXT: .LBB33_14: # %entry -; CHECK-NEXT: lui a3, 524288 -; CHECK-NEXT: blt a1, a2, .LBB33_6 -; CHECK-NEXT: .LBB33_15: # %entry -; CHECK-NEXT: lui a2, 524288 -; CHECK-NEXT: blt a1, s2, .LBB33_7 -; CHECK-NEXT: .LBB33_16: # %entry -; CHECK-NEXT: lui s2, 524288 -; CHECK-NEXT: bge a1, a0, .LBB33_8 -; CHECK-NEXT: j .LBB33_9 entry: %conv = fptosi <4 x half> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2463,76 +1799,58 @@ ; CHECK-NEXT: sd s0, 48(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s1, 40(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 0(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 ; CHECK-NEXT: .cfi_offset s2, -32 -; CHECK-NEXT: .cfi_offset s3, -40 -; CHECK-NEXT: .cfi_offset fs0, -48 -; CHECK-NEXT: .cfi_offset fs1, -56 -; CHECK-NEXT: .cfi_offset fs2, -64 -; CHECK-NEXT: lhu s1, 0(a1) -; CHECK-NEXT: lhu s2, 24(a1) -; CHECK-NEXT: lhu s3, 16(a1) -; CHECK-NEXT: lhu a1, 8(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 -; CHECK-NEXT: mv a0, s3 +; CHECK-NEXT: lhu s0, 24(a0) +; CHECK-NEXT: lhu s1, 16(a0) +; CHECK-NEXT: lhu s2, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call 
__extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 24(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.lu.s s2, fs2, rtz +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 0(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: srli a1, a1, 32 -; CHECK-NEXT: bgeu a0, a1, .LBB34_6 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.lu.s a2, fs1, rtz -; CHECK-NEXT: bgeu s2, a1, .LBB34_7 -; CHECK-NEXT: .LBB34_2: # %entry -; CHECK-NEXT: fcvt.lu.s a3, fs0, rtz -; CHECK-NEXT: bgeu a2, a1, .LBB34_8 -; CHECK-NEXT: .LBB34_3: # %entry -; CHECK-NEXT: bltu a3, a1, .LBB34_5 -; CHECK-NEXT: .LBB34_4: # %entry -; CHECK-NEXT: mv a3, a1 -; CHECK-NEXT: .LBB34_5: # %entry -; CHECK-NEXT: sw a3, 12(s0) -; CHECK-NEXT: sw a2, 8(s0) -; CHECK-NEXT: sw s2, 4(s0) -; CHECK-NEXT: sw a0, 0(s0) +; CHECK-NEXT: sd a0, 16(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 3, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 2 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 3 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vminu.vx v10, v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 48(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s1, 40(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 0(sp) # 8-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 64 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB34_6: # %entry -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: fcvt.lu.s a2, fs1, rtz -; CHECK-NEXT: bltu s2, a1, .LBB34_2 -; CHECK-NEXT: .LBB34_7: # %entry -; CHECK-NEXT: mv s2, a1 -; CHECK-NEXT: fcvt.lu.s a3, fs0, rtz -; CHECK-NEXT: bltu a2, a1, .LBB34_3 -; CHECK-NEXT: .LBB34_8: # %entry -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: bgeu a3, a1, .LBB34_4 -; CHECK-NEXT: j .LBB34_5 entry: %conv = fptoui <4 x half> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.umin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2549,96 +1867,59 @@ ; CHECK-NEXT: sd s0, 48(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s1, 40(sp) # 8-byte Folded Spill ; CHECK-NEXT: sd s2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 0(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 ; CHECK-NEXT: .cfi_offset s2, -32 -; CHECK-NEXT: 
.cfi_offset s3, -40 -; CHECK-NEXT: .cfi_offset fs0, -48 -; CHECK-NEXT: .cfi_offset fs1, -56 -; CHECK-NEXT: .cfi_offset fs2, -64 -; CHECK-NEXT: lhu s1, 24(a1) -; CHECK-NEXT: lhu s2, 0(a1) -; CHECK-NEXT: lhu s3, 8(a1) -; CHECK-NEXT: lhu a1, 16(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: lhu s0, 24(a0) +; CHECK-NEXT: lhu s1, 16(a0) +; CHECK-NEXT: lhu s2, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 -; CHECK-NEXT: mv a0, s3 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 24(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.l.s s2, fs2, rtz +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 0(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NEXT: li a1, -1 -; CHECK-NEXT: srli a3, a1, 32 -; CHECK-NEXT: bge a0, a3, .LBB35_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a1, fs1, rtz -; CHECK-NEXT: bge s2, a3, .LBB35_11 -; CHECK-NEXT: .LBB35_2: # %entry -; CHECK-NEXT: fcvt.l.s a2, fs0, rtz -; CHECK-NEXT: bge a1, a3, .LBB35_12 -; CHECK-NEXT: .LBB35_3: # %entry -; CHECK-NEXT: bge a2, a3, .LBB35_13 -; CHECK-NEXT: .LBB35_4: # %entry -; CHECK-NEXT: blez a2, .LBB35_14 -; CHECK-NEXT: .LBB35_5: # %entry -; CHECK-NEXT: blez a1, .LBB35_15 -; CHECK-NEXT: .LBB35_6: # %entry -; CHECK-NEXT: blez s2, .LBB35_16 -; CHECK-NEXT: .LBB35_7: # %entry -; CHECK-NEXT: bgtz a0, .LBB35_9 -; CHECK-NEXT: .LBB35_8: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: .LBB35_9: # %entry -; CHECK-NEXT: sw a0, 12(s0) -; CHECK-NEXT: sw s2, 8(s0) -; CHECK-NEXT: sw a1, 4(s0) -; CHECK-NEXT: sw a2, 0(s0) +; CHECK-NEXT: sd a0, 16(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 3, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 2 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 4, e64, m2, tu, mu +; CHECK-NEXT: vslideup.vi v10, v8, 3 +; CHECK-NEXT: li a0, -1 +; CHECK-NEXT: srli a0, a0, 32 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v10, a0 +; CHECK-NEXT: vmax.vx v10, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 ; CHECK-NEXT: ld ra, 56(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s0, 48(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s1, 40(sp) # 8-byte Folded Reload ; CHECK-NEXT: ld s2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 0(sp) # 8-byte Folded Reload ; CHECK-NEXT: addi sp, sp, 64 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB35_10: # %entry -; CHECK-NEXT: mv a0, a3 -; CHECK-NEXT: fcvt.l.s a1, fs1, rtz -; CHECK-NEXT: blt s2, a3, .LBB35_2 -; CHECK-NEXT: .LBB35_11: # %entry -; CHECK-NEXT: mv s2, a3 -; CHECK-NEXT: fcvt.l.s a2, fs0, rtz 
-; CHECK-NEXT: blt a1, a3, .LBB35_3 -; CHECK-NEXT: .LBB35_12: # %entry -; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: blt a2, a3, .LBB35_4 -; CHECK-NEXT: .LBB35_13: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: bgtz a2, .LBB35_5 -; CHECK-NEXT: .LBB35_14: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: bgtz a1, .LBB35_6 -; CHECK-NEXT: .LBB35_15: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: bgtz s2, .LBB35_7 -; CHECK-NEXT: .LBB35_16: # %entry -; CHECK-NEXT: li s2, 0 -; CHECK-NEXT: blez a0, .LBB35_8 -; CHECK-NEXT: j .LBB35_9 entry: %conv = fptosi <4 x half> %x to <4 x i64> %spec.store.select = call <4 x i64> @llvm.smin.v4i64(<4 x i64> %conv, <4 x i64> ) @@ -2652,32 +1933,15 @@ define <2 x i16> @stest_f64i16_mm(<2 x double> %x) { ; CHECK-LABEL: stest_f64i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.w.d a1, fa1, rtz +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8 ; CHECK-NEXT: lui a0, 8 -; CHECK-NEXT: addiw a2, a0, -1 -; CHECK-NEXT: fcvt.w.d a0, fa0, rtz -; CHECK-NEXT: bge a1, a2, .LBB36_5 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bge a0, a2, .LBB36_6 -; CHECK-NEXT: .LBB36_2: # %entry -; CHECK-NEXT: lui a2, 1048568 -; CHECK-NEXT: bge a2, a0, .LBB36_7 -; CHECK-NEXT: .LBB36_3: # %entry -; CHECK-NEXT: bge a2, a1, .LBB36_8 -; CHECK-NEXT: .LBB36_4: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB36_5: # %entry -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: blt a0, a2, .LBB36_2 -; CHECK-NEXT: .LBB36_6: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: lui a2, 1048568 -; CHECK-NEXT: blt a2, a0, .LBB36_3 -; CHECK-NEXT: .LBB36_7: # %entry +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v9, a0 ; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: blt a2, a1, .LBB36_4 -; CHECK-NEXT: .LBB36_8: # %entry -; CHECK-NEXT: lui a1, 1048568 +; CHECK-NEXT: vmax.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -2690,20 +1954,13 @@ define <2 x i16> @utest_f64i16_mm(<2 x double> %x) { ; CHECK-LABEL: utest_f64i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.wu.d a0, fa0, rtz -; CHECK-NEXT: lui a1, 16 -; CHECK-NEXT: addiw a2, a1, -1 -; CHECK-NEXT: fcvt.wu.d a1, fa1, rtz -; CHECK-NEXT: bgeu a0, a2, .LBB37_3 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bgeu a1, a2, .LBB37_4 -; CHECK-NEXT: .LBB37_2: # %entry -; CHECK-NEXT: ret -; CHECK-NEXT: .LBB37_3: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bltu a1, a2, .LBB37_2 -; CHECK-NEXT: .LBB37_4: # %entry -; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vfncvt.rtz.xu.f.w v9, v8 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vminu.vx v8, v9, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptoui <2 x double> %x to <2 x i32> @@ -2715,30 +1972,14 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.w.d a1, fa1, rtz +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vfncvt.rtz.x.f.w v9, v8 ; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: addiw a2, a0, -1 -; CHECK-NEXT: fcvt.w.d a0, fa0, rtz -; CHECK-NEXT: bge a1, a2, .LBB38_5 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bge a0, a2, .LBB38_6 -; CHECK-NEXT: .LBB38_2: # %entry -; CHECK-NEXT: blez a0, .LBB38_7 -; CHECK-NEXT: .LBB38_3: # %entry -; CHECK-NEXT: blez a1, .LBB38_8 -; CHECK-NEXT: .LBB38_4: # %entry -; CHECK-NEXT: 
ret -; CHECK-NEXT: .LBB38_5: # %entry -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: blt a0, a2, .LBB38_2 -; CHECK-NEXT: .LBB38_6: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: bgtz a0, .LBB38_3 -; CHECK-NEXT: .LBB38_7: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: bgtz a1, .LBB38_4 -; CHECK-NEXT: .LBB38_8: # %entry -; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v9, a0 +; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e16, mf4, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret entry: %conv = fptosi <2 x double> %x to <2 x i32> @@ -2751,61 +1992,16 @@ define <4 x i16> @stest_f32i16_mm(<4 x float> %x) { ; CHECK-LABEL: stest_f32i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.w.s a1, fa3, rtz -; CHECK-NEXT: lui a2, 8 -; CHECK-NEXT: addiw a5, a2, -1 -; CHECK-NEXT: fcvt.w.s a2, fa2, rtz -; CHECK-NEXT: bge a1, a5, .LBB39_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NEXT: bge a2, a5, .LBB39_11 -; CHECK-NEXT: .LBB39_2: # %entry -; CHECK-NEXT: fcvt.w.s a4, fa0, rtz -; CHECK-NEXT: bge a3, a5, .LBB39_12 -; CHECK-NEXT: .LBB39_3: # %entry -; CHECK-NEXT: bge a4, a5, .LBB39_13 -; CHECK-NEXT: .LBB39_4: # %entry -; CHECK-NEXT: lui a5, 1048568 -; CHECK-NEXT: bge a5, a4, .LBB39_14 -; CHECK-NEXT: .LBB39_5: # %entry -; CHECK-NEXT: bge a5, a3, .LBB39_15 -; CHECK-NEXT: .LBB39_6: # %entry -; CHECK-NEXT: bge a5, a2, .LBB39_16 -; CHECK-NEXT: .LBB39_7: # %entry -; CHECK-NEXT: blt a5, a1, .LBB39_9 -; CHECK-NEXT: .LBB39_8: # %entry -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: .LBB39_9: # %entry -; CHECK-NEXT: sh a1, 6(a0) -; CHECK-NEXT: sh a2, 4(a0) -; CHECK-NEXT: sh a3, 2(a0) -; CHECK-NEXT: sh a4, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a0 +; CHECK-NEXT: lui a0, 1048568 +; CHECK-NEXT: vmax.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB39_10: # %entry -; CHECK-NEXT: mv a1, a5 -; CHECK-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NEXT: blt a2, a5, .LBB39_2 -; CHECK-NEXT: .LBB39_11: # %entry -; CHECK-NEXT: mv a2, a5 -; CHECK-NEXT: fcvt.w.s a4, fa0, rtz -; CHECK-NEXT: blt a3, a5, .LBB39_3 -; CHECK-NEXT: .LBB39_12: # %entry -; CHECK-NEXT: mv a3, a5 -; CHECK-NEXT: blt a4, a5, .LBB39_4 -; CHECK-NEXT: .LBB39_13: # %entry -; CHECK-NEXT: mv a4, a5 -; CHECK-NEXT: lui a5, 1048568 -; CHECK-NEXT: blt a5, a4, .LBB39_5 -; CHECK-NEXT: .LBB39_14: # %entry -; CHECK-NEXT: lui a4, 1048568 -; CHECK-NEXT: blt a5, a3, .LBB39_6 -; CHECK-NEXT: .LBB39_15: # %entry -; CHECK-NEXT: lui a3, 1048568 -; CHECK-NEXT: blt a5, a2, .LBB39_7 -; CHECK-NEXT: .LBB39_16: # %entry -; CHECK-NEXT: lui a2, 1048568 -; CHECK-NEXT: bge a5, a1, .LBB39_8 -; CHECK-NEXT: j .LBB39_9 entry: %conv = fptosi <4 x float> %x to <4 x i32> %spec.store.select = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %conv, <4 x i32> ) @@ -2817,39 +2013,14 @@ define <4 x i16> @utest_f32i16_mm(<4 x float> %x) { ; CHECK-LABEL: utest_f32i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.wu.s a1, fa0, rtz -; CHECK-NEXT: lui a2, 16 -; CHECK-NEXT: addiw a3, a2, -1 -; CHECK-NEXT: fcvt.wu.s a2, fa1, rtz -; CHECK-NEXT: bgeu a1, a3, .LBB40_6 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.wu.s a4, fa2, rtz -; CHECK-NEXT: bgeu a2, a3, .LBB40_7 -; CHECK-NEXT: .LBB40_2: # %entry -; CHECK-NEXT: fcvt.wu.s a5, fa3, rtz -; CHECK-NEXT: bgeu a4, a3, .LBB40_8 -; CHECK-NEXT: 
.LBB40_3: # %entry -; CHECK-NEXT: bltu a5, a3, .LBB40_5 -; CHECK-NEXT: .LBB40_4: # %entry -; CHECK-NEXT: mv a5, a3 -; CHECK-NEXT: .LBB40_5: # %entry -; CHECK-NEXT: sh a5, 6(a0) -; CHECK-NEXT: sh a4, 4(a0) -; CHECK-NEXT: sh a2, 2(a0) -; CHECK-NEXT: sh a1, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.xu.f.v v8, v8 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vminu.vx v8, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB40_6: # %entry -; CHECK-NEXT: mv a1, a3 -; CHECK-NEXT: fcvt.wu.s a4, fa2, rtz -; CHECK-NEXT: bltu a2, a3, .LBB40_2 -; CHECK-NEXT: .LBB40_7: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: fcvt.wu.s a5, fa3, rtz -; CHECK-NEXT: bltu a4, a3, .LBB40_3 -; CHECK-NEXT: .LBB40_8: # %entry -; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bgeu a5, a3, .LBB40_4 -; CHECK-NEXT: j .LBB40_5 entry: %conv = fptoui <4 x float> %x to <4 x i32> %spec.store.select = call <4 x i32> @llvm.umin.v4i32(<4 x i32> %conv, <4 x i32> ) @@ -2860,59 +2031,15 @@ define <4 x i16> @ustest_f32i16_mm(<4 x float> %x) { ; CHECK-LABEL: ustest_f32i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: fcvt.w.s a1, fa3, rtz -; CHECK-NEXT: lui a2, 16 -; CHECK-NEXT: addiw a5, a2, -1 -; CHECK-NEXT: fcvt.w.s a2, fa2, rtz -; CHECK-NEXT: bge a1, a5, .LBB41_10 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NEXT: bge a2, a5, .LBB41_11 -; CHECK-NEXT: .LBB41_2: # %entry -; CHECK-NEXT: fcvt.w.s a4, fa0, rtz -; CHECK-NEXT: bge a3, a5, .LBB41_12 -; CHECK-NEXT: .LBB41_3: # %entry -; CHECK-NEXT: bge a4, a5, .LBB41_13 -; CHECK-NEXT: .LBB41_4: # %entry -; CHECK-NEXT: blez a4, .LBB41_14 -; CHECK-NEXT: .LBB41_5: # %entry -; CHECK-NEXT: blez a3, .LBB41_15 -; CHECK-NEXT: .LBB41_6: # %entry -; CHECK-NEXT: blez a2, .LBB41_16 -; CHECK-NEXT: .LBB41_7: # %entry -; CHECK-NEXT: bgtz a1, .LBB41_9 -; CHECK-NEXT: .LBB41_8: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: .LBB41_9: # %entry -; CHECK-NEXT: sh a1, 6(a0) -; CHECK-NEXT: sh a2, 4(a0) -; CHECK-NEXT: sh a3, 2(a0) -; CHECK-NEXT: sh a4, 0(a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v8 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vmin.vx v8, v8, a0 +; CHECK-NEXT: vmax.vx v8, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v8 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB41_10: # %entry -; CHECK-NEXT: mv a1, a5 -; CHECK-NEXT: fcvt.w.s a3, fa1, rtz -; CHECK-NEXT: blt a2, a5, .LBB41_2 -; CHECK-NEXT: .LBB41_11: # %entry -; CHECK-NEXT: mv a2, a5 -; CHECK-NEXT: fcvt.w.s a4, fa0, rtz -; CHECK-NEXT: blt a3, a5, .LBB41_3 -; CHECK-NEXT: .LBB41_12: # %entry -; CHECK-NEXT: mv a3, a5 -; CHECK-NEXT: blt a4, a5, .LBB41_4 -; CHECK-NEXT: .LBB41_13: # %entry -; CHECK-NEXT: mv a4, a5 -; CHECK-NEXT: bgtz a4, .LBB41_5 -; CHECK-NEXT: .LBB41_14: # %entry -; CHECK-NEXT: li a4, 0 -; CHECK-NEXT: bgtz a3, .LBB41_6 -; CHECK-NEXT: .LBB41_15: # %entry -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: bgtz a2, .LBB41_7 -; CHECK-NEXT: .LBB41_16: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: blez a1, .LBB41_8 -; CHECK-NEXT: j .LBB41_9 entry: %conv = fptosi <4 x float> %x to <4 x i32> %spec.store.select = call <4 x i32> @llvm.smin.v4i32(<4 x i32> %conv, <4 x i32> ) @@ -2924,24 +2051,16 @@ define <8 x i16> @stest_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: stest_f16i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -128 -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; 
CHECK-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 104(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 96(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 88(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s4, 80(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s5, 72(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s6, 64(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s7, 56(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 48(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 40(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs4, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs5, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs6, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s5, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s6, 32(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -2950,170 +2069,100 @@ ; CHECK-NEXT: .cfi_offset s4, -48 ; CHECK-NEXT: .cfi_offset s5, -56 ; CHECK-NEXT: .cfi_offset s6, -64 -; CHECK-NEXT: .cfi_offset s7, -72 -; CHECK-NEXT: .cfi_offset fs0, -80 -; CHECK-NEXT: .cfi_offset fs1, -88 -; CHECK-NEXT: .cfi_offset fs2, -96 -; CHECK-NEXT: .cfi_offset fs3, -104 -; CHECK-NEXT: .cfi_offset fs4, -112 -; CHECK-NEXT: .cfi_offset fs5, -120 -; CHECK-NEXT: .cfi_offset fs6, -128 -; CHECK-NEXT: lhu s1, 56(a1) -; CHECK-NEXT: lhu s2, 0(a1) -; CHECK-NEXT: lhu s3, 8(a1) -; CHECK-NEXT: lhu s4, 16(a1) -; CHECK-NEXT: lhu s5, 24(a1) -; CHECK-NEXT: lhu s6, 32(a1) -; CHECK-NEXT: lhu s7, 40(a1) -; CHECK-NEXT: lhu a1, 48(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: lhu s0, 56(a0) +; CHECK-NEXT: lhu s1, 48(a0) +; CHECK-NEXT: lhu s2, 40(a0) +; CHECK-NEXT: lhu s3, 32(a0) +; CHECK-NEXT: lhu s4, 24(a0) +; CHECK-NEXT: lhu s5, 16(a0) +; CHECK-NEXT: lhu s6, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs6, fa0 -; CHECK-NEXT: mv a0, s7 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs5, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 28(sp) ; CHECK-NEXT: mv a0, s6 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs4, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 0(sp) ; CHECK-NEXT: mv a0, s5 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs3, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 24(sp) ; CHECK-NEXT: mv a0, s4 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 20(sp) ; CHECK-NEXT: mv a0, s3 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 16(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.l.s s2, fs6, rtz +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 12(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NEXT: lui a1, 8 -; CHECK-NEXT: addiw a7, a1, -1 
-; CHECK-NEXT: bge a0, a7, .LBB42_18 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NEXT: bge s2, a7, .LBB42_19 -; CHECK-NEXT: .LBB42_2: # %entry -; CHECK-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NEXT: bge a1, a7, .LBB42_20 -; CHECK-NEXT: .LBB42_3: # %entry -; CHECK-NEXT: fcvt.l.s a3, fs3, rtz -; CHECK-NEXT: bge a2, a7, .LBB42_21 -; CHECK-NEXT: .LBB42_4: # %entry -; CHECK-NEXT: fcvt.l.s a4, fs2, rtz -; CHECK-NEXT: bge a3, a7, .LBB42_22 -; CHECK-NEXT: .LBB42_5: # %entry -; CHECK-NEXT: fcvt.l.s a5, fs1, rtz -; CHECK-NEXT: bge a4, a7, .LBB42_23 -; CHECK-NEXT: .LBB42_6: # %entry -; CHECK-NEXT: fcvt.l.s a6, fs0, rtz -; CHECK-NEXT: bge a5, a7, .LBB42_24 -; CHECK-NEXT: .LBB42_7: # %entry -; CHECK-NEXT: bge a6, a7, .LBB42_25 -; CHECK-NEXT: .LBB42_8: # %entry -; CHECK-NEXT: lui a7, 1048568 -; CHECK-NEXT: bge a7, a6, .LBB42_26 -; CHECK-NEXT: .LBB42_9: # %entry -; CHECK-NEXT: bge a7, a5, .LBB42_27 -; CHECK-NEXT: .LBB42_10: # %entry -; CHECK-NEXT: bge a7, a4, .LBB42_28 -; CHECK-NEXT: .LBB42_11: # %entry -; CHECK-NEXT: bge a7, a3, .LBB42_29 -; CHECK-NEXT: .LBB42_12: # %entry -; CHECK-NEXT: bge a7, a2, .LBB42_30 -; CHECK-NEXT: .LBB42_13: # %entry -; CHECK-NEXT: bge a7, a1, .LBB42_31 -; CHECK-NEXT: .LBB42_14: # %entry -; CHECK-NEXT: bge a7, s2, .LBB42_32 -; CHECK-NEXT: .LBB42_15: # %entry -; CHECK-NEXT: blt a7, a0, .LBB42_17 -; CHECK-NEXT: .LBB42_16: # %entry +; CHECK-NEXT: sw a0, 8(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 4(sp) +; CHECK-NEXT: addi a0, sp, 28 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: addi a0, sp, 20 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 3 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: addi a0, sp, 12 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 5 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 6 +; CHECK-NEXT: addi a0, sp, 4 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: lui a0, 8 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v8, a0 ; CHECK-NEXT: lui a0, 1048568 -; CHECK-NEXT: .LBB42_17: # %entry -; CHECK-NEXT: sh a0, 14(s0) -; CHECK-NEXT: sh s2, 12(s0) -; CHECK-NEXT: sh a1, 10(s0) -; CHECK-NEXT: sh a2, 8(s0) -; CHECK-NEXT: sh a3, 6(s0) -; CHECK-NEXT: sh a4, 4(s0) -; CHECK-NEXT: sh a5, 2(s0) -; CHECK-NEXT: sh a6, 0(s0) -; CHECK-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 112(sp) # 8-byte Folded Reload 
-; CHECK-NEXT: ld s1, 104(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 96(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 88(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s4, 80(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s5, 72(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s6, 64(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s7, 56(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 48(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 40(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs4, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs5, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs6, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 128 +; CHECK-NEXT: vmax.vx v10, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 +; CHECK-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s5, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s6, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 96 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB42_18: # %entry -; CHECK-NEXT: mv a0, a7 -; CHECK-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NEXT: blt s2, a7, .LBB42_2 -; CHECK-NEXT: .LBB42_19: # %entry -; CHECK-NEXT: mv s2, a7 -; CHECK-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NEXT: blt a1, a7, .LBB42_3 -; CHECK-NEXT: .LBB42_20: # %entry -; CHECK-NEXT: mv a1, a7 -; CHECK-NEXT: fcvt.l.s a3, fs3, rtz -; CHECK-NEXT: blt a2, a7, .LBB42_4 -; CHECK-NEXT: .LBB42_21: # %entry -; CHECK-NEXT: mv a2, a7 -; CHECK-NEXT: fcvt.l.s a4, fs2, rtz -; CHECK-NEXT: blt a3, a7, .LBB42_5 -; CHECK-NEXT: .LBB42_22: # %entry -; CHECK-NEXT: mv a3, a7 -; CHECK-NEXT: fcvt.l.s a5, fs1, rtz -; CHECK-NEXT: blt a4, a7, .LBB42_6 -; CHECK-NEXT: .LBB42_23: # %entry -; CHECK-NEXT: mv a4, a7 -; CHECK-NEXT: fcvt.l.s a6, fs0, rtz -; CHECK-NEXT: blt a5, a7, .LBB42_7 -; CHECK-NEXT: .LBB42_24: # %entry -; CHECK-NEXT: mv a5, a7 -; CHECK-NEXT: blt a6, a7, .LBB42_8 -; CHECK-NEXT: .LBB42_25: # %entry -; CHECK-NEXT: mv a6, a7 -; CHECK-NEXT: lui a7, 1048568 -; CHECK-NEXT: blt a7, a6, .LBB42_9 -; CHECK-NEXT: .LBB42_26: # %entry -; CHECK-NEXT: lui a6, 1048568 -; CHECK-NEXT: blt a7, a5, .LBB42_10 -; CHECK-NEXT: .LBB42_27: # %entry -; CHECK-NEXT: lui a5, 1048568 -; CHECK-NEXT: blt a7, a4, .LBB42_11 -; CHECK-NEXT: .LBB42_28: # %entry -; CHECK-NEXT: lui a4, 1048568 -; CHECK-NEXT: blt a7, a3, .LBB42_12 -; CHECK-NEXT: .LBB42_29: # %entry -; CHECK-NEXT: lui a3, 1048568 -; CHECK-NEXT: blt a7, a2, .LBB42_13 -; CHECK-NEXT: .LBB42_30: # %entry -; CHECK-NEXT: lui a2, 1048568 -; CHECK-NEXT: blt a7, a1, .LBB42_14 -; CHECK-NEXT: .LBB42_31: # %entry -; CHECK-NEXT: lui a1, 1048568 -; CHECK-NEXT: blt a7, s2, .LBB42_15 -; CHECK-NEXT: .LBB42_32: # %entry -; CHECK-NEXT: lui s2, 1048568 -; CHECK-NEXT: bge a7, a0, .LBB42_16 -; CHECK-NEXT: j .LBB42_17 entry: %conv = fptosi <8 x half> %x to <8 x i32> %spec.store.select = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %conv, <8 x i32> ) @@ -3125,24 +2174,16 @@ define <8 x i16> @utesth_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: utesth_f16i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -128 -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 112(sp) # 
8-byte Folded Spill -; CHECK-NEXT: sd s1, 104(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 96(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 88(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s4, 80(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s5, 72(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s6, 64(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s7, 56(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 48(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 40(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs4, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs5, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs6, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s5, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s6, 32(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -3151,126 +2192,98 @@ ; CHECK-NEXT: .cfi_offset s4, -48 ; CHECK-NEXT: .cfi_offset s5, -56 ; CHECK-NEXT: .cfi_offset s6, -64 -; CHECK-NEXT: .cfi_offset s7, -72 -; CHECK-NEXT: .cfi_offset fs0, -80 -; CHECK-NEXT: .cfi_offset fs1, -88 -; CHECK-NEXT: .cfi_offset fs2, -96 -; CHECK-NEXT: .cfi_offset fs3, -104 -; CHECK-NEXT: .cfi_offset fs4, -112 -; CHECK-NEXT: .cfi_offset fs5, -120 -; CHECK-NEXT: .cfi_offset fs6, -128 -; CHECK-NEXT: lhu s1, 0(a1) -; CHECK-NEXT: lhu s2, 56(a1) -; CHECK-NEXT: lhu s3, 48(a1) -; CHECK-NEXT: lhu s4, 40(a1) -; CHECK-NEXT: lhu s5, 32(a1) -; CHECK-NEXT: lhu s6, 24(a1) -; CHECK-NEXT: lhu s7, 16(a1) -; CHECK-NEXT: lhu a1, 8(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: lhu s0, 56(a0) +; CHECK-NEXT: lhu s1, 48(a0) +; CHECK-NEXT: lhu s2, 40(a0) +; CHECK-NEXT: lhu s3, 32(a0) +; CHECK-NEXT: lhu s4, 24(a0) +; CHECK-NEXT: lhu s5, 16(a0) +; CHECK-NEXT: lhu s6, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs5, fa0 -; CHECK-NEXT: mv a0, s7 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs6, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 28(sp) ; CHECK-NEXT: mv a0, s6 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs4, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 0(sp) ; CHECK-NEXT: mv a0, s5 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs3, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 24(sp) ; CHECK-NEXT: mv a0, s4 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 20(sp) ; CHECK-NEXT: mv a0, s3 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 16(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.lu.s s3, fs6, rtz -; CHECK-NEXT: fcvt.lu.s a0, fs5, rtz -; CHECK-NEXT: sext.w s2, a0 +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 12(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz -; CHECK-NEXT: sext.w a0, a0 -; CHECK-NEXT: lui a1, 16 -; 
CHECK-NEXT: addiw a1, a1, -1 -; CHECK-NEXT: bltu a0, a1, .LBB43_2 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: .LBB43_2: # %entry -; CHECK-NEXT: fcvt.lu.s a3, fs4, rtz -; CHECK-NEXT: sext.w a2, s3 -; CHECK-NEXT: bltu s2, a1, .LBB43_4 -; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: mv s2, a1 -; CHECK-NEXT: .LBB43_4: # %entry -; CHECK-NEXT: fcvt.lu.s a4, fs3, rtz -; CHECK-NEXT: sext.w a3, a3 -; CHECK-NEXT: bltu a2, a1, .LBB43_6 -; CHECK-NEXT: # %bb.5: # %entry -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: .LBB43_6: # %entry -; CHECK-NEXT: fcvt.lu.s a5, fs2, rtz -; CHECK-NEXT: sext.w a4, a4 -; CHECK-NEXT: bltu a3, a1, .LBB43_8 -; CHECK-NEXT: # %bb.7: # %entry -; CHECK-NEXT: mv a3, a1 -; CHECK-NEXT: .LBB43_8: # %entry -; CHECK-NEXT: fcvt.lu.s a6, fs1, rtz -; CHECK-NEXT: sext.w a5, a5 -; CHECK-NEXT: bltu a4, a1, .LBB43_10 -; CHECK-NEXT: # %bb.9: # %entry -; CHECK-NEXT: mv a4, a1 -; CHECK-NEXT: .LBB43_10: # %entry -; CHECK-NEXT: fcvt.lu.s a7, fs0, rtz -; CHECK-NEXT: sext.w a6, a6 -; CHECK-NEXT: bgeu a5, a1, .LBB43_15 -; CHECK-NEXT: # %bb.11: # %entry -; CHECK-NEXT: sext.w a7, a7 -; CHECK-NEXT: bgeu a6, a1, .LBB43_16 -; CHECK-NEXT: .LBB43_12: # %entry -; CHECK-NEXT: bltu a7, a1, .LBB43_14 -; CHECK-NEXT: .LBB43_13: # %entry -; CHECK-NEXT: mv a7, a1 -; CHECK-NEXT: .LBB43_14: # %entry -; CHECK-NEXT: sh a7, 14(s0) -; CHECK-NEXT: sh a6, 12(s0) -; CHECK-NEXT: sh a5, 10(s0) -; CHECK-NEXT: sh a4, 8(s0) -; CHECK-NEXT: sh a3, 6(s0) -; CHECK-NEXT: sh a2, 4(s0) -; CHECK-NEXT: sh s2, 2(s0) -; CHECK-NEXT: sh a0, 0(s0) -; CHECK-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 104(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 96(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 88(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s4, 80(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s5, 72(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s6, 64(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s7, 56(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 48(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 40(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs4, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs5, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs6, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 128 +; CHECK-NEXT: sw a0, 8(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.lu.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 4(sp) +; CHECK-NEXT: addi a0, sp, 28 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: addi a0, sp, 20 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 3 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: addi a0, sp, 12 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, 
(a0) +; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 5 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 6 +; CHECK-NEXT: addi a0, sp, 4 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vminu.vx v10, v8, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 +; CHECK-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s5, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s6, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 96 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB43_15: # %entry -; CHECK-NEXT: mv a5, a1 -; CHECK-NEXT: sext.w a7, a7 -; CHECK-NEXT: bltu a6, a1, .LBB43_12 -; CHECK-NEXT: .LBB43_16: # %entry -; CHECK-NEXT: mv a6, a1 -; CHECK-NEXT: bgeu a7, a1, .LBB43_13 -; CHECK-NEXT: j .LBB43_14 entry: %conv = fptoui <8 x half> %x to <8 x i32> %spec.store.select = call <8 x i32> @llvm.umin.v8i32(<8 x i32> %conv, <8 x i32> ) @@ -3281,24 +2294,16 @@ define <8 x i16> @ustest_f16i16_mm(<8 x half> %x) { ; CHECK-LABEL: ustest_f16i16_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -128 -; CHECK-NEXT: .cfi_def_cfa_offset 128 -; CHECK-NEXT: sd ra, 120(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 112(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 104(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 96(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s3, 88(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s4, 80(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s5, 72(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s6, 64(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s7, 56(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 48(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs1, 40(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs2, 32(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs3, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs4, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs5, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs6, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -96 +; CHECK-NEXT: .cfi_def_cfa_offset 96 +; CHECK-NEXT: sd ra, 88(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 80(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s3, 56(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s4, 48(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s5, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s6, 32(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -3307,168 +2312,99 @@ ; CHECK-NEXT: .cfi_offset s4, -48 ; CHECK-NEXT: .cfi_offset s5, -56 ; CHECK-NEXT: .cfi_offset s6, -64 -; CHECK-NEXT: .cfi_offset s7, -72 -; CHECK-NEXT: .cfi_offset fs0, -80 -; CHECK-NEXT: .cfi_offset fs1, -88 -; CHECK-NEXT: .cfi_offset fs2, -96 -; CHECK-NEXT: .cfi_offset fs3, -104 -; CHECK-NEXT: .cfi_offset fs4, -112 -; CHECK-NEXT: 
.cfi_offset fs5, -120 -; CHECK-NEXT: .cfi_offset fs6, -128 -; CHECK-NEXT: lhu s1, 56(a1) -; CHECK-NEXT: lhu s2, 0(a1) -; CHECK-NEXT: lhu s3, 8(a1) -; CHECK-NEXT: lhu s4, 16(a1) -; CHECK-NEXT: lhu s5, 24(a1) -; CHECK-NEXT: lhu s6, 32(a1) -; CHECK-NEXT: lhu s7, 40(a1) -; CHECK-NEXT: lhu a1, 48(a1) -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs6, fa0 -; CHECK-NEXT: mv a0, s7 +; CHECK-NEXT: lhu s0, 56(a0) +; CHECK-NEXT: lhu s1, 48(a0) +; CHECK-NEXT: lhu s2, 40(a0) +; CHECK-NEXT: lhu s3, 32(a0) +; CHECK-NEXT: lhu s4, 24(a0) +; CHECK-NEXT: lhu s5, 16(a0) +; CHECK-NEXT: lhu s6, 0(a0) +; CHECK-NEXT: lhu a0, 8(a0) ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs5, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 28(sp) ; CHECK-NEXT: mv a0, s6 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs4, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 0(sp) ; CHECK-NEXT: mv a0, s5 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs3, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 24(sp) ; CHECK-NEXT: mv a0, s4 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs2, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 20(sp) ; CHECK-NEXT: mv a0, s3 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs1, fa0 +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 16(sp) ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fcvt.l.s s2, fs6, rtz +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 12(sp) ; CHECK-NEXT: mv a0, s1 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: fcvt.l.s a0, fa0, rtz -; CHECK-NEXT: lui a1, 16 -; CHECK-NEXT: addiw a7, a1, -1 -; CHECK-NEXT: bge a0, a7, .LBB44_18 -; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NEXT: bge s2, a7, .LBB44_19 -; CHECK-NEXT: .LBB44_2: # %entry -; CHECK-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NEXT: bge a1, a7, .LBB44_20 -; CHECK-NEXT: .LBB44_3: # %entry -; CHECK-NEXT: fcvt.l.s a3, fs3, rtz -; CHECK-NEXT: bge a2, a7, .LBB44_21 -; CHECK-NEXT: .LBB44_4: # %entry -; CHECK-NEXT: fcvt.l.s a4, fs2, rtz -; CHECK-NEXT: bge a3, a7, .LBB44_22 -; CHECK-NEXT: .LBB44_5: # %entry -; CHECK-NEXT: fcvt.l.s a5, fs1, rtz -; CHECK-NEXT: bge a4, a7, .LBB44_23 -; CHECK-NEXT: .LBB44_6: # %entry -; CHECK-NEXT: fcvt.l.s a6, fs0, rtz -; CHECK-NEXT: bge a5, a7, .LBB44_24 -; CHECK-NEXT: .LBB44_7: # %entry -; CHECK-NEXT: bge a6, a7, .LBB44_25 -; CHECK-NEXT: .LBB44_8: # %entry -; CHECK-NEXT: blez a6, .LBB44_26 -; CHECK-NEXT: .LBB44_9: # %entry -; CHECK-NEXT: blez a5, .LBB44_27 -; CHECK-NEXT: .LBB44_10: # %entry -; CHECK-NEXT: blez a4, .LBB44_28 -; CHECK-NEXT: .LBB44_11: # %entry -; CHECK-NEXT: blez a3, .LBB44_29 -; CHECK-NEXT: .LBB44_12: # %entry -; CHECK-NEXT: blez a2, .LBB44_30 -; CHECK-NEXT: .LBB44_13: # %entry -; CHECK-NEXT: blez a1, .LBB44_31 -; CHECK-NEXT: .LBB44_14: # %entry -; CHECK-NEXT: blez s2, .LBB44_32 -; CHECK-NEXT: .LBB44_15: # %entry -; CHECK-NEXT: bgtz a0, .LBB44_17 -; CHECK-NEXT: .LBB44_16: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: .LBB44_17: # %entry -; CHECK-NEXT: sh a0, 14(s0) -; CHECK-NEXT: sh s2, 12(s0) -; CHECK-NEXT: sh a1, 10(s0) -; CHECK-NEXT: sh a2, 8(s0) -; CHECK-NEXT: sh a3, 6(s0) -; CHECK-NEXT: sh a4, 4(s0) -; CHECK-NEXT: sh a5, 2(s0) -; CHECK-NEXT: sh a6, 0(s0) -; CHECK-NEXT: ld ra, 120(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 112(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 104(sp) # 
8-byte Folded Reload -; CHECK-NEXT: ld s2, 96(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s3, 88(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s4, 80(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s5, 72(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s6, 64(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s7, 56(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 48(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs1, 40(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs2, 32(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs3, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs4, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs5, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs6, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 128 +; CHECK-NEXT: sw a0, 8(sp) +; CHECK-NEXT: mv a0, s0 +; CHECK-NEXT: call __extendhfsf2@plt +; CHECK-NEXT: fcvt.l.s a0, fa0, rtz +; CHECK-NEXT: sw a0, 4(sp) +; CHECK-NEXT: addi a0, sp, 28 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle32.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 3, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 2 +; CHECK-NEXT: addi a0, sp, 20 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 4, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 3 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 4 +; CHECK-NEXT: addi a0, sp, 12 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 5 +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 7, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 6 +; CHECK-NEXT: addi a0, sp, 4 +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vle32.v v10, (a0) +; CHECK-NEXT: vsetivli zero, 8, e32, m2, tu, mu +; CHECK-NEXT: vslideup.vi v8, v10, 7 +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: addiw a0, a0, -1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vmin.vx v8, v8, a0 +; CHECK-NEXT: vmax.vx v10, v8, zero +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, mu +; CHECK-NEXT: vncvt.x.x.w v8, v10 +; CHECK-NEXT: ld ra, 88(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 80(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s3, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s4, 48(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s5, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s6, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 96 ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB44_18: # %entry -; CHECK-NEXT: mv a0, a7 -; CHECK-NEXT: fcvt.l.s a1, fs5, rtz -; CHECK-NEXT: blt s2, a7, .LBB44_2 -; CHECK-NEXT: .LBB44_19: # %entry -; CHECK-NEXT: mv s2, a7 -; CHECK-NEXT: fcvt.l.s a2, fs4, rtz -; CHECK-NEXT: blt a1, a7, .LBB44_3 -; CHECK-NEXT: .LBB44_20: # %entry -; CHECK-NEXT: mv a1, a7 -; CHECK-NEXT: fcvt.l.s a3, fs3, rtz -; CHECK-NEXT: blt a2, a7, .LBB44_4 -; CHECK-NEXT: .LBB44_21: # %entry -; CHECK-NEXT: mv a2, a7 -; 
CHECK-NEXT: fcvt.l.s a4, fs2, rtz -; CHECK-NEXT: blt a3, a7, .LBB44_5 -; CHECK-NEXT: .LBB44_22: # %entry -; CHECK-NEXT: mv a3, a7 -; CHECK-NEXT: fcvt.l.s a5, fs1, rtz -; CHECK-NEXT: blt a4, a7, .LBB44_6 -; CHECK-NEXT: .LBB44_23: # %entry -; CHECK-NEXT: mv a4, a7 -; CHECK-NEXT: fcvt.l.s a6, fs0, rtz -; CHECK-NEXT: blt a5, a7, .LBB44_7 -; CHECK-NEXT: .LBB44_24: # %entry -; CHECK-NEXT: mv a5, a7 -; CHECK-NEXT: blt a6, a7, .LBB44_8 -; CHECK-NEXT: .LBB44_25: # %entry -; CHECK-NEXT: mv a6, a7 -; CHECK-NEXT: bgtz a6, .LBB44_9 -; CHECK-NEXT: .LBB44_26: # %entry -; CHECK-NEXT: li a6, 0 -; CHECK-NEXT: bgtz a5, .LBB44_10 -; CHECK-NEXT: .LBB44_27: # %entry -; CHECK-NEXT: li a5, 0 -; CHECK-NEXT: bgtz a4, .LBB44_11 -; CHECK-NEXT: .LBB44_28: # %entry -; CHECK-NEXT: li a4, 0 -; CHECK-NEXT: bgtz a3, .LBB44_12 -; CHECK-NEXT: .LBB44_29: # %entry -; CHECK-NEXT: li a3, 0 -; CHECK-NEXT: bgtz a2, .LBB44_13 -; CHECK-NEXT: .LBB44_30: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: bgtz a1, .LBB44_14 -; CHECK-NEXT: .LBB44_31: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: bgtz s2, .LBB44_15 -; CHECK-NEXT: .LBB44_32: # %entry -; CHECK-NEXT: li s2, 0 -; CHECK-NEXT: blez a0, .LBB44_16 -; CHECK-NEXT: j .LBB44_17 entry: %conv = fptosi <8 x half> %x to <8 x i32> %spec.store.select = call <8 x i32> @llvm.smin.v8i32(<8 x i32> %conv, <8 x i32> ) @@ -3482,38 +2418,45 @@ define <2 x i64> @stest_f64i64_mm(<2 x double> %x) { ; CHECK-LABEL: stest_f64i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.d fs0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixdfti@plt ; CHECK-NEXT: mv s0, a0 ; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.d fa0, fs0 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixdfti@plt -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: srli a3, a0, 1 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: bgez a1, .LBB45_17 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: srli a3, a2, 1 +; CHECK-NEXT: mv a4, s0 +; CHECK-NEXT: bgez s1, .LBB45_17 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bgeu a2, a3, .LBB45_18 +; CHECK-NEXT: bgeu s0, a3, .LBB45_18 ; CHECK-NEXT: .LBB45_2: # %entry -; CHECK-NEXT: bnez a1, .LBB45_19 +; CHECK-NEXT: bnez s1, .LBB45_19 ; CHECK-NEXT: .LBB45_3: # %entry -; CHECK-NEXT: mv a4, s0 -; CHECK-NEXT: bgez s1, .LBB45_20 +; CHECK-NEXT: mv a4, a0 +; CHECK-NEXT: bgez a1, .LBB45_20 ; CHECK-NEXT: .LBB45_4: # %entry -; CHECK-NEXT: bgeu s0, a3, .LBB45_21 +; CHECK-NEXT: bgeu a0, a3, .LBB45_21 ; CHECK-NEXT: .LBB45_5: # %entry -; CHECK-NEXT: bnez s1, .LBB45_22 +; CHECK-NEXT: 
bnez a1, .LBB45_22 ; CHECK-NEXT: .LBB45_6: # %entry ; CHECK-NEXT: bgez a1, .LBB45_23 ; CHECK-NEXT: .LBB45_7: # %entry @@ -3521,49 +2464,58 @@ ; CHECK-NEXT: .LBB45_8: # %entry ; CHECK-NEXT: li s1, 0 ; CHECK-NEXT: .LBB45_9: # %entry -; CHECK-NEXT: slli a3, a0, 63 +; CHECK-NEXT: slli a3, a2, 63 ; CHECK-NEXT: mv a4, s0 ; CHECK-NEXT: bltz s1, .LBB45_24 ; CHECK-NEXT: # %bb.10: # %entry ; CHECK-NEXT: bgeu a3, s0, .LBB45_25 ; CHECK-NEXT: .LBB45_11: # %entry -; CHECK-NEXT: bne s1, a0, .LBB45_26 +; CHECK-NEXT: bne s1, a2, .LBB45_26 ; CHECK-NEXT: .LBB45_12: # %entry -; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: bltz a1, .LBB45_27 ; CHECK-NEXT: .LBB45_13: # %entry -; CHECK-NEXT: bgeu a3, a2, .LBB45_28 +; CHECK-NEXT: bgeu a3, a0, .LBB45_28 ; CHECK-NEXT: .LBB45_14: # %entry -; CHECK-NEXT: beq a1, a0, .LBB45_16 +; CHECK-NEXT: beq a1, a2, .LBB45_16 ; CHECK-NEXT: .LBB45_15: # %entry -; CHECK-NEXT: mv a2, a4 +; CHECK-NEXT: mv a0, a4 ; CHECK-NEXT: .LBB45_16: # %entry -; CHECK-NEXT: mv a0, s0 -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: sd s0, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB45_17: # %entry ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bltu a2, a3, .LBB45_2 +; CHECK-NEXT: bltu s0, a3, .LBB45_2 ; CHECK-NEXT: .LBB45_18: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: beqz a1, .LBB45_3 +; CHECK-NEXT: mv s0, a3 +; CHECK-NEXT: beqz s1, .LBB45_3 ; CHECK-NEXT: .LBB45_19: # %entry -; CHECK-NEXT: mv a2, a4 -; CHECK-NEXT: mv a4, s0 -; CHECK-NEXT: bltz s1, .LBB45_4 +; CHECK-NEXT: mv s0, a4 +; CHECK-NEXT: mv a4, a0 +; CHECK-NEXT: bltz a1, .LBB45_4 ; CHECK-NEXT: .LBB45_20: # %entry ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bltu s0, a3, .LBB45_5 +; CHECK-NEXT: bltu a0, a3, .LBB45_5 ; CHECK-NEXT: .LBB45_21: # %entry -; CHECK-NEXT: mv s0, a3 -; CHECK-NEXT: beqz s1, .LBB45_6 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: beqz a1, .LBB45_6 ; CHECK-NEXT: .LBB45_22: # %entry -; CHECK-NEXT: mv s0, a4 +; CHECK-NEXT: mv a0, a4 ; CHECK-NEXT: bltz a1, .LBB45_7 ; CHECK-NEXT: .LBB45_23: # %entry ; CHECK-NEXT: li a1, 0 @@ -3574,17 +2526,17 @@ ; CHECK-NEXT: bltu a3, s0, .LBB45_11 ; CHECK-NEXT: .LBB45_25: # %entry ; CHECK-NEXT: mv s0, a3 -; CHECK-NEXT: beq s1, a0, .LBB45_12 +; CHECK-NEXT: beq s1, a2, .LBB45_12 ; CHECK-NEXT: .LBB45_26: # %entry ; CHECK-NEXT: mv s0, a4 -; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: bgez a1, .LBB45_13 ; CHECK-NEXT: .LBB45_27: # %entry ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bltu a3, a2, .LBB45_14 +; CHECK-NEXT: bltu a3, a0, .LBB45_14 ; CHECK-NEXT: .LBB45_28: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: bne a1, a0, .LBB45_15 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: bne a1, a2, .LBB45_15 ; CHECK-NEXT: j .LBB45_16 entry: %conv = fptosi <2 x double> %x to <2 x i128> @@ -3597,52 +2549,68 @@ 
define <2 x i64> @utest_f64i64_mm(<2 x double> %x) { ; CHECK-LABEL: utest_f64i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.d fs0, fa0 -; CHECK-NEXT: fmv.d fa0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixunsdfti@plt ; CHECK-NEXT: mv s0, a0 ; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.d fa0, fs0 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixunsdfti@plt -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: mv a3, a1 -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: beqz a3, .LBB46_2 +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: beqz s1, .LBB46_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: mv s0, a2 ; CHECK-NEXT: .LBB46_2: # %entry ; CHECK-NEXT: li a4, 1 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: bne a3, a4, .LBB46_7 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: bne s1, a4, .LBB46_7 ; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: bnez s1, .LBB46_8 +; CHECK-NEXT: bnez a1, .LBB46_8 ; CHECK-NEXT: .LBB46_4: # %entry -; CHECK-NEXT: beq s1, a4, .LBB46_6 +; CHECK-NEXT: beq a1, a4, .LBB46_6 ; CHECK-NEXT: .LBB46_5: # %entry -; CHECK-NEXT: mv a1, s0 +; CHECK-NEXT: mv a2, a0 ; CHECK-NEXT: .LBB46_6: # %entry -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a2, 24(sp) +; CHECK-NEXT: sd a3, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB46_7: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: beqz s1, .LBB46_4 +; CHECK-NEXT: mv a3, s0 +; CHECK-NEXT: beqz a1, .LBB46_4 ; CHECK-NEXT: .LBB46_8: # %entry -; CHECK-NEXT: mv s0, a1 -; CHECK-NEXT: bne s1, a4, .LBB46_5 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: bne a1, a4, .LBB46_5 ; CHECK-NEXT: j .LBB46_6 entry: %conv = fptoui <2 x double> %x to <2 x i128> @@ -3654,90 +2622,106 @@ define <2 x i64> @ustest_f64i64_mm(<2 x double> %x) { ; CHECK-LABEL: ustest_f64i64_mm: ; CHECK: # %bb.0: # %entry -; 
CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.d fs0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixdfti@plt -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.d fa0, fs0 +; CHECK-NEXT: mv s1, a0 +; CHECK-NEXT: mv s0, a1 +; CHECK-NEXT: vsetivli zero, 0, e64, m1, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixdfti@plt -; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: li a5, 1 -; CHECK-NEXT: mv a3, a1 +; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: bgtz a1, .LBB47_12 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: mv a4, s1 -; CHECK-NEXT: bgtz s1, .LBB47_13 +; CHECK-NEXT: bgtz s0, .LBB47_13 ; CHECK-NEXT: .LBB47_2: # %entry -; CHECK-NEXT: bgtz a2, .LBB47_14 +; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: bne s0, a5, .LBB47_14 ; CHECK-NEXT: .LBB47_3: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: bne a2, a5, .LBB47_15 +; CHECK-NEXT: bgtz a1, .LBB47_15 ; CHECK-NEXT: .LBB47_4: # %entry -; CHECK-NEXT: bgtz s1, .LBB47_16 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: bne a1, a5, .LBB47_16 ; CHECK-NEXT: .LBB47_5: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: bne s1, a5, .LBB47_17 +; CHECK-NEXT: bgtz s0, .LBB47_17 ; CHECK-NEXT: .LBB47_6: # %entry -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: blez a4, .LBB47_18 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: blez s0, .LBB47_18 ; CHECK-NEXT: .LBB47_7: # %entry -; CHECK-NEXT: bnez a4, .LBB47_19 +; CHECK-NEXT: bnez s0, .LBB47_19 ; CHECK-NEXT: .LBB47_8: # %entry -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: blez a3, .LBB47_20 +; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: blez a2, .LBB47_20 ; CHECK-NEXT: .LBB47_9: # %entry -; CHECK-NEXT: beqz a3, .LBB47_11 +; CHECK-NEXT: beqz a2, .LBB47_11 ; CHECK-NEXT: .LBB47_10: # %entry -; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: .LBB47_11: # %entry -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a4, 24(sp) +; CHECK-NEXT: sd a3, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 
; CHECK-NEXT: ret ; CHECK-NEXT: .LBB47_12: # %entry -; CHECK-NEXT: li a3, 1 -; CHECK-NEXT: mv a4, s1 -; CHECK-NEXT: blez s1, .LBB47_2 +; CHECK-NEXT: li a2, 1 +; CHECK-NEXT: blez s0, .LBB47_2 ; CHECK-NEXT: .LBB47_13: # %entry -; CHECK-NEXT: li a4, 1 -; CHECK-NEXT: blez a2, .LBB47_3 +; CHECK-NEXT: li s1, 0 +; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: beq s0, a5, .LBB47_3 ; CHECK-NEXT: .LBB47_14: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: beq a2, a5, .LBB47_4 +; CHECK-NEXT: mv a3, s1 +; CHECK-NEXT: blez a1, .LBB47_4 ; CHECK-NEXT: .LBB47_15: # %entry -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: blez s1, .LBB47_5 -; CHECK-NEXT: .LBB47_16: # %entry -; CHECK-NEXT: li s0, 0 ; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: beq s1, a5, .LBB47_6 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: beq a1, a5, .LBB47_5 +; CHECK-NEXT: .LBB47_16: # %entry +; CHECK-NEXT: mv a4, a0 +; CHECK-NEXT: blez s0, .LBB47_6 ; CHECK-NEXT: .LBB47_17: # %entry -; CHECK-NEXT: mv a0, s0 -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: bgtz a4, .LBB47_7 +; CHECK-NEXT: li s0, 1 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: bgtz s0, .LBB47_7 ; CHECK-NEXT: .LBB47_18: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: beqz a4, .LBB47_8 +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: beqz s0, .LBB47_8 ; CHECK-NEXT: .LBB47_19: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: bgtz a3, .LBB47_9 +; CHECK-NEXT: mv a3, a0 +; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: bgtz a2, .LBB47_9 ; CHECK-NEXT: .LBB47_20: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: bnez a3, .LBB47_10 +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: bnez a2, .LBB47_10 ; CHECK-NEXT: j .LBB47_11 entry: %conv = fptosi <2 x double> %x to <2 x i128> @@ -3750,38 +2734,45 @@ define <2 x i64> @stest_f32i64_mm(<2 x float> %x) { ; CHECK-LABEL: stest_f32i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.s fs0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixsfti@plt ; CHECK-NEXT: mv s0, a0 ; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.s fa0, fs0 +; CHECK-NEXT: vsetivli zero, 0, e32, mf2, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixsfti@plt -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: srli a3, a0, 1 -; CHECK-NEXT: mv a4, a2 -; CHECK-NEXT: bgez a1, .LBB48_17 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: srli a3, a2, 1 +; CHECK-NEXT: mv a4, s0 +; CHECK-NEXT: bgez s1, .LBB48_17 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bgeu a2, a3, .LBB48_18 +; CHECK-NEXT: bgeu s0, a3, .LBB48_18 ; CHECK-NEXT: .LBB48_2: # %entry -; CHECK-NEXT: bnez a1, .LBB48_19 +; CHECK-NEXT: bnez s1, 
.LBB48_19 ; CHECK-NEXT: .LBB48_3: # %entry -; CHECK-NEXT: mv a4, s0 -; CHECK-NEXT: bgez s1, .LBB48_20 +; CHECK-NEXT: mv a4, a0 +; CHECK-NEXT: bgez a1, .LBB48_20 ; CHECK-NEXT: .LBB48_4: # %entry -; CHECK-NEXT: bgeu s0, a3, .LBB48_21 +; CHECK-NEXT: bgeu a0, a3, .LBB48_21 ; CHECK-NEXT: .LBB48_5: # %entry -; CHECK-NEXT: bnez s1, .LBB48_22 +; CHECK-NEXT: bnez a1, .LBB48_22 ; CHECK-NEXT: .LBB48_6: # %entry ; CHECK-NEXT: bgez a1, .LBB48_23 ; CHECK-NEXT: .LBB48_7: # %entry @@ -3789,49 +2780,58 @@ ; CHECK-NEXT: .LBB48_8: # %entry ; CHECK-NEXT: li s1, 0 ; CHECK-NEXT: .LBB48_9: # %entry -; CHECK-NEXT: slli a3, a0, 63 +; CHECK-NEXT: slli a3, a2, 63 ; CHECK-NEXT: mv a4, s0 ; CHECK-NEXT: bltz s1, .LBB48_24 ; CHECK-NEXT: # %bb.10: # %entry ; CHECK-NEXT: bgeu a3, s0, .LBB48_25 ; CHECK-NEXT: .LBB48_11: # %entry -; CHECK-NEXT: bne s1, a0, .LBB48_26 +; CHECK-NEXT: bne s1, a2, .LBB48_26 ; CHECK-NEXT: .LBB48_12: # %entry -; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: bltz a1, .LBB48_27 ; CHECK-NEXT: .LBB48_13: # %entry -; CHECK-NEXT: bgeu a3, a2, .LBB48_28 +; CHECK-NEXT: bgeu a3, a0, .LBB48_28 ; CHECK-NEXT: .LBB48_14: # %entry -; CHECK-NEXT: beq a1, a0, .LBB48_16 +; CHECK-NEXT: beq a1, a2, .LBB48_16 ; CHECK-NEXT: .LBB48_15: # %entry -; CHECK-NEXT: mv a2, a4 +; CHECK-NEXT: mv a0, a4 ; CHECK-NEXT: .LBB48_16: # %entry -; CHECK-NEXT: mv a0, s0 -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a0, 24(sp) +; CHECK-NEXT: sd s0, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB48_17: # %entry ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bltu a2, a3, .LBB48_2 +; CHECK-NEXT: bltu s0, a3, .LBB48_2 ; CHECK-NEXT: .LBB48_18: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: beqz a1, .LBB48_3 +; CHECK-NEXT: mv s0, a3 +; CHECK-NEXT: beqz s1, .LBB48_3 ; CHECK-NEXT: .LBB48_19: # %entry -; CHECK-NEXT: mv a2, a4 -; CHECK-NEXT: mv a4, s0 -; CHECK-NEXT: bltz s1, .LBB48_4 +; CHECK-NEXT: mv s0, a4 +; CHECK-NEXT: mv a4, a0 +; CHECK-NEXT: bltz a1, .LBB48_4 ; CHECK-NEXT: .LBB48_20: # %entry ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bltu s0, a3, .LBB48_5 +; CHECK-NEXT: bltu a0, a3, .LBB48_5 ; CHECK-NEXT: .LBB48_21: # %entry -; CHECK-NEXT: mv s0, a3 -; CHECK-NEXT: beqz s1, .LBB48_6 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: beqz a1, .LBB48_6 ; CHECK-NEXT: .LBB48_22: # %entry -; CHECK-NEXT: mv s0, a4 +; CHECK-NEXT: mv a0, a4 ; CHECK-NEXT: bltz a1, .LBB48_7 ; CHECK-NEXT: .LBB48_23: # %entry ; CHECK-NEXT: li a1, 0 @@ -3842,17 +2842,17 @@ ; CHECK-NEXT: bltu a3, s0, .LBB48_11 ; CHECK-NEXT: .LBB48_25: # %entry ; CHECK-NEXT: mv s0, a3 -; CHECK-NEXT: beq s1, a0, .LBB48_12 +; CHECK-NEXT: beq s1, a2, .LBB48_12 ; CHECK-NEXT: .LBB48_26: # %entry ; CHECK-NEXT: mv s0, a4 -; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: bgez a1, .LBB48_13 ; CHECK-NEXT: .LBB48_27: # %entry ; CHECK-NEXT: 
mv a4, a3 -; CHECK-NEXT: bltu a3, a2, .LBB48_14 +; CHECK-NEXT: bltu a3, a0, .LBB48_14 ; CHECK-NEXT: .LBB48_28: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: bne a1, a0, .LBB48_15 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: bne a1, a2, .LBB48_15 ; CHECK-NEXT: j .LBB48_16 entry: %conv = fptosi <2 x float> %x to <2 x i128> @@ -3865,52 +2865,68 @@ define <2 x i64> @utest_f32i64_mm(<2 x float> %x) { ; CHECK-LABEL: utest_f32i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.s fs0, fa0 -; CHECK-NEXT: fmv.s fa0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixunssfti@plt ; CHECK-NEXT: mv s0, a0 ; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.s fa0, fs0 +; CHECK-NEXT: vsetivli zero, 0, e32, mf2, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixunssfti@plt -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: mv a3, a1 -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: beqz a3, .LBB49_2 +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: beqz s1, .LBB49_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: mv s0, a2 ; CHECK-NEXT: .LBB49_2: # %entry ; CHECK-NEXT: li a4, 1 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: bne a3, a4, .LBB49_7 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: bne s1, a4, .LBB49_7 ; CHECK-NEXT: # %bb.3: # %entry -; CHECK-NEXT: bnez s1, .LBB49_8 +; CHECK-NEXT: bnez a1, .LBB49_8 ; CHECK-NEXT: .LBB49_4: # %entry -; CHECK-NEXT: beq s1, a4, .LBB49_6 +; CHECK-NEXT: beq a1, a4, .LBB49_6 ; CHECK-NEXT: .LBB49_5: # %entry -; CHECK-NEXT: mv a1, s0 +; CHECK-NEXT: mv a2, a0 ; CHECK-NEXT: .LBB49_6: # %entry -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a2, 24(sp) +; CHECK-NEXT: sd a3, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB49_7: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: beqz s1, .LBB49_4 +; CHECK-NEXT: mv a3, s0 +; CHECK-NEXT: beqz a1, .LBB49_4 ; CHECK-NEXT: .LBB49_8: # %entry -; 
CHECK-NEXT: mv s0, a1 -; CHECK-NEXT: bne s1, a4, .LBB49_5 +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: bne a1, a4, .LBB49_5 ; CHECK-NEXT: j .LBB49_6 entry: %conv = fptoui <2 x float> %x to <2 x i128> @@ -3922,90 +2938,106 @@ define <2 x i64> @ustest_f32i64_mm(<2 x float> %x) { ; CHECK-LABEL: ustest_f32i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: fsd fs0, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -80 +; CHECK-NEXT: .cfi_def_cfa_offset 80 +; CHECK-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 56(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 -; CHECK-NEXT: .cfi_offset fs0, -32 -; CHECK-NEXT: fmv.s fs0, fa1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: sub sp, sp, a0 +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; CHECK-NEXT: vslidedown.vi v9, v8, 1 +; CHECK-NEXT: vfmv.f.s fa0, v9 ; CHECK-NEXT: call __fixsfti@plt -; CHECK-NEXT: mv s0, a0 -; CHECK-NEXT: mv s1, a1 -; CHECK-NEXT: fmv.s fa0, fs0 +; CHECK-NEXT: mv s1, a0 +; CHECK-NEXT: mv s0, a1 +; CHECK-NEXT: vsetivli zero, 0, e32, mf2, ta, mu +; CHECK-NEXT: addi a0, sp, 48 +; CHECK-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmv.f.s fa0, v8 ; CHECK-NEXT: call __fixsfti@plt -; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: li a5, 1 -; CHECK-NEXT: mv a3, a1 +; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: bgtz a1, .LBB50_12 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: mv a4, s1 -; CHECK-NEXT: bgtz s1, .LBB50_13 +; CHECK-NEXT: bgtz s0, .LBB50_13 ; CHECK-NEXT: .LBB50_2: # %entry -; CHECK-NEXT: bgtz a2, .LBB50_14 +; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: bne s0, a5, .LBB50_14 ; CHECK-NEXT: .LBB50_3: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: bne a2, a5, .LBB50_15 +; CHECK-NEXT: bgtz a1, .LBB50_15 ; CHECK-NEXT: .LBB50_4: # %entry -; CHECK-NEXT: bgtz s1, .LBB50_16 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: bne a1, a5, .LBB50_16 ; CHECK-NEXT: .LBB50_5: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: bne s1, a5, .LBB50_17 +; CHECK-NEXT: bgtz s0, .LBB50_17 ; CHECK-NEXT: .LBB50_6: # %entry -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: blez a4, .LBB50_18 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: blez s0, .LBB50_18 ; CHECK-NEXT: .LBB50_7: # %entry -; CHECK-NEXT: bnez a4, .LBB50_19 +; CHECK-NEXT: bnez s0, .LBB50_19 ; CHECK-NEXT: .LBB50_8: # %entry -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: blez a3, .LBB50_20 +; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: blez a2, .LBB50_20 ; CHECK-NEXT: .LBB50_9: # %entry -; CHECK-NEXT: beqz a3, .LBB50_11 +; CHECK-NEXT: beqz a2, .LBB50_11 ; CHECK-NEXT: .LBB50_10: # %entry -; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: .LBB50_11: # %entry -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: fld fs0, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a4, 24(sp) +; CHECK-NEXT: sd a3, 32(sp) +; CHECK-NEXT: addi a0, sp, 24 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, sp, 32 +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: vsetivli 
zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 56(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 80 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB50_12: # %entry -; CHECK-NEXT: li a3, 1 -; CHECK-NEXT: mv a4, s1 -; CHECK-NEXT: blez s1, .LBB50_2 +; CHECK-NEXT: li a2, 1 +; CHECK-NEXT: blez s0, .LBB50_2 ; CHECK-NEXT: .LBB50_13: # %entry -; CHECK-NEXT: li a4, 1 -; CHECK-NEXT: blez a2, .LBB50_3 +; CHECK-NEXT: li s1, 0 +; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: beq s0, a5, .LBB50_3 ; CHECK-NEXT: .LBB50_14: # %entry -; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: beq a2, a5, .LBB50_4 +; CHECK-NEXT: mv a3, s1 +; CHECK-NEXT: blez a1, .LBB50_4 ; CHECK-NEXT: .LBB50_15: # %entry -; CHECK-NEXT: mv a1, a0 -; CHECK-NEXT: blez s1, .LBB50_5 -; CHECK-NEXT: .LBB50_16: # %entry -; CHECK-NEXT: li s0, 0 ; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: beq s1, a5, .LBB50_6 +; CHECK-NEXT: li a4, 0 +; CHECK-NEXT: beq a1, a5, .LBB50_5 +; CHECK-NEXT: .LBB50_16: # %entry +; CHECK-NEXT: mv a4, a0 +; CHECK-NEXT: blez s0, .LBB50_6 ; CHECK-NEXT: .LBB50_17: # %entry -; CHECK-NEXT: mv a0, s0 -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: bgtz a4, .LBB50_7 +; CHECK-NEXT: li s0, 1 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: bgtz s0, .LBB50_7 ; CHECK-NEXT: .LBB50_18: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: beqz a4, .LBB50_8 +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: beqz s0, .LBB50_8 ; CHECK-NEXT: .LBB50_19: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: bgtz a3, .LBB50_9 +; CHECK-NEXT: mv a3, a0 +; CHECK-NEXT: mv a0, a4 +; CHECK-NEXT: bgtz a2, .LBB50_9 ; CHECK-NEXT: .LBB50_20: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: bnez a3, .LBB50_10 +; CHECK-NEXT: li a0, 0 +; CHECK-NEXT: bnez a2, .LBB50_10 ; CHECK-NEXT: j .LBB50_11 entry: %conv = fptosi <2 x float> %x to <2 x i128> @@ -4018,12 +3050,12 @@ define <2 x i64> @stest_f16i64_mm(<2 x half> %x) { ; CHECK-LABEL: stest_f16i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -4036,13 +3068,12 @@ ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: call __fixsfti@plt -; CHECK-NEXT: mv a2, a0 -; CHECK-NEXT: li a0, -1 -; CHECK-NEXT: srli a3, a0, 1 -; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: li a2, -1 +; CHECK-NEXT: srli a3, a2, 1 +; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: bgez a1, .LBB51_17 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: bgeu a2, a3, .LBB51_18 +; CHECK-NEXT: bgeu a0, a3, .LBB51_18 ; CHECK-NEXT: .LBB51_2: # %entry ; CHECK-NEXT: bnez a1, .LBB51_19 ; CHECK-NEXT: .LBB51_3: # %entry @@ -4059,39 +3090,46 @@ ; CHECK-NEXT: .LBB51_8: # %entry ; CHECK-NEXT: li s1, 0 ; CHECK-NEXT: .LBB51_9: # %entry -; CHECK-NEXT: slli a3, a0, 63 +; CHECK-NEXT: slli a3, a2, 63 ; CHECK-NEXT: mv a4, s0 ; CHECK-NEXT: 
bltz s1, .LBB51_24 ; CHECK-NEXT: # %bb.10: # %entry ; CHECK-NEXT: bgeu a3, s0, .LBB51_25 ; CHECK-NEXT: .LBB51_11: # %entry -; CHECK-NEXT: bne s1, a0, .LBB51_26 +; CHECK-NEXT: bne s1, a2, .LBB51_26 ; CHECK-NEXT: .LBB51_12: # %entry -; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: bltz a1, .LBB51_27 ; CHECK-NEXT: .LBB51_13: # %entry -; CHECK-NEXT: bgeu a3, a2, .LBB51_28 +; CHECK-NEXT: bgeu a3, a0, .LBB51_28 ; CHECK-NEXT: .LBB51_14: # %entry -; CHECK-NEXT: beq a1, a0, .LBB51_16 +; CHECK-NEXT: beq a1, a2, .LBB51_16 ; CHECK-NEXT: .LBB51_15: # %entry -; CHECK-NEXT: mv a2, a4 +; CHECK-NEXT: mv a0, a4 ; CHECK-NEXT: .LBB51_16: # %entry -; CHECK-NEXT: mv a0, s0 -; CHECK-NEXT: mv a1, a2 -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a0, 8(sp) +; CHECK-NEXT: sd s0, 0(sp) +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 48 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB51_17: # %entry ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bltu a2, a3, .LBB51_2 +; CHECK-NEXT: bltu a0, a3, .LBB51_2 ; CHECK-NEXT: .LBB51_18: # %entry -; CHECK-NEXT: mv a2, a3 +; CHECK-NEXT: mv a0, a3 ; CHECK-NEXT: beqz a1, .LBB51_3 ; CHECK-NEXT: .LBB51_19: # %entry -; CHECK-NEXT: mv a2, a4 +; CHECK-NEXT: mv a0, a4 ; CHECK-NEXT: mv a4, s0 ; CHECK-NEXT: bltz s1, .LBB51_4 ; CHECK-NEXT: .LBB51_20: # %entry @@ -4112,17 +3150,17 @@ ; CHECK-NEXT: bltu a3, s0, .LBB51_11 ; CHECK-NEXT: .LBB51_25: # %entry ; CHECK-NEXT: mv s0, a3 -; CHECK-NEXT: beq s1, a0, .LBB51_12 +; CHECK-NEXT: beq s1, a2, .LBB51_12 ; CHECK-NEXT: .LBB51_26: # %entry ; CHECK-NEXT: mv s0, a4 -; CHECK-NEXT: mv a4, a2 +; CHECK-NEXT: mv a4, a0 ; CHECK-NEXT: bgez a1, .LBB51_13 ; CHECK-NEXT: .LBB51_27: # %entry ; CHECK-NEXT: mv a4, a3 -; CHECK-NEXT: bltu a3, a2, .LBB51_14 +; CHECK-NEXT: bltu a3, a0, .LBB51_14 ; CHECK-NEXT: .LBB51_28: # %entry -; CHECK-NEXT: mv a2, a3 -; CHECK-NEXT: bne a1, a0, .LBB51_15 +; CHECK-NEXT: mv a0, a3 +; CHECK-NEXT: bne a1, a2, .LBB51_15 ; CHECK-NEXT: j .LBB51_16 entry: %conv = fptosi <2 x half> %x to <2 x i128> @@ -4135,12 +3173,12 @@ define <2 x i64> @utesth_f16i64_mm(<2 x half> %x) { ; CHECK-LABEL: utesth_f16i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -4154,34 +3192,41 @@ ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: call __fixunssfti@plt -; CHECK-NEXT: mv a2, 
a0 -; CHECK-NEXT: mv a3, a1 -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: beqz a3, .LBB52_2 +; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: beqz a1, .LBB52_2 ; CHECK-NEXT: # %bb.1: # %entry -; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: mv a0, a2 ; CHECK-NEXT: .LBB52_2: # %entry ; CHECK-NEXT: li a4, 1 -; CHECK-NEXT: mv a0, a1 -; CHECK-NEXT: bne a3, a4, .LBB52_7 +; CHECK-NEXT: mv a3, a2 +; CHECK-NEXT: bne a1, a4, .LBB52_7 ; CHECK-NEXT: # %bb.3: # %entry ; CHECK-NEXT: bnez s1, .LBB52_8 ; CHECK-NEXT: .LBB52_4: # %entry ; CHECK-NEXT: beq s1, a4, .LBB52_6 ; CHECK-NEXT: .LBB52_5: # %entry -; CHECK-NEXT: mv a1, s0 +; CHECK-NEXT: mv a2, s0 ; CHECK-NEXT: .LBB52_6: # %entry -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a2, 8(sp) +; CHECK-NEXT: sd a3, 0(sp) +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 48 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB52_7: # %entry -; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: mv a3, a0 ; CHECK-NEXT: beqz s1, .LBB52_4 ; CHECK-NEXT: .LBB52_8: # %entry -; CHECK-NEXT: mv s0, a1 +; CHECK-NEXT: mv s0, a2 ; CHECK-NEXT: bne s1, a4, .LBB52_5 ; CHECK-NEXT: j .LBB52_6 entry: @@ -4194,12 +3239,12 @@ define <2 x i64> @ustest_f16i64_mm(<2 x half> %x) { ; CHECK-LABEL: ustest_f16i64_mm: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: addi sp, sp, -32 -; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: sd ra, 24(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s0, 16(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s1, 8(sp) # 8-byte Folded Spill -; CHECK-NEXT: sd s2, 0(sp) # 8-byte Folded Spill +; CHECK-NEXT: addi sp, sp, -48 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s1, 24(sp) # 8-byte Folded Spill +; CHECK-NEXT: sd s2, 16(sp) # 8-byte Folded Spill ; CHECK-NEXT: .cfi_offset ra, -8 ; CHECK-NEXT: .cfi_offset s0, -16 ; CHECK-NEXT: .cfi_offset s1, -24 @@ -4212,55 +3257,63 @@ ; CHECK-NEXT: mv a0, s2 ; CHECK-NEXT: call __extendhfsf2@plt ; CHECK-NEXT: call __fixsfti@plt -; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: li a5, 1 -; CHECK-NEXT: mv a3, a1 +; CHECK-NEXT: mv a2, a1 ; CHECK-NEXT: bgtz a1, .LBB53_12 ; CHECK-NEXT: # %bb.1: # %entry ; CHECK-NEXT: mv a4, s1 ; CHECK-NEXT: bgtz s1, .LBB53_13 ; CHECK-NEXT: .LBB53_2: # %entry -; CHECK-NEXT: bgtz a2, .LBB53_14 +; CHECK-NEXT: bgtz a1, .LBB53_14 ; CHECK-NEXT: .LBB53_3: # %entry -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: bne a2, a5, .LBB53_15 +; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: bne a1, a5, .LBB53_15 ; CHECK-NEXT: .LBB53_4: # %entry ; CHECK-NEXT: bgtz s1, .LBB53_16 ; CHECK-NEXT: .LBB53_5: # %entry ; CHECK-NEXT: li a0, 0 ; CHECK-NEXT: bne s1, a5, .LBB53_17 ; CHECK-NEXT: .LBB53_6: # %entry -; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: blez a4, .LBB53_18 ; CHECK-NEXT: .LBB53_7: # %entry ; CHECK-NEXT: bnez a4, .LBB53_19 ; CHECK-NEXT: .LBB53_8: # %entry -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: blez a3, .LBB53_20 +; CHECK-NEXT: mv a1, a3 
+; CHECK-NEXT: blez a2, .LBB53_20 ; CHECK-NEXT: .LBB53_9: # %entry -; CHECK-NEXT: beqz a3, .LBB53_11 +; CHECK-NEXT: beqz a2, .LBB53_11 ; CHECK-NEXT: .LBB53_10: # %entry -; CHECK-NEXT: mv a1, a2 +; CHECK-NEXT: mv a3, a1 ; CHECK-NEXT: .LBB53_11: # %entry -; CHECK-NEXT: ld ra, 24(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s0, 16(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s1, 8(sp) # 8-byte Folded Reload -; CHECK-NEXT: ld s2, 0(sp) # 8-byte Folded Reload -; CHECK-NEXT: addi sp, sp, 32 +; CHECK-NEXT: sd a3, 8(sp) +; CHECK-NEXT: sd a0, 0(sp) +; CHECK-NEXT: addi a0, sp, 8 +; CHECK-NEXT: vsetivli zero, 1, e64, m1, ta, mu +; CHECK-NEXT: vle64.v v9, (a0) +; CHECK-NEXT: mv a0, sp +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, tu, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s1, 24(sp) # 8-byte Folded Reload +; CHECK-NEXT: ld s2, 16(sp) # 8-byte Folded Reload +; CHECK-NEXT: addi sp, sp, 48 ; CHECK-NEXT: ret ; CHECK-NEXT: .LBB53_12: # %entry -; CHECK-NEXT: li a3, 1 +; CHECK-NEXT: li a2, 1 ; CHECK-NEXT: mv a4, s1 ; CHECK-NEXT: blez s1, .LBB53_2 ; CHECK-NEXT: .LBB53_13: # %entry ; CHECK-NEXT: li a4, 1 -; CHECK-NEXT: blez a2, .LBB53_3 +; CHECK-NEXT: blez a1, .LBB53_3 ; CHECK-NEXT: .LBB53_14: # %entry ; CHECK-NEXT: li a0, 0 -; CHECK-NEXT: li a1, 0 -; CHECK-NEXT: beq a2, a5, .LBB53_4 +; CHECK-NEXT: li a3, 0 +; CHECK-NEXT: beq a1, a5, .LBB53_4 ; CHECK-NEXT: .LBB53_15: # %entry -; CHECK-NEXT: mv a1, a0 +; CHECK-NEXT: mv a3, a0 ; CHECK-NEXT: blez s1, .LBB53_5 ; CHECK-NEXT: .LBB53_16: # %entry ; CHECK-NEXT: li s0, 0 @@ -4268,18 +3321,18 @@ ; CHECK-NEXT: beq s1, a5, .LBB53_6 ; CHECK-NEXT: .LBB53_17: # %entry ; CHECK-NEXT: mv a0, s0 -; CHECK-NEXT: mv a2, a0 +; CHECK-NEXT: mv a1, a0 ; CHECK-NEXT: bgtz a4, .LBB53_7 ; CHECK-NEXT: .LBB53_18: # %entry -; CHECK-NEXT: li a2, 0 +; CHECK-NEXT: li a1, 0 ; CHECK-NEXT: beqz a4, .LBB53_8 ; CHECK-NEXT: .LBB53_19: # %entry -; CHECK-NEXT: mv a0, a2 -; CHECK-NEXT: mv a2, a1 -; CHECK-NEXT: bgtz a3, .LBB53_9 +; CHECK-NEXT: mv a0, a1 +; CHECK-NEXT: mv a1, a3 +; CHECK-NEXT: bgtz a2, .LBB53_9 ; CHECK-NEXT: .LBB53_20: # %entry -; CHECK-NEXT: li a2, 0 -; CHECK-NEXT: bnez a3, .LBB53_10 +; CHECK-NEXT: li a1, 0 +; CHECK-NEXT: bnez a2, .LBB53_10 ; CHECK-NEXT: j .LBB53_11 entry: %conv = fptosi <2 x half> %x to <2 x i128> Index: llvm/test/CodeGen/RISCV/vec3-setcc-crash.ll =================================================================== --- llvm/test/CodeGen/RISCV/vec3-setcc-crash.ll +++ llvm/test/CodeGen/RISCV/vec3-setcc-crash.ll @@ -11,72 +11,42 @@ define void @vec3_setcc_crash(<3 x i8>* %in, <3 x i8>* %out) { ; RV32-LABEL: vec3_setcc_crash: ; RV32: # %bb.0: +; RV32-NEXT: addi sp, sp, -16 +; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: lw a0, 0(a0) -; RV32-NEXT: slli a2, a0, 8 -; RV32-NEXT: slli a3, a0, 24 -; RV32-NEXT: slli a4, a0, 16 -; RV32-NEXT: srai a5, a4, 24 -; RV32-NEXT: srai a3, a3, 24 -; RV32-NEXT: bgtz a5, .LBB0_2 -; RV32-NEXT: # %bb.1: -; RV32-NEXT: li a5, 0 -; RV32-NEXT: j .LBB0_3 -; RV32-NEXT: .LBB0_2: -; RV32-NEXT: srli a5, a4, 24 -; RV32-NEXT: .LBB0_3: -; RV32-NEXT: srai a4, a2, 24 -; RV32-NEXT: slli a2, a5, 8 -; RV32-NEXT: mv a5, a0 -; RV32-NEXT: bgtz a3, .LBB0_5 -; RV32-NEXT: # %bb.4: -; RV32-NEXT: li a5, 0 -; RV32-NEXT: .LBB0_5: -; RV32-NEXT: andi a3, a5, 255 -; RV32-NEXT: or a2, a3, a2 -; RV32-NEXT: bgtz a4, .LBB0_7 -; RV32-NEXT: # %bb.6: -; RV32-NEXT: li a0, 0 -; RV32-NEXT: j .LBB0_8 -; RV32-NEXT: .LBB0_7: -; RV32-NEXT: 
srli a0, a0, 16 -; RV32-NEXT: .LBB0_8: -; RV32-NEXT: sb a0, 2(a1) -; RV32-NEXT: sh a2, 0(a1) +; RV32-NEXT: sw a0, 12(sp) +; RV32-NEXT: addi a0, sp, 12 +; RV32-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV32-NEXT: vle32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV32-NEXT: vmax.vx v8, v8, zero +; RV32-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV32-NEXT: vslidedown.vi v9, v8, 2 +; RV32-NEXT: addi a0, a1, 2 +; RV32-NEXT: vse8.v v9, (a0) +; RV32-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV32-NEXT: vse16.v v8, (a1) +; RV32-NEXT: addi sp, sp, 16 ; RV32-NEXT: ret ; ; RV64-LABEL: vec3_setcc_crash: ; RV64: # %bb.0: -; RV64-NEXT: lwu a0, 0(a0) -; RV64-NEXT: slli a2, a0, 40 -; RV64-NEXT: slli a3, a0, 56 -; RV64-NEXT: slli a4, a0, 48 -; RV64-NEXT: srai a5, a4, 56 -; RV64-NEXT: srai a3, a3, 56 -; RV64-NEXT: bgtz a5, .LBB0_2 -; RV64-NEXT: # %bb.1: -; RV64-NEXT: li a5, 0 -; RV64-NEXT: j .LBB0_3 -; RV64-NEXT: .LBB0_2: -; RV64-NEXT: srli a5, a4, 56 -; RV64-NEXT: .LBB0_3: -; RV64-NEXT: srai a4, a2, 56 -; RV64-NEXT: slli a2, a5, 8 -; RV64-NEXT: mv a5, a0 -; RV64-NEXT: bgtz a3, .LBB0_5 -; RV64-NEXT: # %bb.4: -; RV64-NEXT: li a5, 0 -; RV64-NEXT: .LBB0_5: -; RV64-NEXT: andi a3, a5, 255 -; RV64-NEXT: or a2, a3, a2 -; RV64-NEXT: bgtz a4, .LBB0_7 -; RV64-NEXT: # %bb.6: -; RV64-NEXT: li a0, 0 -; RV64-NEXT: j .LBB0_8 -; RV64-NEXT: .LBB0_7: -; RV64-NEXT: srli a0, a0, 16 -; RV64-NEXT: .LBB0_8: -; RV64-NEXT: sb a0, 2(a1) -; RV64-NEXT: sh a2, 0(a1) +; RV64-NEXT: addi sp, sp, -16 +; RV64-NEXT: .cfi_def_cfa_offset 16 +; RV64-NEXT: lw a0, 0(a0) +; RV64-NEXT: sw a0, 12(sp) +; RV64-NEXT: addi a0, sp, 12 +; RV64-NEXT: vsetivli zero, 1, e32, mf2, ta, mu +; RV64-NEXT: vle32.v v8, (a0) +; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, mu +; RV64-NEXT: vmax.vx v8, v8, zero +; RV64-NEXT: vsetivli zero, 1, e8, mf4, ta, mu +; RV64-NEXT: vslidedown.vi v9, v8, 2 +; RV64-NEXT: addi a0, a1, 2 +; RV64-NEXT: vse8.v v9, (a0) +; RV64-NEXT: vsetivli zero, 1, e16, mf4, ta, mu +; RV64-NEXT: vse16.v v8, (a1) +; RV64-NEXT: addi sp, sp, 16 ; RV64-NEXT: ret %a = load <3 x i8>, <3 x i8>* %in %cmp = icmp sgt <3 x i8> %a, zeroinitializer Index: llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/illegal-type.ll @@ -101,17 +101,63 @@ define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) { ; CHECK-LABEL: @uniform_store_i1( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4 +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[START:%.*]], i64 [[N_VEC]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64*> poison, i64* [[START]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64*> [[BROADCAST_SPLATINSERT]], <2 x i64*> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64*> poison, i64* [[START]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64*> [[BROADCAST_SPLATINSERT3]], <2 x i64*> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: 
[[POINTER_PHI:%.*]] = phi i64* [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <2 x i64> <i64 0, i64 1> +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <2 x i64> <i64 2, i64 3> +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64*> [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[TMP3]], i32 2 +; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>* +; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, <2 x i64*> [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, <2 x i64*> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <2 x i64*> [[TMP8]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <2 x i64*> [[TMP9]], [[BROADCAST_SPLAT4]] +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0 +; CHECK-NEXT: store i1 [[TMP12]], i1* [[DST:%.*]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1 +; CHECK-NEXT: store i1 [[TMP13]], i1* [[DST]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0 +; CHECK-NEXT: store i1 [[TMP14]], i1* [[DST]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1 +; CHECK-NEXT: store i1 [[TMP15]], i1* [[DST]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 4 +; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[FIRST_SROA:%.*]] = phi i64* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[START:%.*]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[FIRST_SROA:%.*]] = phi i64* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[FIRST_SROA]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[FIRST_SROA]], align 4 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i64, i64* [[FIRST_SROA]], i64 1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64* [[INCDEC_PTR]], [[START]] -; CHECK-NEXT: store i1 [[CMP_NOT]], i1* [[DST:%.*]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N:%.*]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]], !llvm.loop [[LOOP0]] +; CHECK-NEXT: store i1 [[CMP_NOT]], i1* [[DST]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp 
ult i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: end: ; CHECK-NEXT: ret void ; Index: llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/inloop-reduction.ll @@ -82,49 +82,37 @@ ; INLOOP-NEXT: [[CMP6:%.*]] = icmp sgt i32 [[N:%.*]], 0 ; INLOOP-NEXT: br i1 [[CMP6]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] ; INLOOP: for.body.preheader: -; INLOOP-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP1:%.*]] = mul i32 [[TMP0]], 8 -; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], [[TMP1]] +; INLOOP-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 ; INLOOP-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; INLOOP: vector.ph: -; INLOOP-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP3:%.*]] = mul i32 [[TMP2]], 8 -; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], [[TMP3]] +; INLOOP-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 ; INLOOP-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] ; INLOOP-NEXT: br label [[VECTOR_BODY:%.*]] ; INLOOP: vector.body: ; INLOOP-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] -; INLOOP-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] -; INLOOP-NEXT: [[TMP4:%.*]] = add i32 [[INDEX]], 0 -; INLOOP-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP6:%.*]] = mul i32 [[TMP5]], 4 -; INLOOP-NEXT: [[TMP7:%.*]] = add i32 [[TMP6]], 0 -; INLOOP-NEXT: [[TMP8:%.*]] = mul i32 [[TMP7]], 1 -; INLOOP-NEXT: [[TMP9:%.*]] = add i32 [[INDEX]], [[TMP8]] -; INLOOP-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[TMP4]] -; INLOOP-NEXT: [[TMP11:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[TMP9]] -; INLOOP-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 0 -; INLOOP-NEXT: [[TMP13:%.*]] = bitcast i16* [[TMP12]] to <vscale x 4 x i16>* -; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP13]], align 2 -; INLOOP-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP15:%.*]] = mul i32 [[TMP14]], 4 -; INLOOP-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, i16* [[TMP10]], i32 [[TMP15]] -; INLOOP-NEXT: [[TMP17:%.*]] = bitcast i16* [[TMP16]] to <vscale x 4 x i16>* -; INLOOP-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x i16>, <vscale x 4 x i16>* [[TMP17]], align 2 -; INLOOP-NEXT: [[TMP18:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD]] to <vscale x 4 x i32> -; INLOOP-NEXT: [[TMP19:%.*]] = sext <vscale x 4 x i16> [[WIDE_LOAD2]] to <vscale x 4 x i32> -; INLOOP-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP18]]) -; INLOOP-NEXT: [[TMP21]] = add i32 [[TMP20]], [[VEC_PHI]] -; INLOOP-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP19]]) -; INLOOP-NEXT: [[TMP23]] = add i32 [[TMP22]], [[VEC_PHI1]] -; INLOOP-NEXT: [[TMP24:%.*]] = call i32 @llvm.vscale.i32() -; INLOOP-NEXT: [[TMP25:%.*]] = mul i32 [[TMP24]], 8 -; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP25]] -; INLOOP-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; INLOOP-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; INLOOP-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; INLOOP-NEXT: [[VEC_PHI1:%.*]] 
= phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; INLOOP-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; INLOOP-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 8 +; INLOOP-NEXT: [[TMP2:%.*]] = getelementptr inbounds i16, i16* [[X:%.*]], i32 [[TMP0]] +; INLOOP-NEXT: [[TMP3:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[TMP1]] +; INLOOP-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 0 +; INLOOP-NEXT: [[TMP5:%.*]] = bitcast i16* [[TMP4]] to <8 x i16>* +; INLOOP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP5]], align 2 +; INLOOP-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP2]], i32 8 +; INLOOP-NEXT: [[TMP7:%.*]] = bitcast i16* [[TMP6]] to <8 x i16>* +; INLOOP-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, <8 x i16>* [[TMP7]], align 2 +; INLOOP-NEXT: [[TMP8:%.*]] = sext <8 x i16> [[WIDE_LOAD]] to <8 x i32> +; INLOOP-NEXT: [[TMP9:%.*]] = sext <8 x i16> [[WIDE_LOAD2]] to <8 x i32> +; INLOOP-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP8]]) +; INLOOP-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]] +; INLOOP-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP9]]) +; INLOOP-NEXT: [[TMP13]] = add i32 [[TMP12]], [[VEC_PHI1]] +; INLOOP-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; INLOOP-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; INLOOP-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; INLOOP: middle.block: -; INLOOP-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP23]], [[TMP21]] +; INLOOP-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP11]] ; INLOOP-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] ; INLOOP-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; INLOOP: scalar.ph: @@ -135,8 +123,8 @@ ; INLOOP-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; INLOOP-NEXT: [[R_07:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] ; INLOOP-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, i16* [[X]], i32 [[I_08]] -; INLOOP-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 -; INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP27]] to i32 +; INLOOP-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX]], align 2 +; INLOOP-NEXT: [[CONV:%.*]] = sext i16 [[TMP15]] to i32 ; INLOOP-NEXT: [[ADD]] = add nsw i32 [[R_07]], [[CONV]] ; INLOOP-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; INLOOP-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N]] Index: llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/scalable-basics.ll @@ -320,16 +320,45 @@ define void @indexed_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) { ; VLENUNK-LABEL: @indexed_store( ; VLENUNK-NEXT: entry: +; VLENUNK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VLENUNK: vector.ph: +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i32 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; VLENUNK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i32 0 +; VLENUNK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x 
i32> zeroinitializer +; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] +; VLENUNK: vector.body: +; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; VLENUNK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP0]] +; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 +; VLENUNK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <2 x i64> [[WIDE_LOAD]] +; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], <2 x i64> [[WIDE_LOAD1]] +; VLENUNK-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT]], <2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>) +; VLENUNK-NEXT: call void @llvm.masked.scatter.v2i64.v2p0(<2 x i64> [[BROADCAST_SPLAT3]], <2 x ptr> [[TMP7]], i32 8, <2 x i1> <i1 true, i1 true>) +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VLENUNK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VLENUNK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VLENUNK: middle.block: +; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 +; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; VLENUNK: scalar.ph: +; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; VLENUNK-NEXT: br label [[FOR_BODY:%.*]] ; VLENUNK: for.body: -; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; VLENUNK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]] +; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; VLENUNK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] ; VLENUNK-NEXT: [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8 -; VLENUNK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]] -; VLENUNK-NEXT: store i64 [[V:%.*]], ptr [[AADDR]], align 8 +; VLENUNK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]] +; VLENUNK-NEXT: store i64 [[V]], ptr [[AADDR]], align 8 ; VLENUNK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; VLENUNK: for.end: ; VLENUNK-NEXT: ret void ; @@ -397,20 +426,52 @@ define i64 @indexed_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i64 %v, i64 %n) { ; VLENUNK-LABEL: @indexed_load( ; VLENUNK-NEXT: entry: +; VLENUNK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VLENUNK: vector.ph: +; VLENUNK-NEXT: br label [[VECTOR_BODY:%.*]] +; VLENUNK: vector.body: +; VLENUNK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, 
[[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; VLENUNK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VLENUNK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; VLENUNK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[TMP0]] +; VLENUNK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[TMP1]] +; VLENUNK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; VLENUNK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 +; VLENUNK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 +; VLENUNK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 +; VLENUNK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], <2 x i64> [[WIDE_LOAD]] +; VLENUNK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], <2 x i64> [[WIDE_LOAD2]] +; VLENUNK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> undef) +; VLENUNK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call <2 x i64> @llvm.masked.gather.v2i64.v2p0(<2 x ptr> [[TMP7]], i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> undef) +; VLENUNK-NEXT: [[TMP8]] = add <2 x i64> [[VEC_PHI]], [[WIDE_MASKED_GATHER]] +; VLENUNK-NEXT: [[TMP9]] = add <2 x i64> [[VEC_PHI1]], [[WIDE_MASKED_GATHER3]] +; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VLENUNK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; VLENUNK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VLENUNK: middle.block: +; VLENUNK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[TMP9]], [[TMP8]] +; VLENUNK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]]) +; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024 +; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; VLENUNK: scalar.ph: +; VLENUNK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; VLENUNK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; VLENUNK-NEXT: br label [[FOR_BODY:%.*]] ; VLENUNK: for.body: -; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; VLENUNK-NEXT: [[SUM:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ] -; VLENUNK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[IV]] +; VLENUNK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; VLENUNK-NEXT: [[SUM:%.*]] = phi i64 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[SUM_NEXT:%.*]], [[FOR_BODY]] ] +; VLENUNK-NEXT: [[BADDR:%.*]] = getelementptr inbounds i64, ptr [[B]], i64 [[IV]] ; VLENUNK-NEXT: [[AIDX:%.*]] = load i64, ptr [[BADDR]], align 8 -; VLENUNK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[AIDX]] +; VLENUNK-NEXT: [[AADDR:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[AIDX]] ; VLENUNK-NEXT: [[ELEM:%.*]] = load i64, ptr [[AADDR]], align 8 ; VLENUNK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; VLENUNK-NEXT: [[SUM_NEXT]] = add i64 [[SUM]], [[ELEM]] ; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; VLENUNK: for.end: -; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], [[FOR_BODY]] ] +; VLENUNK-NEXT: [[SUM_NEXT_LCSSA:%.*]] = phi i64 [ [[SUM_NEXT]], 
[[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; VLENUNK-NEXT: ret i64 [[SUM_NEXT_LCSSA]] ; ; VLEN128-LABEL: @indexed_load( @@ -504,7 +565,7 @@ ; VLENUNK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; VLENUNK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; VLENUNK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -517,7 +578,7 @@ ; VLENUNK-NEXT: store i64 [[V]], ptr [[ARRAYIDX]], align 8 ; VLENUNK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; VLENUNK: for.end: ; VLENUNK-NEXT: ret void ; @@ -596,7 +657,7 @@ ; VLENUNK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() ; VLENUNK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; VLENUNK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; VLENUNK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; VLENUNK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; VLENUNK: middle.block: ; VLENUNK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; VLENUNK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -609,7 +670,7 @@ ; VLENUNK-NEXT: store ptr [[V]], ptr [[ARRAYIDX]], align 8 ; VLENUNK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; VLENUNK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; VLENUNK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; VLENUNK: for.end: ; VLENUNK-NEXT: ret void ; Index: llvm/test/Transforms/LoopVectorize/RISCV/scalable-divrem.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/scalable-divrem.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/scalable-divrem.ll @@ -249,12 +249,83 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; CHECK-LABEL: @predicated_udiv( ; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i32 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE9:%.*]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: 
[[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT3]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 +; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] +; CHECK: pred.udiv.if: +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = udiv i64 [[TMP9]], [[V]] +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE]] +; CHECK: pred.udiv.continue: +; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i64> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_UDIV_IF]] ] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 +; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]] +; CHECK: pred.udiv.if4: +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = udiv i64 [[TMP14]], [[V]] +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP15]], i32 1 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE5]] +; CHECK: pred.udiv.continue5: +; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i64> [ [[TMP12]], [[PRED_UDIV_CONTINUE]] ], [ [[TMP16]], [[PRED_UDIV_IF4]] ] +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0 +; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_UDIV_IF6:%.*]], label [[PRED_UDIV_CONTINUE7:%.*]] +; CHECK: pred.udiv.if6: +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[WIDE_LOAD1]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = udiv i64 [[TMP19]], [[V]] +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> poison, i64 [[TMP20]], i32 0 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE7]] +; CHECK: pred.udiv.continue7: +; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x i64> [ poison, [[PRED_UDIV_CONTINUE5]] ], [ [[TMP21]], [[PRED_UDIV_IF6]] ] +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1 +; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_UDIV_IF8:%.*]], label [[PRED_UDIV_CONTINUE9]] +; CHECK: pred.udiv.if8: +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[WIDE_LOAD1]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = udiv i64 [[TMP24]], [[V]] +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP25]], i32 1 +; CHECK-NEXT: br label [[PRED_UDIV_CONTINUE9]] +; CHECK: pred.udiv.continue9: +; CHECK-NEXT: [[TMP27:%.*]] = phi <2 x i64> [ [[TMP22]], [[PRED_UDIV_CONTINUE7]] ], [ [[TMP26]], [[PRED_UDIV_IF8]] ] +; CHECK-NEXT: [[TMP28:%.*]] = xor <2 x i1> [[TMP6]], +; CHECK-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[TMP7]], +; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP17]], <2 x i64> [[WIDE_LOAD]] +; CHECK-NEXT: [[PREDPHI10:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP27]], <2 x i64> [[WIDE_LOAD1]] +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 +; CHECK-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP30]], align 8 +; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, 
ptr [[TMP2]], i32 2
+; CHECK-NEXT: store <2 x i64> [[PREDPHI10]], ptr [[TMP31]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[V:%.*]], 0
+; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[V]], 0
; CHECK-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]]
; CHECK: do_op:
; CHECK-NEXT: [[DIVREM:%.*]] = udiv i64 [[ELEM]], [[V]]
@@ -264,7 +335,7 @@
; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -294,12 +365,83 @@
define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) {
; CHECK-LABEL: @predicated_sdiv(
; CHECK-NEXT: entry:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[V:%.*]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <2 x i64> poison, i64 [[V]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT2]], <2 x i64> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE9:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = icmp ne <2 x i64> [[BROADCAST_SPLAT3]], zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
+; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
+; CHECK: pred.sdiv.if:
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP10:%.*]] = sdiv i64 [[TMP9]], [[V]]
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[TMP10]], i32 0
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE]]
+; CHECK: pred.sdiv.continue:
+; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i64> [ poison, [[VECTOR_BODY]] ], [ [[TMP11]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
+; CHECK-NEXT: br i1 [[TMP13]], label [[PRED_SDIV_IF4:%.*]], label [[PRED_SDIV_CONTINUE5:%.*]]
+; CHECK: pred.sdiv.if4:
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP15:%.*]] = sdiv i64 [[TMP14]], [[V]]
+; CHECK-NEXT: [[TMP16:%.*]] = insertelement <2 x i64> [[TMP12]], i64 [[TMP15]], i32 1
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE5]]
+; CHECK: pred.sdiv.continue5:
+; CHECK-NEXT: [[TMP17:%.*]] = phi <2 x i64> [ [[TMP12]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP16]], [[PRED_SDIV_IF4]] ]
+; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x i1> [[TMP7]], i32 0
+; CHECK-NEXT: br i1 [[TMP18]], label [[PRED_SDIV_IF6:%.*]], label [[PRED_SDIV_CONTINUE7:%.*]]
+; CHECK: pred.sdiv.if6:
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x i64> [[WIDE_LOAD1]], i32 0
+; CHECK-NEXT: [[TMP20:%.*]] = sdiv i64 [[TMP19]], [[V]]
+; CHECK-NEXT: [[TMP21:%.*]] = insertelement <2 x i64> poison, i64 [[TMP20]], i32 0
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE7]]
+; CHECK: pred.sdiv.continue7:
+; CHECK-NEXT: [[TMP22:%.*]] = phi <2 x i64> [ poison, [[PRED_SDIV_CONTINUE5]] ], [ [[TMP21]], [[PRED_SDIV_IF6]] ]
+; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP7]], i32 1
+; CHECK-NEXT: br i1 [[TMP23]], label [[PRED_SDIV_IF8:%.*]], label [[PRED_SDIV_CONTINUE9]]
+; CHECK: pred.sdiv.if8:
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[WIDE_LOAD1]], i32 1
+; CHECK-NEXT: [[TMP25:%.*]] = sdiv i64 [[TMP24]], [[V]]
+; CHECK-NEXT: [[TMP26:%.*]] = insertelement <2 x i64> [[TMP22]], i64 [[TMP25]], i32 1
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE9]]
+; CHECK: pred.sdiv.continue9:
+; CHECK-NEXT: [[TMP27:%.*]] = phi <2 x i64> [ [[TMP22]], [[PRED_SDIV_CONTINUE7]] ], [ [[TMP26]], [[PRED_SDIV_IF8]] ]
+; CHECK-NEXT: [[TMP28:%.*]] = xor <2 x i1> [[TMP6]], <i1 true, i1 true>
+; CHECK-NEXT: [[TMP29:%.*]] = xor <2 x i1> [[TMP7]], <i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <2 x i1> [[TMP6]], <2 x i64> [[TMP17]], <2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT: [[PREDPHI10:%.*]] = select <2 x i1> [[TMP7]], <2 x i64> [[TMP27]], <2 x i64> [[WIDE_LOAD1]]
+; CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0
+; CHECK-NEXT: store <2 x i64> [[PREDPHI]], ptr [[TMP30]], align 8
+; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2
+; CHECK-NEXT: store <2 x i64> [[PREDPHI10]], ptr [[TMP31]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8
-; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[V:%.*]], 0
+; CHECK-NEXT: [[C:%.*]] = icmp ne i64 [[V]], 0
; CHECK-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]]
; CHECK: do_op:
; CHECK-NEXT: [[DIVREM:%.*]] = sdiv i64 [[ELEM]], [[V]]
@@ -309,7 +451,7 @@
; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -361,7 +503,7 @@
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -382,7 +524,7 @@
; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -434,7 +576,7 @@
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -455,7 +597,7 @@
; CHECK-NEXT: store i64 [[PHI]], ptr [[ARRAYIDX]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;
@@ -484,11 +626,273 @@
define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) {
; CHECK-LABEL: @predicated_sdiv_by_minus_one(
-; CHECK-NEXT: entry:
+; CHECK-NEXT: iter.check:
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]]
+; CHECK: vector.main.loop.iter.check:
+; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_SDIV_CONTINUE30:%.*]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
+; CHECK-NEXT: [[TMP3:%.*]] = icmp ne <16 x i8> [[WIDE_LOAD]], <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <16 x i1> [[TMP3]], i32 0
+; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_SDIV_IF:%.*]], label [[PRED_SDIV_CONTINUE:%.*]]
+; CHECK: pred.sdiv.if:
+; CHECK-NEXT: [[TMP5:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT: [[TMP6:%.*]] = sdiv i8 [[TMP5]], -1
+; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> poison, i8 [[TMP6]], i32 0
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE]]
+; CHECK: pred.sdiv.continue:
+; CHECK-NEXT: [[TMP8:%.*]] = phi <16 x i8> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_SDIV_IF]] ]
+; CHECK-NEXT: [[TMP9:%.*]] = extractelement <16 x i1> [[TMP3]], i32 1
+; CHECK-NEXT: br i1 [[TMP9]], label [[PRED_SDIV_IF1:%.*]], label [[PRED_SDIV_CONTINUE2:%.*]]
+; CHECK: pred.sdiv.if1:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT: [[TMP11:%.*]] = sdiv i8 [[TMP10]], -1
+; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[TMP11]], i32 1
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE2]]
+; CHECK: pred.sdiv.continue2:
+; CHECK-NEXT: [[TMP13:%.*]] = phi <16 x i8> [ [[TMP8]], [[PRED_SDIV_CONTINUE]] ], [ [[TMP12]], [[PRED_SDIV_IF1]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <16 x i1> [[TMP3]], i32 2
+; CHECK-NEXT: br i1 [[TMP14]], label [[PRED_SDIV_IF3:%.*]], label [[PRED_SDIV_CONTINUE4:%.*]]
+; CHECK: pred.sdiv.if3:
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 2
+; CHECK-NEXT: [[TMP16:%.*]] = sdiv i8 [[TMP15]], -1
+; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[TMP16]], i32 2
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE4]]
+; CHECK: pred.sdiv.continue4:
+; CHECK-NEXT: [[TMP18:%.*]] = phi <16 x i8> [ [[TMP13]], [[PRED_SDIV_CONTINUE2]] ], [ [[TMP17]], [[PRED_SDIV_IF3]] ]
+; CHECK-NEXT: [[TMP19:%.*]] = extractelement <16 x i1> [[TMP3]], i32 3
+; CHECK-NEXT: br i1 [[TMP19]], label [[PRED_SDIV_IF5:%.*]], label [[PRED_SDIV_CONTINUE6:%.*]]
+; CHECK: pred.sdiv.if5:
+; CHECK-NEXT: [[TMP20:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 3
+; CHECK-NEXT: [[TMP21:%.*]] = sdiv i8 [[TMP20]], -1
+; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP18]], i8 [[TMP21]], i32 3
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE6]]
+; CHECK: pred.sdiv.continue6:
+; CHECK-NEXT: [[TMP23:%.*]] = phi <16 x i8> [ [[TMP18]], [[PRED_SDIV_CONTINUE4]] ], [ [[TMP22]], [[PRED_SDIV_IF5]] ]
+; CHECK-NEXT: [[TMP24:%.*]] = extractelement <16 x i1> [[TMP3]], i32 4
+; CHECK-NEXT: br i1 [[TMP24]], label [[PRED_SDIV_IF7:%.*]], label [[PRED_SDIV_CONTINUE8:%.*]]
+; CHECK: pred.sdiv.if7:
+; CHECK-NEXT: [[TMP25:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 4
+; CHECK-NEXT: [[TMP26:%.*]] = sdiv i8 [[TMP25]], -1
+; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[TMP26]], i32 4
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE8]]
+; CHECK: pred.sdiv.continue8:
+; CHECK-NEXT: [[TMP28:%.*]] = phi <16 x i8> [ [[TMP23]], [[PRED_SDIV_CONTINUE6]] ], [ [[TMP27]], [[PRED_SDIV_IF7]] ]
+; CHECK-NEXT: [[TMP29:%.*]] = extractelement <16 x i1> [[TMP3]], i32 5
+; CHECK-NEXT: br i1 [[TMP29]], label [[PRED_SDIV_IF9:%.*]], label [[PRED_SDIV_CONTINUE10:%.*]]
+; CHECK: pred.sdiv.if9:
+; CHECK-NEXT: [[TMP30:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 5
+; CHECK-NEXT: [[TMP31:%.*]] = sdiv i8 [[TMP30]], -1
+; CHECK-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[TMP31]], i32 5
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE10]]
+; CHECK: pred.sdiv.continue10:
+; CHECK-NEXT: [[TMP33:%.*]] = phi <16 x i8> [ [[TMP28]], [[PRED_SDIV_CONTINUE8]] ], [ [[TMP32]], [[PRED_SDIV_IF9]] ]
+; CHECK-NEXT: [[TMP34:%.*]] = extractelement <16 x i1> [[TMP3]], i32 6
+; CHECK-NEXT: br i1 [[TMP34]], label [[PRED_SDIV_IF11:%.*]], label [[PRED_SDIV_CONTINUE12:%.*]]
+; CHECK: pred.sdiv.if11:
+; CHECK-NEXT: [[TMP35:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 6
+; CHECK-NEXT: [[TMP36:%.*]] = sdiv i8 [[TMP35]], -1
+; CHECK-NEXT: [[TMP37:%.*]] = insertelement <16 x i8> [[TMP33]], i8 [[TMP36]], i32 6
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE12]]
+; CHECK: pred.sdiv.continue12:
+; CHECK-NEXT: [[TMP38:%.*]] = phi <16 x i8> [ [[TMP33]], [[PRED_SDIV_CONTINUE10]] ], [ [[TMP37]], [[PRED_SDIV_IF11]] ]
+; CHECK-NEXT: [[TMP39:%.*]] = extractelement <16 x i1> [[TMP3]], i32 7
+; CHECK-NEXT: br i1 [[TMP39]], label [[PRED_SDIV_IF13:%.*]], label [[PRED_SDIV_CONTINUE14:%.*]]
+; CHECK: pred.sdiv.if13:
+; CHECK-NEXT: [[TMP40:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 7
+; CHECK-NEXT: [[TMP41:%.*]] = sdiv i8 [[TMP40]], -1
+; CHECK-NEXT: [[TMP42:%.*]] = insertelement <16 x i8> [[TMP38]], i8 [[TMP41]], i32 7
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE14]]
+; CHECK: pred.sdiv.continue14:
+; CHECK-NEXT: [[TMP43:%.*]] = phi <16 x i8> [ [[TMP38]], [[PRED_SDIV_CONTINUE12]] ], [ [[TMP42]], [[PRED_SDIV_IF13]] ]
+; CHECK-NEXT: [[TMP44:%.*]] = extractelement <16 x i1> [[TMP3]], i32 8
+; CHECK-NEXT: br i1 [[TMP44]], label [[PRED_SDIV_IF15:%.*]], label [[PRED_SDIV_CONTINUE16:%.*]]
+; CHECK: pred.sdiv.if15:
+; CHECK-NEXT: [[TMP45:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 8
+; CHECK-NEXT: [[TMP46:%.*]] = sdiv i8 [[TMP45]], -1
+; CHECK-NEXT: [[TMP47:%.*]] = insertelement <16 x i8> [[TMP43]], i8 [[TMP46]], i32 8
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE16]]
+; CHECK: pred.sdiv.continue16:
+; CHECK-NEXT: [[TMP48:%.*]] = phi <16 x i8> [ [[TMP43]], [[PRED_SDIV_CONTINUE14]] ], [ [[TMP47]], [[PRED_SDIV_IF15]] ]
+; CHECK-NEXT: [[TMP49:%.*]] = extractelement <16 x i1> [[TMP3]], i32 9
+; CHECK-NEXT: br i1 [[TMP49]], label [[PRED_SDIV_IF17:%.*]], label [[PRED_SDIV_CONTINUE18:%.*]]
+; CHECK: pred.sdiv.if17:
+; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 9
+; CHECK-NEXT: [[TMP51:%.*]] = sdiv i8 [[TMP50]], -1
+; CHECK-NEXT: [[TMP52:%.*]] = insertelement <16 x i8> [[TMP48]], i8 [[TMP51]], i32 9
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE18]]
+; CHECK: pred.sdiv.continue18:
+; CHECK-NEXT: [[TMP53:%.*]] = phi <16 x i8> [ [[TMP48]], [[PRED_SDIV_CONTINUE16]] ], [ [[TMP52]], [[PRED_SDIV_IF17]] ]
+; CHECK-NEXT: [[TMP54:%.*]] = extractelement <16 x i1> [[TMP3]], i32 10
+; CHECK-NEXT: br i1 [[TMP54]], label [[PRED_SDIV_IF19:%.*]], label [[PRED_SDIV_CONTINUE20:%.*]]
+; CHECK: pred.sdiv.if19:
+; CHECK-NEXT: [[TMP55:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 10
+; CHECK-NEXT: [[TMP56:%.*]] = sdiv i8 [[TMP55]], -1
+; CHECK-NEXT: [[TMP57:%.*]] = insertelement <16 x i8> [[TMP53]], i8 [[TMP56]], i32 10
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE20]]
+; CHECK: pred.sdiv.continue20:
+; CHECK-NEXT: [[TMP58:%.*]] = phi <16 x i8> [ [[TMP53]], [[PRED_SDIV_CONTINUE18]] ], [ [[TMP57]], [[PRED_SDIV_IF19]] ]
+; CHECK-NEXT: [[TMP59:%.*]] = extractelement <16 x i1> [[TMP3]], i32 11
+; CHECK-NEXT: br i1 [[TMP59]], label [[PRED_SDIV_IF21:%.*]], label [[PRED_SDIV_CONTINUE22:%.*]]
+; CHECK: pred.sdiv.if21:
+; CHECK-NEXT: [[TMP60:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 11
+; CHECK-NEXT: [[TMP61:%.*]] = sdiv i8 [[TMP60]], -1
+; CHECK-NEXT: [[TMP62:%.*]] = insertelement <16 x i8> [[TMP58]], i8 [[TMP61]], i32 11
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE22]]
+; CHECK: pred.sdiv.continue22:
+; CHECK-NEXT: [[TMP63:%.*]] = phi <16 x i8> [ [[TMP58]], [[PRED_SDIV_CONTINUE20]] ], [ [[TMP62]], [[PRED_SDIV_IF21]] ]
+; CHECK-NEXT: [[TMP64:%.*]] = extractelement <16 x i1> [[TMP3]], i32 12
+; CHECK-NEXT: br i1 [[TMP64]], label [[PRED_SDIV_IF23:%.*]], label [[PRED_SDIV_CONTINUE24:%.*]]
+; CHECK: pred.sdiv.if23:
+; CHECK-NEXT: [[TMP65:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 12
+; CHECK-NEXT: [[TMP66:%.*]] = sdiv i8 [[TMP65]], -1
+; CHECK-NEXT: [[TMP67:%.*]] = insertelement <16 x i8> [[TMP63]], i8 [[TMP66]], i32 12
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE24]]
+; CHECK: pred.sdiv.continue24:
+; CHECK-NEXT: [[TMP68:%.*]] = phi <16 x i8> [ [[TMP63]], [[PRED_SDIV_CONTINUE22]] ], [ [[TMP67]], [[PRED_SDIV_IF23]] ]
+; CHECK-NEXT: [[TMP69:%.*]] = extractelement <16 x i1> [[TMP3]], i32 13
+; CHECK-NEXT: br i1 [[TMP69]], label [[PRED_SDIV_IF25:%.*]], label [[PRED_SDIV_CONTINUE26:%.*]]
+; CHECK: pred.sdiv.if25:
+; CHECK-NEXT: [[TMP70:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 13
+; CHECK-NEXT: [[TMP71:%.*]] = sdiv i8 [[TMP70]], -1
+; CHECK-NEXT: [[TMP72:%.*]] = insertelement <16 x i8> [[TMP68]], i8 [[TMP71]], i32 13
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE26]]
+; CHECK: pred.sdiv.continue26:
+; CHECK-NEXT: [[TMP73:%.*]] = phi <16 x i8> [ [[TMP68]], [[PRED_SDIV_CONTINUE24]] ], [ [[TMP72]], [[PRED_SDIV_IF25]] ]
+; CHECK-NEXT: [[TMP74:%.*]] = extractelement <16 x i1> [[TMP3]], i32 14
+; CHECK-NEXT: br i1 [[TMP74]], label [[PRED_SDIV_IF27:%.*]], label [[PRED_SDIV_CONTINUE28:%.*]]
+; CHECK: pred.sdiv.if27:
+; CHECK-NEXT: [[TMP75:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 14
+; CHECK-NEXT: [[TMP76:%.*]] = sdiv i8 [[TMP75]], -1
+; CHECK-NEXT: [[TMP77:%.*]] = insertelement <16 x i8> [[TMP73]], i8 [[TMP76]], i32 14
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE28]]
+; CHECK: pred.sdiv.continue28:
+; CHECK-NEXT: [[TMP78:%.*]] = phi <16 x i8> [ [[TMP73]], [[PRED_SDIV_CONTINUE26]] ], [ [[TMP77]], [[PRED_SDIV_IF27]] ]
+; CHECK-NEXT: [[TMP79:%.*]] = extractelement <16 x i1> [[TMP3]], i32 15
+; CHECK-NEXT: br i1 [[TMP79]], label [[PRED_SDIV_IF29:%.*]], label [[PRED_SDIV_CONTINUE30]]
+; CHECK: pred.sdiv.if29:
+; CHECK-NEXT: [[TMP80:%.*]] = extractelement <16 x i8> [[WIDE_LOAD]], i32 15
+; CHECK-NEXT: [[TMP81:%.*]] = sdiv i8 [[TMP80]], -1
+; CHECK-NEXT: [[TMP82:%.*]] = insertelement <16 x i8> [[TMP78]], i8 [[TMP81]], i32 15
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE30]]
+; CHECK: pred.sdiv.continue30:
+; CHECK-NEXT: [[TMP83:%.*]] = phi <16 x i8> [ [[TMP78]], [[PRED_SDIV_CONTINUE28]] ], [ [[TMP82]], [[PRED_SDIV_IF29]] ]
+; CHECK-NEXT: [[TMP84:%.*]] = xor <16 x i1> [[TMP3]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI:%.*]] = select <16 x i1> [[TMP3]], <16 x i8> [[TMP83]], <16 x i8> [[WIDE_LOAD]]
+; CHECK-NEXT: [[TMP85:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i32 0
+; CHECK-NEXT: store <16 x i8> [[PREDPHI]], ptr [[TMP85]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEXT: [[TMP86:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT: br i1 [[TMP86]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]]
+; CHECK: vec.epilog.iter.check:
+; CHECK-NEXT: br i1 true, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]]
+; CHECK: vec.epilog.ph:
+; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]]
+; CHECK: vec.epilog.vector.body:
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT51:%.*]], [[PRED_SDIV_CONTINUE49:%.*]] ]
+; CHECK-NEXT: [[TMP87:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[TMP88:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP87]]
+; CHECK-NEXT: [[TMP89:%.*]] = getelementptr inbounds i8, ptr [[TMP88]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD33:%.*]] = load <8 x i8>, ptr [[TMP89]], align 1
+; CHECK-NEXT: [[TMP90:%.*]] = icmp ne <8 x i8> [[WIDE_LOAD33]], <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128>
+; CHECK-NEXT: [[TMP91:%.*]] = extractelement <8 x i1> [[TMP90]], i32 0
+; CHECK-NEXT: br i1 [[TMP91]], label [[PRED_SDIV_IF34:%.*]], label [[PRED_SDIV_CONTINUE35:%.*]]
+; CHECK: pred.sdiv.if34:
+; CHECK-NEXT: [[TMP92:%.*]] = extractelement <8 x i8> [[WIDE_LOAD33]], i32 0
+; CHECK-NEXT: [[TMP93:%.*]] = sdiv i8 [[TMP92]], -1
+; CHECK-NEXT: [[TMP94:%.*]] = insertelement <8 x i8> poison, i8 [[TMP93]], i32 0
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE35]]
+; CHECK: pred.sdiv.continue35:
+; CHECK-NEXT: [[TMP95:%.*]] = phi <8 x i8> [ poison, [[VEC_EPILOG_VECTOR_BODY]] ], [ [[TMP94]], [[PRED_SDIV_IF34]] ]
+; CHECK-NEXT: [[TMP96:%.*]] = extractelement <8 x i1> [[TMP90]], i32 1
+; CHECK-NEXT: br i1 [[TMP96]], label [[PRED_SDIV_IF36:%.*]], label [[PRED_SDIV_CONTINUE37:%.*]]
+; CHECK: pred.sdiv.if36:
+; CHECK-NEXT: [[TMP97:%.*]] = extractelement <8 x i8> [[WIDE_LOAD33]], i32 1
+; CHECK-NEXT: [[TMP98:%.*]] = sdiv i8 [[TMP97]], -1
+; CHECK-NEXT: [[TMP99:%.*]] = insertelement <8 x i8> [[TMP95]], i8 [[TMP98]], i32 1
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE37]]
+; CHECK: pred.sdiv.continue37:
+; CHECK-NEXT: [[TMP100:%.*]] = phi <8 x i8> [ [[TMP95]], [[PRED_SDIV_CONTINUE35]] ], [ [[TMP99]], [[PRED_SDIV_IF36]] ]
+; CHECK-NEXT: [[TMP101:%.*]] = extractelement <8 x i1> [[TMP90]], i32 2
+; CHECK-NEXT: br i1 [[TMP101]], label [[PRED_SDIV_IF38:%.*]], label [[PRED_SDIV_CONTINUE39:%.*]]
+; CHECK: pred.sdiv.if38:
+; CHECK-NEXT: [[TMP102:%.*]] = extractelement <8 x i8> [[WIDE_LOAD33]], i32 2
+; CHECK-NEXT: [[TMP103:%.*]] = sdiv i8 [[TMP102]], -1
+; CHECK-NEXT: [[TMP104:%.*]] = insertelement <8 x i8> [[TMP100]], i8 [[TMP103]], i32 2
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE39]]
+; CHECK: pred.sdiv.continue39:
+; CHECK-NEXT: [[TMP105:%.*]] = phi <8 x i8> [ [[TMP100]], [[PRED_SDIV_CONTINUE37]] ], [ [[TMP104]], [[PRED_SDIV_IF38]] ]
+; CHECK-NEXT: [[TMP106:%.*]] = extractelement <8 x i1> [[TMP90]], i32 3
+; CHECK-NEXT: br i1 [[TMP106]], label [[PRED_SDIV_IF40:%.*]], label [[PRED_SDIV_CONTINUE41:%.*]]
+; CHECK: pred.sdiv.if40:
+; CHECK-NEXT: [[TMP107:%.*]] = extractelement <8 x i8> [[WIDE_LOAD33]], i32 3
+; CHECK-NEXT: [[TMP108:%.*]] = sdiv i8 [[TMP107]], -1
+; CHECK-NEXT: [[TMP109:%.*]] = insertelement <8 x i8> [[TMP105]], i8 [[TMP108]], i32 3
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE41]]
+; CHECK: pred.sdiv.continue41:
+; CHECK-NEXT: [[TMP110:%.*]] = phi <8 x i8> [ [[TMP105]], [[PRED_SDIV_CONTINUE39]] ], [ [[TMP109]], [[PRED_SDIV_IF40]] ]
+; CHECK-NEXT: [[TMP111:%.*]] = extractelement <8 x i1> [[TMP90]], i32 4
+; CHECK-NEXT: br i1 [[TMP111]], label [[PRED_SDIV_IF42:%.*]], label [[PRED_SDIV_CONTINUE43:%.*]]
+; CHECK: pred.sdiv.if42:
+; CHECK-NEXT: [[TMP112:%.*]] = extractelement <8 x i8> [[WIDE_LOAD33]], i32 4
+; CHECK-NEXT: [[TMP113:%.*]] = sdiv i8 [[TMP112]], -1
+; CHECK-NEXT: [[TMP114:%.*]] = insertelement <8 x i8> [[TMP110]], i8 [[TMP113]], i32 4
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE43]]
+; CHECK: pred.sdiv.continue43:
+; CHECK-NEXT: [[TMP115:%.*]] = phi <8 x i8> [ [[TMP110]], [[PRED_SDIV_CONTINUE41]] ], [ [[TMP114]], [[PRED_SDIV_IF42]] ]
+; CHECK-NEXT: [[TMP116:%.*]] = extractelement <8 x i1> [[TMP90]], i32 5
+; CHECK-NEXT: br i1 [[TMP116]], label [[PRED_SDIV_IF44:%.*]], label [[PRED_SDIV_CONTINUE45:%.*]]
+; CHECK: pred.sdiv.if44:
+; CHECK-NEXT: [[TMP117:%.*]] = extractelement <8 x i8> [[WIDE_LOAD33]], i32 5
+; CHECK-NEXT: [[TMP118:%.*]] = sdiv i8 [[TMP117]], -1
+; CHECK-NEXT: [[TMP119:%.*]] = insertelement <8 x i8> [[TMP115]], i8 [[TMP118]], i32 5
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE45]]
+; CHECK: pred.sdiv.continue45:
+; CHECK-NEXT: [[TMP120:%.*]] = phi <8 x i8> [ [[TMP115]], [[PRED_SDIV_CONTINUE43]] ], [ [[TMP119]], [[PRED_SDIV_IF44]] ]
+; CHECK-NEXT: [[TMP121:%.*]] = extractelement <8 x i1> [[TMP90]], i32 6
+; CHECK-NEXT: br i1 [[TMP121]], label [[PRED_SDIV_IF46:%.*]], label [[PRED_SDIV_CONTINUE47:%.*]]
+; CHECK: pred.sdiv.if46:
+; CHECK-NEXT: [[TMP122:%.*]] = extractelement <8 x i8> [[WIDE_LOAD33]], i32 6
+; CHECK-NEXT: [[TMP123:%.*]] = sdiv i8 [[TMP122]], -1
+; CHECK-NEXT: [[TMP124:%.*]] = insertelement <8 x i8> [[TMP120]], i8 [[TMP123]], i32 6
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE47]]
+; CHECK: pred.sdiv.continue47:
+; CHECK-NEXT: [[TMP125:%.*]] = phi <8 x i8> [ [[TMP120]], [[PRED_SDIV_CONTINUE45]] ], [ [[TMP124]], [[PRED_SDIV_IF46]] ]
+; CHECK-NEXT: [[TMP126:%.*]] = extractelement <8 x i1> [[TMP90]], i32 7
+; CHECK-NEXT: br i1 [[TMP126]], label [[PRED_SDIV_IF48:%.*]], label [[PRED_SDIV_CONTINUE49]]
+; CHECK: pred.sdiv.if48:
+; CHECK-NEXT: [[TMP127:%.*]] = extractelement <8 x i8> [[WIDE_LOAD33]], i32 7
+; CHECK-NEXT: [[TMP128:%.*]] = sdiv i8 [[TMP127]], -1
+; CHECK-NEXT: [[TMP129:%.*]] = insertelement <8 x i8> [[TMP125]], i8 [[TMP128]], i32 7
+; CHECK-NEXT: br label [[PRED_SDIV_CONTINUE49]]
+; CHECK: pred.sdiv.continue49:
+; CHECK-NEXT: [[TMP130:%.*]] = phi <8 x i8> [ [[TMP125]], [[PRED_SDIV_CONTINUE47]] ], [ [[TMP129]], [[PRED_SDIV_IF48]] ]
+; CHECK-NEXT: [[TMP131:%.*]] = xor <8 x i1> [[TMP90]], <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
+; CHECK-NEXT: [[PREDPHI50:%.*]] = select <8 x i1> [[TMP90]], <8 x i8> [[TMP130]], <8 x i8> [[WIDE_LOAD33]]
+; CHECK-NEXT: [[TMP132:%.*]] = getelementptr inbounds i8, ptr [[TMP88]], i32 0
+; CHECK-NEXT: store <8 x i8> [[PREDPHI50]], ptr [[TMP132]], align 1
+; CHECK-NEXT: [[INDEX_NEXT51]] = add nuw i64 [[OFFSET_IDX]], 8
+; CHECK-NEXT: [[TMP133:%.*]] = icmp eq i64 [[INDEX_NEXT51]], 1024
+; CHECK-NEXT: br i1 [[TMP133]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
+; CHECK: vec.epilog.middle.block:
+; CHECK-NEXT: [[CMP_N31:%.*]] = icmp eq i64 1024, 1024
+; CHECK-NEXT: br i1 [[CMP_N31]], label [[FOR_END]], label [[VEC_EPILOG_SCALAR_PH]]
+; CHECK: vec.epilog.scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[IV]]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LATCH:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[IV]]
; CHECK-NEXT: [[ELEM:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[C:%.*]] = icmp ne i8 [[ELEM]], -128
; CHECK-NEXT: br i1 [[C]], label [[DO_OP:%.*]], label [[LATCH]]
@@ -500,7 +904,7 @@
; CHECK-NEXT: store i8 [[PHI]], ptr [[ARRAYIDX]], align 1
; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
-; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
; CHECK: for.end:
; CHECK-NEXT: ret void
;